Brick Library 0.1
Performance-portable stencil datalayout & codegen
array-mpi.h — MPI ghost-zone exchange routines for plain (non-brick) arrays.
8#ifndef BRICK_ARRAY_MPI_H
9#define BRICK_ARRAY_MPI_H
10
11#include "brick-mpi.h"
12#include <mpi.h>
13
20inline void elemcpy(bElem *dst, const bElem *src, unsigned long size) {
21#pragma omp simd
22 for (unsigned long i = 0; i < size; ++i)
23 dst[i] = src[i];
24}
25
26template<unsigned dim>
27inline bElem *pack(bElem *arr, BitSet neighbor, bElem *buffer_out, const std::vector<unsigned long> &arrstride,
28 const std::vector<long> &dimlist, const std::vector<long> &padding, const std::vector<long> &ghost) {
29 // Inner region
30 long sec = 0;
31 long st = 0;
32 int d = dim - 1;
33 if (neighbor.get(dim)) {
34 sec = 1;
35 st = padding[d] + dimlist[d];
36 } else if (neighbor.get(-(int) dim)) {
37 sec = -1;
38 st = padding[d] + ghost[d];
39 }
40 if (sec) {
41 for (unsigned i = 0; i < ghost[d]; ++i)
42 buffer_out = pack<dim - 1>(arr + arrstride[d] * (st + i), neighbor, buffer_out,
43 arrstride, dimlist, padding, ghost);
44 } else {
45 for (unsigned i = 0; i < dimlist[d]; ++i)
46 buffer_out = pack<dim - 1>(arr + arrstride[d] * (padding[d] + ghost[d] + i), neighbor, buffer_out,
47 arrstride, dimlist, padding, ghost);
48 }
49
50 return buffer_out;
51}
52
53template<>
54inline bElem *pack<1>(bElem *arr, BitSet neighbor, bElem *buffer_out, const std::vector<unsigned long> &arrstride,
55 const std::vector<long> &dimlist, const std::vector<long> &padding,
56 const std::vector<long> &ghost) {
57 // Inner region
58 long sec = 0;
59 long st = 0;
60 int d = 0;
61 if (neighbor.get(1)) {
62 sec = 1;
63 st = padding[d] + dimlist[d];
64 } else if (neighbor.get(-1)) {
65 sec = -1;
66 st = padding[d] + ghost[d];
67 }
68 if (sec != 0) {
69 elemcpy(buffer_out, arr + st, ghost[d]);
70 return buffer_out + ghost[d];
71 } else {
72 elemcpy(buffer_out, arr + padding[d] + ghost[d], dimlist[d]);
73 return buffer_out + dimlist[d];
74 }
75}
76
77template<unsigned dim>
78inline bElem *unpack(bElem *arr, BitSet neighbor, bElem *buffer_recv, const std::vector<unsigned long> &arrstride,
79 const std::vector<long> &dimlist, const std::vector<long> &padding,
80 const std::vector<long> &ghost) {
81 // Inner region
82 long sec = 0;
83 long st = 0;
84 int d = (int) dim - 1;
85 if (neighbor.get(dim)) {
86 sec = 1;
87 st = padding[d] + dimlist[d] + ghost[d];
88 } else if (neighbor.get(-(int) dim)) {
89 sec = -1;
90 st = padding[d];
91 }
92 if (sec) {
93 for (unsigned i = 0; i < ghost[d]; ++i)
94 buffer_recv = unpack<dim - 1>(arr + arrstride[d] * (st + i), neighbor, buffer_recv,
95 arrstride, dimlist, padding, ghost);
96 } else {
97 for (unsigned i = 0; i < dimlist[d]; ++i)
98 buffer_recv = unpack<dim - 1>(arr + arrstride[d] * (padding[d] + ghost[d] + i), neighbor, buffer_recv,
99 arrstride, dimlist, padding, ghost);
100 }
101 return buffer_recv;
102}
103
104template<>
105inline bElem *unpack<1>(bElem *arr, BitSet neighbor, bElem *buffer_recv, const std::vector<unsigned long> &arrstride,
106 const std::vector<long> &dimlist, const std::vector<long> &padding,
107 const std::vector<long> &ghost) {
108 // Inner region
109 long sec = 0;
110 long st = 0;
111 int d = 0;
112 if (neighbor.get(1)) {
113 sec = 1;
114 st = padding[d] + dimlist[d] + ghost[d];
115 } else if (neighbor.get(-1)) {
116 sec = -1;
117 st = padding[d];
118 }
119 if (sec) {
120 elemcpy(arr + st, buffer_recv, ghost[d]);
121 return buffer_recv + ghost[d];
122 } else {
123 elemcpy(arr + padding[d] + ghost[d], buffer_recv, dimlist[d]);
124 return buffer_recv + dimlist[d];
125 }
126}
127
128
129inline unsigned
130evalsize(BitSet region, const std::vector<long> &dimlist, const std::vector<long> &ghost, bool inner = true) {
131 // Inner region
132 unsigned size = 1;
133 for (int i = 1; i <= (int) dimlist.size(); ++i)
134 if (region.get(i) || region.get(-i))
135 size = size * ghost[i - 1];
136 else
137 size = size * (dimlist[i - 1] - (inner ? 2 * ghost[i - 1] : 0));
138 return size;
139}
140
141extern std::vector<bElem *> arr_buffers_out;
142extern std::vector<bElem *> arr_buffers_recv;
143
144// ID is used to prevent message mismatch from messages with the same node, low performance only for validation testing.
145template<unsigned dim>
146void exchangeArr(bElem *arr, const MPI_Comm &comm, std::unordered_map<uint64_t, int> &rank_map,
147 const std::vector<long> &dimlist, const std::vector<long> &padding, const std::vector<long> &ghost) {
148 std::vector<BitSet> neighbors;
149 allneighbors(0, 1, dim, neighbors);
150 neighbors.erase(neighbors.begin() + (neighbors.size() / 2));
151 std::vector<unsigned long> tot(neighbors.size());
152 std::vector<MPI_Request> requests(neighbors.size() * 2);
153 std::vector<MPI_Status> stats(requests.size());
154
155 std::vector<unsigned long> arrstride(dimlist.size());
156 unsigned long stri = 1;
157
158 for (int i = 0; i < arrstride.size(); ++i) {
159 arrstride[i] = stri;
160 stri = stri * ((padding[i] + ghost[i]) * 2 + dimlist[i]);
161 }
162
163 for (int i = 0; i < (int) neighbors.size(); ++i) {
164 tot[i] = (unsigned long) evalsize(neighbors[i], dimlist, ghost, false);
165 }
166
167 if (arr_buffers_out.size() == 0)
168 for (int i = 0; i < (int) neighbors.size(); ++i) {
169 arr_buffers_recv.emplace_back((bElem*)aligned_alloc(4096, sizeof(bElem) * tot[i]));
170 arr_buffers_out.emplace_back((bElem*)aligned_alloc(4096, sizeof(bElem) * tot[i]));
171 }
172
173 double st = omp_get_wtime(), ed;
174 // Pack
175#pragma omp parallel for
176 for (int i = 0; i < (int) neighbors.size(); ++i)
177 pack<dim>(arr, neighbors[i], arr_buffers_out[i], arrstride, dimlist, padding, ghost);
178
179 ed = omp_get_wtime();
180 packtime += ed - st;
181
182#ifdef BARRIER_TIMESTEP
183 MPI_Barrier(comm);
184#endif
185
186 st = omp_get_wtime();
187
188 for (int i = 0; i < (int) neighbors.size(); ++i) {
189 MPI_Irecv(arr_buffers_recv[i], (int) (tot[i] * sizeof(bElem)), MPI_CHAR, rank_map[neighbors[i].set],
190 (int) neighbors.size() - i - 1, comm, &(requests[i * 2]));
191 MPI_Isend(arr_buffers_out[i], (int) (tot[i] * sizeof(bElem)), MPI_CHAR, rank_map[neighbors[i].set], i, comm,
192 &(requests[i * 2 + 1]));
193 }
194
195 ed = omp_get_wtime();
196 calltime += ed - st;
197 st = ed;
198
199 // Wait
200 MPI_Waitall(static_cast<int>(requests.size()), requests.data(), stats.data());
201
202 ed = omp_get_wtime();
203 waittime += ed - st;
204 st = ed;
205
206 // Unpack
207#pragma omp parallel for
208 for (int i = 0; i < (int) neighbors.size(); ++i)
209 unpack<dim>(arr, neighbors[i], arr_buffers_recv[i], arrstride, dimlist, padding, ghost);
210
211 ed = omp_get_wtime();
212 packtime += ed - st;
213}
214
215inline MPI_Datatype pack_type(BitSet neighbor, const std::vector<long> &dimlist, const std::vector<long> &padding,
216 const std::vector<long> &ghost) {
217 int ndims = dimlist.size();
218 std::vector<int> size(ndims), subsize(ndims), start(ndims);
219 for (long dd = 0; dd < dimlist.size(); ++dd) {
220 long d = (long)dimlist.size() - dd - 1;
221 size[dd] = dimlist[d] + 2 * (padding[d] + ghost[d]);
222 long dim = d + 1;
223 long sec = 0;
224 if (neighbor.get(dim)) {
225 sec = 1;
226 start[dd] = padding[d] + dimlist[d];
227 } else if (neighbor.get(-(int) dim)) {
228 sec = -1;
229 start[dd] = padding[d] + ghost[d];
230 }
231 if (sec) {
232 subsize[dd] = ghost[d];
233 } else {
234 subsize[dd] = dimlist[d];
235 start[dd] = padding[d] + ghost[d];
236 }
237 }
238 MPI_Datatype ret;
239 // Subarray is most contiguous dimension first (largest index)
240 MPI_Type_create_subarray(ndims, size.data(), subsize.data(), start.data(), MPI_ORDER_C, MPI_DOUBLE, &ret);
241 return ret;
242}
243
244inline MPI_Datatype unpack_type(BitSet neighbor, const std::vector<long> &dimlist, const std::vector<long> &padding,
245 const std::vector<long> &ghost) {
246 int ndims = dimlist.size();
247 std::vector<int> size(ndims), subsize(ndims), start(ndims);
248 for (long dd = 0; dd < dimlist.size(); ++dd) {
249 long d = (long)dimlist.size() - dd - 1;
250 size[dd] = dimlist[d] + 2 * (padding[d] + ghost[d]);
251 long dim = d + 1;
252 long sec = 0;
253 if (neighbor.get(dim)) {
254 sec = 1;
255 start[dd] = padding[d] + dimlist[d] + ghost[d];
256 } else if (neighbor.get(-(int) dim)) {
257 sec = -1;
258 start[dd] = padding[d];
259 }
260 if (sec) {
261 subsize[dd] = ghost[d];
262 } else {
263 subsize[dd] = dimlist[d];
264 start[dd] = padding[d] + ghost[d];
265 }
266 }
267 MPI_Datatype ret;
268 // Subarray is most contiguous dimension first (largest index)
269 MPI_Type_create_subarray(ndims, size.data(), subsize.data(), start.data(), MPI_ORDER_C, MPI_DOUBLE, &ret);
270 return ret;
271}
272
273template<unsigned dim>
274void exchangeArrPrepareTypes(std::unordered_map<uint64_t, MPI_Datatype> &stypemap,
275 std::unordered_map<uint64_t, MPI_Datatype> &rtypemap,
276 const std::vector<long> &dimlist, const std::vector<long> &padding,
277 const std::vector<long> &ghost) {
278 std::vector<BitSet> neighbors;
279 allneighbors(0, 1, dim, neighbors);
280 neighbors.erase(neighbors.begin() + (neighbors.size() / 2));
281 std::vector<MPI_Request> requests(neighbors.size() * 2);
282
283 for (auto n: neighbors) {
284 MPI_Datatype MPI_rtype = unpack_type(n, dimlist, padding, ghost);
285 MPI_Type_commit(&MPI_rtype);
286 rtypemap[n.set] = MPI_rtype;
287 MPI_Datatype MPI_stype = pack_type(n, dimlist, padding, ghost);
288 MPI_Type_commit(&MPI_stype);
289 stypemap[n.set] = MPI_stype;
290 }
291}
292
293// Using data types
294template<unsigned dim>
295void exchangeArrTypes(bElem *arr, const MPI_Comm &comm, std::unordered_map<uint64_t, int> &rank_map,
296 std::unordered_map<uint64_t, MPI_Datatype> &stypemap,
297 std::unordered_map<uint64_t, MPI_Datatype> &rtypemap) {
298 std::vector<BitSet> neighbors;
299 allneighbors(0, 1, dim, neighbors);
300 neighbors.erase(neighbors.begin() + (neighbors.size() / 2));
301 std::vector<MPI_Request> requests(neighbors.size() * 2);
302
303 int rank;
304 MPI_Comm_rank(comm, &rank);
305
306 double st = omp_get_wtime(), ed;
307
308 for (int i = 0; i < (int) neighbors.size(); ++i) {
309 MPI_Irecv(arr, 1, rtypemap[neighbors[i].set], rank_map[neighbors[i].set],
310 (int) neighbors.size() - i - 1, comm, &(requests[i * 2]));
311 MPI_Isend(arr, 1, stypemap[neighbors[i].set], rank_map[neighbors[i].set], i, comm, &(requests[i * 2 + 1]));
312 }
313
314 ed = omp_get_wtime();
315 calltime += ed - st;
316 st = ed;
317
318 // Wait
319 std::vector<MPI_Status> stats(requests.size());
320 MPI_Waitall(static_cast<int>(requests.size()), requests.data(), stats.data());
321
322 ed = omp_get_wtime();
323 waittime += ed - st;
324}
325
326typedef struct {
328 std::unordered_map<uint64_t, int> *rank_map;
329 std::unordered_map<uint64_t, int> *id_map;
330 int id;
331} ArrExPack;
332
333template<unsigned dim>
334void exchangeArrAll(std::vector<ArrExPack> arr, const MPI_Comm &comm,
335 const std::vector<long> &dimlist, const std::vector<long> &padding,
336 const std::vector<long> &ghost) {
337 std::vector<BitSet> neighbors;
338 allneighbors(0, 1, dim, neighbors);
339 neighbors.erase(neighbors.begin() + (neighbors.size() / 2));
340 std::vector<bElem *> buffers_out(arr.size() * neighbors.size(), nullptr);
341 std::vector<bElem *> buffers_recv(arr.size() * neighbors.size(), nullptr);
342 std::vector<unsigned long> tot(neighbors.size());
343 std::vector<MPI_Request> requests(arr.size() * neighbors.size() * 2);
344
345 std::vector<unsigned long> arrstride(dimlist.size());
346 unsigned long stri = 1;
347
348 for (int i = 0; i < arrstride.size(); ++i) {
349 arrstride[i] = stri;
350 stri = stri * ((padding[i] + ghost[i]) * 2 + dimlist[i]);
351 }
352
353 for (int i = 0; i < (int) neighbors.size(); ++i) {
354 tot[i] = (unsigned long) evalsize(neighbors[i], dimlist, ghost, false);
355 for (int s = 0; s < arr.size(); ++s) {
356 buffers_recv[i + s * neighbors.size()] = new bElem[tot[i]];
357 buffers_out[i + s * neighbors.size()] = new bElem[tot[i]];
358 }
359 }
360
361 double st = omp_get_wtime(), ed;
362
363 // Pack
364#pragma omp parallel for
365 for (int i = 0; i < (int) neighbors.size(); ++i)
366 for (int s = 0; s < arr.size(); ++s)
367 pack<dim>(arr[s].arr, neighbors[i], buffers_out[i + s * neighbors.size()], arrstride, dimlist, padding, ghost);
368
369 ed = omp_get_wtime();
370 packtime += ed - st;
371
372#ifdef BARRIER_TIMESTEP
373 MPI_Barrier(comm);
374#endif
375
376 st = omp_get_wtime();
377
378 for (int i = 0; i < (int) neighbors.size(); ++i)
379 for (int s = 0; s < arr.size(); ++s) {
380 MPI_Irecv(buffers_recv[i + s * neighbors.size()], (int) (tot[i] * sizeof(bElem)), MPI_CHAR,
381 arr[s].rank_map->at(neighbors[i].set),
382 arr[s].id_map->at(neighbors[i].set) * 100 + (int) neighbors.size() - i - 1,
383 comm, &(requests[i * 2 + s * neighbors.size() * 2]));
384 MPI_Isend(buffers_out[i + s * neighbors.size()], (int) (tot[i] * sizeof(bElem)), MPI_CHAR,
385 arr[s].rank_map->at(neighbors[i].set), arr[s].id * 100 + i, comm,
386 &(requests[i * 2 + s * neighbors.size() * 2 + 1]));
387 }
388
389 ed = omp_get_wtime();
390 calltime += ed - st;
391 st = ed;
392
393 // Wait
394 std::vector<MPI_Status> stats(requests.size());
395 MPI_Waitall(static_cast<int>(requests.size()), requests.data(), stats.data());
396
397 ed = omp_get_wtime();
398 waittime += ed - st;
399 st = ed;
400
401 // Unpack
402#pragma omp parallel for
403 for (int i = 0; i < (int) neighbors.size(); ++i)
404 for (int s = 0; s < arr.size(); ++s)
405 unpack<dim>(arr[s].arr, neighbors[i], buffers_recv[i + s * neighbors.size()], arrstride, dimlist, padding, ghost);
406
407 ed = omp_get_wtime();
408 packtime += ed - st;
409
410 // Cleanup
411 for (auto b: buffers_out)
412 delete[] b;
413 for (auto b: buffers_recv)
414 delete[] b;
415}
416
417#endif //BRICK_ARRAY_MPI_H
unsigned evalsize(BitSet region, const std::vector< long > &dimlist, const std::vector< long > &ghost, bool inner=true)
Definition: array-mpi.h:130
void exchangeArrAll(std::vector< ArrExPack > arr, const MPI_Comm &comm, const std::vector< long > &dimlist, const std::vector< long > &padding, const std::vector< long > &ghost)
Definition: array-mpi.h:334
void exchangeArr(bElem *arr, const MPI_Comm &comm, std::unordered_map< uint64_t, int > &rank_map, const std::vector< long > &dimlist, const std::vector< long > &padding, const std::vector< long > &ghost)
Definition: array-mpi.h:146
bElem * pack(bElem *arr, BitSet neighbor, bElem *buffer_out, const std::vector< unsigned long > &arrstride, const std::vector< long > &dimlist, const std::vector< long > &padding, const std::vector< long > &ghost)
Definition: array-mpi.h:27
std::vector< bElem * > arr_buffers_recv
Definition: array-mpi.cpp:8
bElem * unpack(bElem *arr, BitSet neighbor, bElem *buffer_recv, const std::vector< unsigned long > &arrstride, const std::vector< long > &dimlist, const std::vector< long > &padding, const std::vector< long > &ghost)
Definition: array-mpi.h:78
MPI_Datatype unpack_type(BitSet neighbor, const std::vector< long > &dimlist, const std::vector< long > &padding, const std::vector< long > &ghost)
Definition: array-mpi.h:244
void elemcpy(bElem *dst, const bElem *src, unsigned long size)
Definition: array-mpi.h:20
bElem * pack< 1 >(bElem *arr, BitSet neighbor, bElem *buffer_out, const std::vector< unsigned long > &arrstride, const std::vector< long > &dimlist, const std::vector< long > &padding, const std::vector< long > &ghost)
Definition: array-mpi.h:54
MPI_Datatype pack_type(BitSet neighbor, const std::vector< long > &dimlist, const std::vector< long > &padding, const std::vector< long > &ghost)
Definition: array-mpi.h:215
void exchangeArrTypes(bElem *arr, const MPI_Comm &comm, std::unordered_map< uint64_t, int > &rank_map, std::unordered_map< uint64_t, MPI_Datatype > &stypemap, std::unordered_map< uint64_t, MPI_Datatype > &rtypemap)
Definition: array-mpi.h:295
bElem * unpack< 1 >(bElem *arr, BitSet neighbor, bElem *buffer_recv, const std::vector< unsigned long > &arrstride, const std::vector< long > &dimlist, const std::vector< long > &padding, const std::vector< long > &ghost)
Definition: array-mpi.h:105
void exchangeArrPrepareTypes(std::unordered_map< uint64_t, MPI_Datatype > &stypemap, std::unordered_map< uint64_t, MPI_Datatype > &rtypemap, const std::vector< long > &dimlist, const std::vector< long > &padding, const std::vector< long > &ghost)
Definition: array-mpi.h:274
std::vector< bElem * > arr_buffers_out
Definition: array-mpi.cpp:7
MPI stuff related to bricks.
void allneighbors(BitSet cur, long idx, long dim, std::vector< BitSet > &neighbors)
Enumerate all neighbors.
Definition: brick-mpi.cpp:9
double waittime
Definition: brick-mpi.h:23
double calltime
Definition: brick-mpi.h:23
double packtime
Definition: brick-mpi.cpp:7
i
Definition: 7pt.py:5
int tot
Definition: dag_opt.py:294
Definition: __init__.py:1
Definition: array-mpi.h:326
std::unordered_map< uint64_t, int > * id_map
Definition: array-mpi.h:329
int id
Definition: array-mpi.h:330
bElem * arr
Definition: array-mpi.h:327
std::unordered_map< uint64_t, int > * rank_map
Definition: array-mpi.h:328
Set using bitfield.
Definition: bitset.h:18
bool get(long pos) const
Return whether a number is in the set.
Definition: bitset.h:75
#define bElem
Basic datatype for all brick elements.
Definition: vecscatter.h:13