Brick Library 0.1
Performance-portable stencil datalayout & codegen
dev_shl.h
Go to the documentation of this file.
1
8#ifndef BRICK_DEV_SHL_H
9#define BRICK_DEV_SHL_H
10
11#if defined(CL_SYCL_LANGUAGE_VERSION) || defined(SYCL_LANGUAGE_VERSION)
12
13// template<typename T>
14// inline void dev_shl(cl::sycl::intel::sub_group &SG, T &res, T l, T r, unsigned kn, unsigned cw, unsigned cid) {
15#define dev_shl(res, l, r, kn, cw, cid) do { \
16 auto l_tmp = SG.shuffle_down(l, cw - (kn)); \
17 auto r_tmp = SG.shuffle_up(r, kn); \
18 res = (cid) < kn? l_tmp : r_tmp; \
19 } while(false)
20
21#elif defined(__OPENCL_VERSION__)
22
23#define TWOSHL
24
25#ifdef TWOSHL
26
27/*
28 * These two shuffle implementations are in fact equivalent, however using one shuffle is not stable right now
29 * that produces random errors during computation
30 */
31#define dev_shl(res, l, r, kn, cw, cid) do { \
32 bElem l_tmp = sub_group_shuffle_down(l, cw - (kn)); \
33 bElem r_tmp = sub_group_shuffle_up(r, kn); \
34 res = (cid) < kn? l_tmp : r_tmp; \
35 } while(false)
36
37#else
38
39#define dev_shl(res, l, r, kn, cw, cid) do { \
40 int rk = cw - (kn); \
41 bElem l_tmp = (cid) < rk? r : l; \
42 int oid = (sglid & (OCL_SUBGROUP - cw)) | ((sglid + rk) & (cw - 1)); \
43 res = sub_group_shuffle(l_tmp, oid); \
44 } while(false)
45
46#endif
47
48#elif defined(__CUDACC__) || defined(__HIP__)
49
50// dev_shl works for both NVidia (CUDA) and AMD (HIP)
51template<typename T>
52__device__ __forceinline__ void dev_shl(T &res, T l, T r, int kn, int cw, int cid) {
53#if defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)
54 // CUDA 9.0+ uses *sync
55 T l_tmp = __shfl_down_sync(0xffffffff, l, cw - (kn));
56 T r_tmp = __shfl_up_sync(0xffffffff, r, kn);
57#else
58 // CUDA < 9.0 and HIP works with shfl
59 T l_tmp = __shfl_down(l, cw - (kn));
60 T r_tmp = __shfl_up(r, kn);
61#endif
62 res = (cid) < kn? l_tmp : r_tmp;
63}
64#endif
65
66#endif //BRICK_DEV_SHL_H