11#if defined(CL_SYCL_LANGUAGE_VERSION) || defined(SYCL_LANGUAGE_VERSION)
15#define dev_shl(res, l, r, kn, cw, cid) do { \
 /* SYCL branch: assigns res from one of two sub-group shuffle results. */ \
 /* SG is not a macro parameter; it must be in scope where this expands. */ \
 /* Presumably a left shift across an (l, r) pair of width cw - confirm. */ \
 /* l_tmp is l passed through SG.shuffle_down with delta cw - (kn). */ \
16 auto l_tmp = SG.shuffle_down(l, cw - (kn)); \
 /* r_tmp is r passed through SG.shuffle_up with delta kn. */ \
17 auto r_tmp = SG.shuffle_up(r, kn); \
 /* When cid is below kn, res takes l_tmp; otherwise it takes r_tmp. */ \
 /* NOTE(review): kn is expanded three times and is parenthesized only in */ \
 /* the shuffle_down call, and cw is never parenthesized - confirm that */ \
 /* callers pass simple, side-effect-free values. */ \
18 res = (cid) < kn? l_tmp : r_tmp; \
21#elif defined(__OPENCL_VERSION__)
31#define dev_shl(res, l, r, kn, cw, cid) do { \
 /* OpenCL branch: assigns res from one of two sub-group shuffle results. */ \
 /* bElem is a type declared outside this macro. */ \
 /* Presumably a left shift across an (l, r) pair of width cw - confirm. */ \
 /* l_tmp is l passed through sub_group_shuffle_down with delta cw - (kn). */ \
32 bElem l_tmp = sub_group_shuffle_down(l, cw - (kn)); \
 /* r_tmp is r passed through sub_group_shuffle_up with delta kn. */ \
33 bElem r_tmp = sub_group_shuffle_up(r, kn); \
 /* When cid is below kn, res takes l_tmp; otherwise it takes r_tmp. */ \
 /* NOTE(review): kn is expanded three times and is parenthesized only in */ \
 /* the shuffle_down call, and cw is never parenthesized - confirm that */ \
 /* callers pass simple, side-effect-free values. */ \
34 res = (cid) < kn? l_tmp : r_tmp; \
39#define dev_shl(res, l, r, kn, cw, cid) do { \
 /* Alternative OpenCL definition built on a single sub_group_shuffle. */ \
 /* NOTE(review): the preprocessor condition selecting this definition and */ \
 /* the declaration of rk are on lines not visible here; rk is presumably */ \
 /* derived from kn - confirm. */ \
 /* Pick the source operand first: r when cid is below rk, otherwise l. */ \
41 bElem l_tmp = (cid) < rk? r : l; \
 /* Source index: the bits of sglid under mask (OCL_SUBGROUP - cw), OR-ed */ \
 /* with (sglid + rk) masked by (cw - 1). sglid and OCL_SUBGROUP come from */ \
 /* outside this macro. Presumably cw and OCL_SUBGROUP are powers of two */ \
 /* so these act as bit masks - TODO confirm. */ \
42 int oid = (sglid & (OCL_SUBGROUP - cw)) | ((sglid + rk) & (cw - 1)); \
 /* res is the selected operand as read through sub_group_shuffle at oid. */ \
43 res = sub_group_shuffle(l_tmp, oid); \
48#elif defined(__CUDACC__) || defined(__HIP__)
52__device__ __forceinline__
void dev_shl(T &res, T l, T r,
int kn,
int cw,
int cid) {
53#if defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)
55 T l_tmp = __shfl_down_sync(0xffffffff, l, cw - (kn));
56 T r_tmp = __shfl_up_sync(0xffffffff, r, kn);
59 T l_tmp = __shfl_down(l, cw - (kn));
60 T r_tmp = __shfl_up(r, kn);
62 res = (cid) < kn? l_tmp : r_tmp;