xpu
device.h
Go to the documentation of this file.
1 
11 #ifndef XPU_DEVICE_H
12 #define XPU_DEVICE_H
13 
14 #include "defines.h"
15 #include "common.h"
16 
17 #include "detail/common.h"
18 #include "detail/type_info.h"
19 
20 #if XPU_IS_CPU
21 #include "detail/platform/cpu/cmem_impl.h"
22 #include "detail/platform/cpu/tpos_impl.h"
23 #elif XPU_IS_HIP_CUDA
24 #include "detail/platform/hip_cuda/cmem_impl.h"
25 #include "detail/platform/hip_cuda/tpos_impl.h"
26 #elif XPU_IS_SYCL
27 #include "detail/platform/sycl/cmem_impl.h"
28 #include "detail/platform/sycl/tpos_impl.h"
29 #else
30 #error "Unsupported XPU target"
31 #endif
32 
33 #include <type_traits>
34 
35 #define XPU_IMAGE(image) XPU_DETAIL_IMAGE(image)
36 #define XPU_EXPORT(obj) XPU_DETAIL_EXPORT(obj)
37 
38 #define XPU_ASSERT(x) XPU_DETAIL_ASSERT(x)
39 
40 namespace xpu {
41 
42 constexpr inline driver_t compilation_target = XPU_DETAIL_COMPILATION_TARGET;
43 
44 struct device_image : detail::device_image {};
45 
46 template<int X, int Y = -1, int Z = -1> // Y and Z default to -1 when omitted
47 struct block_size { // Compile-time block size: packs the three template arguments into an xpu::dim
48  static inline constexpr xpu::dim value{X, Y, Z}; // NOTE(review): -1 presumably marks an unused dimension - confirm against xpu::dim in common.h
49 };
50 
51 struct no_smem {}; // Empty tag type; it is the default SharedMemory argument of kernel_context
52 
53 /**
54  * OpenMP schedule types. Used for specifying the schedule type for kernels.
55  */
56 enum schedule_t {
57  schedule_static, // Avoid conflicts with C++ 'static' keyword
58  schedule_dynamic,
59 };
60 
61 /**
62  * OpenMP settings for kernels.
63  */
64 template<schedule_t Schedule = schedule_static, size_t ChunkSize = 0>
65 struct openmp_settings {
66  // NOTE(review): line 65 was dropped by the rendered listing; the name openmp_settings is reconstructed - confirm against the repository.
67  /**
68   * OpenMP schedule type.
69   */
70  static constexpr schedule_t schedule = Schedule;
71 
72  /**
73   * Chunk size. Use 0 for default value from OpenMP.
74   */
75  static constexpr size_t chunk_size = ChunkSize;
76 };
77 
78 class tpos {
79  // Public facade over detail::tpos_impl: every accessor below forwards to the same-named member of m_impl.
80 public:
81  XPU_D int thread_idx_x() const { return m_impl.thread_idx_x(); }
82  XPU_D int thread_idx_y() const { return m_impl.thread_idx_y(); }
83  XPU_D int thread_idx_z() const { return m_impl.thread_idx_z(); }
84 
85  XPU_D int block_dim_x() const { return m_impl.block_dim_x(); }
86  XPU_D int block_dim_y() const { return m_impl.block_dim_y(); }
87  XPU_D int block_dim_z() const { return m_impl.block_dim_z(); }
88 
89  XPU_D int block_idx_x() const { return m_impl.block_idx_x(); }
90  XPU_D int block_idx_y() const { return m_impl.block_idx_y(); }
91  XPU_D int block_idx_z() const { return m_impl.block_idx_z(); }
92 
93  XPU_D int grid_dim_x() const { return m_impl.grid_dim_x(); }
94  XPU_D int grid_dim_y() const { return m_impl.grid_dim_y(); }
95  XPU_D int grid_dim_z() const { return m_impl.grid_dim_z(); }
96 
97 private:
98  detail::tpos_impl m_impl; // Comes from whichever platform tpos_impl.h was included at the top of this file
99 
100 public:
101  template<typename... Args>
102  XPU_D tpos(detail::internal_ctor_t, Args &&... args) // First parameter is a tag; all remaining arguments are forwarded to tpos_impl
103  : m_impl(std::forward<Args>(args)...) {}
104  // NOTE(review): the internal_ctor_t / internal_fn_t tags presumably keep these entry points for library-internal use - confirm in detail/common.h
105  XPU_D detail::tpos_impl &impl(detail::internal_fn_t) { return m_impl; } // Mutable access to the wrapped implementation object
106 };
107 
108 template<typename... Constants>
109 class cmem {
110  // Public facade over detail::cmem_impl<Constants...>.
111 public:
112  template<typename Constant>
113  XPU_D const typename Constant::data_t &get() const { return m_impl.template get<Constant>(); } // Const reference to the data of Constant, as returned by cmem_impl::get<Constant>()
114 
115 private:
116  detail::cmem_impl<Constants...> m_impl; // Comes from whichever platform cmem_impl.h was included at the top of this file
117 
118 public:
119  template<typename... Args>
120  XPU_D cmem(detail::internal_ctor_t, Args &&... args) // First parameter is a tag; all remaining arguments are forwarded to cmem_impl
121  : m_impl(std::forward<Args>(args)...) {}
122 
123 };
124 
125 
126 template<typename Image>
127 struct kernel : detail::action<Image, detail::kernel_tag> { // detail::action specialised with the kernel tag; presumably the base that user kernels derive from - confirm
128  // Defaults
130  using constants = cmem<>; // Default constant set is empty
132  // NOTE(review): lines 129, 131 and 133-137 are missing from this rendered listing; further defaults may be declared there - check the original header.
138 };
139 
140 template<typename Image>
141 struct function : detail::action<Image, detail::function_tag> { // detail::action specialised with the function tag; adds no members of its own
142 };
143 
144 template<typename Image, typename Data>
145 struct constant : detail::action<Image, detail::constant_tag> { // detail::action specialised with the constant tag, plus a payload type
146  using data_t = Data; // Payload type; read as Constant::data_t by cmem::get() and as C::data_t by kernel_context::cmem()
147 };
148 
149 template<typename SharedMemory = xpu::no_smem, typename Constants = xpu::cmem<>>
150 class kernel_context {
151  // Bundles references to the thread position (tpos), the shared-memory object and the constants object.
152 public:
153  using shared_memory = SharedMemory;
154  using constants = Constants;
155 
156  XPU_D shared_memory &smem() { return m_smem; }
157  XPU_D const shared_memory &smem() const { return m_smem; }
158 
159  /**
160   * Returns the data of constant C by forwarding to constants::get<C>().
161   */
162  template<typename C>
163  XPU_D const typename C::data_t &cmem() const { return m_cmem.template get<C>(); }
164 
165  /**
166   * Returns the constants object this context refers to.
167   */
168  XPU_D const constants &cmem() const { return m_cmem; }
169 
170  /**
171   * The twelve accessors below (thread_idx_*, block_dim_*, block_idx_*, grid_dim_*)
172   * each forward to the same-named member of pos().
173   */
174  XPU_D int thread_idx_x() const { return m_pos.thread_idx_x(); }
175 
180  XPU_D int thread_idx_y() const { return m_pos.thread_idx_y(); }
181 
186  XPU_D int thread_idx_z() const { return m_pos.thread_idx_z(); }
187 
192  XPU_D int block_dim_x() const { return m_pos.block_dim_x(); }
193 
198  XPU_D int block_dim_y() const { return m_pos.block_dim_y(); }
199 
204  XPU_D int block_dim_z() const { return m_pos.block_dim_z(); }
205 
210  XPU_D int block_idx_x() const { return m_pos.block_idx_x(); }
211 
216  XPU_D int block_idx_y() const { return m_pos.block_idx_y(); }
217 
222  XPU_D int block_idx_z() const { return m_pos.block_idx_z(); }
223 
228  XPU_D int grid_dim_x() const { return m_pos.grid_dim_x(); }
229 
234  XPU_D int grid_dim_y() const { return m_pos.grid_dim_y(); }
235 
240  XPU_D int grid_dim_z() const { return m_pos.grid_dim_z(); }
241 
242  XPU_D tpos &pos() { return m_pos; }
243  XPU_D const tpos &pos() const { return m_pos; }
244 
245 private:
246  tpos &m_pos;
247  shared_memory &m_smem;
248  const constants &m_cmem;
249 
250 public:
251  XPU_D kernel_context(detail::internal_ctor_t, tpos &pos, shared_memory &smem, const constants &cmem) // Stores references only: pos, smem and cmem must outlive the context
252  : m_pos(pos)
253  , m_smem(smem)
254  , m_cmem(cmem) {}
255 
256 };
257 
264 template<typename T>
265 class view {
266 
267 public:
268  /**
269   * Default-constructed view: null data pointer and size 0 (see the member initializers below).
270   */
271  view() = default;
272 
273  /**
274   * Construct a view from a pointer and a size. Declared only; the definition is not part of this listing.
275   */
276  XPU_D view(T *data, size_t size);
277 
278  /**
279   * Construct a view from a buffer and a size. Declared only; the definition is not part of this listing.
280   */
281  XPU_D view(buffer<T> &buffer, size_t size);
282 
283  /**
284   * Returns the stored data pointer.
285   */
286  XPU_D T *data() const { return m_data; }
287 
288  /**
289   * Returns the stored size.
290   */
291  XPU_D size_t size() const { return m_size; }
292 
293  /**
294   * True if the stored size is 0.
295   */
296  XPU_D bool empty() const { return m_size == 0; }
297  // NOTE(review): at() presumably bounds-checks while operator[] does not - confirm in detail/view_impl.h
298  XPU_D T &operator[](size_t idx);
299  XPU_D const T &operator[](size_t idx) const;
300 
301  XPU_D T &at(size_t idx);
302  XPU_D const T &at(size_t idx) const;
303 
304 private:
305  T *m_data = nullptr;
306  size_t m_size = 0;
307 
308 };
309 
310 // =================================================================================================
311 // Math functions
312 //
313 // Interface is based on CUDA math functions
314 // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE
315 //
316 // Note: Functions not supported in SYCL or HIP were removed.
317 // =================================================================================================
318 
319 XPU_D constexpr float pi();
320 XPU_D constexpr float pi_2();
321 XPU_D constexpr float pi_4();
322 XPU_D constexpr float deg_to_rad();
323 XPU_D constexpr float sqrt2();
324 
325 XPU_D int abs(int x);
326 XPU_D float abs(float x);
327 
328 XPU_D float acos(float x);
329 XPU_D float acosh(float x);
330 XPU_D float acospi(float x);
331 
332 XPU_D float asin(float x);
333 XPU_D float asinh(float x);
334 XPU_D float asinpi(float x);
335 
336 XPU_D float atan(float x);
337 XPU_D float atan2(float y, float x);
338 XPU_D float atanh(float x);
339 XPU_D float atanpi(float x);
340 XPU_D float atan2pi(float y, float x);
341 
342 XPU_D float cbrt(float x);
343 
344 XPU_D float ceil(float x);
345 
346 XPU_D float copysign(float x, float y);
347 
348 XPU_D float cos(float x);
349 XPU_D float cosh(float x);
350 XPU_D float cospi(float x);
351 
352 // Not supported by HIP or c++11 or sycl
353 // XPU_D float cyl_bessel_i0f(float x);
354 // XPU_D float cyl_bessel_i1f(float x);
355 
356 XPU_D float erf(float x);
357 
358 // Not supported by c++ stdlib
359 // XPU_D float erfinv(float y);
360 
361 XPU_D float erfc(float x);
362 
363 // Not supported by c++ stdlib or sycl
364 // XPU_D float erfcinv(float y);
365 
366 // Not supported by c++ stdlib or sycl
367 // XPU_D float erfcx(float x);
368 
369 XPU_D float exp(float x);
370 XPU_D float exp2(float x);
371 XPU_D float exp10(float x);
372 XPU_D float expm1(float x);
373 
374 XPU_D float fdim(float x, float y);
375 
376 XPU_D float floor(float x);
377 
378 XPU_D float fma(float x, float y, float z);
379 
380 XPU_D float fmod(float x, float y);
381 
382 // Not supported by HIP (as of 4.5)
383 // XPU_D float frexp(float x, int *nptr);
384 
385 XPU_D float hypot(float x, float y);
386 
387 XPU_D int ilogb(float x);
388 
389 XPU_D bool isfinite(float a);
390 
391 XPU_D bool isinf(float a);
392 
393 XPU_D bool isnan(float a);
394 
395 // Not supported by SYCL
396 // XPU_D float j0(float x);
397 // XPU_D float j1(float x);
398 // XPU_D float jn(int n, float x);
399 
400 XPU_D float ldexp(float x, int exp);
401 
402 // single-precision version not supported by HIP (as of 4.5)
403 // XPU_D float lgamma(float x);
404 
405 XPU_D long long int llrint(float x);
406 
407 XPU_D long long int llround(float x);
408 
409 XPU_D float log(float x);
410 XPU_D float log10(float x);
411 XPU_D float log1p(float x);
412 XPU_D float log2(float x);
413 XPU_D float logb(float x);
414 
415 XPU_D long int lrint(float x);
416 
417 XPU_D long int lround(float x);
418 
419 XPU_D int max(int a, int b);
420 XPU_D unsigned int max(unsigned int a, unsigned int b);
421 XPU_D long long int max(long long int a, long long int b);
422 XPU_D unsigned long long int max(unsigned long long int a, unsigned long long int b);
423 XPU_D float max(float a, float b);
424 
425 XPU_D int min(int a, int b);
426 XPU_D unsigned int min(unsigned int a, unsigned int b);
427 XPU_D long long int min(long long int a, long long int b);
428 XPU_D unsigned long long int min(unsigned long long int a, unsigned long long int b);
429 XPU_D float min(float a, float b);
430 
431 // Not supported by HIP (as of 4.5)
432 // XPU_D float modf(float x, float *iptr);
433 
434 XPU_D float nan(const char *tagp);
435 
436 // Not supported by SYCL
437 // XPU_D float nearbyint(float x);
438 
439 // Not supported by HIP (as of 4.5)
440 // XPU_D float nextafter(float x, float y);
441 
442 // Not supported by SYCL
443 // XPU_D float norm(int dim, const float *a);
444 XPU_D float norm3d(float a, float b, float c);
445 XPU_D float norm4d(float a, float b, float c, float d);
446 
447 // Not supported by c++ stdlib (TODO: provide own implementation?)
448 // XPU_D float normcdf(float y);
449 // XPU_D float normcdfinv(float y);
450 
451 XPU_D float pow(float x, float y);
452 
453 // Not supported by CUDA
454 // XPU_D float pown(float x, int n);
455 // XPU_D float powr(float x, float y);
456 
457 XPU_D float rcbrt(float x);
458 
459 XPU_D float remainder(float x, float y);
460 
461 XPU_D float remquo(float x, float y, int *quo);
462 
463 XPU_D float rint(float x);
464 
465 XPU_D float rhypot(float x, float y);
466 
467 // Not supported by SYCL
468 // XPU_D float rnorm(int dim, const float *a);
469 XPU_D float rnorm3d(float a, float b, float c);
470 XPU_D float rnorm4d(float a, float b, float c, float d);
471 
472 XPU_D float round(float x);
473 
474 XPU_D float rsqrt(float x);
475 
476 // Not supported by SYCL
477 // XPU_D float scalbln(float x, long int n);
478 // XPU_D float scalbn(float x, int n);
479 
480 XPU_D bool signbit(float a);
481 
482 XPU_D void sincos(float x, float *sptr, float *cptr);
483 XPU_D void sincospi(float x, float *sptr, float *cptr);
484 
485 XPU_D float sin(float x);
486 XPU_D float sinh(float x);
487 XPU_D float sinpi(float x);
488 
489 XPU_D float sqrt(float x);
490 
491 XPU_D float tan(float x); // Tangent family; tanpi is declared once (the repeated declaration was redundant)
492 XPU_D float tanh(float x);
493 XPU_D float tanpi(float x);
496 
497 XPU_D float tgamma(float x);
498 
499 XPU_D float trunc(float x);
500 
501 // Not supported by SYCL
502 // XPU_D float y0(float x);
503 // XPU_D float y1(float x);
504 // XPU_D float yn(int n, float x);
505 
506 XPU_D int atomic_cas(int *addr, int compare, int val);
507 XPU_D unsigned int atomic_cas(unsigned int *addr, unsigned int compare, unsigned int val);
508 XPU_D float atomic_cas(float *addr, float compare, float val);
509 XPU_D int atomic_cas_block(int *addr, int compare, int val);
510 XPU_D unsigned int atomic_cas_block(unsigned int *addr, unsigned int compare, unsigned int val);
511 XPU_D float atomic_cas_block(float *addr, float compare, float val);
512 
513 XPU_D int atomic_add(int *addr, int val);
514 XPU_D unsigned int atomic_add(unsigned int *addr, unsigned int val);
515 XPU_D float atomic_add(float *addr, float val);
516 XPU_D int atomic_add_block(int *addr, int val);
517 XPU_D unsigned int atomic_add_block(unsigned int *addr, unsigned int val);
518 XPU_D float atomic_add_block(float *addr, float val);
519 
520 XPU_D int atomic_sub(int *addr, int val);
521 XPU_D unsigned int atomic_sub(unsigned int *addr, unsigned int val);
522 XPU_D int atomic_sub_block(int *addr, int val);
523 XPU_D unsigned int atomic_sub_block(unsigned int *addr, unsigned int val);
524 
525 XPU_D int atomic_and(int *addr, int val);
526 XPU_D unsigned int atomic_and(unsigned int *addr, unsigned int val);
527 XPU_D int atomic_and_block(int *addr, int val);
528 XPU_D unsigned int atomic_and_block(unsigned int *addr, unsigned int val);
529 
530 XPU_D int atomic_or(int *addr, int val);
531 XPU_D unsigned int atomic_or(unsigned int *addr, unsigned int val);
532 XPU_D int atomic_or_block(int *addr, int val);
533 XPU_D unsigned int atomic_or_block(unsigned int *addr, unsigned int val);
534 
535 XPU_D int atomic_xor(int *addr, int val);
536 XPU_D unsigned int atomic_xor(unsigned int *addr, unsigned int val);
537 XPU_D int atomic_xor_block(int *addr, int val);
538 XPU_D unsigned int atomic_xor_block(unsigned int *addr, unsigned int val);
539 
540 XPU_D int float_as_int(float val);
541 XPU_D float int_as_float(int val);
542 
543 
544 /**
545  * Sync all threads in a block.
546  */
547 XPU_D void barrier(tpos &);
548 
549 /**
550  * Convenience overload: calls barrier(ctx.pos()).
551  * ContextT must provide a pos() member whose result barrier() accepts
552  * (kernel_context::pos() returns tpos &).
553  */
554 template <typename ContextT>
555 XPU_D void barrier(ContextT &ctx) { barrier(ctx.pos()); }
556 
557 /**
558  * Parallel scan inside a block.
559  */
560 template<typename T, int BlockSize, xpu::driver_t Impl=XPU_COMPILATION_TARGET>
561 class block_scan {
562 
563 public:
564  /**
565   * Temporary storage for the block scan. Should be allocated in shared memory.
566   */
567  struct storage_t {};
568 
569  /**
570   * Construct a block scan object.
571   */
578  template<typename ContextT>
579  XPU_D block_scan(ContextT &ctx, storage_t &storage);
580 
581  /**
582   * Construct a block scan object.
583   */
589  XPU_D block_scan(tpos &pos, storage_t &storage);
590  // NOTE(review): the ScanOp overloads below keep the *_sum name although they take a custom operator and an initial value.
591  XPU_D void exclusive_sum(T input, T &output);
592 
593  template<typename ScanOp>
594  XPU_D void exclusive_sum(T input, T &output, T initial_value, ScanOp scan_op);
595 
596  XPU_D void inclusive_sum(T input, T &output);
597 
598  template<typename ScanOp>
599  XPU_D void inclusive_sum(T input, T &output, T initial_value, ScanOp scan_op);
600 };
601 
602 template <typename T, int BlockSize, xpu::driver_t Impl = XPU_COMPILATION_TARGET>
603 class block_reduce
604 {
605  // Same construction pattern as block_scan: an empty storage_t plus constructors taking a context or a tpos.
606 public:
607  struct storage_t {};
608 
609  template<typename ContextT>
610  XPU_D block_reduce(ContextT &ctx, storage_t &storage);
611 
612  XPU_D block_reduce(tpos &, storage_t &);
613 
614  XPU_D T sum(T input);
615 
616  template<typename ReduceOp>
617  XPU_D T reduce(T input, ReduceOp reduce_op); // Like sum(), but with a caller-supplied operator
618 };
619 
620 template <typename Key, typename KeyValueType, int BlockSize, int ItemsPerThread = 8, xpu::driver_t Impl = XPU_COMPILATION_TARGET>
621 class block_sort
622 {
623  // ItemsPerThread defaults to 8; Impl defaults to XPU_COMPILATION_TARGET.
624 public:
625  struct storage_t {};
626 
627  XPU_D block_sort(tpos &, storage_t &);
628  // NOTE(review): sort() presumably returns whichever of vals/buf ends up holding the sorted data - confirm in the platform implementation.
629  template<typename KeyGetter>
630  XPU_D KeyValueType *sort(KeyValueType *vals, size_t N, KeyValueType *buf, KeyGetter &&getKey);
631 };
632 
633 template<typename Key, int BlockSize, int ItemsPerThread=8, xpu::driver_t Impl=XPU_COMPILATION_TARGET>
634 class block_merge {
635  // ItemsPerThread defaults to 8; Impl defaults to XPU_COMPILATION_TARGET.
636 public:
637  struct storage_t {};
638 
639  XPU_D block_merge(tpos &, storage_t &);
640  // Takes two input ranges (a, size_a) and (b, size_b), a destination pointer and a comparator; declared only in this file.
641  template<typename Compare>
642  XPU_D void merge(const Key *a, size_t size_a, const Key *b, size_t size_b, Key *dst, Compare &&);
643 
644 };
645 
646 } // namespace xpu
647 
648 #include "detail/dynamic_loader.h"
649 
650 #if XPU_IS_HIP_CUDA
651 #include "detail/platform/hip_cuda/device.h"
652 #elif XPU_IS_SYCL
653 #include "detail/platform/sycl/device.h"
654 #elif XPU_IS_CPU
655 #include "detail/platform/cpu/device.h"
656 #else
657 #error "Unknown XPU driver."
658 #endif
659 
660 #include "detail/constants.h"
661 #include "detail/view_impl.h"
662 
663 #endif
Definition: device.h:634
XPU_D block_merge(tpos &, storage_t &)
XPU_D void merge(const Key *a, size_t size_a, const Key *b, size_t size_b, Key *dst, Compare &&)
Definition: device.h:604
XPU_D T sum(T input)
XPU_D T reduce(T input, ReduceOp reduce_op)
XPU_D block_reduce(tpos &, storage_t &)
XPU_D block_reduce(ContextT &ctx, storage_t &storage)
Parallel scan inside a block.
Definition: device.h:561
XPU_D void inclusive_sum(T input, T &output, T initial_value, ScanOp scan_op)
XPU_D void exclusive_sum(T input, T &output)
XPU_D void inclusive_sum(T input, T &output)
XPU_D void exclusive_sum(T input, T &output, T initial_value, ScanOp scan_op)
XPU_D block_scan(ContextT &ctx, storage_t &storage)
Construct a block scan object.
XPU_D block_scan(tpos &pos, storage_t &storage)
Construct a block scan object.
Definition: device.h:622
XPU_D KeyValueType * sort(KeyValueType *vals, size_t N, KeyValueType *buf, KeyGetter &&getKey)
XPU_D block_sort(tpos &, storage_t &)
Definition: common.h:86
Definition: device.h:109
XPU_D const Constant::data_t & get() const
Definition: device.h:113
XPU_D cmem(detail::internal_ctor_t, Args &&... args)
Definition: device.h:120
Definition: device.h:150
XPU_D int grid_dim_z() const
Definition: device.h:240
XPU_D int block_dim_x() const
Definition: device.h:192
XPU_D int grid_dim_x() const
Definition: device.h:228
XPU_D int thread_idx_x() const
Definition: device.h:174
Constants constants
Definition: device.h:154
XPU_D int block_idx_x() const
Definition: device.h:210
XPU_D int block_idx_z() const
Definition: device.h:222
XPU_D int grid_dim_y() const
Definition: device.h:234
XPU_D int block_dim_y() const
Definition: device.h:198
XPU_D shared_memory & smem()
Definition: device.h:156
XPU_D const constants & cmem() const
Definition: device.h:168
XPU_D const C::data_t & cmem() const
Definition: device.h:163
XPU_D int thread_idx_y() const
Definition: device.h:180
XPU_D const tpos & pos() const
Definition: device.h:243
XPU_D int block_dim_z() const
Definition: device.h:204
XPU_D kernel_context(detail::internal_ctor_t, tpos &pos, shared_memory &smem, const constants &cmem)
Definition: device.h:251
SharedMemory shared_memory
Definition: device.h:153
XPU_D int thread_idx_z() const
Definition: device.h:186
XPU_D const shared_memory & smem() const
Definition: device.h:157
XPU_D int block_idx_y() const
Definition: device.h:216
XPU_D tpos & pos()
Definition: device.h:242
Definition: device.h:78
XPU_D int block_idx_y() const
Definition: device.h:90
XPU_D int block_dim_z() const
Definition: device.h:87
XPU_D int block_dim_y() const
Definition: device.h:86
XPU_D int thread_idx_z() const
Definition: device.h:83
XPU_D detail::tpos_impl & impl(detail::internal_fn_t)
Definition: device.h:105
XPU_D int block_idx_x() const
Definition: device.h:89
XPU_D int block_idx_z() const
Definition: device.h:91
XPU_D int thread_idx_x() const
Definition: device.h:81
XPU_D int grid_dim_y() const
Definition: device.h:94
XPU_D tpos(detail::internal_ctor_t, Args &&... args)
Definition: device.h:102
XPU_D int thread_idx_y() const
Definition: device.h:82
XPU_D int block_dim_x() const
Definition: device.h:85
XPU_D int grid_dim_z() const
Definition: device.h:95
XPU_D int grid_dim_x() const
Definition: device.h:93
Definition: device.h:265
view()=default
XPU_D bool empty() const
Definition: device.h:296
XPU_D size_t size() const
Definition: device.h:291
XPU_D T & operator[](size_t idx)
XPU_D const T & operator[](size_t idx) const
XPU_D T & at(size_t idx)
XPU_D view(T *data, size_t size)
XPU_D const T & at(size_t idx) const
XPU_D T * data() const
Definition: device.h:286
XPU_D view(buffer< T > &buffer, size_t size)
Common definitions for xpu.
Defines for xpu.
#define XPU_D
Function specifier for device functions. (Replaces device)
Definition: defines.h:21
xpu default namespace.
Definition: common.h:17
XPU_D float cos(float x)
XPU_D float atan2(float y, float x)
XPU_D float tgamma(float x)
XPU_D float log(float x)
XPU_D int max(int a, int b)
XPU_D long int lrint(float x)
XPU_D float tanpi(float x)
XPU_D float nan(const char *tagp)
constexpr XPU_D float sqrt2()
XPU_D int atomic_and(int *addr, int val)
XPU_D float acospi(float x)
XPU_D bool isnan(float a)
XPU_D float log1p(float x)
XPU_D int atomic_xor(int *addr, int val)
XPU_D int ilogb(float x)
XPU_D float rsqrt(float x)
XPU_D int atomic_xor_block(int *addr, int val)
XPU_D float norm3d(float a, float b, float c)
XPU_D float rcbrt(float x)
XPU_D int float_as_int(float val)
XPU_D float rhypot(float x, float y)
XPU_D float rnorm3d(float a, float b, float c)
constexpr XPU_D float pi_2()
XPU_D float remquo(float x, float y, int *quo)
XPU_D float cosh(float x)
XPU_D float round(float x)
XPU_D float ldexp(float x, int exp)
constexpr driver_t compilation_target
Definition: device.h:42
XPU_D float erf(float x)
XPU_D float log2(float x)
XPU_D float ceil(float x)
XPU_D int atomic_or(int *addr, int val)
XPU_D float cospi(float x)
XPU_D float atanpi(float x)
XPU_D float exp10(float x)
XPU_D float tanh(float x)
XPU_D bool isfinite(float a)
XPU_D float rint(float x)
XPU_D float pow(float x, float y)
XPU_D int abs(int x)
XPU_D int atomic_cas_block(int *addr, int compare, int val)
XPU_D float norm4d(float a, float b, float c, float d)
XPU_D float remainder(float x, float y)
XPU_D float sinh(float x)
XPU_D int atomic_sub_block(int *addr, int val)
XPU_D float exp2(float x)
XPU_D float exp(float x)
XPU_D float erfc(float x)
XPU_D float hypot(float x, float y)
XPU_D float sin(float x)
XPU_D float tan(float x)
XPU_D float fma(float x, float y, float z)
XPU_D float fmod(float x, float y)
XPU_D float atan2pi(float y, float x)
XPU_D bool signbit(float a)
XPU_D float rnorm4d(float a, float b, float c, float d)
XPU_D float trunc(float x)
XPU_D int atomic_sub(int *addr, int val)
XPU_D float atanh(float x)
XPU_D float floor(float x)
XPU_D int atomic_add_block(int *addr, int val)
XPU_D float atan(float x)
XPU_D float logb(float x)
XPU_D bool isinf(float a)
constexpr XPU_D float pi_4()
constexpr XPU_D float deg_to_rad()
XPU_D float asin(float x)
XPU_D void sincos(float x, float *sptr, float *cptr)
XPU_D float asinpi(float x)
XPU_D int atomic_add(int *addr, int val)
XPU_D float log10(float x)
XPU_D float acos(float x)
XPU_D float sqrt(float x)
XPU_D float copysign(float x, float y)
schedule_t
OpenMP schedule types. Used for specifying the schedule type for kernels.
Definition: device.h:56
@ schedule_dynamic
Definition: device.h:58
@ schedule_static
Definition: device.h:57
XPU_D int atomic_or_block(int *addr, int val)
XPU_D void sincospi(float x, float *sptr, float *cptr)
XPU_D float fdim(float x, float y)
XPU_D long long int llround(float x)
XPU_D int atomic_cas(int *addr, int compare, int val)
XPU_D float asinh(float x)
XPU_D float sinpi(float x)
XPU_D int min(int a, int b)
XPU_D float int_as_float(int val)
XPU_D long long int llrint(float x)
XPU_D long int lround(float x)
XPU_D int atomic_and_block(int *addr, int val)
XPU_D float expm1(float x)
XPU_D void barrier(tpos &)
Sync all threads in a block.
XPU_D float acosh(float x)
XPU_D float cbrt(float x)
constexpr XPU_D float pi()
driver_t
Definition: common.h:19
Definition: device.h:637
Definition: device.h:607
Temporary storage for the block scan. Should be allocated in shared memory.
Definition: device.h:567
Definition: device.h:47
static constexpr xpu::dim value
Definition: device.h:48
Definition: device.h:625
Definition: device.h:145
Data data_t
Definition: device.h:146
Definition: device.h:44
Definition: common.h:26
Definition: device.h:141
Definition: device.h:127
Definition: device.h:51
OpenMP settings for kernels.
Definition: device.h:65
static constexpr size_t chunk_size
Chunk size. Use 0 for default value from OpenMP.
Definition: device.h:75
static constexpr schedule_t schedule
OpenMP schedule type.
Definition: device.h:70