17 #include "detail/common.h"
18 #include "detail/type_info.h"
21 #include "detail/platform/cpu/cmem_impl.h"
22 #include "detail/platform/cpu/tpos_impl.h"
24 #include "detail/platform/hip_cuda/cmem_impl.h"
25 #include "detail/platform/hip_cuda/tpos_impl.h"
27 #include "detail/platform/sycl/cmem_impl.h"
28 #include "detail/platform/sycl/tpos_impl.h"
30 #error "Unsupported XPU target"
33 #include <type_traits>
35 #define XPU_IMAGE(image) XPU_DETAIL_IMAGE(image)
36 #define XPU_EXPORT(obj) XPU_DETAIL_EXPORT(obj)
38 #define XPU_ASSERT(x) XPU_DETAIL_ASSERT(x)
46 template<
int X,
int Y = -1,
int Z = -1>
64 template<schedule_t Schedule = schedule_static,
size_t ChunkSize = 0>
98 detail::tpos_impl m_impl;
101 template<
typename... Args>
103 : m_impl(std::forward<Args>(args)...) {}
// Accessor that returns a mutable reference to the m_impl member.
// The unnamed detail::internal_fn_t parameter is never read in the body.
// NOTE(review): presumably a tag type meant to keep this accessor for library-internal callers — confirm.
105 XPU_D detail::tpos_impl &
impl(detail::internal_fn_t) {
return m_impl; }
108 template<
typename... Constants>
// Looks up the entry associated with the type Constant.
// Forwards to m_impl.template get<Constant>() and returns the result as a
// const reference to Constant::data_t. The method is const-qualified.
112 template<
typename Constant>
113 XPU_D const typename Constant::data_t &
get()
const {
return m_impl.template get<Constant>(); }
116 detail::cmem_impl<Constants...> m_impl;
119 template<
typename... Args>
121 : m_impl(std::forward<Args>(args)...) {}
126 template<
typename Image>
127 struct kernel : detail::action<Image, detail::kernel_tag> {
140 template<
typename Image>
141 struct function : detail::action<Image, detail::function_tag> {
144 template<
typename Image,
typename Data>
145 struct constant : detail::action<Image, detail::constant_tag> {
149 template<
typename SharedMemory = xpu::no_smem,
typename Constants = xpu::cmem<>>
// Returns a const reference to C::data_t obtained from m_cmem.template get<C>().
// NOTE(review): the template header that introduces C is not part of this listing —
// presumably `template<typename C>`; confirm against the full header.
163 XPU_D const typename C::data_t &
cmem()
const {
return m_cmem.template get<C>(); }
// Declarations of the max overloads for unsigned int, long long int and
// unsigned long long int. No bodies are visible in this listing.
// NOTE(review): presumably each returns the larger of a and b — confirm against the platform implementation.
420 XPU_D unsigned int max(
unsigned int a,
unsigned int b);
421 XPU_D long long int max(
long long int a,
long long int b);
422 XPU_D unsigned long long int max(
unsigned long long int a,
unsigned long long int b);
// Declarations of the min overloads for unsigned int, long long int and
// unsigned long long int. No bodies are visible in this listing.
// NOTE(review): presumably each returns the smaller of a and b — confirm against the platform implementation.
426 XPU_D unsigned int min(
unsigned int a,
unsigned int b);
427 XPU_D long long int min(
long long int a,
long long int b);
428 XPU_D unsigned long long int min(
unsigned long long int a,
unsigned long long int b);
// Declaration of the unsigned int overload of atomic_cas. The body is not shown here.
// NOTE(review): presumably an atomic compare-and-swap on *addr (store val when *addr == compare,
// return the previous value) — confirm against the platform implementation.
507 XPU_D unsigned int atomic_cas(
unsigned int *addr,
unsigned int compare,
unsigned int val);
554 template <
typename ContextT>
560 template<
typename T,
int BlockSize, xpu::driver_t Impl=XPU_COMPILATION_TARGET>
578 template<
typename ContextT>
593 template<
typename ScanOp>
598 template<
typename ScanOp>
602 template <
typename T,
int BlockSize, xpu::driver_t Impl = XPU_COMPILATION_TARGET>
609 template<
typename ContextT>
616 template<
typename ReduceOp>
620 template <
typename Key,
typename KeyValueType,
int BlockSize,
int ItemsPerThread = 8, xpu::driver_t Impl = XPU_COMPILATION_TARGET>
// Declaration only. Takes a KeyValueType pointer vals, a size_t N, a second KeyValueType
// pointer buf and a callable getKey (forwarding reference); returns a KeyValueType pointer.
// NOTE(review): presumably N is the element count, getKey extracts the sort key from an element,
// and the returned pointer is whichever of vals / buf ends up holding the sorted data — confirm.
629 template<
typename KeyGetter>
630 XPU_D KeyValueType *
sort(KeyValueType *vals,
size_t N, KeyValueType *buf, KeyGetter &&getKey);
633 template<
typename Key,
int BlockSize,
int ItemsPerThread=8, xpu::driver_t Impl=XPU_COMPILATION_TARGET>
// Declaration only. Takes two const Key pointers a and b with their sizes size_a and size_b,
// an output pointer dst, and an unnamed Compare forwarding reference; returns nothing.
// NOTE(review): presumably merges two already-sorted inputs into dst, which would then need
// room for size_a + size_b keys — confirm.
641 template<
typename Compare>
642 XPU_D void merge(
const Key *a,
size_t size_a,
const Key *b,
size_t size_b, Key *dst, Compare &&);
648 #include "detail/dynamic_loader.h"
651 #include "detail/platform/hip_cuda/device.h"
653 #include "detail/platform/sycl/device.h"
655 #include "detail/platform/cpu/device.h"
657 #error "Unknown XPU driver."
660 #include "detail/constants.h"
661 #include "detail/view_impl.h"
XPU_D block_merge(tpos &, storage_t &)
XPU_D void merge(const Key *a, size_t size_a, const Key *b, size_t size_b, Key *dst, Compare &&)
XPU_D T reduce(T input, ReduceOp reduce_op)
XPU_D block_reduce(tpos &, storage_t &)
XPU_D block_reduce(ContextT &ctx, storage_t &storage)
Parallel scan inside a block.
Definition: device.h:561
XPU_D void inclusive_sum(T input, T &output, T initial_value, ScanOp scan_op)
XPU_D void exclusive_sum(T input, T &output)
XPU_D void inclusive_sum(T input, T &output)
XPU_D void exclusive_sum(T input, T &output, T initial_value, ScanOp scan_op)
XPU_D block_scan(ContextT &ctx, storage_t &storage)
Construct a block scan object.
XPU_D block_scan(tpos &pos, storage_t &storage)
Construct a block scan object.
XPU_D KeyValueType * sort(KeyValueType *vals, size_t N, KeyValueType *buf, KeyGetter &&getKey)
XPU_D block_sort(tpos &, storage_t &)
XPU_D const Constant::data_t & get() const
Definition: device.h:113
XPU_D cmem(detail::internal_ctor_t, Args &&... args)
Definition: device.h:120
XPU_D int grid_dim_z() const
Definition: device.h:240
XPU_D int block_dim_x() const
Definition: device.h:192
XPU_D int grid_dim_x() const
Definition: device.h:228
XPU_D int thread_idx_x() const
Definition: device.h:174
Constants constants
Definition: device.h:154
XPU_D int block_idx_x() const
Definition: device.h:210
XPU_D int block_idx_z() const
Definition: device.h:222
XPU_D int grid_dim_y() const
Definition: device.h:234
XPU_D int block_dim_y() const
Definition: device.h:198
XPU_D shared_memory & smem()
Definition: device.h:156
XPU_D const constants & cmem() const
Definition: device.h:168
XPU_D const C::data_t & cmem() const
Definition: device.h:163
XPU_D int thread_idx_y() const
Definition: device.h:180
XPU_D const tpos & pos() const
Definition: device.h:243
XPU_D int block_dim_z() const
Definition: device.h:204
XPU_D kernel_context(detail::internal_ctor_t, tpos &pos, shared_memory &smem, const constants &cmem)
Definition: device.h:251
SharedMemory shared_memory
Definition: device.h:153
XPU_D int thread_idx_z() const
Definition: device.h:186
XPU_D const shared_memory & smem() const
Definition: device.h:157
XPU_D int block_idx_y() const
Definition: device.h:216
XPU_D tpos & pos()
Definition: device.h:242
XPU_D int block_idx_y() const
Definition: device.h:90
XPU_D int block_dim_z() const
Definition: device.h:87
XPU_D int block_dim_y() const
Definition: device.h:86
XPU_D int thread_idx_z() const
Definition: device.h:83
XPU_D detail::tpos_impl & impl(detail::internal_fn_t)
Definition: device.h:105
XPU_D int block_idx_x() const
Definition: device.h:89
XPU_D int block_idx_z() const
Definition: device.h:91
XPU_D int thread_idx_x() const
Definition: device.h:81
XPU_D int grid_dim_y() const
Definition: device.h:94
XPU_D tpos(detail::internal_ctor_t, Args &&... args)
Definition: device.h:102
XPU_D int thread_idx_y() const
Definition: device.h:82
XPU_D int block_dim_x() const
Definition: device.h:85
XPU_D int grid_dim_z() const
Definition: device.h:95
XPU_D int grid_dim_x() const
Definition: device.h:93
XPU_D bool empty() const
Definition: device.h:296
XPU_D size_t size() const
Definition: device.h:291
XPU_D T & operator[](size_t idx)
XPU_D const T & operator[](size_t idx) const
XPU_D view(T *data, size_t size)
XPU_D const T & at(size_t idx) const
XPU_D T * data() const
Definition: device.h:286
XPU_D view(buffer< T > &buffer, size_t size)
Common definitions for xpu.
#define XPU_D
Function specifier for device functions. (Replaces __device__)
Definition: defines.h:21
xpu default namespace.
Definition: common.h:17
XPU_D float atan2(float y, float x)
XPU_D float tgamma(float x)
XPU_D int max(int a, int b)
XPU_D long int lrint(float x)
XPU_D float tanpi(float x)
XPU_D float nan(const char *tagp)
constexpr XPU_D float sqrt2()
XPU_D int atomic_and(int *addr, int val)
XPU_D float acospi(float x)
XPU_D bool isnan(float a)
XPU_D float log1p(float x)
XPU_D int atomic_xor(int *addr, int val)
XPU_D float rsqrt(float x)
XPU_D int atomic_xor_block(int *addr, int val)
XPU_D float norm3d(float a, float b, float c)
XPU_D float rcbrt(float x)
XPU_D int float_as_int(float val)
XPU_D float rhypot(float x, float y)
XPU_D float rnorm3d(float a, float b, float c)
constexpr XPU_D float pi_2()
XPU_D float remquo(float x, float y, int *quo)
XPU_D float cosh(float x)
XPU_D float round(float x)
XPU_D float ldexp(float x, int exp)
constexpr driver_t compilation_target
Definition: device.h:42
XPU_D float log2(float x)
XPU_D float ceil(float x)
XPU_D int atomic_or(int *addr, int val)
XPU_D float cospi(float x)
XPU_D float atanpi(float x)
XPU_D float exp10(float x)
XPU_D float tanh(float x)
XPU_D bool isfinite(float a)
XPU_D float rint(float x)
XPU_D float pow(float x, float y)
XPU_D int atomic_cas_block(int *addr, int compare, int val)
XPU_D float norm4d(float a, float b, float c, float d)
XPU_D float remainder(float x, float y)
XPU_D float sinh(float x)
XPU_D int atomic_sub_block(int *addr, int val)
XPU_D float exp2(float x)
XPU_D float erfc(float x)
XPU_D float hypot(float x, float y)
XPU_D float fma(float x, float y, float z)
XPU_D float fmod(float x, float y)
XPU_D float atan2pi(float y, float x)
XPU_D bool signbit(float a)
XPU_D float rnorm4d(float a, float b, float c, float d)
XPU_D float trunc(float x)
XPU_D int atomic_sub(int *addr, int val)
XPU_D float atanh(float x)
XPU_D float floor(float x)
XPU_D int atomic_add_block(int *addr, int val)
XPU_D float atan(float x)
XPU_D float logb(float x)
XPU_D bool isinf(float a)
constexpr XPU_D float pi_4()
constexpr XPU_D float deg_to_rad()
XPU_D float asin(float x)
XPU_D void sincos(float x, float *sptr, float *cptr)
XPU_D float asinpi(float x)
XPU_D int atomic_add(int *addr, int val)
XPU_D float log10(float x)
XPU_D float acos(float x)
XPU_D float sqrt(float x)
XPU_D float copysign(float x, float y)
schedule_t
OpenMP schedule types. Used for specifying the schedule type for kernels.
Definition: device.h:56
@ schedule_dynamic
Definition: device.h:58
@ schedule_static
Definition: device.h:57
XPU_D int atomic_or_block(int *addr, int val)
XPU_D void sincospi(float x, float *sptr, float *cptr)
XPU_D float fdim(float x, float y)
XPU_D long long int llround(float x)
XPU_D int atomic_cas(int *addr, int compare, int val)
XPU_D float asinh(float x)
XPU_D float sinpi(float x)
XPU_D int min(int a, int b)
XPU_D float int_as_float(int val)
XPU_D long long int llrint(float x)
XPU_D long int lround(float x)
XPU_D int atomic_and_block(int *addr, int val)
XPU_D float expm1(float x)
XPU_D void barrier(tpos &)
Sync all threads in a block.
XPU_D float acosh(float x)
XPU_D float cbrt(float x)
constexpr XPU_D float pi()
driver_t
Definition: common.h:19
Temporary storage for the block scan. Should be allocated in shared memory.
Definition: device.h:567
static constexpr xpu::dim value
Definition: device.h:48
Data data_t
Definition: device.h:146
OpenMP settings for kernels.
Definition: device.h:65
static constexpr size_t chunk_size
Chunk size. Use 0 for default value from OpenMP.
Definition: device.h:75
static constexpr schedule_t schedule
OpenMP schedule type.
Definition: device.h:70