xpu
host.h
Go to the documentation of this file.
1 
9 #ifndef XPU_HOST_H
10 #define XPU_HOST_H
11 
12 #include "defines.h"
13 #include "common.h"
14 #include "detail/common.h"
15 
16 #include <numeric>
17 #include <cstddef>
18 #include <cstdio>
19 #include <cstring>
20 #include <functional>
21 #include <utility>
22 #include <string>
23 #include <string_view>
24 #include <type_traits>
25 #include <vector>
26 
30 namespace xpu {
31 
35 enum direction {
39  h2d = detail::dir_h2d,
40 
44  d2h = detail::dir_d2h,
45 };
46 
/// Exception type that owns a copy of its message text.
class exception : public std::exception {

public:
    /// Copy the characters of `message_` into the exception object.
    explicit exception(std::string_view message_)
        : m_text(message_) {}

    /// Return the stored message as a null-terminated C string.
    /// The pointer stays valid for as long as this object is alive.
    const char *what() const noexcept override {
        return m_text.c_str();
    }

private:
    std::string m_text;
};
58 
62 struct settings {
70  std::string device = "cpu";
71 
79  bool verbose = false;
80 
85  std::function<void(std::string_view)> logging_sink = [](std::string_view msg) {
86  // Use c functions for output to avoid including iostream in host.h ...
87  std::fwrite(msg.data(), 1, msg.size(), stderr);
88  std::fputc('\n', stderr);
89  };
90 
96  bool profile = false;
97 
103  std::vector<driver_t> excluded_backends = {};
104 };
105 
115 inline void initialize(settings = {});
116 
123 template<typename I>
124 void preload();
125 
130 void *malloc_device(size_t size_bytes);
131 
138 template<typename T>
139 T *malloc_device(size_t elems);
140 
145 void *malloc_pinned(size_t size_bytes);
146 
153 template<typename T>
154 T *malloc_pinned(size_t elems);
155 
162 template<typename T>
163 T *malloc_host(size_t elems);
164 
169 void *malloc_managed(size_t size_bytes);
170 
177 template<typename T>
178 T *malloc_managed(size_t elems);
179 
184 inline void free(void *);
185 
189 void stack_alloc(size_t size);
190 
195 void stack_pop(void *head=nullptr);
196 
200 class device {
201 
202 public:
206  static std::vector<device> all();
207 
211  static device active();
212 
217 
222  explicit device(std::string_view xpuid);
223 
230  explicit device(int id);
231 
235  explicit device(driver_t driver, int device_nr);
236 
237  device(const device &) = default;
238  device(device &&) = default;
239  device &operator=(const device &) = default;
240  device &operator=(device &&) = default;
241 
245  int id() const { return m_impl.id; }
246 
250  driver_t backend() const { return static_cast<driver_t>(m_impl.backend); }
251 
255  int device_nr() const { return m_impl.device_nr; }
256 
257 private:
258  detail::device m_impl;
259 
260 public:
262  explicit device(detail::device impl) : m_impl(std::move(impl)) {}
263 
265  detail::device &impl() { return m_impl; }
266 };
267 
271 class device_prop {
272 
273 public:
274  device_prop() = delete;
275 
280 
284  std::string_view name() const { return m_prop.name; }
285 
289  driver_t backend() const { return static_cast<driver_t>(m_prop.driver); }
290 
294  std::string_view arch() const { return m_prop.arch; }
295 
299  size_t shared_mem_size() const { return m_prop.shared_mem_size; }
300 
304  size_t const_mem_size() const { return m_prop.const_mem_size; }
305 
309  size_t warp_size() const { return m_prop.warp_size; }
310 
314  size_t max_threads_per_block() const { return m_prop.max_threads_per_block; }
315 
319  std::array<size_t, 3> max_grid_size() const { return m_prop.max_grid_size; }
320 
325  std::string_view xpuid() const { return m_prop.xpuid; }
326 
330  int id() const { return m_prop.id; }
331 
335  int device_nr() const { return m_prop.device_nr; }
336 
340  size_t global_mem_total() const { return m_prop.global_mem_total; }
341 
345  size_t global_mem_available() const { return m_prop.global_mem_available; }
346 
347 private:
348  detail::device_prop m_prop;
349 };
350 
354 class queue {
355 
356 public:
360  queue();
361 
365  explicit queue(device);
366 
367  template<typename T>
369 
370  template<typename T>
371  void copy(const T *from, T *to, size_t size) { memcpy(to, from, size * sizeof(T)); }
372 
373  void memcpy(void *dst, const void *src, size_t size_bytes);
374 
375  void memset(void *dst, int value, size_t size_bytes);
376 
377  template<typename T>
378  void memset(buffer<T>, int value);
379 
380  template<typename Kernel, typename... Args>
381  void launch(grid params, Args&&... args);
382 
383  void wait();
384 
385 private:
386  std::shared_ptr<detail::queue_handle> m_handle;
387 
388  void do_copy(const void *from, void *to, size_t size, double *ms);
389  void log_copy(const void *from, const void *to, size_t size);
390 };
391 
/// Return a name for the kernel type `Kernel` as a C string.
template<typename Kernel>
const char *get_name();

/// Invoke `Func`, forwarding `args` to it.
/// NOTE(review): where `Func` executes is not visible from this header - confirm.
template<typename Func, typename... Args>
void call(Args &&...args);

/// Set the value of `C` from `symbol`, which has type `C::data_t`.
/// NOTE(review): `C` is presumably a device constant - confirm.
template<typename C>
void set(const typename C::data_t &symbol);
401 
409 template<typename T>
410 class h_view {
411 
412 public:
413  using value_type = T;
414 
418  h_view() : m_data(nullptr), m_size(0) {}
419 
423  explicit h_view(buffer<T> &);
424 
428  T *data() { return m_data; }
429  const T *data() const { return m_data; }
430 
434  size_t size() const { return m_size; }
435 
439  size_t size_bytes() const { return m_size * sizeof(T); }
440 
444  bool empty() const { return m_size == 0; }
445 
446  T *begin() { return m_data; }
447  const T *begin() const { return m_data; }
448 
449  T *end() { return m_data + m_size; }
450  const T *end() const { return m_data + m_size; }
451 
452  T &front() { return at(0); }
453  const T &front() const { return at(0); }
454 
455  T &back() { return at(m_size - 1); }
456  const T &back() const { return at(m_size - 1); }
457 
463  T &operator[](size_t i);
464  const T &operator[](size_t i) const;
465 
471  T &at(size_t i);
472  const T &at(size_t i) const;
473 
478  T &unsafe_at(size_t i) { return m_data[i]; }
479  const T &unsafe_at(size_t i) const { return m_data[i]; }
480 
481 private:
482  T *m_data;
483  size_t m_size;
484 
485 public:
487  h_view(T *data, size_t size) : m_data(data), m_size(size) {}
488 };
489 
493 enum class mem_type {
498  pinned = detail::mem_pinned,
499 
503  device = detail::mem_device,
504 
509  managed = detail::mem_managed,
510 
515  host = detail::mem_host,
516 };
517 
522 class ptr_prop {
523 
524 public:
525  ptr_prop() = delete;
526 
531  explicit ptr_prop(const void *);
532 
536  void *ptr() const { return m_prop.ptr; }
537 
542  mem_type type() const { return static_cast<mem_type>(m_prop.type); }
543 
547  bool is_host() const { return m_prop.type == detail::mem_host || m_prop.type == detail::mem_pinned; }
548 
552  xpu::device device() const { return xpu::device{m_prop.dev}; }
553 
558  driver_t backend() const { return static_cast<driver_t>(m_prop.dev.backend); }
559 
560 private:
561  detail::ptr_prop m_prop;
562 };
563 
564 class buffer_prop {
565 
566 public:
567  buffer_prop() = delete;
568  template<typename T>
569  explicit buffer_prop(const buffer<T> &);
570 
571  size_t size() const { return m_size; }
572  size_t size_bytes() const { return m_size_bytes; }
573  buffer_type type() const { return m_type; }
574  void *h_ptr() const { return m_host; }
575  template<typename T>
576  T *h_ptr() const { return static_cast<T *>(m_host); }
577  void *d_ptr() const { return m_device; }
578  template<typename T>
579  T *d_ptr() const { return static_cast<T *>(m_device); }
580 
581  template<typename T>
582  h_view<T> view() const { return h_view<T>{m_host, m_size}; }
583 
584 private:
585  size_t m_size_bytes;
586  size_t m_size;
587  void *m_host;
588  void *m_device;
589  buffer_type m_type;
590 };
591 
596 
597 public:
601  std::string_view name() const { return m_t.name; }
602 
606  double total() const { return std::accumulate(m_t.times.begin(), m_t.times.end(), 0.0); }
607 
611  const std::vector<double> &times() const { return m_t.times; }
612 
617  double throughput() const;
618 
619 private:
620  detail::kernel_timings m_t;
621 
622 public:
624  explicit kernel_timings(detail::kernel_timings t) : m_t(std::move(t)) {}
625 
626 };
627 
632 class timings {
633 
634 public:
635  timings() = default;
636 
640  std::string_view name() const { return m_t.name; }
641 
646  double wall() const { return m_t.wall; }
647 
653  double copy(direction dir) const {
654  return dir == h2d ? m_t.copy_h2d : m_t.copy_d2h;
655  }
656 
661  double memset() const { return m_t.memset; }
662 
667  template<typename K>
668  kernel_timings kernel() const { return kernel(get_name<K>()); }
669 
674  std::vector<kernel_timings> kernels() const;
675 
679  double kernel_time() const {
680  return std::accumulate(m_t.kernels.begin(), m_t.kernels.end(), 0.0,
681  [](double a, const auto &b) { return a + std::accumulate(b.times.begin(), b.times.end(), 0.0); });
682  }
683 
687  std::vector<timings> children() const;
688 
692  bool has_details() const { return m_t.has_details; }
693 
698  double throughput() const;
699 
703  double throughput_kernels() const;
704 
708  double throughput_copy(direction dir) const;
709 
713  double throughput_memset() const;
714 
718  void merge(const timings &other) { m_t.merge(other.m_t); }
719 
720 private:
721  detail::timings m_t;
722  kernel_timings kernel(std::string_view name) const;
723 
724 public:
726  explicit timings(detail::timings t) : m_t(std::move(t)) {}
727 
728 };
729 
734 void push_timer(std::string_view name);
735 
742 
748 
749 public:
755  scoped_timer(std::string_view name, xpu::timings *t=nullptr);
757 
758  scoped_timer(const scoped_timer&) = delete;
762 
763 private:
764  xpu::timings* m_t = nullptr;
765 };
766 
/// Add a byte count to a timer.
/// NOTE(review): presumably applies to the current timer and feeds the
/// throughput figures - confirm.
void t_add_bytes(size_t bytes);

/// Add a byte count for the kernel `Kernel`.
/// NOTE(review): presumably feeds kernel_timings::throughput() - confirm.
template<typename Kernel>
void k_add_bytes(size_t bytes);
777 
778 } // namespace xpu
779 
780 #include "impl/host.tpp"
781 
782 #endif
Definition: host.h:564
buffer_prop()=delete
buffer_prop(const buffer< T > &)
T * d_ptr() const
Definition: host.h:579
h_view< T > view() const
Definition: host.h:582
void * d_ptr() const
Definition: host.h:577
size_t size() const
Definition: host.h:571
T * h_ptr() const
Definition: host.h:576
size_t size_bytes() const
Definition: host.h:572
buffer_type type() const
Definition: host.h:573
void * h_ptr() const
Definition: host.h:574
Definition: common.h:86
Definition: host.h:271
size_t max_threads_per_block() const
Returns the max number of threads in a block.
Definition: host.h:314
driver_t backend() const
Get the backend associated with the device.
Definition: host.h:289
size_t global_mem_available() const
Returns the amount of global memory available in bytes.
Definition: host.h:345
device_prop(device)
Query properties of the given device.
std::string_view xpuid() const
Get the string used to identify the device.
Definition: host.h:325
std::array< size_t, 3 > max_grid_size() const
Returns the max grid size (three values).
Definition: host.h:319
std::string_view arch() const
Returns the architecture of the device, if applicable.
Definition: host.h:294
int device_nr() const
Get the device number within the backend.
Definition: host.h:335
int id() const
Get the device id.
Definition: host.h:330
size_t const_mem_size() const
Returns the size of constant memory in bytes.
Definition: host.h:304
size_t warp_size() const
Returns the number of threads in a warp.
Definition: host.h:309
std::string_view name() const
Get the name of the device.
Definition: host.h:284
size_t shared_mem_size() const
Returns the size of shared memory per block in bytes.
Definition: host.h:299
device_prop()=delete
size_t global_mem_total() const
Returns the total amount of global memory in bytes.
Definition: host.h:340
Definition: host.h:200
int device_nr() const
Get the device number within the backend.
Definition: host.h:255
static device active()
Get the active device.
device(driver_t driver, int device_nr)
Construct device from driver and device number.
device()
Construct CPU device.
int id() const
Get the device id.
Definition: host.h:245
device(device &&)=default
device(std::string_view xpuid)
Lookup device by string.
driver_t backend() const
Get the backend associated with the device.
Definition: host.h:250
static std::vector< device > all()
Get all available devices.
device & operator=(device &&)=default
detail::device & impl()
Definition: host.h:265
device(int id)
Construct device from device id.
device & operator=(const device &)=default
device(detail::device impl)
Definition: host.h:262
device(const device &)=default
Definition: host.h:47
const char * what() const noexcept override
Definition: host.h:52
exception(std::string_view message_)
Definition: host.h:50
Create a view from a buffer. Create a view from a buffer to access the underlying data on the host....
Definition: host.h:410
T & operator[](size_t i)
const T * end() const
Definition: host.h:450
size_t size_bytes() const
Definition: host.h:439
T & unsafe_at(size_t i)
Definition: host.h:478
const T & front() const
Definition: host.h:453
const T * data() const
Definition: host.h:429
T & back()
Definition: host.h:455
const T & operator[](size_t i) const
const T * begin() const
Definition: host.h:447
T & front()
Definition: host.h:452
T * end()
Definition: host.h:449
size_t size() const
Definition: host.h:434
T & at(size_t i)
bool empty() const
Definition: host.h:444
T value_type
Definition: host.h:413
T * begin()
Definition: host.h:446
const T & at(size_t i) const
const T & back() const
Definition: host.h:456
const T & unsafe_at(size_t i) const
Definition: host.h:479
h_view()
Create an empty view.
Definition: host.h:418
h_view(buffer< T > &)
Create a view from a buffer.
T * data()
Definition: host.h:428
h_view(T *data, size_t size)
Definition: host.h:487
Execution times collected for a kernel.
Definition: host.h:595
double total() const
Definition: host.h:606
std::string_view name() const
Definition: host.h:601
const std::vector< double > & times() const
Definition: host.h:611
kernel_timings(detail::kernel_timings t)
Definition: host.h:624
double throughput() const
Properties of a pointer. Properties of a pointer allocated with malloc_device, malloc_host or malloc_...
Definition: host.h:522
driver_t backend() const
Definition: host.h:558
void * ptr() const
Definition: host.h:536
xpu::device device() const
Definition: host.h:552
ptr_prop()=delete
ptr_prop(const void *)
Create a pointer property object from a pointer.
mem_type type() const
Definition: host.h:542
bool is_host() const
Definition: host.h:547
Command queue for a device.
Definition: host.h:354
void memset(buffer< T >, int value)
void launch(grid params, Args &&... args)
void wait()
queue(device)
void copy(const T *from, T *to, size_t size)
Definition: host.h:371
void memset(void *dst, int value, size_t size_bytes)
void copy(buffer< T >, direction)
void memcpy(void *dst, const void *src, size_t size_bytes)
RAII wrapper for timing functions.
Definition: host.h:747
scoped_timer & operator=(scoped_timer &&)=delete
scoped_timer(scoped_timer &&)=delete
scoped_timer(std::string_view name, xpu::timings *t=nullptr)
scoped_timer(const scoped_timer &)=delete
scoped_timer & operator=(const scoped_timer &)=delete
Timing information collected via xpu::push_timer and xpu::pop_timer.
Definition: host.h:632
double throughput_copy(direction dir) const
timings(detail::timings t)
Definition: host.h:726
double throughput() const
std::vector< timings > children() const
std::string_view name() const
Definition: host.h:640
timings()=default
double kernel_time() const
Definition: host.h:679
double copy(direction dir) const
Definition: host.h:653
double throughput_memset() const
double throughput_kernels() const
double memset() const
Definition: host.h:661
void merge(const timings &other)
Definition: host.h:718
kernel_timings kernel() const
Definition: host.h:668
double wall() const
Definition: host.h:646
bool has_details() const
Definition: host.h:692
std::vector< kernel_timings > kernels() const
Common definitions for xpu.
Defines for xpu.
xpu default namespace.
Definition: common.h:17
void * malloc_managed(size_t size_bytes)
Allocate memory that can be accessed by the device and the host.
void preload()
Preload the given device image.
void stack_alloc(size_t size)
Allocate the stack memory on the device.
buffer_type
Definition: common.h:77
void k_add_bytes(size_t bytes)
T * malloc_host(size_t elems)
Allocate pinned memory on the host that can be accessed by the device.
void * malloc_pinned(size_t size_bytes)
Allocate pinned memory on the host that can be accessed by the device.
void call(Args &&... args)
direction
Definition: host.h:35
@ h2d
Host to device transfer.
Definition: host.h:39
@ d2h
Device to host transfer.
Definition: host.h:44
void initialize(settings={})
Initialize xpu.
void set(const typename C::data_t &symbol)
void free(void *)
Free memory allocated with malloc_device, malloc_pinned or malloc_managed.
timings pop_timer()
void push_timer(std::string_view name)
const char * get_name()
void stack_pop(void *head=nullptr)
Pop entries from the stack.
void t_add_bytes(size_t bytes)
void * malloc_device(size_t size_bytes)
Allocate memory on the device.
mem_type
Definition: host.h:493
driver_t
Definition: common.h:19
3d execution grid describing the number of blocks and threads of a kernel. Use 'n_blocks' or 'n_thread...
Definition: common.h:51
Settings used to initialize xpu.
Definition: host.h:62
bool verbose
Enable internal logging. Display information about device operations like memory allocation,...
Definition: host.h:79
std::function< void(std::string_view)> logging_sink
Set a custom logging sink. By default messages are written to stderr. Has no effect if 'verbose' is f...
Definition: host.h:85
std::vector< driver_t > excluded_backends
Backends that should be excluded.
Definition: host.h:103
bool profile
Enable profiling of kernels. Value may be overwritten by setting environment variable XPU_PROFILE.
Definition: host.h:96