#ifndef COLVAR_GPU_SUPPORT_H
#define COLVAR_GPU_SUPPORT_H

#include <new>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#define COLVARS_STRINGIFY(s) STRINGIFY_HELPER(s)
#define STRINGIFY_HELPER(s) #s
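// Illustration only (not part of the original header): the two-level macro
// stringifies its argument after macro expansion, so for example
//   COLVARS_STRINGIFY(__LINE__)
// yields the current line number as a string literal such as "42".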
#if defined(COLVARS_CUDA)
#include <cuda_runtime.h>
#ifdef COLVARS_NVTX_PROFILING
#include <nvtx3/nvToolsExt.h>
#endif
#endif
#if defined(COLVARS_HIP)
#ifndef cudaError_t
#define cudaError_t hipError_t
#endif
#ifndef cudaFree
#define cudaFree hipFree
#endif
#ifndef cudaFreeHost
#define cudaFreeHost hipFreeHost
#endif
#ifndef cudaGetErrorString
#define cudaGetErrorString hipGetErrorString
#endif
#ifndef cudaGraphAddChildGraphNode
#define cudaGraphAddChildGraphNode hipGraphAddChildGraphNode
#endif
#ifndef cudaGraphAddKernelNode
#define cudaGraphAddKernelNode hipGraphAddKernelNode
#endif
#ifndef cudaGraphAddMemcpyNode
#define cudaGraphAddMemcpyNode hipGraphAddMemcpyNode
#endif
#ifndef cudaGraphAddMemsetNode
#define cudaGraphAddMemsetNode hipGraphAddMemsetNode
#endif
#ifndef cudaGraphCreate
#define cudaGraphCreate hipGraphCreate
#endif
#ifndef cudaGraphDestroy
#define cudaGraphDestroy hipGraphDestroy
#endif
#ifndef cudaGraphExecDestroy
#define cudaGraphExecDestroy hipGraphExecDestroy
#endif
#ifndef cudaGraphExecMemcpyNodeSetParams
#define cudaGraphExecMemcpyNodeSetParams hipGraphExecMemcpyNodeSetParams
#endif
#ifndef cudaGraphExec_t
#define cudaGraphExec_t hipGraphExec_t
#endif
#ifndef cudaGraphInstantiate
#define cudaGraphInstantiate hipGraphInstantiate
#endif
#ifndef cudaGraphLaunch
#define cudaGraphLaunch hipGraphLaunch
#endif
#ifndef cudaGraphNode_t
#define cudaGraphNode_t hipGraphNode_t
#endif
#ifndef cudaGraph_t
#define cudaGraph_t hipGraph_t
#endif
#ifndef cudaHostAllocMapped
#define cudaHostAllocMapped hipHostAllocMapped
#endif
#ifndef cudaKernelNodeParams
#define cudaKernelNodeParams hipKernelNodeParams
#endif
#ifndef cudaMalloc
#define cudaMalloc hipMalloc
#endif
#ifndef cudaMallocHost
#define cudaMallocHost hipMallocHost
#endif
#ifndef cudaMemcpy
#define cudaMemcpy hipMemcpy
#endif
#ifndef cudaMemcpy3DParms
#define cudaMemcpy3DParms hipMemcpy3DParms
#endif
#ifndef cudaMemcpyAsync
#define cudaMemcpyAsync hipMemcpyAsync
#endif
#ifndef cudaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#endif
#ifndef cudaMemcpyDeviceToHost
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#endif
#ifndef cudaMemcpyHostToDevice
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#endif
#ifndef cudaMemcpyKind
#define cudaMemcpyKind hipMemcpyKind
#endif
#ifndef cudaMemset
#define cudaMemset hipMemset
#endif
#ifndef cudaMemsetAsync
#define cudaMemsetAsync hipMemsetAsync
#endif
#ifndef cudaStreamCreate
#define cudaStreamCreate hipStreamCreate
#endif
#ifndef cudaStreamDestroy
#define cudaStreamDestroy hipStreamDestroy
#endif
#ifndef cudaStreamSynchronize
#define cudaStreamSynchronize hipStreamSynchronize
#endif
#ifndef cudaStream_t
#define cudaStream_t hipStream_t
#endif
#ifndef cudaSuccess
#define cudaSuccess hipSuccess
#endif
#ifndef make_cudaExtent
#define make_cudaExtent make_hipExtent
#endif
#ifndef make_cudaPitchedPtr
#define make_cudaPitchedPtr make_hipPitchedPtr
#endif
#ifndef make_cudaPos
#define make_cudaPos make_hipPos
#endif
#endif // defined(COLVARS_HIP)
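// Illustration only: with the aliases above, code written against the CUDA
// runtime API compiles unchanged for HIP. For example, a call such as
//   cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream);
// resolves to hipMemcpyAsync(dst, src, bytes, hipMemcpyHostToDevice, stream)
// when COLVARS_HIP is defined.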
namespace colvars_gpu {

#if defined(COLVARS_CUDA) || defined(COLVARS_HIP)
/// Default number of threads per block for Colvars GPU kernels
constexpr unsigned int default_block_size = 128;
/// Default maximum number of blocks used by reduction kernels
static unsigned int default_reduce_max_num_blocks = 64;
#endif
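// Illustration only (a sketch, not a definition from this header): a typical
// launch configuration derived from these defaults, with num_atoms, stream
// and some_kernel being hypothetical names:
//   unsigned int num_blocks =
//     (num_atoms + default_block_size - 1) / default_block_size;
//   some_kernel<<<num_blocks, default_block_size, 0, stream>>>(/* ... */);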
#if defined(COLVARS_CUDA) || defined(COLVARS_HIP)
#define COLVARS_HOST_DEVICE __device__ __host__
#define COLVARS_DEVICE __device__
#else
#define COLVARS_HOST_DEVICE
#define COLVARS_DEVICE
#endif
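// Illustration only: COLVARS_HOST_DEVICE lets a helper compile for both host
// and device in GPU builds, and as a plain host function otherwise. A
// hypothetical example:
//   COLVARS_HOST_DEVICE inline double square(double x) { return x * x; }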
#if ( defined(COLVARS_CUDA) || defined(COLVARS_HIP) )
/// Allocator for pinned host memory using cudaHostAlloc, suitable for use
/// with standard containers (the class name below is a reconstruction)
template <typename T>
class CudaHostAllocator {
public:
  using value_type = T;

  /// Allocate page-locked (pinned) host memory for n elements of T
  T* allocate(size_t n) {
    T* ptr = nullptr;
    if (cudaHostAlloc(&ptr, n * sizeof(T), cudaHostAllocMapped) != cudaSuccess) {
      throw std::bad_alloc();
    }
    return ptr;
  }
  /// Release pinned host memory previously obtained from allocate()
  void deallocate(T* ptr, size_t n) noexcept {
    cudaFreeHost(ptr);
  }
  /// Construct an object of type U in place
  template<typename U, typename... Args>
  void construct(U* p, Args&&... args) {
    new(p) U(std::forward<Args>(args)...);
  }
  /// Destroy an object of type U in place
  template<typename U>
  void destroy(U* p) noexcept {
    p->~U();
  }
};
#endif
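// Usage sketch (illustration only; the allocator name above is reconstructed):
//   std::vector<float, colvars_gpu::CudaHostAllocator<float>> host_buffer(1024);
//   // host_buffer.data() now points to pinned memory, suitable for fast
//   // asynchronous host<->device transfers such as cudaMemcpyAsync().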
#if defined(COLVARS_CUDA) || defined (COLVARS_HIP)
/// Check the error code returned by a CUDA/HIP runtime call and report the
/// error string together with the source file and line of the failing call
int gpuAssert(cudaError_t code, const char *file, int line);
#endif

} // namespace colvars_gpu
#if defined(COLVARS_CUDA) || defined (COLVARS_HIP)
/// Check a CUDA/HIP runtime call, reporting failures with file and line
#define checkGPUError(ans) colvars_gpu::gpuAssert((ans), __FILE__, __LINE__);
#endif
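// Usage sketch (illustration only; d_data, h_data and n are hypothetical):
//   checkGPUError(cudaMalloc(&d_data, n * sizeof(float)));
//   checkGPUError(cudaMemcpy(d_data, h_data, n * sizeof(float),
//                            cudaMemcpyHostToDevice));
// On failure, gpuAssert() receives the error code plus __FILE__ and __LINE__.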
namespace colvars_gpu {

#if defined(COLVARS_CUDA) || defined (COLVARS_HIP)

/// Type-erased helper: add a node that clears (zeroes) an array of
/// num_elements elements of sizeofT bytes each to a CUDA/HIP graph
int add_clear_array_node_impl(
  void* dst, const size_t num_elements, const size_t sizeofT,
  cudaGraphNode_t& node_out, cudaGraph_t& graph,
  const std::vector<cudaGraphNode_t>& dependencies);
/// Type-erased helper: add a memcpy node covering num_elements elements of
/// sizeofT bytes each, in the direction given by kind, to a CUDA/HIP graph
int add_copy_node_impl(
  const void* src, void* dst, const size_t num_elements, const size_t sizeofT,
  cudaMemcpyKind kind, cudaGraphNode_t& node_out, cudaGraph_t& graph,
  const std::vector<cudaGraphNode_t>& dependencies);
/// Add a node that clears (zeroes) a device array of num_elements elements
/// of type T to the graph, after the given dependencies
template <typename T>
int add_clear_array_node(
  T* dst, const size_t num_elements,
  cudaGraphNode_t& node_out, cudaGraph_t& graph,
  const std::vector<cudaGraphNode_t>& dependencies) {
  return add_clear_array_node_impl(
    dst, num_elements, sizeof(T), node_out, graph, dependencies);
}
/// Add a memcpy node for num_elements elements of type T to the graph,
/// after the given dependencies
template <typename T>
int add_copy_node(
  const T* src, T* dst, size_t num_elements,
  cudaMemcpyKind kind, cudaGraphNode_t& node_out, cudaGraph_t& graph,
  const std::vector<cudaGraphNode_t>& dependencies) {
  return add_copy_node_impl(src, dst, num_elements, sizeof(T),
                            kind, node_out, graph, dependencies);
}
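// Usage sketch (illustration only; variable names are hypothetical):
//   cudaGraph_t graph;
//   checkGPUError(cudaGraphCreate(&graph, 0));
//   cudaGraphNode_t clear_node, copy_node;
//   colvars_gpu::add_clear_array_node(d_forces, num_atoms, clear_node, graph, {});
//   colvars_gpu::add_copy_node(h_pos, d_pos, num_atoms, cudaMemcpyHostToDevice,
//                              copy_node, graph, {clear_node});
// Dependency lists can also be assembled from named nodes with
// prepare_dependencies(), declared below.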
/// Build the list of dependency nodes for a new graph node: each entry of
/// node_names pairs a node name with a flag, names are resolved through map,
/// and caller_operation_name is used when reporting lookup errors
int prepare_dependencies(
  const std::vector<std::pair<std::string, bool>>& node_names,
  std::vector<cudaGraphNode_t>& dependencies,
  const std::unordered_map<std::string, cudaGraphNode_t>& map,
  const std::string& caller_operation_name = "");
#if defined (COLVARS_NVTX_PROFILING)
/// Class for managing NVTX profiling ranges
/// (the class name below is a reconstruction; it is not preserved here)
class NVTXRange {
public:
  /// Set the label and color used for this profiling range
  void set_name_color(const std::string& name_in, const uint32_t color_in);
  inline void start() {
    nvtxRangePushEx(&nvtx_event_attr);
  }
private:
  std::string nvtx_event_name;
  nvtxEventAttributes_t nvtx_event_attr;
};
#endif // defined(COLVARS_NVTX_PROFILING)
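// Usage sketch (illustration only; the class name is reconstructed and the
// call that ends the range is not preserved in this listing):
//   colvars_gpu::NVTXRange range;
//   range.set_name_color("calc_colvars", 0xFF00FF00);
//   range.start();    // pushes the named range via nvtxRangePushEx()
//   /* profiled region */
//   nvtxRangePop();   // matching pop, shown here directly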
#endif // defined(COLVARS_CUDA) || defined(COLVARS_HIP)

} // namespace colvars_gpu

#endif // COLVAR_GPU_SUPPORT_H