1#ifndef COLVAR_GPU_SUPPORT_H
2#define COLVAR_GPU_SUPPORT_H
6#include <unordered_map>
8#define COLVARS_STRINGIFY(s) STRINGIFY_HELPER(s)
9#define STRINGIFY_HELPER(s) #s
11#if defined(COLVARS_CUDA)
12#include <cuda_runtime.h>
13#ifdef COLVARS_NVTX_PROFILING
14#include <nvtx3/nvToolsExt.h>
16#define COLVARS_SYNC_WARP __syncwarp()
19#if defined(COLVARS_HIP)
20#include <hip/hip_runtime.h>
21#if defined(__HIP_PLATFORM_AMD__)
28 #define COLVARS_SYNC_WARP do {\
29 __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront"); \
30 __builtin_amdgcn_wave_barrier(); \
31 __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront"); \
33#elif defined(__HIP_PLATFORM_NVIDIA__)
34 #define COLVARS_SYNC_WARP __syncwarp()
36 #error "Unknown HIP platform"
40#if defined(COLVARS_HIP)
42#define cudaError_t hipError_t
46#define cudaFree hipFree
50#define cudaFreeHost hipFreeHost
54#define cudaFreeAsync hipFreeAsync
57#ifndef cudaGetErrorString
58#define cudaGetErrorString hipGetErrorString
61#ifndef cudaGraphAddChildGraphNode
62#define cudaGraphAddChildGraphNode hipGraphAddChildGraphNode
65#ifndef cudaGraphAddKernelNode
66#define cudaGraphAddKernelNode hipGraphAddKernelNode
69#ifndef cudaGraphAddMemcpyNode
70#define cudaGraphAddMemcpyNode hipGraphAddMemcpyNode
73#ifndef cudaGraphAddMemsetNode
74#define cudaGraphAddMemsetNode hipGraphAddMemsetNode
77#ifndef cudaGraphCreate
78#define cudaGraphCreate hipGraphCreate
81#ifndef cudaGraphDestroy
82#define cudaGraphDestroy hipGraphDestroy
85#ifndef cudaGraphExecDestroy
86#define cudaGraphExecDestroy hipGraphExecDestroy
89#ifndef cudaGraphExecMemcpyNodeSetParams
90#define cudaGraphExecMemcpyNodeSetParams hipGraphExecMemcpyNodeSetParams
93#ifndef cudaGraphExec_t
94#define cudaGraphExec_t hipGraphExec_t
97#ifndef cudaGraphInstantiate
98#define cudaGraphInstantiate hipGraphInstantiate
101#ifndef cudaGraphInstantiateWithParams
102#define cudaGraphInstantiateWithParams hipGraphInstantiateWithParams
105#ifndef cudaGraphInstantiateParams
106#define cudaGraphInstantiateParams hipGraphInstantiateParams
109#ifndef cudaGraphInstantiateFlagUpload
110#define cudaGraphInstantiateFlagUpload hipGraphInstantiateFlagUpload
113#ifndef cudaGraphInstantiateSuccess
114#define cudaGraphInstantiateSuccess hipGraphInstantiateSuccess
117#ifndef cudaGraphLaunch
118#define cudaGraphLaunch hipGraphLaunch
121#ifndef cudaGraphNode_t
122#define cudaGraphNode_t hipGraphNode_t
126#define cudaGraph_t hipGraph_t
129#ifndef cudaGraphDebugDotPrint
130#define cudaGraphDebugDotPrint hipGraphDebugDotPrint
133#ifndef cudaGraphDebugDotFlags
134#define cudaGraphDebugDotFlags hipGraphDebugDotFlags
137#ifndef cudaGraphDebugDotFlagsVerbose
138#define cudaGraphDebugDotFlagsVerbose hipGraphDebugDotFlagsVerbose
141#ifndef cudaHostAllocMapped
142#define cudaHostAllocMapped hipHostAllocMapped
145#ifndef cudaHostAllocDefault
146#define cudaHostAllocDefault hipHostAllocDefault
150#define cudaHostAlloc hipHostAlloc
153#ifndef cudaLaunchKernel
154#define cudaLaunchKernel hipLaunchKernel
157#ifndef cudaKernelNodeParams
158#define cudaKernelNodeParams hipKernelNodeParams
162#define cudaMalloc hipMalloc
165#ifndef cudaMallocAsync
166#define cudaMallocAsync hipMallocAsync
169#ifndef cudaMallocHost
170#define cudaMallocHost hipMallocHost
173#ifndef cudaGraphAddMemcpyNode1D
174#define cudaGraphAddMemcpyNode1D hipGraphAddMemcpyNode1D
178#define cudaMemcpy hipMemcpy
181#ifndef cudaMemcpy3DParms
182#define cudaMemcpy3DParms hipMemcpy3DParms
185#ifndef cudaMemcpyAsync
186#define cudaMemcpyAsync hipMemcpyAsync
189#ifndef cudaMemcpyDeviceToDevice
190#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
193#ifndef cudaMemcpyDeviceToHost
194#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
197#ifndef cudaMemcpyHostToDevice
198#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
201#ifndef cudaMemcpyKind
202#define cudaMemcpyKind hipMemcpyKind
206#define cudaMemset hipMemset
209#ifndef cudaMemsetParams
210#define cudaMemsetParams hipMemsetParams
213#ifndef cudaMemsetAsync
214#define cudaMemsetAsync hipMemsetAsync
217#ifndef cudaStreamCreate
218#define cudaStreamCreate hipStreamCreate
221#ifndef cudaStreamDestroy
222#define cudaStreamDestroy hipStreamDestroy
225#ifndef cudaStreamSynchronize
226#define cudaStreamSynchronize hipStreamSynchronize
230#define cudaStream_t hipStream_t
234#define cudaSuccess hipSuccess
237#ifndef make_cudaExtent
238#define make_cudaExtent make_hipExtent
241#ifndef make_cudaPitchedPtr
242#define make_cudaPitchedPtr make_hipPitchedPtr
246#define make_cudaPos make_hipPos
251namespace colvars_gpu {
253#if defined(COLVARS_CUDA) || defined(COLVARS_HIP)
255constexpr unsigned int default_block_size = 128;
257static unsigned int default_reduce_max_num_blocks = 64;
261#if defined(COLVARS_CUDA) || defined(COLVARS_HIP)
262#define COLVARS_HOST_DEVICE __device__ __host__
263#define COLVARS_DEVICE __device__
265#define COLVARS_HOST_DEVICE
266#define COLVARS_DEVICE
271#if defined(COLVARS_CUDA) || defined(COLVARS_HIP)
272template <
typename T,
unsigned long N>
276 using value_type = T;
277 using size_type =
decltype(N);
278 using reference = value_type&;
279 using const_reference =
const value_type&;
281 using const_pointer =
const T*;
282 COLVARS_HOST_DEVICE
constexpr size_type size()
const {
return N;}
283 COLVARS_HOST_DEVICE reference operator[](size_type pos) {
return m_data[pos];}
284 COLVARS_HOST_DEVICE const_reference operator[](size_type pos)
const {
return m_data[pos];}
289#if ( defined(COLVARS_CUDA) || defined(COLVARS_HIP) )
301 using value_type = T;
311 T* allocate(
size_t n) {
313 if (cudaHostAlloc(&ptr, n *
sizeof(T), cudaHostAllocMapped) != cudaSuccess) {
314 throw std::bad_alloc();
318 void deallocate(T* ptr,
size_t n)
noexcept {
319 (void)cudaFreeHost(ptr);
321 template<
typename U,
typename... Args>
322 void construct(U* p, Args&&... args) {
323 new(p) U(std::forward<Args>(args)...);
327 void destroy(U* p)
noexcept {
334#if defined(COLVARS_CUDA) || defined (COLVARS_HIP)
343int gpuAssert(cudaError_t code,
const char *file,
int line);
348#if defined(COLVARS_CUDA) || defined (COLVARS_HIP)
351#define checkGPUError(ans) colvars_gpu::gpuAssert((ans), __FILE__, __LINE__);
354namespace colvars_gpu {
355#if defined(COLVARS_CUDA) || defined (COLVARS_HIP)
360int add_clear_array_node_impl(
361 void* dst,
const size_t num_elements,
const size_t sizeofT,
362 cudaGraphNode_t& node_out, cudaGraph_t& graph,
363 const std::vector<cudaGraphNode_t>& dependencies);
368int add_copy_node_impl(
369 const void* src,
void* dst,
const size_t num_elements,
const size_t sizeofT,
370 cudaMemcpyKind kind, cudaGraphNode_t& node_out, cudaGraph_t& graph,
371 const std::vector<cudaGraphNode_t>& dependencies);
385int add_clear_array_node(
386 T* dst,
const size_t num_elements,
387 cudaGraphNode_t& node_out, cudaGraph_t& graph,
388 const std::vector<cudaGraphNode_t>& dependencies) {
389 return add_clear_array_node_impl(
390 dst, num_elements,
sizeof(T), node_out, graph, dependencies);
408 const T* src, T* dst,
size_t num_elements,
409 cudaMemcpyKind kind, cudaGraphNode_t& node_out, cudaGraph_t& graph,
410 const std::vector<cudaGraphNode_t>& dependencies) {
411 return add_copy_node_impl(src, dst, num_elements,
sizeof(T),
412 kind, node_out, graph, dependencies);
428int prepare_dependencies(
429 const std::vector<std::pair<std::string, bool>>& node_names,
430 std::vector<cudaGraphNode_t>& dependencies,
431 const std::unordered_map<std::string, cudaGraphNode_t>& map,
432 const std::string& caller_operation_name =
"");
435#if defined (COLVARS_NVTX_PROFILING)
446 void set_name_color(
const std::string& name_in,
const uint32_t color_in);
447 inline void start() {
448 nvtxRangePushEx(&nvtx_event_attr);
454 std::string nvtx_event_name;
455 nvtxEventAttributes_t nvtx_event_attr;
Allocator for pinned host memory using cudaHostAlloc.
Definition: colvar_gpu_support.h:299
Definition: colvar_gpu_support.h:273
Class for managing NVTX profiling ranges.
Definition: colvar_gpu_support.h:443