Collective Variables Module - Developer Documentation
Loading...
Searching...
No Matches
colvar_gpu_support.h
1#ifndef COLVAR_GPU_SUPPORT_H
2#define COLVAR_GPU_SUPPORT_H
3
4#include <vector>
5#include <iostream>
6#include <unordered_map>
7
8#define COLVARS_STRINGIFY(s) STRINGIFY_HELPER(s)
9#define STRINGIFY_HELPER(s) #s
10
11#if defined(COLVARS_CUDA)
12#include <cuda_runtime.h>
13#ifdef COLVARS_NVTX_PROFILING
14#include <nvtx3/nvToolsExt.h>
15#endif
16#define COLVARS_SYNC_WARP __syncwarp()
17#endif // defined(COLVARS_CUDA)
18
19#if defined(COLVARS_HIP)
20#include <hip/hip_runtime.h>
21#if defined(__HIP_PLATFORM_AMD__)
28 #define COLVARS_SYNC_WARP do {\
29 __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront"); \
30 __builtin_amdgcn_wave_barrier(); \
31 __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront"); \
32 } while (0)
33#elif defined(__HIP_PLATFORM_NVIDIA__)
34 #define COLVARS_SYNC_WARP __syncwarp()
35#else
36 #error "Unknown HIP platform"
37#endif
38#endif // defined(COLVARS_HIP)
39
40#if defined(COLVARS_HIP)
41#ifndef cudaError_t
42#define cudaError_t hipError_t
43#endif // cudaError_t
44
45#ifndef cudaFree
46#define cudaFree hipFree
47#endif // cudaFree
48
49#ifndef cudaFreeHost
50#define cudaFreeHost hipFreeHost
51#endif // cudaFreeHost
52
53#ifndef cudaFreeAsync
54#define cudaFreeAsync hipFreeAsync
55#endif // cudaFreeAsync
56
57#ifndef cudaGetErrorString
58#define cudaGetErrorString hipGetErrorString
59#endif // cudaGetErrorString
60
61#ifndef cudaGraphAddChildGraphNode
62#define cudaGraphAddChildGraphNode hipGraphAddChildGraphNode
63#endif // cudaGraphAddChildGraphNode
64
65#ifndef cudaGraphAddKernelNode
66#define cudaGraphAddKernelNode hipGraphAddKernelNode
67#endif // cudaGraphAddKernelNode
68
69#ifndef cudaGraphAddMemcpyNode
70#define cudaGraphAddMemcpyNode hipGraphAddMemcpyNode
71#endif // cudaGraphAddMemcpyNode
72
73#ifndef cudaGraphAddMemsetNode
74#define cudaGraphAddMemsetNode hipGraphAddMemsetNode
75#endif // cudaGraphAddMemsetNode
76
77#ifndef cudaGraphCreate
78#define cudaGraphCreate hipGraphCreate
79#endif // cudaGraphCreate
80
81#ifndef cudaGraphDestroy
82#define cudaGraphDestroy hipGraphDestroy
83#endif // cudaGraphDestroy
84
85#ifndef cudaGraphExecDestroy
86#define cudaGraphExecDestroy hipGraphExecDestroy
87#endif // cudaGraphExecDestroy
88
89#ifndef cudaGraphExecMemcpyNodeSetParams
90#define cudaGraphExecMemcpyNodeSetParams hipGraphExecMemcpyNodeSetParams
91#endif // cudaGraphExecMemcpyNodeSetParams
92
93#ifndef cudaGraphExec_t
94#define cudaGraphExec_t hipGraphExec_t
95#endif // cudaGraphExec_t
96
97#ifndef cudaGraphInstantiate
98#define cudaGraphInstantiate hipGraphInstantiate
99#endif // cudaGraphInstantiate
100
101#ifndef cudaGraphInstantiateWithParams
102#define cudaGraphInstantiateWithParams hipGraphInstantiateWithParams
103#endif // cudaGraphInstantiateWithParams
104
105#ifndef cudaGraphInstantiateParams
106#define cudaGraphInstantiateParams hipGraphInstantiateParams
107#endif // cudaGraphInstantiateParams
108
109#ifndef cudaGraphInstantiateFlagUpload
110#define cudaGraphInstantiateFlagUpload hipGraphInstantiateFlagUpload
111#endif // cudaGraphInstantiateFlagUpload
112
113#ifndef cudaGraphInstantiateSuccess
114#define cudaGraphInstantiateSuccess hipGraphInstantiateSuccess
115#endif // cudaGraphInstantiateSuccess
116
117#ifndef cudaGraphLaunch
118#define cudaGraphLaunch hipGraphLaunch
119#endif // cudaGraphLaunch
120
121#ifndef cudaGraphNode_t
122#define cudaGraphNode_t hipGraphNode_t
123#endif // cudaGraphNode_t
124
125#ifndef cudaGraph_t
126#define cudaGraph_t hipGraph_t
127#endif // cudaGraph_t
128
129#ifndef cudaGraphDebugDotPrint
130#define cudaGraphDebugDotPrint hipGraphDebugDotPrint
131#endif // cudaGraphDebugDotPrint
132
133#ifndef cudaGraphDebugDotFlags
134#define cudaGraphDebugDotFlags hipGraphDebugDotFlags
135#endif // cudaGraphDebugDotFlags
136
137#ifndef cudaGraphDebugDotFlagsVerbose
138#define cudaGraphDebugDotFlagsVerbose hipGraphDebugDotFlagsVerbose
139#endif // cudaGraphDebugDotFlagsVerbose
140
141#ifndef cudaHostAllocMapped
142#define cudaHostAllocMapped hipHostAllocMapped
143#endif // cudaHostAllocMapped
144
145#ifndef cudaHostAllocDefault
146#define cudaHostAllocDefault hipHostAllocDefault
147#endif // cudaHostAllocDefault
148
149#ifndef cudaHostAlloc
150#define cudaHostAlloc hipHostAlloc
151#endif // cudaHostAlloc
152
153#ifndef cudaLaunchKernel
154#define cudaLaunchKernel hipLaunchKernel
155#endif // cudaLaunchKernel
156
157#ifndef cudaKernelNodeParams
158#define cudaKernelNodeParams hipKernelNodeParams
159#endif // cudaKernelNodeParams
160
161#ifndef cudaMalloc
162#define cudaMalloc hipMalloc
163#endif // cudaMalloc
164
165#ifndef cudaMallocAsync
166#define cudaMallocAsync hipMallocAsync
167#endif // cudaMallocAsync
168
169#ifndef cudaMallocHost
170#define cudaMallocHost hipMallocHost
171#endif // cudaMallocHost
172
173#ifndef cudaGraphAddMemcpyNode1D
174#define cudaGraphAddMemcpyNode1D hipGraphAddMemcpyNode1D
175#endif
176
177#ifndef cudaMemcpy
178#define cudaMemcpy hipMemcpy
179#endif // cudaMemcpy
180
181#ifndef cudaMemcpy3DParms
182#define cudaMemcpy3DParms hipMemcpy3DParms
183#endif // cudaMemcpy3DParms
184
185#ifndef cudaMemcpyAsync
186#define cudaMemcpyAsync hipMemcpyAsync
187#endif // cudaMemcpyAsync
188
189#ifndef cudaMemcpyDeviceToDevice
190#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
191#endif // cudaMemcpyDeviceToDevice
192
193#ifndef cudaMemcpyDeviceToHost
194#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
195#endif // cudaMemcpyDeviceToHost
196
197#ifndef cudaMemcpyHostToDevice
198#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
199#endif // cudaMemcpyHostToDevice
200
201#ifndef cudaMemcpyKind
202#define cudaMemcpyKind hipMemcpyKind
203#endif // cudaMemcpyKind
204
205#ifndef cudaMemset
206#define cudaMemset hipMemset
207#endif // cudaMemset
208
209#ifndef cudaMemsetParams
210#define cudaMemsetParams hipMemsetParams
211#endif // cudaMemsetParams
212
213#ifndef cudaMemsetAsync
214#define cudaMemsetAsync hipMemsetAsync
215#endif // cudaMemsetAsync
216
217#ifndef cudaStreamCreate
218#define cudaStreamCreate hipStreamCreate
219#endif // cudaStreamCreate
220
221#ifndef cudaStreamDestroy
222#define cudaStreamDestroy hipStreamDestroy
223#endif // cudaStreamDestroy
224
225#ifndef cudaStreamSynchronize
226#define cudaStreamSynchronize hipStreamSynchronize
227#endif // cudaStreamSynchronize
228
229#ifndef cudaStream_t
230#define cudaStream_t hipStream_t
231#endif // cudaStream_t
232
233#ifndef cudaSuccess
234#define cudaSuccess hipSuccess
235#endif // cudaSuccess
236
237#ifndef make_cudaExtent
238#define make_cudaExtent make_hipExtent
239#endif // make_cudaExtent
240
241#ifndef make_cudaPitchedPtr
242#define make_cudaPitchedPtr make_hipPitchedPtr
243#endif // make_cudaPitchedPtr
244
245#ifndef make_cudaPos
246#define make_cudaPos make_hipPos
247#endif // make_cudaPos
248
249#endif // defined(COLVARS_HIP)
250
251namespace colvars_gpu {
252
253#if defined(COLVARS_CUDA) || defined(COLVARS_HIP)
255constexpr unsigned int default_block_size = 128;
257static unsigned int default_reduce_max_num_blocks = 64;
258// static unsigned int default_atom_wise_num_blocks = 64;
259#endif
260
261#if defined(COLVARS_CUDA) || defined(COLVARS_HIP)
262#define COLVARS_HOST_DEVICE __device__ __host__
263#define COLVARS_DEVICE __device__
264#else
265#define COLVARS_HOST_DEVICE
266#define COLVARS_DEVICE
267#endif
268
269// HIP does not have cuda::std::array since libhipcxx is not a part of the ROCm distribution,
270// so reinvent the wheel...
271#if defined(COLVARS_CUDA) || defined(COLVARS_HIP)
272template <typename T, unsigned long N>
273class array1d {
274public:
275 T m_data[N];
276 using value_type = T;
277 using size_type = decltype(N);
278 using reference = value_type&;
279 using const_reference = const value_type&;
280 using pointer = T*;
281 using const_pointer = const T*;
282 COLVARS_HOST_DEVICE constexpr size_type size() const {return N;}
283 COLVARS_HOST_DEVICE reference operator[](size_type pos) {return m_data[pos];}
284 COLVARS_HOST_DEVICE const_reference operator[](size_type pos) const {return m_data[pos];}
285};
286#endif
287
288// TODO: What about SYCL?
289#if ( defined(COLVARS_CUDA) || defined(COLVARS_HIP) )
298template <typename T>
300public:
301 using value_type = T;
302
303 CudaHostAllocator() = default;
304
305 template<typename U>
306 constexpr CudaHostAllocator(const CudaHostAllocator<U>&) noexcept {}
307
308 friend bool operator==(const CudaHostAllocator&, const CudaHostAllocator&) { return true; }
309 friend bool operator!=(const CudaHostAllocator&, const CudaHostAllocator&) { return false; }
310
311 T* allocate(size_t n) {
312 T* ptr;
313 if (cudaHostAlloc(&ptr, n * sizeof(T), cudaHostAllocMapped) != cudaSuccess) {
314 throw std::bad_alloc();
315 }
316 return ptr;
317 }
318 void deallocate(T* ptr, size_t n) noexcept {
319 (void)cudaFreeHost(ptr);
320 }
321 template<typename U, typename... Args>
322 void construct(U* p, Args&&... args) {
323 new(p) U(std::forward<Args>(args)...);
324 }
325
326 template<typename U>
327 void destroy(U* p) noexcept {
328 p->~U();
329 }
330};
331#endif
332
333
334#if defined(COLVARS_CUDA) || defined (COLVARS_HIP)
343int gpuAssert(cudaError_t code, const char *file, int line);
344#endif
345
346} // namespace colvars_gpu
347
348#if defined(COLVARS_CUDA) || defined (COLVARS_HIP)
351#define checkGPUError(ans) colvars_gpu::gpuAssert((ans), __FILE__, __LINE__);
352#endif
353
354namespace colvars_gpu {
355#if defined(COLVARS_CUDA) || defined (COLVARS_HIP)
356
360int add_clear_array_node_impl(
361 void* dst, const size_t num_elements, const size_t sizeofT,
362 cudaGraphNode_t& node_out, cudaGraph_t& graph,
363 const std::vector<cudaGraphNode_t>& dependencies);
364
368int add_copy_node_impl(
369 const void* src, void* dst, const size_t num_elements, const size_t sizeofT,
370 cudaMemcpyKind kind, cudaGraphNode_t& node_out, cudaGraph_t& graph,
371 const std::vector<cudaGraphNode_t>& dependencies);
372
384template <typename T>
385int add_clear_array_node(
386 T* dst, const size_t num_elements,
387 cudaGraphNode_t& node_out, cudaGraph_t& graph,
388 const std::vector<cudaGraphNode_t>& dependencies) {
389 return add_clear_array_node_impl(
390 dst, num_elements, sizeof(T), node_out, graph, dependencies);
391}
392
406template <typename T>
407int add_copy_node(
408 const T* src, T* dst, size_t num_elements,
409 cudaMemcpyKind kind, cudaGraphNode_t& node_out, cudaGraph_t& graph,
410 const std::vector<cudaGraphNode_t>& dependencies) {
411 return add_copy_node_impl(src, dst, num_elements, sizeof(T),
412 kind, node_out, graph, dependencies);
413}
414
428int prepare_dependencies(
429 const std::vector<std::pair<std::string, bool>>& node_names,
430 std::vector<cudaGraphNode_t>& dependencies,
431 const std::unordered_map<std::string, cudaGraphNode_t>& map,
432 const std::string& caller_operation_name = "");
433
434// NVTX Profiling
435#if defined (COLVARS_NVTX_PROFILING)
444public:
446 void set_name_color(const std::string& name_in, const uint32_t color_in);
447 inline void start() {
448 nvtxRangePushEx(&nvtx_event_attr);
449 }
450 inline void stop() {
451 nvtxRangePop();
452 }
453private:
454 std::string nvtx_event_name;
455 nvtxEventAttributes_t nvtx_event_attr;
456};
457#endif // defined (COLVARS_NVTX_PROFILING)
458#endif // defined(COLVARS_CUDA) || defined (COLVARS_HIP)
459}
460
461#endif // COLVAR_GPU_SUPPORT_H
Allocator for pinned host memory using cudaHostAlloc.
Definition: colvar_gpu_support.h:299
Definition: colvar_gpu_support.h:273
Class for managing NVTX profiling ranges.
Definition: colvar_gpu_support.h:443