Collective Variables Module - Developer Documentation
(Generated API-reference listing for the Colvars GPU support header; source line numbers appear inline.)
colvar_gpu_support.h
1#ifndef COLVAR_GPU_SUPPORT_H
2#define COLVAR_GPU_SUPPORT_H
3
#include <cstddef>
#include <iostream>
#include <new>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
7
8#define COLVARS_STRINGIFY(s) STRINGIFY_HELPER(s)
9#define STRINGIFY_HELPER(s) #s
10
11#if defined(COLVARS_CUDA)
12#include <cuda_runtime.h>
13#ifdef COLVARS_NVTX_PROFILING
14#include <nvtx3/nvToolsExt.h>
15#endif
16#endif // defined(COLVARS_CUDA)
17
#if defined(COLVARS_HIP)

// CUDA -> HIP compatibility shim.  The Colvars GPU code is written against
// the CUDA runtime API; when building for HIP, map each CUDA identifier used
// by the code onto its HIP equivalent.  Every mapping is guarded by #ifndef
// so that a definition supplied elsewhere (e.g. by a hipify pass or a
// platform header) takes precedence.
//
// NOTE(review): no `#include <hip/hip_runtime.h>` is visible in this header;
// confirm the HIP runtime header is provided by the build before these names
// are used.
// NOTE(review): hipMallocHost/hipFreeHost are deprecated aliases in recent
// ROCm releases (hipHostMalloc/hipHostFree are preferred); they are kept
// here unchanged -- verify against the minimum supported ROCm version.

#ifndef cudaError_t
#define cudaError_t hipError_t
#endif // cudaError_t

#ifndef cudaFree
#define cudaFree hipFree
#endif // cudaFree

#ifndef cudaFreeHost
#define cudaFreeHost hipFreeHost
#endif // cudaFreeHost

#ifndef cudaGetErrorString
#define cudaGetErrorString hipGetErrorString
#endif // cudaGetErrorString

#ifndef cudaGraphAddChildGraphNode
#define cudaGraphAddChildGraphNode hipGraphAddChildGraphNode
#endif // cudaGraphAddChildGraphNode

#ifndef cudaGraphAddKernelNode
#define cudaGraphAddKernelNode hipGraphAddKernelNode
#endif // cudaGraphAddKernelNode

#ifndef cudaGraphAddMemcpyNode
#define cudaGraphAddMemcpyNode hipGraphAddMemcpyNode
#endif // cudaGraphAddMemcpyNode

#ifndef cudaGraphAddMemsetNode
#define cudaGraphAddMemsetNode hipGraphAddMemsetNode
#endif // cudaGraphAddMemsetNode

#ifndef cudaGraphCreate
#define cudaGraphCreate hipGraphCreate
#endif // cudaGraphCreate

#ifndef cudaGraphDestroy
#define cudaGraphDestroy hipGraphDestroy
#endif // cudaGraphDestroy

#ifndef cudaGraphExecDestroy
#define cudaGraphExecDestroy hipGraphExecDestroy
#endif // cudaGraphExecDestroy

#ifndef cudaGraphExecMemcpyNodeSetParams
#define cudaGraphExecMemcpyNodeSetParams hipGraphExecMemcpyNodeSetParams
#endif // cudaGraphExecMemcpyNodeSetParams

#ifndef cudaGraphExec_t
#define cudaGraphExec_t hipGraphExec_t
#endif // cudaGraphExec_t

#ifndef cudaGraphInstantiate
#define cudaGraphInstantiate hipGraphInstantiate
#endif // cudaGraphInstantiate

#ifndef cudaGraphLaunch
#define cudaGraphLaunch hipGraphLaunch
#endif // cudaGraphLaunch

#ifndef cudaGraphNode_t
#define cudaGraphNode_t hipGraphNode_t
#endif // cudaGraphNode_t

#ifndef cudaGraph_t
#define cudaGraph_t hipGraph_t
#endif // cudaGraph_t

// Added (review): CudaHostAllocator::allocate() calls cudaHostAlloc, which
// previously had no HIP mapping, so COLVARS_HIP builds could not compile.
#ifndef cudaHostAlloc
#define cudaHostAlloc hipHostAlloc
#endif // cudaHostAlloc

#ifndef cudaHostAllocMapped
#define cudaHostAllocMapped hipHostAllocMapped
#endif // cudaHostAllocMapped

#ifndef cudaKernelNodeParams
#define cudaKernelNodeParams hipKernelNodeParams
#endif // cudaKernelNodeParams

#ifndef cudaMalloc
#define cudaMalloc hipMalloc
#endif // cudaMalloc

#ifndef cudaMallocHost
#define cudaMallocHost hipMallocHost
#endif // cudaMallocHost

#ifndef cudaMemcpy
#define cudaMemcpy hipMemcpy
#endif // cudaMemcpy

#ifndef cudaMemcpy3DParms
#define cudaMemcpy3DParms hipMemcpy3DParms
#endif // cudaMemcpy3DParms

#ifndef cudaMemcpyAsync
#define cudaMemcpyAsync hipMemcpyAsync
#endif // cudaMemcpyAsync

#ifndef cudaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#endif // cudaMemcpyDeviceToDevice

#ifndef cudaMemcpyDeviceToHost
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#endif // cudaMemcpyDeviceToHost

#ifndef cudaMemcpyHostToDevice
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#endif // cudaMemcpyHostToDevice

#ifndef cudaMemcpyKind
#define cudaMemcpyKind hipMemcpyKind
#endif // cudaMemcpyKind

#ifndef cudaMemset
#define cudaMemset hipMemset
#endif // cudaMemset

#ifndef cudaMemsetAsync
#define cudaMemsetAsync hipMemsetAsync
#endif // cudaMemsetAsync

#ifndef cudaStreamCreate
#define cudaStreamCreate hipStreamCreate
#endif // cudaStreamCreate

#ifndef cudaStreamDestroy
#define cudaStreamDestroy hipStreamDestroy
#endif // cudaStreamDestroy

#ifndef cudaStreamSynchronize
#define cudaStreamSynchronize hipStreamSynchronize
#endif // cudaStreamSynchronize

#ifndef cudaStream_t
#define cudaStream_t hipStream_t
#endif // cudaStream_t

#ifndef cudaSuccess
#define cudaSuccess hipSuccess
#endif // cudaSuccess

#ifndef make_cudaExtent
#define make_cudaExtent make_hipExtent
#endif // make_cudaExtent

#ifndef make_cudaPitchedPtr
#define make_cudaPitchedPtr make_hipPitchedPtr
#endif // make_cudaPitchedPtr

#ifndef make_cudaPos
#define make_cudaPos make_hipPos
#endif // make_cudaPos

#endif // defined(COLVARS_HIP)
172
173namespace colvars_gpu {
174
175#if defined(COLVARS_CUDA) || defined(COLVARS_HIP)
177constexpr unsigned int default_block_size = 128;
179static unsigned int default_reduce_max_num_blocks = 64;
180// static unsigned int default_atom_wise_num_blocks = 64;
181#endif
182
183#if defined(COLVARS_CUDA) || defined(COLVARS_HIP)
184#define COLVARS_HOST_DEVICE __device__ __host__
185#define COLVARS_DEVICE __device__
186#else
187#define COLVARS_HOST_DEVICE
188#define COLVARS_DEVICE
189#endif
190
191// TODO: What about SYCL?
#if ( defined(COLVARS_CUDA) || defined(COLVARS_HIP) )
/// STL-compatible allocator that provides pinned (page-locked, mapped) host
/// memory via cudaHostAlloc with cudaHostAllocMapped, enabling fast
/// asynchronous host<->device transfers for containers such as std::vector.
/// NOTE(review): under COLVARS_HIP this relies on a cudaHostAlloc mapping in
/// the HIP compatibility shim -- verify one is present.
template <typename T>
class CudaHostAllocator {
public:
  using value_type = T;

  CudaHostAllocator() = default;

  /// Rebinding converting constructor, required by the Allocator concept.
  template<typename U>
  constexpr CudaHostAllocator(const CudaHostAllocator<U>&) noexcept {}

  // The allocator is stateless, so all instances compare equal.
  friend bool operator==(const CudaHostAllocator&, const CudaHostAllocator&) { return true; }
  friend bool operator!=(const CudaHostAllocator&, const CudaHostAllocator&) { return false; }

  /// Allocate pinned host memory for n objects of type T.
  /// \throws std::bad_alloc if the runtime allocation fails.
  T* allocate(size_t n) {
    T* ptr;
    if (cudaHostAlloc(&ptr, n * sizeof(T), cudaHostAllocMapped) != cudaSuccess) {
      throw std::bad_alloc();
    }
    return ptr;
  }
  /// Release pinned memory previously returned by allocate().
  /// The element count is unused; the runtime tracks the allocation size.
  void deallocate(T* ptr, size_t /* n */) noexcept {
    cudaFreeHost(ptr);
  }
  /// Construct an object in place (placement new).
  template<typename U, typename... Args>
  void construct(U* p, Args&&... args) {
    new(p) U(std::forward<Args>(args)...);
  }

  /// Destroy an object without releasing its storage.
  template<typename U>
  void destroy(U* p) noexcept {
    p->~U();
  }
};
#endif
235
236
#if defined(COLVARS_CUDA) || defined (COLVARS_HIP)
/// Check a CUDA/HIP runtime status code; on failure, report the error string
/// together with the offending file and line (via the checkGPUError macro).
/// Returns an int status code -- TODO confirm the exact return convention
/// (e.g. COLVARS_OK vs. the raw runtime code) against the definition.
int gpuAssert(cudaError_t code, const char *file, int line);
#endif
248
249} // namespace colvars_gpu
250
#if defined(COLVARS_CUDA) || defined (COLVARS_HIP)
/// Wrap any CUDA/HIP runtime call to check its return code at the call site,
/// capturing the file and line for diagnostics.
/// NOTE(review): the trailing semicolon was removed from the expansion so the
/// macro behaves as a single expression-statement (safe inside if/else);
/// call sites already written as `checkGPUError(x);` are unaffected.
#define checkGPUError(ans) colvars_gpu::gpuAssert((ans), __FILE__, __LINE__)
#endif
256
257namespace colvars_gpu {
258#if defined(COLVARS_CUDA) || defined (COLVARS_HIP)
259
263int add_clear_array_node_impl(
264 void* dst, const size_t num_elements, const size_t sizeofT,
265 cudaGraphNode_t& node_out, cudaGraph_t& graph,
266 const std::vector<cudaGraphNode_t>& dependencies);
267
271int add_copy_node_impl(
272 const void* src, void* dst, const size_t num_elements, const size_t sizeofT,
273 cudaMemcpyKind kind, cudaGraphNode_t& node_out, cudaGraph_t& graph,
274 const std::vector<cudaGraphNode_t>& dependencies);
275
287template <typename T>
288int add_clear_array_node(
289 T* dst, const size_t num_elements,
290 cudaGraphNode_t& node_out, cudaGraph_t& graph,
291 const std::vector<cudaGraphNode_t>& dependencies) {
292 return add_clear_array_node_impl(
293 dst, num_elements, sizeof(T), node_out, graph, dependencies);
294}
295
309template <typename T>
310int add_copy_node(
311 const T* src, T* dst, size_t num_elements,
312 cudaMemcpyKind kind, cudaGraphNode_t& node_out, cudaGraph_t& graph,
313 const std::vector<cudaGraphNode_t>& dependencies) {
314 return add_copy_node_impl(src, dst, num_elements, sizeof(T),
315 kind, node_out, graph, dependencies);
316}
317
331int prepare_dependencies(
332 const std::vector<std::pair<std::string, bool>>& node_names,
333 std::vector<cudaGraphNode_t>& dependencies,
334 const std::unordered_map<std::string, cudaGraphNode_t>& map,
335 const std::string& caller_operation_name = "");
336
337// NVTX Profiling
338#if defined (COLVARS_NVTX_PROFILING)
// NOTE(review): the class declaration itself (header line 346, described in
// the generated docs as "Class for managing NVTX profiling ranges") was
// elided from this listing; the members below belong to that class.
347public:
// Set the display name and color of the range -- presumably stored into
// nvtx_event_attr; confirm against the definition.
349 void set_name_color(const std::string& name_in, const uint32_t color_in);
// Open an NVTX range using the stored attributes (name/color).
350 inline void start() {
351 nvtxRangePushEx(&nvtx_event_attr);
352 }
// Close the most recently opened NVTX range on the calling thread.
353 inline void stop() {
354 nvtxRangePop();
355 }
356private:
// Human-readable range name (not read by start()/stop() in the code shown).
357 std::string nvtx_event_name;
// Attributes handed to nvtxRangePushEx.
358 nvtxEventAttributes_t nvtx_event_attr;
359};
360#endif // defined (COLVARS_NVTX_PROFILING)
361#endif // defined(COLVARS_CUDA) || defined (COLVARS_HIP)
362}
363
364#endif // COLVAR_GPU_SUPPORT_H
Allocator for pinned host memory using cudaHostAlloc.
Definition: colvar_gpu_support.h:202
Class for managing NVTX profiling ranges.
Definition: colvar_gpu_support.h:346