// HIP-specific SHAFFT C++ example with explicit GPU memory management.
#include <shafft/shafft.hpp>
#include <cstdio>
#include <hip/hip_complex.h>
#include <hip/hip_runtime.h>
#include <mpi.h>
#include <vector>
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
int rank = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
[[maybe_unused]] int rc;
constexpr int ndim = 3;
constexpr int printCount = 4;
std::vector<size_t> dims = {64, 64, 32};
std::vector<int> commDims(ndim, 0);
std::vector<size_t> subsize(ndim), offset(ndim);
int nda = 0, commSize;
commDims,
nda,
subsize,
offset,
commSize,
0,
MPI_COMM_WORLD);
size_t localElems = subsize[0] * subsize[1] * subsize[2];
hipComplex *dData, *dWork;
(void)hipMalloc(&dData, allocSize * sizeof(hipComplex));
(void)hipMalloc(&dWork, allocSize * sizeof(hipComplex));
std::vector<hipComplex> host(localElems, {0.0f, 0.0f});
if (rank == 0 && localElems > 0)
host[0] = {1.0f, 0.0f};
(void)hipMemcpy(dData, host.data(), localElems * sizeof(hipComplex), hipMemcpyHostToDevice);
(void)hipMemcpy(host.data(), dData, localElems * sizeof(hipComplex), hipMemcpyDeviceToHost);
if (rank == 0) {
std::printf("Spectrum[0..%d] =", printCount - 1);
for (int i = 0; i < printCount; ++i)
std::printf(" (%g,%g)", host[i].x, host[i].y);
std::printf("\n");
}
(void)hipMemcpy(host.data(), dData, localElems * sizeof(hipComplex), hipMemcpyDeviceToHost);
if (rank == 0) {
std::printf("Result[0..%d] =", printCount - 1);
for (int i = 0; i < printCount; ++i)
std::printf(" (%g,%g)", host[i].x, host[i].y);
std::printf("\n");
}
(void)hipFree(dData);
(void)hipFree(dWork);
MPI_Finalize();
return 0;
}
N-dimensional distributed FFT plan with RAII semantics.
Definition shafft.hpp:51
int init(const std::vector< int > &commDims, const std::vector< size_t > &dimensions, FFTType type, MPI_Comm comm, TransformLayout output=TransformLayout::REDISTRIBUTED) noexcept
Initialize plan with Cartesian process grid.
int normalize() noexcept override
Apply symmetric normalization (1/sqrt(N) per transform).
int plan() noexcept override
Create backend FFT plans.
int execute(FFTDirection direction) noexcept override
Execute the FFT.
size_t allocSize() const noexcept override
Get required buffer size in complex elements.
int getBuffers(complexf **data, complexf **work) noexcept
Retrieve current buffer pointers.
void release() noexcept override
Release all internal resources.
int setBuffers(complexf *data, complexf *work) noexcept
Attach data and work buffers.
int configurationND(const std::vector< size_t > &size, FFTType precision, std::vector< int > &commDims, int &nda, std::vector< size_t > &subsize, std::vector< size_t > &offset, int &commSize, DecompositionStrategy strategy, size_t memLimit, MPI_Comm comm)
Compute process grid and local layout for N-D distributed FFT.
FFTType::C2C
Single-precision complex-to-complex (float).
DecompositionStrategy::MINIMIZE_NDA
Minimize distributed axes.
FFTDirection::BACKWARD
Backward/inverse transform (frequency to time domain).
FFTDirection::FORWARD
Forward transform (time to frequency domain).