SHAFFT 1.1.0-alpha
A Scalable High-dimensional Accelerated FFT Library
Loading...
Searching...
No Matches
example.cpp

HIP-specific SHAFFT C++ example with explicit GPU memory management.

#include <shafft/shafft.hpp>
#include <cstdio>
#include <hip/hip_complex.h>
#include <hip/hip_runtime.h>
#include <mpi.h>
#include <vector>
/// HIP-specific SHAFFT example: runs a forward and backward 3-D C2C transform
/// of a delta function using explicitly managed GPU buffers, and prints the
/// first few local elements on rank 0 after each transform.
///
/// Return codes from SHAFFT/HIP calls are intentionally ignored (as in the
/// original example) to keep the listing short.
int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  // MPI setup
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  [[maybe_unused]] int rc;
  constexpr int ndim = 3;
  constexpr int printCount = 4;
  std::vector<size_t> dims = {64, 64, 32};

  // Get configuration: process grid and local layout for the N-D transform.
  std::vector<int> commDims(ndim, 0);
  std::vector<size_t> subsize(ndim), offset(ndim);
  int nda = 0;
  int commSize = 0;
  rc = shafft::configurationND(dims,
                               shafft::FFTType::C2C,
                               commDims,
                               nda,
                               subsize,
                               offset,
                               commSize,
                               shafft::DecompositionStrategy::MINIMIZE_NDA,
                               0,
                               MPI_COMM_WORLD);

  // Create and plan FFT
  // NOTE(review): the plan class name is assumed to be shafft::FFTND (the
  // N-dimensional plan declared in shafft.hpp) — confirm against the header.
  shafft::FFTND fft;
  rc = fft.init(commDims, dims, shafft::FFTType::C2C, MPI_COMM_WORLD);
  rc = fft.plan();

  size_t allocSize = fft.allocSize();  // required buffer size in complex elements
  size_t localElems = subsize[0] * subsize[1] * subsize[2];

  // Allocate GPU buffers (HIP-specific)
  hipComplex* dData = nullptr;
  hipComplex* dWork = nullptr;
  (void)hipMalloc(&dData, allocSize * sizeof(hipComplex));
  (void)hipMalloc(&dWork, allocSize * sizeof(hipComplex));

  // Initialize: delta function at origin (rank 0, index 0)
  std::vector<hipComplex> host(localElems, {0.0f, 0.0f});
  if (rank == 0 && localElems > 0)
    host[0] = {1.0f, 0.0f};
  (void)hipMemcpy(dData, host.data(), localElems * sizeof(hipComplex), hipMemcpyHostToDevice);
  (void)fft.setBuffers(dData, dWork);

  // Never read past the end of the host copy when the local block is tiny.
  const int shown = localElems < static_cast<size_t>(printCount)
                        ? static_cast<int>(localElems)
                        : printCount;

  // Prints the first `shown` host elements on rank 0 under the given label.
  auto printHead = [&](const char* label) {
    if (rank != 0)
      return;
    std::printf("%s[0..%d] =", label, shown - 1);
    for (int i = 0; i < shown; ++i)
      std::printf(" (%g,%g)", host[i].x, host[i].y);
    std::printf("\n");
  };

  // Forward FFT
  (void)fft.execute(shafft::FFTDirection::FORWARD);
  (void)fft.normalize();

  // Retrieve spectrum. The current buffer pointers are re-queried from the
  // plan before copying back (presumably data/work may be swapped by the
  // library — verify against the SHAFFT documentation).
  (void)fft.getBuffers(&dData, &dWork);
  (void)hipMemcpy(host.data(), dData, localElems * sizeof(hipComplex), hipMemcpyDeviceToHost);
  printHead("Spectrum");

  // Backward FFT
  (void)fft.setBuffers(dData, dWork);
  (void)fft.execute(shafft::FFTDirection::BACKWARD);
  (void)fft.normalize();

  // Retrieve result
  (void)fft.getBuffers(&dData, &dWork);
  (void)hipMemcpy(host.data(), dData, localElems * sizeof(hipComplex), hipMemcpyDeviceToHost);
  printHead("Result");

  // Cleanup
  (void)hipFree(dData);
  (void)hipFree(dWork);
  fft.release();
  MPI_Finalize();
  return 0;
}
N-dimensional distributed FFT plan with RAII semantics.
Definition shafft.hpp:51
int init(const std::vector< int > &commDims, const std::vector< size_t > &dimensions, FFTType type, MPI_Comm comm, TransformLayout output=TransformLayout::REDISTRIBUTED) noexcept
Initialize plan with Cartesian process grid.
int normalize() noexcept override
Apply symmetric normalization (1/sqrt(N) per transform).
int plan() noexcept override
Create backend FFT plans.
int execute(FFTDirection direction) noexcept override
Execute the FFT.
size_t allocSize() const noexcept override
Get required buffer size in complex elements.
int getBuffers(complexf **data, complexf **work) noexcept
Retrieve current buffer pointers.
void release() noexcept override
Release all internal resources.
int setBuffers(complexf *data, complexf *work) noexcept
Attach data and work buffers.
int configurationND(const std::vector< size_t > &size, FFTType precision, std::vector< int > &commDims, int &nda, std::vector< size_t > &subsize, std::vector< size_t > &offset, int &commSize, DecompositionStrategy strategy, size_t memLimit, MPI_Comm comm)
Compute process grid and local layout for N-D distributed FFT.
@ C2C
Single-precision complex-to-complex (float).
@ MINIMIZE_NDA
Minimize distributed axes.
@ BACKWARD
Backward/inverse transform (frequency to time domain).
@ FORWARD
Forward transform (time to frequency domain).