Program Listing for File MPILinearAlg.hpp
↰ Return to documentation for file (pennylane_lightning/core/src/utils/cuda_utils/MPILinearAlg.hpp)
#pragma once
#include <functional>
#include <memory>
#include <mutex>
#include <cuComplex.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cusparse_v2.h>
#include "DataBuffer.hpp"
#include "cuError.hpp"
#include "CSRMatrix.hpp"
#include "LinearAlg.hpp"
#include "MPIManager.hpp"
using namespace Pennylane::LightningGPU::MPI;
namespace Pennylane::LightningGPU::Util {
template <class index_type, class Precision, class CFP_t, class DevTypeID = int>
inline void SparseMV_cuSparseMPI(
    MPIManager &mpi_manager, const std::size_t &length_local,
    const index_type *csrOffsets_ptr, const int64_t csrOffsets_size,
    const index_type *columns_ptr, const std::complex<Precision> *values_ptr,
    CFP_t *X, CFP_t *Y, DevTypeID device_id, cudaStream_t stream_id,
    cusparseHandle_t handle) {
    std::vector<std::vector<CSRMatrix<Precision, index_type>>> csrmatrix_blocks;
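    // Only rank 0 holds the full matrix; it splits the CSR data into
    // per-rank blocks that are scattered below.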
    if (mpi_manager.getRank() == 0) {
        csrmatrix_blocks = splitCSRMatrix<Precision, index_type>(
            mpi_manager, static_cast<std::size_t>(csrOffsets_size - 1),
            csrOffsets_ptr, columns_ptr, values_ptr);
    }
    mpi_manager.Barrier();
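    // Scatter the i-th set of blocks from the root so each rank receives
    // its local sub-matrix for that block row.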
    std::vector<CSRMatrix<Precision, index_type>> localCSRMatVector;
    for (size_t i = 0; i < mpi_manager.getSize(); i++) {
        auto localCSRMat = scatterCSRMatrix<Precision, index_type>(
            mpi_manager, csrmatrix_blocks[i], length_local, 0);
        localCSRMatVector.push_back(localCSRMat);
    }
    mpi_manager.Barrier();
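    // Device buffer holding this rank's partial product for one block row.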
    DataBuffer<CFP_t, int> d_res_per_block{length_local, device_id, stream_id,
                                           true};
    for (size_t i = 0; i < mpi_manager.getSize(); i++) {
        // Need to investigate whether non-blocking MPI operations can
        // improve performance here.
        auto &localCSRMatrix = localCSRMatVector[i];
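        // color = 1 marks ranks participating in this round's reduction:
        // those with a non-empty local block, plus the destination rank i
        // below.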
        std::size_t color = 0;
        if (localCSRMatrix.getValues().size() != 0) {
            d_res_per_block.zeroInit();
            color = 1;
            SparseMV_cuSparse<index_type, Precision, CFP_t>(
                localCSRMatrix.getCsrOffsets().data(),
                static_cast<int64_t>(localCSRMatrix.getCsrOffsets().size()),
                localCSRMatrix.getColumns().data(),
                localCSRMatrix.getValues().data(),
                static_cast<int64_t>(localCSRMatrix.getValues().size()), X,
                d_res_per_block.getData(), device_id, stream_id, handle);
        }
        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
        mpi_manager.Barrier();
        if (mpi_manager.getRank() == i) {
            color = 1;
            if (localCSRMatrix.getValues().size() == 0) {
                d_res_per_block.zeroInit();
            }
        }
        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
        mpi_manager.Barrier();
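        // Group the participating ranks into their own sub-communicator for
        // the reduction.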
        auto new_mpi_manager = mpi_manager.split(color, mpi_manager.getRank());
        int reduce_root_rank = -1;
        if (mpi_manager.getRank() == i) {
            reduce_root_rank = new_mpi_manager.getRank();
        }
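        // Broadcast the reduction root's rank within the sub-communicator
        // (global rank i owns block row i's result).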
        mpi_manager.Bcast<int>(reduce_root_rank, i);
        if (new_mpi_manager.getComm() != MPI_COMM_NULL) {
            new_mpi_manager.Reduce<CFP_t>(d_res_per_block.getData(), Y,
                                          length_local, reduce_root_rank,
                                          "sum");
        }
        PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
        mpi_manager.Barrier();
    }
}
} // namespace Pennylane::LightningGPU::Util
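Below is a minimal usage sketch, not part of the header above. It is a hypothetical driver (the function name and parameter names are illustrative) that applies a rank-0-resident CSR operator to a distributed state vector. It assumes an already-constructed MPIManager, device buffers d_X/d_Y of length_local elements resident on each GPU, host CSR arrays valid on rank 0, and it omits error checking for brevity.

// Usage sketch only; not part of MPILinearAlg.hpp.
#include <complex>
#include <cusparse_v2.h>
#include "MPILinearAlg.hpp"

void apply_csr_operator(Pennylane::LightningGPU::MPI::MPIManager &mpi_manager,
                        std::size_t length_local, const int *offsets_h,
                        int64_t offsets_size, const int *columns_h,
                        const std::complex<double> *values_h,
                        cuDoubleComplex *d_X, cuDoubleComplex *d_Y,
                        int device_id, cudaStream_t stream) {
    cusparseHandle_t handle;
    cusparseCreate(&handle); // error checking omitted
    // Y <- A * X: A is scattered from rank 0; each block-row product is
    // reduced onto the rank that owns that slice of the state vector.
    Pennylane::LightningGPU::Util::SparseMV_cuSparseMPI<int, double,
                                                        cuDoubleComplex>(
        mpi_manager, length_local, offsets_h, offsets_size, columns_h,
        values_h, d_X, d_Y, device_id, stream, handle);
    cusparseDestroy(handle);
}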