Program Listing for File CUDAEnvironmentDirectedGraphBuffers.cu

Return to documentation for file (src/flamegpu/simulation/detail/CUDAEnvironmentDirectedGraphBuffers.cu)

#include "flamegpu/simulation/detail/CUDAEnvironmentDirectedGraphBuffers.cuh"

#include <algorithm>

#include "flamegpu/simulation/detail/CUDAAgent.h"
#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
#include "flamegpu/simulation/detail/CUDAScatter.cuh"
#include "flamegpu/runtime/detail/curve/HostCurve.cuh"
#include "flamegpu/detail/cuda.cuh"
#ifdef _MSC_VER
#pragma warning(push, 1)
#pragma warning(disable : 4706 4834)
#endif  // _MSC_VER
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress 1719
#else
#pragma diag_suppress 1719
#endif  // __NVCC_DIAG_PRAGMA_SUPPORT__
#include <cub/cub.cuh>

#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_default 1719
#else
#pragma diag_default 1719
#endif  // __NVCC_DIAG_PRAGMA_SUPPORT__
#ifdef _MSC_VER
#pragma warning(pop)
#endif  // _MSC_VER

namespace flamegpu {
namespace detail {

CUDAEnvironmentDirectedGraphBuffers::CUDAEnvironmentDirectedGraphBuffers(const EnvironmentDirectedGraphData& description)
    : graph_description(description)
    , vertex_count(0)
    , edge_count(0)
    , requires_rebuild(false) {
    // Initialise the maps
    for (const auto &v : graph_description.vertexProperties)
        vertex_buffers.emplace(v.first, Buffer{v.second.type_size * v.second.elements});
    for (const auto& e : graph_description.edgeProperties)
        edge_buffers.emplace(e.first, Buffer{e.second.type_size * e.second.elements});
}
CUDAEnvironmentDirectedGraphBuffers::~CUDAEnvironmentDirectedGraphBuffers() {
    deallocateVertexBuffers();
    vertex_buffers.clear();
    deallocateEdgeBuffers();
    edge_buffers.clear();
}
void CUDAEnvironmentDirectedGraphBuffers::registerCurveInstance(const std::shared_ptr<detail::curve::HostCurve>& curve) {
    curve_instances.push_back(std::weak_ptr(curve));
}
void CUDAEnvironmentDirectedGraphBuffers::registerCurveInstance(const std::shared_ptr<detail::curve::CurveRTCHost>& curve) {
    rtc_curve_instances.push_back(std::weak_ptr(curve));
}
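// Allocates device and host buffers for every vertex property, plus the PBM/IPBM offset arrays,
// and publishes the new device pointers and counts to each registered curve / RTC curve instance.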
void CUDAEnvironmentDirectedGraphBuffers::allocateVertexBuffers(const size_type count, const cudaStream_t stream) {
    for (auto& v : graph_description.vertexProperties) {
        auto &vb = vertex_buffers.at(v.first);
        if (!vb.d_ptr) {
            gpuErrchk(cudaMalloc(&vb.d_ptr, count * v.second.type_size * v.second.elements));
            // gpuErrchk(cudaMalloc(&vb.d_ptr_swap, count * v.second.type_size * v.second.elements));  // Todo: required?
            for (const auto & _curve : curve_instances) {
                if (const auto curve = _curve.lock())
                    curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, v.first, vb.d_ptr, count);
            }
            for (const auto& _curve : rtc_curve_instances) {
                if (const auto curve = _curve.lock()) {
                    memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, v.first), &vb.d_ptr, sizeof(void*));
                    curve->setEnvironmentDirectedGraphVertexPropertyCount(graph_description.name, v.first, count);
                }
            }
        } else {
            THROW exception::UnknownInternalError("Device buffer already allocated, in CUDAEnvironmentDirectedGraphBuffers::allocateVertexBuffers()");
        }
        if (!vb.h_ptr) {
            vb.h_ptr = malloc(count * v.second.type_size * v.second.elements);
        } else {
            THROW exception::UnknownInternalError("Host buffer already allocated, in CUDAEnvironmentDirectedGraphBuffers::allocateVertexBuffers()");
        }
        vb.ready = Buffer::Both;
    }
    // Min length 4, as d_pbm_swap is also reused as a small error-counter buffer when building the graph
    gpuErrchk(cudaMalloc(&d_pbm, sizeof(unsigned int) * std::max<int>(count + 1, 4)));
    gpuErrchk(cudaMalloc(&d_pbm_swap, sizeof(unsigned int) * std::max<int>(count + 1, 4)));
    gpuErrchk(cudaMalloc(&d_ipbm, sizeof(unsigned int) * std::max<int>(count + 1, 4)));
    // Initialise the PBMs in case the graph contains no edges
    gpuErrchk(cudaMemsetAsync(d_pbm, 0, (count + 1) * sizeof(unsigned int), stream));
    gpuErrchk(cudaMemsetAsync(d_ipbm, 0, (count + 1) * sizeof(unsigned int), stream));
    for (const auto& _curve : curve_instances) {
        if (const auto curve = _curve.lock()) {
            curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME, d_pbm, 1);
            curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_IPBM_VARIABLE_NAME, d_ipbm, 1);
            curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_IPBM_EDGES_VARIABLE_NAME, d_ipbm, 1);  // IPBM needs to point somewhere
            curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_INDEX_MAP_VARIABLE_NAME, d_vertex_index_map, 1);  // ID map needs to point somewhere
        }
    }
    for (const auto& _curve : rtc_curve_instances) {
        if (const auto curve = _curve.lock()) {
            memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME), &d_pbm, sizeof(void*));
            memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_IPBM_VARIABLE_NAME), &d_ipbm, sizeof(void*));
            memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_IPBM_EDGES_VARIABLE_NAME), &d_ipbm, sizeof(void*));  // IPBM needs to point somewhere
            memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_INDEX_MAP_VARIABLE_NAME), &d_ipbm, sizeof(void*));  // ID map needs to point somewhere
            curve->setEnvironmentDirectedGraphVertexPropertyCount(graph_description.name, GRAPH_VERTEX_INDEX_MAP_VARIABLE_NAME, 1);  // 1 because offset is packed at the end
        }
    }
    vertex_count = count;
}
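// Allocates device and host buffers (plus device-side swap buffers) for every edge property, along with
// the key/value scratch arrays used to sort edges, and publishes the new pointers to curve.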
void CUDAEnvironmentDirectedGraphBuffers::allocateEdgeBuffers(const size_type count) {
    for (auto& e : graph_description.edgeProperties) {
        auto& eb = edge_buffers.at(e.first);
        if (!eb.d_ptr) {
            gpuErrchk(cudaMalloc(&eb.d_ptr, count * e.second.type_size * e.second.elements));
            gpuErrchk(cudaMalloc(&eb.d_ptr_swap, count * e.second.type_size * e.second.elements));
            for (const auto& _curve : curve_instances) {
                if (const auto curve = _curve.lock())
                    curve->setEnvironmentDirectedGraphEdgeProperty(graph_description.name, e.first, eb.d_ptr, count);
            }
            for (const auto& _curve : rtc_curve_instances) {
                if (const auto curve = _curve.lock()) {
                    memcpy(curve->getEnvironmentDirectedGraphEdgePropertyCachePtr(graph_description.name, e.first), &eb.d_ptr, sizeof(void*));
                    curve->setEnvironmentDirectedGraphEdgePropertyCount(graph_description.name, e.first, count);
                }
            }
        } else {
            THROW exception::UnknownInternalError("Device buffer already allocated, in CUDAEnvironmentDirectedGraphBuffers::allocateEdgeBuffers()");
        }
        if (!eb.h_ptr) {
            eb.h_ptr = malloc(count * e.second.type_size * e.second.elements);
        } else {
            THROW exception::UnknownInternalError("Host buffer already allocated, in CUDAEnvironmentDirectedGraphBuffers::allocateEdgeBuffers()");
        }
        eb.ready = Buffer::Both;
    }
    gpuErrchk(cudaMalloc(&d_keys, sizeof(uint64_t) * count));
    gpuErrchk(cudaMalloc(&d_keys_swap, sizeof(uint64_t) * count));
    gpuErrchk(cudaMalloc(&d_vals, sizeof(uint32_t) * (count + 1)));
    gpuErrchk(cudaMalloc(&d_vals_swap, sizeof(uint32_t) * (count + 1)));
    gpuErrchk(cudaMalloc(&d_ipbm_edges, sizeof(uint32_t) * (count + 1)));
    for (const auto& _curve : curve_instances) {
        if (const auto curve = _curve.lock()) {
            curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_IPBM_EDGES_VARIABLE_NAME, d_ipbm_edges, 1);
        }
    }
    for (const auto& _curve : rtc_curve_instances) {
        if (const auto curve = _curve.lock()) {
            memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_IPBM_EDGES_VARIABLE_NAME), &d_ipbm_edges, sizeof(void*));
            curve->setEnvironmentDirectedGraphVertexPropertyCount(graph_description.name, GRAPH_VERTEX_INDEX_MAP_VARIABLE_NAME, 1);  // 1 because offset is packed at the end
        }
    }
    edge_count = count;
}
void CUDAEnvironmentDirectedGraphBuffers::deallocateVertexBuffers() {
    for (auto& v : vertex_buffers) {
        if (v.second.d_ptr) {
            gpuErrchk(flamegpu::detail::cuda::cudaFree(v.second.d_ptr));
            gpuErrchk(flamegpu::detail::cuda::cudaFree(v.second.d_ptr_swap));
            v.second.d_ptr = nullptr;
        }
        if (v.second.h_ptr) {
            free(v.second.h_ptr);
            v.second.h_ptr = nullptr;
        }
    }
    if (d_pbm) {
        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_pbm));
        d_pbm = nullptr;
    }
    if (d_pbm_swap) {
        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_pbm_swap));
        d_pbm_swap = nullptr;
    }
    if (d_ipbm) {
        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_ipbm));
        d_ipbm = nullptr;
    }
    if (d_vertex_index_map) {
        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_vertex_index_map));
        d_vertex_index_map = nullptr;
    }
    vertex_count = 0;
    h_vertex_index_map.clear();
}
void CUDAEnvironmentDirectedGraphBuffers::deallocateEdgeBuffers() {
    for (auto& e : edge_buffers) {
        if (e.second.d_ptr) {
            gpuErrchk(flamegpu::detail::cuda::cudaFree(e.second.d_ptr));
            gpuErrchk(flamegpu::detail::cuda::cudaFree(e.second.d_ptr_swap));
            e.second.d_ptr = nullptr;
        }
        if (e.second.h_ptr) {
            free(e.second.h_ptr);
            e.second.h_ptr = nullptr;
        }
    }
    if (d_keys) {
        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_keys));
        d_keys = nullptr;
    }
    if (d_vals) {
        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_vals));
        d_vals = nullptr;
    }
    if (d_keys_swap) {
        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_keys_swap));
        d_keys_swap = nullptr;
    }
    if (d_vals_swap) {
        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_vals_swap));
        d_vals_swap = nullptr;
    }
    if (d_ipbm_edges) {
        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_ipbm_edges));
        d_ipbm_edges = nullptr;
    }
    edge_count = 0;
    h_edge_index_map.clear();
}

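// Resizes vertex storage (releasing any existing buffers first) and default-initialises the host copy of
// every vertex property, leaving the device copies marked out of date until the next sync.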
void CUDAEnvironmentDirectedGraphBuffers::setVertexCount(const size_type count, const cudaStream_t stream) {
    if (vertex_count) {
        deallocateVertexBuffers();
    }
    allocateVertexBuffers(count, stream);
    // Default Init host, mark device out of date
    for (auto& v : graph_description.vertexProperties) {
        auto& vb = vertex_buffers.at(v.first);
        vb.ready = Buffer::Host;
        if (v.first == ID_VARIABLE_NAME) {  // ID needs default 0
            memset(vb.h_ptr, ID_NOT_SET, vertex_count * v.second.type_size * v.second.elements);
            continue;
        }
        // Possibly faster if we checked default_value == 0 and memset, but awkward with vague type and lack of template
        for (unsigned int i = 0; i < vertex_count; ++i) {
            // Copy the property's default value into each element of the host buffer
            memcpy(static_cast<char*>(vb.h_ptr) + i * v.second.type_size * v.second.elements, v.second.default_value, v.second.type_size * v.second.elements);
        }
    }
    // Vertex data has been reset, so ID bounds are no longer valid
    resetVertexIDBounds();
}

void CUDAEnvironmentDirectedGraphBuffers::setEdgeCount(const size_type count) {
    if (edge_count)
        deallocateEdgeBuffers();
    allocateEdgeBuffers(count);
    // Default Init host, mark device out of date
    for (auto& e : graph_description.edgeProperties) {
        auto& eb = edge_buffers.at(e.first);
        eb.ready = Buffer::Host;
        // Possibly faster if we checked default_value == 0 and memset, but awkward with vague type and lack of template
        for (unsigned int i = 0; i < edge_count; ++i) {
            // Copy the property's default value into each element of the host buffer
            memcpy(static_cast<char*>(eb.h_ptr) + i * e.second.type_size * e.second.elements, e.second.default_value, e.second.type_size * e.second.elements);
        }
    }
}
id_t* CUDAEnvironmentDirectedGraphBuffers::getVertexIDBuffer(const cudaStream_t stream) {
    size_type element_ct = 1;
    return getVertexPropertyBuffer<id_t>(ID_VARIABLE_NAME, element_ct, stream);
}

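// Packs each edge's destination and source vertex indices into the low and high halves of a 64-bit key
// (vals records the original edge index), so that a single radix sort over the uint64 keys orders edges
// primarily by source and secondarily by destination on little-endian devices; the _inverted variant
// swaps the halves so the same sort groups edges by destination instead.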
__global__ void fillKVPairs(uint32_t *keys, uint32_t *vals, const unsigned int *srcdest, unsigned int count, const unsigned int *idMap, const unsigned int id_offset) {
    unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (index < count) {
        // To subsort by destination too, we treat the pair of uint32 as a uint64
        keys[index * 2 + 0] = idMap[srcdest[index * 2 + 0] - id_offset];
        keys[index * 2 + 1] = idMap[srcdest[index * 2 + 1] - id_offset];
        vals[index] = index;
    }
}
__global__ void fillKVPairs_inverted(uint32_t* keys, uint32_t* vals, const unsigned int* srcdest, unsigned int count, const unsigned int *idMap, const unsigned int id_offset) {
    unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (index < count) {
        // To subsort by destination too, we treat the pair of uint32 as a uint64
        // To invert we must switch the order of the contained uint32's
        keys[index * 2 + 0] = idMap[srcdest[index * 2 + 1] - id_offset];
        keys[index * 2 + 1] = idMap[srcdest[index * 2 + 0] - id_offset];
        vals[index] = index;
    }
}
__global__ void findBinStart(unsigned int *pbm, uint64_t* keys, unsigned int edge_count, unsigned int vertex_count) {
    unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (index < edge_count) {
        // Bins correspond to the primary sort key (the high 32 bits of each 64-bit key)
        const uint32_t my_vertex = reinterpret_cast<const uint32_t*>(keys)[(index * 2)+1];
        if (index == 0 || my_vertex != reinterpret_cast<const uint32_t*>(keys)[((index - 1) * 2)+1]) {
            // Store the Index of the first edges for the corresponding vertex
            pbm[my_vertex] = index;
        }
    }
    // 1 thread must init the final cell of the PBM
    if (index == 0) {
        pbm[vertex_count] = edge_count;
    }
}
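// Minimal reverse iterator over an unsigned int array, providing only the operations cub::DeviceScan
// needs; it is used below to run an inclusive min-scan backwards over the PBM, so vertices with no
// edges inherit the offset of the next non-empty vertex.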
struct ReverseIterator {
    using difference_type = unsigned int;
    using value_type = unsigned int;
    using pointer = unsigned int*;
    using reference = unsigned int&;
    using iterator_category = std::random_access_iterator_tag;
    __host__ __device__ explicit ReverseIterator(unsigned int* _p) : p(_p) { }

    // __device__ ReverseIterator& operator=(const ReverseIterator& other) = default;
    __device__ ReverseIterator operator++ (int a) { p -= a;  return *this; }
    __device__ ReverseIterator operator++ () { p--;  return *this; }
    __device__ unsigned int &operator *() const { return *p; }
    __device__ ReverseIterator operator+(const int& b) const { return ReverseIterator(p - b); }
    __device__ unsigned int &operator[](int b) const { return *(p-b); }
    unsigned int* p;
};
// Borrowed from CUB DeviceScan docs
struct CustomMin {
    template <typename T>
    CUB_RUNTIME_FUNCTION __forceinline__
        T operator()(const T& a, const T& b) const {
        return (b < a) ? b : a;
    }
};
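// Builds the dense vertex ID -> index map: each thread writes its index to indexOut[ID - vertex_id_min]
// via atomicExch (a non-sentinel return value indicates a duplicate ID), and tallies collisions,
// out-of-range IDs and unset IDs in error_count[0..2] for the host to report after the launch.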
__global__ void buildIDMap(const id_t *IDsIn, unsigned int *indexOut, const unsigned int count, unsigned int *error_count, unsigned int vertex_id_min, unsigned int vertex_id_max) {
    const unsigned int thread_index = blockIdx.x * blockDim.x + threadIdx.x;
    if (thread_index < count) {
        id_t my_thread_id = IDsIn[thread_index];
        // Skip IDs that weren't set
        if (my_thread_id == ID_NOT_SET) {
            atomicInc(error_count + 2, UINT_MAX);
            return;
        }
        if (vertex_id_min <= my_thread_id && my_thread_id <= vertex_id_max) {
            const unsigned int rtn = atomicExch(indexOut + my_thread_id - vertex_id_min, thread_index);
            if (rtn != 0xffffffff) {
                // Report ID collision
                atomicInc(error_count + 0, UINT_MAX);
            }
        } else {
            // Report out of range ID (this should not happen, it's an internal error if it does)
            atomicInc(error_count + 1, UINT_MAX);
        }
    }
}
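// Checks every edge's source/destination IDs against the ID -> index map, counting unset and invalid
// IDs separately for sources and destinations so the host can raise a precise exception.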
__global__ void validateSrcDest(id_t *edgeSrcDest, unsigned int *idMap, const unsigned int edge_count, unsigned int *errors, unsigned int vertex_id_min, unsigned int vertex_id_max) {
    const unsigned int thread_index = blockIdx.x * blockDim.x + threadIdx.x;
    if (thread_index < edge_count) {
        const id_t my_src_id = edgeSrcDest[thread_index * 2 + 1];
        const id_t my_dest_id = edgeSrcDest[thread_index * 2 + 0];
        if (my_src_id == ID_NOT_SET) {
            atomicInc(errors + 0, UINT_MAX);
        } else if (vertex_id_min <= my_src_id && my_src_id <= vertex_id_max) {
            if (idMap[my_src_id - vertex_id_min] == 0xffffffff) {
                atomicInc(errors + 2, UINT_MAX);
            }
        } else {
            atomicInc(errors + 2, UINT_MAX);
        }
        if (my_dest_id == ID_NOT_SET) {
            atomicInc(errors + 1, UINT_MAX);
        } else if (vertex_id_min <= my_dest_id && my_dest_id <= vertex_id_max) {
            if (idMap[my_dest_id - vertex_id_min] == 0xffffffff) {
                atomicInc(errors + 3, UINT_MAX);
            }
        } else {
            atomicInc(errors + 3, UINT_MAX);
        }
    }
}
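// Rewrites each edge's source/destination in-place from vertex IDs to vertex indices using the ID ->
// index map; validation has already been performed, so no error checking is done here.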
__global__ void translateSrcDest(id_t *edgeSrcDest, unsigned int *idMap, const unsigned int edge_count, unsigned int *errors, unsigned int vertex_id_min, unsigned int vertex_id_max) {
    const unsigned int thread_index = blockIdx.x * blockDim.x + threadIdx.x;
    if (thread_index < edge_count) {
        const id_t my_src_id = edgeSrcDest[thread_index * 2 + 1];
        const id_t my_dest_id = edgeSrcDest[thread_index * 2 + 0];
        const unsigned int src_id = idMap[my_src_id - vertex_id_min];
        const unsigned int dest_id = idMap[my_dest_id - vertex_id_min];
        edgeSrcDest[thread_index * 2 + 1] = src_id;
        edgeSrcDest[thread_index * 2 + 0] = dest_id;
    }
}
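// Pushes any host-side property updates to the device and, when the graph structure has changed, rebuilds
// everything required for device-side traversal: the vertex ID -> index map, the CSR/PBM used by
// edgesLeaving(), the CSC/inverted PBM used by edgesJoining(), and the host edge (src, dest) -> index map.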
void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter& scatter, const unsigned int streamID, const cudaStream_t stream) {
    // Copy variable buffers to device
    if (vertex_count) {
        for (auto& v : graph_description.vertexProperties) {
            auto& vb = vertex_buffers.at(v.first);
            if (vb.ready == Buffer::Host) {
                gpuErrchk(cudaMemcpyAsync(vb.d_ptr, vb.h_ptr, vertex_count * v.second.type_size * v.second.elements, cudaMemcpyHostToDevice, stream));
                vb.ready = Buffer::Both;
            }
        }
    }
    if (edge_count) {
        for (auto& e : graph_description.edgeProperties) {
            auto& eb = edge_buffers.at(e.first);
            if (eb.ready == Buffer::Host) {
                gpuErrchk(cudaMemcpyAsync(eb.d_ptr, eb.h_ptr, edge_count * e.second.type_size * e.second.elements, cudaMemcpyHostToDevice, stream));
                eb.ready = Buffer::Both;
            }
        }
    }
    if (requires_rebuild && vertex_count && edge_count) {
        // Construct the vertex ID : index map
        {
            if (vertex_id_min == std::numeric_limits<unsigned int>::max() || vertex_id_max == std::numeric_limits<unsigned int>::min()) {
                THROW flamegpu::exception::IDOutOfBounds("No IDs have been set, in CUDAEnvironmentDirectedGraphBuffers::syncDevice_async()");
            }
            const unsigned int ID_RANGE = 1 + vertex_id_max - vertex_id_min;
            if (ID_RANGE < vertex_count) {
                THROW flamegpu::exception::IDNotSet("Not all vertices have been assigned a unique ID, in CUDAEnvironmentDirectedGraphBuffers::syncDevice_async()");
            }
            if (d_vertex_index_map) {
                gpuErrchk(flamegpu::detail::cuda::cudaFree(d_vertex_index_map));
            }
            if (cudaMalloc(&d_vertex_index_map, sizeof(unsigned int) * (ID_RANGE + 1)) != cudaSuccess) {
                THROW flamegpu::exception::OutOfMemory("Out of memory when allocating ID->index map, vertex IDs cover too wide a range (%u), consider using contiguous IDs, in CUDAEnvironmentDirectedGraphBuffers::syncDevice_async()", ID_RANGE);
            }
            // Copy the offset to the end of the map
            gpuErrchk(cudaMemcpyAsync(d_vertex_index_map + ID_RANGE, &vertex_id_min, sizeof(unsigned int), cudaMemcpyHostToDevice, stream));
            // Add the ID->index map var to curve
            for (const auto& _curve : curve_instances) {
                if (const auto curve = _curve.lock())
                    curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_INDEX_MAP_VARIABLE_NAME, d_vertex_index_map, ID_RANGE + 1);  // +1 because offset is packed at the end
            }
            for (const auto& _curve : rtc_curve_instances) {
                if (const auto curve = _curve.lock()) {
                    memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_INDEX_MAP_VARIABLE_NAME), &d_vertex_index_map, sizeof(void*));
                    curve->setEnvironmentDirectedGraphVertexPropertyCount(graph_description.name, GRAPH_VERTEX_INDEX_MAP_VARIABLE_NAME, ID_RANGE + 1);  // +1 because offset is packed at the end
                }
            }
            {  // Build the map
                const auto& v_id_b = vertex_buffers.at(ID_VARIABLE_NAME);
                gpuErrchk(cudaMemsetAsync(d_vertex_index_map, 0xffffffff, ID_RANGE * sizeof(unsigned int), stream));
                gpuErrchk(cudaMemsetAsync(d_pbm_swap, 0, 3 * sizeof(unsigned int), stream));  // We will use spare pbm_swap to count errors, save allocating more memory
                const unsigned int BLOCK_SZ = 512;
                const unsigned int BLOCK_CT = static_cast<unsigned int>(ceil(vertex_count / static_cast<float>(BLOCK_SZ)));
                buildIDMap << <BLOCK_CT, BLOCK_SZ, 0, stream >> > (static_cast<id_t*>(v_id_b.d_ptr), d_vertex_index_map, vertex_count, d_pbm_swap, vertex_id_min, vertex_id_max);
                gpuErrchkLaunch();
                unsigned int err_collision_range[3];
                gpuErrchk(cudaMemcpyAsync(err_collision_range, d_pbm_swap, 3 * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream));
                gpuErrchk(cudaStreamSynchronize(stream));
                if (err_collision_range[2] > 0) {
                    THROW flamegpu::exception::IDNotSet("Graph contains %u vertices which have not had their ID set, in CUDAEnvironmentDirectedGraphBuffers::syncDevice_async()", err_collision_range[2]);
                } else if (err_collision_range[0] > 0) {
                    THROW flamegpu::exception::IDCollision("Graph contains invalid vertex IDs, %u vertices reported ID collisions, vertex IDs must be unique or unset, in CUDAEnvironmentDirectedGraphBuffers::syncDevice_async()", err_collision_range[0]);
                } else if (err_collision_range[1] > 0) {
                    THROW flamegpu::exception::UnknownInternalError("Graph contains invalid vertex IDs, %u vertices reported an ID that does not satisfy %u < ID < %u, in CUDAEnvironmentDirectedGraphBuffers::syncDevice_async()", err_collision_range[1], vertex_id_min, vertex_id_max);
                }
            }
            {  // Validate that edge source/dest pairs correspond to valid IDs
                const auto& e_srcdest_b = edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME);
                gpuErrchk(cudaMemsetAsync(d_pbm_swap, 0, 4 * sizeof(unsigned int), stream));  // We will use spare pbm_swap to count errors, save allocating more memory
                const unsigned int BLOCK_SZ = 512;
                const unsigned int BLOCK_CT = static_cast<unsigned int>(ceil(edge_count / static_cast<float>(BLOCK_SZ)));
                validateSrcDest << <BLOCK_CT, BLOCK_SZ, 0, stream >> > (static_cast<id_t*>(e_srcdest_b.d_ptr), d_vertex_index_map, edge_count, d_pbm_swap, vertex_id_min, vertex_id_max);
                gpuErrchkLaunch();
                unsigned int err_collision_range[4];  // {src_notset, dest_notset, src_invalid, dest_invalid}
                gpuErrchk(cudaMemcpyAsync(err_collision_range, d_pbm_swap, 4 * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream));
                gpuErrchk(cudaStreamSynchronize(stream));
                if (err_collision_range[0] > 0 || err_collision_range[1] > 0) {
                    THROW flamegpu::exception::IDNotSet("Graph contains %u and %u edges which have not had their source and destinations set respectively, in CUDAEnvironmentDirectedGraphBuffers::syncDevice_async()", err_collision_range[0], err_collision_range[1]);
                } else if (err_collision_range[2] > 0 || err_collision_range[3] > 0) {
                    THROW flamegpu::exception::InvalidID("Graph contains %u and %u edges which have invalid source and destinations set respectively, in CUDAEnvironmentDirectedGraphBuffers::syncDevice_async()", err_collision_range[2], err_collision_range[3]);
                }
            }
        }
        // Rebuild the CSR/VBM (edgesLeaving())
        {
            // Fill Key/Val Pairs
            int blockSize;  // The launch configurator returned block size
            gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, fillKVPairs, 32, 0));  // Randomly 32
            int gridSize = (edge_count + blockSize - 1) / blockSize;  // Round up according to array size
            fillKVPairs << <gridSize, blockSize, 0, stream >> > (reinterpret_cast<uint32_t*>(d_keys), d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count, d_vertex_index_map, vertex_id_min);
            gpuErrchkLaunch();
            // Sort Key/Val Pairs according to src->dest
            auto& cub_temp = scatter.CubTemp(streamID);
            size_t temp_req = 0;
            gpuErrchk(cub::DeviceRadixSort::SortPairs(nullptr, temp_req, d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
            cub_temp.resize(temp_req);
            gpuErrchk(cub::DeviceRadixSort::SortPairs(cub_temp.getPtr(), cub_temp.getSize(), d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
            // Build PBM (For vertices with edges)
            gpuErrchk(cudaMemset(d_pbm, 0xffffffff, (vertex_count + 1) * sizeof(unsigned int)));
            gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, findBinStart, 32, 0));  // Randomly 32
            gridSize = (edge_count + blockSize - 1) / blockSize;  // Round up according to array size
            findBinStart << <gridSize, blockSize, 0, stream >> > (d_pbm, d_keys_swap, edge_count, vertex_count);
            gpuErrchkLaunch();
            // Build PBM (Fill vertices with no edges)
            temp_req = 0;
            gpuErrchk(cub::DeviceScan::InclusiveScan(nullptr, temp_req, ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
            cub_temp.resize(temp_req);
            gpuErrchk(cub::DeviceScan::InclusiveScan(cub_temp.getPtr(), cub_temp.getSize(), ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
            // Sort edge variables
            std::vector<detail::CUDAScatter::ScatterData> sd;
            for (auto& edge : edge_buffers) {
                edge.second.swap();
                sd.push_back(detail::CUDAScatter::ScatterData{edge.second.element_size, reinterpret_cast<char*>(edge.second.d_ptr_swap), reinterpret_cast<char*>(edge.second.d_ptr)});
            }
            scatter.scatterPosition_async(streamID, stream, d_vals_swap, sd, edge_count);
            // Swap all the swap pointers, so the junk data is in swap
            std::swap(d_keys, d_keys_swap);
            std::swap(d_vals, d_vals_swap);
            std::swap(d_pbm, d_pbm_swap);
            // Update which buffers curve points to
            for (auto& e : graph_description.edgeProperties) {
                auto& eb = edge_buffers.at(e.first);
                for (const auto& _curve : curve_instances) {
                    if (const auto curve = _curve.lock())
                        curve->setEnvironmentDirectedGraphEdgeProperty(graph_description.name, e.first, eb.d_ptr, edge_count);
                }
                for (const auto& _curve : rtc_curve_instances) {
                    if (const auto curve = _curve.lock())
                        memcpy(curve->getEnvironmentDirectedGraphEdgePropertyCachePtr(graph_description.name, e.first), &eb.d_ptr, sizeof(void*));
                }
                eb.ready = Buffer::Device;
            }
            for (const auto& _curve : curve_instances) {
                if (const auto curve = _curve.lock())
                    curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME, d_pbm, 1);
            }
            for (const auto& _curve : rtc_curve_instances) {
                if (const auto curve = _curve.lock())
                    memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME), &d_pbm, sizeof(void*));
            }
        }
        {  // Rebuild the CSC/Inverted VBM (edgesJoining())
            int blockSize;  // The launch configurator returned block size
            gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, fillKVPairs, 32, 0));  // Randomly 32
            int gridSize = (edge_count + blockSize - 1) / blockSize;  // Round up according to array size
            fillKVPairs_inverted << <gridSize, blockSize, 0, stream >> > (reinterpret_cast<uint32_t*>(d_keys), d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count, d_vertex_index_map, vertex_id_min);
            gpuErrchkLaunch();
            // Sort Key/Val Pairs according to dest->src
            // Cub temp has already been resized above
            auto& cub_temp = scatter.CubTemp(streamID);
            gpuErrchk(cub::DeviceRadixSort::SortPairs(cub_temp.getPtr(), cub_temp.getSize(), d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
            // Build inverted PBM (For vertices with edges)
            gpuErrchk(cudaMemset(d_ipbm, 0xffffffff, (vertex_count + 1) * sizeof(unsigned int)));
            gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, findBinStart, 32, 0));  // Randomly 32
            gridSize = (edge_count + blockSize - 1) / blockSize;  // Round up according to array size
            findBinStart << <gridSize, blockSize, 0, stream >> > (d_ipbm, d_keys_swap, edge_count, vertex_count);
            gpuErrchkLaunch();
            // Build inverted PBM (Fill vertices with no edges)
            gpuErrchk(cub::DeviceScan::InclusiveScan(cub_temp.getPtr(), cub_temp.getSize(), ReverseIterator(d_ipbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
            // Swap all the swap pointers, so the junk data is in swap
            std::swap(d_keys, d_keys_swap);
            std::swap(d_ipbm_edges, d_vals_swap);
            std::swap(d_ipbm, d_pbm_swap);
            // Update which buffers curve points to
            for (const auto& _curve : curve_instances) {
                if (const auto curve = _curve.lock()) {
                    curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_IPBM_VARIABLE_NAME, d_ipbm, 1);
                    curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_IPBM_EDGES_VARIABLE_NAME, d_ipbm_edges, 1);
                }
            }
            for (const auto& _curve : rtc_curve_instances) {
                if (const auto curve = _curve.lock()) {
                    memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_IPBM_VARIABLE_NAME), &d_ipbm, sizeof(void*));
                    memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_IPBM_EDGES_VARIABLE_NAME), &d_ipbm_edges, sizeof(void*));
                }
            }
        }
        {  // Translate edge source/dest pairs and validate that they correspond to valid IDs
            const auto& e_srcdest_b = edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME);
            e_srcdest_b.updateHostBuffer(edge_count, stream);  // Copy back to host, before we translate device IDs
            const unsigned int BLOCK_SZ = 512;
            const unsigned int BLOCK_CT = static_cast<unsigned int>(ceil(edge_count / static_cast<float>(BLOCK_SZ)));
            translateSrcDest << <BLOCK_CT, BLOCK_SZ, 0, stream >> > (static_cast<id_t*>(e_srcdest_b.d_ptr), d_vertex_index_map, edge_count, d_pbm_swap, vertex_id_min, vertex_id_max);
            gpuErrchkLaunch();
            // Rebuild the edge index map
            h_edge_index_map.clear();
            for (unsigned int i = 0; i < edge_count; ++i) {
                h_edge_index_map.emplace(std::pair{static_cast<id_t*>(e_srcdest_b.h_ptr)[i * 2 + 1], static_cast<id_t*>(e_srcdest_b.h_ptr)[i * 2 + 0]}, i);
            }
        }
        requires_rebuild = false;
    }
}

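// Lazily copies a buffer back to the host when only the device copy is current; the stream is
// synchronised so callers can read h_ptr immediately afterwards.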
void CUDAEnvironmentDirectedGraphBuffers::Buffer::updateHostBuffer(size_type edge_count, cudaStream_t stream) const {
    if (ready == Device) {
        gpuErrchk(cudaMemcpyAsync(h_ptr, d_ptr, edge_count * element_size, cudaMemcpyDeviceToHost, stream));
        gpuErrchk(cudaStreamSynchronize(stream));
        ready = Both;
    }
}
void CUDAEnvironmentDirectedGraphBuffers::resetVertexIDBounds() {
    vertex_id_min = std::numeric_limits<unsigned int>::max();
    vertex_id_max = std::numeric_limits<unsigned int>::min();
}
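// Assigns an ID to the vertex at the given index, keeping the host ID -> index map and the observed ID
// range (vertex_id_min/vertex_id_max) consistent; IDs already in use are rejected with an exception.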
void CUDAEnvironmentDirectedGraphBuffers::setVertexID(unsigned int vertex_index, id_t vertex_id, cudaStream_t stream) {
    if (vertex_index >= vertex_count) {
        THROW exception::OutOfBoundsException("Vertex index exceeds bounds %u >= %u, "
            "in CUDAEnvironmentDirectedGraphBuffers::setVertexID()\n", vertex_index, vertex_count);
    } else if (vertex_id == ID_NOT_SET) {
        THROW exception::IDOutOfBounds("Vertex ID of %u is not valid, "
            "in CUDAEnvironmentDirectedGraphBuffers::setVertexID()\n", ID_NOT_SET);
    }
    // Purge old vertex ID from host map
    auto& vb = vertex_buffers.at(ID_VARIABLE_NAME);
    vb.updateHostBuffer(vertex_count, stream);
    if (static_cast<id_t*>(vb.h_ptr)[vertex_index] != ID_NOT_SET) {
        h_vertex_index_map.erase(static_cast<id_t*>(vb.h_ptr)[vertex_index]);
    }

    // Add new vertex ID to host map (validate it's not already in use)
    const auto find = h_vertex_index_map.find(vertex_id);
    if (find != h_vertex_index_map.end()) {
        THROW exception::IDCollision("ID collision, %u has already been assigned to vertex at index %u, "
            "in CUDAEnvironmentDirectedGraphBuffers::setVertexID()\n", vertex_id, find->second);
    }
    h_vertex_index_map.emplace(vertex_id, vertex_index);

    // Update vertex's ID in buffer
    static_cast<id_t*>(vb.h_ptr)[vertex_index] = vertex_id;
    vb.ready = Buffer::Host;

    // Update range calc (naive, can be wrong if IDs are changed)
    vertex_id_min = std::min(vertex_id_min, vertex_id);
    vertex_id_max = std::max(vertex_id_max, vertex_id);
}
unsigned int CUDAEnvironmentDirectedGraphBuffers::getVertexIndex(id_t vertex_id) const {
    const auto find = h_vertex_index_map.find(vertex_id);
    if (find == h_vertex_index_map.end()) {
        THROW exception::InvalidID("No vertex found with ID %u, in CUDAEnvironmentDirectedGraphBuffers::getVertexIndex()\n", vertex_id);
    }
    return find->second;
}
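// Records an edge's (source, destination) ID pair in the host buffer and host edge map, then marks the
// graph for rebuild; only the host copy is touched, as the device copy stores vertex indices after a rebuild.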
void CUDAEnvironmentDirectedGraphBuffers::setEdgeSourceDestination(unsigned int edge_index, id_t src_vertex_id, id_t dest_vertex_id) {
    if (edge_index >= edge_count) {
        THROW exception::OutOfBoundsException("Edge index exceeds bounds %u >= %u, "
            "in CUDAEnvironmentDirectedGraphBuffers::setEdgeSourceDestination()\n", edge_index, edge_count);
    } else if (src_vertex_id == ID_NOT_SET) {
        THROW exception::IDOutOfBounds("Source vertex ID of %u is not valid, "
            "in CUDAEnvironmentDirectedGraphBuffers::setEdgeSourceDestination()\n", ID_NOT_SET);
    } else if (dest_vertex_id == ID_NOT_SET) {
        THROW exception::IDOutOfBounds("Destination vertex ID of %u is not valid, "
            "in CUDAEnvironmentDirectedGraphBuffers::setEdgeSourceDestination()\n", ID_NOT_SET);
    }
    // Purge old edge src/dest from host map
    auto& eb = edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME);
    // Don't need to update buffer, src_dest is not stored as ID on device
    id_t& edge_dest = static_cast<id_t*>(eb.h_ptr)[edge_index * 2 + 0];
    id_t& edge_src = static_cast<id_t*>(eb.h_ptr)[edge_index * 2 + 1];

    if (edge_src != ID_NOT_SET && edge_dest != ID_NOT_SET) {
        h_edge_index_map.erase({edge_src, edge_dest});
    } else if ((edge_src == ID_NOT_SET) ^ (edge_dest == ID_NOT_SET)) {
        THROW exception::UnknownInternalError("Edge found without both source and destination set, "
            "in CUDAEnvironmentDirectedGraphBuffers::setEdgeSourceDestination()\n");
    }

    // Add new edge ID to host map (validate it's not already in use)
    const auto find = h_edge_index_map.find({src_vertex_id, dest_vertex_id});
    if (find != h_edge_index_map.end()) {
        THROW exception::IDCollision("Edge collision, an edge has already been assigned source %u dest %u at index %u, "
            "in CUDAEnvironmentDirectedGraphBuffers::setEdgeSourceDestination()\n", src_vertex_id, dest_vertex_id, find->second);
    }
    h_edge_index_map.emplace(std::pair{src_vertex_id, dest_vertex_id}, edge_index);

    // Update edge's src dest in buffer
    edge_dest = dest_vertex_id;
    edge_src = src_vertex_id;
    eb.ready = Buffer::Host;

    // Require rebuild before use
    markForRebuild();
}
unsigned int CUDAEnvironmentDirectedGraphBuffers::getEdgeIndex(id_t src_vertex_id, id_t dest_vertex_id) const {
    const auto find = h_edge_index_map.find({src_vertex_id, dest_vertex_id});
    if (find == h_edge_index_map.end()) {
        THROW exception::InvalidID("No edge found with source %u, dest %u, in CUDAEnvironmentDirectedGraphBuffers::getEdgeIndex()\n", src_vertex_id, dest_vertex_id);
    }
    return find->second;
}

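// Returns the index of the vertex with the given ID, assigning the next unused index and recording the
// ID if it has not been seen before; throws once the allocated vertex count would be exceeded.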
unsigned int CUDAEnvironmentDirectedGraphBuffers::createIfNotExistVertex(id_t vertex_id, const cudaStream_t stream) {
    if (vertex_id == ID_NOT_SET) {
        THROW exception::IDOutOfBounds("Vertex ID of %u is not valid, "
            "in CUDAEnvironmentDirectedGraphBuffers::createIfNotExistVertex()\n", ID_NOT_SET);
    }
    const auto it = h_vertex_index_map.find(vertex_id);
    if (it != h_vertex_index_map.end()) {
        return it->second;
    }
    if (h_vertex_index_map.size() < vertex_count) {
        const unsigned int vertex_index = static_cast<unsigned int>(h_vertex_index_map.size());
        h_vertex_index_map.emplace(vertex_id, vertex_index);
        // Update vertex's ID in buffer
        auto& vb = vertex_buffers.at(ID_VARIABLE_NAME);
        vb.updateHostBuffer(vertex_count, stream);
        static_cast<id_t*>(vb.h_ptr)[vertex_index] = vertex_id;
        vb.ready = Buffer::Host;
        // Update range calc
        vertex_id_min = std::min(vertex_id_min, vertex_id);
        vertex_id_max = std::max(vertex_id_max, vertex_id);
        return vertex_index;
    }
    THROW exception::OutOfBoundsException("Creating vertex with ID %u would exceed available vertices (%u), "
        "in CUDAEnvironmentDirectedGraphBuffers::createIfNotExistVertex()\n", vertex_id, vertex_count);
}
unsigned int CUDAEnvironmentDirectedGraphBuffers::createIfNotExistEdge(id_t source_vertex_id, id_t dest_vertex_id, const cudaStream_t stream) {
    if (source_vertex_id == ID_NOT_SET || dest_vertex_id == ID_NOT_SET) {
        THROW exception::IDOutOfBounds("Vertex ID of %u is not valid, "
            "in CUDAEnvironmentDirectedGraphBuffers::createIfNotExistEdge()\n", ID_NOT_SET);
    }
    const auto it = h_edge_index_map.find({source_vertex_id, dest_vertex_id});
    if (it != h_edge_index_map.end()) {
        return it->second;
    }
    if (h_edge_index_map.size() < edge_count) {
        const unsigned int edge_index = static_cast<unsigned int>(h_edge_index_map.size());
        h_edge_index_map.emplace(std::pair{source_vertex_id, dest_vertex_id}, edge_index);
        // Update edge's source/destination in buffer
        auto& eb = edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME);
        static_cast<id_t*>(eb.h_ptr)[edge_index * 2 + 0] = dest_vertex_id;
        static_cast<id_t*>(eb.h_ptr)[edge_index * 2 + 1] = source_vertex_id;
        eb.ready = Buffer::Host;
        // Require rebuild before use
        markForRebuild();
        return edge_index;
    }
    THROW exception::OutOfBoundsException("Creating edge with src %u dest %u would exceed available edges (%u), "
        "in CUDAEnvironmentDirectedGraphBuffers::createIfNotExistEdge()\n", source_vertex_id, dest_vertex_id, vertex_count);
}
}  // namespace detail
}  // namespace flamegpu