Program Listing for File CUDAAgent.cu

↰ Return to documentation for file (src/flamegpu/simulation/detail/CUDAAgent.cu)
#include "flamegpu/simulation/detail/CUDAAgent.h"

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <fstream>
#include <string>
#include <filesystem>
#include <vector>
#include <unordered_map>
#include <utility>
#include <list>
#include <memory>

#ifdef _MSC_VER
#pragma warning(push, 1)
#pragma warning(disable : 4706 4834)
#endif  // _MSC_VER
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress 1719
#else
#pragma diag_suppress 1719
#endif  // __NVCC_DIAG_PRAGMA_SUPPORT__
#include <cub/cub.cuh>
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_default 1719
#else
#pragma diag_default 1719
#endif  // __NVCC_DIAG_PRAGMA_SUPPORT__
#ifdef _MSC_VER
#pragma warning(pop)
#endif  // _MSC_VER

#include "flamegpu/version.h"
#include "flamegpu/simulation/detail/CUDAFatAgent.h"
#include "flamegpu/simulation/detail/CUDAAgentStateList.h"
#include "flamegpu/simulation/detail/CUDAErrorChecking.cuh"
#include "flamegpu/simulation/CUDASimulation.h"

#include "flamegpu/model/AgentDescription.h"
#include "flamegpu/model/AgentFunctionDescription.h"
#include "flamegpu/runtime/detail/curve/HostCurve.cuh"
#include "flamegpu/runtime/detail/curve/curve_rtc.cuh"
#include "flamegpu/simulation/detail/CUDAScatter.cuh"
#include "flamegpu/detail/compute_capability.cuh"
#include "flamegpu/util/nvtx.h"
#include "flamegpu/runtime/agent/DeviceAgentVector_impl.h"
#include "flamegpu/detail/cuda.cuh"
#include "flamegpu/simulation/detail/CUDAEnvironmentDirectedGraphBuffers.cuh"

namespace flamegpu {
namespace detail {

CUDAAgent::CUDAAgent(const AgentData& description, const CUDASimulation &_cudaSimulation)
    : agent_description(description)  // This is a master agent, so it must create a new fat_agent
    , fat_agent(std::make_shared<CUDAFatAgent>(agent_description))  // if we create fat agent, we're index 0
    , fat_index(0)
    , cudaSimulation(_cudaSimulation)
    , TOTAL_AGENT_VARIABLE_SIZE(calcTotalVarSize(description)) {
    // Generate state map from fat_agent
    auto fatstate_map = fat_agent->getStateMap(fat_index);
    for (auto &state : description.states) {
        // Find correct fat state
        auto fatstate = fatstate_map.at(state);
        // Construct a regular state map from this
        auto slimstate = std::make_shared<CUDAAgentStateList>(fatstate, *this, fat_index, agent_description);
        // Store in our map
        state_map.emplace(state, slimstate);
    }
}
CUDAAgent::CUDAAgent(
    const AgentData &description,
    const CUDASimulation &_cudaSimulation,
    const std::unique_ptr<CUDAAgent> &master_agent,
    const std::shared_ptr<SubAgentData> &mapping)
    : agent_description(description)
    , fat_agent(master_agent->getFatAgent())
    , fat_index(fat_agent->getMappedAgentCount())
    , cudaSimulation(_cudaSimulation)
    , TOTAL_AGENT_VARIABLE_SIZE(calcTotalVarSize(description)) {
    // This is next agent to be added to fat_agent, so it takes existing count
    // Pass required info, so fat agent can generate new buffers and mappings
    fat_agent->addSubAgent(agent_description, master_agent->getFatIndex(), mapping);
    // Generate state map from fat_agent
    auto fatstate_map = fat_agent->getStateMap(fat_index);
    for (auto &state : agent_description.states) {
        // Find correct fat state
        auto fatstate = fatstate_map.at(state);
        // Construct a regular state map from this
        auto slimstate = std::make_shared<CUDAAgentStateList>(fatstate, *this, fat_index, agent_description, mapping->states.find(state) != mapping->states.end(), mapping->variables);
        // Store in our map
        state_map.emplace(state, slimstate);
    }
}

void CUDAAgent::mapRuntimeVariables(const AgentFunctionData& func, const unsigned int instance_id) const {
    // check the cuda agent state map to find the correct state list for functions starting state
    auto sm = state_map.find(func.initial_state);

    if (sm == state_map.end()) {
        THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found "
            "in CUDAAgent::mapRuntimeVariables()",
            agent_description.name.c_str(), func.initial_state.c_str());
    }

    const unsigned int agent_count = this->getStateSize(func.initial_state);
    // loop through the agents variables to map each variable name using cuRVE
    for (const auto &mmp : agent_description.variables) {
        // get a device pointer for the agent variable name
        void* d_ptr = sm->second->getVariablePointer(mmp.first);

        // @todo These two blocks are grim, we keep using getRTCHeader() or getCurve(), which does a map lookup

        // Map variables to agent function (these must be mapped before each function execution as the runtime pointer may have changed to the swapping)
        if (!func.rtc_func_name.empty()) {
            auto& rtc_header = getRTCHeader(func.name);
            memcpy(rtc_header.getAgentVariableCachePtr(mmp.first.c_str()), &d_ptr, sizeof(void*));
        } else {
            auto& curve = getCurve(func.name);
            curve.setAgentVariable(mmp.first, d_ptr, agent_count);
        }

        // Map variables to agent function conditions (these must be mapped before each function execution as the runtime pointer may have changed to the swapping)
        if (!func.rtc_func_condition_name.empty()) {
            auto& rtc_header = getRTCHeader(func.name + "_condition");
            memcpy(rtc_header.getAgentVariableCachePtr(mmp.first.c_str()), &d_ptr, sizeof(void*));
            rtc_header.setAgentVariableCount(mmp.first, agent_count);
        } else if (func.condition) {
            auto& curve = getCurve(func.name + "_condition");
            curve.setAgentVariable(mmp.first, d_ptr, agent_count);
        }
    }
}

void CUDAAgent::setPopulationData(const AgentVector& population, const std::string& state_name, CUDAScatter& scatter, const unsigned int streamId, const cudaStream_t stream) {
    // Validate agent state
    auto our_state = state_map.find(state_name);
    if (our_state == state_map.end()) {
        if (state_name == ModelData::DEFAULT_STATE) {
            THROW exception::InvalidAgentState("Agent '%s' does not use the default state, so the state must be passed explicitly, "
                "in CUDAAgent::setPopulationData()",
                population.getAgentName().c_str());
        } else {
            THROW exception::InvalidAgentState("State '%s' was not found in agent '%s', "
                "in CUDAAgent::setPopulationData()",
                state_name.c_str(), population.getAgentName().c_str());
        }
    }
    // Copy population data
    // This call hierarchy validates agent desc matches
    our_state->second->setAgentData(population, scatter, streamId, stream);
    fat_agent->markIDsUnset();
    // Validate that there are no ID collisions
    validateIDCollisions(stream);
}
void CUDAAgent::getPopulationData(AgentVector& population, const std::string& state_name) const {
    // Validate agent state
    auto our_state = state_map.find(state_name);
    if (our_state == state_map.end()) {
        if (state_name == ModelData::DEFAULT_STATE) {
            THROW exception::InvalidAgentState("Agent '%s' does not use the default state, so the state must be passed explicitly, "
                "in CUDAAgent::getPopulationData()",
                state_name.c_str(), population.getAgentName().c_str());
        } else {
            THROW exception::InvalidAgentState("State '%s' was not found in agent '%s', "
                "in CUDAAgent::getPopulationData()",
                state_name.c_str(), population.getAgentName().c_str());
        }
    }
    // Copy population data
    // This call hierarchy validates agent desc matches
    our_state->second->getAgentData(population);
}
__global__ void generateCollisionFlags(const id_t* d_sortedKeys, id_t* d_flagsOut, unsigned int threads, id_t UNSET_FLAG) {
    const unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < threads) {
        const id_t my_id = d_sortedKeys[id];
        if (my_id != UNSET_FLAG && my_id == d_sortedKeys[id+1]) {
            assert(UNSET_FLAG == 0);
            d_flagsOut[id] = 1;  // my_id; // any non-0 value basically
        }
    }
}
void CUDAAgent::validateIDCollisions(cudaStream_t stream) const {
    flamegpu::util::nvtx::Range range{"CUDAAgent::validateIDCollisions"};
    // All data is on device, so use a device technique to check for collisions
    // Sort agent IDs, have a simple kernel check for neighbouring ID collisions to set a flag
    // Scan that flag
    // This could be improved by reusing buffers from elsewhere (e.g. StreamResources), rather than making temporary allocations for each method call
    // However, I'm also concerned that a model with agents added to multiple states and no agent birth would then pre-allocate larger buffers than required during execution

    // First count total agents across all states
    unsigned int agentCount = 0;
    for (const auto &s : state_map) {
        agentCount += s.second->getSize();
    }
    if (!agentCount) return;
    // Allocate buffers we will use
    id_t * d_keysIn = nullptr, *d_keysOut = nullptr;
    gpuErrchk(cudaMalloc(&d_keysIn, sizeof(id_t) * agentCount));
    gpuErrchk(cudaMalloc(&d_keysOut, sizeof(id_t) * agentCount));
    // Copy agent IDs to keysIn buff
    ptrdiff_t buffOffset = 0;
    for (const auto& s : state_map) {
        const unsigned int t_size = s.second->getSize();
        gpuErrchk(cudaMemcpyAsync(d_keysIn + buffOffset, s.second->getVariablePointer(ID_VARIABLE_NAME), t_size * sizeof(id_t), cudaMemcpyDeviceToDevice, stream));
        buffOffset += t_size;
    }
    // Sort agent ids into d_keysOut
    void* d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;
    gpuErrchk(cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keysIn, d_keysOut, agentCount, 0, sizeof(id_t) * 8, stream));
    gpuErrchk(cudaMalloc(&d_temp_storage, temp_storage_bytes));
    gpuErrchk(cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keysIn, d_keysOut, agentCount, 0, sizeof(id_t) * 8, stream));
    // Reset d_keysIn
    gpuErrchk(cudaMemsetAsync(d_keysIn, 0, sizeof(id_t) * agentCount, stream));
    // Launch a kernel to set flags if keys overlap their neighbour
    const unsigned int blockSize = 1024;
    const unsigned int blocks = ((agentCount-1) / blockSize) + 1;
    generateCollisionFlags<<<blocks, blockSize, 0, stream>>>(d_keysOut, d_keysIn, agentCount-1, ID_NOT_SET);
    gpuErrchkLaunch();
    // Check whether any flags were set
    size_t temp_storage_bytes2 = 0;
    gpuErrchk(cub::DeviceReduce::Sum(nullptr, temp_storage_bytes2, d_keysIn, d_keysOut, agentCount - 1, stream));
    if (temp_storage_bytes2 > temp_storage_bytes) {
        gpuErrchk(flamegpu::detail::cuda::cudaFree(d_temp_storage));
        temp_storage_bytes = temp_storage_bytes2;
        gpuErrchk(cudaMalloc(&d_temp_storage, temp_storage_bytes));
    }
    gpuErrchk(cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_keysIn, d_keysOut, agentCount - 1, stream));
    id_t flagsSet = 0;
    gpuErrchk(cudaMemcpyAsync(&flagsSet, d_keysOut, sizeof(id_t), cudaMemcpyDeviceToHost, stream));
    // Cleanup
    gpuErrchk(flamegpu::detail::cuda::cudaFree(d_temp_storage));
    gpuErrchk(flamegpu::detail::cuda::cudaFree(d_keysIn));
    gpuErrchk(flamegpu::detail::cuda::cudaFree(d_keysOut));
    if (flagsSet) {
        THROW exception::AgentIDCollision("%u agents of type '%s' share an ID with another agent of the same type, "
            "you may need to explicitly reset agent IDs for 1 or more populations before adding them to the CUDASimulation, "
            "in CUDAAgent::validateIDCollisions()\n",
            static_cast<unsigned int>(flagsSet), agent_description.name.c_str());
    }
    gpuErrchk(cudaStreamSynchronize(stream));
}
unsigned int CUDAAgent::getStateSize(const std::string &state) const {
    // check the cuda agent state map to find the correct state list for functions starting state
    const auto &sm = state_map.find(state);

    if (sm == state_map.end()) {
        THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found, "
            "in CUDAAgent::getStateSize()",
            agent_description.name.c_str(), state.c_str());
    }
    return sm->second->getSize();
}
unsigned int CUDAAgent::getStateAllocatedSize(const std::string &state) const {
    // check the cuda agent state map to find the correct state list for functions starting state
    const auto &sm = state_map.find(state);

    if (sm == state_map.end()) {
        THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found, "
            "in CUDAAgent::getStateAllocatedSize()",
            agent_description.name.c_str(), state.c_str());
    }
    return sm->second->getAllocatedSize();
}
void CUDAAgent::resizeState(const std::string& state, const unsigned int minimumSize, const bool retainData, const cudaStream_t stream) {
    // check the cuda agent state map to find the correct state list
    const auto& sm = state_map.find(state);

    if (sm == state_map.end()) {
        THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found, "
            "in CUDAAgent::getStateAllocatedSize()",
            agent_description.name.c_str(), state.c_str());
    }
    sm->second->resize(minimumSize, retainData, stream);
}

void CUDAAgent::setStateAgentCount(const std::string& state, const unsigned int newSize) {
    // check the cuda agent state map to find the correct state list
    const auto& sm = state_map.find(state);

    if (sm == state_map.end()) {
        THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found, "
            "in CUDAAgent::getStateAllocatedSize()",
            agent_description.name.c_str(), state.c_str());
    }
    sm->second->setAgentCount(newSize);
}
CAgentDescription CUDAAgent::getAgentDescription() const {
    return CAgentDescription(agent_description.shared_from_this());
}
void *CUDAAgent::getStateVariablePtr(const std::string &state_name, const std::string &variable_name) {
    // check the cuda agent state map to find the correct state list for functions starting state
    const auto &sm = state_map.find(state_name);

    if (sm == state_map.end()) {
        THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found, "
            "in CUDAAgent::getStateVariablePtr()",
            agent_description.name.c_str(), state_name.c_str());
    }
    return sm->second->getVariablePointer(variable_name);
}
void CUDAAgent::processDeath(const AgentFunctionData& func, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
    // Optionally process agent death
    if (func.has_agent_death) {
        // Agent death operates on all mapped vars, so handled by fat agent
        fat_agent->processDeath(fat_index, func.initial_state, scatter, streamId, stream);
    }
}
void CUDAAgent::transitionState(const std::string &_src, const std::string &_dest, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
    // All mapped vars need to transition too, so handled by fat agent
    fat_agent->transitionState(fat_index, _src, _dest, scatter, streamId, stream);
}
void CUDAAgent::processFunctionCondition(const AgentFunctionData& func, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
    // Optionally process function condition
    if ((func.condition) || (!func.rtc_func_condition_name.empty())) {
        // Agent function condition operates on all mapped vars, so handled by fat agent
        fat_agent->processFunctionCondition(fat_index, func.initial_state, scatter, streamId, stream);
    }
}
void CUDAAgent::scatterHostCreation(const std::string &state_name, const unsigned int newSize, char *const d_inBuff, const VarOffsetStruct &offsets, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
    auto sm = state_map.find(state_name);
    if (sm == state_map.end()) {
        THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found "
            "in CUDAAgent::scatterHostCreation()",
            agent_description.name.c_str(), state_name.c_str());
    }
    sm->second->scatterHostCreation(newSize, d_inBuff, offsets, scatter, streamId, stream);
}
void CUDAAgent::scatterSort_async(const std::string &state_name, detail::CUDAScatter &scatter, unsigned int streamId, cudaStream_t stream) {
    auto sm = state_map.find(state_name);
    if (sm == state_map.end()) {
        THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found "
            "in CUDAAgent::scatterHostCreation()",
            agent_description.name.c_str(), state_name.c_str());
    }
    sm->second->scatterSort_async(scatter, streamId, stream);
}
void CUDAAgent::mapNewRuntimeVariables_async(const CUDAAgent& func_agent, const AgentFunctionData& func, unsigned int maxLen, detail::CUDAScatter &scatter, unsigned int instance_id, cudaStream_t stream, unsigned int streamId) {
    // Confirm agent output is set
    if (auto oa = func.agent_output.lock()) {
        // check the cuda agent state map to find the correct state list for functions starting state
        auto sm = state_map.find(func.agent_output_state);

        if (sm == state_map.end()) {
            THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found "
                "in CUDAAgent::mapNewRuntimeVariables()",
                agent_description.name.c_str(), func.agent_output_state.c_str());
        }
        // Notify scan flag that it might need resizing
        // We need a 3rd array, because a function might combine agent birth, agent death and message output
        scatter.Scan().resize(maxLen, CUDAScanCompaction::AGENT_OUTPUT, streamId);
        // Ensure the scan flag is zeroed
        scatter.Scan().zero_async(CUDAScanCompaction::AGENT_OUTPUT, stream, streamId);

        // Request a buffer for new
        char *d_new_buffer = static_cast<char*>(fat_agent->allocNewBuffer(TOTAL_AGENT_VARIABLE_SIZE, maxLen, agent_description.variables.size()));

        // Store buffer so we can release it later
        {
            std::lock_guard<std::mutex> guard(newBuffsMutex);
            const auto rtn = newBuffs.emplace(func.initial_state, d_new_buffer);
            if (!rtn.second) {
                assert(false);  // Insertion happened (false if element already exists)
            }
        }

        // Init the buffer to default values for variables
        scatter.broadcastInit_async(
            streamId,
            stream,
            agent_description.variables,
            d_new_buffer,
            maxLen, 0);
        // No sync, use of the buffer should be in the same stream

        // loop through the agents variables to map each variable name using cuRVE
        // these must be mapped before each function execution as the runtime pointer may have changed to the swapping
        for (const auto &mmp : agent_description.variables) {
            // get the agent variable size
            const size_t type_size = mmp.second.type_size * mmp.second.elements;

            // get a device pointer for the agent variable name
            void* d_ptr = d_new_buffer;

            // Move the pointer along for next variable
            d_new_buffer += type_size * maxLen;

            // 64 bit align the new buffer start
            if (reinterpret_cast<size_t>(d_new_buffer)%8) {
                d_new_buffer += 8 - (reinterpret_cast<size_t>(d_new_buffer)%8);
            }

            // maximum population num
            if (func.func) {
                auto& curve = func_agent.getCurve(func.name);  // @todo stop map hammering
                curve.setAgentOutputVariable(mmp.first, d_ptr, maxLen);
            } else  {
                auto& rtc_header = func_agent.getRTCHeader(func.name);
                memcpy(rtc_header.getNewAgentVariableCachePtr(mmp.first.c_str()), &d_ptr, sizeof(void*));
                rtc_header.setNewAgentVariableCount(mmp.first, maxLen);
            }
        }
    }
}
void CUDAAgent::releaseNewBuffer(const AgentFunctionData& func) {
    // Confirm agent output is set
    if (auto oa = func.agent_output.lock()) {
        // Release new buffer
        {
            std::lock_guard<std::mutex> guard(newBuffsMutex);
            const auto d_buff = newBuffs.find(func.initial_state);
            if (d_buff != newBuffs.end()) {
                fat_agent->freeNewBuffer(d_buff->second);
                newBuffs.erase(d_buff);
            } else {
                assert(false);  // We don't have a new buffer reserved???
            }
        }
    }
}

void CUDAAgent::scatterNew(const AgentFunctionData& func, const unsigned int newSize, detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
    // Confirm agent output is set
    if (auto oa = func.agent_output.lock()) {
        auto sm = state_map.find(func.agent_output_state);
        if (sm == state_map.end()) {
            THROW exception::InvalidStateName("Agent '%s' does not contain state '%s', "
                "in CUDAAgent::scatterNew()\n",
                agent_description.name.c_str(), func.agent_output_state.c_str());
        }
        // Find new buffer
        void *newBuff = nullptr;
        {
            std::lock_guard<std::mutex> guard(newBuffsMutex);
            const auto d_buff = newBuffs.find(func.initial_state);
            if (d_buff != newBuffs.end()) {
                newBuff = d_buff->second;
            }
        }
        if (!newBuff) {
            THROW exception::InvalidAgentFunc("New buffer not present for function within init state: %s,"
                " in CUDAAgent::scatterNew()\n",
                func.initial_state.c_str());
        }
        unsigned int new_births = sm->second->scatterNew(newBuff, newSize, scatter, streamId, stream);
        fat_agent->notifyDeviceBirths(new_births);
    }
}
void CUDAAgent::clearFunctionCondition(const std::string &state) {
    fat_agent->setConditionState(fat_index, state, 0);
}

void CUDAAgent::addInstantitateRTCFunction(const AgentFunctionData& func, const std::shared_ptr<EnvironmentManager> &env, std::shared_ptr<const detail::CUDAMacroEnvironment> macro_env,
    const std::unordered_map<std::string, std::shared_ptr<CUDAEnvironmentDirectedGraphBuffers>>& directed_graphs, bool function_condition) {
    // Generate the dynamic curve header
    std::shared_ptr<detail::curve::CurveRTCHost> &curve_header = rtc_header_map.emplace(function_condition ? func.name + "_condition" : func.name, std::make_shared<detail::curve::CurveRTCHost>()).first->second;

    // set agent function variables in rtc curve
    for (const auto& mmp : func.parent.lock()->variables) {
        curve_header->registerAgentVariable(mmp.first.c_str(), mmp.second.type.name(), mmp.second.type_size, mmp.second.elements);
    }

    // for normal agent function (e.g. not an agent function condition) append messages and agent outputs
    if (!function_condition) {
        // Set input message variables in curve
        if (auto im = func.message_input.lock()) {
            for (auto message_in_var : im->variables) {
                // register message variables using combined hash
                curve_header->registerMessageInVariable(message_in_var.first.c_str(),
                message_in_var.second.type.name(), message_in_var.second.type_size, message_in_var.second.elements, true, false);
            }
        }
        // Set output message variables in curve
        if (auto om = func.message_output.lock()) {
            for (auto message_out_var : om->variables) {
                // register message variables using combined hash
                curve_header->registerMessageOutVariable(message_out_var.first.c_str(),
                message_out_var.second.type.name(), message_out_var.second.type_size, message_out_var.second.elements, false, true);
            }
        }
        // Set agent output variables in curve
        if (auto ao = func.agent_output.lock()) {
            for (auto agent_out_var : ao->variables) {
                // register message variables using combined hash
                curve_header->registerNewAgentVariable(agent_out_var.first.c_str(),
                agent_out_var.second.type.name(), agent_out_var.second.type_size, agent_out_var.second.elements, false, true);
            }
        }
    }

    // Set environment properties in curve (this includes mapped properties)
    {
        const auto &prop_map = env->getPropertiesMap();
        for (const auto &p : prop_map) {
            const char* variableName = p.first.c_str();
            const char* type = p.second.type.name();
            const unsigned int elements = p.second.elements;
            const ptrdiff_t offset = p.second.offset;
            curve_header->registerEnvVariable(variableName, offset, type, p.second.length/elements, elements);
         }
    }

    // Set Environment macro properties in curve
    macro_env->mapRTCVariables(*curve_header);

    // Set the agent name/state
    curve_header->registerAgent(this->agent_description.name, func.initial_state);

    // Set Environment directed graph properties in curve
    {
        for (const auto& dg : directed_graphs) {
            for (const auto& v : dg.second->getDescription().vertexProperties) {
                curve_header->registerEnvironmentDirectedGraphVertexProperty(dg.first, v.first, v.second.type.name(), v.second.type_size, v.second.elements);
            }
            for (const auto& e : dg.second->getDescription().edgeProperties) {
                curve_header->registerEnvironmentDirectedGraphEdgeProperty(dg.first, e.first, e.second.type.name(), e.second.type_size, e.second.elements);
            }
            curve_header->registerEnvironmentDirectedGraphVertexProperty(dg.first, GRAPH_VERTEX_PBM_VARIABLE_NAME, std::type_index(typeid(unsigned int)).name(), sizeof(unsigned int), 1);
            curve_header->registerEnvironmentDirectedGraphVertexProperty(dg.first, GRAPH_VERTEX_IPBM_VARIABLE_NAME, std::type_index(typeid(unsigned int)).name(), sizeof(unsigned int), 1);
            curve_header->registerEnvironmentDirectedGraphVertexProperty(dg.first, GRAPH_VERTEX_IPBM_EDGES_VARIABLE_NAME, std::type_index(typeid(unsigned int)).name(), sizeof(unsigned int), 1);
            curve_header->registerEnvironmentDirectedGraphVertexProperty(dg.first, GRAPH_VERTEX_INDEX_MAP_VARIABLE_NAME, std::type_index(typeid(unsigned int)).name(), sizeof(unsigned int), 1);
            dg.second->registerCurveInstance(curve_header);
        }
    }

    std::string header_filename = std::string(func.rtc_func_name).append("_impl");
    if (function_condition)
        header_filename.append("_condition");
    header_filename.append("_curve_rtc_dynamic.h");
    curve_header->setFileName(header_filename);

    // get the dynamically generated header from curve rtc
    const std::string curve_dynamic_header = curve_header->getDynamicHeader(env->getBufferLen());

    // output to disk if FLAMEGPU_OUTPUT_RTC_DYNAMIC_FILES macro is set
#ifdef FLAMEGPU_OUTPUT_RTC_DYNAMIC_FILES
        // create string for agent function implementation
        std::string func_impl = std::string(func.rtc_func_name).append("_impl");
        // curve
        std::ofstream file_curve_rtc_header;
        std::string file_curve_rtc_header_filename = func_impl.c_str();
        if (function_condition)
            file_curve_rtc_header_filename.append("_condition");
        file_curve_rtc_header_filename.append("_curve_rtc_dynamic.h");
        file_curve_rtc_header.open(file_curve_rtc_header_filename);
        // Remove first line as it is the filename, which misaligns profiler
        std::string out_s = curve_dynamic_header;
        out_s.erase(0, out_s.find("\n") + 1);
        file_curve_rtc_header << out_s;
        file_curve_rtc_header.close();
        // agent function
        std::ofstream agent_function_file;
        std::string agent_function_filename = func_impl.c_str();
        if (function_condition)
            agent_function_filename.append("_condition");
        agent_function_filename.append(".cu");
        agent_function_file.open(agent_function_filename);
        // Remove first line as it is the filename, which misaligns profiler
        out_s = func.rtc_source;
        out_s.erase(0, out_s.find("\n") + 1);
        agent_function_file << out_s;
        agent_function_file.close();
#endif

    detail::JitifyCache &jitify = detail::JitifyCache::getInstance();
    // switch between normal agent function and agent function condition
    if (!function_condition) {
        const std::string t_func_impl = std::string(func.rtc_func_name).append("_impl");
        const std::vector<std::string> template_args = { t_func_impl.c_str(), func.message_in_type.c_str(), func.message_out_type.c_str() };
        auto kernel_inst = jitify.loadKernel(func.rtc_func_name, template_args, func.rtc_source, curve_dynamic_header);
        // add kernel instance to map
        rtc_func_map.insert(CUDARTCFuncMap::value_type(func.name, std::move(kernel_inst)));
    } else {
        const std::string t_func_impl = std::string(func.rtc_func_condition_name).append("_cdn_impl");
        const std::vector<std::string> template_args = { t_func_impl.c_str() };
        auto kernel_inst = jitify.loadKernel(func.rtc_func_name + "_condition", template_args, func.rtc_condition_source, curve_dynamic_header);
        // add kernel instance to map
        rtc_func_map.insert(CUDARTCFuncMap::value_type(func.name + "_condition", std::move(kernel_inst)));
    }
}

void CUDAAgent::addInstantitateFunction(const AgentFunctionData& func, const std::shared_ptr<EnvironmentManager>& env, std::shared_ptr<const detail::CUDAMacroEnvironment> macro_env, const std::unordered_map<std::string, std::shared_ptr<CUDAEnvironmentDirectedGraphBuffers>>& directed_graphs, bool function_condition) {
    // Generate the host curve instance
    std::shared_ptr<detail::curve::HostCurve> curve = std::make_shared<detail::curve::HostCurve>();

    // Initialising values here, removes the need to "unregister" curve values
    // set agent variables in curve
    for (const auto& mmp : func.parent.lock()->variables) {
        curve->registerAgentVariable(mmp.first, mmp.second.type, mmp.second.type_size, mmp.second.elements);
    }

    // for normal agent function (e.g. not an agent function condition) append messages and agent outputs
    if (!function_condition) {
        // Set input message variables in curve
        if (auto im = func.message_input.lock()) {
            for (auto message_in_var : im->variables) {
                curve->registerMessageInputVariable(message_in_var.first, message_in_var.second.type, message_in_var.second.type_size, message_in_var.second.elements);
            }
        }
        // Set output message variables in curve
        if (auto om = func.message_output.lock()) {
            for (auto message_out_var : om->variables) {
                curve->registerMessageOutputVariable(message_out_var.first, message_out_var.second.type, message_out_var.second.type_size, message_out_var.second.elements);
            }
        }
        // Set agent output variables in curve
        if (auto ao = func.agent_output.lock()) {
            for (auto agent_out_var : ao->variables) {
                curve->registerAgentOutputVariable(agent_out_var.first, agent_out_var.second.type, agent_out_var.second.type_size, agent_out_var.second.elements);
            }
        }
    }

    // Set environment properties in curve (this includes mapped properties)
    {
        const auto& prop_map = env->getPropertiesMap();
        for (const auto& p : prop_map) {
            const unsigned int elements = p.second.elements;
            curve->registerSetEnvironmentProperty(p.first, p.second.type, p.second.length / elements, elements, p.second.offset);
        }
    }

    // Set Environment macro properties in curve
    macro_env->registerCurveVariables(*curve);

    // Set directed graphs in curve
    {
        for (const auto &dg : directed_graphs) {
            for (const auto &v : dg.second->getDescription().vertexProperties) {
                curve->registerEnvironmentDirectedGraphVertexProperty(dg.first, v.first, v.second.type, v.second.type_size, v.second.elements);
            }
            for (const auto& e : dg.second->getDescription().edgeProperties) {
                curve->registerEnvironmentDirectedGraphEdgeProperty(dg.first, e.first, e.second.type, e.second.type_size, e.second.elements);
            }
            curve->registerEnvironmentDirectedGraphVertexProperty(dg.first, GRAPH_VERTEX_PBM_VARIABLE_NAME, std::type_index(typeid(unsigned int)), sizeof(unsigned int), 1);
            curve->registerEnvironmentDirectedGraphVertexProperty(dg.first, GRAPH_VERTEX_IPBM_VARIABLE_NAME, std::type_index(typeid(unsigned int)), sizeof(unsigned int), 1);
            curve->registerEnvironmentDirectedGraphVertexProperty(dg.first, GRAPH_VERTEX_IPBM_EDGES_VARIABLE_NAME, std::type_index(typeid(unsigned int)), sizeof(unsigned int), 1);
            curve->registerEnvironmentDirectedGraphVertexProperty(dg.first, GRAPH_VERTEX_INDEX_MAP_VARIABLE_NAME, std::type_index(typeid(unsigned int)), sizeof(unsigned int), 1);

            dg.second->registerCurveInstance(curve);
        }
    }

    // switch between normal agent function and agent function condition, and add to map
    const std::string key_name = function_condition ? func.name + "_condition" : func.name;
    curve_map.insert(std::unordered_map<std::string, std::shared_ptr<detail::curve::HostCurve>>::value_type(key_name, std::move(curve)));
}

const jitify::experimental::KernelInstantiation& CUDAAgent::getRTCInstantiation(const std::string &function_name) const {
    CUDARTCFuncMap::const_iterator mm = rtc_func_map.find(function_name);
    if (mm == rtc_func_map.end()) {
        THROW exception::InvalidAgentFunc("Function name '%s' is not a runtime compiled agent function in agent '%s', "
            "in CUDAAgent::getRTCInstantiation()\n",
            function_name.c_str(), agent_description.name.c_str());
    }

    return *mm->second;
}
detail::curve::CurveRTCHost& CUDAAgent::getRTCHeader(const std::string &function_name) const {
    CUDARTCHeaderMap::const_iterator mm = rtc_header_map.find(function_name);
    if (mm == rtc_header_map.end()) {
        THROW exception::InvalidAgentFunc("Function name '%s' is not a runtime compiled agent function in agent '%s', "
            "in CUDAAgent::getRTCHeader()\n",
            function_name.c_str(), agent_description.name.c_str());
    }

    return *mm->second;
}
detail::curve::HostCurve& CUDAAgent::getCurve(const std::string &function_name) const {
    auto mm = curve_map.find(function_name);
    if (mm == curve_map.end()) {
        THROW exception::InvalidAgentFunc("Function name '%s' is not a (non-rtc) agent function in agent '%s', "
            "in CUDAAgent::getCurve()\n",
            function_name.c_str(), agent_description.name.c_str());
    }

    return *mm->second;
}

const CUDAAgent::CUDARTCFuncMap& CUDAAgent::getRTCFunctions() const {
    return rtc_func_map;
}

void CUDAAgent::initUnmappedVars(detail::CUDAScatter &scatter, const unsigned int streamId, const cudaStream_t stream) {
    for (auto &s : state_map) {
        s.second->initUnmappedVars(scatter, streamId, stream);
    }
}
void CUDAAgent::initExcludedVars(const std::string &state, const unsigned int count, const unsigned int offset, CUDAScatter& scatter, const unsigned int streamId, const cudaStream_t stream) {
    // check the cuda agent state map to find the correct state list
    const auto& sm = state_map.find(state);

    if (sm == state_map.end()) {
        THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found, "
            "in CUDAAgent::initUnmappedVars()",
            agent_description.name.c_str(), state.c_str());
    }
    sm->second->initExcludedVars(count, offset, scatter, streamId, stream);
}
void CUDAAgent::cullUnmappedStates() {
    unsigned int i = 0;
    for (auto &s : state_map) {
        if (!s.second->getIsSubStatelist()) {
            s.second->clear();
            ++i;
        }
    }
    if (i == state_map.size())
        fat_agent->resetIDCounter();
}
void CUDAAgent::cullAllStates() {
    for (auto &s : state_map) {
        s.second->clear();
    }
    fat_agent->resetIDCounter();
}
std::list<std::shared_ptr<VariableBuffer>> CUDAAgent::getUnboundVariableBuffers(const std::string& state) {
    const auto& sm = state_map.find(state);

    if (sm == state_map.end()) {
        THROW exception::InvalidCudaAgentState("Error: Agent ('%s') state ('%s') was not found, "
            "in CUDAAgent::getUnboundVariableBuffers()",
            agent_description.name.c_str(), state.c_str());
    }
    return sm->second->getUnboundVariableBuffers();
}
id_t CUDAAgent::nextID(unsigned int count) {
    return fat_agent->nextID(count);
}
id_t* CUDAAgent::getDeviceNextID() {
    return fat_agent->getDeviceNextID();
}
void CUDAAgent::assignIDs(HostAPI& hostapi, detail::CUDAScatter &scatter, cudaStream_t stream, const unsigned int streamId) {
    fat_agent->assignIDs(hostapi, scatter, stream, streamId);
}

void CUDAAgent::setPopulationVec(const std::string& state_name, const std::shared_ptr<DeviceAgentVector_impl>& d_vec) {
    population_dvec[state_name] = d_vec;
}
std::shared_ptr<DeviceAgentVector_impl> CUDAAgent::getPopulationVec(const std::string& state_name) {
    auto find = population_dvec.find(state_name);
    if (find != population_dvec.end())
        return find->second;
    return nullptr;
}
void CUDAAgent::resetPopulationVecs() {
    for (auto &vec : population_dvec) {
        if (vec.second) {
            vec.second->syncChanges();
            vec.second.reset();
        }
    }
    population_dvec.clear();
}

}  // namespace detail
}  // namespace flamegpu