Program Listing for File HostAgentAPI.cuh


#ifndef INCLUDE_FLAMEGPU_RUNTIME_AGENT_HOSTAGENTAPI_CUH_
#define INCLUDE_FLAMEGPU_RUNTIME_AGENT_HOSTAGENTAPI_CUH_
#ifdef _MSC_VER
#pragma warning(push, 1)
#pragma warning(disable : 4706 4834)
#endif  // _MSC_VER
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress 1719
#else
#pragma diag_suppress 1719
#endif  // __NVCC_DIAG_PRAGMA_SUPPORT__
#include <cub/cub.cuh>
#include <thrust/count.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_default 1719
#else
#pragma diag_default 1719
#endif  // __NVCC_DIAG_PRAGMA_SUPPORT__
#ifdef _MSC_VER
#pragma warning(pop)
#endif  // _MSC_VER

#include <algorithm>
#include <string>
#include <vector>
#include <functional>
#include <memory>
#include <utility>

#include "flamegpu/simulation/detail/AgentInterface.h"
#include "flamegpu/model/AgentDescription.h"
#include "flamegpu/runtime/HostAPI.h"
#include "flamegpu/simulation/CUDASimulation.h"
#include "flamegpu/simulation/detail/CUDAAgent.h"
#include "flamegpu/runtime/agent/DeviceAgentVector.h"
#include "flamegpu/runtime/agent/DeviceAgentVector_impl.h"
#include "flamegpu/simulation/AgentLoggingConfig_Reductions.cuh"
#include "flamegpu/simulation/AgentLoggingConfig_SumReturn.h"
#include "flamegpu/detail/type_decode.h"

namespace flamegpu {

#define FLAMEGPU_CUSTOM_REDUCTION(funcName, a, b)\
struct funcName ## _impl {\
 public:\
    template <typename OutT>\
    struct binary_function {\
        __host__ __device__ __forceinline__ OutT operator()(const OutT &a, const OutT &b) const;\
    };\
};\
funcName ## _impl funcName;\
template <typename OutT>\
__host__ __device__ __forceinline__ OutT funcName ## _impl::binary_function<OutT>::operator()(const OutT & a, const OutT & b) const

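/**
 * Convenience macro for defining a custom unary transform operator for use with
 * HostAgentAPI::transformReduce(). As with FLAMEGPU_CUSTOM_REDUCTION, it expands to a
 * functor type plus a global instance named funcName.
 *
 * Illustrative sketch (the names 'getSquare' and 'customSum' are hypothetical;
 * customSum would be defined with FLAMEGPU_CUSTOM_REDUCTION):
 *
 *   FLAMEGPU_CUSTOM_TRANSFORM(getSquare, a) {
 *       return a * a;
 *   }
 *   // const double sumOfSquares =
 *   //     FLAMEGPU->agent("sheep").transformReduce<int, double>("health", getSquare, customSum, 0.0);
 */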
#define FLAMEGPU_CUSTOM_TRANSFORM(funcName, a)\
struct funcName ## _impl {\
 public:\
    template<typename InT, typename OutT>\
    struct unary_function {\
        __host__ __device__ OutT operator()(const InT &a) const;\
    };\
};\
funcName ## _impl funcName;\
template<typename InT, typename OutT>\
__device__ __forceinline__ OutT funcName ## _impl::unary_function<InT, OutT>::operator()(const InT &a) const

class HostAgentAPI {
    friend class CUDASimulation;

 public:
    HostAgentAPI(HostAPI &_api, detail::AgentInterface &_agent, const std::string &_stateName, const VarOffsetStruct &_agentOffsets, HostAPI::AgentDataBuffer&_newAgentData)
        : api(_api)
        , agent(_agent)
        , stateName(_stateName)
        , agentOffsets(_agentOffsets)
        , newAgentData(_newAgentData) { }
    HostAgentAPI(const HostAgentAPI& other)
        : api(other.api)
        , agent(other.agent)
        , stateName(other.stateName)
        , agentOffsets(other.agentOffsets)
        , newAgentData(other.newAgentData)
    { }
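    /**
     * Creates a new agent in this agent type and state, returning a HostNewAgentAPI
     * through which the new agent's variables can be set. The data is staged in a
     * host-side buffer and applied to the device population once the host function has completed.
     */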
    HostNewAgentAPI newAgent();
    /**
     * Returns the number of agents in this state
     */
    unsigned int count();
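    /**
     * Returns the sum of the named agent variable across all agents in this state.
     * InT must match the variable's type; the two-parameter overload accumulates into
     * OutT (which must not be smaller than InT), e.g. to avoid overflow of narrow types.
     */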
    template<typename InT>
    InT sum(const std::string &variable) const;
    template<typename InT, typename OutT>
    OutT sum(const std::string &variable) const;
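    /**
     * Returns the mean (first) and standard deviation (second) of the named agent
     * variable. If the state contains no agents, (0, 0) is returned.
     */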
    template<typename InT>
    std::pair<double, double> meanStandardDeviation(const std::string& variable) const;
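    /**
     * Returns the minimum value of the named agent variable; InT must match the variable's type.
     */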
    template<typename InT>
    InT min(const std::string &variable) const;
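    /**
     * Returns the maximum value of the named agent variable; InT must match the variable's type.
     */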
    template<typename InT>
    InT max(const std::string &variable) const;
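    /**
     * Returns the number of agents whose named variable equals the provided value.
     */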
    template<typename InT>
    unsigned int count(const std::string &variable, InT value) const;
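    /**
     * Computes a histogram of the named agent variable using histogramBins evenly sized
     * bins spanning [lowerBound, upperBound). The first overload returns unsigned int
     * bin counts; the second allows the counter type OutT to be specified.
     */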
    template<typename InT>
    std::vector<unsigned int> histogramEven(const std::string &variable, unsigned int histogramBins, InT lowerBound, InT upperBound) const;
    template<typename InT, typename OutT>
    std::vector<OutT> histogramEven(const std::string &variable, unsigned int histogramBins, InT lowerBound, InT upperBound) const;
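    /**
     * Reduces the named agent variable with a custom binary operator defined via
     * FLAMEGPU_CUSTOM_REDUCTION(), starting from the value init.
     */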
    template<typename InT, typename reductionOperatorT>
    InT reduce(const std::string &variable, reductionOperatorT reductionOperator, InT init) const;
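    /**
     * Applies a transform operator (FLAMEGPU_CUSTOM_TRANSFORM()) to each value of the
     * named variable, then reduces the transformed values with a custom reduction
     * operator (FLAMEGPU_CUSTOM_REDUCTION()), starting from the value init.
     */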
    template<typename InT, typename OutT, typename transformOperatorT, typename reductionOperatorT>
    OutT transformReduce(const std::string &variable, transformOperatorT transformOperator, reductionOperatorT reductionOperator, OutT init) const;
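    /**
     * Sort direction used by HostAgentAPI::sort().
     */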
    enum Order {Asc, Desc};
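    /**
     * Sorts the agents in this state by the named variable, in ascending or descending
     * order. beginBit/endBit optionally restrict the radix sort to a sub-range of the
     * key's bits, which can reduce sorting cost when only part of the key is significant.
     */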
    template<typename VarT>
    void sort(const std::string &variable, Order order, int beginBit = 0, int endBit = sizeof(VarT)*8);
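    /**
     * Sorts the agents in this state by two variables; variable1 is the primary key and
     * variable2 breaks ties.
     */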
    template<typename Var1T, typename Var2T>
    void sort(const std::string &variable1, Order order1, const std::string &variable2, Order order2);
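    /**
     * Returns a DeviceAgentVector, a host-accessible, std::vector-like view of this
     * agent state's population; changes made through it are synchronised back to the device.
     */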
    DeviceAgentVector getPopulationData();

 private:
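    /**
     * The *_async members below implement the public reductions/sorts, enqueuing work on
     * the supplied CUDA stream; unless the operation is inherently blocking, results are
     * only valid after the stream has been synchronised.
     */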
    static void fillTIDArray_async(unsigned int *d_buffer, unsigned int length, cudaStream_t stream);
    static void sortBuffer_async(void *dest, void*src, unsigned int *position, size_t typeLen, unsigned int length, cudaStream_t stream);
    template<typename InT, typename OutT>
    void sum_async(const std::string& variable, OutT& result, cudaStream_t stream, unsigned int streamId) const;
    template<typename InT>
    void meanStandardDeviation_async(const std::string& variable, std::pair<double, double>& result, cudaStream_t stream, unsigned int streamId) const;
    template<typename InT>
    void min_async(const std::string& variable, InT& result, cudaStream_t stream, unsigned int streamId) const;
    template<typename InT>
    void max_async(const std::string& variable, InT& result, cudaStream_t stream, unsigned int streamId) const;
    template<typename InT>
    unsigned int count_async(const std::string& variable, InT value, cudaStream_t stream) const;
    template<typename InT, typename OutT>
    void histogramEven_async(const std::string& variable, unsigned int histogramBins, InT lowerBound, InT upperBound, std::vector<OutT> &result, cudaStream_t stream, unsigned int streamId) const;
    template<typename InT, typename reductionOperatorT>
    void reduce_async(const std::string& variable, reductionOperatorT reductionOperator, InT init, InT &result, cudaStream_t stream, unsigned int streamId) const;
    template<typename InT, typename OutT, typename transformOperatorT, typename reductionOperatorT>
    OutT transformReduce_async(const std::string& variable, transformOperatorT transformOperator, reductionOperatorT reductionOperator, OutT init, cudaStream_t stream) const;
    template<typename VarT>
    void sort_async(const std::string& variable, Order order, int beginBit, int endBit, cudaStream_t stream, unsigned int streamId);
    template<typename Var1T, typename Var2T>
    void sort_async(const std::string& variable1, Order order1, const std::string& variable2, Order order2, cudaStream_t stream, unsigned int streamId);
    HostAPI &api;
    detail::AgentInterface &agent;
    const std::string stateName;
    const VarOffsetStruct& agentOffsets;
    HostAPI::AgentDataBuffer& newAgentData;
};

//
// Implementation
//

template<typename InT>
InT HostAgentAPI::sum(const std::string &variable) const {
    InT rtn;
    sum_async<InT, InT>(variable, rtn, this->api.stream, this->api.streamId);
    gpuErrchk(cudaStreamSynchronize(this->api.stream));
    return rtn;
}
template<typename InT, typename OutT>
OutT HostAgentAPI::sum(const std::string& variable) const {
    OutT rtn;
    sum_async<InT, OutT>(variable, rtn, this->api.stream, this->api.streamId);
    gpuErrchk(cudaStreamSynchronize(this->api.stream));
    return rtn;
}
template<typename InT, typename OutT>
void HostAgentAPI::sum_async(const std::string &variable, OutT &result, const cudaStream_t stream, const unsigned int streamId) const {
    static_assert(sizeof(InT) <= sizeof(OutT), "Template arg OutT should not be of a smaller size than InT");
    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
    if (population) {
        if (this->api.stream != stream) {
            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
        }
        // If the user has a DeviceAgentVector out, sync changes
        population->syncChanges();
    }
    const CAgentDescription agentDesc(agent.getAgentDescription());
    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
    if (agentDesc.getVariableLength(variable) != 1) {
        THROW exception::UnsupportedVarType("HostAgentAPI::sum() does not support agent array variables.");
    }
    if (std::type_index(typeid(InT)) != typ) {
        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::sum(). "
            "This call expects '%s', but '%s' was requested.",
            agentDesc.getVariableType(variable).name(), typeid(InT).name());
    }
    void *var_ptr = agent.getStateVariablePtr(stateName, variable);
    const auto agentCount = agent.getStateSize(stateName);
    // Check if we need to resize cub storage
    auto &cub_temp = api.scatter.CubTemp(streamId);
    size_t tempByte = 0;
    gpuErrchk(cub::DeviceReduce::Sum(nullptr, tempByte, reinterpret_cast<InT*>(var_ptr), reinterpret_cast<OutT*>(api.d_output_space), static_cast<int>(agentCount), stream));
    cub_temp.resize(tempByte);
    // Resize output storage
    api.resizeOutputSpace<OutT>();
    gpuErrchk(cub::DeviceReduce::Sum(cub_temp.getPtr(), cub_temp.getSize(), reinterpret_cast<InT*>(var_ptr), reinterpret_cast<OutT*>(api.d_output_space), static_cast<int>(agentCount), stream));
    gpuErrchkLaunch();
    gpuErrchk(cudaMemcpyAsync(&result, api.d_output_space, sizeof(OutT), cudaMemcpyDeviceToHost, stream));
}
template<typename InT>
std::pair<double, double> HostAgentAPI::meanStandardDeviation(const std::string& variable) const {
    std::pair<double, double> rtn;
    meanStandardDeviation_async<InT>(variable, rtn, this->api.stream, this->api.streamId);
    gpuErrchk(cudaStreamSynchronize(this->api.stream));  // Redundant, meanStandardDeviation_async() is not truly async
    return rtn;
}
template<typename InT>
void HostAgentAPI::meanStandardDeviation_async(const std::string& variable, std::pair<double, double> &result, const cudaStream_t stream, const unsigned int streamId) const {
    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
    if (population) {
        if (this->api.stream != stream) {
            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
        }
        // If the user has a DeviceAgentVector out, sync changes
        population->syncChanges();
    }
    const CAgentDescription agentDesc(agent.getAgentDescription());
    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
    if (agentDesc.getVariableLength(variable) != 1) {
        THROW exception::UnsupportedVarType("HostAgentAPI::meanStandardDeviation() does not support agent array variables.");
    }
    if (std::type_index(typeid(InT)) != typ) {
        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::meanStandardDeviation(). "
            "This call expects '%s', but '%s' was requested.",
            agentDesc.getVariableType(variable).name(), typeid(InT).name());
    }
    const auto agentCount = agent.getStateSize(stateName);
    if (agentCount == 0) {
        result = std::make_pair(0.0, 0.0);
        return;
    }
    // Calculate mean (We could make this more efficient by leaving sum in device mem?)
    typename sum_input_t<InT>::result_t sum_result;
    sum_async<InT, typename sum_input_t<InT>::result_t>(variable, sum_result, stream, streamId);
    gpuErrchk(cudaStreamSynchronize(stream));
    const double mean = sum_result / static_cast<double>(agentCount);
    // Then for each number: subtract the Mean and square the result
    // Then work out the mean of those squared differences.
    auto lock = std::unique_lock<std::mutex>(detail::STANDARD_DEVIATION_MEAN_mutex);
    gpuErrchk(cudaMemcpyToSymbolAsync(detail::STANDARD_DEVIATION_MEAN, &mean, sizeof(double), 0, cudaMemcpyHostToDevice, stream));
    const double variance = transformReduce_async<InT, double>(variable, detail::standard_deviation_subtract_mean, detail::standard_deviation_add, 0, stream) / static_cast<double>(agentCount);
    lock.unlock();
    // Take the square root of that and we are done!
    result = std::make_pair(mean, sqrt(variance));
}
template<typename InT>
InT HostAgentAPI::min(const std::string& variable) const {
    InT rtn;
    min_async<InT>(variable, rtn, this->api.stream, this->api.streamId);
    gpuErrchk(cudaStreamSynchronize(this->api.stream));
    return rtn;
}
template<typename InT>
void HostAgentAPI::min_async(const std::string &variable, InT& result, const cudaStream_t stream, const unsigned int streamId) const {
    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
    if (population) {
        if (this->api.stream != stream) {
            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
        }
        // If the user has a DeviceAgentVector out, sync changes
        population->syncChanges();
    }
    const CAgentDescription agentDesc(agent.getAgentDescription());
    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
    if (agentDesc.getVariableLength(variable) != 1) {
        THROW exception::UnsupportedVarType("HostAgentAPI::min() does not support agent array variables.");
    }
    if (std::type_index(typeid(InT)) != typ) {
        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::min(). "
            "This call expects '%s', but '%s' was requested.",
            agentDesc.getVariableType(variable).name(), typeid(InT).name());
    }
    void *var_ptr = agent.getStateVariablePtr(stateName, variable);
    const auto agentCount = agent.getStateSize(stateName);
    // Check if we need to resize cub storage
    auto& cub_temp = api.scatter.CubTemp(streamId);
    // Resize cub storage
    size_t tempByte = 0;
    gpuErrchk(cub::DeviceReduce::Min(nullptr, tempByte, reinterpret_cast<InT*>(var_ptr), reinterpret_cast<InT*>(api.d_output_space), static_cast<int>(agentCount), stream));
    gpuErrchkLaunch();
    cub_temp.resize(tempByte);
    // Resize output storage
    api.resizeOutputSpace<InT>();
    gpuErrchk(cub::DeviceReduce::Min(cub_temp.getPtr(), cub_temp.getSize(), reinterpret_cast<InT*>(var_ptr), reinterpret_cast<InT*>(api.d_output_space), static_cast<int>(agentCount), stream));
    gpuErrchkLaunch();
    gpuErrchk(cudaMemcpyAsync(&result, api.d_output_space, sizeof(InT), cudaMemcpyDeviceToHost, stream));
}
template<typename InT>
InT HostAgentAPI::max(const std::string& variable) const {
    InT rtn;
    max_async<InT>(variable, rtn, this->api.stream, this->api.streamId);
    gpuErrchk(cudaStreamSynchronize(this->api.stream));
    return rtn;
}
template<typename InT>
void HostAgentAPI::max_async(const std::string &variable, InT &result, const cudaStream_t stream, const unsigned int streamId) const {
    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
    if (population) {
        if (this->api.stream != stream) {
            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
        }
        // If the user has a DeviceAgentVector out, sync changes
        population->syncChanges();
    }
    const CAgentDescription agentDesc(agent.getAgentDescription());
    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
    if (agentDesc.getVariableLength(variable) != 1) {
        THROW exception::UnsupportedVarType("HostAgentAPI::max() does not support agent array variables.");
    }
    if (std::type_index(typeid(InT)) != typ) {
        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::max(). "
            "This call expects '%s', but '%s' was requested.",
            agentDesc.getVariableType(variable).name(), typeid(InT).name());
    }
    void *var_ptr = agent.getStateVariablePtr(stateName, variable);
    const auto agentCount = agent.getStateSize(stateName);
    // Check if we need to resize cub storage
    auto& cub_temp = api.scatter.CubTemp(streamId);
    // Resize cub storage
    size_t tempByte = 0;
    gpuErrchk(cub::DeviceReduce::Max(nullptr, tempByte, reinterpret_cast<InT*>(var_ptr), reinterpret_cast<InT*>(api.d_output_space), static_cast<int>(agentCount), stream));
    cub_temp.resize(tempByte);
    // Resize output storage
    api.resizeOutputSpace<InT>();
    gpuErrchk(cub::DeviceReduce::Max(cub_temp.getPtr(), cub_temp.getSize(), reinterpret_cast<InT*>(var_ptr), reinterpret_cast<InT*>(api.d_output_space), static_cast<int>(agentCount), stream));
    gpuErrchkLaunch();
    gpuErrchk(cudaMemcpyAsync(&result, api.d_output_space, sizeof(InT), cudaMemcpyDeviceToHost, stream));
}
template<typename InT>
unsigned int HostAgentAPI::count(const std::string &variable, InT value) const {
    return count_async<InT>(variable, value, this->api.stream);
}
template<typename InT>
unsigned int HostAgentAPI::count_async(const std::string& variable, InT value, const cudaStream_t stream) const {
    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
    if (population) {
        if (this->api.stream != stream) {
            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
        }
        // If the user has a DeviceAgentVector out, sync changes
        population->syncChanges();
    }
    const CAgentDescription agentDesc(agent.getAgentDescription());
    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
    if (agentDesc.getVariableLength(variable) != 1) {
        THROW exception::UnsupportedVarType("HostAgentAPI::count() does not support agent array variables.");
    }
    if (std::type_index(typeid(InT)) != typ) {
        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::count(). "
            "This call expects '%s', but '%s' was requested.",
            agentDesc.getVariableType(variable).name(), typeid(InT).name());
    }
    void *var_ptr = agent.getStateVariablePtr(stateName, variable);
    const auto agentCount = agent.getStateSize(stateName);
    // Cast return from ptrdiff_t (int64_t) to (uint32_t)
    unsigned int rtn = static_cast<unsigned int>(thrust::count(thrust::cuda::par.on(stream), thrust::device_ptr<InT>(reinterpret_cast<InT*>(var_ptr)), thrust::device_ptr<InT>(reinterpret_cast<InT*>(var_ptr) + agentCount), value));
    gpuErrchkLaunch();
    return rtn;
}
template<typename InT>
std::vector<unsigned int> HostAgentAPI::histogramEven(const std::string &variable, unsigned int histogramBins, InT lowerBound, InT upperBound) const {
    std::vector<unsigned int> rtn;
    histogramEven_async<InT, unsigned int>(variable, histogramBins, lowerBound, upperBound, rtn, this->api.stream, this->api.streamId);
    gpuErrchk(cudaStreamSynchronize(this->api.stream));
    return rtn;
}
template<typename InT, typename OutT>
std::vector<OutT> HostAgentAPI::histogramEven(const std::string &variable, unsigned int histogramBins, InT lowerBound, InT upperBound) const {
    std::vector<OutT> rtn;
    histogramEven_async<InT, OutT>(variable, histogramBins, lowerBound, upperBound, rtn, this->api.stream, this->api.streamId);
    gpuErrchk(cudaStreamSynchronize(this->api.stream));
    return rtn;
}
template<typename InT, typename OutT>
void HostAgentAPI::histogramEven_async(const std::string &variable, unsigned int histogramBins, InT lowerBound, InT upperBound, std::vector<OutT>& result, const cudaStream_t stream, const unsigned int streamId) const {
    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
    if (population) {
        if (this->api.stream != stream) {
            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
        }
        // If the user has a DeviceAgentVector out, sync changes
        population->syncChanges();
    }
    if (lowerBound >= upperBound) {
        THROW exception::InvalidArgument("lowerBound (%s) must be less than upperBound (%s) in HostAgentAPI::histogramEven().",
            std::to_string(lowerBound).c_str(), std::to_string(upperBound).c_str());
    }
    const CAgentDescription agentDesc(agent.getAgentDescription());
    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
    if (agentDesc.getVariableLength(variable) != 1) {
        THROW exception::UnsupportedVarType("HostAgentAPI::histogramEven() does not support agent array variables.");
    }
    if (std::type_index(typeid(InT)) != typ) {
        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::histogramEven(). "
            "This call expects '%s', but '%s' was requested.",
            agentDesc.getVariableType(variable).name(), typeid(InT).name());
    }
    void *var_ptr = agent.getStateVariablePtr(stateName, variable);
    const auto agentCount = agent.getStateSize(stateName);
    // Check if we need to resize cub storage
    auto& cub_temp = api.scatter.CubTemp(streamId);
    // Resize cub storage
    size_t tempByte = 0;
    gpuErrchk(cub::DeviceHistogram::HistogramEven(nullptr, tempByte,
        reinterpret_cast<InT*>(var_ptr), reinterpret_cast<OutT*>(api.d_output_space), histogramBins + 1, lowerBound, upperBound, static_cast<int>(agentCount), stream));
    gpuErrchkLaunch();
    cub_temp.resize(tempByte);
    // Resize output storage
    api.resizeOutputSpace<OutT>(histogramBins);
    gpuErrchk(cub::DeviceHistogram::HistogramEven(cub_temp.getPtr(), cub_temp.getSize(),
        reinterpret_cast<InT*>(var_ptr), reinterpret_cast<OutT*>(api.d_output_space), histogramBins + 1, lowerBound, upperBound, static_cast<int>(agentCount), stream));
    gpuErrchkLaunch();
    result.resize(histogramBins);
    gpuErrchk(cudaMemcpyAsync(result.data(), api.d_output_space, histogramBins * sizeof(OutT), cudaMemcpyDeviceToHost, stream));
}
template<typename InT, typename reductionOperatorT>
InT HostAgentAPI::reduce(const std::string &variable, reductionOperatorT reductionOperator, InT init) const {
    InT rtn;
    reduce_async<InT, reductionOperatorT>(variable, reductionOperator, init, rtn, this->api.stream, this->api.streamId);
    gpuErrchk(cudaStreamSynchronize(this->api.stream));
    return rtn;
}
template<typename InT, typename reductionOperatorT>
void HostAgentAPI::reduce_async(const std::string & variable, reductionOperatorT /*reductionOperator*/, InT init, InT &result, const cudaStream_t stream, const unsigned int streamId) const {
    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
    if (population) {
        if (this->api.stream != stream) {
            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
        }
        // If the user has a DeviceAgentVector out, sync changes
        population->syncChanges();
    }
    const CAgentDescription agentDesc(agent.getAgentDescription());
    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
    if (agentDesc.getVariableLength(variable) != detail::type_decode<InT>::len_t) {
        THROW exception::UnsupportedVarType("HostAgentAPI::reduce() does not support agent array variables.");
    }
    if (std::type_index(typeid(typename detail::type_decode<InT>::type_t)) != typ) {
        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::reduce(). "
            "This call expects '%s', but '%s' was requested.",
            typ.name(), typeid(typename detail::type_decode<InT>::type_t).name());
    }
    void *var_ptr = agent.getStateVariablePtr(stateName, variable);
    const auto agentCount = agent.getStateSize(stateName);
    // Check if we need to resize cub storage
    auto& cub_temp = api.scatter.CubTemp(streamId);
    // Resize cub storage
    size_t tempByte = 0;
    gpuErrchk(cub::DeviceReduce::Reduce(nullptr, tempByte, reinterpret_cast<InT*>(var_ptr), reinterpret_cast<InT*>(api.d_output_space),
        static_cast<int>(agentCount), typename reductionOperatorT::template binary_function<InT>(), init, stream));
    gpuErrchkLaunch();
    cub_temp.resize(tempByte);
    // Resize output storage
    api.resizeOutputSpace<InT>();
    gpuErrchk(cub::DeviceReduce::Reduce(cub_temp.getPtr(), cub_temp.getSize(), reinterpret_cast<InT*>(var_ptr), reinterpret_cast<InT*>(api.d_output_space),
        static_cast<int>(agentCount), typename reductionOperatorT::template binary_function<InT>(), init, stream));
    gpuErrchkLaunch();
    gpuErrchk(cudaMemcpyAsync(&result, api.d_output_space, sizeof(InT), cudaMemcpyDeviceToHost, stream));
}
template<typename InT, typename OutT, typename transformOperatorT, typename reductionOperatorT>
OutT HostAgentAPI::transformReduce(const std::string &variable, transformOperatorT transformOperator, reductionOperatorT reductionOperator, OutT init) const {
    return transformReduce_async<InT, OutT, transformOperatorT, reductionOperatorT>(variable, transformOperator, reductionOperator, init, this->api.stream);
}
template<typename InT, typename OutT, typename transformOperatorT, typename reductionOperatorT>
OutT HostAgentAPI::transformReduce_async(const std::string &variable, transformOperatorT /*transformOperator*/, reductionOperatorT /*reductionOperator*/, OutT init, cudaStream_t stream) const {
    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
    if (population) {
        if (this->api.stream != stream) {
            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
        }
        // If the user has a DeviceAgentVector out, sync changes
        population->syncChanges();
    }
    const CAgentDescription agentDesc(agent.getAgentDescription());
    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
    if (agentDesc.getVariableLength(variable) != detail::type_decode<InT>::len_t) {
        THROW exception::UnsupportedVarType("HostAgentAPI::transformReduce() does not support agent array variables.");
    }
    if (std::type_index(typeid(typename detail::type_decode<InT>::type_t)) != typ) {
        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::transformReduce(). "
            "This call expects '%s', but '%s' was requested.",
            typ.name(), typeid(typename detail::type_decode<InT>::type_t).name());
    }
    void *var_ptr = agent.getStateVariablePtr(stateName, variable);
    const auto agentCount = agent.getStateSize(stateName);
    OutT rtn = thrust::transform_reduce(thrust::cuda::par.on(stream), thrust::device_ptr<InT>(reinterpret_cast<InT*>(var_ptr)), thrust::device_ptr<InT>(reinterpret_cast<InT*>(var_ptr) + agentCount),
        typename transformOperatorT::template unary_function<InT, OutT>(), init, typename reductionOperatorT::template binary_function<OutT>());
    gpuErrchkLaunch();
    return rtn;
}


template<typename VarT>
void HostAgentAPI::sort(const std::string &variable, Order order, int beginBit, int endBit) {
    sort_async<VarT>(variable, order, beginBit, endBit, this->api.stream, this->api.streamId);
    gpuErrchk(cudaStreamSynchronize(this->api.stream));
}
template<typename VarT>
void HostAgentAPI::sort_async(const std::string & variable, Order order, int beginBit, int endBit, const cudaStream_t stream, const unsigned int streamId) {
    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
    if (population) {
        if (this->api.stream != stream || this->api.streamId != streamId) {
            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
        }
        // If the user has a DeviceAgentVector out, sync changes
        population->syncChanges();
    }
    auto &scatter = api.agentModel.singletons->scatter;
    auto &scan = scatter.Scan();
    // Check variable is valid
    CAgentDescription agentDesc(agent.getAgentDescription());
    std::type_index typ = agentDesc.getVariableType(variable);  // This will throw name exception
    if (agentDesc.getVariableLength(variable) != 1) {
        THROW exception::UnsupportedVarType("HostAgentAPI::sort() does not support agent array variables.");
    }
    if (std::type_index(typeid(VarT)) != typ) {
        THROW exception::InvalidVarType("Wrong variable type passed to HostAgentAPI::sort(). "
            "This call expects '%s', but '%s' was requested.",
            agentDesc.getVariableType(variable).name(), typeid(VarT).name());
    }
    // The agent_death/message_output scan-compaction buffers are reused as scratch space for the sort keys/values,
    // so resize them (they are sized in units of unsigned int, hence the padded fake agent count below)
    const unsigned int agentCount = agent.getStateSize(stateName);
    void *var_ptr = agent.getStateVariablePtr(stateName, variable);
    const size_t total_variable_buffer_size = sizeof(VarT) * agentCount;
    const unsigned int fake_num_agent = static_cast<unsigned int>(total_variable_buffer_size/sizeof(unsigned int)) +1;
    scan.resize(fake_num_agent, detail::CUDAScanCompaction::AGENT_DEATH, streamId);
    scan.resize(agentCount, detail::CUDAScanCompaction::MESSAGE_OUTPUT, streamId);
    VarT *keys_in = reinterpret_cast<VarT *>(scan.Config(detail::CUDAScanCompaction::Type::AGENT_DEATH, streamId).d_ptrs.scan_flag);
    VarT *keys_out = reinterpret_cast<VarT *>(scan.Config(detail::CUDAScanCompaction::Type::AGENT_DEATH, streamId).d_ptrs.position);
    unsigned int *vals_in = scan.Config(detail::CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamId).d_ptrs.scan_flag;
    unsigned int *vals_out = scan.Config(detail::CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamId).d_ptrs.position;
    // Create array of TID (use scanflag_death.position)
    fillTIDArray_async(vals_in, agentCount, stream);
    // Create array of agent values (use scanflag_death.scan_flag)
    gpuErrchk(cudaMemcpyAsync(keys_in, var_ptr, total_variable_buffer_size, cudaMemcpyDeviceToDevice, stream));
    // Check if we need to resize cub storage
    auto& cub_temp = api.scatter.CubTemp(streamId);
    // Resize cub storage
    size_t tempByte = 0;
    if (order == Asc) {
        gpuErrchk(cub::DeviceRadixSort::SortPairs(nullptr, tempByte, keys_in, keys_out, vals_in, vals_out, agentCount, beginBit, endBit, stream));
    } else {
        gpuErrchk(cub::DeviceRadixSort::SortPairsDescending(nullptr, tempByte, keys_in, keys_out, vals_in, vals_out, agentCount, beginBit, endBit, stream));
    }
    cub_temp.resize(tempByte);
    // pair sort
    if (order == Asc) {
        gpuErrchk(cub::DeviceRadixSort::SortPairs(cub_temp.getPtr(), cub_temp.getSize(), keys_in, keys_out, vals_in, vals_out, agentCount, beginBit, endBit, stream));
    } else {
        gpuErrchk(cub::DeviceRadixSort::SortPairsDescending(cub_temp.getPtr(), cub_temp.getSize(), keys_in, keys_out, vals_in, vals_out, agentCount, beginBit, endBit, stream));
    }
    // Scatter all agent variables
    api.agentModel.agent_map.at(agentDesc.getName())->scatterSort_async(stateName, scatter, streamId, stream);
    if (population) {
        // If the user has a DeviceAgentVector out, purge cache so it redownloads new data on next use
        population->purgeCache();
    }
}


template<typename Var1T, typename Var2T>
void HostAgentAPI::sort(const std::string &variable1, Order order1, const std::string &variable2, Order order2) {
    sort_async<Var1T, Var2T>(variable1, order1, variable2, order2, this->api.stream, this->api.streamId);
    gpuErrchk(cudaStreamSynchronize(this->api.stream));
}
template<typename Var1T, typename Var2T>
void HostAgentAPI::sort_async(const std::string & variable1, Order order1, const std::string & variable2, Order order2, const cudaStream_t stream, const unsigned int streamId) {
    std::shared_ptr<DeviceAgentVector_impl> population = agent.getPopulationVec(stateName);
    if (population) {
        if (this->api.stream != stream || this->api.streamId != streamId) {
            THROW exception::InvalidOperation("Attempting to sync DeviceAgentVector with wrong stream!\nThis should not be possible.\n");
        }
        // If the user has a DeviceAgentVector out, sync changes
        population->syncChanges();
    }
    auto &scatter = api.agentModel.singletons->scatter;
    auto &scan = scatter.Scan();
    const CAgentDescription agentDesc(agent.getAgentDescription());
    {  // Check variable 1 is valid
        std::type_index typ = agentDesc.getVariableType(variable1);  // This will throw name exception
        if (agentDesc.getVariableLength(variable1) != 1) {
            THROW exception::UnsupportedVarType("HostAgentAPI::sort() does not support agent array variables.");
        }
        if (std::type_index(typeid(Var1T)) != typ) {
            THROW exception::InvalidVarType("Wrong type for variable '%s' passed to HostAgentAPI::sort(). "
                "This call expects '%s', but '%s' was requested.",
                variable1.c_str(), agentDesc.getVariableType(variable1).name(), typeid(Var1T).name());
        }
    }
    {  // Check variable 2 is valid
        std::type_index typ = agentDesc.getVariableType(variable2);  // This will throw name exception
        if (agentDesc.getVariableLength(variable2) != 1) {
            THROW exception::UnsupportedVarType("HostAgentAPI::sort() does not support agent array variables.");
        }
        if (std::type_index(typeid(Var2T)) != typ) {
            THROW exception::InvalidVarType("Wrong type for variable '%s' passed to HostAgentAPI::sort(). "
                "This call expects '%s', but '%s' was requested.",
                variable2.c_str(), agentDesc.getVariableType(variable2).name(), typeid(Var2T).name());
        }
    }
    const unsigned int agentCount = agent.getStateSize(stateName);
    // Fill array with var1 keys
    {
        // Resize
        const size_t total_variable_buffer_size = sizeof(Var1T) * agentCount;
        const unsigned int fake_num_agent = static_cast<unsigned int>(total_variable_buffer_size/sizeof(unsigned int)) +1;
        scan.resize(fake_num_agent, detail::CUDAScanCompaction::AGENT_DEATH, streamId);
        // Fill
        void *keys1b = scan.Config(detail::CUDAScanCompaction::Type::AGENT_DEATH, streamId).d_ptrs.position;
        void *var_ptr = agent.getStateVariablePtr(stateName, variable1);
        gpuErrchk(cudaMemcpyAsync(keys1b, var_ptr, total_variable_buffer_size, cudaMemcpyDeviceToDevice, stream));
    }
    // Fill array with var2 keys
    {
        // Resize
        const size_t total_variable_buffer_size = sizeof(Var2T) * agentCount;
        const unsigned int fake_num_agent = static_cast<unsigned int>(total_variable_buffer_size/sizeof(unsigned int)) +1;
        scan.resize(std::max(agentCount, fake_num_agent), detail::CUDAScanCompaction::MESSAGE_OUTPUT, streamId);
        // Fill
        void *keys2 = scan.Config(detail::CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamId).d_ptrs.scan_flag;
        void *var_ptr = agent.getStateVariablePtr(stateName, variable2);
        gpuErrchk(cudaMemcpyAsync(keys2, var_ptr, total_variable_buffer_size, cudaMemcpyDeviceToDevice, stream));
    }
    // Define our buffers (here, after resize)
    Var1T *keys1 = reinterpret_cast<Var1T *>(scan.Config(detail::CUDAScanCompaction::Type::AGENT_DEATH, streamId).d_ptrs.scan_flag);
    Var1T *keys1b = reinterpret_cast<Var1T *>(scan.Config(detail::CUDAScanCompaction::Type::AGENT_DEATH, streamId).d_ptrs.position);
    Var2T *keys2 = reinterpret_cast<Var2T *>(scan.Config(detail::CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamId).d_ptrs.scan_flag);
    unsigned int *vals = scan.Config(detail::CUDAScanCompaction::Type::MESSAGE_OUTPUT, streamId).d_ptrs.position;
    // Init value array
    fillTIDArray_async(vals, agentCount, stream);
    // Process variable 2 first: the subsequent stable sort on variable 1 preserves this ordering within ties,
    // so variable 1 acts as the primary key and variable 2 breaks ties
    {
        // pair sort values
        if (order2 == Asc) {
            thrust::stable_sort_by_key(thrust::cuda::par.on(stream), thrust::device_ptr<Var2T>(keys2), thrust::device_ptr<Var2T>(keys2 + agentCount),
            thrust::device_ptr<unsigned int>(vals), thrust::less<Var2T>());
        } else {
            thrust::stable_sort_by_key(thrust::cuda::par.on(stream), thrust::device_ptr<Var2T>(keys2), thrust::device_ptr<Var2T>(keys2 + agentCount),
            thrust::device_ptr<unsigned int>(vals), thrust::greater<Var2T>());
        }
        gpuErrchkLaunch();
        // sort keys1 based on this order
        sortBuffer_async(keys1, keys1b, vals, sizeof(Var1T), agentCount, stream);
    }
    // Process variable 1 second
    {
        // pair sort
        if (order1 == Asc) {
            thrust::stable_sort_by_key(thrust::cuda::par.on(stream), thrust::device_ptr<Var1T>(keys1), thrust::device_ptr<Var1T>(keys1 + agentCount),
            thrust::device_ptr<unsigned int>(vals), thrust::less<Var1T>());
        } else {
            thrust::stable_sort_by_key(thrust::cuda::par.on(stream), thrust::device_ptr<Var1T>(keys1), thrust::device_ptr<Var1T>(keys1 + agentCount),
            thrust::device_ptr<unsigned int>(vals), thrust::greater<Var1T>());
        }
        gpuErrchkLaunch();
    }
    // Scatter all agent variables
    api.agentModel.agent_map.at(agentDesc.getName())->scatterSort_async(stateName, scatter, streamId, stream);

    if (population) {
        // If the user has a DeviceAgentVector out, purge cache so it redownloads new data on next use
        population->purgeCache();
    }
}

}  // namespace flamegpu

#endif  // INCLUDE_FLAMEGPU_RUNTIME_AGENT_HOSTAGENTAPI_CUH_