cuCollections Performance Testing

0. Test Objectives

  • cuCollections performance analysis: measure performance at load factors of 0.8/0.9 as well as insertion into an empty table, throughput, bandwidth, etc.; keys are 64-bit ints drawn from a uniform distribution
    • [x] Read the benchmark code, modify its parameters, and test performance at different load factors (dynamic, static)
    • [x] Work out the examples and basic usage (see the sketch after this list); the tests are a useful reference
    • [x] Modify the benchmarks to measure the required performance metrics
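
  • For reference, a minimal usage sketch of cuco::static_map, using only the constructor and the bulk insert/find calls that appear in the benchmarks below (sizes here are arbitrary):

    #include <cuco/static_map.cuh>

    #include <thrust/device_vector.h>
    #include <thrust/sequence.h>

    #include <vector>

    int main()
    {
      using Key   = int64_t;
      using Value = int64_t;

      // 200 slots; inserting 100 pairs gives a load factor of 0.5
      cuco::static_map<Key, Value> map{
        200, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};

      std::vector<cuco::pair_type<Key, Value>> h_pairs(100);
      for (int64_t i = 0; i < 100; ++i) {
        h_pairs[i].first  = i;
        h_pairs[i].second = i * 10;
      }
      thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);

      map.insert(d_pairs.begin(), d_pairs.end());  // bulk insert

      thrust::device_vector<Key> d_keys(100);
      thrust::device_vector<Value> d_results(100);
      thrust::sequence(d_keys.begin(), d_keys.end());            // keys 0..99
      map.find(d_keys.begin(), d_keys.end(), d_results.begin()); // bulk lookup

      return 0;
    }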

1. Environment Setup

  • Test environment:

    IP Address: x.x.x.x
    OS: Ubuntu 18.04 LTS
    GPU: 8x A100-40G / 8x Tesla V100-PCIE-32GB
    Interconnectivity: PCIe
    Container: NGC PyTorch 21.09-py3 / NGC TensorFlow 22.10-tf1-py3
  • PyTorch:

  •   # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_21-09.html#rel_21-09
      docker run -it --name="cucollection-pt21.09" --privileged --net=host --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v /data:/data -w / nvcr.io/nvidia/pytorch:21.09-py3 /bin/bash
      
      # install CMake (at least 3.23 required; the preinstalled cmake comes from conda)
      wget https://cmake.org/files/v3.23/cmake-3.23.5-linux-x86_64.tar.gz &&\
      tar xf cmake-3.23.5-linux-x86_64.tar.gz &&\
      mv cmake-3.23.5-linux-x86_64 /opt/cmake-3.23.5 &&\
      ln -sf /opt/cmake-3.23.5/bin/* /usr/bin
      
      vim ~/.bashrc
      # add the following line
      export PATH=$PATH:/opt/cmake-3.23.5/bin
      source ~/.bashrc
      
      # pull repository
      git clone https://github.com/NVIDIA/cuCollections.git
      
      # build cucollections
      cd cuCollections
      mkdir build
      cd build
      /opt/cmake-3.23.5/bin/cmake ..
      make
    
  • TensorFlow:

  •   # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tensorflow/tags 
      docker run -it --name="cucollection-tf1.15" --privileged --runtime=nvidia --net=host --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v /data:/data -w / nvcr.io/nvidia/tensorflow:22.10-tf1-py3 /bin/bash
      
      # alternatively, use the 22.09 tag:
      docker run -it --name="cucollection-tf1.15" --privileged --runtime=nvidia --net=host --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v /data:/data -w / nvcr.io/nvidia/tensorflow:22.09-tf1-py3 /bin/bash
      
      # upgrade CMake (at least 3.23 required)
      apt-get autoremove -y cmake &&\
      apt install -y build-essential libssl-dev &&\
      wget https://cmake.org/files/v3.23/cmake-3.23.5-linux-x86_64.tar.gz &&\
      tar xf cmake-3.23.5-linux-x86_64.tar.gz &&\
      mv cmake-3.23.5-linux-x86_64 /opt/cmake-3.23.5 &&\
      ln -sf /opt/cmake-3.23.5/bin/* /usr/bin
      
      vim ~/.bashrc
      # add the following line
      export PATH=$PATH:/opt/cmake-3.23.5/bin
      source ~/.bashrc
      
      # pull repository
      git clone https://github.com/NVIDIA/cuCollections.git
      
      # build cucollections (if cmake fails, just re-run `cmake ..`; there is no need to delete the build folder)
      cd cuCollections &&\
      mkdir build &&\
      cd build &&\
      cmake .. &&\
      make
    

2. Benchmark

  • Benchmark sources modified for the tests:

  • // ./cuCollections/benchmarks/hash_table/static_map_bench.cu
    
    /*
     * Copyright (c) 2020-2022, NVIDIA CORPORATION.
     *
     * Licensed under the Apache License, Version 2.0 (the "License");
     * you may not use this file except in compliance with the License.
     * You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    
    #include <cuco/static_map.cuh>
    
    #include <thrust/device_vector.h>
    #include <thrust/for_each.h>
    
    #include <benchmark/benchmark.h>
    
    #include <fstream>
    #include <random>
    
    enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN };
    
    template <dist_type Dist, typename Key, typename OutputIt>
    static void generate_keys(OutputIt output_begin, OutputIt output_end)
    {
      auto num_keys = std::distance(output_begin, output_end);
    
      std::random_device rd;
      std::mt19937 gen{rd()};
    
      switch (Dist) {
        case dist_type::UNIQUE:
          for (auto i = 0; i < num_keys; ++i) {
            output_begin[i] = i;
          }
          break;
        case dist_type::UNIFORM:
          for (auto i = 0; i < num_keys; ++i) {
            output_begin[i] = std::abs(static_cast<Key>(gen()));
          }
          break;
        case dist_type::GAUSSIAN:
          std::normal_distribution<> dg{1e9, 1e7};
          for (auto i = 0; i < num_keys; ++i) {
            output_begin[i] = std::abs(static_cast<Key>(dg(gen)));
          }
          break;
      }
    }
    
    /**
     * @brief Generates input sizes and hash table occupancies
     *
     */
    static void generate_size_and_occupancy(benchmark::internal::Benchmark* b)
    {
      // for (auto size = 100'000'000; size <= 100'000'000; size *= 10) {
      //   for (auto occupancy = 10; occupancy <= 90; occupancy += 40) {
      //     b->Args({size, occupancy});
      //   }
      // }
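      // both configurations below target a capacity of 100'000'000 slots:
      // 80'000'000 / 0.80 == 90'000'000 / 0.90 == 100'000'000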
      b->Args({80'000'000, 80});
      b->Args({90'000'000, 90});
    }
    
    template <typename Key, typename Value, dist_type Dist>
    static void BM_static_map_insert(::benchmark::State& state)
    {
      using map_type = cuco::static_map<Key, Value>;
    
      std::size_t num_keys = state.range(0);
      float occupancy      = state.range(1) / float{100};
      std::size_t size     = num_keys / occupancy;
      // std::cout<<"num_keys:"<<num_keys<<" occupancy:"<<occupancy<<" size:"<<size<<std::endl;
    
      std::vector<Key> h_keys(num_keys);
      std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
    
      generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
    
      for (std::size_t i = 0; i < num_keys; ++i) {
        Key key           = h_keys[i];
        Value val         = h_keys[i];
        h_pairs[i].first  = key;
        h_pairs[i].second = val;
      }
    
      thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
      thrust::device_vector<Key> d_keys(h_keys);
    
      for (auto _ : state) {
        map_type map{size, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
    
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    
        cudaEventRecord(start);
        map.insert(d_pairs.begin(), d_pairs.end());
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
    
        float ms;
        cudaEventElapsedTime(&ms, start, stop);
    
        state.SetIterationTime(ms / 1000);
        // std::cout<<"size:"<<map.get_size()<<" capacity:"<<map.get_capacity()<<" load_factor:"<<map.get_load_factor()<<std::endl;
      }
    
      state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
                              int64_t(state.range(0)));
    }
    
    template <typename Key, typename Value, dist_type Dist>
    static void BM_static_map_search_all(::benchmark::State& state)
    {
      using map_type = cuco::static_map<Key, Value>;
    
      std::size_t num_keys = state.range(0);
      float occupancy      = state.range(1) / float{100};
      std::size_t size     = num_keys / occupancy;
    
      map_type map{size, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
    
      std::vector<Key> h_keys(num_keys);
      std::vector<Value> h_values(num_keys);
      std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
      std::vector<Value> h_results(num_keys);
    
      generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
    
      for (std::size_t i = 0; i < num_keys; ++i) {
        Key key           = h_keys[i];
        Value val         = h_keys[i];
        h_pairs[i].first  = key;
        h_pairs[i].second = val;
      }
    
      thrust::device_vector<Key> d_keys(h_keys);
      thrust::device_vector<Value> d_results(num_keys);
      thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
    
      map.insert(d_pairs.begin(), d_pairs.end());
      // std::cout<<"INIT: size:"<<map.get_size()<<" capacity:"<<map.get_capacity()<<" load_factor:"<<map.get_load_factor()<<std::endl;
    
      for (auto _ : state) {
        map.find(d_keys.begin(), d_keys.end(), d_results.begin());
        // TODO: get rid of sync and rewrite the benchmark with `nvbench`
        // once https://github.com/NVIDIA/nvbench/pull/80 is merged
        cudaDeviceSynchronize();
        // std::cout<<"size:"<<map.get_size()<<" capacity:"<<map.get_capacity()<<" load_factor:"<<map.get_load_factor()<<std::endl;
      }
    
      state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
                              int64_t(state.range(0)));
    }
    
    template <typename Key, typename Value, dist_type Dist>
    static void BM_static_map_erase_all(::benchmark::State& state)
    {
      using map_type = cuco::static_map<Key, Value>;
    
      std::size_t num_keys = state.range(0);
      float occupancy      = state.range(1) / float{100};
      std::size_t size     = num_keys / occupancy;
    
      // static map with erase support
      map_type map{size,
                   cuco::sentinel::empty_key<Key>{-1},
                   cuco::sentinel::empty_value<Value>{-1},
                   cuco::sentinel::erased_key<Key>{-2}};
    
      std::vector<Key> h_keys(num_keys);
      std::vector<Value> h_values(num_keys);
      std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
      std::vector<Value> h_results(num_keys);
    
      generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
    
      for (std::size_t i = 0; i < num_keys; ++i) {
        Key key           = h_keys[i];
        Value val         = h_keys[i];
        h_pairs[i].first  = key;
        h_pairs[i].second = val;
      }
    
      thrust::device_vector<Key> d_keys(h_keys);
      thrust::device_vector<bool> d_results(num_keys);
      thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
    
      for (auto _ : state) {
        state.PauseTiming();
        map.insert(d_pairs.begin(), d_pairs.end());
        state.ResumeTiming();
    
        map.erase(d_keys.begin(), d_keys.end());
      }
    
      state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
                              int64_t(state.range(0)));
    }
    
    BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM)
      ->Unit(benchmark::kMillisecond)
      ->Apply(generate_size_and_occupancy)
      ->UseManualTime();
    
    BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIFORM)
      ->Unit(benchmark::kMillisecond)
      ->Apply(generate_size_and_occupancy);
    
    BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM)
      ->Unit(benchmark::kMillisecond)
      ->Apply(generate_size_and_occupancy)
      ->UseManualTime();
    
    BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIFORM)
      ->Unit(benchmark::kMillisecond)
      ->Apply(generate_size_and_occupancy);
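
  • A note on the timing columns: ->UseManualTime() makes Google Benchmark report the time passed to state.SetIterationTime() (here, the CUDA-event time), which is why the insert results below show GPU time lower than CPU time; BM_static_map_search_all registers no manual timer, so after cudaDeviceSynchronize() its two columns coincide. A minimal sketch of the pattern, with time_kernel_ms as a hypothetical stand-in for the CUDA-event timing above:

    #include <benchmark/benchmark.h>

    // hypothetical stand-in for timing GPU work with CUDA events
    static float time_kernel_ms() { return 1.0f; }

    static void BM_manual_time_pattern(::benchmark::State& state)
    {
      for (auto _ : state) {
        float ms = time_kernel_ms();
        // seconds; with ->UseManualTime() this value becomes the reported time
        state.SetIterationTime(ms / 1000);
      }
    }
    BENCHMARK(BM_manual_time_pattern)->UseManualTime();
    BENCHMARK_MAIN();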
    
    
  • // ./cuCollections/benchmarks/hash_table/dynamic_map_bench.cu
    /*
     * Copyright (c) 2020-2022, NVIDIA CORPORATION.
     *
     * Licensed under the Apache License, Version 2.0 (the "License");
     * you may not use this file except in compliance with the License.
     * You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    
    #include <synchronization.hpp>
    
    #include <cuco/dynamic_map.cuh>
    
    #include <thrust/device_vector.h>
    
    #include <benchmark/benchmark.h>
    
    #include <iostream>
    #include <random>
    
    enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN };
    
    template <dist_type Dist, typename Key, typename OutputIt>
    static void generate_keys(OutputIt output_begin, OutputIt output_end)
    {
      auto num_keys = std::distance(output_begin, output_end);
    
      std::random_device rd;
      std::mt19937 gen{rd()};
    
      switch (Dist) {
        case dist_type::UNIQUE:
          for (auto i = 0; i < num_keys; ++i) {
            output_begin[i] = i;
          }
          break;
        case dist_type::UNIFORM:
          for (auto i = 0; i < num_keys; ++i) {
            output_begin[i] = std::abs(static_cast<Key>(gen()));
          }
          break;
        case dist_type::GAUSSIAN:
          std::normal_distribution<> dg{1e9, 1e7};
          for (auto i = 0; i < num_keys; ++i) {
            output_begin[i] = std::abs(static_cast<Key>(dg(gen)));
          }
          break;
      }
    }
    
    static void gen_final_size(benchmark::internal::Benchmark* b)
    {
      // for (auto size = 10'000'000; size <= 150'000'000; size += 20'000'000) {
      //   b->Args({size});
      // }
      b->Args({150'000'000});
    }
    
    template <typename Key, typename Value, dist_type Dist>
    static void BM_dynamic_insert(::benchmark::State& state)
    {
      using map_type = cuco::dynamic_map<Key, Value>;
    
      std::size_t num_keys     = state.range(0);
      // std::cout<<num_keys<<std::endl;
      std::size_t initial_size = 1 << 27;
      // std::cout<<initial_size<<std::endl;
    
      std::vector<Key> h_keys(num_keys);
      std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
    
      generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
    
      for (std::size_t i = 0; i < num_keys; ++i) {
        Key key           = h_keys[i];
        Value val         = h_keys[i];
        h_pairs[i].first  = key;
        h_pairs[i].second = val;
      }
    
      thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
    
      std::size_t batch_size = 1E6;
      for (auto _ : state) {
        map_type map{
          initial_size, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
        {
          cuda_event_timer raii{state};
          for (std::size_t i = 0; i < num_keys; i += batch_size) {
            map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size);
            // std::cout<<"size:"<<map.get_size()<<" capacity:"<<map.get_capacity()<<" load_factor:"<<map.get_load_factor()<<std::endl;
          }
        }
      }
    
      state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
                              int64_t(state.range(0)));
    }
    
    template <typename Key, typename Value, dist_type Dist>
    static void BM_dynamic_search_all(::benchmark::State& state)
    {
      using map_type = cuco::dynamic_map<Key, Value>;
    
      std::size_t num_keys     = state.range(0);
      std::size_t initial_size = 1 << 27;
    
      std::vector<Key> h_keys(num_keys);
      std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
    
      generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());
    
      for (std::size_t i = 0; i < num_keys; ++i) {
        Key key           = h_keys[i];
        Value val         = h_keys[i];
        h_pairs[i].first  = key;
        h_pairs[i].second = val;
      }
    
      thrust::device_vector<Key> d_keys(h_keys);
      thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
      thrust::device_vector<Value> d_results(num_keys);
    
      map_type map{
        initial_size, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{-1}};
      map.insert(d_pairs.begin(), d_pairs.end());
    
      for (auto _ : state) {
        cuda_event_timer raii{state};
        map.find(d_keys.begin(), d_keys.end(), d_results.begin());
      }
    
      state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
                              int64_t(state.range(0)));
    }
    
    BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM)
      ->Unit(benchmark::kMillisecond)
      ->Apply(gen_final_size)
      ->UseManualTime();
    
    BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM)
      ->Unit(benchmark::kMillisecond)
      ->Apply(gen_final_size)
      ->UseManualTime();
    
    BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIFORM)
      ->Unit(benchmark::kMillisecond)
      ->Apply(gen_final_size)
      ->UseManualTime();
    
    BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIFORM)
      ->Unit(benchmark::kMillisecond)
      ->Apply(gen_final_size)
      ->UseManualTime();
    
    
  • Rebuild, then run the benchmarks:

  • cd ./build
    make
    cd ./gbenchmarks
    ls
    # DYNAMIC_MAP_BENCH  RBK_BENCH  STATIC_MAP_BENCH
    
    ./DYNAMIC_MAP_BENCH
    ./STATIC_MAP_BENCH
    

3. Test Results

  • static_map:

    • A100:

    • Key: Int32, Value: Int32, Random: Uniform, total_size: 100,000,000

      | load_factor | OP                       | GPU time | CPU time | OPS     | TPS           |
      | ----------- | ------------------------ | -------- | -------- | ------- | ------------- |
      | 0.8         | BM_static_map_insert     | 17.8 ms  | 22.1 ms  | 4.494 B | 33.402 GB/sec |
      | 0.8         | BM_static_map_search_all | 11.1 ms  | 11.1 ms  | 7.207 B | 53.623 GB/sec |
      | 0.9         | BM_static_map_insert     | 24.8 ms  | 29.2 ms  | 3.629 B | 27.054 GB/sec |
      | 0.9         | BM_static_map_search_all | 16.8 ms  | 16.8 ms  | 5.357 B | 39.987 GB/sec |

      Key: Int64, Value: Int64, Random: Uniform, total_size: 100,000,000

      | load_factor | OP                       | GPU time | CPU time | OPS     | TPS           |
      | ----------- | ------------------------ | -------- | -------- | ------- | ------------- |
      | 0.8         | BM_static_map_insert     | 19.9 ms  | 25.9 ms  | 4.020 B | 60.051 GB/sec |
      | 0.8         | BM_static_map_search_all | 13.3 ms  | 13.3 ms  | 6.015 B | 89.458 GB/sec |
      | 0.9         | BM_static_map_insert     | 29.8 ms  | 36.5 ms  | 3.020 B | 44.936 GB/sec |
      | 0.9         | BM_static_map_search_all | 20.2 ms  | 20.2 ms  | 4.455 B | 66.476 GB/sec |
    • V100:

    • Key: Int32, Value: Int32, Random: Uniform, total_size: 100,000,000

      | load_factor | OP                       | GPU time | CPU time | OPS     | TPS           |
      | ----------- | ------------------------ | -------- | -------- | ------- | ------------- |
      | 0.8         | BM_static_map_insert     | 51.3 ms  | 56.0 ms  | 1.559 B | 11.629 GB/sec |
      | 0.8         | BM_static_map_search_all | 22.7 ms  | 22.7 ms  | 3.524 B | 26.238 GB/sec |
      | 0.9         | BM_static_map_insert     | 63.3 ms  | 68.1 ms  | 1.421 B | 10.587 GB/sec |
      | 0.9         | BM_static_map_search_all | 32.4 ms  | 32.4 ms  | 2.778 B | 20.688 GB/sec |

      Key: Int64, Value: Int64, Random: Uniform, total_size: 100,000,000

      | load_factor | OP                       | GPU time | CPU time | OPS     | TPS           |
      | ----------- | ------------------------ | -------- | -------- | ------- | ------------- |
      | 0.8         | BM_static_map_insert     | 58.7 ms  | 68.7 ms  | 1.363 B | 20.296 GB/sec |
      | 0.8         | BM_static_map_search_all | 26.8 ms  | 26.8 ms  | 2.985 B | 44.415 GB/sec |
      | 0.9         | BM_static_map_insert     | 76.6 ms  | 86.5 ms  | 1.175 B | 17.506 GB/sec |
      | 0.9         | BM_static_map_search_all | 40.5 ms  | 40.5 ms  | 2.222 B | 33.102 GB/sec |
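
    • The OPS and TPS columns follow from the measured GPU time and the benchmark's SetBytesProcessed((sizeof(Key) + sizeof(Value)) * iterations * num_keys) call. A quick cross-check against the first A100 row (80 M Int32 pairs at load factor 0.8, GPU time 17.8 ms); Google Benchmark's GB is 1024-based, which accounts for the small gap to the tabulated 33.402 GB/sec:

      #include <cstdio>

      int main()
      {
        const double num_pairs = 80e6;                 // b->Args({80'000'000, 80})
        const double bytes     = (4 + 4) * num_pairs;  // sizeof(Key) + sizeof(Value) per pair
        const double seconds   = 17.8e-3;              // measured GPU time

        std::printf("OPS: %.3f B\n", num_pairs / seconds / 1e9);          // ~4.494 B
        std::printf("TPS: %.2f GB/sec\n", bytes / seconds / (1u << 30));  // ~33.5 GB/sec
        return 0;
      }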
  • dynamic_map: (insertion performance from an empty map up to the given number of keys; once the configured load factor is exceeded the map grows by doubling its capacity; a resize sketch follows the V100 table below)

  • // How to change dynamic_map's maximum load factor:
    // ./cuCollections/include/cuco/detail/dynamic_map.inl
    
    namespace cuco {
    
    template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
    dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(
      std::size_t initial_capacity,
      sentinel::empty_key<Key> empty_key_sentinel,
      sentinel::empty_value<Value> empty_value_sentinel,
      Allocator const& alloc)
      : empty_key_sentinel_(empty_key_sentinel.value),
        empty_value_sentinel_(empty_value_sentinel.value),
        size_(0),
        capacity_(initial_capacity),
        min_insert_size_(1E4),
        max_load_factor_(0.60), // change to 0.8 / 0.9 for the tests
        alloc_{alloc}
        {...}
    
    • A100:

    • Key: Int32, Value: Int32, Random: Uniform, key_size: 150,000,000

      | load_factor | OP                    | GPU time | CPU time | OPS     | TPS           |
      | ----------- | --------------------- | -------- | -------- | ------- | ------------- |
      | 0.8         | BM_dynamic_insert     | 117.0 ms | 125.0 ms | 1.282 B | 9.529 GB/sec  |
      | 0.8         | BM_dynamic_search_all | 36.0 ms  | 36.0 ms  | 4.167 B | 31.079 GB/sec |
      | 0.9         | BM_dynamic_insert     | 161 ms   | 169 ms   | 0.932 B | 6.946 GB/sec  |
      | 0.9         | BM_dynamic_search_all | 56.0 ms  | 56.0 ms  | 2.679 B | 19.954 GB/sec |

      Key: Int64, Value: Int64, Random: Uniform, key_size: 150,000,000

      | load_factor | OP                    | GPU time | CPU time | OPS     | TPS           |
      | ----------- | --------------------- | -------- | -------- | ------- | ------------- |
      | 0.8         | BM_dynamic_insert     | 162.0 ms | 174.0 ms | 0.926 B | 13.815 GB/sec |
      | 0.8         | BM_dynamic_search_all | 48.6 ms  | 48.6 ms  | 3.086 B | 46.037 GB/sec |
      | 0.9         | BM_dynamic_insert     | 210 ms   | 222 ms   | 0.714 B | 10.632 GB/sec |
      | 0.9         | BM_dynamic_search_all | 81.6 ms  | 81.6 ms  | 1.838 B | 27.404 GB/sec |
    • V100:

    • Key: Int32, Value: Int32, Random: Uniform, key_size: 150,000,000

      | load_factor | OP                    | GPU time | CPU time | OPS     | TPS           |
      | ----------- | --------------------- | -------- | -------- | ------- | ------------- |
      | 0.8         | BM_dynamic_insert     | 152 ms   | 158 ms   | 0.987 B | 7.370 GB/sec  |
      | 0.8         | BM_dynamic_search_all | 63.5 ms  | 63.5 ms  | 2.362 B | 17.610 GB/sec |
      | 0.9         | BM_dynamic_insert     | 216 ms   | 223 ms   | 0.694 B | 5.173 GB/sec  |
      | 0.9         | BM_dynamic_search_all | 92.8 ms  | 92.8 ms  | 1.616 B | 12.049 GB/sec |

      Key: Int64, Value: Int64, Random: Uniform, key_size: 150,000,000

      | load_factor | OP                    | GPU time | CPU time | OPS     | TPS           |
      | ----------- | --------------------- | -------- | -------- | ------- | ------------- |
      | 0.8         | BM_dynamic_insert     | 192 ms   | 206 ms   | 0.781 B | 11.637 GB/sec |
      | 0.8         | BM_dynamic_search_all | 83.7 ms  | 83.7 ms  | 1.792 B | 26.698 GB/sec |
      | 0.9         | BM_dynamic_insert     | 287 ms   | 299 ms   | 0.523 B | 7.800 GB/sec  |
      | 0.9         | BM_dynamic_search_all | 141 ms   | 141 ms   | 1.064 B | 15.896 GB/sec |
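
    • How much resizing do the dynamic_map runs actually involve? A back-of-the-envelope sketch, ignoring duplicate keys and assuming capacity simply doubles whenever the size would exceed max_load_factor * capacity:

      #include <cstdio>

      int main()
      {
        const double max_load_factor      = 0.8;          // or 0.9, as patched above
        unsigned long long capacity       = 1ull << 27;   // the benchmark's initial_size (134'217'728)
        const unsigned long long num_keys = 150'000'000;  // gen_final_size

        int doublings = 0;
        while (num_keys > max_load_factor * capacity) {
          capacity *= 2;
          ++doublings;
        }
        // a single doubling at either load factor: final capacity 268'435'456
        std::printf("capacity %llu after %d doubling(s)\n", capacity, doublings);
        return 0;
      }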
  • key: int64 (uniform distribution); value: 64 bytes (arbitrary content)

  • Insert 3,000,000 pairs per batch and record each batch's load_factor and throughput (Gpair/s); the test program is listed below.

  • (Example output screenshot omitted; it showed each batch's load_factor and throughput in Gpair/s, as printed by the program below.)


#include <cuco/static_map.cuh>

#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/logical.h>
#include <thrust/transform.h>

#include <benchmark/benchmark.h>

#include <cassert>
#include <fstream>
#include <iostream>
#include <random>


// User-defined value type
struct Embedding_64 {
  int64_t data[8];

  __host__ __device__ Embedding_64() {}
  __host__ __device__ Embedding_64(int64_t x) : data{x} {}
  // values are identified by data[0] alone, so comparing the first element suffices
  __device__ bool operator == (Embedding_64 const& cmp) const
  {
    return data[0] == cmp.data[0];
  }
};

struct Embedding_128 {
  int64_t data[16];

  __host__ __device__ Embedding_128() {}
  __host__ __device__ Embedding_128(int64_t x) : data{x} {}
  __device__ bool operator == (Embedding_128 const& cmp) const
  {
    return data[0] == cmp.data[0];
  }
};

struct Embedding_512 {
  int64_t data[64];

  __host__ __device__ Embedding_512() {}
  __host__ __device__ Embedding_512(int64_t x) : data{x} {}
  __device__ bool operator == (Embedding_512 const& cmp) const
  {
    return data[0] == cmp.data[0];
  }
};

// Generate key with uniform distribution
template <typename Key, typename OutputIt>
static void generate_keys(OutputIt output_begin, OutputIt output_end)
{
  auto num_keys = std::distance(output_begin, output_end);

  std::random_device rd;
  std::mt19937 gen{rd()};
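  // note: mt19937 yields 32-bit values, so even with Key = int64_t the keys only span [0, 2^32)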

  for (auto i = 0; i < num_keys; ++i) {
    output_begin[i] = std::abs(static_cast<Key>(gen()));
  }
}


int main(void)
{
  using Key = int64_t;
  // using Value = int64_t;
  using Value = Embedding_128;
  using map_type = cuco::static_map<Key, Value>;

  std::size_t num_keys = 95'000'000;
  std::size_t size     = 100'000'000;

  std::vector<Key> h_keys(num_keys);
  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);
  generate_keys<Key>(h_keys.begin(), h_keys.end());
  std::cout<<"Key_size:"<<sizeof(Key)<<" Value_size:"<<sizeof(Value)<<std::endl;

  for (std::size_t i = 0; i < num_keys; ++i) {
    Key key           = h_keys[i];
    Value val{static_cast<int64_t>(i)};  // explicit cast: brace-init forbids narrowing size_t -> int64_t
    h_pairs[i].first  = key;
    h_pairs[i].second = val;
  }

  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);
  thrust::device_vector<Key> d_keys(h_keys);

  map_type map{size, cuco::sentinel::empty_key<Key>{-1}, cuco::sentinel::empty_value<Value>{Value{-1}}};
  int64_t last_loop_key_size = 0;
  // 95'000'000 / 3'000'000 == 31 full batches; the trailing 2'000'000 pairs are never inserted
  for(size_t i=0; i<num_keys/3'000'000; i+=1)
  {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    map.insert(d_pairs.begin()+i*3'000'000, d_pairs.begin()+(i+1)*3'000'000);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float time_ms;
    cudaEventElapsedTime(&time_ms, start, stop);
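    // throughput printed below: (pairs inserted this batch) / time_ms / 1e6 == Gpair/s,
    // since pairs/ms * 1e3 = pairs/s and 1e9 pairs/s = 1 Gpair/s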

    std::cout<<"time: "<<time_ms<<"ms key_size: "<<map.get_size()<<" total_size: "<<map.get_capacity()<<" load_factor: "<<map.get_load_factor()<<" throughput: "<<(map.get_size()-last_loop_key_size)/time_ms/1'000'000<<"Gpair/s"<<std::endl;
    last_loop_key_size = map.get_size();
  }
  

  /*---Check that all inserted keys are inside the hashtable---*/
  // only the 31 full batches were inserted, so check exactly that prefix of d_keys
  std::size_t num_inserted = (num_keys / 3'000'000) * 3'000'000;

  thrust::device_vector<bool> contained(num_inserted);
  map.contains(d_keys.begin(), d_keys.begin() + num_inserted, contained.begin());

  assert(
    thrust::all_of(contained.begin(), contained.end(), [] __device__(auto const& b) { return b; }));

  return 0;
}


  • insert call chain:

  • // The user's .cu file calls insert, passing all key/value pairs to be inserted
    map.insert(d_pairs.begin()+i*loop_step, d_pairs.begin()+(i+1)*loop_step);
    
  • // ./cuCollections/include/cuco/detail/static_map.inl
    // host-side bulk insert: computes the launch configuration and launches the insert kernel
    template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
    template <typename InputIt, typename Hash, typename KeyEqual>
    void static_map<Key, Value, Scope, Allocator>::insert(
      InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream)
    {
      auto num_keys = std::distance(first, last);
      if (num_keys == 0) { return; }
    
      auto const block_size = 128;
      auto const stride     = 1;
      auto const tile_size  = 1;
      auto const grid_size  = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
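      // e.g. a 3'000'000-pair batch with block_size = 128, tile_size = stride = 1:
      // grid_size = (3'000'000 + 127) / 128 = 23'438 blocks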
      auto view             = get_device_mutable_view();
    
      // TODO: memset an atomic variable is unsafe
      static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type));
      CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream));
      std::size_t h_num_successes;
    
      detail::insert<block_size, tile_size><<<grid_size, block_size, 0, stream>>>(
        first, first + num_keys, num_successes_, view, hash, key_equal);
      CUCO_CUDA_TRY(cudaMemcpyAsync(
        &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
    
      CUCO_CUDA_TRY(cudaStreamSynchronize(stream));  // stream sync to ensure h_num_successes is updated
    
      size_ += h_num_successes;
    }
    
  • // ./cuCollections/include/cuco/detail/static_map_kernels.cuh
    // Cooperative Groups are used for higher throughput and stable performance at high load factors:
    // the input kv pairs are partitioned across tiles of tile_size threads, and the number of
    // successful insertions is counted and reduced at the end
    template <std::size_t block_size,
              uint32_t tile_size,
              typename InputIt,
              typename atomicT,
              typename viewT,
              typename Hash,
              typename KeyEqual>
    __global__ void insert(
      InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal)
    {
      typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
      __shared__ typename BlockReduce::TempStorage temp_storage;
      std::size_t thread_num_successes = 0;
    
      auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
      auto tid  = block_size * blockIdx.x + threadIdx.x;
      auto it   = first + tid / tile_size;
    
      while (it < last) {
        // force conversion to value_type
        typename viewT::value_type const insert_pair{*it};
        if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) {
          thread_num_successes++;
        }
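        // grid-stride step: each tile advances by the total number of tiles in the grid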
        it += (gridDim.x * block_size) / tile_size;
      }
    
      // compute number of successfully inserted elements for each block
      // and atomically add to the grand total
      std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes);
      if (threadIdx.x == 0) { *num_successes += block_num_successes; }
    }
    
  • // cuCollections/include/cuco/detail/static_map.inl
    // device-side insert: each tile (cooperative group) inserts one kv pair at a time
    template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
    template <typename CG, typename Hash, typename KeyEqual>
    __device__ bool static_map<Key, Value, Scope, Allocator>::device_mutable_view::insert(
      CG const& g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept
    {
      auto current_slot = initial_slot(g, insert_pair.first, hash);
    
      while (true) {
        key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
    
    // The user-provided `key_equal` can never be used to compare against `empty_key_sentinel` as the
    // sentinel is not a valid key value. Therefore, first check for the sentinel
        auto const slot_is_available =
          detail::bitwise_compare(existing_key, this->get_empty_key_sentinel()) or
      detail::bitwise_compare(existing_key, this->get_erased_key_sentinel()); // true if the slot is available (empty or erased)
    
        // the key we are trying to insert is already in the map, so we return with failure to insert
        if (g.any(not slot_is_available and key_equal(existing_key, insert_pair.first))) {
          return false;
        }
    
        auto const window_contains_available = g.ballot(slot_is_available); // zero if no thread in g sees an available slot
    
        // we found an empty slot, but not the key we are inserting, so this must
        // be an empty slot into which we can insert the key
        if (window_contains_available) {
          // the first lane in the group with an empty slot will attempt the insert
          insert_result status{insert_result::CONTINUE};
          uint32_t src_lane = __ffs(window_contains_available) - 1; // __ffs is 1-based, hence the -1
    
          if (g.thread_rank() == src_lane) {
            // One single CAS operation if `value_type` is packable
            if constexpr (cuco::detail::is_packable<value_type>()) {
              status = packed_cas(current_slot, insert_pair, key_equal, existing_key);
            }
            // Otherwise, two back-to-back CAS operations
            else {
              #if (__CUDA_ARCH__ < 700)
                status = cas_dependent_write(current_slot, insert_pair, key_equal, existing_key);
              #else
                // the custom embedding value is larger than 8 B, so back_to_back_cas
                // inserts the key and the value with two separate CAS operations
                status = back_to_back_cas(current_slot, insert_pair, key_equal, existing_key);
              #endif
            }
          }
    
          uint32_t res_status = g.shfl(static_cast<uint32_t>(status), src_lane);
          status              = static_cast<insert_result>(res_status);
    
          // successful insert
          if (status == insert_result::SUCCESS) { return true; }
          // duplicate present during insert
          if (status == insert_result::DUPLICATE) { return false; }
          // if we've gotten this far, a different key took our spot
          // before we could insert. We need to retry the insert on the
          // same window
        }
        // if there are no empty slots in the current window,
        // we move onto the next window
        else {
          // move the probing window forward
          current_slot = next_slot(g, current_slot);
        }
      }
    }
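
  • A toy host-side illustration of the lane election above; GCC's __builtin_ffs plays the role of the device __ffs intrinsic (both return 1-based bit positions):

    #include <cassert>

    int main()
    {
      // suppose lanes 3 and 5 of the tile report an available slot
      unsigned window_contains_available = (1u << 3) | (1u << 5);
      // lowest set bit wins: __ffs(...) - 1 selects lane 3
      int src_lane = __builtin_ffs(static_cast<int>(window_contains_available)) - 1;
      assert(src_lane == 3);  // that lane alone attempts the CAS
      return 0;
    }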
    
  • // cuCollections/include/cuco/detail/static_map.inl
    // the insert CAS: the key and the value are each written with an atomic CAS
    // returns SUCCESS on a successful insert, DUPLICATE if the key is already present, and
    // CONTINUE if another key claimed the slot first (the caller retries within the same window)
    template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
    template <typename KeyEqual>
    __device__ static_map<Key, Value, Scope, Allocator>::device_mutable_view::insert_result
    static_map<Key, Value, Scope, Allocator>::device_mutable_view::back_to_back_cas(
      iterator current_slot,
      value_type const& insert_pair,
      KeyEqual key_equal,
      Key expected_key) noexcept
    {
      // back_to_back_cas(current_slot, insert_pair, key_equal, existing_key)
      using cuda::std::memory_order_relaxed;
    
      auto expected_value = this->get_empty_value_sentinel();
    
      // Back-to-back CAS for 8B/8B key/value pairs
      auto& slot_key   = current_slot->first;
      auto& slot_value = current_slot->second;
    
      bool key_success =
        slot_key.compare_exchange_strong(expected_key, insert_pair.first, memory_order_relaxed);
      bool value_success =
        slot_value.compare_exchange_strong(expected_value, insert_pair.second, memory_order_relaxed);
    
      if (key_success) {
        while (not value_success) {
          value_success =
            slot_value.compare_exchange_strong(expected_value = this->get_empty_value_sentinel(),
                                               insert_pair.second,
                                               memory_order_relaxed);
        }
        return insert_result::SUCCESS;
      } else if (value_success) {
        slot_value.store(this->get_empty_value_sentinel(), memory_order_relaxed);
      }
    
      // our key was already present in the slot, so our key is a duplicate
      if (key_equal(insert_pair.first, expected_key)) { return insert_result::DUPLICATE; }
    
      return insert_result::CONTINUE;
    }