API Reference

All classes and functions exist in the facebook::cuda namespace.

namespace facebook::cuda

Functions

template <typename T>
__host__ __device__ __forceinline__ constexpr T facebook::cuda::ceil(T a, T b)

Computes ceil(a / b)

template <typename T>
__host__ __device__ __forceinline__ constexpr T facebook::cuda::floor(T a, T b)

Computes floor(a / b)
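
For example, ceil is handy for computing launch geometry on the host. A minimal sketch; the kernel name and sizes are illustrative, not part of the library:

// Cover numElements elements with blocks of 256 threads each.
constexpr int kBlockSize = 256;
int numElements = 10000;
int numBlocks = facebook::cuda::ceil(numElements, kBlockSize); // ceil(10000 / 256) = 40
// myKernel<<<numBlocks, kBlockSize>>>(devicePtr, numElements);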

__device__ __forceinline__ int facebook::cuda::getWarpId()

Returns the current thread’s warp ID

__device__ __forceinline__ int facebook::cuda::getThreadsInBlock()

Returns the number of threads in the current block (linearized).

__device__ __forceinline__ int facebook::cuda::getWarpsInBlock()

Returns the number of warps in the current block (linearized, rounded to whole warps).

template <typename T>
__device__ __forceinline__ T facebook::cuda::ptrMin(T a, T b)

Pointer comparison using the PTX intrinsic; min() doesn’t work for T*.

template <typename T>
__device__ __forceinline__ T facebook::cuda::ptrMax(T a, T b)

Pointer comparison using the PTX intrinsic; max() doesn’t work for T*.

__device__ __forceinline__ int facebook::cuda::getLaneId()

Return the current thread’s lane in the warp

__device__ __forceinline__ unsigned facebook::cuda::getLaneMaskLt()

Return a bitmask with bits set in positions less than the current thread’s lane number in the warp.

__device__ __forceinline__ unsigned facebook::cuda::getLaneMaskLe()

Return a bitmask with bits set in positions less than or equal to the current thread’s lane number in the warp.

__device__ __forceinline__ unsigned facebook::cuda::getLaneMaskGt()

Return a bitmask with bits set in positions greater than the current thread’s lane number in the warp.

__device__ __forceinline__ unsigned facebook::cuda::getLaneMaskGe()

Return a bitmask with bits set in positions greater than or equal to the current thread’s lane number in the warp.
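
As an illustration, these masks combine with a warp vote to give each lane the count of lower-numbered lanes whose predicate is true (a warp prefix count). This sketch assumes a fully active warp and a CUDA toolkit that provides __ballot_sync:

__device__ int countLowerLanesWithPredicate(bool pred) {
  // Bit i of the ballot is set if lane i's predicate is true.
  unsigned ballot = __ballot_sync(0xffffffffu, pred);
  // Keep only bits for lanes below this one, then population-count them.
  return __popc(ballot & facebook::cuda::getLaneMaskLt());
}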

__device__ __forceinline__ int facebook::cuda::getBit(int val, int pos)

Extract a single bit at pos from val

__device__ __forceinline__ unsigned facebook::cuda::setBit(unsigned val, unsigned toInsert, int pos)

Insert a single bit into val at position pos

__device__ __forceinline__ unsigned facebook::cuda::getBitfield(unsigned val, int pos, int len)

Extract a bit field of length len at pos from val

__device__ __forceinline__ unsigned long facebook::cuda::getBitfield(unsigned long val, int pos, int len)

Extract a bit field of length len at pos from val

__device__ __forceinline__ unsigned facebook::cuda::setBitfield(unsigned val, unsigned toInsert, int pos, int len)

Insert len bits of toInsert into val starting at position pos

__device__ __forceinline__ unsigned long facebook::cuda::setBitfield(unsigned long val, unsigned toInsert, int pos, int len)

Insert len bits of toInsert into val starting at position pos

__device__ __forceinline__ constexpr int facebook::cuda::getMSB(int val)

Returns the index of the most significant 1 bit in val.
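
A small illustrative sketch of the bit-field helpers; the field layout here is made up for the example:

__device__ unsigned repackField(unsigned word) {
  // Extract an 8-bit field starting at bit 4 of word...
  unsigned field = facebook::cuda::getBitfield(word, 4, 8);
  // ...and insert it at bit 16, leaving the other bits of word unchanged.
  return facebook::cuda::setBitfield(word, field, 16, 8);
}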

template <typename T, int Dim, typename IndexT, template< typename U > class PtrTraits>
std::ostream &operator<<(std::ostream &os, const DeviceTensor<T, Dim, IndexT, PtrTraits> &t)

Streaming operator for logging.

__host__ __device__ float facebook::cuda::createRoundingFactor(float max, int n)

Constructs a rounding factor used to truncate elements in a sum such that the sum of the truncated elements is the same no matter what the order of the sum is.

Floating point summation is not associative; using this factor makes it associative, so a parallel sum can be performed in any order (presumably using atomics).

Follows Algorithm 5: Reproducible Sequential Sum in ‘Fast Reproducible Floating-Point Summation’ by Demmel and Nguyen http://www.eecs.berkeley.edu/~hdnguyen/public/papers/ARITH21_Fast_Sum.pdf

For summing x_i, i = 1 to n:

Parameters
  • max: The maximum absolute value abs(x_i) seen across the elements to be summed
  • n: The number of elements for the sum, or an upper bound estimate

__host__ __device__ float facebook::cuda::truncateWithRoundingFactor(float roundingFactor, float x)

Given the rounding factor in createRoundingFactor calculated using max(|x_i|), truncate x to a value that can be used for a deterministic, reproducible parallel sum of all x_i.
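
A minimal sketch of combining the two functions for an order-independent sum via atomics; the kernel, and the assumption that maxAbs and a zeroed result are provided by the caller, are illustrative:

__global__ void reproducibleSum(const float* x, int n, float maxAbs, float* result) {
  // One rounding factor for the whole sum, derived from max |x_i| and n.
  float factor = facebook::cuda::createRoundingFactor(maxAbs, n);
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Truncated summands add up to the same total in any order, so the
    // non-deterministic ordering of atomicAdd no longer affects the result.
    atomicAdd(result, facebook::cuda::truncateWithRoundingFactor(factor, x[i]));
  }
}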

__device__ bool facebook::cuda::inBounds(int x, int padL, const DeviceTensor<float, 2>& t)
__device__ bool facebook::cuda::inBounds(int y, int x, int padU, int padL, const DeviceTensor<float, 3>& t)
__device__ __forceinline__ bool facebook::cuda::inBounds(int y, int x, int padU, int padL, const DeviceTensor<float, 4>& t)
__device__ bool facebook::cuda::inBounds(int x, const DeviceTensor<float, 2>& t)
__device__ bool facebook::cuda::inBounds(int y, int x, const DeviceTensor<float, 3>& t)
__device__ bool facebook::cuda::inBounds(int y, int x, const DeviceTensor<float, 4>& t)
template <int Dim, bool ConjugateTransposeA, bool ConjugateTransposeB, bool Accumulate>
void transposeMM(DeviceTensor<float, Dim>& A, DeviceTensor<float, Dim>& B, DeviceTensor<float, Dim>& C, float invNorm, cudaStream_t s)
template <typename T>
__device__ __forceinline__ T facebook::cuda::shfl(const T val, int srcLane, int width = WARP_SIZE)
template <typename T>
__device__ __forceinline__ T facebook::cuda::shfl_up(const T val, int delta, int width = WARP_SIZE)
template <typename T>
__device__ __forceinline__ T facebook::cuda::shfl_down(const T val, int delta, int width = WARP_SIZE)
template <typename T>
__device__ __forceinline__ T facebook::cuda::shfl_xor(const T val, int laneMask, int width = WARP_SIZE)
template <typename K, typename V>
__device__ __forceinline__ Pair<K, V> facebook::cuda::shfl(const Pair<K, V>& p, int srcLane, int width = WARP_SIZE)
template <typename K, typename V>
__device__ __forceinline__ Pair<K, V> facebook::cuda::shfl_up(const Pair<K, V>& p, int delta, int width = WARP_SIZE)
template <typename K, typename V>
__device__ __forceinline__ Pair<K, V> facebook::cuda::shfl_down(const Pair<K, V>& p, int delta, int width = WARP_SIZE)
template <typename K, typename V>
__device__ __forceinline__ Pair<K, V> facebook::cuda::shfl_xor(const Pair<K, V>& p, int laneMask, int width = WARP_SIZE)
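
These appear to be typed wrappers over the warp shuffle intrinsics (note the Pair<K, V> overloads above). For instance, broadcasting a value from lane 0 to the whole warp is a one-liner; a sketch:

__device__ float broadcastFromLane0(float v) {
  // Every lane receives lane 0's copy of v.
  return facebook::cuda::shfl(v, 0);
}
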
template <int type>
__global__ void facebook::cuda::sleepKernel(double* cycles, int64_t waitCycles)
void cudaSleep(int64_t cycles, int type)
template <typename T, typename Comparator>
__device__ bool facebook::cuda::warpSort(const DeviceTensor<T, 1>& key, DeviceTensor<T, 1>& sortedKey)
template <typename T, typename IndexType, typename Comparator>
__device__ bool facebook::cuda::warpSort(const DeviceTensor<T, 1>& key, DeviceTensor<T, 1>& sortedKey, DeviceTensor<IndexType, 1>& sortedKeyIndices)
template <typename K, typename V, typename Comparator>
__device__ bool facebook::cuda::warpSort(const DeviceTensor<K, 1>& key, const DeviceTensor<V, 1>& value, DeviceTensor<K, 1>& sortedKey, DeviceTensor<V, 1>& sortedValue)
__device__ Pair<float, int> facebook::cuda::warpFindTopKthElement(const DeviceTensor<float, 1>& data, int k)

Finds the Kth highest floating point value in a linear array [arr, end) without modifying the data and without temporary storage except for registers.

  • K starts at 1.
  • All threads in the warp will return the value.
  • Handles all floats except NaNs.
  • This function minimizes warp divergence.

Returns the number of times the top-Kth element uniquely occurs along with its value.
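
A usage sketch; the single-warp kernel and output pointers are illustrative, not part of the library:

__global__ void kthHighestKernel(facebook::cuda::DeviceTensor<float, 1> data,
                                 int k, float* value, int* count) {
  // Assumes the kernel is launched with a single warp.
  // Every lane returns the same {value, unique occurrence count} pair,
  // so only one lane needs to write the result out.
  facebook::cuda::Pair<float, int> res =
      facebook::cuda::warpFindTopKthElement(data, k);
  if (facebook::cuda::getLaneId() == 0) {
    *value = res.k;  // the k-th highest value
    *count = res.v;  // how many times it occurs
  }
}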

__device__ void facebook::cuda::warpFindTopKElementsIndexOrder(const DeviceTensor<float, 1>& data, DeviceTensor<float, 1>& out, int k)

For a given warp, find and write out the top-k highest floating point values in [start, end) to [out, out + k). The list written out is ordered based on original index order. Handles all floats except NaNs.

template <typename IndexType>
__device__ void facebook::cuda::warpFindTopKElementsIndexOrder(const DeviceTensor<float, 1>& data, DeviceTensor<float, 1>& out, DeviceTensor<IndexType, 1>& indices, int k)

Version of warpFindTopKElementsIndexOrder which also writes out the indices of the found top elements from data. The list written out is ordered based on original index order. Handles all floats except NaNs. Supports writing out float or integer indices.

__device__ void facebook::cuda::warpFindTopKElementsValueOrder(const DeviceTensor<float, 1>& data, DeviceTensor<float, 1>& out, int k)

For a given warp, find and write out the top-k highest floating point values in [start, end) to [out, out + k). The list written out is ordered based on float value. Handles all floats except NaNs.

template <typename IndexType>
__device__ void facebook::cuda::warpFindTopKElementsValueOrder(const DeviceTensor<float, 1>& data, DeviceTensor<float, 1>& out, DeviceTensor<IndexType, 1>& indices, int k)

Version of warpFindTopKElementsValueOrder which also writes out the indices of the found top elements from data. The list written out is ordered based on float value. Handles all floats except NaNs. Supports writing out float or integer indices.

template <typename T, typename Comparator>
__device__ T facebook::cuda::warpBitonicSort(T val)

Defines a bitonic sort network that compares and exchanges val across the warp, ordered according to the Comparator template parameter. In other words, if the comparator is GreaterThan<T>, then lane 0 will contain the highest val presented across the warp.

See also http://on-demand.gputechconf.com/gtc/2013/presentations/S3174-Kepler-Shuffle-Tips-Tricks.pdf
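
A sketch of sorting one value per lane in descending order across the warp:

__device__ float sortWarpDescending(float val) {
  // After the exchange network runs, lane 0 holds the largest value seen
  // across the warp and lane 31 the smallest (assuming a full warp).
  return facebook::cuda::warpBitonicSort<float, facebook::cuda::GreaterThan<float>>(val);
}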

template <typename T, typename Op>
__device__ __forceinline__ T facebook::cuda::warpReduce(T val, Op op)

Reduce a value across a warp by applying the commutative function Op. All threads in the warp receive the reduced value. Assumes that all threads in the warp are participating in the reduction.
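
For example, a warp-wide sum using the generic reduction and the Sum functor documented below (equivalent to warpReduceSum); a sketch:

__device__ float warpSum(float val) {
  // Every lane ends up holding the sum of val over all 32 lanes.
  return facebook::cuda::warpReduce(val, facebook::cuda::Sum<float>());
}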

template <typename T>
__device__ __forceinline__ T facebook::cuda::warpReduceSum(T val)

Sums a register value across all warp threads.

template <typename T>
__device__ __forceinline__ T facebook::cuda::warpReduceMax(T val)

Finds the maximum val across the warp.

template <typename T>
__device__ __forceinline__ T facebook::cuda::warpReduceMin(T val)

Finds the minimum val across the warp.

template <typename T>
__device__ __forceinline__ bool facebook::cuda::warpHasCollision(T val)

Determines whether any two threads in the warp hold the same value (a collision).

template <typename T>
__device__ __forceinline__ unsigned int facebook::cuda::warpCollisionMask(T val)

Determine if two warp threads have the same value (a collision), and returns a bitmask of the lanes that are known to collide with other lanes. Not all lanes that are mutually colliding return a bit; all lanes with a 1 bit are guaranteed to collide with a lane with a 0 bit, so the mask can be used to serialize execution for lanes that collide with others. (mask | (mask >> 1)) will yield all mutually colliding lanes.

struct Complex

cuComplex wrapper.

Public Functions

__host__ __device__ __forceinline__ facebook::cuda::Complex::Complex()
__host__ __device__ __forceinline__ facebook::cuda::Complex::Complex(float re)
__host__ __device__ __forceinline__ facebook::cuda::Complex::Complex(float re, float im)
__host__ __device__ __forceinline__ facebook::cuda::Complex::Complex(const Complex & c)
__host__ __device__ __forceinline__ facebook::cuda::Complex::Complex(const cuComplex & c)
__host__ __device__ __forceinline__ Complex& facebook::cuda::Complex::operator=(const Complex & c)
__host__ __device__ __forceinline__ bool facebook::cuda::Complex::operator==(const Complex & c) const
__host__ __device__ __forceinline__ bool facebook::cuda::Complex::operator!=(const Complex & c) const
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::operator-() const
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::operator-(const Complex & c) const
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::operator+(const Complex & c) const
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::operator*(const Complex & c) const
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::operator/(const Complex & c) const
__host__ __device__ __forceinline__ Complex& facebook::cuda::Complex::operator+=(const Complex & c)
__host__ __device__ __forceinline__ Complex& facebook::cuda::Complex::operator-=(const Complex & c)
__host__ __device__ __forceinline__ Complex& facebook::cuda::Complex::operator*=(const Complex & c)
__host__ __device__ __forceinline__ Complex& facebook::cuda::Complex::operator/=(const Complex & c)
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::transpose() const
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::conjugate() const
__host__ __device__ __forceinline__ void facebook::cuda::Complex::cexp(float angle)
__host__ __device__ __forceinline__ float& facebook::cuda::Complex::re()
__host__ __device__ __forceinline__ float& facebook::cuda::Complex::im()
__host__ __device__ __forceinline__ const float& facebook::cuda::Complex::re() const
__host__ __device__ __forceinline__ const float& facebook::cuda::Complex::im() const
__host__ __device__ __forceinline__ facebook::cuda::Complex::operator float2() const
template <typename T>
struct DefaultPtrTraits

Public Types

typedef T *PtrType
template <typename T, int Dim, typename IndexT = int, template< typename U > class PtrTraits = DefaultPtrTraits>
class DeviceTensor

Our tensor type.

Templated multi-dimensional array that supports strided access of elements. Main access is through operator[]; e.g., tensor[x][y][z].

  • T is the contained type (e.g., float)
  • Dim is the tensor rank
  • IndexT is the integer type used for size/stride arrays and for all indexing math. The default is int, but for large tensors long can be used instead.
  • PtrTraits are traits applied to our data pointer (T*). By default this is just T*, but RestrictPtrTraits can be used to apply T* __restrict__ for alias-free analysis.
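
A minimal usage sketch: wrap an existing device allocation and index it from a kernel (the allocation, sizes and kernel are illustrative):

__global__ void fillOnes(facebook::cuda::DeviceTensor<float, 2> t) {
  // operator[] chains down to an element reference: t[i][j].
  if (threadIdx.x < t.getSize(0) && threadIdx.y < t.getSize(1)) {
    t[threadIdx.x][threadIdx.y] = 1.0f;
  }
}

// Host side: view a raw device pointer (devicePtr, illustrative) as a
// 3 x 4 row-major tensor and pass it to the kernel by value.
int sizes[2] = {3, 4};
facebook::cuda::DeviceTensor<float, 2> t(devicePtr, sizes);
// fillOnes<<<1, dim3(3, 4)>>>(t);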

Public Types

enum [anonymous]

Values:

NumDim = Dim
typedef T DataType
typedef IndexT IndexType
typedef PtrTraits<T>::PtrType DataPtrType
typedef DeviceTensor<T, Dim, IndexT, PtrTraits> TensorType

Public Functions

__host__ __device__ facebook::cuda::DeviceTensor::DeviceTensor()

Default constructor.

__host__ __device__ facebook::cuda::DeviceTensor::DeviceTensor(DataPtrType data, const IndexT sizes[Dim])

Constructor that calculates strides with no padding.

__host__ __device__ facebook::cuda::DeviceTensor::DeviceTensor(DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])

Constructor that takes arbitrary size/stride arrays.

template <int OtherDim>
__host__ __device__ bool facebook::cuda::DeviceTensor::isSameSize(const DeviceTensor < T, OtherDim, IndexT, PtrTraits > & rhs) const

Returns true if the two tensors are of the same dimensionality and size.

template <int OtherDim>
__host__ __device__ bool facebook::cuda::DeviceTensor::isSameSizeAndStride(const DeviceTensor < T, OtherDim, IndexT, PtrTraits > & rhs) const

Returns true if the two tensors are of the same dimensionality, size and stride.

std::string toString() const

Produces a string containing our size and stride array contents; for debugging purposes

template <typename U>
__host__ __device__ DeviceTensor< U, Dim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::cast()

Cast to a tensor of a different type of the same size and stride.

template <typename U>
__host__ __device__ const DeviceTensor< U, Dim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::cast() const
__host__ __device__ __forceinline__ DataPtrType facebook::cuda::DeviceTensor::data()

Returns a raw pointer to the start of our data.

__host__ __device__ __forceinline__ const DataPtrType facebook::cuda::DeviceTensor::data() const

Returns a raw pointer to the start of our data (const).

template <typename U>
__host__ __device__ __forceinline__ PtrTraits<U>::PtrType facebook::cuda::DeviceTensor::dataAs()

Cast to a different datatype.

template <typename U>
__host__ __device__ __forceinline__ const PtrTraits<const U>::PtrType facebook::cuda::DeviceTensor::dataAs() const

Cast to a different datatype.

__host__ __device__ __forceinline__ detail::DeviceSubTensor< DeviceTensor< T, Dim, IndexT, PtrTraits >, Dim-1, PtrTraits > facebook::cuda::DeviceTensor::operator[](IndexT index)

Returns a read/write view of a portion of our tensor.

__host__ __device__ __forceinline__ const detail::DeviceSubTensor< DeviceTensor< T, Dim, IndexT, PtrTraits >, Dim-1, PtrTraits > facebook::cuda::DeviceTensor::operator[](IndexT index) const

Returns a read/write view of a portion of our tensor (const).

__host__ __device__ __forceinline__ int facebook::cuda::DeviceTensor::getSize(int i) const

Returns the size of a given dimension, [0, Dim - 1]. No bounds checking.

__host__ __device__ __forceinline__ int facebook::cuda::DeviceTensor::getStride(int i) const

Returns the stride of a given dimension, [0, Dim - 1]. No bounds checking.

__host__ __device__ long facebook::cuda::DeviceTensor::numElements() const

Returns the total number of elements contained within our data (product of getSize(i))

__host__ __device__ __forceinline__ const IndexT* facebook::cuda::DeviceTensor::sizes() const

Returns the size array.

__host__ __device__ __forceinline__ const IndexT* facebook::cuda::DeviceTensor::strides() const

Returns the stride array.

void permuteDims(const std::vector<int> &perm)

Limited form of resizing by permuting the dimensions; the permutation array supplied must be a valid permutation. Only works for contiguous tensors.

__host__ __device__ bool facebook::cuda::DeviceTensor::isContiguous() const

Returns true if there is no padding within the tensor and no re-ordering of the dimensions.

(stride(i) == size(i + 1) * stride(i + 1))

__host__ __device__ bool facebook::cuda::DeviceTensor::isConsistentlySized(int i) const

Returns whether a given dimension has only increasing stride from the previous dimension. A tensor that was permuted by exchanging size and stride only will fail this check. If i == 0 just check size > 0. Returns false if stride is <= 0.

__host__ __device__ bool facebook::cuda::DeviceTensor::isConsistentlySized() const
__host__ __device__ bool facebook::cuda::DeviceTensor::isContiguousDim(int i) const

Returns true if the given dimension index has no padding.

__host__ __device__ DeviceTensor< T, Dim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::transpose(int dim1, int dim2) const

Returns a tensor of the same dimension after transposing the two dimensions given. Does not actually move elements; transposition is made by permuting the size/stride arrays.

template <int NewDim>
__host__ __device__ DeviceTensor< T, NewDim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::upcastOuter()

Upcast a tensor of dimension D to some tensor of dimension D’ > D by padding the leading dimensions with sizes of 1; e.g., upcasting a 2-d tensor [2][3] to a 4-d tensor [1][1][2][3].

template <int NewDim>
__host__ __device__ DeviceTensor< T, NewDim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::upcastInner()

Upcast a tensor of dimension D to some tensor of dimension D’ > D by padding the lowest/most varying dimensions with sizes of 1; e.g., upcasting a 2-d tensor [2][3] to a 4-d tensor [2][3][1][1].

template <int NewDim>
__host__ __device__ DeviceTensor< T, NewDim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::downcastOuter()

Downcast a tensor of dimension D to some tensor of dimension D’ < D by collapsing the leading dimensions. Asserts if there is padding on the leading dimensions.

template <int NewDim>
__host__ __device__ DeviceTensor< T, NewDim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::downcastInner()

Downcast a tensor of dimension D to some tensor of dimension D’ < D by collapsing the innermost (most varying) dimensions. Asserts if there is padding on the innermost dimensions.

template <int SubDim>
__host__ __device__ DeviceTensor< T, SubDim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::view(DataPtrType at)

Returns a tensor that is a view of the SubDim-dimensional slice of this tensor, starting at at.

template <int SubDim>
__host__ __device__ DeviceTensor< T, SubDim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::view()

Returns a tensor that is a view of the SubDim-dimensional slice of this tensor, starting where our data begins

void zero(cudaStream_t stream = 0)

Zeroes out the tensor asynchronously. Asserts if the contents in question are not contiguous.

template <typename T>
class FixedDivisor

Prototype for integer division by a fixed constant via strength reduction to mul/shift.

template <>
class FixedDivisor<int>

Specialization for calculating quotients by a fixed signed d using integer multiplication and shifts.

Public Types

typedef int Type

Public Functions

FixedDivisor(int d)
__host__ __device__ __forceinline__ int facebook::cuda::FixedDivisor::div(int n)

Calculates q = n / d.

__host__ __device__ __forceinline__ int facebook::cuda::FixedDivisor::mod(int n)

Calculates r = n % d.

__host__ __device__ __forceinline__ void facebook::cuda::FixedDivisor::divMod(int n, int * q, int * r)

Calculates q = n / d and r = n % d together.
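
A sketch of the typical use: decomposing a linear index into row/column without per-thread integer division by a runtime value (width and linearIndex are illustrative):

// Host side: bake the divisor once.
facebook::cuda::FixedDivisor<int> byWidth(width);

// Device side (the divisor is passed to the kernel by value):
int row, col;
byWidth.divMod(linearIndex, &row, &col);  // row = linearIndex / width, col = linearIndex % width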

template <>
class FixedDivisor<unsigned int>

Class for calculating quotients by a fixed unsigned d using integer multiplication, addition and shifts.

Public Types

typedef unsigned int Type

Public Functions

FixedDivisor(unsigned int d)
__host__ __device__ __forceinline__ unsigned int facebook::cuda::FixedDivisor::div(unsigned int n)

Calculates q = n / d.

__host__ __device__ __forceinline__ unsigned int facebook::cuda::FixedDivisor::mod(unsigned int n)

Calculates r = n % d.

__host__ __device__ __forceinline__ void facebook::cuda::FixedDivisor::divMod(unsigned int n, unsigned int * q, unsigned int * r)

Calculates q = n / d and r = n % d together.

template <typename T>
struct GreaterThan

Prototype:

template <typename T>
struct Comparator {
  static __device__ __forceinline__ bool compare(const T lhs, const T rhs);
};

Public Static Functions

static __device__ __forceinline__ bool facebook::cuda::GreaterThan::compare(const T lhs, const T rhs)
class KernelTimer
#include <KernelTimer.h>

Utility class for timing execution of a kernel.

Public Functions

KernelTimer()

Constructor starts the timer and adds an event into the current device stream.

~KernelTimer()

Destructor releases event resources.

float stop()

Adds a stop event, then synchronizes on the stop event to get the actual GPU-side kernel timings for any kernels launched in the current stream. Returns the number of milliseconds elapsed.
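
A usage sketch for timing work on the current stream; the kernel and its launch parameters are illustrative:

facebook::cuda::KernelTimer timer;       // records the start event
// myKernel<<<grid, block>>>(args...);   // launch work on the current stream
float ms = timer.stop();                 // records the stop event and synchronizes
// ms now holds the GPU-side elapsed time in milliseconds.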

template <typename T>
struct LessThan

Public Static Functions

static __device__ __forceinline__ bool facebook::cuda::LessThan::compare(const T lhs, const T rhs)
template <typename T>
struct Max

Public Functions

__host__ __device__ __forceinline__ T facebook::cuda::Max::operator()(T a, T b)
template <typename T>
struct Min

Public Functions

__host__ __device__ __forceinline__ T facebook::cuda::Min::operator()(T a, T b)
template <typename T>
struct NumericLimits

Numeric limits for CUDA.

template <>
struct NumericLimits<float>

Public Static Functions

__device__ static __forceinline__ float facebook::cuda::NumericLimits::minPossible()

The minimum possible valid float (i.e., not NaN)

__device__ static __forceinline__ float facebook::cuda::NumericLimits::maxPossible()

The maximum possible valid float (i.e., not NaN)

template <>
struct NumericLimits<int>

Public Static Functions

__device__ static __forceinline__ int facebook::cuda::NumericLimits::minPossible()

The minimum possible int.

__device__ static __forceinline__ int facebook::cuda::NumericLimits::maxPossible()

The maximum possible int.

template <>
struct NumericLimits<unsigned int>

Public Static Functions

__device__ static __forceinline__ unsigned int facebook::cuda::NumericLimits::minPossible()

The minimum possible unsigned int.

__device__ static __forceinline__ unsigned int facebook::cuda::NumericLimits::maxPossible()

The maximum possible unsigned int.

template <typename K, typename V>
struct Pair

A simple pair type for CUDA device usage.

Public Functions

__host__ __device__ __forceinline__ facebook::cuda::Pair::Pair()
__host__ __device__ __forceinline__ facebook::cuda::Pair::Pair(K key, V value)
__host__ __device__ __forceinline__ bool facebook::cuda::Pair::operator==(const Pair < K, V > & rhs) const
__host__ __device__ __forceinline__ bool facebook::cuda::Pair::operator!=(const Pair < K, V > & rhs) const
__host__ __device__ __forceinline__ bool facebook::cuda::Pair::operator<(const Pair < K, V > & rhs) const
__host__ __device__ __forceinline__ bool facebook::cuda::Pair::operator>(const Pair < K, V > & rhs) const

Public Members

K k
V v
template <typename T, int N>
struct RegisterIndexUtils

Utilities for addressing values held in register arrays, but with a dynamic index. For instance, if you had:

float arr[6];
int index = calculation();
arr[index + 1] = doStuffWith(arr[index]);

the dynamic indexing of arr with index requires that the compiler address arr in local memory, not registers, removing any performance benefit. Usually one should use static indexing for register arrays, for example:

#pragma unroll
for (int i = 0; i < 6; ++i) { arr[i] = foo; }

or

arr[3] = foo;

in order to allow the compiler to assign registers to arr, but there are occasions when one needs to dynamically index the array. The arrays in question should often be very small (e.g., N = 2-3) to avoid any lookup penalty.

These utilities translate the dynamic request to a static request, for array sizes N = 1 to 32.

So, to take our original case, you’d use it like:

float arr[6];
int index = calculation();
float val = doStuffWith(RegisterUtils<float, 6>::get(arr, index));
RegisterUtils<float, 6>::set(arr, index + 1, val);

which will preserve the compiler’s ability to assign arr to registers.

Public Static Functions

__device__ static __forceinline__ T facebook::cuda::RegisterIndexUtils::get(const T arr[N], int index)

Retrieve a single value from our thread-local register array.

__device__ static __forceinline__ void facebook::cuda::RegisterIndexUtils::set(T arr[N], int index, T val)

Set a single value in our thread-local register array.

template <typename T, int N>
struct RegisterUtils

Various utilities for dealing with arrays of values which are maintained in thread-local registers. All accesses are done in such a way such that the index is statically known, which preserves the compiler’s ability to allocate the values to registers, as opposed to local memory.

Public Static Functions

template <int Shift>
__device__ static __forceinline__ void facebook::cuda::RegisterUtils::shiftLeft(T arr[N])

Register shifting: move elements towards the beginning of the array (towards index 0) by Shift places: arr[i] = arr[i + Shift]. The Shift elements at the end are left unchanged.

template <int Shift>
__device__ static __forceinline__ void facebook::cuda::RegisterUtils::shiftRight(T arr[N])

Register shifting: move elements towards the end of the array (towards index N - 1) by Shift places: arr[i] = arr[i - Shift]. The Shift elements at the beginning are left unchanged.

template <int Rotate>
__device__ static __forceinline__ void facebook::cuda::RegisterUtils::rotateLeft(T arr[N])

Register rotation: move elements cyclically towards the beginning of the array with wrap around (towards 0).

template <int Rotate>
__device__ static __forceinline__ void facebook::cuda::RegisterUtils::rotateRight(T arr[N])

Register rotation: move elements cyclically towards the end of the array with wrap around (towards N - 1).
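
For instance, shiftLeft can maintain a small sliding window held entirely in registers; a sketch (newSample is illustrative):

float window[4];
// ... window[0..3] hold the four most recent samples ...
// Drop the oldest sample (window[0]) and make room at the end:
facebook::cuda::RegisterUtils<float, 4>::shiftLeft<1>(window);
window[3] = newSample;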

template <typename T>
struct RestrictPtrTraits

Public Types

typedef T* __restrict__ facebook::cuda::RestrictPtrTraits::PtrType
template <typename T>
struct Sum

Public Functions

__host__ __device__ __forceinline__ T facebook::cuda::Sum::operator()(T a, T b)
template <typename T, int N>
struct WarpRegisterLoaderUtils

Tensor <-> register load/save utilities for managing a set of registers distributed across the warp.

Public Static Functions

static __device__ void facebook::cuda::WarpRegisterLoaderUtils::load(T arr[N], const DeviceTensor<T, 1>& in, const T fill)

Convenience utility to load values from a 1-d array into registers using within-warp striding. Registers for which there is no entry in the array get fill as their value.

static __device__ void facebook::cuda::WarpRegisterLoaderUtils::save(DeviceTensor<T, 1>& out, const T arr[N], const int num)

Convenience utility to save values into a 1-d array from registers using within-warp striding. Saves up to num values from the registers.

template <typename K, typename V, int N>
struct WarpRegisterPairLoaderUtils

Tensor <-> register load/save utilities for Pair<>, for managing a set of registers distributed across the warp.

Public Static Functions

static __device__ void facebook::cuda::WarpRegisterPairLoaderUtils::load(Pair<K, V> arr[N], const DeviceTensor<K, 1>& in, const K keyFill, const V valueFill)

Like WarpRegisterLoaderUtils<T, N>::load, but for key/value pair types. Initializes the value with the source index.

static __device__ void facebook::cuda::WarpRegisterPairLoaderUtils::load(Pair<K, V> arr[N], const DeviceTensor<K, 1>& key, const DeviceTensor<V, 1>& value, const K keyFill, const V valueFill)

Like WarpRegisterLoaderUtils<T, N>::load, but for key/value pair types. The value for each key is at the corresponding index in the value array. The arrays are presumed to be the same size.

static __device__ void facebook::cuda::WarpRegisterPairLoaderUtils::save(DeviceTensor<K, 1>& key, DeviceTensor<V, 1>& value, const Pair<K, V> arr[N], const int num)

Like WarpRegisterLoaderUtils<T, N>::save, but for key/value pair types.

template <typename T, int N>
struct WarpRegisterUtils

Utilities for warp-wide held register arrays.

Public Static Functions

static __device__ T facebook::cuda::WarpRegisterUtils::broadcast(const T arr[N], int index)

Broadcast a single value from the warp-wide array arr, considering index as an index across the warp threads. In other words, returns arr[index / warpSize] from lane (index % warpSize) to all threads in the warp.
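
A small sketch of the indexing convention (with N = 4 registers per lane, the warp collectively holds 4 * 32 = 128 logical elements):

float arr[4];
// ... each lane fills its 4 registers ...
// Logical element 37 lives in register arr[1] of lane 5 (37 = 1 * 32 + 5);
// every lane in the warp receives that value.
float v = facebook::cuda::WarpRegisterUtils<float, 4>::broadcast(arr, 37);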

namespace detail

Implementation details; includes DeviceSubTensor, the type of a subspace of a tensor.

Functions

template <typename T, int N>
__host__ __device__ void facebook::cuda::detail::copy(T to[N], T from[N])
__host__ __device__ __forceinline__ unsigned int facebook::cuda::detail::mulHi(unsigned int x, unsigned int y)

Host and device implementation of a full 32-bit a * b multiply into 64 bits; returns the high 32 bits.

__device__ __forceinline__ constexpr int facebook::cuda::detail::max(int i, int j)
__device__ __forceinline__ constexpr int facebook::cuda::detail::max(int i, int j, int k)
__device__ __forceinline__ Complex facebook::cuda::detail::ldg(const Complex* p)
__device__ __forceinline__ void facebook::cuda::detail::ldg(Complex& c1, Complex& c2, const Complex* p)
template <int N, typename T>
__device__ T facebook::cuda::detail::getMulti(const T arr[N], int index, T val)
template <int N, typename T>
__device__ void facebook::cuda::detail::scatterHalfWarp(T arr[N], int index, T val)
template <typename T, typename Comparator, int M, int N>
__device__ void facebook::cuda::detail::warpMergeMN(const T a[M], const T b[N], T dst[M+N])
template <typename T, typename Comparator, int N>
__device__ void facebook::cuda::detail::warpSortRegisters(T a[N], T dst[N])
template <typename T, typename Comparator, int N>
__device__ void facebook::cuda::detail::warpSortRegisters(const DeviceTensor<T, 1>& key, DeviceTensor<T, 1>& sortedKey)
template <typename T, typename IndexType, typename Comparator, int N>
__device__ void facebook::cuda::detail::warpSortRegisters(const DeviceTensor<T, 1>& key, DeviceTensor<T, 1>& sortedKey, DeviceTensor<IndexType, 1>& sortedKeyIndices)
template <typename K, typename V, typename Comparator, int N>
__device__ void facebook::cuda::detail::warpSortRegisters(const DeviceTensor<K, 1>& key, const DeviceTensor<V, 1>& value, DeviceTensor<K, 1>& sortedKey, DeviceTensor<V, 1>& sortedValue)
template <int N, typename T>
__device__ __forceinline__ void facebook::cuda::detail::setArray(T arr[N], T val)

Initialize an array to a value.

__device__ __forceinline__ void facebook::cuda::detail::incrementArray(int val, int counts[16])

In order to force register usage of the bucket count array, we have to unroll the increment selection. Otherwise, local memory is used for counts[] which severely degrades performance.

template <typename T, int N, int ILP>
__device__ __forceinline__ void facebook::cuda::detail::countNybbles(int counts[N], unsigned desired, unsigned desiredMask, int nybbleCheckPos, const DeviceTensor<T, 1>& data)

We use a most significant to least significant radix selection on the float values, which requires at most sizeof(float) * 2 scans through the array, one for each nybble.

In order to use radix selection, we use the property that for positive floating-point values f1 and f2:

f1 > f2 <=> *(int*)&f1 > *(int*)&f2.

Something similar is true for negative floating point values f1 and f2 after zeroing the leading sign bit, except that the order is reversed:

f1 > f2 <=> (*(int*)f1 & 0x7fffffff) < (*(int*)f2 & 0x7fffffff).

This is true even for +/-inf and for denormalized floats. Negative zero is a special case. Selection by radix will give us that -0.0f < +0.0f, which is not true for IEEE fp comparison. We handle this special case when we return the answer seen, not in comparing values here.
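
The positive-value property is easy to check on the host; a small plain C++ sketch (not part of the library):

#include <cstdint>
#include <cstring>

// Reinterpret a float's bits as a signed 32-bit integer.
static int32_t floatBits(float f) {
  int32_t i;
  std::memcpy(&i, &f, sizeof(i));
  return i;
}

// For positive floats (including +0.0f, denormals and +inf), integer
// comparison of the bit patterns agrees with floating-point comparison.
static bool orderingMatches(float f1, float f2) {
  return (f1 > f2) == (floatBits(f1) > floatBits(f2));
}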

+NaNs will lead all positive values, and -NaNs will be minimal values (non-canonical NaNs, if they exist, will be sorted according to this).

The focal point of the radix selection algorithm is the use of countNybbles and the CHECK_NYBBLE macro.

The idea is that, starting out, we don’t know where the Kth highest element lives, so we have to consider every float in the input. We look at the most significant nybble, and each thread counts into 16 buckets the number of floats in its subset of data with that leading nybble. This is done by countNybbles, which takes as arguments desired and desiredMask. It only looks at values v such that (v & desiredMask) == desired. By default, both are 0, so it will look at every float. nybbleCheckPos is the current nybble that is being bucketed. It starts at 28, meaning we’re first looking at the most significant nybble.

countNybbles will add a count of nybble distribution to 16 buckets. One iteration through, by counting the distribution of the leading nybble in each float, we figure out what leading nybble the Kth highest float must have. As an example, let’s say that K is 10. By counting the distribution of leading nybbles in the inputs, say we get:

0x2: 1 0x5: 2 0x6: 2 0x7: 11

In this case, none of the floats are negative (otherwise, they would have leading nybble 0x8 -> 0xf). Since we’re looking for the 10th highest float, that cannot have leading nybble 0x2, 0x5 or 0x6 since those counts are less than 10. We walk through the buckets in order, and we warp reduce the counts across all threads to one count when it comes time to look in a particular bucket.

Thus, the 10th highest float must have leading nybble 0x7. The problem then becomes, for the next iteration, finding the (10 - (1+2+2)) = 5th highest float with leading nybble 0x7. Since the count for the nybble 0x7 is not 1, we don’t know the actual answer yet, and we have to continue.

Next iteration through, we no longer have to count every float, just those with leading nybble 0x7 (i.e., floats interpreted as a bit pattern v such that (v & desiredMask) == desired). Even though we have to physically scan the entire input, we are only counting a subset of it.

So:

  • desired changes from 0 -> 0x70000000, and
  • desiredMask changes from 0 -> 0xf0000000.

We continue, and count up the floats with leading nybble 0x7, getting counts:

0x(7)1: 1 0x(7)9: 2 0x(7)b: 1 0x(7)c: 1 0x(7)f: 6

This means that the count of all floats with the prefix 0x7fyyyyyy is 6. We’re only bucketing counts by the second nybble now.

Scanning through, the 5th highest float with prefix 0x7yyyyyyy must have prefix 0x7cyyyyyy, since from lowest to highest above, we reach 5 in bucket 0xc.

Thus, the 10th highest float in the entire set is the unique float with prefix 0x7cyyyyyy. This is unique because the count for this bucket is 1.

If we get through all nybbles to the least significant nybble and still have a count > 1, then that means that the Kth highest element is not unique. For example, in the set 2 2 3 3 3, the 2nd highest element is 3, which is duplicated 3 times.

Since the MSN contains the sign bit, we have to first look at buckets 0-7 to see if the Kth highest float is positive. If so, then we continue looking only at positive floats. If not, then we continue looking only at negative floats, but in reverse order.

Eventually we find a unique Kth highest element if the count is 1 in our bucket, or we end at the LSN with a duplicate count, in which case the Kth highest element is not unique.

Performs a histogram count of the nybbles that occur at the bit position nybbleCheckPos, but only for those ints that match (x & desiredMask) == desired.

In other words, if bits [31, nybbleCheckPos + 4] match those in desired, then return the contents of bits [nybbleCheckPos + 3, nybbleCheckPos].

__device__ __forceinline__ Pair<float, int> facebook::cuda::detail::findAnswer(const DeviceTensor<float, 1>& data, unsigned desired, unsigned desiredMask, int dupCount)

A warp-coherent implementation that finds a value in the data whose float bits, treated as an unsigned int v, match the bit pattern (v & desiredMask) == desired.

If the answer found is -0.0f, because -0.0f == +0.0f, it is possible that there are multiple +0.0f results that we’ve ignored in comparing by radix, since radix-wise +0.0f > -0.0f which is not true for IEEE fp.

Thus, if the answer found is -0.0f, then we have to include the count of all +0.0fs present in the duplicate count, in order to treat the comparison the same way that normal sorting mechanisms will treat it.

If the found result is not -0.0f, returns the value found and dupCount as the pair’s value.

If the found result is -0.0f, returns the value found and dupCount plus the number of +0.0f in the data as the pair’s value.

The value need not be unique, but the warp as a whole will return the highest value seen across the warp.

__device__ Pair<float, int> facebook::cuda::detail::warpFindTopKthElementSmall32(const DeviceTensor<float, 1>& data, int k)

Finds the Kth highest floating point value in a linear array [arr, end) without modifying the data and without temporary storage except for registers. K starts at 1. All threads in the warp will return the value. Handles all floats except NaNs.

This function minimizes warp divergence.

Implementation for small arrays, where (end - start) <= warpSize.

__device__ Pair<float, int> facebook::cuda::detail::warpFindTopKthElementLarge(const DeviceTensor<float, 1>& data, int k)

Finds the Kth highest floating point value in a linear array [arr, end) without modifying the data and without temporary storage except for registers.

  • K starts at 1.
  • All threads in the warp will return the value.
  • Handles all floats except NaNs.
  • Negative zero is specialized by findAnswer.
  • This function minimizes warp divergence.

Implementation for large arrays such that there are more elements than warp threads.

__device__ __forceinline__ int facebook::cuda::detail::laneWillWrite(float val, float topK, int & topKToWrite, int & next)

Returns the index into the array that this lane will write. If this lane is not responsible for writing a value, this will return -1.

__device__ float facebook::cuda::detail::warpFindTopKElementsIndexOrder(const DeviceTensor<float, 1>& data, DeviceTensor<float, 1>& out, int k)

For a given warp, find and write out the top-k highest floating point values in [start, end) to [out, out + k). The list written out occurs in the original source order (by original index). Returns the k-th highest element seen. Handles all floats except NaNs. Implementation for large arrays such that there are more elements than warp threads.

template <typename IndexType>
__device__ float facebook::cuda::detail::warpFindTopKElementsIndexOrder(const DeviceTensor<float, 1>& data, DeviceTensor<float, 1>& out, DeviceTensor<IndexType, 1>& indices, int k)

Version of the overload above, except it also writes out the K indices chosen from data into indices.

__device__ void facebook::cuda::detail::warpFindTopKElementsValueOrderSmall(const DeviceTensor<float, 1>& data, DeviceTensor<float, 1>& out, int k)

For a given warp, find and write out the top-k highest floating point values in [start, end) to [out, out + k). The list written out is ordered. Handles all floats except NaNs.

template <typename IndexType>
__device__ void facebook::cuda::detail::warpFindTopKElementsValueOrderSmall(const DeviceTensor<float, 1>& data, DeviceTensor<float, 1>& out, DeviceTensor<IndexType, 1>& indices, int k)

Version of warpFindTopKElementsValueOrderSmall that also writes out the indices in data of the K elements chosen into indices.

__device__ void facebook::cuda::detail::warpFindTopKElementsValueOrderLarge(const DeviceTensor<float, 1>& data, DeviceTensor<float, 1>& out, int k)

For a given warp, find and write out the top-k highest floating point values in [start, end) to [out, out + k). The list written out is ordered. Handles all floats except NaNs. Implementation for large arrays such that there are more elements than warp threads.

template <typename IndexType>
__device__ void facebook::cuda::detail::warpFindTopKElementsValueOrderLarge(const DeviceTensor<float, 1>& data, DeviceTensor<float, 1>& out, DeviceTensor<IndexType, 1>& indices, int k)

Version of warpFindTopKElementsValueOrderLarge that also writes out the indices in data of the K elements chosen into indices.

template <typename T, typename Comparator>
__device__ __forceinline__ T facebook::cuda::detail::shflSwap(const T x, int mask, int dir)

template <typename TensorType, int SubDim, template< typename U > class PtrTraits>
class DeviceSubTensor

A SubDim-rank slice of a parent DeviceTensor.

Public Functions

__host__ __device__ __forceinline__ DeviceSubTensor<TensorType, SubDim - 1, PtrTraits> facebook::cuda::detail::DeviceSubTensor::operator[](typename TensorType::IndexType index)

Returns a view of the data located at our offset (the dimension SubDim - 1 tensor).

__host__ __device__ __forceinline__ const DeviceSubTensor<TensorType, SubDim - 1, PtrTraits> facebook::cuda::detail::DeviceSubTensor::operator[](typename TensorType::IndexType index) const

Returns a view of the data located at our offset (the dimension SubDim - 1 tensor) (const).

__host__ __device__ TensorType::DataType* facebook::cuda::detail::DeviceSubTensor::operator&()
__host__ __device__ const TensorType::DataType* facebook::cuda::detail::DeviceSubTensor::operator&() const
__host__ __device__ __forceinline__ TensorType::DataPtrType facebook::cuda::detail::DeviceSubTensor::data()

Returns a raw accessor to our slice.

__host__ __device__ __forceinline__ const TensorType::DataPtrType facebook::cuda::detail::DeviceSubTensor::data() const

Returns a raw accessor to our slice (const).

template <typename T>
__host__ __device__ T& facebook::cuda::detail::DeviceSubTensor::as()

Cast to a different datatype.

template <typename T>
__host__ __device__ const T& facebook::cuda::detail::DeviceSubTensor::as() const

Cast to a different datatype (const).

template <typename T>
__host__ __device__ __forceinline__ PtrTraits<T>::PtrType facebook::cuda::detail::DeviceSubTensor::dataAs()

Cast to a different datatype.

template <typename T>
__host__ __device__ __forceinline__ PtrTraits<const T>::PtrType facebook::cuda::detail::DeviceSubTensor::dataAs() const

Cast to a different datatype (const)

__device__ __forceinline__ TensorType::DataType facebook::cuda::detail::DeviceSubTensor::ldg() const

Use the texture cache for reads.

template <typename T>
__device__ __forceinline__ T facebook::cuda::detail::DeviceSubTensor::ldgAs() const

Use the texture cache for reads; cast as a particular type.

DeviceTensor<typename TensorType::DataType, SubDim, typename TensorType::IndexType, PtrTraits> view()

Returns a tensor that is a view of the SubDim-dimensional slice of this tensor, starting where our data begins

Friends

friend facebook::cuda::detail::DeviceTensor< typename TensorType::DataType, TensorType::NumDim, typename TensorType::IndexType, PtrTraits >

Our parent tensor can create us.

template <typename TensorType, template< typename U > class PtrTraits>
class DeviceSubTensor<TensorType, 0, PtrTraits>

Specialization for a view of a single value (0-dimensional)

Public Functions

__host__ __device__ DeviceSubTensor<TensorType, 0, PtrTraits> facebook::cuda::detail::DeviceSubTensor::operator=(typename TensorType::DataType val)
__host__ __device__ facebook::cuda::detail::DeviceSubTensor::operator typename TensorType::DataType &()
__host__ __device__ facebook::cuda::detail::DeviceSubTensor::operator const typename TensorType::DataType &() const
__host__ __device__ TensorType::DataType* facebook::cuda::detail::DeviceSubTensor::operator&()
__host__ __device__ const TensorType::DataType* facebook::cuda::detail::DeviceSubTensor::operator&() const
__host__ __device__ __forceinline__ TensorType::DataPtrType facebook::cuda::detail::DeviceSubTensor::data()

Returns a raw accessor to our slice.

__host__ __device__ __forceinline__ const TensorType::DataPtrType facebook::cuda::detail::DeviceSubTensor::data() const

Returns a raw accessor to our slice (const).

template <typename T>
__host__ __device__ T& facebook::cuda::detail::DeviceSubTensor::as()

Cast to a different datatype.

template <typename T>
__host__ __device__ const T& facebook::cuda::detail::DeviceSubTensor::as() const

Cast to a different datatype (const).

template <typename T>
__host__ __device__ __forceinline__ PtrTraits<T>::PtrType facebook::cuda::detail::DeviceSubTensor::dataAs()

Cast to a different datatype.

template <typename T>
__host__ __device__ __forceinline__ PtrTraits<const T>::PtrType facebook::cuda::detail::DeviceSubTensor::dataAs() const

Cast to a different datatype (const)

__device__ __forceinline__ TensorType::DataType facebook::cuda::detail::DeviceSubTensor::ldg() const

Use the texture cache for reads.

template <typename T>
__device__ __forceinline__ T facebook::cuda::detail::DeviceSubTensor::ldgAs() const

Use the texture cache for reads; cast as a particular type.

Friends

friend facebook::cuda::detail::DeviceTensor< typename TensorType::DataType, 1, typename TensorType::IndexType, PtrTraits >

Our parent tensor can create us.

struct HalfFtor

Public Functions

HalfFtor()
void operator()(int &n)
template <typename T, typename Comparator, int N>
struct Merge

Public Static Functions

static __device__ void facebook::cuda::detail::Merge::splitAndMerge(const T in[N], T out[N])
template <typename T, typename Comparator>
struct Merge<T, Comparator, 1>

Public Static Functions

static __device__ void facebook::cuda::detail::Merge::splitAndMerge(const T in[1], T out[1])