API Reference¶
All classes and functions exist in the facebook::cuda
namespace.
-
namespace
facebook::
cuda
¶ Functions
- template <typename T>
-
__host__ __device__ __forceinline__ constexpr T facebook::cuda::ceil(T a, T b)
Computes ceil(a / b)
- template <typename T>
-
__host__ __device__ __forceinline__ constexpr T facebook::cuda::floor(T a, T b)
Computes floor(a / b)
-
__device__ __forceinline__ int facebook::cuda::getWarpId()
Returns the current thread’s warp ID
-
__device__ __forceinline__ int facebook::cuda::getThreadsInBlock()
Returns the number of threads in the current block (linearized).
-
__device__ __forceinline__ int facebook::cuda::getWarpsInBlock()
Returns the number of warps in the current block (linearized, rounded to whole warps).
- template <typename T>
-
__device__ __forceinline__ T facebook::cuda::ptrMin(T a, T b)
Pointer comparison using the PTX intrinsic; min() doesn’t work for T*.
- template <typename T>
-
__device__ __forceinline__ T facebook::cuda::ptrMax(T a, T b)
Pointer comparison using the PTX intrinsic; max() doesn’t work for T*
-
__device__ __forceinline__ int facebook::cuda::getLaneId()
Return the current thread’s lane in the warp
-
__device__ __forceinline__ unsigned facebook::cuda::getLaneMaskLt()
Return a bitmask with bits set in positions less than the current thread’s lane number in the warp.
-
__device__ __forceinline__ unsigned facebook::cuda::getLaneMaskLe()
Return a bitmask with bits set in positions less than or equal to the current thread’s lane number in the warp.
-
__device__ __forceinline__ unsigned facebook::cuda::getLaneMaskGt()
Return a bitmask with bits set in positions greater than the current thread’s lane number in the warp.
-
__device__ __forceinline__ unsigned facebook::cuda::getLaneMaskGe()
Return a bitmask with bits set in positions greater than or equal to the current thread’s lane number in the warp.
-
__device__ __forceinline__ int facebook::cuda::getBit(int val, int pos)
Extract a single bit at
pos
fromval
-
__device__ __forceinline__ unsigned facebook::cuda::setBit(unsigned val, unsigned toInsert, int pos)
Insert a single bit into
val
at positionpos
-
__device__ __forceinline__ unsigned facebook::cuda::getBitfield(unsigned val, int pos, int len)
Extract a bit field of length
len
atpos
fromval
-
__device__ __forceinline__ unsigned long facebook::cuda::getBitfield(unsigned long val, int pos, int len)
Extract a bit field of length
len
atpos
fromval
-
__device__ __forceinline__ unsigned facebook::cuda::setBitfield(unsigned val, unsigned toInsert, int pos, int len)
Insert
len
bits oftoInsert
intoval
starting at positionpos
-
__device__ __forceinline__ unsigned long facebook::cuda::setBitfield(unsigned long val, unsigned toInsert, int pos, int len)
Insert
len
bits oftoInsert
intoval
starting at positionpos
-
__device__ __forceinline__ constexpr int facebook::cuda::getMSB(int val)
Returns the index of the most significant 1 bit in
val
.
- template <typename T, int Dim, typename IndexT, template< typename U > class PtrTraits>
-
std::ostream &
operator<<
(std::ostream &os, const DeviceTensor<T, Dim, IndexT, PtrTraits> &t)¶ Streaming operator for logging.
-
__host__ __device__ float facebook::cuda::createRoundingFactor(float max, int n)
Constructs a rounding factor used to truncate elements in a sum such that the sum of the truncated elements is the same no matter what the order of the sum is.
Floating point summation is not associative; using this factor makes it associative, so a parallel sum can be performed in any order (presumably using atomics).
Follows Algorithm 5: Reproducible Sequential Sum in ‘Fast Reproducible Floating-Point Summation’ by Demmel and Nguyen http://www.eecs.berkeley.edu/~hdnguyen/public/papers/ARITH21_Fast_Sum.pdf
For summing x_i, i = 1 to n:
- Parameters
max
: The maximum seen floating point value abs(x_i)n
: The number of elements for the sum, or an upper bound estimate
-
__host__ __device__ float facebook::cuda::truncateWithRoundingFactor(float roundingFactor, float x)
Given the rounding factor in
createRoundingFactor
calculated using max(|x_i|), truncatex
to a value that can be used for a deterministic, reproducible parallel sum of all x_i.
-
__device__ bool facebook::cuda::inBounds(int x, int padL, const DeviceTensor < float, 2 > & t)
-
__device__ bool facebook::cuda::inBounds(int y, int x, int padU, int padL, const DeviceTensor < float, 3 > & t)
-
__device__ __forceinline__ bool facebook::cuda::inBounds(int y, int x, int padU, int padL, const DeviceTensor < float, 4 > & t)
-
__device__ bool facebook::cuda::inBounds(int x, const DeviceTensor < float, 2 > & t)
-
__device__ bool facebook::cuda::inBounds(int y, int x, const DeviceTensor < float, 3 > & t)
-
__device__ bool facebook::cuda::inBounds(int y, int x, const DeviceTensor < float, 4 > & t)
- template <int Dim, bool ConjugateTransposeA, bool ConjugateTransposeB, bool Accumulate>
-
void
transposeMM
(DeviceTensor<float, Dim> &A, DeviceTensor<float, Dim> &B, DeviceTensor<float, Dim> &C, float invNorm, cudaStream_t s)¶
- template <typename T>
-
__device__ __forceinline__ T facebook::cuda::shfl(const T val, int srcLane, int width = WARP_SIZE)
- template <typename T>
-
__device__ __forceinline__ T facebook::cuda::shfl_up(const T val, int delta, int width = WARP_SIZE)
- template <typename T>
-
__device__ __forceinline__ T facebook::cuda::shfl_down(const T val, int delta, int width = WARP_SIZE)
- template <typename T>
-
__device__ __forceinline__ T facebook::cuda::shfl_xor(const T val, int laneMask, int width = WARP_SIZE)
- template <typename K, typename V>
-
__device__ __forceinline__ Pair<K, V> facebook::cuda::shfl(const Pair < K, V > & p, int srcLane, int width = WARP_SIZE)
- template <typename K, typename V>
-
__device__ __forceinline__ Pair<K, V> facebook::cuda::shfl_up(const Pair < K, V > & p, int delta, int width = WARP_SIZE)
- template <typename K, typename V>
-
__device__ __forceinline__ Pair<K, V> facebook::cuda::shfl_down(const Pair < K, V > & p, int delta, int width = WARP_SIZE)
- template <typename K, typename V>
-
__device__ __forceinline__ Pair<K, V> facebook::cuda::shfl_xor(const Pair < K, V > & p, int laneMask, int width = WARP_SIZE)
- template <int type>
-
__global__ void facebook::cuda::sleepKernel(double * cycles, int64_t waitCycles)
-
void
cudaSleep
(int64_t cycles, int type)¶
- template <typename T, typename Comparator>
-
__device__ bool facebook::cuda::warpSort(const DeviceTensor < T, 1 > & key, DeviceTensor < T, 1 > & sortedKey)
- template <typename T, typename IndexType, typename Comparator>
-
__device__ bool facebook::cuda::warpSort(const DeviceTensor < T, 1 > & key, DeviceTensor < T, 1 > & sortedKey, DeviceTensor < IndexType, 1 > & sortedKeyIndices)
- template <typename K, typename V, typename Comparator>
-
__device__ bool facebook::cuda::warpSort(const DeviceTensor < K, 1 > & key, const DeviceTensor < V, 1 > & value, DeviceTensor < K, 1 > & sortedKey, DeviceTensor < V, 1 > & sortedValue)
-
__device__ Pair<float, int> facebook::cuda::warpFindTopKthElement(const DeviceTensor < float, 1 > & data, int k)
Finds the Kth highest floating point value in a linear array [arr, end) without modifying the data and without temporary storage except for registers.
- K starts at 1.
- All threads in the warp will return the value.
- Handles all floats except NaNs.
- This function minimizes warp divergence.
Returns the number of times the top-Kth element uniquely occurs along with its value.
-
__device__ void facebook::cuda::warpFindTopKElementsIndexOrder(const DeviceTensor < float, 1 > & data, DeviceTensor < float, 1 > & out, int k)
For a given warp, find and write out the top-k highest floating point values in [start, end) to [out, out + k). The list written out is ordered based on original index order. Handles all floats except NaNs.
- template <typename IndexType>
-
__device__ void facebook::cuda::warpFindTopKElementsIndexOrder(const DeviceTensor < float, 1 > & data, DeviceTensor < float, 1 > & out, DeviceTensor < IndexType, 1 > & indices, int k)
Version of warpFindTopKElementsOrdered which also writes out the indices of the found top elements from
data
. The list written out is ordered based on original index order. Handles all floats except NaNs. Supports writing out float or integer indices.
-
__device__ void facebook::cuda::warpFindTopKElementsValueOrder(const DeviceTensor < float, 1 > & data, DeviceTensor < float, 1 > & out, int k)
For a given warp, find and write out the top-k highest floating point values in [start, end) to [out, out + k). The list written out is ordered based on float value. Handles all floats except NaNs.
- template <typename IndexType>
-
__device__ void facebook::cuda::warpFindTopKElementsValueOrder(const DeviceTensor < float, 1 > & data, DeviceTensor < float, 1 > & out, DeviceTensor < IndexType, 1 > & indices, int k)
Version of warpFindTopKElementsOrdered which also writes out the indices of the found top elements from
data
. The list written out is ordered based on float value. Handles all floats except NaNs. Supports writing out float or integer indices.
- template <typename T, typename Comparator>
-
__device__ T facebook::cuda::warpBitonicSort(T val)
Defines a bitonic sort network to exchange ‘V’ according to
SWAP()
‘s compare and exchange mechanism across the warp, ordered according to the comparatorcomp
. In other words, ifcomp
isGreaterThan<T>
, then lane 0 will contain the highestval
presented across the warpSee also http://on-demand.gputechconf.com/gtc/2013/presentations/S3174-Kepler-Shuffle-Tips-Tricks.pdf
- template <typename T, typename Op>
-
__device__ __forceinline__ T facebook::cuda::warpReduce(T val, Op op)
Reduce a value across a warp by applying the commutative function
Op
. All threads in the warp receive the reduced value. Assumes that all threads in the warp are participating in the reduction.
- template <typename T>
-
__device__ __forceinline__ T facebook::cuda::warpReduceSum(T val)
Sums a register value across all warp threads.
- template <typename T>
-
__device__ __forceinline__ T facebook::cuda::warpReduceMax(T val)
Finds the maximum
val
across the warp.
- template <typename T>
-
__device__ __forceinline__ T facebook::cuda::warpReduceMin(T val)
Finds the minimum
val
across the warp.
- template <typename T>
-
__device__ __forceinline__ bool facebook::cuda::warpHasCollision(T val)
Determine if two warp threads have the same value (a collision).
- template <typename T>
-
__device__ __forceinline__ unsigned int facebook::cuda::warpCollisionMask(T val)
Determine if two warp threads have the same value (a collision), and returns a bitmask of the lanes that are known to collide with other lanes. Not all lanes that are mutually colliding return a bit; all lanes with a
1
bit are guaranteed to collide with a lane with a0
bit, so the mask can be used to serialize execution for lanes that collide with others. (mask | (mask >> 1)) will yield all mutually colliding lanes.
-
struct
Complex
¶ cuComplex
wrapper.Public Functions
-
__host__ __device__ __forceinline__ facebook::cuda::Complex::Complex()
-
__host__ __device__ __forceinline__ facebook::cuda::Complex::Complex(float re)
-
__host__ __device__ __forceinline__ facebook::cuda::Complex::Complex(float re, float im)
-
__host__ __device__ __forceinline__ facebook::cuda::Complex::Complex(const Complex & c)
-
__host__ __device__ __forceinline__ facebook::cuda::Complex::Complex(const cuComplex & c)
-
__host__ __device__ __forceinline__ Complex& facebook::cuda::Complex::operator=(const Complex & c)
-
__host__ __device__ __forceinline__ bool facebook::cuda::Complex::operator==(const Complex & c) const
-
__host__ __device__ __forceinline__ bool facebook::cuda::Complex::operator!=(const Complex & c) const
-
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::operator-() const
-
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::operator-(const Complex & c) const
-
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::operator+(const Complex & c) const
-
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::operator*(const Complex & c) const
-
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::operator/(const Complex & c) const
-
__host__ __device__ __forceinline__ Complex& facebook::cuda::Complex::operator+=(const Complex & c)
-
__host__ __device__ __forceinline__ Complex& facebook::cuda::Complex::operator-=(const Complex & c)
-
__host__ __device__ __forceinline__ Complex& facebook::cuda::Complex::operator*=(const Complex & c)
-
__host__ __device__ __forceinline__ Complex& facebook::cuda::Complex::operator/=(const Complex & c)
-
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::transpose() const
-
__host__ __device__ __forceinline__ Complex facebook::cuda::Complex::conjugate() const
-
__host__ __device__ __forceinline__ void facebook::cuda::Complex::cexp(float angle)
-
__host__ __device__ __forceinline__ float& facebook::cuda::Complex::re()
-
__host__ __device__ __forceinline__ float& facebook::cuda::Complex::im()
-
__host__ __device__ __forceinline__ const float& facebook::cuda::Complex::re() const
-
__host__ __device__ __forceinline__ const float& facebook::cuda::Complex::im() const
-
__host__ __device__ __forceinline__ facebook::cuda::Complex::operator float2() const
-
- template <typename T, int Dim, typename IndexT = int, template< typename U > class PtrTraits = DefaultPtrTraits>
-
class
DeviceTensor
¶ Our tensor type.
Templated multi-dimensional array that supports strided access of elements. Main access is through
operator[]
; e.g.,tensor[x][y][z]
.T
is the contained type (e.g.,float
)Dim
is the tensor rankIndexT
is the integer type used for size/stride arrays, and for- all indexing math. Default is
int
, but for large tensors,long
- can be used instead.
PtrTraits
are traits applied to our data pointer (T*). By default,- this is just T*, but RestrictPtrTraits can be used to apply T*
- restrict for alias-free analysis.
Public Types
-
typedef T
DataType
¶
-
typedef IndexT
IndexType
¶
-
typedef PtrTraits<T>::PtrType
DataPtrType
¶
-
typedef DeviceTensor<T, Dim, IndexT, PtrTraits>
TensorType
¶
Public Functions
-
__host__ __device__ facebook::cuda::DeviceTensor::DeviceTensor()
Default constructor.
-
__host__ __device__ facebook::cuda::DeviceTensor::DeviceTensor(DataPtrType data, const IndexT sizes[Dim])
Constructor that calculates strides with no padding.
-
__host__ __device__ facebook::cuda::DeviceTensor::DeviceTensor(DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
Constructor that takes arbitrary size/stride arrays.
- template <int OtherDim>
-
__host__ __device__ bool facebook::cuda::DeviceTensor::isSameSize(const DeviceTensor < T, OtherDim, IndexT, PtrTraits > & rhs) const
Returns true if the two tensors are of the same dimensionality and size.
- template <int OtherDim>
-
__host__ __device__ bool facebook::cuda::DeviceTensor::isSameSizeAndStride(const DeviceTensor < T, OtherDim, IndexT, PtrTraits > & rhs) const
Returns true if the two tensors are of the same dimensionality, size and stride.
-
std::string
toString
() const¶ Produces a string containing our size and stride array contents; for debugging purposes
- template <typename U>
-
__host__ __device__ DeviceTensor< U, Dim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::cast()
Cast to a tensor of a different type of the same size and stride.
- template <typename U>
-
__host__ __device__ const DeviceTensor< U, Dim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::cast() const
-
__host__ __device__ __forceinline__ DataPtrType facebook::cuda::DeviceTensor::data()
Returns a raw pointer to the start of our data.
-
__host__ __device__ __forceinline__ const DataPtrType facebook::cuda::DeviceTensor::data() const
Returns a raw pointer to the start of our data (const).
- template <typename U>
-
__host__ __device__ __forceinline__ PtrTraits<U>::PtrType facebook::cuda::DeviceTensor::dataAs()
Cast to a different datatype.
- template <typename U>
-
__host__ __device__ __forceinline__ const PtrTraits<const U>::PtrType facebook::cuda::DeviceTensor::dataAs() const
Cast to a different datatype.
-
__host__ __device__ __forceinline__ detail::DeviceSubTensor< DeviceTensor< T, Dim, IndexT, PtrTraits >, Dim-1, PtrTraits > facebook::cuda::DeviceTensor::operator[](IndexT index)
Returns a read/write view of a portion of our tensor.
-
__host__ __device__ __forceinline__ const detail::DeviceSubTensor< DeviceTensor< T, Dim, IndexT, PtrTraits >, Dim-1, PtrTraits > facebook::cuda::DeviceTensor::operator[](IndexT index) const
Returns a read/write view of a portion of our tensor (const).
-
__host__ __device__ __forceinline__ int facebook::cuda::DeviceTensor::getSize(int i) const
Returns the size of a given dimension,
[0, Dim - 1]
. No bounds checking.
-
__host__ __device__ __forceinline__ int facebook::cuda::DeviceTensor::getStride(int i) const
Returns the stride of a given dimension,
[0, Dim - 1]
. No bounds checking.
-
__host__ __device__ long facebook::cuda::DeviceTensor::numElements() const
Returns the total number of elements contained within our data (product of
getSize(i)
)
-
__host__ __device__ __forceinline__ const IndexT* facebook::cuda::DeviceTensor::sizes() const
Returns the size array.
-
__host__ __device__ __forceinline__ const IndexT* facebook::cuda::DeviceTensor::strides() const
Returns the stride array.
-
void
permuteDims
(const std::vector<int> &perm)¶ Limited form of resize by permutation, make sure your permutation array is legit. Only works for contiguous tensors.
-
__host__ __device__ bool facebook::cuda::DeviceTensor::isContiguous() const
Returns true if there is no padding within the tensor and no re-ordering of the dimensions.
(stride(i) == size(i + 1) * stride(i + 1))
-
__host__ __device__ bool facebook::cuda::DeviceTensor::isConsistentlySized(int i) const
Returns whether a given dimension has only increasing stride from the previous dimension. A tensor that was permuted by exchanging size and stride only will fail this check. If
i == 0
just checksize > 0
. Returnsfalse
ifstride
is<= 0
.
-
__host__ __device__ bool facebook::cuda::DeviceTensor::isConsistentlySized() const
-
__host__ __device__ bool facebook::cuda::DeviceTensor::isContiguousDim(int i) const
Returns true if the given dimension index has no padding.
-
__host__ __device__ DeviceTensor< T, Dim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::transpose(int dim1, int dim2) const
Returns a tensor of the same dimension after transposing the two dimensions given. Does not actually move elements; transposition is made by permuting the size/stride arrays.
- template <int NewDim>
-
__host__ __device__ DeviceTensor< T, NewDim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::upcastOuter()
Upcast a tensor of dimension
D
to some tensor of dimension D’ > D by padding the leading dimensions by 1 e.g., upcasting a 2-d tensor[2][3]
to a 4-d tensor[1][1][2][3]
- template <int NewDim>
-
__host__ __device__ DeviceTensor< T, NewDim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::upcastInner()
Upcast a tensor of dimension
D
to some tensor of dimension D’ > D by padding the lowest/most varying dimensions by 1 e.g., upcasting a 2-d tensor[2][3]
to a 4-d tensor[2][3][1][1]
- template <int NewDim>
-
__host__ __device__ DeviceTensor< T, NewDim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::downcastOuter()
Downcast a tensor of dimension
D
to some tensor of dimension D’ < D by collapsing the leading dimensions. asserts if there is padding on the leading dimensions.
- template <int NewDim>
-
__host__ __device__ DeviceTensor< T, NewDim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::downcastInner()
Downcast a tensor of dimension
D
to some tensor of dimension D’ < D by collapsing the leading dimensions. asserts if there is padding on the leading dimensions.
- template <int SubDim>
-
__host__ __device__ DeviceTensor< T, SubDim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::view(DataPtrType at)
Returns a tensor that is a view of the
SubDim
-dimensional slice of this tensor, starting atat
.
- template <int SubDim>
-
__host__ __device__ DeviceTensor< T, SubDim, IndexT, PtrTraits > facebook::cuda::DeviceTensor::view()
Returns a tensor that is a view of the
SubDim
-dimensional slice of this tensor, starting where our data begins
-
void
zero
(cudaStream_t stream = 0)¶ Zeroes out the tensor asynchronously. Asserts if the contents in question are not contiguous.
- template <typename T>
-
class
FixedDivisor
¶ Prototype for integer division by a fixed constant via strength reduction to mul/shift.
- template <>
-
template<>
classFixedDivisor
<int>¶ Specialization for calculating quotients by a fixed signed
d
using integer multiplication and shifts.Public Types
-
typedef int
Type
¶
Public Functions
-
FixedDivisor
(int d)¶
-
__host__ __device__ __forceinline__ int facebook::cuda::FixedDivisor::div(int n)
Calculates
q = n / d
.
-
__host__ __device__ __forceinline__ int facebook::cuda::FixedDivisor::mod(int n)
Calculates
r = n % d
.
-
__host__ __device__ __forceinline__ void facebook::cuda::FixedDivisor::divMod(int n, int * q, int * r)
Calculates
q = n / d
andr = n % d
together.
-
typedef int
- template <>
-
template<>
classFixedDivisor
<unsigned int>¶ Class for calculating quotients by a fixed unsigned
d
using integer multiplication, addition and shifts.Public Types
-
typedef unsigned int
Type
¶
Public Functions
-
FixedDivisor
(unsigned int d)¶
-
__host__ __device__ __forceinline__ unsigned int facebook::cuda::FixedDivisor::div(unsigned int n)
Calculates
q = n / d
.
-
__host__ __device__ __forceinline__ unsigned int facebook::cuda::FixedDivisor::mod(unsigned int n)
Calculates
r = n % d
.
-
__host__ __device__ __forceinline__ void facebook::cuda::FixedDivisor::divMod(unsigned int n, unsigned int * q, unsigned int * r)
Calculates
q = n / d
andr = n % d
together.
-
typedef unsigned int
- template <typename T>
-
struct
GreaterThan
¶ Prototype:
template <typename T> struct Comparator { static __device__ __forceinline__ bool compare(const T lhs, const T rhs); };
Public Static Functions
-
static __device__ __forceinline__ bool facebook::cuda::GreaterThan::compare(const T lhs, const T rhs)
-
-
class
KernelTimer
¶ - #include <KernelTimer.h>
Utility class for timing execution of a kernel.
Public Functions
-
KernelTimer
()¶ Constructor starts the timer and adds an event into the current device stream
-
~KernelTimer
()¶ Destructor releases event resources.
-
float
stop
()¶ Adds a stop event then synchronizes on the stop event to get the actual GPU-side kernel timings for any kernels launched in the current stream. Returns the number of milliseconds elapsed
-
- template <typename T>
-
struct
LessThan
¶ Public Static Functions
-
static __device__ __forceinline__ bool facebook::cuda::LessThan::compare(const T lhs, const T rhs)
-
- template <typename T>
-
struct
Max
¶ Public Functions
-
__host__ __device__ __forceinline__ T facebook::cuda::Max::operator()(T a, T b)
-
- template <typename T>
-
struct
Min
¶ Public Functions
-
__host__ __device__ __forceinline__ T facebook::cuda::Min::operator()(T a, T b)
-
- template <typename T>
-
struct
NumericLimits
¶ Numeric limits for CUDA.
- template <>
-
template<>
structNumericLimits
<float>¶ Public Static Functions
-
__device__ static __forceinline__ float facebook::cuda::NumericLimits::minPossible()
The minimum possible valid float (i.e., not NaN)
-
__device__ static __forceinline__ float facebook::cuda::NumericLimits::maxPossible()
The maximum possible valid float (i.e., not NaN)
-
- template <>
-
template<>
structNumericLimits
<int>¶ Public Static Functions
-
__device__ static __forceinline__ int facebook::cuda::NumericLimits::minPossible()
The minimum possible int.
-
__device__ static __forceinline__ int facebook::cuda::NumericLimits::maxPossible()
The maximum possible int.
-
- template <>
-
template<>
structNumericLimits
<unsigned int>¶ Public Static Functions
-
__device__ static __forceinline__ unsigned int facebook::cuda::NumericLimits::minPossible()
The minimum possible unsigned int.
-
__device__ static __forceinline__ unsigned int facebook::cuda::NumericLimits::maxPossible()
The maximum possible unsigned int.
-
- template <typename K, typename V>
-
struct
Pair
¶ A simple pair type for CUDA device usage.
Public Functions
-
__host__ __device__ __forceinline__ facebook::cuda::Pair::Pair()
-
__host__ __device__ __forceinline__ facebook::cuda::Pair::Pair(K key, V value)
-
__host__ __device__ __forceinline__ bool facebook::cuda::Pair::operator==(const Pair < K, V > & rhs) const
-
__host__ __device__ __forceinline__ bool facebook::cuda::Pair::operator!=(const Pair < K, V > & rhs) const
-
__host__ __device__ __forceinline__ bool facebook::cuda::Pair::operator<(const Pair < K, V > & rhs) const
-
__host__ __device__ __forceinline__ bool facebook::cuda::Pair::operator>(const Pair < K, V > & rhs) const
-
- template <typename T, int N>
-
struct
RegisterIndexUtils
¶ Utilities for addressing values held in register arrays, but with a dynamic index. For instance, if you had:
float arr[6]; int index = calculation(); arr[index + 1] = doStuffWith(arr[index]);
the dynamic indexing of
arr
withindex
requires that the compiler addressarr
in local memory, not registers, removing any performance benefit. Usually one should use static indexing for register arrays, for example:#pragma unroll for (int i = 0; i < 6; ++i) { arr[i] = foo; }
or
arr[3] = foo;
in order to allow the compiler to assign registers to
arr
, but there are occasions when one needs to dynamically index the array. The arrays in question should often be very small (e.g., N = 2-3) to avoid any lookup penalty.These utilities translate the dynamic request to a static request, for array sizes N = 1 to 32.
So, to take our original case, you’d use it like:
float arr[6]; int index = calculation(); float val = doStuffWith(RegisterUtils<float, 6>::get(arr, index)); RegisterUtils<float, 6>::set(arr, index + 1, val);
which will preserve the compiler’s ability to assign
arr
to registers.Public Static Functions
-
__device__ static __forceinline__ T facebook::cuda::RegisterIndexUtils::get(const T arr[N], int index)
Retrieve a single value from our thread-local register array.
-
__device__ static __forceinline__ void facebook::cuda::RegisterIndexUtils::set(T arr[N], int index, T val)
Set a single value in our thread-local register array.
-
- template <typename T, int N>
-
struct
RegisterUtils
¶ Various utilities for dealing with arrays of values which are maintained in thread-local registers. All accesses are done in such a way such that the index is statically known, which preserves the compiler’s ability to allocate the values to registers, as opposed to local memory.
Public Static Functions
- template <int Shift>
-
__device__ static __forceinline__ void facebook::cuda::RegisterUtils::shiftLeft(T arr[N])
Register shifting: move elements towards the beginning of the array (towards 0) by
Shift
places: arr[i] = arr[i + Shift] TheShift
elements at the end are left unchanged.
- template <int Shift>
-
__device__ static __forceinline__ void facebook::cuda::RegisterUtils::shiftRight(T arr[N])
Register shifting: move elements towards the end of the array (towards N - 1) by
Shift
places: arr[i] = arr[i - Shift] TheShift
elements at the beginning are left unchanged.
- template <int Rotate>
-
__device__ static __forceinline__ void facebook::cuda::RegisterUtils::rotateLeft(T arr[N])
Register rotation: move elements cyclically towards the beginning of the array with wrap around (towards 0).
- template <int Rotate>
-
__device__ static __forceinline__ void facebook::cuda::RegisterUtils::rotateRight(T arr[N])
Register rotation: move elements cyclically towards the end of the array with wrap around (towards N - 1).
- template <typename T>
-
struct
RestrictPtrTraits
¶ Public Types
-
typedef
T* __restrict__ facebook::cuda::RestrictPtrTraits::PtrType
-
typedef
- template <typename T>
-
struct
Sum
¶ Public Functions
-
__host__ __device__ __forceinline__ T facebook::cuda::Sum::operator()(T a, T b)
-
- template <typename T, int N>
-
struct
WarpRegisterLoaderUtils
¶ Tensor <-> register load/save utils, for managing a set of registers distributed across the warp
Public Static Functions
-
static __device__ void facebook::cuda::WarpRegisterLoaderUtils::load(T arr[N], const DeviceTensor < T, 1 > & in, const T fill)
Convenience utility to load values from a 1-d array into registers using within-warp striding. Registers for which there is no entry in the array get
fillVal
as a value
-
static __device__ void facebook::cuda::WarpRegisterLoaderUtils::save(DeviceTensor < T, 1 > & out, const T arr[N], const int num)
Convenience utility to save values into a 1-d array from registers using within-warp striding. Saves up to
num
values from the registers.
-
- template <typename K, typename V, int N>
-
struct
WarpRegisterPairLoaderUtils
¶ Tensor <-> register load/save utils for Pair<>, for managing a set of registers distributed across the warp
Public Static Functions
-
static __device__ void facebook::cuda::WarpRegisterPairLoaderUtils::load(Pair < K, V > arr[N], const DeviceTensor < K, 1 > & in, const K keyFill, const V valueFill)
Like WarpRegisterUtils<T>::load, but for key/value pair types. Initializes the value with the source index.
-
static __device__ void facebook::cuda::WarpRegisterPairLoaderUtils::load(Pair < K, V > arr[N], const DeviceTensor < K, 1 > & key, const DeviceTensor < V, 1 > & value, const K keyFill, const V valueFill)
Like WarpRegisterUtils<T>::load, but for key/value pair types. The value for each key is at the corresponding index in the value array. The arrays are presumed to be the same size.
-
static __device__ void facebook::cuda::WarpRegisterPairLoaderUtils::save(DeviceTensor < K, 1 > & key, DeviceTensor < V, 1 > & value, const Pair < K, V > arr[N], const int num)
Like WarpRegisterUtils<T>::save, but for key/value pair types.
-
- template <typename T, int N>
-
struct
WarpRegisterUtils
¶ Utilities for warp-wide held register arrays.
Public Static Functions
-
static __device__ T facebook::cuda::WarpRegisterUtils::broadcast(const T arr[N], int index)
Broadcast a single value from the warp-wide array
arr
, consideringindex
as an index across the warp threads. In other words, returns arr[index / warpSize] from lane (index % warpSize) to all threads in the warp.
-
-
namespace
detail
¶ Type of a subspace of a tensor.
Functions
- template <typename T, int N>
-
__host__ __device__ void facebook::cuda::detail::copy(T to[N], T from[N])
-
__host__ __device__ __forceinline__ unsigned int facebook::cuda::detail::mulHi(unsigned int x, unsigned int y)
Host and device implementation for 32-bit a * b into 64 bit, return high 32 bits
-
__device__ __forceinline__ constexpr int facebook::cuda::detail::max(int i, int j)
-
__device__ __forceinline__ constexpr int facebook::cuda::detail::max(int i, int j, int k)
-
__device__ __forceinline__ Complex facebook::cuda::detail::ldg(const Complex * p)
-
__device__ __forceinline__ void facebook::cuda::detail::ldg(Complex & c1, Complex & c2, const Complex * p)
- template <bool ConjugateTransposeA, bool ConjugateTransposeB, int FFTSize, int FFTElements, int TileI, int TileJ, int TileK, int TileIThreadIdxY, int TileJThreadIdxZ, bool Accumulate>
-
facebook::cuda::detail::__launch_bounds__(32 *4 * 2, 2) const
-
assert
()¶
-
assert
(FFTElements = =blockDim.x)¶
-
assert
(TileIThreadIdxY = =blockDim.y)¶
-
assert
(TileJThreadIdxZ = =blockDim.z)¶
-
facebook::cuda::detail::assert(numRed % TileK = =0)
-
bool int int int int int bool bool bool bool bool bool bool Accumulate facebook::cuda::detail::__launch_bounds__(32 * 32, 1) const
-
facebook::cuda::detail::assert(A. getSize2 = =C.getSize(2))
-
facebook::cuda::detail::assert(B. getSize2 = =C.getSize(2))
-
facebook::cuda::detail::assert(ConjugateTransposeA||A. getSize0 = =C.getSize(0))
-
facebook::cuda::detail::assert(!ConjugateTransposeA||A. getSize1 = =C.getSize(0))
-
facebook::cuda::detail::assert(ConjugateTransposeB ||B. getSize1 = =C.getSize(1))
-
facebook::cuda::detail::assert(! ConjugateTransposeB ||B. getSize0 = =C.getSize(1))
-
facebook::cuda::detail::assert(ConjugateTransposeA|| ConjugateTransposeB ||A. getSize1 = =B.getSize(0))
-
assert
(C_XY_Placement_ThreadIdx_X = =blockDim.x)¶
-
facebook::cuda::detail::assert(! StaticUnrollCI ||C. getSize0)%(C_I_Tile *gridDim.x = =0)
-
facebook::cuda::detail::assert(! StaticUnrollCJ ||C. getSize1)%(C_J_Tile *gridDim.y *blockDim.y = =0)
-
facebook::cuda::detail::assert(! StaticUnrollReduction || numRed % ReductionUnroll = =0)
-
facebook::cuda::detail::for()
- template <int N, typename T>
-
__device__ T facebook::cuda::detail::getMulti(const T arr[N], int index, T val)
- template <int N, typename T>
-
__device__ void facebook::cuda::detail::scatterHalfWarp(T arr[N], int index, T val)
- template <typename T, typename Comparator, int M, int N>
-
__device__ void facebook::cuda::detail::warpMergeMN(const T a[M], const T b[N], T dst[M+N])
- template <typename T, typename Comparator, int N>
-
__device__ void facebook::cuda::detail::warpSortRegisters(T a[N], T dst[N])
- template <typename T, typename Comparator, int N>
-
__device__ void facebook::cuda::detail::warpSortRegisters(const DeviceTensor < T, 1 > & key, DeviceTensor < T, 1 > & sortedKey)
- template <typename T, typename IndexType, typename Comparator, int N>
-
__device__ void facebook::cuda::detail::warpSortRegisters(const DeviceTensor < T, 1 > & key, DeviceTensor < T, 1 > & sortedKey, DeviceTensor < IndexType, 1 > & sortedKeyIndices)
- template <typename K, typename V, typename Comparator, int N>
-
__device__ void facebook::cuda::detail::warpSortRegisters(const DeviceTensor < K, 1 > & key, const DeviceTensor < V, 1 > & value, DeviceTensor < K, 1 > & sortedKey, DeviceTensor < V, 1 > & sortedValue)
- template <int N, typename T>
-
__device__ __forceinline__ void facebook::cuda::detail::setArray(T arr[N], T val)
Initialize an array to a value.
-
__device__ __forceinline__ void facebook::cuda::detail::incrementArray(int val, int counts[16])
In order to force register usage of the bucket count array, we have to unroll the increment selection. Otherwise, local memory is used for counts[] which severely degrades performance.
- template <typename T, int N, int ILP>
-
__device__ __forceinline__ void facebook::cuda::detail::countNybbles(int counts[N], unsigned desired, unsigned desiredMask, int nybbleCheckPos, const DeviceTensor < T, 1 > & data)
We use a most significant to least significant radix selection on the float values, which requires at most sizeof(float) * 2 scans through the array, one for each nybble.
In order to use radix selection, we use the property that for positive floating-point values f1 and f2:
f1 > f2 <=> *(int*)&f1 > *(int*)&f2.
Something similar is true for negative floating point values f1 and f2 after zero-ing the leading sign bit, and except that the order is reversed:
f1 > f2 <=> (*(int*)f1 & 0x7fffffff) < (*(int*)f2 & 0x7fffffff).
This is true even for +/-inf and for denormalized floats. Negative zero is a special case. Selection by radix will give us that -0.0f < +0.0f, which is not true for IEEE fp comparison. We handle this special case when we return the answer seen, not in comparing values here.
+NaNs will lead all positive values, and -NaNs will be minimal values (non-canonical NaNs, if they exist, will be sorted according to this).
The focal point of the radix selection algorithm is the use of countNybbles and the CHECK_NYBBLE macro.
The idea is that we starting out, we don’t know where the Kth highest element lives, so we have to consider every float in the input. We look at the most significant nybble, and each thread counts into 16 buckets the number of floats in its subset of data with that leading nybble. This is done by countNybbles. countNybble takes as arguments
desired
anddesiredMask
. It only looks at valuesv
such that (v & desiredMask) == desired. By default, both are 0, so it will look at every float. nybbleCheckPos is the current nybble that is beinig bucketed. It starts at 28, meaning we’re first looking at the most significant nybble.countNybbles will add a count of nybble distribution to 16 buckets. One iteration through, by counting the distribution of the leading nybble in each float, we figure out what leading nybble the Kth highest float must have. As an example, let’s say that K is 10. By counting the distribution of leading nybbles in the inputs, say we get:
0x2: 1 0x5: 2 0x6: 2 0x7: 11
In this case, none of the floats are negative (otherwise, they would have leading nybble 0x8 -> 0xf). Since we’re looking for the 10th highest float, that cannot have leading nybble 0x2, 0x5 or 0x6 since those counts are less than 10. We walk through the buckets in order, and we warp reduce the counts across all threads to one count when it comes time to look in a particular bucket.
Thus, the 10th highest float must have leading nybble 0x7. The problem then becomes, for the next iteration, finding the (10 - (1+2+2)) = 5th highest float with leading nybble 0x7. Since the count for the nybble 0x7 is not 1, we don’t know the actual answer yet, and we have to continue.
Next iteration through, we no longer have to count every float, just those with leading nybble 0x7 (i.e., floats interpreted as a bit pattern v such that (v & desiredMask) == desired. Even though we have to physically scan the entire input, we are only counting a subset of it.
So:
- desired changes from 0 -> 0x70000000, and
- desiredMask changes from 0 -> 0xf0000000.
We continue, and count up the floats with leading nybble 0x7, getting counts:
0x(7)1: 1 0x(7)9: 2 0x(7)b: 1 0x(7)c: 1 0x(7)f: 6
This means that the count of all floats with the prefix 0x7fyyyyyy is 6. We’re only bucketing counts by the second nybble now.
Scanning through, the 5th highest float with prefix 0x7yyyyyyy must have prefix 0x7cyyyyyy, since from lowest to highest above, we reach 5 in bucket 0xc.
Thus, the 10th highest float in the entire set is the unique float with prefix 0x7cyyyyyy. This is unique because the count for this bucket is 1.
If we get through all nybbles to the least significant nybble and still have a count > 1, then that means that the Kth highest element is not unique. For example, in the set 2 2 3 3 3, the 2nd highest element is 3, which is duplicated 3 times.
Since the MSN contains the sign bit, we have to first look at buckets 0-7 to see if the Kth highest float is positive. If so, then we continue looking only at positive floats. If not, then we continue looking only at negative floats, but in reverse order.
Eventually we find a unique Kth highest element if the count is 1 in our bucket, or we end at the LSN with a duplicate count, in which case the Kth highest element is not unique.
Performs a histogram count of the nybbles that occur at the bit position
nybbleCheckPos
, but only for those ints that match (x &desiredMask
) ==desired
.In other words, if bits [31,
nybbleCheckPos
+ 4] match those indesired
, then return the contents of bits [nybbleCheckPos
+ 3,nybbleCheckPos
].
-
__device__ __forceinline__ Pair<float, int> facebook::cuda::detail::findAnswer(const DeviceTensor < float, 1 > & data, unsigned desired, unsigned desiredMask, int dupCount)
A warp coherent implementation that finds a value in the data such that the floats, treated as uints ‘v’ match the bit pattern such that (v & desiredMask) == desired.
If the answer found is -0.0f, because -0.0f == +0.0f, it is possible that there are multiple +0.0f results that we’ve ignored in comparing by radix, since radix-wise +0.0f > -0.0f which is not true for IEEE fp.
Thus, if the answer found is -0.0f, then we have to include the count of all +0.0fs present in the duplicate count, in order to treat the comparison the same way that normal sorting mechanisms will treat it.
If the found result is not -0.0f, returns the value found and
dupCount
as the pair’s value.If the found result is -0.0f, returns the value found and
dupCount
plus the number of +0.0f in the data as the pair’s value.The value need not be unique, but the warp as a whole will return the highest value seen across the warp.
-
__device__ Pair<float, int> facebook::cuda::detail::warpFindTopKthElementSmall32(const DeviceTensor < float, 1 > & data, int k)
Finds the Kth highest floating point value in a linear array [arr, end) without modifying the data and without temporary storage except for registers. K starts at 1. All threads in the warp will return the value. Handles all floats except NaNs.
This function minimizes warp divergence.
Implementation for small arrays such that the
(end - start) <= warpSize
.
-
__device__ Pair<float, int> facebook::cuda::detail::warpFindTopKthElementLarge(const DeviceTensor < float, 1 > & data, int k)
Finds the Kth highest floating point value in a linear array [arr, end) without modifying the data and without temporary storage except for registers.
- K starts at 1.
- All threads in the warp will return the value.
- Handles all floats except NaNs.
- Negative zero is specialized by findAnswer.
- This function minimizes warp divergence.
Implementation for large arrays such that there are more elements than warp threads.
-
__device__ __forceinline__ int facebook::cuda::detail::laneWillWrite(float val, float topK, int & topKToWrite, int & next)
Returns the index into the array that this lane will write. If this lane is not responsible for writing a value, this will return -1.
-
__device__ float facebook::cuda::detail::warpFindTopKElementsIndexOrder(const DeviceTensor < float, 1 > & data, DeviceTensor < float, 1 > & out, int k)
For a given warp, find and write out the top-k highest floating point values in [start, end) to [out, out + k). The list written out occurs in the original source order (by original index). Returns the k-th highest element seen. Handles all floats except NaNs. Implementation for large arrays such that there are more elements than warp threads.
- template <typename IndexType>
-
__device__ float facebook::cuda::detail::warpFindTopKElementsIndexOrder(const DeviceTensor < float, 1 > & data, DeviceTensor < float, 1 > & out, DeviceTensor < IndexType, 1 > & indices, int k)
Version of warpFindTopKElementsUnorderedLarge, except also writes out the K indices chosen from
data
intoindices
.
-
__device__ void facebook::cuda::detail::warpFindTopKElementsValueOrderSmall(const DeviceTensor < float, 1 > & data, DeviceTensor < float, 1 > & out, int k)
For a given warp, find and write out the top-k highest floating point values in [start, end) to [out, out + k). The list written out is ordered. Handles all floats except NaNs.
- template <typename IndexType>
-
__device__ void facebook::cuda::detail::warpFindTopKElementsValueOrderSmall(const DeviceTensor < float, 1 > & data, DeviceTensor < float, 1 > & out, DeviceTensor < IndexType, 1 > & indices, int k)
Version of warpFindTopKElementsOrderedSmall that also writes out the indices in
data
of the K elements chosen intoindices
.
-
__device__ void facebook::cuda::detail::warpFindTopKElementsValueOrderLarge(const DeviceTensor < float, 1 > & data, DeviceTensor < float, 1 > & out, int k)
For a given warp, find and write out the top-k highest floating point values in [start, end) to [out, out + k). The list written out is ordered. Handles all floats except NaNs. Implementation for large arrays such that there are more elements than warp threads.
- template <typename IndexType>
-
__device__ void facebook::cuda::detail::warpFindTopKElementsValueOrderLarge(const DeviceTensor < float, 1 > & data, DeviceTensor < float, 1 > & out, DeviceTensor < IndexType, 1 > & indices, int k)
Version of warpFindTopKElementsOrderedLage that also writes out the indices in
data
of the K elements chosen intoindices
.
- template <typename T, typename Comparator>
-
__device__ __forceinline__ T facebook::cuda::detail::shflSwap(const T x, int mask, int dir)
Variables
-
bool int int int int int bool bool bool bool bool bool bool Accumulate facebook::cuda::detail::A
-
bool int int int int int bool bool bool bool bool bool bool Accumulate const DeviceTensor< Complex, 3 > facebook::cuda::detail::B
-
bool int int int int int bool bool bool bool bool bool bool Accumulate const DeviceTensor< Complex, 3 > DeviceTensor< Complex, 3 > facebook::cuda::detail::C
-
bool int int int int int bool bool bool bool bool bool bool Accumulate const DeviceTensor< Complex, 3 > DeviceTensor< Complex, 3 > Complex facebook::cuda::detail::invNorm
= { const auto xyBase = blockIdx.z * blockDim.x
-
const auto
xy
= blockIdx.z * blockDim.x + threadIdx.x¶
-
const int
numRed
= (ConjugateTransposeA) ? A.getSize(0) : A.getSize(1)¶
-
bool
ConjugateTransposeB
¶
-
bool int facebook::cuda::detail::FFTSize
-
bool int int facebook::cuda::detail::C_J_Unroll
-
bool int int int facebook::cuda::detail::C_I_Tile
-
bool int int int int facebook::cuda::detail::C_J_Tile
-
bool int int int int int facebook::cuda::detail::ReductionUnroll
-
const int
numBatches
= A.getSize(2)¶
-
const int
ubxy
= ceil(numBatches, (int)(gridDim.z * blockDim.x)) * gridDim.z * blockDim.x¶
-
bool int facebook::cuda::detail::C_XY_Placement_ThreadIdx_X
-
bool int int int int int bool facebook::cuda::detail::StaticUnrollA
-
bool int int int int int bool bool facebook::cuda::detail::StaticUnrollB
-
bool int int int int int bool bool bool facebook::cuda::detail::StaticUnrollCI
-
bool int int int int int bool bool bool bool facebook::cuda::detail::StaticUnrollCJ
-
bool int int int int int bool bool bool bool bool facebook::cuda::detail::StaticUnrollXY
-
bool int int int int int bool bool bool bool bool bool facebook::cuda::detail::StaticUnrollReduction
-
const int
ubi
= (StaticUnrollCI) ? C.getSize(0) : ceil(C.getSize(0), (int)(C_I_Tile * gridDim.x)) * C_I_Tile * gridDim.x¶
-
const int
ubj
= (StaticUnrollCJ) ? C.getSize(1) : ceil(C.getSize(1), (int)(C_J_Tile * gridDim.y * blockDim.y)) * C_J_Tile * gridDim.y * blockDim.y¶
- template <typename TensorType, int SubDim, template< typename U > class PtrTraits>
-
class
DeviceSubTensor
¶ A
SubDim
-rank slice of a parent DeviceTensor.Public Functions
-
__host__ __device__ __forceinline__ DeviceSubTensor<TensorType, SubDim - 1, PtrTraits> facebook::cuda::detail::DeviceSubTensor::operator[](typename TensorType::IndexType index)
Returns a view of the data located at our offset (the dimension
SubDim
- 1 tensor).
-
__host__ __device__ __forceinline__ const DeviceSubTensor<TensorType, SubDim - 1, PtrTraits> facebook::cuda::detail::DeviceSubTensor::operator[](typename TensorType::IndexType index) const
Returns a view of the data located at our offset (the dimension
SubDim
- 1 tensor) (const).
-
__host__ __device__ TensorType::DataType* facebook::cuda::detail::DeviceSubTensor::operator&()
-
__host__ __device__ const TensorType::DataType* facebook::cuda::detail::DeviceSubTensor::operator&() const
-
__host__ __device__ __forceinline__ TensorType::DataPtrType facebook::cuda::detail::DeviceSubTensor::data()
Returns a raw accessor to our slice.
-
__host__ __device__ __forceinline__ const TensorType::DataPtrType facebook::cuda::detail::DeviceSubTensor::data() const
Returns a raw accessor to our slice (const).
- template <typename T>
-
__host__ __device__ T& facebook::cuda::detail::DeviceSubTensor::as()
Cast to a different datatype.
- template <typename T>
-
__host__ __device__ const T& facebook::cuda::detail::DeviceSubTensor::as() const
Cast to a different datatype (const).
- template <typename T>
-
__host__ __device__ __forceinline__ PtrTraits<T>::PtrType facebook::cuda::detail::DeviceSubTensor::dataAs()
Cast to a different datatype.
- template <typename T>
-
__host__ __device__ __forceinline__ PtrTraits<const T>::PtrType facebook::cuda::detail::DeviceSubTensor::dataAs() const
Cast to a different datatype (const)
-
__device__ __forceinline__ TensorType::DataType facebook::cuda::detail::DeviceSubTensor::ldg() const
Use the texture cache for reads.
- template <typename T>
-
__device__ __forceinline__ T facebook::cuda::detail::DeviceSubTensor::ldgAs() const
Use the texture cache for reads; cast as a particular type.
-
DeviceTensor<typename TensorType::DataType, SubDim, typename TensorType::IndexType, PtrTraits>
view
()¶ Returns a tensor that is a view of the SubDim-dimensional slice of this tensor, starting where our data begins
Friends
-
friend
facebook::cuda::detail::DeviceTensor< typename TensorType::DataType, TensorType::NumDim, typename TensorType::IndexType, PtrTraits >
Our parent tensor can create us.
-
- template <typename TensorType, template< typename U > class PtrTraits>
-
template<>
classDeviceSubTensor
<TensorType, 0, PtrTraits>¶ Specialization for a view of a single value (0-dimensional)
Public Functions
-
__host__ __device__ DeviceSubTensor<TensorType, 0, PtrTraits> facebook::cuda::detail::DeviceSubTensor::operator=(typename TensorType::DataType val)
-
__host__ __device__ facebook::cuda::detail::DeviceSubTensor::operator typename TensorType::DataType &()
-
__host__ __device__ facebook::cuda::detail::DeviceSubTensor::operator const typename TensorType::DataType &() const
-
__host__ __device__ TensorType::DataType* facebook::cuda::detail::DeviceSubTensor::operator&()
-
__host__ __device__ const TensorType::DataType* facebook::cuda::detail::DeviceSubTensor::operator&() const
-
__host__ __device__ __forceinline__ TensorType::DataPtrType facebook::cuda::detail::DeviceSubTensor::data()
Returns a raw accessor to our slice.
-
__host__ __device__ __forceinline__ const TensorType::DataPtrType facebook::cuda::detail::DeviceSubTensor::data() const
Returns a raw accessor to our slice (const).
- template <typename T>
-
__host__ __device__ T& facebook::cuda::detail::DeviceSubTensor::as()
Cast to a different datatype.
- template <typename T>
-
__host__ __device__ const T& facebook::cuda::detail::DeviceSubTensor::as() const
Cast to a different datatype (const).
- template <typename T>
-
__host__ __device__ __forceinline__ PtrTraits<T>::PtrType facebook::cuda::detail::DeviceSubTensor::dataAs()
Cast to a different datatype.
- template <typename T>
-
__host__ __device__ __forceinline__ PtrTraits<const T>::PtrType facebook::cuda::detail::DeviceSubTensor::dataAs() const
Cast to a different datatype (const)
-
__device__ __forceinline__ TensorType::DataType facebook::cuda::detail::DeviceSubTensor::ldg() const
Use the texture cache for reads.
- template <typename T>
-
__device__ __forceinline__ T facebook::cuda::detail::DeviceSubTensor::ldgAs() const
Use the texture cache for reads; cast as a particular type.
Friends
-
friend
facebook::cuda::detail::DeviceTensor< typename TensorType::DataType, 1, typename TensorType::IndexType, PtrTraits >
Our parent tensor can create us.
-
-
struct
HalfFtor
¶
- template <typename T, typename Comparator, int N>
-
struct
Merge
¶ Public Static Functions
-
static __device__ void facebook::cuda::detail::Merge::splitAndMerge(const T in[N], T out[N])
-
- template <typename T, typename Comparator>
-
template<>
structMerge
<T, Comparator, 1>¶ Public Static Functions
-
static __device__ void facebook::cuda::detail::Merge::splitAndMerge(const T in[1], T out[1])
-