Namespace UtilKernels

Namespace containing utility CUDA kernel wrappers.

Public Functions

Type	Name
void	add_scalar (const double * d_in, double * d_out, double scalar, size_t size, cudaStream_t s) Adds a scalar value to each element of a vector: d_out[i] = d_in[i] + scalar.
void	cast_vector (const T_in *const d_in, T_out *const d_out, const unsigned int size, cudaStream_t s) Casts a vector from one type to another.
void	elementwise_abs (const double * d_in, double * d_out, size_t size, cudaStream_t s) Computes the element-wise absolute value of a vector: d_out[i] = fabs(d_in[i])
void	elementwise_divide (const double * d_in1, const double * d_in2, double * d_out, size_t size, cudaStream_t s) Computes the element-wise (Hadamard) quotient of two vectors: d_out[i] = d_in1[i] / d_in2[i].
void	elementwise_inverse (const double * d_in, double * d_out, size_t size, cudaStream_t s) Computes the element-wise inverse of a vector: d_out[i] = 1.0 / d_in[i].
void	elementwise_max (const double * d_in1, const double * d_in2, double * d_out, size_t size, cudaStream_t s) Computes the element-wise maximum of two vectors: d_out[i] = fmax(d_in1[i], d_in2[i])
void	elementwise_min (const double * d_in1, const double * d_in2, double * d_out, size_t size, cudaStream_t s) Computes the element-wise minimum of two vectors: d_out[i] = fmin(d_in1[i], d_in2[i])
void	elementwise_multiply (const double * d_in1, const double * d_in2, double * d_out, size_t size, cudaStream_t s) Computes the element-wise (Hadamard) product of two vectors: d_out[i] = d_in1[i] * d_in2[i].
void	elementwise_multiply_add (const double * d_x, const double * d_y, const double * d_z, double * d_out, size_t size, cudaStream_t s) Computes the fused element-wise multiply-add of three vectors: d_out[i] = d_x[i] * d_y[i] + d_z[i].
void	elementwise_power (const double * d_in, double * d_out, double exponent, size_t size, cudaStream_t s) Computes the element-wise power of a vector: d_out[i] = pow(d_in[i], exponent)
void	elementwise_sign (const double * d_in, double * d_out, size_t size, cudaStream_t s) Computes the element-wise sign of a vector: d_out[i] = sign(d_in[i]) (returns -1, 0, or 1).
void	extend_vector (const double * d_in, double * d_out, size_t num_blocks, size_t current_block_size, size_t new_block_size, cudaStream_t s) Extends a vector by padding each block with zeros.
void	max_scalar (const double * d_in, double * d_out, double scalar, size_t size, cudaStream_t s) Computes the element-wise maximum of a vector and a scalar: d_out[i] = fmax(d_in[i], scalar)
void	min_scalar (const double * d_in, double * d_out, double scalar, size_t size, cudaStream_t s) Computes the element-wise minimum of a vector and a scalar: d_out[i] = fmin(d_in[i], scalar)
void	pad_vector (const T_in *const d_in, T_out *const d_pad, const unsigned int num_blocks, const unsigned int padded_size, cudaStream_t s) Pads each block of a vector to twice the length with zeros.
void	reduce_max (const double * d_in, double * d_out, size_t size, cudaStream_t s) Reduces a vector to its maximum element value on the device.
void	reduce_min (const double * d_in, double * d_out, size_t size, cudaStream_t s) Reduces a vector to its minimum element value on the device.
void	shrink_vector (const double * d_in, double * d_out, size_t num_blocks, size_t current_block_size, size_t new_block_size, cudaStream_t s) Shrinks a vector by removing padding.
void	swap_axes_cutranspose (const T_complex * d_in, T_complex * d_out, const unsigned int num_cols, const unsigned int num_rows, const unsigned int block_size, cudaStream_t s) Swaps the axes of a matrix using cutranspose.
void	unpad_repad_vector (const T_in *const d_in, T_out *const d_out, const unsigned int num_blocks, const unsigned int padded_size, const bool unpad, cudaStream_t s) Unpads or repads a vector.

Public Functions Documentation

function add_scalar

Adds a scalar value to each element of a vector: d_out[i] = d_in[i] + scalar.

void UtilKernels::add_scalar (
    const double * d_in,
    double * d_out,
    double scalar,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input vector.
d_out Pointer to the output vector.
scalar The scalar value to add.
size Total number of elements in the vectors.
s The CUDA stream to use for the operation.

function cast_vector

Casts a vector from one type to another.

template<typename T_in, typename T_out>
void UtilKernels::cast_vector (
    const T_in *const d_in,
    T_out *const d_out,
    const unsigned int size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input vector.
d_out Pointer to the output vector.
size Size of the input and output vectors.
s The CUDA stream to use for the operation.

Template parameters:

T_in Input type.
T_out Output type.

function elementwise_abs

Computes the element-wise absolute value of a vector: d_out[i] = fabs(d_in[i])

void UtilKernels::elementwise_abs (
    const double * d_in,
    double * d_out,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input vector.
d_out Pointer to the output vector.
size Total number of elements in the vectors.
s The CUDA stream to use for the operation.

function elementwise_divide

Computes the element-wise (Hadamard) quotient of two vectors: d_out[i] = d_in1[i] / d_in2[i].

void UtilKernels::elementwise_divide (
    const double * d_in1,
    const double * d_in2,
    double * d_out,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in1 Pointer to the first input vector.
d_in2 Pointer to the second input vector.
d_out Pointer to the output vector.
size Total number of elements in the vectors.
s The CUDA stream to use for the operation.

function elementwise_inverse

Computes the element-wise inverse of a vector: d_out[i] = 1.0 / d_in[i].

void UtilKernels::elementwise_inverse (
    const double * d_in,
    double * d_out,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input vector.
d_out Pointer to the output vector.
size Total number of elements in the vectors.
s The CUDA stream to use for the operation.

function elementwise_max

Computes the element-wise maximum of two vectors: d_out[i] = fmax(d_in1[i], d_in2[i])

void UtilKernels::elementwise_max (
    const double * d_in1,
    const double * d_in2,
    double * d_out,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in1 Pointer to the first input vector.
d_in2 Pointer to the second input vector.
d_out Pointer to the output vector.
size Total number of elements in the vectors.
s The CUDA stream to use for the operation.

function elementwise_min

Computes the element-wise minimum of two vectors: d_out[i] = fmin(d_in1[i], d_in2[i])

void UtilKernels::elementwise_min (
    const double * d_in1,
    const double * d_in2,
    double * d_out,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in1 Pointer to the first input vector.
d_in2 Pointer to the second input vector.
d_out Pointer to the output vector.
size Total number of elements in the vectors.
s The CUDA stream to use for the operation.

function elementwise_multiply

Computes the element-wise (Hadamard) product of two vectors: d_out[i] = d_in1[i] * d_in2[i].

void UtilKernels::elementwise_multiply (
    const double * d_in1,
    const double * d_in2,
    double * d_out,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in1 Pointer to the first input vector.
d_in2 Pointer to the second input vector.
d_out Pointer to the output vector.
size Total number of elements in the vectors.
s The CUDA stream to use for the operation.

function elementwise_multiply_add

Computes the fused element-wise multiply-add of three vectors: d_out[i] = d_x[i] * d_y[i] + d_z[i].

void UtilKernels::elementwise_multiply_add (
    const double * d_x,
    const double * d_y,
    const double * d_z,
    double * d_out,
    size_t size,
    cudaStream_t s
)

Parameters:

d_x Pointer to the first input vector (multiplicand).
d_y Pointer to the second input vector (multiplier).
d_z Pointer to the third input vector (addend).
d_out Pointer to the output vector.
size Total number of elements in the vectors.
s The CUDA stream to use for the operation.

function elementwise_power

Computes the element-wise power of a vector: d_out[i] = pow(d_in[i], exponent)

void UtilKernels::elementwise_power (
    const double * d_in,
    double * d_out,
    double exponent,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input vector.
d_out Pointer to the output vector.
exponent The scalar power to raise each element to.
size Total number of elements in the vectors.
s The CUDA stream to use for the operation.

function elementwise_sign

Computes the element-wise sign of a vector: d_out[i] = sign(d_in[i]) (returns -1, 0, or 1).

void UtilKernels::elementwise_sign (
    const double * d_in,
    double * d_out,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input vector.
d_out Pointer to the output vector.
size Total number of elements in the vectors.
s The CUDA stream to use for the operation.

function extend_vector

Extends a vector by padding each block with zeros.

void UtilKernels::extend_vector (
    const double * d_in,
    double * d_out,
    size_t num_blocks,
    size_t current_block_size,
    size_t new_block_size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input vector.
d_out Pointer to the output vector.
num_blocks Number of blocks in the vector.
current_block_size Current size of each block.
new_block_size New size of each block after padding.
s The CUDA stream to use for the operation.

function max_scalar

Computes the element-wise maximum of a vector and a scalar: d_out[i] = fmax(d_in[i], scalar)

void UtilKernels::max_scalar (
    const double * d_in,
    double * d_out,
    double scalar,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input vector.
d_out Pointer to the output vector.
scalar The scalar value to compare against.
size Total number of elements in the vectors.
s The CUDA stream to use for the operation.

function min_scalar

Computes the element-wise minimum of a vector and a scalar: d_out[i] = fmin(d_in[i], scalar)

void UtilKernels::min_scalar (
    const double * d_in,
    double * d_out,
    double scalar,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input vector.
d_out Pointer to the output vector.
scalar The scalar value to compare against.
size Total number of elements in the vectors.
s The CUDA stream to use for the operation.

function pad_vector

Pads each block of a vector to twice the length with zeros.

template<typename T_in, typename T_out>
void UtilKernels::pad_vector (
    const T_in *const d_in,
    T_out *const d_pad,
    const unsigned int num_blocks,
    const unsigned int padded_size,
    cudaStream_t s
)

This function takes an input vector d_in and pads each block of the vector to twice its length with zeros. The padded vector is stored in the output vector d_pad. The number of blocks in the vector is specified by num_blocks, and the padded size of each block is specified by padded_size. The padding operation is performed asynchronously on the CUDA stream s.

Parameters:

d_in Pointer to the input vector.
d_pad Pointer to the output padded vector.
num_blocks Number of blocks in the vector.
padded_size Padded size of each block.
s CUDA stream for asynchronous execution.

Template parameters:

T_in Data type of the input vector (real).
T_out Data type of the output (padded) vector (real).

function reduce_max

Reduces a vector to its maximum element value on the device.

void UtilKernels::reduce_max (
    const double * d_in,
    double * d_out,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input vector.
d_out Pointer to a single device double storing the result.
size Total number of elements in the input vector.
s The CUDA stream to use for the operation.

function reduce_min

Reduces a vector to its minimum element value on the device.

void UtilKernels::reduce_min (
    const double * d_in,
    double * d_out,
    size_t size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input vector.
d_out Pointer to a single device double storing the result.
size Total number of elements in the input vector.
s The CUDA stream to use for the operation.

function shrink_vector

Shrinks a vector by removing padding.

void UtilKernels::shrink_vector (
    const double * d_in,
    double * d_out,
    size_t num_blocks,
    size_t current_block_size,
    size_t new_block_size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input vector.
d_out Pointer to the output vector.
num_blocks Number of blocks in the vector.
current_block_size Current size of each block.
new_block_size New size of each block after removing the padding.
s The CUDA stream to use for the operation.

function swap_axes_cutranspose

Swaps the axes of a matrix using cutranspose.

template<typename T_complex>
void UtilKernels::swap_axes_cutranspose (
    const T_complex * d_in,
    T_complex * d_out,
    const unsigned int num_cols,
    const unsigned int num_rows,
    const unsigned int block_size,
    cudaStream_t s
)

Parameters:

d_in Pointer to the input matrix.
d_out Pointer to the output matrix.
num_cols Number of columns in the input matrix.
num_rows Number of rows in the input matrix.
block_size Block size of input matrix.
s The CUDA stream to use for the operation.

Template parameters:

T_complex Data type of the input and output matrices.

function unpad_repad_vector

Unpads or repads a vector.

template<typename T_in, typename T_out>
void UtilKernels::unpad_repad_vector (
    const T_in *const d_in,
    T_out *const d_out,
    const unsigned int num_blocks,
    const unsigned int padded_size,
    const bool unpad,
    cudaStream_t s
)

This function either unpads each block of the vector back to the original length or resets the second half of each block to zeros.

Parameters:

d_in Pointer to the input vector.
d_out Pointer to the output vector.
num_blocks Number of blocks in the vector.
padded_size Padded size of each block.
unpad Flag indicating whether to unpad or repad the vector. If true, the vector will be unpadded. If false, the second half of each block will be reset to zeros.
s The CUDA stream to use for the operation.

Template parameters:

T_in Data type of the input vector (real).
T_out Data type of the output vector (real).

The documentation for this namespace was generated from the following file: src/util_kernels.hpp