Below is a test code for the CUDA functions. CUDA devices cannot directly compute with the QP FP data type. Thus, you have to use Sleef_quadx1 data type to retain a QP FP value in CUDA device codes. This data type has the same structure as the QP FP data type, and you can directly access the number by casting the pointer to the QP FP data type supported by the compiler. Beware of the strict-aliasing rule in this case.
#include <iostream>
#include <quadmath.h>
#include "sleefquadinline_cuda.h"
// Based on the tutorial code at https://developer.nvidia.com/blog/even-easier-introduction-cuda/
__global__ void pow_gpu(int n, Sleef_quadx1 *r, Sleef_quadx1 *x, Sleef_quadx1 *y) {
int index = threadIdx.x, stride = blockDim.x;
for (int i = index; i < n; i += stride)
r[i] = Sleef_powq1_u10cuda(x[i], y[i]);
}
int main(void) {
int N = 1 << 20;
Sleef_quadx1 *rd, *xd, *yd;
cudaMallocManaged(&rd, N*sizeof(Sleef_quadx1));
cudaMallocManaged(&xd, N*sizeof(Sleef_quadx1));
cudaMallocManaged(&yd, N*sizeof(Sleef_quadx1));
__float128 *r = (__float128 *)rd, *x = (__float128 *)xd, *y = (__float128 *)yd;
for (int i = 0; i < N; i++) {
r[i] = 0.0;
x[i] = 1.00001Q;
y[i] = i;
}
pow_gpu<<<1, 256>>>(N, rd, xd, yd);
cudaDeviceSynchronize();
double maxError = 0.0;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabsq(r[i]-powq(x[i], y[i])));
std::cout << "Max error: " << maxError << std::endl;
cudaFree(yd);
cudaFree(xd);
cudaFree(rd);
return 0;
}
Source code for testing CUDA functions
You may want to use both CPU and GPU functions in the same source code. This is possible, as shown in the following test code. You cannot use the library version of the SLEEF functions in CUDA source codes. Please include the header files for inlineable functions along with the header file for CUDA functions. The I/O functions are defined in sleefquadinline_purec_scalar.h. You cannot use SLEEF_QUAD_C or sleef_q in device functions.
// nvcc -O3 hellocudaquad2.cu -I./include --fmad=false -Xcompiler -ffp-contract=off
#include <iostream>
#include <stdio.h>
#include <stdint.h>
#include <stdarg.h>
#include <ctype.h>
#include <assert.h>
#include <emmintrin.h>
#include "sleefquadinline_sse2.h"
#include "sleefquadinline_purec_scalar.h"
#include "sleefquadinline_cuda.h"
#include "sleefinline_sse2.h"
// Based on the tutorial code at https://developer.nvidia.com/blog/even-easier-introduction-cuda/
__global__ void pow_gpu(int n, Sleef_quadx1 *r, Sleef_quadx1 *x, Sleef_quadx1 *y) {
int index = threadIdx.x, stride = blockDim.x;
for (int i = index; i < n; i += stride)
r[i] = Sleef_powq1_u10cuda(x[i], y[i]);
}
int main(void) {
int N = 1 << 20;
Sleef_quadx1 *rd, *xd, *yd;
cudaMallocManaged(&rd, N*sizeof(Sleef_quadx1));
cudaMallocManaged(&xd, N*sizeof(Sleef_quadx1));
cudaMallocManaged(&yd, N*sizeof(Sleef_quadx1));
Sleef_quad *r = (Sleef_quad *)rd, *x = (Sleef_quad *)xd, *y = (Sleef_quad *)yd;
//
for (int i = 0; i < N; i++) {
r[i] = Sleef_cast_from_doubleq1_purec(0);
x[i] = Sleef_cast_from_doubleq1_purec(1.00001);
y[i] = Sleef_cast_from_doubleq1_purec(i);
}
pow_gpu<<<1, 256>>>(N, rd, xd, yd);
cudaDeviceSynchronize();
Sleef_quadx2 maxError = Sleef_splatq2_sse2(Sleef_strtoq("0.0", NULL));
for (int i = 0; i < N; i += 2) {
Sleef_quadx2 r2 = Sleef_loadq2_sse2(&r[i]);
Sleef_quadx2 x2 = Sleef_loadq2_sse2(&x[i]);
Sleef_quadx2 y2 = Sleef_loadq2_sse2(&y[i]);
Sleef_quadx2 q = Sleef_fabsq2_sse2(Sleef_subq2_u05sse2(r2, Sleef_powq2_u10sse2(x2, y2)));
maxError = Sleef_fmaxq2_sse2(maxError, q);
}
Sleef_printf("Max error: %Qg\n",
Sleef_fmaxq1_purec(Sleef_getq2_sse2(maxError, 0), Sleef_getq2_sse2(maxError, 1)));
//
cudaFree(yd);
cudaFree(xd);
cudaFree(rd);
return 0;
}
Source code for testing CUDA functions with CPU functions
Convert QP number to double-precision number
Synopsis
#include <sleefquadinline_cuda.h>
__device__ double Sleef_cast_to_doubleq1_cuda( Sleef_quadx1 a );
Description
These functions convert a QP FP value to a double-precision value.
Convert double-precision number to QP number
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_cast_from_doubleq1_cuda( double a );
Description
These functions convert a double-precision value to a QP FP value.
Convert QP number to 64-bit signed integer
Synopsis
#include <sleefquadinline_cuda.h>
__device__ int64_t Sleef_cast_to_int64q1_cuda( Sleef_quadx1 a );
Description
These functions convert a QP FP value to a 64-bit signed integer.
Convert 64-bit signed integer to QP number
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_cast_from_int64q1_cuda( int64_t a );
Description
These functions convert a 64-bit signed integer to a QP FP value.
Convert QP number to 64-bit unsigned integer
Synopsis
#include <sleefquadinline_cuda.h>
__device__ uint64_t Sleef_cast_to_uint64q1_cuda( Sleef_quadx1 a );
Description
These functions convert a QP FP value to a 64-bit signed integer.
Convert 64-bit unsigned integer to QP number
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_cast_from_uint64q1_cuda( uint64_t a );
Description
These functions convert a 64-bit unsigned integer to a QP FP value.
QP comparison functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ int32_t Sleef_icmpltq1_cuda( Sleef_quadx1 a, Sleef_quadx1 b );
__device__ int32_t Sleef_icmpleq1_cuda( Sleef_quadx1 a, Sleef_quadx1 b );
__device__ int32_t Sleef_icmpgtq1_cuda( Sleef_quadx1 a, Sleef_quadx1 b );
__device__ int32_t Sleef_icmpgeq1_cuda( Sleef_quadx1 a, Sleef_quadx1 b );
__device__ int32_t Sleef_icmpeqq1_cuda( Sleef_quadx1 a, Sleef_quadx1 b );
__device__ int32_t Sleef_icmpneq1_cuda( Sleef_quadx1 a, Sleef_quadx1 b );
Description
These are the vectorized functions of comparison functions.
QP comparison functions of the second kind
Synopsis
#include <sleefquadinline_cuda.h>
__device__ int32_t Sleef_icmpq1_cuda( Sleef_quadx1 a, Sleef_quadx1 b );
Description
These are the vectorized functions of Sleef_icmpq1_purec.
Check orderedness
Synopsis
#include <sleefquadinline_cuda.h>
__device__ int32_t Sleef_iunordq1_cuda( Sleef_quadx1 a, Sleef_quadx1 b );
Description
These are the vectorized functions of Sleef_iunordq1_purec.
Select elements
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_iselectq1_cuda( int32_t c, Sleef_quadx1 a, Sleef_quadx1 b );
Description
These are the vectorized functions that operate in the same way as the ternary operator.
QP functions for basic arithmetic operations
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_addq1_u05cuda( Sleef_quadx1 a, Sleef_quadx1 b );
__device__ Sleef_quadx1 Sleef_subq1_u05cuda( Sleef_quadx1 a, Sleef_quadx1 b );
__device__ Sleef_quadx1 Sleef_mulq1_u05cuda( Sleef_quadx1 a, Sleef_quadx1 b );
__device__ Sleef_quadx1 Sleef_divq1_u05cuda( Sleef_quadx1 a, Sleef_quadx1 b );
__device__ Sleef_quadx1 Sleef_negq1_cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of the basic arithmetic operations.
Square root functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_sqrtq1_u05cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_sqrtq1_u05purec.
Sine functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_sinq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_sinq1_u10purec.
Cosine functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_cosq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_cosq1_u10purec.
Tangent functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_tanq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_tanq1_u10purec.
Arc sine functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_asinq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_asinq1_u10purec.
Arc cosine functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_acosq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_acosq1_u10purec.
Arc tangent functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_atanq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_atanq1_u10purec.
Base-e exponential functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_expq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_expq1_u10purec.
Base-2 exponential functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_exp2q1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_exp2q1_u10purec.
Base-10 exponentail
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_exp10q1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_exp10q1_u10purec.
Base-e exponential functions minus 1
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_expm1q1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_expm1q1_u10purec.
Natural logarithmic functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_logq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_logq1_u10purec.
Base-2 logarithmic functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_log2q1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_log2q1_u10purec.
Base-10 logarithmic functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_log10q1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_log10q1_u10purec.
Logarithm of one plus argument
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_log1pq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_log1pq1_u10purec.
Power functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_powq1_u10cuda( Sleef_quadx1 x, Sleef_quadx1 y );
Description
These are the vectorized functions of Sleef_powq1_u10purec.
Hyperbolic sine functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_sinhq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_sinhq1_u10purec.
Hyperbolic cosine functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_coshq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_coshq1_u10purec.
Hyperbolic tangent functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_tanhq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_tanhq1_u10purec.
Inverse hyperbolic sine functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_asinhq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_asinhq1_u10purec.
Inverse hyperbolic cosine functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_acoshq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_acoshq1_u10purec.
Inverse hyperbolic tangent functions
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_atanhq1_u10cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_atanhq1_u10purec.
Round to integer towards zero
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_truncq1_cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_truncq1_purec.
Round to integer towards minus infinity
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_floorq1_cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_floorq1_purec.
Round to integer towards plus infinity
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_ceilq1_cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_ceilq1_purec.
Round to integer away from zero
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_roundq1_cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_roundq1_purec.
Round to integer, ties round to even
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_rintq1_cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_rintq1_purec.
Absolute value
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_fabsq1_cuda( Sleef_quadx1 a );
Description
These are the vectorized functions of Sleef_fabsq1_purec.
Copy sign of a number
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_copysignq1_cuda( Sleef_quadx1 x, Sleef_quadx1 y );
Description
These are the vectorized functions of Sleef_copysignq1_purec.
Maximum of two numbers
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_fmaxq1_cuda( Sleef_quadx1 x, Sleef_quadx1 y );
Description
These are the vectorized functions of Sleef_fmaxq1_purec.
Minimum of two numbers
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_fminq1_cuda( Sleef_quadx1 x, Sleef_quadx1 y );
Description
These are the vectorized functions of Sleef_fminq1_purec.
Positive difference
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_fdimq1_u05cuda( Sleef_quadx1 x, Sleef_quadx1 y );
Description
These are the vectorized functions of Sleef_fdimq1_u05purec.
Floating point remainder
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_fmodq1_cuda( Sleef_quadx1 x, Sleef_quadx1 y );
Description
These are the vectorized functions of Sleef_fmodq1_purec.
Floating point remainder
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_remainderq1_cuda( Sleef_quadx1 x, Sleef_quadx1 y );
Description
These are the vectorized functions of Sleef_remainderq1_purec.
Split a number to fractional and integral components
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_frexpq1_cuda( Sleef_quadx1 x, int32_t * ptr );
Description
These are the vectorized functions of Sleef_frexpq1_purec.
Break a number into integral and fractional parts
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_modfq1_cuda( Sleef_quadx1 x, Sleef_quadx1 * ptr );
Description
These are the vectorized functions of Sleef_modfq1_purec.
2D Euclidian distance
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_hypotq1_u05cuda( Sleef_quadx1 x, Sleef_quadx1 y );
Description
These are the vectorized functions of Sleef_hypotq1_u05purec.
Fused multiply and accumulate
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_fmaq1_u05cuda( Sleef_quadx1 x, Sleef_quadx1 y, Sleef_quadx1 z );
Description
These are the vectorized functions of Sleef_fmaq1_u05purec.
Multiply by integral power of 2
Synopsis
#include <sleefquadinline_cuda.h>
__device__ Sleef_quadx1 Sleef_ldexpq1_cuda( Sleef_quadx1 x, int32_t e );
Description
These are the vectorized functions of Sleef_ldexpq1_purec.
Integer exponent of an FP number
Synopsis
#include <sleefquadinline_cuda.h>
__device__ int32_t Sleef_ilogbq1_cuda( Sleef_quadx1 x );
Description
These are the vectorized functions of Sleef_ilogbq1_purec.