// Element-wise kernels. All three use a 1-D launch (only the .x components of
// blockIdx/blockDim/threadIdx are read) and map one thread to one element.
// Threads whose global index is >= len exit without touching memory, so the
// launch may contain more threads than elements; elements beyond the number
// of launched threads are NOT processed.
//
// The global index is computed in 64-bit arithmetic: blockIdx.x * blockDim.x
// evaluated in 32 bits can wrap when a launch exceeds 2^32 threads, and a
// wrapped index would slip past the bounds check and process a low element a
// second time.

// Multiplies data[i] by `scale`, in place, for every i < len.
//   data  : array of at least `len` floats, read and written.
//   len   : number of elements to process.
//   scale : factor applied to each element.
extern "C" __global__ void scale_array(float* data, unsigned int len, float scale) {
    const unsigned long long idx =
        static_cast<unsigned long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= len) return;
    data[idx] *= scale;
}

// Writes c[i] = a[i] + b[i] for every i < len.
//   a, b : input arrays of at least `len` floats (read only).
//   c    : output array of at least `len` floats.
//   len  : number of elements to process.
// The pointers are declared __restrict__, so the caller must not pass a `c`
// that overlaps `a` or `b`.
extern "C" __global__ void add_arrays(
    const float* __restrict__ a,
    const float* __restrict__ b,
    float* __restrict__ c,
    unsigned int len
) {
    const unsigned long long idx =
        static_cast<unsigned long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= len) return;
    c[idx] = a[idx] + b[idx];
}

// Updates y[i] = a * x[i] + y[i], in place on y, for every i < len.
//   a   : scalar multiplier applied to x.
//   x   : input array of at least `len` floats (read only).
//   y   : array of at least `len` floats, read and written.
//   len : number of elements to process.
// The pointers are declared __restrict__, so `x` and `y` must not overlap.
extern "C" __global__ void saxpy(
    float a,
    const float* __restrict__ x,
    float* __restrict__ y,
    unsigned int len
) {
    const unsigned long long idx =
        static_cast<unsigned long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= len) return;
    y[idx] = a * x[idx] + y[idx];
}