// pbrt/kernels/src/lib.rs
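//! GPU-side state, kernel-launch helpers, and per-kernel profiling for the
//! wavefront integrator. `gpu_init` creates a single CUDA context and stream
//! behind a global mutex; `gpu_parallel_for` launches kernels on that stream
//! and times them with CUDA events, which `report_kernel_stats` summarizes.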

pub mod wavefront;
pub mod workitem;

use std::error::Error;
use std::ffi::c_void;
use std::sync::Arc;

use cuda_std::prelude::*;
use cust::context::{CacheConfig, CurrentContext, ResourceLimit};
use cust::device::DeviceAttribute;
use cust::memory::{DeviceCopy, DeviceMemory};
use cust::prelude::*;
use lazy_static::lazy_static;
use parking_lot::Mutex;

use crate::Float;
use crate::core::geometry::{Normal, Point, Vector};
use crate::core::medium::Medium;
use crate::core::options::{PBRTOptions, get_options};
use crate::impl_gpu_traits;
use crate::impl_math_gpu_traits;
use crate::spectra::{SampledSpectrum, SampledWavelengths};
use crate::utils::interval::Interval;

pub use workitem::{
    EscapedRayQueue, GetBSSRDFAndProbeRayQueue, HitAreaLightQueue, MaterialEvalQueue,
    MediumSampleQueue, MediumScatterQueue, PixelSampleStateStorage, RayQueue, ShadowRayQueue,
    SubsurfaceScatterQueue,
};
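
/// 16-byte-aligned, `#[repr(C)]` storage for four `f32` components, convertible
/// to and from [`Vec4`] so vector data can be shared with GPU kernels.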
#[repr(C, align(16))]
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct Float4 {
    pub v: [f32; 4],
}

pub type Vec4 = Vector<Float, 4>;

impl From<Vec4> for Float4 {
    #[inline]
    fn from(vec: Vec4) -> Self {
        Self { v: vec.0 }
    }
}

impl From<Float4> for Vec4 {
    #[inline]
    fn from(storage: Float4) -> Self {
        Vector(storage.v)
    }
}

impl_math_gpu_traits!(Vector);
impl_math_gpu_traits!(Normal);
impl_math_gpu_traits!(Point);
impl_gpu_traits!(Interval);
impl_gpu_traits!(Float4);
impl_gpu_traits!(SampledSpectrum);
impl_gpu_traits!(SampledWavelengths);
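
/// Accumulated timing for one kernel, identified by its launch description:
/// launch count plus total, minimum, and maximum time in milliseconds.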
struct KernelStats {
    description: String,
    num_launches: usize,
    sum_ms: f32,
    min_ms: f32,
    max_ms: f32,
}

impl KernelStats {
    fn new(description: &str) -> Self {
        Self {
            description: description.to_string(),
            num_launches: 0,
            sum_ms: 0.0,
            min_ms: 0.0,
            max_ms: 0.0,
        }
    }
}
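
/// A start/stop CUDA event pair wrapping a single kernel launch; `sync` waits on
/// the stop event and folds the elapsed time into the associated `KernelStats`.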
struct ProfilerEvent {
    start: Event,
    stop: Event,
    active: bool,
    stats: Option<Arc<Mutex<KernelStats>>>,
}

impl ProfilerEvent {
    fn new() -> Result<Self, cust::error::CudaError> {
        let start = Event::new(EventFlags::DEFAULT)?;
        let stop = Event::new(EventFlags::DEFAULT)?;
        Ok(Self {
            start,
            stop,
            active: false,
            stats: None,
        })
    }

    fn sync(&mut self) {
        if !self.active {
            return;
        }
        if self.stop.synchronize().is_ok() {
            // Fold the elapsed time between start and stop into this kernel's stats.
            match self.stop.elapsed_time_f32(&self.start) {
                Ok(ms) => {
                    if let Some(stats_arc) = &self.stats {
                        let mut stats = stats_arc.lock();
                        stats.num_launches += 1;
                        if stats.num_launches == 1 {
                            stats.sum_ms = ms;
                            stats.min_ms = ms;
                            stats.max_ms = ms;
                        } else {
                            stats.sum_ms += ms;
                            stats.min_ms = stats.min_ms.min(ms);
                            stats.max_ms = stats.max_ms.max(ms);
                        }
                    }
                }
                Err(e) => log::error!("Failed to get elapsed time: {:?}", e),
            }
        }
        self.active = false;
    }
}

// --- Profiler Manager ---
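/// Owns the per-kernel stats and a round-robin pool of event pairs used to time
/// individual launches.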
struct Profiler {
    kernel_stats: Vec<Arc<Mutex<KernelStats>>>,
    event_pool: Vec<ProfilerEvent>,
    pool_offset: usize,
}

impl Profiler {
    fn new() -> Self {
        Self {
            kernel_stats: Vec::new(),
            event_pool: Vec::new(),
            pool_offset: 0,
        }
    }

    /// Hands out the next event from the pool, recycling slots round-robin.
    /// The returned reference is valid for as long as the borrow of `self`.
    fn prepare<'a>(&'a mut self, description: &str) -> &'a mut ProfilerEvent {
        // Lazily create a fixed-size pool of events on first use.
        if self.event_pool.is_empty() {
            for _ in 0..128 {
                if let Ok(e) = ProfilerEvent::new() {
                    self.event_pool.push(e);
                }
            }
        }
        // Wrap around once the pool has been exhausted, recycling the oldest slot.
        if self.pool_offset >= self.event_pool.len() {
            self.pool_offset = 0;
        }
        let idx = self.pool_offset;
        self.pool_offset += 1;
        let pe = &mut self.event_pool[idx];
        // If this slot still holds an unresolved measurement, flush it first.
        if pe.active {
            pe.sync();
        }
        pe.active = true;
        pe.stats = None;
        // Find the stats entry for this description, creating it on first use.
        let mut found = None;
        for s in &self.kernel_stats {
            if s.lock().description == description {
                found = Some(s.clone());
                break;
            }
        }
        if found.is_none() {
            let new_stats = Arc::new(Mutex::new(KernelStats::new(description)));
            self.kernel_stats.push(new_stats.clone());
            found = Some(new_stats);
        }
        pe.stats = found;
        pe
    }
}
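
/// Process-wide CUDA state: the active context, the stream all kernels are
/// launched on, and the kernel profiler.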
pub struct GpuState {
    context: Context,
    stream: Stream,
    profiler: Profiler,
}

impl GpuState {
    fn init(device_index: u32) -> Result<Self, Box<dyn Error>> {
        cust::init(CudaFlags::empty())?;
        let device = Device::get_device(device_index)?;
        let name = device.name().unwrap_or_else(|_| "Unknown".into());
        let memory = device.total_memory().unwrap_or(0);
        let memory_gb = memory as f64 / (1024.0 * 1024.0 * 1024.0);
        let major = device
            .get_attribute(DeviceAttribute::ComputeCapabilityMajor)
            .unwrap_or(0);
        let minor = device
            .get_attribute(DeviceAttribute::ComputeCapabilityMinor)
            .unwrap_or(0);
        log::info!(
            "Selected GPU: {} ({:.2} GB, SM {}.{})",
            name,
            memory_gb,
            major,
            minor
        );
        let has_unified = device
            .get_attribute(DeviceAttribute::UnifiedAddressing)
            .unwrap_or(0);
        if has_unified == 0 {
            panic!("Selected GPU does not support unified addressing.");
        }
        let context = Context::new(device)?;
        CurrentContext::set_resource_limit(ResourceLimit::StackSize, 8192)?;
        let stack_size = CurrentContext::get_resource_limit(ResourceLimit::StackSize)?;
        log::info!("Set GPU stack size to {}", stack_size);
        CurrentContext::set_resource_limit(ResourceLimit::PrintfFifoSize, 32 * 1024 * 1024)?;
        CurrentContext::set_cache_config(CacheConfig::PreferL1)?;
        let stream = Stream::new(StreamFlags::DEFAULT, None)?;
        Ok(Self {
            context,
            stream,
            profiler: Profiler::new(),
        })
    }
}

lazy_static! {
    static ref GPU_STATE: Mutex<Option<GpuState>> = Mutex::new(None);
}
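
/// Initializes the global GPU state for the device selected in the options.
/// Does nothing unless `use_gpu` is set; panics if initialization fails.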
pub fn gpu_init() {
    if !get_options().use_gpu {
        return;
    }
    let device_id = get_options().gpu_device.unwrap_or(0);
    log::info!("Initializing GPU Device {}", device_id);
    match GpuState::init(device_id) {
        Ok(state) => {
            #[cfg(feature = "use_nvtx")]
            nvtx::name_thread("MAIN_THREAD");
            *GPU_STATE.lock() = Some(state);
        }
        Err(e) => {
            panic!("Failed to initialize GPU: {:?}", e);
        }
    }
}
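
/// Binds the shared CUDA context to the calling thread. Worker threads should
/// call this before issuing any GPU work.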
pub fn gpu_thread_init() {
    if let Some(state) = GPU_STATE.lock().as_ref() {
        if let Err(e) = CurrentContext::set_current(&state.context) {
            log::error!("Failed to set CUDA context for thread: {:?}", e);
        }
    }
}
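
/// Blocks until all work previously queued on the GPU stream has completed.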
pub fn gpu_wait() {
    let mut guard = GPU_STATE.lock();
    if let Some(state) = guard.as_mut() {
        if let Err(e) = state.stream.synchronize() {
            log::error!("GPU Wait failed: {:?}", e);
        }
    }
}

/// Launches a parallel for loop on the GPU.
///
/// # Arguments
/// * `description`: Name for profiling.
/// * `n_items`: Total items (threads).
/// * `function`: Compiled kernel function handle.
/// * `params`: Kernel parameters (must be DeviceCopy).
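///
/// # Example
///
/// A minimal usage sketch (marked `ignore`, so it is not compiled as a doc-test);
/// the PTX source, kernel name, and parameter struct are illustrative placeholders:
///
/// ```ignore
/// use cust::module::Module;
///
/// // Parameters mirrored by the kernel; must be plain-old-data (`DeviceCopy`).
/// #[repr(C)]
/// #[derive(Clone, Copy)]
/// struct ClearParams {
///     value: f32,
/// }
/// unsafe impl cust::memory::DeviceCopy for ClearParams {}
///
/// gpu_init();
/// let module = Module::from_ptx(include_str!("kernels.ptx"), &[])?;
/// let clear = module.get_function("clear_buffer")?;
/// gpu_parallel_for("ClearBuffer", 1920 * 1080, &clear, &ClearParams { value: 0.0 });
/// gpu_wait();
/// ```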
pub fn gpu_parallel_for<T: DeviceCopy>(
    description: &str,
    n_items: i32,
    function: &Function,
    params: &T,
) {
    #[cfg(feature = "use_nvtx")]
    nvtx::range_push(description);
    let mut guard = GPU_STATE.lock();
    let state = guard.as_mut().expect("GPU not initialized");
    let (_, block_size) = match function.suggested_launch_configuration(0, 0.into()) {
        Ok(cfg) => cfg,
        Err(e) => panic!(
            "Failed to calculate launch config for {}: {:?}",
            description, e
        ),
    };
    #[cfg(debug_assertions)]
    log::debug!("[{}] Block size: {}", description, block_size);
    let grid_size = (n_items as u32 + block_size - 1) / block_size;
    let stream = &state.stream;
    let profiler = &mut state.profiler;
    // Record the start event. `prepare` advances `pool_offset`, so the slot it
    // handed out is `pool_offset - 1` afterwards; remember it so the matching
    // stop event can be recorded after the launch.
    {
        let pe = profiler.prepare(description);
        if let Err(e) = pe.start.record(stream) {
            log::error!("Failed to record start event: {:?}", e);
        }
    }
    let event_idx = profiler.pool_offset - 1;
    let params_ptr = params as *const T as *mut c_void;
    let n_items_ptr = &n_items as *const i32 as *mut c_void;
    let args = [params_ptr, n_items_ptr];
    unsafe {
        if let Err(e) =
            state
                .stream
                .launch(function, (grid_size, 1, 1), (block_size, 1, 1), 0, &args)
        {
            panic!("CUDA Launch failed for {}: {:?}", description, e);
        }
    }
    // Record the stop event on the same pool slot that `prepare` handed out.
    if event_idx < profiler.event_pool.len() {
        let pe = &mut profiler.event_pool[event_idx];
        if let Err(e) = pe.stop.record(stream) {
            log::error!("Failed to record stop event: {:?}", e);
        }
    }
    #[cfg(debug_assertions)]
    let _ = state.stream.synchronize();
    #[cfg(feature = "use_nvtx")]
    nvtx::range_pop();
}
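
/// Synchronizes the stream, resolves any pending profiling events, and prints a
/// per-kernel timing summary (launch count, total time, share of total, average).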
pub fn report_kernel_stats() {
    let mut guard = GPU_STATE.lock();
    if let Some(state) = guard.as_mut() {
        let _ = state.stream.synchronize();
        // Resolve all pending events so every launch is reflected in the stats.
        for pe in &mut state.profiler.event_pool {
            if pe.active {
                pe.sync();
            }
        }
        let mut total_ms = 0.0;
        for s in &state.profiler.kernel_stats {
            total_ms += s.lock().sum_ms;
        }
        println!("Wavefront Kernel Profile:");
        for s in &state.profiler.kernel_stats {
            let stats = s.lock();
            let percent = if total_ms > 0.0 {
                100.0 * stats.sum_ms / total_ms
            } else {
                0.0
            };
            println!(
                "  {:<45} {:5} launches {:9.2} ms / {:5.1}% (avg {:6.3} ms)",
                stats.description,
                stats.num_launches,
                stats.sum_ms,
                percent,
                if stats.num_launches > 0 {
                    stats.sum_ms / stats.num_launches as f32
                } else {
                    0.0
                }
            );
        }
        println!("\nTotal: {:.2} ms", total_ms);
    }
}
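
/// Intended to fill `dst` with `value` bytes. Currently a stub: the raw memset is
/// not exposed through the safe `cust` API used here, so it only logs a warning.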
pub fn gpu_memset<T: DeviceCopy>(dst: &mut DeviceSlice<T>, value: u8) {
    unsafe {
        // The raw device pointer (CUdeviceptr) and byte length that a driver-level
        // memset would need; unused until that call is wired up.
        let _ptr = dst.as_raw_ptr();
        let _len = dst.len() * std::mem::size_of::<T>();
    }
    let _ = value;
    log::warn!("gpu_memset requested but raw memset not exposed via safe cust API yet.");
}
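
/// Marks a concrete type as GPU-transferable by implementing `DeviceCopy`,
/// `bytemuck::Zeroable`, and `bytemuck::Pod`. The caller is responsible for
/// ensuring the type really satisfies those traits' safety requirements
/// (fixed layout, no padding or invalid bit patterns).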
#[macro_export]
macro_rules! impl_gpu_traits {
    ($name:ty) => {
        unsafe impl cust::memory::DeviceCopy for $name {}
        unsafe impl bytemuck::Zeroable for $name {}
        unsafe impl bytemuck::Pod for $name {}
    };
}
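
/// Like `impl_gpu_traits`, but for the generic math tuple structs
/// (`Vector<T, N>`, `Normal<T, N>`, `Point<T, N>`), forwarding the safety
/// requirements to the element type `T`.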
#[macro_export]
macro_rules! impl_math_gpu_traits {
    ($Struct:ident) => {
        #[cfg(feature = "use_gpu")]
        unsafe impl<T, const N: usize> cust::memory::DeviceCopy for $Struct<T, N> where
            T: cust::memory::DeviceCopy + Copy
        {
        }
        unsafe impl<T, const N: usize> bytemuck::Zeroable for $Struct<T, N> where
            T: bytemuck::Zeroable
        {
        }
        unsafe impl<T, const N: usize> bytemuck::Pod for $Struct<T, N> where T: bytemuck::Pod {}
    };
}