use cuda_std::prelude::*; pub mod wavefront; pub mod workitem; use cust::context::{CacheConfig, CurrentContext, ResourceLimit}; use cust::device::DeviceAttribute; use cust::memory::{DeviceCopy, DeviceMemory}; use cust::prelude::*; use lazy_static::lazy_static; use parking_lot::Mutex; use std::error::Error; use std::ffi::c_void; use std::sync::Arc; use crate::Float; use crate::core::geometry::{Normal, Point, Vector}; use crate::core::medium::Medium; use crate::core::options::{PBRTOptions, get_options}; use crate::impl_gpu_traits; use crate::impl_math_gpu_traits; use crate::spectra::{SampledSpectrum, SampledWavelengths}; use crate::utils::interval::Interval; pub use workitem::{ EscapedRayQueue, GetBSSRDFAndProbeRayQueue, HitAreaLightQueue, MaterialEvalQueue, MediumSampleQueue, MediumScatterQueue, PixelSampleStateStorage, RayQueue, ShadowRayQueue, SubsurfaceScatterQueue, }; #[repr(C, align(16))] #[derive(Clone, Copy, Debug, Default, PartialEq)] pub struct Float4 { pub v: [f32; 4], } pub type Vec4 = Vector; impl From for Float4 { #[inline] fn from(vec: Vector) -> Self { Self { v: vec.0 } } } impl From for Vec4 { #[inline] fn from(storage: Float4) -> Self { Vector(storage.v) } } impl_math_gpu_traits!(Vector); impl_math_gpu_traits!(Normal); impl_math_gpu_traits!(Point); impl_gpu_traits!(Interval); impl_gpu_traits!(Float4); impl_gpu_traits!(SampledSpectrum); impl_gpu_traits!(SampledWavelengths); struct KernelStats { description: String, num_launches: usize, sum_ms: f32, min_ms: f32, max_ms: f32, } impl KernelStats { fn new(description: &str) -> Self { Self { description: description.to_string(), num_launches: 0, sum_ms: 0.0, min_ms: 0.0, max_ms: 0.0, } } } struct ProfilerEvent { start: Event, stop: Event, active: bool, stats: Option>>, } impl ProfilerEvent { fn new() -> Result { let start = Event::new(EventFlags::DEFAULT)?; let stop = Event::new(EventFlags::DEFAULT)?; Ok(Self { start, stop, active: false, stats: None, }) } fn sync(&mut self) { if !self.active { return; } if self.stop.synchronize().is_ok() { // Check timing between start and stop match self.stop.elapsed_time_f32(&self.start) { Ok(ms) => { if let Some(stats_arc) = &self.stats { let mut stats = stats_arc.lock(); stats.num_launches += 1; if stats.num_launches == 1 { stats.sum_ms = ms; stats.min_ms = ms; stats.max_ms = ms; } else { stats.sum_ms += ms; stats.min_ms = stats.min_ms.min(ms); stats.max_ms = stats.max_ms.max(ms); } } } Err(e) => log::error!("Failed to get elapsed time: {:?}", e), } } self.active = false; } } // --- Profiler Manager --- struct Profiler { kernel_stats: Vec>>, event_pool: Vec, pool_offset: usize, } impl Profiler { fn new() -> Self { Self { kernel_stats: Vec::new(), event_pool: Vec::new(), pool_offset: 0, } } /// Prepares an event from the pool. /// Returns a mutable reference to the event, valid as long as the borrow of self. fn prepare<'a>(&'a mut self, description: &str) -> &'a mut ProfilerEvent { // Grow pool if empty or needed (simple heuristic) if self.event_pool.is_empty() { for _ in 0..128 { if let Ok(e) = ProfilerEvent::new() { self.event_pool.push(e); } } } if self.pool_offset >= self.event_pool.len() { self.pool_offset = 0; } let idx = self.pool_offset; self.pool_offset += 1; let pe = &mut self.event_pool[idx]; if pe.active { pe.sync(); } pe.active = true; pe.stats = None; // Find or create stats let mut found = None; for s in &self.kernel_stats { if s.lock().description == description { found = Some(s.clone()); break; } } if found.is_none() { let new_stats = Arc::new(Mutex::new(KernelStats::new(description))); self.kernel_stats.push(new_stats.clone()); found = Some(new_stats); } pe.stats = found; pe } } pub struct GpuState { context: Context, stream: Stream, profiler: Profiler, } impl GpuState { fn init(device_index: u32) -> Result> { cust::init(CudaFlags::empty())?; let device = Device::get_device(device_index)?; let name = device.name().unwrap_or_else(|_| "Unknown".into()); let memory = device.total_memory().unwrap_or(0); let memory_gb = memory as f64 / (1024.0 * 1024.0 * 1024.0); let major = device .get_attribute(DeviceAttribute::ComputeCapabilityMajor) .unwrap_or(0); let minor = device .get_attribute(DeviceAttribute::ComputeCapabilityMinor) .unwrap_or(0); log::info!( "Selected GPU: {} ({:.2} GB, SM {}.{})", name, memory_gb, major, minor ); let has_unified = device .get_attribute(DeviceAttribute::UnifiedAddressing) .unwrap_or(0); if has_unified == 0 { panic!("Selected GPU does not support unified addressing."); } let context = Context::new(device)?; CurrentContext::set_resource_limit(ResourceLimit::StackSize, 8192)?; let stack_size = CurrentContext::get_resource_limit(ResourceLimit::StackSize)?; log::info!("Reset stack size to {}", stack_size); CurrentContext::set_resource_limit(ResourceLimit::PrintfFifoSize, 32 * 1024 * 1024)?; CurrentContext::set_cache_config(CacheConfig::PreferL1)?; let stream = Stream::new(StreamFlags::DEFAULT, None)?; Ok(Self { context, stream, profiler: Profiler::new(), }) } } lazy_static! { static ref GPU_STATE: Mutex> = Mutex::new(None); } pub fn gpu_init() { if !get_options().use_gpu { return; } let device_id = get_options().gpu_device.unwrap_or(0); log::info!("Initializing GPU Device {}", device_id); match GpuState::init(device_id) { Ok(state) => { #[cfg(feature = "use_nvtx")] nvtx::name_thread("MAIN_THREAD"); *GPU_STATE.lock() = Some(state); } Err(e) => { panic!("Failed to initialize GPU: {:?}", e); } } } pub fn gpu_thread_init() { if let Some(state) = GPU_STATE.lock().as_ref() { if let Err(e) = CurrentContext::set_current(&state.context) { log::error!("Failed to set CUDA context for thread: {:?}", e); } } } pub fn gpu_wait() { let mut guard = GPU_STATE.lock(); if let Some(state) = guard.as_mut() { if let Err(e) = state.stream.synchronize() { log::error!("GPU Wait failed: {:?}", e); } } } /// Launches a parallel for loop on the GPU. /// /// # Arguments /// * `description`: Name for profiling. /// * `n_items`: Total items (threads). /// * `function`: Compiled kernel function handle. /// * `params`: Kernel parameters (must be DeviceCopy). pub fn gpu_parallel_for( description: &str, n_items: i32, function: &Function, params: &T, ) { #[cfg(feature = "use_nvtx")] nvtx::range_push(description); let mut guard = GPU_STATE.lock(); let state = guard.as_mut().expect("GPU not initialized"); let (_, block_size) = match function.suggested_launch_configuration(0, 0.into()) { Ok(cfg) => cfg, Err(e) => panic!( "Failed to calculate launch config for {}: {:?}", description, e ), }; #[cfg(debug_assertions)] log::debug!("[{}] Block size: {}", description, block_size); let grid_size = (n_items as u32 + block_size - 1) / block_size; let stream = &state.stream; let profiler = &mut state.profiler; // Save the index we are about to use so we can retrieve the STOP event later let event_idx = profiler.pool_offset; { let pe = profiler.prepare(description); if let Err(e) = pe.start.record(stream) { log::error!("Failed to record start event: {:?}", e); } } let params_ptr = params as *const T as *mut c_void; let n_items_ptr = &n_items as *const i32 as *mut c_void; let args = [params_ptr, n_items_ptr]; unsafe { if let Err(e) = state .stream .launch(function, (grid_size, 1, 1), (block_size, 1, 1), 0, &args) { panic!("CUDA Launch failed for {}: {:?}", description, e); } } // Retrieve the specific event we just set up. // Pool_offset was incremented in prepare(). // If event_idx was the one used, the event is at event_idx. if event_idx < profiler.event_pool.len() { let pe = &mut profiler.event_pool[event_idx]; if let Err(e) = pe.stop.record(stream) { log::error!("Failed to record stop event: {:?}", e); } } #[cfg(debug_assertions)] let _ = state.stream.synchronize(); #[cfg(feature = "use_nvtx")] nvtx::range_pop(); } pub fn report_kernel_stats() { let mut guard = GPU_STATE.lock(); if let Some(state) = guard.as_mut() { let _ = state.stream.synchronize(); // Process all pending events for pe in &mut state.profiler.event_pool { if pe.active { pe.sync(); } } let mut total_ms = 0.0; for s in &state.profiler.kernel_stats { total_ms += s.lock().sum_ms; } println!("Wavefront Kernel Profile:"); for s in &state.profiler.kernel_stats { let stats = s.lock(); let percent = if total_ms > 0.0 { 100.0 * stats.sum_ms / total_ms } else { 0.0 }; println!( " {:<45} {:5} launches {:9.2} ms / {:5.1}% (avg {:6.3})", stats.description, stats.num_launches, stats.sum_ms, percent, if stats.num_launches > 0 { stats.sum_ms / stats.num_launches as f32 } else { 0.0 } ); } println!("\nTotal: {:.2} ms", total_ms); } } pub fn gpu_memset(dst: &mut DeviceSlice, value: u8) { unsafe { let ptr = dst.as_raw_ptr(); // Returns CUdeviceptr (u64) let len = dst.len() * std::mem::size_of::(); // We need the `cust::external::cuda` or equivalent sys crate function log::warn!("gpu_memset requested but raw memset not exposed via safe cust API yet."); } } #[macro_export] macro_rules! impl_gpu_traits { ($name:ty) => { unsafe impl cust::memory::DeviceCopy for $name {} unsafe impl bytemuck::Zeroable for $name {} unsafe impl bytemuck::Pod for $name {} }; } #[macro_export] macro_rules! impl_math_gpu_traits { ($Struct:ident) => { #[cfg(feature = "use_gpu")] unsafe impl cust::memory::DeviceCopy for $Struct where T: cust::memory::DeviceCopy + Copy { } unsafe impl bytemuck::Zeroable for $Struct where T: bytemuck::Zeroable { } unsafe impl bytemuck::Pod for $Struct where T: bytemuck::Pod {} }; }