447 lines
12 KiB
Rust
447 lines
12 KiB
Rust
use cuda_std::prelude::*;
|
|
|
|
pub mod wavefront;
|
|
pub mod workitem;
|
|
|
|
use cust::context::{CacheConfig, CurrentContext, ResourceLimit};
|
|
use cust::device::DeviceAttribute;
|
|
use cust::memory::{DeviceCopy, DeviceMemory};
|
|
use cust::prelude::*;
|
|
use lazy_static::lazy_static;
|
|
use parking_lot::Mutex;
|
|
use std::error::Error;
|
|
use std::ffi::c_void;
|
|
use std::sync::Arc;
|
|
|
|
use crate::Float;
|
|
use crate::core::geometry::{Normal, Point, Vector};
|
|
use crate::core::medium::Medium;
|
|
use crate::core::options::{PBRTOptions, get_options};
|
|
use crate::impl_gpu_traits;
|
|
use crate::impl_math_gpu_traits;
|
|
use crate::spectra::{SampledSpectrum, SampledWavelengths};
|
|
use crate::utils::interval::Interval;
|
|
|
|
pub use workitem::{
|
|
EscapedRayQueue, GetBSSRDFAndProbeRayQueue, HitAreaLightQueue, MaterialEvalQueue,
|
|
MediumSampleQueue, MediumScatterQueue, PixelSampleStateStorage, RayQueue, ShadowRayQueue,
|
|
SubsurfaceScatterQueue,
|
|
};
|
|
|
|
/// 16-byte-aligned, C-layout storage for four `f32` components, suitable for
/// passing to GPU kernels (the alignment matches CUDA's `float4`).
#[repr(C, align(16))]
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct Float4 {
    // The four packed components, in order.
    pub v: [f32; 4],
}
|
|
|
|
pub type Vec4 = Vector<Float, 4>;
|
|
|
|
impl From<Vec4> for Float4 {
    /// Repackages a 4-component math vector into the 16-byte-aligned GPU
    /// storage form.
    #[inline]
    fn from(vec: Vec4) -> Self {
        // The parameter is spelled `Vec4` to match the trait header; the
        // previous `Vector<f32, 4>` spelling silently hard-coded the
        // `crate::Float` alias to `f32`.
        Self { v: vec.0 }
    }
}
|
|
|
|
impl From<Float4> for Vec4 {
|
|
#[inline]
|
|
fn from(storage: Float4) -> Self {
|
|
Vector(storage.v)
|
|
}
|
|
}
|
|
|
|
// Blanket GPU marker-trait implementations: the generic math wrappers get
// element-type-forwarding impls, the concrete storage/spectral types get
// per-type impls.
impl_math_gpu_traits!(Vector);
impl_math_gpu_traits!(Normal);
impl_math_gpu_traits!(Point);
impl_gpu_traits!(Interval);
impl_gpu_traits!(Float4);
impl_gpu_traits!(SampledSpectrum);
impl_gpu_traits!(SampledWavelengths);
|
|
|
|
/// Aggregated timing statistics for one kernel, keyed by its description.
struct KernelStats {
    // Human-readable kernel name used for matching and reporting.
    description: String,
    // Number of launches folded in so far.
    num_launches: usize,
    // Total elapsed GPU time across all launches, in milliseconds.
    sum_ms: f32,
    // Fastest single launch in milliseconds (0.0 until the first launch).
    min_ms: f32,
    // Slowest single launch in milliseconds (0.0 until the first launch).
    max_ms: f32,
}

impl KernelStats {
    /// Creates an empty stats record for the kernel named `description`;
    /// all counters start at zero.
    fn new(description: &str) -> Self {
        KernelStats {
            description: description.to_owned(),
            num_launches: 0,
            sum_ms: 0.0,
            min_ms: 0.0,
            max_ms: 0.0,
        }
    }
}
|
|
|
|
/// A start/stop CUDA event pair tracking one in-flight kernel launch.
struct ProfilerEvent {
    // Event recorded immediately before the launch.
    start: Event,
    // Event recorded immediately after the launch.
    stop: Event,
    // True while a launch has been recorded but not yet drained into its
    // stats record by `sync`.
    active: bool,
    // Stats record this launch's timing is folded into, if one was attached.
    stats: Option<Arc<Mutex<KernelStats>>>,
}
|
|
|
|
impl ProfilerEvent {
|
|
fn new() -> Result<Self, cust::error::CudaError> {
|
|
let start = Event::new(EventFlags::DEFAULT)?;
|
|
let stop = Event::new(EventFlags::DEFAULT)?;
|
|
Ok(Self {
|
|
start,
|
|
stop,
|
|
active: false,
|
|
stats: None,
|
|
})
|
|
}
|
|
|
|
fn sync(&mut self) {
|
|
if !self.active {
|
|
return;
|
|
}
|
|
|
|
if self.stop.synchronize().is_ok() {
|
|
// Check timing between start and stop
|
|
match self.stop.elapsed_time_f32(&self.start) {
|
|
Ok(ms) => {
|
|
if let Some(stats_arc) = &self.stats {
|
|
let mut stats = stats_arc.lock();
|
|
stats.num_launches += 1;
|
|
if stats.num_launches == 1 {
|
|
stats.sum_ms = ms;
|
|
stats.min_ms = ms;
|
|
stats.max_ms = ms;
|
|
} else {
|
|
stats.sum_ms += ms;
|
|
stats.min_ms = stats.min_ms.min(ms);
|
|
stats.max_ms = stats.max_ms.max(ms);
|
|
}
|
|
}
|
|
}
|
|
Err(e) => log::error!("Failed to get elapsed time: {:?}", e),
|
|
}
|
|
}
|
|
self.active = false;
|
|
}
|
|
}
|
|
|
|
// --- Profiler Manager ---
|
|
|
|
/// Accumulates per-kernel GPU timings via a round-robin pool of CUDA event
/// pairs.
struct Profiler {
    // One shared stats record per distinct kernel description.
    kernel_stats: Vec<Arc<Mutex<KernelStats>>>,
    // Reusable start/stop event pairs, recycled in launch order.
    event_pool: Vec<ProfilerEvent>,
    // Index of the next pool slot to hand out (wraps to 0 when exhausted).
    pool_offset: usize,
}
|
|
|
|
impl Profiler {
|
|
fn new() -> Self {
|
|
Self {
|
|
kernel_stats: Vec::new(),
|
|
event_pool: Vec::new(),
|
|
pool_offset: 0,
|
|
}
|
|
}
|
|
|
|
/// Prepares an event from the pool.
|
|
/// Returns a mutable reference to the event, valid as long as the borrow of self.
|
|
fn prepare<'a>(&'a mut self, description: &str) -> &'a mut ProfilerEvent {
|
|
// Grow pool if empty or needed (simple heuristic)
|
|
if self.event_pool.is_empty() {
|
|
for _ in 0..128 {
|
|
if let Ok(e) = ProfilerEvent::new() {
|
|
self.event_pool.push(e);
|
|
}
|
|
}
|
|
}
|
|
|
|
if self.pool_offset >= self.event_pool.len() {
|
|
self.pool_offset = 0;
|
|
}
|
|
|
|
let idx = self.pool_offset;
|
|
self.pool_offset += 1;
|
|
|
|
let pe = &mut self.event_pool[idx];
|
|
|
|
if pe.active {
|
|
pe.sync();
|
|
}
|
|
|
|
pe.active = true;
|
|
pe.stats = None;
|
|
|
|
// Find or create stats
|
|
let mut found = None;
|
|
for s in &self.kernel_stats {
|
|
if s.lock().description == description {
|
|
found = Some(s.clone());
|
|
break;
|
|
}
|
|
}
|
|
|
|
if found.is_none() {
|
|
let new_stats = Arc::new(Mutex::new(KernelStats::new(description)));
|
|
self.kernel_stats.push(new_stats.clone());
|
|
found = Some(new_stats);
|
|
}
|
|
|
|
pe.stats = found;
|
|
pe
|
|
}
|
|
}
|
|
|
|
/// Process-wide GPU resources: the CUDA context, the single stream all
/// kernels are launched on, and the kernel-timing profiler.
pub struct GpuState {
    context: Context,
    stream: Stream,
    profiler: Profiler,
}
|
|
|
|
impl GpuState {
|
|
fn init(device_index: u32) -> Result<Self, Box<dyn Error>> {
|
|
cust::init(CudaFlags::empty())?;
|
|
|
|
let device = Device::get_device(device_index)?;
|
|
|
|
let name = device.name().unwrap_or_else(|_| "Unknown".into());
|
|
let memory = device.total_memory().unwrap_or(0);
|
|
let memory_gb = memory as f64 / (1024.0 * 1024.0 * 1024.0);
|
|
|
|
let major = device
|
|
.get_attribute(DeviceAttribute::ComputeCapabilityMajor)
|
|
.unwrap_or(0);
|
|
let minor = device
|
|
.get_attribute(DeviceAttribute::ComputeCapabilityMinor)
|
|
.unwrap_or(0);
|
|
|
|
log::info!(
|
|
"Selected GPU: {} ({:.2} GB, SM {}.{})",
|
|
name,
|
|
memory_gb,
|
|
major,
|
|
minor
|
|
);
|
|
|
|
let has_unified = device
|
|
.get_attribute(DeviceAttribute::UnifiedAddressing)
|
|
.unwrap_or(0);
|
|
if has_unified == 0 {
|
|
panic!("Selected GPU does not support unified addressing.");
|
|
}
|
|
|
|
let context = Context::new(device)?;
|
|
|
|
CurrentContext::set_resource_limit(ResourceLimit::StackSize, 8192)?;
|
|
let stack_size = CurrentContext::get_resource_limit(ResourceLimit::StackSize)?;
|
|
log::info!("Reset stack size to {}", stack_size);
|
|
|
|
CurrentContext::set_resource_limit(ResourceLimit::PrintfFifoSize, 32 * 1024 * 1024)?;
|
|
CurrentContext::set_cache_config(CacheConfig::PreferL1)?;
|
|
|
|
let stream = Stream::new(StreamFlags::DEFAULT, None)?;
|
|
|
|
Ok(Self {
|
|
context,
|
|
stream,
|
|
profiler: Profiler::new(),
|
|
})
|
|
}
|
|
}
|
|
|
|
// Process-wide GPU state, guarded by a mutex; stays `None` until `gpu_init`
// has successfully initialized a device.
// NOTE(review): std::sync::OnceLock/LazyLock could replace lazy_static here —
// confirm the crate's MSRV before migrating.
lazy_static! {
    static ref GPU_STATE: Mutex<Option<GpuState>> = Mutex::new(None);
}
|
|
|
|
pub fn gpu_init() {
|
|
if !get_options().use_gpu {
|
|
return;
|
|
}
|
|
|
|
let device_id = get_options().gpu_device.unwrap_or(0);
|
|
log::info!("Initializing GPU Device {}", device_id);
|
|
|
|
match GpuState::init(device_id) {
|
|
Ok(state) => {
|
|
#[cfg(feature = "use_nvtx")]
|
|
nvtx::name_thread("MAIN_THREAD");
|
|
*GPU_STATE.lock() = Some(state);
|
|
}
|
|
Err(e) => {
|
|
panic!("Failed to initialize GPU: {:?}", e);
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn gpu_thread_init() {
|
|
if let Some(state) = GPU_STATE.lock().as_ref() {
|
|
if let Err(e) = CurrentContext::set_current(&state.context) {
|
|
log::error!("Failed to set CUDA context for thread: {:?}", e);
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn gpu_wait() {
|
|
let mut guard = GPU_STATE.lock();
|
|
if let Some(state) = guard.as_mut() {
|
|
if let Err(e) = state.stream.synchronize() {
|
|
log::error!("GPU Wait failed: {:?}", e);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Launches a parallel for loop on the GPU.
|
|
///
|
|
/// # Arguments
|
|
/// * `description`: Name for profiling.
|
|
/// * `n_items`: Total items (threads).
|
|
/// * `function`: Compiled kernel function handle.
|
|
/// * `params`: Kernel parameters (must be DeviceCopy).
|
|
pub fn gpu_parallel_for<T: DeviceCopy>(
|
|
description: &str,
|
|
n_items: i32,
|
|
function: &Function,
|
|
params: &T,
|
|
) {
|
|
#[cfg(feature = "use_nvtx")]
|
|
nvtx::range_push(description);
|
|
|
|
let mut guard = GPU_STATE.lock();
|
|
let state = guard.as_mut().expect("GPU not initialized");
|
|
|
|
let (_, block_size) = match function.suggested_launch_configuration(0, 0.into()) {
|
|
Ok(cfg) => cfg,
|
|
Err(e) => panic!(
|
|
"Failed to calculate launch config for {}: {:?}",
|
|
description, e
|
|
),
|
|
};
|
|
|
|
#[cfg(debug_assertions)]
|
|
log::debug!("[{}] Block size: {}", description, block_size);
|
|
|
|
let grid_size = (n_items as u32 + block_size - 1) / block_size;
|
|
|
|
let stream = &state.stream;
|
|
let profiler = &mut state.profiler;
|
|
|
|
// Save the index we are about to use so we can retrieve the STOP event later
|
|
let event_idx = profiler.pool_offset;
|
|
|
|
{
|
|
let pe = profiler.prepare(description);
|
|
if let Err(e) = pe.start.record(stream) {
|
|
log::error!("Failed to record start event: {:?}", e);
|
|
}
|
|
}
|
|
|
|
let params_ptr = params as *const T as *mut c_void;
|
|
let n_items_ptr = &n_items as *const i32 as *mut c_void;
|
|
let args = [params_ptr, n_items_ptr];
|
|
|
|
unsafe {
|
|
if let Err(e) =
|
|
state
|
|
.stream
|
|
.launch(function, (grid_size, 1, 1), (block_size, 1, 1), 0, &args)
|
|
{
|
|
panic!("CUDA Launch failed for {}: {:?}", description, e);
|
|
}
|
|
}
|
|
|
|
// Retrieve the specific event we just set up.
|
|
// Pool_offset was incremented in prepare().
|
|
// If event_idx was the one used, the event is at event_idx.
|
|
if event_idx < profiler.event_pool.len() {
|
|
let pe = &mut profiler.event_pool[event_idx];
|
|
if let Err(e) = pe.stop.record(stream) {
|
|
log::error!("Failed to record stop event: {:?}", e);
|
|
}
|
|
}
|
|
|
|
#[cfg(debug_assertions)]
|
|
let _ = state.stream.synchronize();
|
|
|
|
#[cfg(feature = "use_nvtx")]
|
|
nvtx::range_pop();
|
|
}
|
|
|
|
/// Synchronizes the GPU stream, drains all pending profiler events, and
/// prints a per-kernel timing summary (launch count, total ms, percentage of
/// total, average ms) to stdout. No-op if the GPU was never initialized.
pub fn report_kernel_stats() {
    let mut guard = GPU_STATE.lock();
    if let Some(state) = guard.as_mut() {
        // Ensure every launched kernel has finished before reading timings;
        // a synchronize failure is deliberately ignored here (best-effort).
        let _ = state.stream.synchronize();

        // Process all pending events
        for pe in &mut state.profiler.event_pool {
            if pe.active {
                pe.sync();
            }
        }

        // Grand total across all kernels, used for the percentage column.
        let mut total_ms = 0.0;
        for s in &state.profiler.kernel_stats {
            total_ms += s.lock().sum_ms;
        }

        println!("Wavefront Kernel Profile:");
        for s in &state.profiler.kernel_stats {
            let stats = s.lock();
            // Guard against division by zero when nothing was profiled.
            let percent = if total_ms > 0.0 {
                100.0 * stats.sum_ms / total_ms
            } else {
                0.0
            };
            println!(
                " {:<45} {:5} launches {:9.2} ms / {:5.1}% (avg {:6.3})",
                stats.description,
                stats.num_launches,
                stats.sum_ms,
                percent,
                // Average per launch; zero launches yields 0.0 rather than NaN.
                if stats.num_launches > 0 {
                    stats.sum_ms / stats.num_launches as f32
                } else {
                    0.0
                }
            );
        }
        println!("\nTotal: {:.2} ms", total_ms);
    }
}
|
|
|
|
pub fn gpu_memset<T: DeviceCopy>(dst: &mut DeviceSlice<T>, value: u8) {
|
|
unsafe {
|
|
let ptr = dst.as_raw_ptr(); // Returns CUdeviceptr (u64)
|
|
let len = dst.len() * std::mem::size_of::<T>();
|
|
|
|
// We need the `cust::external::cuda` or equivalent sys crate function
|
|
|
|
log::warn!("gpu_memset requested but raw memset not exposed via safe cust API yet.");
|
|
}
|
|
}
|
|
|
|
/// Implements the GPU marker traits (`cust::memory::DeviceCopy`,
/// `bytemuck::Zeroable`, `bytemuck::Pod`) for one concrete type.
///
/// The macro emits `unsafe impl`s without any checks, so it must only be
/// invoked on types that genuinely satisfy those traits' contracts
/// (plain-old-data layout, valid for any bit pattern, no pointers/padding
/// concerns per bytemuck's requirements).
#[macro_export]
macro_rules! impl_gpu_traits {
    ($name:ty) => {
        unsafe impl cust::memory::DeviceCopy for $name {}
        unsafe impl bytemuck::Zeroable for $name {}
        unsafe impl bytemuck::Pod for $name {}
    };
}
|
|
|
|
/// Implements the GPU marker traits for a generic math wrapper of shape
/// `$Struct<T, const N: usize>` (e.g. `Vector`, `Normal`, `Point`),
/// forwarding each trait's requirement to the element type `T`.
///
/// NOTE(review): only the `DeviceCopy` impl is gated on the `use_gpu`
/// feature, while the `Zeroable`/`Pod` impls are unconditional (and
/// `impl_gpu_traits!` gates nothing) — confirm this asymmetry is intended.
#[macro_export]
macro_rules! impl_math_gpu_traits {
    ($Struct:ident) => {
        #[cfg(feature = "use_gpu")]
        unsafe impl<T, const N: usize> cust::memory::DeviceCopy for $Struct<T, N> where
            T: cust::memory::DeviceCopy + Copy
        {
        }

        unsafe impl<T, const N: usize> bytemuck::Zeroable for $Struct<T, N> where
            T: bytemuck::Zeroable
        {
        }

        unsafe impl<T, const N: usize> bytemuck::Pod for $Struct<T, N> where T: bytemuck::Pod {}
    };
}
|