447 lines
12 KiB
Rust
447 lines
12 KiB
Rust
use cuda_std::prelude::*;
|
|
|
|
pub mod wavefront;
|
|
pub mod workitem;
|
|
|
|
use cust::context::{CacheConfig, CurrentContext, ResourceLimit};
|
|
use cust::device::DeviceAttribute;
|
|
use cust::memory::{DeviceCopy, DeviceMemory};
|
|
use cust::prelude::*;
|
|
use lazy_static::lazy_static;
|
|
use parking_lot::Mutex;
|
|
use std::error::Error;
|
|
use std::ffi::c_void;
|
|
use std::sync::Arc;
|
|
|
|
use crate::Float;
|
|
use crate::core::geometry::{Normal, Point, Vector};
|
|
use crate::core::medium::Medium;
|
|
use crate::core::options::{PBRTOptions, get_options};
|
|
use crate::impl_gpu_traits;
|
|
use crate::impl_math_gpu_traits;
|
|
use crate::spectra::{SampledSpectrum, SampledWavelengths};
|
|
use crate::utils::interval::Interval;
|
|
|
|
pub use workitem::{
|
|
EscapedRayQueue, GetBSSRDFAndProbeRayQueue, HitAreaLightQueue, MaterialEvalQueue,
|
|
MediumSampleQueue, MediumScatterQueue, PixelSampleStateStorage, RayQueue, ShadowRayQueue,
|
|
SubsurfaceScatterQueue,
|
|
};
|
|
|
|
/// 16-byte-aligned, C-layout storage for four `f32` components, suitable for
/// passing to GPU kernels (the alignment matches CUDA's `float4`).
#[repr(C, align(16))]
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct Float4 {
    // The four packed components, in order.
    pub v: [f32; 4],
}
|
|
|
|
pub type Vec4 = Vector<Float, 4>;
|
|
|
|
impl From<Vec4> for Float4 {
    /// Repackages a 4-component math vector into the 16-byte-aligned GPU
    /// storage form.
    #[inline]
    fn from(vec: Vec4) -> Self {
        // The parameter is spelled `Vec4` to match the trait header; the
        // previous `Vector<f32, 4>` spelling silently hard-coded the
        // `crate::Float` alias to `f32`.
        Self { v: vec.0 }
    }
}
|
|
|
|
impl From<Float4> for Vec4 {
|
|
#[inline]
|
|
fn from(storage: Float4) -> Self {
|
|
Vector(storage.v)
|
|
}
|
|
}
|
|
|
|
// Blanket GPU marker-trait implementations: the generic math wrappers get
// element-type-forwarding impls, the concrete storage/spectral types get
// per-type impls.
impl_math_gpu_traits!(Vector);
impl_math_gpu_traits!(Normal);
impl_math_gpu_traits!(Point);
impl_gpu_traits!(Interval);
impl_gpu_traits!(Float4);
impl_gpu_traits!(SampledSpectrum);
impl_gpu_traits!(SampledWavelengths);
|
|
|
|
/// Aggregated timing statistics for one kernel, keyed by its description.
struct KernelStats {
    // Human-readable kernel name used for matching and reporting.
    description: String,
    // Number of launches folded in so far.
    num_launches: usize,
    // Total elapsed GPU time across all launches, in milliseconds.
    sum_ms: f32,
    // Fastest single launch in milliseconds (0.0 until the first launch).
    min_ms: f32,
    // Slowest single launch in milliseconds (0.0 until the first launch).
    max_ms: f32,
}

impl KernelStats {
    /// Creates an empty stats record for the kernel named `description`;
    /// all counters start at zero.
    fn new(description: &str) -> Self {
        KernelStats {
            description: description.to_owned(),
            num_launches: 0,
            sum_ms: 0.0,
            min_ms: 0.0,
            max_ms: 0.0,
        }
    }
}
|
|
|
|
/// A start/stop CUDA event pair tracking one in-flight kernel launch.
struct ProfilerEvent {
    // Event recorded immediately before the launch.
    start: Event,
    // Event recorded immediately after the launch.
    stop: Event,
    // True while a launch has been recorded but not yet drained into its
    // stats record by `sync`.
    active: bool,
    // Stats record this launch's timing is folded into, if one was attached.
    stats: Option<Arc<Mutex<KernelStats>>>,
}
|
|
|
|
impl ProfilerEvent {
|
|
fn new() -> Result<Self, cust::error::CudaError> {
|
|
let start = Event::new(EventFlags::DEFAULT)?;
|
|
let stop = Event::new(EventFlags::DEFAULT)?;
|
|
Ok(Self {
|
|
start,
|
|
stop,
|
|
active: false,
|
|
stats: None,
|
|
})
|
|
}
|
|
|
|
fn sync(&mut self) {
|
|
if !self.active {
|
|
return;
|
|
}
|
|
|
|
if self.stop.synchronize().is_ok() {
|
|
// Check timing between start and stop
|
|
match self.stop.elapsed_time_f32(&self.start) {
|
|
Ok(ms) => {
|
|
if let Some(stats_arc) = &self.stats {
|
|
let mut stats = stats_arc.lock();
|
|
stats.num_launches += 1;
|
|
if stats.num_launches == 1 {
|
|
stats.sum_ms = ms;
|
|
stats.min_ms = ms;
|
|
stats.max_ms = ms;
|
|
} else {
|
|
stats.sum_ms += ms;
|
|
stats.min_ms = stats.min_ms.min(ms);
|
|
stats.max_ms = stats.max_ms.max(ms);
|
|
}
|
|
}
|
|
}
|
|
Err(e) => log::error!("Failed to get elapsed time: {:?}", e),
|
|
}
|
|
}
|
|
self.active = false;
|
|
}
|
|
}
|
|
|
|
// --- Profiler Manager ---
|
|
|
|
/// Accumulates per-kernel GPU timings via a round-robin pool of CUDA event
/// pairs.
struct Profiler {
    // One shared stats record per distinct kernel description.
    kernel_stats: Vec<Arc<Mutex<KernelStats>>>,
    // Reusable start/stop event pairs, recycled in launch order.
    event_pool: Vec<ProfilerEvent>,
    // Index of the next pool slot to hand out (wraps to 0 when exhausted).
    pool_offset: usize,
}
|
|
|
|
impl Profiler {
|
|
fn new() -> Self {
|
|
Self {
|
|
kernel_stats: Vec::new(),
|
|
event_pool: Vec::new(),
|
|
pool_offset: 0,
|
|
}
|
|
}
|
|
|
|
/// Prepares an event from the pool.
|
|
/// Returns a mutable reference to the event, valid as long as the borrow of self.
|
|
fn prepare<'a>(&'a mut self, description: &str) -> &'a mut ProfilerEvent {
|
|
// Grow pool if empty or needed (simple heuristic)
|
|
if self.event_pool.is_empty() {
|
|
for _ in 0..128 {
|
|
if let Ok(e) = ProfilerEvent::new() {
|
|
self.event_pool.push(e);
|
|
}
|
|
}
|
|
}
|
|
|
|
if self.pool_offset >= self.event_pool.len() {
|
|
self.pool_offset = 0;
|
|
}
|
|
|
|
let idx = self.pool_offset;
|
|
self.pool_offset += 1;
|
|
|
|
let pe = &mut self.event_pool[idx];
|
|
|
|
if pe.active {
|
|
pe.sync();
|
|
}
|
|
|
|
pe.active = true;
|
|
pe.stats = None;
|
|
|
|
// Find or create stats
|
|
let mut found = None;
|
|
for s in &self.kernel_stats {
|
|
if s.lock().description == description {
|
|
found = Some(s.clone());
|
|
break;
|
|
}
|
|
}
|
|
|
|
if found.is_none() {
|
|
let new_stats = Arc::new(Mutex::new(KernelStats::new(description)));
|
|
self.kernel_stats.push(new_stats.clone());
|
|
found = Some(new_stats);
|
|
}
|
|
|
|
pe.stats = found;
|
|
pe
|
|
}
|
|
}
|
|
|
|
/// Process-wide GPU resources: the CUDA context, the single stream all
/// kernels are launched on, and the kernel-timing profiler.
pub struct GpuState {
    context: Context,
    stream: Stream,
    profiler: Profiler,
}
|
|
|
|
impl GpuState {
|
|
fn init(device_index: u32) -> Result<Self, Box<dyn Error>> {
|
|
cust::init(CudaFlags::empty())?;
|
|
|
|
let device = Device::get_device(device_index)?;
|
|
|
|
let name = device.name().unwrap_or_else(|_| "Unknown".into());
|
|
let memory = device.total_memory().unwrap_or(0);
|
|
let memory_gb = memory as f64 / (1024.0 * 1024.0 * 1024.0);
|
|
|
|
let major = device
|
|
.get_attribute(DeviceAttribute::ComputeCapabilityMajor)
|
|
.unwrap_or(0);
|
|
let minor = device
|
|
.get_attribute(DeviceAttribute::ComputeCapabilityMinor)
|
|
.unwrap_or(0);
|
|
|
|
log::info!(
|
|
"Selected GPU: {} ({:.2} GB, SM {}.{})",
|
|
name,
|
|
memory_gb,
|
|
major,
|
|
minor
|
|
);
|
|
|
|
let has_unified = device
|
|
.get_attribute(DeviceAttribute::UnifiedAddressing)
|
|
.unwrap_or(0);
|
|
if has_unified == 0 {
|
|
panic!("Selected GPU does not support unified addressing.");
|
|
}
|
|
|
|
let context = Context::new(device)?;
|
|
|
|
CurrentContext::set_resource_limit(ResourceLimit::StackSize, 8192)?;
|
|
let stack_size = CurrentContext::get_resource_limit(ResourceLimit::StackSize)?;
|
|
log::info!("Reset stack size to {}", stack_size);
|
|
|
|
CurrentContext::set_resource_limit(ResourceLimit::PrintfFifoSize, 32 * 1024 * 1024)?;
|
|
CurrentContext::set_cache_config(CacheConfig::PreferL1)?;
|
|
|
|
let stream = Stream::new(StreamFlags::DEFAULT, None)?;
|
|
|
|
Ok(Self {
|
|
context,
|
|
stream,
|
|
profiler: Profiler::new(),
|
|
})
|
|
}
|
|
}
|
|
|
|
// Process-wide GPU state, guarded by a mutex; stays `None` until `gpu_init`
// has successfully initialized a device.
// NOTE(review): std::sync::OnceLock/LazyLock could replace lazy_static here —
// confirm the crate's MSRV before migrating.
lazy_static! {
    static ref GPU_STATE: Mutex<Option<GpuState>> = Mutex::new(None);
}
|
|
|
|
pub fn gpu_init() {
|
|
if !get_options().use_gpu {
|
|
return;
|
|
}
|
|
|
|
let device_id = get_options().gpu_device.unwrap_or(0);
|
|
log::info!("Initializing GPU Device {}", device_id);
|
|
|
|
match GpuState::init(device_id) {
|
|
Ok(state) => {
|
|
#[cfg(feature = "use_nvtx")]
|
|
nvtx::name_thread("MAIN_THREAD");
|
|
*GPU_STATE.lock() = Some(state);
|
|
}
|
|
Err(e) => {
|
|
panic!("Failed to initialize GPU: {:?}", e);
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn gpu_thread_init() {
|
|
if let Some(state) = GPU_STATE.lock().as_ref() {
|
|
if let Err(e) = CurrentContext::set_current(&state.context) {
|
|
log::error!("Failed to set CUDA context for thread: {:?}", e);
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn gpu_wait() {
|
|
let mut guard = GPU_STATE.lock();
|
|
if let Some(state) = guard.as_mut() {
|
|
if let Err(e) = state.stream.synchronize() {
|
|
log::error!("GPU Wait failed: {:?}", e);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Launches a parallel for loop on the GPU.
|
|
///
|
|
/// # Arguments
|
|
/// * `description`: Name for profiling.
|
|
/// * `n_items`: Total items (threads).
|
|
/// * `function`: Compiled kernel function handle.
|
|
/// * `params`: Kernel parameters (must be DeviceCopy).
|
|
pub fn gpu_parallel_for<T: DeviceCopy>(
|
|
description: &str,
|
|
n_items: i32,
|
|
function: &Function,
|
|
params: &T,
|
|
) {
|
|
#[cfg(feature = "use_nvtx")]
|
|
nvtx::range_push(description);
|
|
|
|
let mut guard = GPU_STATE.lock();
|
|
let state = guard.as_mut().expect("GPU not initialized");
|
|
|
|
let (_, block_size) = match function.suggested_launch_configuration(0, 0.into()) {
|
|
Ok(cfg) => cfg,
|
|
Err(e) => panic!(
|
|
"Failed to calculate launch config for {}: {:?}",
|
|
description, e
|
|
),
|
|
};
|
|
|
|
#[cfg(debug_assertions)]
|
|
log::debug!("[{}] Block size: {}", description, block_size);
|
|
|
|
let grid_size = (n_items as u32 + block_size - 1) / block_size;
|
|
|
|
let stream = &state.stream;
|
|
let profiler = &mut state.profiler;
|
|
|
|
// Save the index we are about to use so we can retrieve the STOP event later
|
|
let event_idx = profiler.pool_offset;
|
|
|
|
{
|
|
let pe = profiler.prepare(description);
|
|
if let Err(e) = pe.start.record(stream) {
|
|
log::error!("Failed to record start event: {:?}", e);
|
|
}
|
|
}
|
|
|
|
let params_ptr = params as *const T as *mut c_void;
|
|
let n_items_ptr = &n_items as *const i32 as *mut c_void;
|
|
let args = [params_ptr, n_items_ptr];
|
|
|
|
unsafe {
|
|
if let Err(e) =
|
|
state
|
|
.stream
|
|
.launch(function, (grid_size, 1, 1), (block_size, 1, 1), 0, &args)
|
|
{
|
|
panic!("CUDA Launch failed for {}: {:?}", description, e);
|
|
}
|
|
}
|
|
|
|
// Retrieve the specific event we just set up.
|
|
// Pool_offset was incremented in prepare().
|
|
// If event_idx was the one used, the event is at event_idx.
|
|
if event_idx < profiler.event_pool.len() {
|
|
let pe = &mut profiler.event_pool[event_idx];
|
|
if let Err(e) = pe.stop.record(stream) {
|
|
log::error!("Failed to record stop event: {:?}", e);
|
|
}
|
|
}
|
|
|
|
#[cfg(debug_assertions)]
|
|
let _ = state.stream.synchronize();
|
|
|
|
#[cfg(feature = "use_nvtx")]
|
|
nvtx::range_pop();
|
|
}
|
|
|
|
/// Synchronizes the GPU stream, drains all pending profiler events, and
/// prints a per-kernel timing summary (launch count, total ms, percentage of
/// total, average ms) to stdout. No-op if the GPU was never initialized.
pub fn report_kernel_stats() {
    let mut guard = GPU_STATE.lock();
    if let Some(state) = guard.as_mut() {
        // Ensure every launched kernel has finished before reading timings;
        // a synchronize failure is deliberately ignored here (best-effort).
        let _ = state.stream.synchronize();

        // Process all pending events
        for pe in &mut state.profiler.event_pool {
            if pe.active {
                pe.sync();
            }
        }

        // Grand total across all kernels, used for the percentage column.
        let mut total_ms = 0.0;
        for s in &state.profiler.kernel_stats {
            total_ms += s.lock().sum_ms;
        }

        println!("Wavefront Kernel Profile:");
        for s in &state.profiler.kernel_stats {
            let stats = s.lock();
            // Guard against division by zero when nothing was profiled.
            let percent = if total_ms > 0.0 {
                100.0 * stats.sum_ms / total_ms
            } else {
                0.0
            };
            println!(
                " {:<45} {:5} launches {:9.2} ms / {:5.1}% (avg {:6.3})",
                stats.description,
                stats.num_launches,
                stats.sum_ms,
                percent,
                // Average per launch; zero launches yields 0.0 rather than NaN.
                if stats.num_launches > 0 {
                    stats.sum_ms / stats.num_launches as f32
                } else {
                    0.0
                }
            );
        }
        println!("\nTotal: {:.2} ms", total_ms);
    }
}
|
|
|
|
pub fn gpu_memset<T: DeviceCopy>(dst: &mut DeviceSlice<T>, value: u8) {
|
|
unsafe {
|
|
let ptr = dst.as_raw_ptr(); // Returns CUdeviceptr (u64)
|
|
let len = dst.len() * std::mem::size_of::<T>();
|
|
|
|
// We need the `cust::external::cuda` or equivalent sys crate function
|
|
|
|
log::warn!("gpu_memset requested but raw memset not exposed via safe cust API yet.");
|
|
}
|
|
}
|
|
|
|
/// Implements the GPU marker traits (`cust::memory::DeviceCopy`,
/// `bytemuck::Zeroable`, `bytemuck::Pod`) for one concrete type.
///
/// The macro emits `unsafe impl`s without any checks, so it must only be
/// invoked on types that genuinely satisfy those traits' contracts
/// (plain-old-data layout, valid for any bit pattern, no pointers/padding
/// concerns per bytemuck's requirements).
#[macro_export]
macro_rules! impl_gpu_traits {
    ($name:ty) => {
        unsafe impl cust::memory::DeviceCopy for $name {}
        unsafe impl bytemuck::Zeroable for $name {}
        unsafe impl bytemuck::Pod for $name {}
    };
}
|
|
|
|
/// Implements the GPU marker traits for a generic math wrapper of shape
/// `$Struct<T, const N: usize>` (e.g. `Vector`, `Normal`, `Point`),
/// forwarding each trait's requirement to the element type `T`.
///
/// NOTE(review): only the `DeviceCopy` impl is gated on the `use_gpu`
/// feature, while the `Zeroable`/`Pod` impls are unconditional (and
/// `impl_gpu_traits!` gates nothing) — confirm this asymmetry is intended.
#[macro_export]
macro_rules! impl_math_gpu_traits {
    ($Struct:ident) => {
        #[cfg(feature = "use_gpu")]
        unsafe impl<T, const N: usize> cust::memory::DeviceCopy for $Struct<T, N> where
            T: cust::memory::DeviceCopy + Copy
        {
        }

        unsafe impl<T, const N: usize> bytemuck::Zeroable for $Struct<T, N> where
            T: bytemuck::Zeroable
        {
        }

        unsafe impl<T, const N: usize> bytemuck::Pod for $Struct<T, N> where T: bytemuck::Pod {}
    };
}
|