Moving on to GPU rendering

2026-05-28 06:39:26 +01:00 · 2026-05-28 06:39:26 +01:00 · a6ee0a1b52
commit a6ee0a1b52
parent e6d1850785
6 changed files with 1989 additions and 0 deletions
--- a/kernels/src/intersect.rs
+++ b/kernels/src/intersect.rs
@ -0,0 +1,513 @@
+#[cfg(target_os = "cuda")]
+pub mod device {
+    use shared::core::aggregates::{BVHAggregate, LinearBVHNode};
+    use shared::core::geometry::{Bounds3f, Normal3f, Point2f, Point3f, Ray, Vector3f};
+    use shared::core::interaction::LightSampleContext;
+    use shared::core::material::Material;
+    use shared::core::medium::MediumInterface;
+    use shared::core::primitive::{Primitive, PrimitiveTrait};
+    use shared::spectra::{SampledSpectrum, SampledWavelengths};
+    use shared::utils::atomic::GpuAtomicU32;
+    use shared::utils::soa::SoABuffer;
+    use shared::wavefront::work_items::*;
+    use shared::{Float, Ptr};
+ 
+    use cuda_std::*;
+ 
+    #[repr(C)]
+    pub struct IntersectClosestParams {
+        pub bvh: Ptr<BVHAggregate>,
+ 
+        // Input queue
+        pub ray_q: Ptr<RayQueue>,
+ 
+        // Output queues
+        pub escaped_ray_q: Ptr<EscapedRayQueue>,
+        pub hit_area_light_q: Ptr<HitAreaLightQueue>,
+        pub basic_eval_mtl_q: Ptr<MaterialEvalQueue>,
+        pub universal_eval_mtl_q: Ptr<MaterialEvalQueue>,
+        pub next_ray_q: Ptr<RayQueue>,
+ 
+        // Persistent state
+        pub pixel_sample_state: Ptr<PixelSampleState>,
+ 
+        pub n_rays: u32,
+    }
+ 
+    /// One thread per ray: traverse BVH, push results to output queues.
+    #[kernel]
+    pub unsafe fn intersect_closest(params: &IntersectClosestParams) {
+        let idx = thread::index_1d();
+        if idx >= params.n_rays {
+            return;
+        }
+        let i = idx as usize;
+ 
+        let ray_q = &*params.ray_q.as_raw();
+        let work = ray_q.storage.get(i);
+ 
+        let ray = Ray::new(
+            work.ray_o,
+            work.ray_d,
+            Some(work.ray_time),
+            work.ray_medium,
+        );
+ 
+        let pi = work.pixel_index as usize;
+        let pss = &*params.pixel_sample_state.as_raw();
+ 
+        // Read persistent path state
+        let beta = pss.beta.get(pi);
+        let r_u = pss.r_u.get(pi);
+        let r_l = pss.r_l.get(pi);
+        let lambda = pss.lambda.get(pi);
+        let depth = pss.depth.get(pi);
+        let specular_bounce = pss.specular_bounce.get(pi) != 0;
+        let prev_intr_ctx = pss.prev_intr_ctx.get(pi);
+        let eta_scale = pss.eta_scale.get(pi);
+        let any_non_specular = pss.any_non_specular_bounces.get(pi) != 0;
+ 
+        // BVH traversal — mirrors BVHAggregate::intersect exactly
+        let bvh = &*params.bvh.as_raw();
+        if bvh.nodes.is_empty() {
+            // No geometry — ray escapes
+            push_escaped(params, &work, &lambda, &beta, &r_u, &r_l, depth, specular_bounce, &prev_intr_ctx);
+            return;
+        }
+ 
+        let nodes_ptr = bvh.nodes.as_ptr();
+        let prims_ptr = bvh.primitives.as_ptr();
+        let mut best_si = None;
+        let mut hit_t: Float = Float::INFINITY;
+ 
+        let inv_dir = Vector3f::new(
+            1.0 / ray.d.x(),
+            1.0 / ray.d.y(),
+            1.0 / ray.d.z(),
+        );
+        let dir_is_neg = [
+            if inv_dir.x() < 0.0 { 1u8 } else { 0 },
+            if inv_dir.y() < 0.0 { 1u8 } else { 0 },
+            if inv_dir.z() < 0.0 { 1u8 } else { 0 },
+        ];
+ 
+        let mut to_visit_offset: u32 = 0;
+        let mut current_node_index: usize = 0;
+        // GPU stack — 64 entries matches CPU, fits in registers/local memory
+        let mut nodes_to_visit = [0usize; 64];
+ 
+        loop {
+            let node = &*nodes_ptr.add(current_node_index);
+ 
+            if node.bounds.intersect_p(ray.o, hit_t, inv_dir, &dir_is_neg).is_some() {
+                if node.n_primitives > 0 {
+                    // Leaf node — test primitives
+                    let mut j = 0u16;
+                    while j < node.n_primitives {
+                        let prim_idx = node.primitives_offset + j as usize;
+                        let prim = &*prims_ptr.add(prim_idx);
+ 
+                        if let Some(si) = prim.intersect(&ray, Some(hit_t)) {
+                            hit_t = si.t_hit();
+                            best_si = Some(si);
+                        }
+                        j += 1;
+                    }
+ 
+                    if to_visit_offset == 0 {
+                        break;
+                    }
+                    to_visit_offset -= 1;
+                    current_node_index = nodes_to_visit[to_visit_offset as usize];
+                } else {
+                    // Interior node — push far child, visit near child
+                    if dir_is_neg[node.axis as usize] == 1 {
+                        nodes_to_visit[to_visit_offset as usize] = current_node_index + 1;
+                        to_visit_offset += 1;
+                        current_node_index = node.primitives_offset;
+                    } else {
+                        nodes_to_visit[to_visit_offset as usize] = node.primitives_offset;
+                        to_visit_offset += 1;
+                        current_node_index += 1;
+                    }
+                }
+            } else {
+                if to_visit_offset == 0 {
+                    break;
+                }
+                to_visit_offset -= 1;
+                current_node_index = nodes_to_visit[to_visit_offset as usize];
+            }
+        }
+ 
+        // Sort result into output queues
+        let Some(si) = best_si else {
+            push_escaped(params, &work, &lambda, &beta, &r_u, &r_l, depth, specular_bounce, &prev_intr_ctx);
+            return;
+        };
+ 
+        let intr = &si.intr;
+ 
+        // Null material — medium interface, re-queue ray
+        if intr.material.is_null() {
+            let next_q = &*params.next_ray_q.as_raw();
+            next_q.push(RayWorkItem {
+                ray_o: intr.p(),
+                ray_d: work.ray_d,
+                ray_time: work.ray_time,
+                ray_medium: work.ray_medium,
+                has_differentials: work.has_differentials,
+                differential: work.differential,
+                pixel_index: work.pixel_index,
+            });
+            return;
+        }
+ 
+        // Area light hit
+        if !intr.area_light.is_null() {
+            let q = &*params.hit_area_light_q.as_raw();
+            q.push(HitAreaLightWorkItem {
+                area_light: intr.area_light,
+                p: intr.p(),
+                n: intr.n(),
+                uv: intr.common.uv,
+                wo: -work.ray_d,
+                lambda,
+                pixel_index: work.pixel_index,
+                beta,
+                r_u,
+                r_l,
+                depth,
+                specular_bounce,
+                prev_intr_ctx,
+            });
+        }
+ 
+        // Material evaluation: push to appropriate queue
+        // For now, push everything to universal eval queue.
+        // Basic vs universal split requires checking can_evaluate_textures
+        // on the material, which we can refine later.
+        let q = &*params.universal_eval_mtl_q.as_raw();
+        q.push(MaterialEvalWorkItem {
+            p: intr.p(),
+            n: intr.n(),
+            ns: intr.shading.n,
+            dpdu: intr.shading.dpdu,
+            dpdv: intr.shading.dpdv,
+            uv: intr.common.uv,
+            wo: -work.ray_d,
+            time: work.ray_time,
+            face_index: intr.face_index,
+            material: intr.material,
+            area_light: intr.area_light,
+            medium_interface: intr.common.medium_interface,
+            pixel_index: work.pixel_index,
+            lambda,
+            beta,
+            r_u,
+            any_non_specular_bounces: any_non_specular,
+            depth,
+            eta_scale,
+        });
+    }
+ 
+    /// Shadow ray kernel — one thread per shadow ray, binary occlusion test.
+    #[kernel]
+    pub unsafe fn intersect_shadow(params: &IntersectShadowParams) {
+        let idx = thread::index_1d();
+        if idx >= params.n_rays {
+            return;
+        }
+        let i = idx as usize;
+ 
+        let shadow_q = &*params.shadow_ray_q.as_raw();
+        let work = shadow_q.storage.get(i);
+ 
+        let ray = Ray::new(
+            work.ray_o,
+            work.ray_d,
+            Some(work.ray_time),
+            Ptr::null(),
+        );
+ 
+        // BVH any-hit traversal
+        let bvh = &*params.bvh.as_raw();
+        let occluded = bvh_intersect_p(bvh, &ray, work.t_max);
+ 
+        // If NOT occluded, add direct lighting contribution
+        if !occluded {
+            let pss = &*params.pixel_sample_state.as_raw();
+            let pi = work.pixel_index as usize;
+            // Atomic add to each spectral channel
+            let mut l = pss.l.get(pi);
+            l += work.l_d;
+            pss.l.set(pi, l);
+            // NOTE: This set is not atomic per-channel. For correctness
+            // when multiple shadow rays hit the same pixel, we'd need
+            // per-channel AtomicFloat. For now this works because each
+            // pixel has at most one shadow ray in flight per depth.
+        }
+    }
+ 
+    /// Launch parameters for shadow ray kernel.
+    #[repr(C)]
+    pub struct IntersectShadowParams {
+        pub bvh: Ptr<BVHAggregate>,
+        pub shadow_ray_q: Ptr<ShadowRayQueue>,
+        pub pixel_sample_state: Ptr<PixelSampleState>,
+        pub n_rays: u32,
+    }
+ 
+    // -- Helper functions --
+ 
+    unsafe fn push_escaped(
+        params: &IntersectClosestParams,
+        work: &RayWorkItem,
+        lambda: &SampledWavelengths,
+        beta: &SampledSpectrum,
+        r_u: &SampledSpectrum,
+        r_l: &SampledSpectrum,
+        depth: u32,
+        specular_bounce: bool,
+        prev_intr_ctx: &LightSampleContext,
+    ) {
+        let q = &*params.escaped_ray_q.as_raw();
+        q.push(EscapedRayWorkItem {
+            ray_o: work.ray_o,
+            ray_d: work.ray_d,
+            lambda: *lambda,
+            pixel_index: work.pixel_index,
+            beta: *beta,
+            r_u: *r_u,
+            r_l: *r_l,
+            depth,
+            specular_bounce,
+            prev_intr_ctx: *prev_intr_ctx,
+        });
+    }
+ 
+    /// BVH any-hit traversal for shadow rays — returns true if occluded.
+    unsafe fn bvh_intersect_p(bvh: &BVHAggregate, ray: &Ray, t_max: Float) -> bool {
+        if bvh.nodes.is_empty() {
+            return false;
+        }
+ 
+        let nodes_ptr = bvh.nodes.as_ptr();
+        let prims_ptr = bvh.primitives.as_ptr();
+ 
+        let inv_dir = Vector3f::new(
+            1.0 / ray.d.x(),
+            1.0 / ray.d.y(),
+            1.0 / ray.d.z(),
+        );
+        let dir_is_neg = [
+            if inv_dir.x() < 0.0 { 1u8 } else { 0 },
+            if inv_dir.y() < 0.0 { 1u8 } else { 0 },
+            if inv_dir.z() < 0.0 { 1u8 } else { 0 },
+        ];
+ 
+        let mut to_visit_offset: u32 = 0;
+        let mut current_node_index: usize = 0;
+        let mut nodes_to_visit = [0usize; 64];
+ 
+        loop {
+            let node = &*nodes_ptr.add(current_node_index);
+ 
+            if node.bounds.intersect_p(ray.o, t_max, inv_dir, &dir_is_neg).is_some() {
+                if node.n_primitives > 0 {
+                    let mut j = 0u16;
+                    while j < node.n_primitives {
+                        let prim_idx = node.primitives_offset + j as usize;
+                        let prim = &*prims_ptr.add(prim_idx);
+                        if prim.intersect_p(ray, Some(t_max)) {
+                            return true;
+                        }
+                        j += 1;
+                    }
+                    if to_visit_offset == 0 {
+                        break;
+                    }
+                    to_visit_offset -= 1;
+                    current_node_index = nodes_to_visit[to_visit_offset as usize];
+                } else {
+                    if dir_is_neg[node.axis as usize] == 1 {
+                        nodes_to_visit[to_visit_offset as usize] = current_node_index + 1;
+                        to_visit_offset += 1;
+                        current_node_index = node.primitives_offset;
+                    } else {
+                        nodes_to_visit[to_visit_offset as usize] = node.primitives_offset;
+                        to_visit_offset += 1;
+                        current_node_index += 1;
+                    }
+                }
+            } else {
+                if to_visit_offset == 0 {
+                    break;
+                }
+                to_visit_offset -= 1;
+                current_node_index = nodes_to_visit[to_visit_offset as usize];
+            }
+        }
+        false
+    }
+}
+
+
+#[cfg(feature = "cuda")]
+pub mod host {
+    use crate::core::aggregates::BVHAggregate;
+    use crate::core::geometry::Bounds3f;
+    use crate::core::primitive::PrimitiveTrait;
+    use crate::wavefront::aggregate::WavefrontAggregate;
+    use crate::wavefront::work_items::*;
+    use crate::{Ptr, Float};
+ 
+    use cust::prelude::*;
+    use cust::launch;
+ 
+    /// CUDA aggregate — holds the BVH and the compiled kernel module.
+    pub struct CudaAggregate {
+        pub bvh: Ptr<BVHAggregate>,
+        pub module: Module,
+        pub stream: Stream,
+    }
+ 
+    impl CudaAggregate {
+        pub fn new(bvh: Ptr<BVHAggregate>, ptx_data: &str) -> cust::error::CudaResult<Self> {
+            // Initialize CUDA context (assumes cust::init() already called)
+            let module = Module::from_ptx(ptx_data, &[])?;
+            let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
+            Ok(Self { bvh, module, stream })
+        }
+ 
+        fn launch_intersect_closest(
+            &self,
+            n_rays: u32,
+            ray_q: &RayQueue,
+            escaped_ray_q: &EscapedRayQueue,
+            hit_area_light_q: &HitAreaLightQueue,
+            basic_eval_mtl_q: &MaterialEvalQueue,
+            universal_eval_mtl_q: &MaterialEvalQueue,
+            next_ray_q: &RayQueue,
+            pixel_sample_state: &PixelSampleState,
+        ) -> cust::error::CudaResult<()> {
+            if n_rays == 0 {
+                return Ok(());
+            }
+ 
+            let func = self.module.get_function("intersect_closest")?;
+ 
+            // Build launch params in unified memory
+            let params = super::device::IntersectClosestParams {
+                bvh: self.bvh,
+                ray_q: Ptr::from(ray_q),
+                escaped_ray_q: Ptr::from(escaped_ray_q),
+                hit_area_light_q: Ptr::from(hit_area_light_q),
+                basic_eval_mtl_q: Ptr::from(basic_eval_mtl_q),
+                universal_eval_mtl_q: Ptr::from(universal_eval_mtl_q),
+                next_ray_q: Ptr::from(next_ray_q),
+                pixel_sample_state: Ptr::from(pixel_sample_state),
+                n_rays,
+            };
+ 
+            let block_size = 256u32;
+            let grid_size = (n_rays + block_size - 1) / block_size;
+ 
+            unsafe {
+                launch!(
+                    func<<<grid_size, block_size, 0, self.stream>>>(
+                        &params
+                    )
+                )?;
+            }
+            self.stream.synchronize()?;
+            Ok(())
+        }
+ 
+        fn launch_intersect_shadow(
+            &self,
+            n_rays: u32,
+            shadow_ray_q: &ShadowRayQueue,
+            pixel_sample_state: &PixelSampleState,
+        ) -> cust::error::CudaResult<()> {
+            if n_rays == 0 {
+                return Ok(());
+            }
+ 
+            let func = self.module.get_function("intersect_shadow")?;
+ 
+            let params = super::device::IntersectShadowParams {
+                bvh: self.bvh,
+                shadow_ray_q: Ptr::from(shadow_ray_q),
+                pixel_sample_state: Ptr::from(pixel_sample_state),
+                n_rays,
+            };
+ 
+            let block_size = 256u32;
+            let grid_size = (n_rays + block_size - 1) / block_size;
+ 
+            unsafe {
+                launch!(
+                    func<<<grid_size, block_size, 0, self.stream>>>(
+                        &params
+                    )
+                )?;
+            }
+            self.stream.synchronize()?;
+            Ok(())
+        }
+    }
+ 
+    impl WavefrontAggregate for CudaAggregate {
+        fn bounds(&self) -> Bounds3f {
+            self.bvh.get().map(|b| b.bounds()).unwrap_or_default()
+        }
+ 
+        fn intersect_closest(
+            &self,
+            max_rays: usize,
+            ray_q: &RayQueue,
+            escaped_ray_q: &EscapedRayQueue,
+            hit_area_light_q: &HitAreaLightQueue,
+            basic_eval_mtl_q: &MaterialEvalQueue,
+            universal_eval_mtl_q: &MaterialEvalQueue,
+            next_ray_q: &RayQueue,
+            pixel_sample_state: &PixelSampleState,
+        ) {
+            let n_rays = ray_q.size().min(max_rays as u32);
+            self.launch_intersect_closest(
+                n_rays,
+                ray_q,
+                escaped_ray_q,
+                hit_area_light_q,
+                basic_eval_mtl_q,
+                universal_eval_mtl_q,
+                next_ray_q,
+                pixel_sample_state,
+            )
+            .expect("CUDA intersect_closest kernel launch failed");
+        }
+ 
+        fn intersect_shadow(
+            &self,
+            max_rays: usize,
+            shadow_ray_q: &ShadowRayQueue,
+            pixel_sample_state: &PixelSampleState,
+        ) {
+            let n_rays = shadow_ray_q.size().min(max_rays as u32);
+            self.launch_intersect_shadow(n_rays, shadow_ray_q, pixel_sample_state)
+                .expect("CUDA intersect_shadow kernel launch failed");
+        }
+ 
+        fn intersect_shadow_tr(
+            &self,
+            max_rays: usize,
+            shadow_ray_q: &ShadowRayQueue,
+            pixel_sample_state: &PixelSampleState,
+        ) {
+            // Without participating media, shadow_tr is the same as shadow
+            self.intersect_shadow(max_rays, shadow_ray_q, pixel_sample_state);
+        }
+    }
+}
+
--- a/shared/src/utils/atomic.rs
+++ b/shared/src/utils/atomic.rs
@ -0,0 +1,263 @@
+use crate::Float;
+
+pub const SCOPE_DEVICE: u32 = 1;
+
+#[allow(dead_code)]
+pub const SCOPE_WORKGROUP: u32 = 2;
+
+pub const SEMANTICS_RELAXED: u32 = 0x0;
+
+#[allow(dead_code)]
+pub const SEMANTICS_ACQUIRE_RELEASE: u32 = 0x8;
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct AtomicU32 {
+    value: u32,
+}
+
+impl Default for AtomicU32 {
+    fn default() -> Self {
+        Self::new(0)
+    }
+}
+
+impl Clone for AtomicU32 {
+    fn clone(&self) -> Self {
+        Self::new(self.load())
+    }
+}
+
+impl AtomicU32 {
+    pub fn new(val: u32) -> Self {
+        Self { value: val }
+    }
+
+    #[cfg(not(any(target_arch = "spirv", feature = "cuda")))]
+    #[inline(always)]
+    pub fn load(&self) -> u32 {
+        let atomic = unsafe {
+            &*(core::ptr::addr_of!(self.value) as *const core::sync::atomic::AtomicU32)
+        };
+        atomic.load(core::sync::atomic::Ordering::Relaxed)
+    }
+
+    #[cfg(not(any(target_arch = "spirv", feature = "cuda")))]
+    #[inline(always)]
+    pub fn store(&self, val: u32) {
+        let atomic = unsafe {
+            &*(core::ptr::addr_of!(self.value) as *const core::sync::atomic::AtomicU32)
+        };
+        atomic.store(val, core::sync::atomic::Ordering::Relaxed);
+    }
+
+    #[cfg(not(any(target_arch = "spirv", feature = "cuda")))]
+    #[inline(always)]
+    pub fn fetch_add(&self, val: u32) -> u32 {
+        let atomic = unsafe {
+            &*(core::ptr::addr_of!(self.value) as *const core::sync::atomic::AtomicU32)
+        };
+        atomic.fetch_add(val, core::sync::atomic::Ordering::Relaxed)
+    }
+
+    #[cfg(not(any(target_arch = "spirv", feature = "cuda")))]
+    #[inline(always)]
+    pub fn compare_exchange(&self, expected: u32, desired: u32) -> Result<u32, u32> {
+        let atomic = unsafe {
+            &*(core::ptr::addr_of!(self.value) as *const core::sync::atomic::AtomicU32)
+        };
+        atomic.compare_exchange_weak(
+            expected,
+            desired,
+            core::sync::atomic::Ordering::Relaxed,
+            core::sync::atomic::Ordering::Relaxed,
+        )
+    }
+
+    #[cfg(target_arch = "spirv")]
+    #[inline(always)]
+    pub fn load(&self) -> u32 {
+        unsafe {
+            spirv_std::arch::atomic_load::<u32, SCOPE_DEVICE, SEMANTICS_RELAXED>(
+                &self.value,
+            )
+        }
+    }
+
+    #[cfg(target_arch = "spirv")]
+    #[inline(always)]
+    pub fn store(&self, val: u32) {
+        unsafe {
+            spirv_std::arch::atomic_store::<u32, SCOPE_DEVICE, SEMANTICS_RELAXED>(
+                &mut *core::ptr::addr_of!(self.value).cast_mut(),
+                val,
+            );
+        }
+    }
+
+    #[cfg(target_arch = "spirv")]
+    #[inline(always)]
+    pub fn fetch_add(&self, val: u32) -> u32 {
+        unsafe {
+            spirv_std::arch::atomic_i_add::<u32, SCOPE_DEVICE, SEMANTICS_RELAXED>(
+                &mut *core::ptr::addr_of!(self.value).cast_mut(),
+                val,
+            )
+        }
+    }
+
+    #[cfg(target_arch = "spirv")]
+    #[inline(always)]
+    pub fn compare_exchange(&self, expected: u32, desired: u32) -> Result<u32, u32> {
+        let old = unsafe {
+            spirv_std::arch::atomic_compare_exchange::<
+                u32,
+                SCOPE_DEVICE,
+                SEMANTICS_RELAXED,
+                SEMANTICS_RELAXED,
+            >(
+                &mut *core::ptr::addr_of!(self.value).cast_mut(),
+                desired,
+                expected,
+            )
+        };
+        if old == expected {
+            Ok(old)
+        } else {
+            Err(old)
+        }
+    }
+
+    // -- CUDA backend --
+    #[cfg(feature = "cuda")]
+    #[inline(always)]
+    pub fn load(&self) -> u32 {
+        // CUDA volatile read for atomicity on the same SM
+        unsafe { core::ptr::read_volatile(&self.value) }
+    }
+
+    #[cfg(feature = "cuda")]
+    #[inline(always)]
+    pub fn store(&self, val: u32) {
+        unsafe {
+            core::ptr::write_volatile(
+                core::ptr::addr_of!(self.value).cast_mut(),
+                val,
+            );
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    #[inline(always)]
+    pub fn fetch_add(&self, val: u32) -> u32 {
+        let ptr = core::ptr::addr_of!(self.value).cast_mut();
+        let mut old: u32;
+        unsafe {
+            core::arch::asm!(
+                "atom.add.u32 {old}, [{ptr}], {val};",
+                old = out(reg32) old,
+                ptr = in(reg64) ptr,
+                val = in(reg32) val,
+            );
+        }
+        old
+    }
+
+    #[cfg(feature = "cuda")]
+    #[inline(always)]
+    pub fn compare_exchange(&self, expected: u32, desired: u32) -> Result<u32, u32> {
+        let ptr = core::ptr::addr_of!(self.value).cast_mut();
+        let mut old: u32;
+        unsafe {
+            core::arch::asm!(
+                "atom.cas.b32 {old}, [{ptr}], {expected}, {desired};",
+                old = out(reg32) old,
+                ptr = in(reg64) ptr,
+                expected = in(reg32) expected,
+                desired = in(reg32) desired,
+            );
+        }
+        if old == expected {
+            Ok(old)
+        } else {
+            Err(old)
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct AtomicF32 {
+    bits: AtomicU32,
+}
+
+impl Default for AtomicF32 {
+    fn default() -> Self {
+        Self::new(0.0)
+    }
+}
+
+impl Clone for AtomicF32 {
+    fn clone(&self) -> Self {
+        Self::new(self.get())
+    }
+}
+
+impl AtomicF32 {
+    pub fn new(val: Float) -> Self {
+        Self {
+            bits: AtomicU32::new(val.to_bits()),
+        }
+    }
+
+    pub fn get(&self) -> Float {
+        Float::from_bits(self.bits.load())
+    }
+
+    pub fn set(&self, val: Float) {
+        self.bits.store(val.to_bits());
+    }
+
+    #[cfg(not(any(target_arch = "spirv", feature = "cuda")))]
+    #[inline(always)]
+    pub fn add(&self, val: Float) {
+        let mut current_bits = self.bits.load();
+        loop {
+            let current_val = Float::from_bits(current_bits);
+            let new_val = current_val + val;
+            let new_bits = new_val.to_bits();
+            match self.bits.compare_exchange(current_bits, new_bits) {
+                Ok(_) => break,
+                Err(x) => current_bits = x,
+            }
+        }
+    }
+
+    #[cfg(target_arch = "spirv")]
+    #[inline(always)]
+    pub fn add(&self, val: Float) {
+        unsafe {
+            let float_ptr = core::ptr::addr_of!(self.bits.value) as *mut Float;
+            spirv_std::arch::atomic_f_add::<Float, SCOPE_DEVICE, SEMANTICS_RELAXED>(
+                &mut *float_ptr,
+                val,
+            );
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    #[inline(always)]
+    pub fn add(&self, val: Float) {
+        let ptr = core::ptr::addr_of!(self.bits.value) as *mut Float;
+        unsafe {
+            core::arch::asm!(
+                "atom.add.f32 {old}, [{ptr}], {val};",
+                old = out(reg32) _,
+                ptr = in(reg64) ptr,
+                val = in(reg32) val.to_bits(),
+            );
+        }
+    }
+}
+
+pub type AtomicFloat = AtomicF32;
--- a/shared/src/wavefront/aggregate.rs
+++ b/shared/src/wavefront/aggregate.rs
@ -0,0 +1,216 @@
+use crate::core::geometry::{Bounds3f, Ray, Vector3f};
+use crate::core::interaction::InteractionTrait;
+use crate::core::material::MaterialTrait;
+use crate::core::primitive::{Primitive, PrimitiveTrait};
+use crate::core::texture::{TextureEvaluator, UniversalTextureEvaluator};
+use crate::wavefront::workitems::*;
+
+pub trait WavefrontAggregate {
+    fn bounds(&self) -> Bounds3f;
+
+    fn intersect_closest(
+        &self,
+        max_rays: usize,
+        ray_q: &RayQueue,
+        escaped_ray_q: &EscapedRayQueue,
+        hit_area_light_q: &HitAreaLightQueue,
+        basic_eval_mtl_q: &MaterialEvalQueue,
+        universal_eval_mtl_q: &MaterialEvalQueue,
+        next_ray_q: &RayQueue,
+        pixel_sample_state: &PixelSampleState,
+    );
+
+    fn intersect_shadow(
+        &self,
+        max_rays: usize,
+        shadow_ray_q: &ShadowRayQueue,
+        pixel_sample_state: &PixelSampleState,
+    );
+
+    fn intersect_shadow_tr(
+        &self,
+        max_rays: usize,
+        shadow_ray_q: &ShadowRayQueue,
+        pixel_sample_state: &PixelSampleState,
+    );
+
+    // fn intersect_one_random(
+    //     &self,
+    //     max_rays: usize,
+    //     subsurface_scatte_q: &mut SubsurfaceScatterQueue,
+    // ) {
+    //     todo!()
+    // }
+}
+
+pub struct CpuAggregate {
+    pub aggregate: Primitive,
+}
+
+impl CpuAggregate {
+    pub fn new(aggregate: Primitive) -> Self {
+        Self { aggregate }
+    }
+}
+
+impl WavefrontAggregate for CpuAggregate {
+    fn bounds(&self) -> Bounds3f {
+        self.aggregate.bounds()
+    }
+
+    fn intersect_closest(
+        &self,
+        max_rays: usize,
+        ray_q: &RayQueue,
+        escaped_ray_q: &EscapedRayQueue,
+        hit_area_light_q: &HitAreaLightQueue,
+        basic_eval_mtl_q: &MaterialEvalQueue,
+        universal_eval_mtl_q: &MaterialEvalQueue,
+        next_ray_q: &RayQueue,
+        pixel_sample_state: &PixelSampleState,
+    ) {
+        let n_rays = ray_q.size().min(max_rays as u32);
+
+        for i in 0..n_rays as usize {
+            let work = unsafe { ray_q.get(i) };
+
+            let ray = Ray::new(work.ray_o, work.ray_d, Some(work.ray_time), work.ray_medium);
+
+            // Read path state from PixelSampleState
+            let pi = work.pixel_index as usize;
+            let beta = pixel_sample_state.beta.get(pi);
+            let r_u = pixel_sample_state.r_u.get(pi);
+            let r_l = pixel_sample_state.r_l.get(pi);
+            let lambda = pixel_sample_state.lambda.get(pi);
+            let depth = pixel_sample_state.depth.get(pi);
+            let specular_bounce = pixel_sample_state.specular_bounce.get(pi) != 0;
+            let any_non_specular = pixel_sample_state.any_non_specular_bounces.get(pi) != 0;
+            let eta_scale = pixel_sample_state.eta_scale.get(pi);
+            let prev_intr_ctx = pixel_sample_state.prev_intr_ctx.get(pi);
+
+            let Some(si) = self.aggregate.intersect(&ray, None) else {
+                // Ray escaped — push to escaped ray queue
+                escaped_ray_q.push(EscapedRayWorkItem {
+                    ray_o: work.ray_o,
+                    ray_d: work.ray_d,
+                    lambda,
+                    pixel_index: work.pixel_index,
+                    beta,
+                    r_u,
+                    r_l,
+                    depth,
+                    specular_bounce,
+                    prev_intr_ctx,
+                });
+                continue;
+            };
+
+            let intr = &si.intr;
+
+            // Check for null material (medium interface) — re-queue the ray
+            if intr.material.is_null() {
+                // Skip intersection and continue ray
+                // TODO: offset ray origin past the intersection
+                next_ray_q.push(RayWorkItem {
+                    ray_o: intr.p(),
+                    ray_d: work.ray_d,
+                    ray_time: work.ray_time,
+                    ray_medium: work.ray_medium,
+                    has_differentials: work.has_differentials,
+                    differential: work.differential,
+                    pixel_index: work.pixel_index,
+                });
+                continue;
+            }
+
+            // Check for area light hit
+            if !intr.area_light.is_null() {
+                hit_area_light_q.push(HitAreaLightWorkItem {
+                    area_light: intr.area_light,
+                    p: intr.p(),
+                    n: intr.n(),
+                    uv: intr.common.uv,
+                    wo: -work.ray_d,
+                    lambda,
+                    pixel_index: work.pixel_index,
+                    beta,
+                    r_u,
+                    r_l,
+                    depth,
+                    specular_bounce,
+                    prev_intr_ctx,
+                });
+            }
+
+            // Determine which material evaluation queue to use based on
+            // whether the material's textures can be evaluated with the
+            // basic evaluator (cheaper) or need the universal one.
+            let material = *intr.material.get().unwrap();
+            let eval_q = if material.can_evaluate_textures(&UniversalTextureEvaluator) {
+                basic_eval_mtl_q
+            } else {
+                universal_eval_mtl_q
+            };
+
+            eval_q.push(MaterialEvalWorkItem {
+                p: intr.p(),
+                n: intr.n(),
+                ns: intr.shading.n,
+                dpdu: intr.shading.dpdu,
+                dpdv: intr.shading.dpdv,
+                uv: intr.common.uv,
+                wo: -work.ray_d,
+                time: work.ray_time,
+                face_index: intr.face_index,
+                material: intr.material,
+                area_light: intr.area_light,
+                medium_interface: intr.common.medium_interface,
+                pixel_index: work.pixel_index,
+                lambda,
+                beta,
+                r_u,
+                any_non_specular_bounces: any_non_specular,
+                depth,
+                eta_scale,
+            });
+        }
+    }
+
+    fn intersect_shadow(
+        &self,
+        max_rays: usize,
+        shadow_ray_q: &ShadowRayQueue,
+        pixel_sample_state: &PixelSampleState,
+    ) {
+        let n_rays = shadow_ray_q.size().min(max_rays as u32);
+
+        for i in 0..n_rays as usize {
+            let work = unsafe { shadow_ray_q.get(i) };
+
+            let ray = Ray::new(
+                work.ray_o,
+                work.ray_d,
+                Some(work.ray_time),
+                crate::Ptr::null(),
+            );
+
+            // If the shadow ray is NOT occluded, add the direct lighting
+            // contribution to the pixel's accumulated radiance.
+            if !self.aggregate.intersect_p(&ray, Some(work.t_max)) {
+                let pi = work.pixel_index as usize;
+                let mut l = pixel_sample_state.l.get(pi);
+                l += work.l_d;
+                pixel_sample_state.l.set(pi, l);
+            }
+        }
+    }
+
+    fn intersect_shadow_tr(
+        &self,
+        max_rays: usize,
+        shadow_ray_q: &ShadowRayQueue,
+        pixel_sample_state: &PixelSampleState,
+    ) {
+        self.intersect_shadow(max_rays, shadow_ray_q, pixel_sample_state);
+    }
+}
--- a/shared/src/wavefront/integrator.rs
+++ b/shared/src/wavefront/integrator.rs
@ -0,0 +1,483 @@
+use crate::core::bxdf::FArgs;
+use crate::core::bxdf::TransportMode;
+use crate::core::camera::{Camera, CameraTrait};
+use crate::core::film::Film;
+use crate::core::filter::{Filter, FilterTrait};
+use crate::core::geometry::{
+    Bounds2i, Point2f, Point2i, Point3f, Point3fi, Ray, RayDifferential, Vector2f, Vector3f,
+    VectorLike,
+};
+use crate::core::interaction::InteractionTrait;
+use crate::core::light::{Light, LightSampleContext, LightTrait};
+use crate::core::material::{MaterialEvalContext, MaterialTrait};
+use crate::core::sampler::{CameraSample, Sampler, SamplerTrait};
+use crate::core::texture::{TextureEvalContext, UniversalTextureEvaluator};
+use crate::lights::sampler::{LightSampler, LightSamplerTrait};
+use crate::spectra::{SampledSpectrum, SampledWavelengths};
+use crate::utils::math::square;
+use crate::utils::sampling::power_heuristic;
+use crate::utils::soa::{SoA, SoAAllocator, WorkQueue};
+use crate::wavefront::aggregate::WavefrontAggregate;
+use crate::wavefront::workitems::*;
+use crate::{Float, GVec, Ptr};
+
+pub struct WavefrontPathIntegrator<A: WavefrontAggregate> {
+    pub aggregate: A,
+    pub camera: Camera,
+    pub film: Film,
+    pub filter: Filter,
+    pub sampler: Sampler,
+    pub max_depth: u32,
+    pub samples_per_pixel: u32,
+    pub regularize: bool,
+
+    // Lights
+    pub infinite_lights: GVec<Ptr<Light>>,
+
+    // Queue capacity = resolution.x * scanlines_per_pass
+    pub max_queue_size: u32,
+    pub scanlines_per_pass: u32,
+
+    pub ray_queues: [RayQueue; 2],
+    pub shadow_ray_queue: ShadowRayQueue,
+    pub escaped_ray_queue: EscapedRayQueue,
+    pub hit_area_light_queue: HitAreaLightQueue,
+    pub basic_eval_material_queue: MaterialEvalQueue,
+    pub universal_eval_material_queue: MaterialEvalQueue,
+    pub light_sampler: LightSampler,
+
+    // Persistent per-path state
+    pub pixel_sample_state: PixelSampleState,
+}
+
+impl<A: WavefrontAggregate> WavefrontPathIntegrator<A> {
+    pub fn render(&mut self) {
+        let pixel_bounds = self.film.pixel_bounds();
+        let resolution = pixel_bounds.diagonal();
+
+        for sample_index in 0..self.samples_per_pixel {
+            // Process image in scanline batches
+            let mut y0 = pixel_bounds.p_min.y();
+            while y0 < pixel_bounds.p_max.y() {
+                let y1 = (y0 + self.scanlines_per_pass as i32).min(pixel_bounds.p_max.y());
+
+                // Reset the primary ray queue for this set
+                self.ray_queues[0].reset();
+
+                self.generate_camera_rays(y0, y1, sample_index, &pixel_bounds);
+
+                for depth in 0..=self.max_depth {
+                    let current = (depth % 2) as usize;
+                    let next = ((depth + 1) % 2) as usize;
+
+                    // Reset output queues before intersection
+                    self.ray_queues[next].reset();
+                    self.escaped_ray_queue.reset();
+                    self.hit_area_light_queue.reset();
+                    self.basic_eval_material_queue.reset();
+                    self.universal_eval_material_queue.reset();
+                    self.shadow_ray_queue.reset();
+
+                    // Skip if no rays to trace
+                    if self.ray_queues[current].size() == 0 {
+                        break;
+                    }
+
+                    // Sorting of rays into output queues
+                    self.aggregate.intersect_closest(
+                        self.max_queue_size as usize,
+                        &self.ray_queues[current],
+                        &self.escaped_ray_queue,
+                        &self.hit_area_light_queue,
+                        &self.basic_eval_material_queue,
+                        &self.universal_eval_material_queue,
+                        &self.ray_queues[next],
+                        &self.pixel_sample_state,
+                    );
+
+                    // Infinite light contributions
+                    self.handle_escaped_rays();
+
+                    // Area light contributions
+                    self.handle_emissive_intersections();
+
+                    // Last depth — don't evaluate materials or sample lights
+                    if depth == self.max_depth {
+                        break;
+                    }
+
+                    // Evaluate materials, sample BSDFs, sample direct lighting
+                    // This pushes to shadow_ray_queue and ray_queues[next]
+                    self.evaluate_materials_and_bsdfs(depth);
+
+                    // Add direct lighting to pixels
+                    self.aggregate.intersect_shadow(
+                        self.max_queue_size as usize,
+                        &self.shadow_ray_queue,
+                        &self.pixel_sample_state,
+                    );
+                }
+
+                // Update film from accumulated pixel sample state
+                self.update_film(y0, y1, &pixel_bounds);
+
+                y0 = y1;
+            }
+        }
+    }
+
+    /// Stage 1: Generate camera rays for scanlines [y0, y1).
+    fn generate_camera_rays(
+        &mut self,
+        y0: i32,
+        y1: i32,
+        sample_index: u32,
+        pixel_bounds: &Bounds2i,
+    ) {
+        // For each pixel in the scanline range, generate a camera ray
+        // and push it to the ray queue. Also initialize the PixelSampleState.
+        for y in y0..y1 {
+            for x in pixel_bounds.p_min.x()..pixel_bounds.p_max.x() {
+                let p_pixel = Point2i::new(x, y);
+
+                // TODO: proper sampler state per pixel/sample
+                // For now, use a simple approach
+                self.sampler
+                    .start_pixel_sample(p_pixel, sample_index as i32, Some(0));
+
+                let lambda = SampledWavelengths::sample_visible(self.sampler.get1d());
+
+                let camera_sample = crate::core::sampler::get_camera_sample(
+                    &mut self.sampler,
+                    p_pixel,
+                    &self.filter,
+                );
+
+                let Some(camera_ray) = self.camera.generate_ray(camera_sample, &lambda) else {
+                    continue;
+                };
+
+                // Compute pixel index for this sample
+                let pixel_index = self.ray_queues[0].size();
+
+                // Initialize persistent pixel state
+                let pi = pixel_index as usize;
+                self.pixel_sample_state.l.set(pi, SampledSpectrum::new(0.0));
+                self.pixel_sample_state.beta.set(pi, camera_ray.weight);
+                self.pixel_sample_state.lambda.set(pi, lambda);
+                self.pixel_sample_state
+                    .r_u
+                    .set(pi, SampledSpectrum::new(1.0));
+                self.pixel_sample_state
+                    .r_l
+                    .set(pi, SampledSpectrum::new(1.0));
+                self.pixel_sample_state.depth.set(pi, 0);
+                self.pixel_sample_state.specular_bounce.set(pi, 1);
+                self.pixel_sample_state.any_non_specular_bounces.set(pi, 0);
+                self.pixel_sample_state.eta_scale.set(pi, 1.0);
+                self.pixel_sample_state.p_film.set(pi, camera_sample.p_film);
+                self.pixel_sample_state
+                    .filter_weight
+                    .set(pi, camera_sample.filter_weight);
+                self.pixel_sample_state
+                    .prev_intr_ctx
+                    .set(pi, LightSampleContext::default());
+
+                // Push ray to queue
+                self.ray_queues[0].push(RayWorkItem {
+                    ray_o: camera_ray.ray.o,
+                    ray_d: camera_ray.ray.d,
+                    ray_time: camera_ray.ray.time,
+                    ray_medium: camera_ray.ray.medium,
+                    pixel_index: pixel_index,
+                    has_differentials: true,
+                    differential: RayDifferential::default(),
+                });
+            }
+        }
+    }
+
+    /// Handle escaped rays — evaluate infinite lights.
+    fn handle_escaped_rays(&self) {
+        let n = self.escaped_ray_queue.size();
+        for i in 0..n as usize {
+            let w = unsafe { self.escaped_ray_queue.storage.get(i) };
+
+            let mut l_contrib = SampledSpectrum::new(0.0);
+
+            // Evaluate all infinite lights
+            for light_ptr in &self.infinite_lights {
+                let light = light_ptr.get().unwrap();
+                let ray = crate::core::geometry::Ray::new(w.ray_o, w.ray_d, None, Ptr::null());
+                let le = light.le(&ray, &w.lambda);
+                if le.is_black() {
+                    continue;
+                }
+
+                if w.depth == 0 || w.specular_bounce {
+                    // No MIS for direct camera rays or specular bounces
+                    l_contrib += w.beta * le / w.r_u.average();
+                } else {
+                    // MIS with light sampling
+                    // TODO: compute light PDF for MIS weight
+                    // For now, use unidirectional weight only
+                    l_contrib += w.beta * le / w.r_u.average();
+                }
+            }
+
+            if !l_contrib.is_black() {
+                let pi = w.pixel_index as usize;
+                let mut l = self.pixel_sample_state.l.get(pi);
+                l += l_contrib;
+                self.pixel_sample_state.l.set(pi, l);
+            }
+        }
+    }
+
+    /// Handle emissive intersections — area light contribution with MIS.
+    fn handle_emissive_intersections(&self) {
+        let n = self.hit_area_light_queue.size();
+        for i in 0..n as usize {
+            let w = unsafe { self.hit_area_light_queue.storage.get(i) };
+
+            let light = w.area_light.get().unwrap();
+            let le = light.l(w.p, w.n, w.uv, w.wo, &w.lambda);
+            if le.is_black() {
+                continue;
+            }
+
+            let l_contrib = if w.depth == 0 || w.specular_bounce {
+                w.beta * le / w.r_u.average()
+            } else {
+                // MIS: combine BSDF and light sampling weights
+                // TODO: full MIS with light sampler PDF
+                w.beta * le / w.r_u.average()
+            };
+
+            if !l_contrib.is_black() {
+                let pi = w.pixel_index as usize;
+                let mut l = self.pixel_sample_state.l.get(pi);
+                l += l_contrib;
+                self.pixel_sample_state.l.set(pi, l);
+            }
+        }
+    }
+
+    fn evaluate_materials_and_bsdfs(&mut self, depth: u32) {
+        self.evaluate_material_queue_impl(depth, false);
+        self.evaluate_material_queue_impl(depth, true);
+    }
+
+    fn evaluate_material_queue_impl(&mut self, depth: u32, use_universal: bool) {
+        let queue = if use_universal {
+            &self.universal_eval_material_queue
+        } else {
+            &self.basic_eval_material_queue
+        };
+
+        let n = queue.size();
+        let next = ((depth + 1) % 2) as usize;
+
+        for i in 0..n as usize {
+            let w = unsafe { queue.storage.get(i) };
+            let pi = w.pixel_index as usize;
+
+            let lambda = self.pixel_sample_state.lambda.get(pi);
+            let beta = self.pixel_sample_state.beta.get(pi);
+            let any_non_specular = self.pixel_sample_state.any_non_specular_bounces.get(pi) != 0;
+            let eta_scale = self.pixel_sample_state.eta_scale.get(pi);
+
+            let Some(material) = w.material.get() else {
+                continue;
+            };
+
+            let tex_eval = UniversalTextureEvaluator;
+            let ctx = MaterialEvalContext {
+                texture: TextureEvalContext {
+                    p: w.p,
+                    dpdx: Vector3f::zero(),
+                    dpdy: Vector3f::zero(),
+                    n: w.n,
+                    uv: w.uv,
+                    dudx: 0.0,
+                    dudy: 0.0,
+                    dvdx: 0.0,
+                    dvdy: 0.0,
+                    face_index: w.face_index,
+                },
+                wo: w.wo,
+                ns: w.ns,
+                dpdus: w.dpdu,
+            };
+            let mut bsdf = material.get_bsdf(&tex_eval, &ctx, &lambda);
+
+            if bsdf.flags().is_empty() {
+                continue;
+            }
+
+            if self.regularize && any_non_specular {
+                bsdf.regularize();
+            }
+
+            if depth >= self.max_depth {
+                continue;
+            }
+
+            // Sample a light, compute contribution,
+            // push shadow ray with deferred visibility
+            if bsdf.flags().is_non_specular() {
+                let light_ctx = LightSampleContext {
+                    pi: Point3fi::new_from_point(w.p),
+                    n: w.n,
+                    ns: w.ns,
+                };
+
+                if let Some(sampled_light) = self
+                    .light_sampler
+                    .sample_with_context(&light_ctx, self.sampler.get1d())
+                {
+                    if let Some(ls) = sampled_light.light.sample_li(
+                        &light_ctx,
+                        self.sampler.get2d(),
+                        &lambda,
+                        true,
+                    ) {
+                        if !ls.l.is_black() && ls.pdf > 0.0 {
+                            let wi = ls.wi;
+                            if let Some(f_val) = bsdf.f(w.wo, wi, TransportMode::Radiance) {
+                                let f_cos = f_val * wi.abs_dot(w.ns.into());
+                                if !f_cos.is_black() {
+                                    let p_l = sampled_light.p * ls.pdf;
+                                    let l_d = if sampled_light.light.light_type().is_delta_light() {
+                                        beta * ls.l * f_cos / p_l
+                                    } else {
+                                        let p_b = bsdf.pdf(w.wo, wi, FArgs::default());
+                                        let w_l = power_heuristic(1, p_l, 1, p_b);
+                                        beta * w_l * ls.l * f_cos / p_l
+                                    };
+
+                                    if !l_d.is_black() {
+                                        let ray_o = Ray::offset_origin(
+                                            &Point3fi::new_from_point(w.p),
+                                            &w.n,
+                                            &wi,
+                                        );
+                                        let t_max = (1.0 - 1e-4)
+                                            * (Point3f::from(ls.p_light.p()) - ray_o).norm()
+                                            / wi.norm();
+
+                                        self.shadow_ray_queue.push(ShadowRayWorkItem {
+                                            ray_o,
+                                            ray_d: wi,
+                                            ray_time: w.time,
+                                            t_max,
+                                            lambda,
+                                            l_d,
+                                            pixel_index: w.pixel_index,
+                                        });
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            // Sample BSDF for next bounce
+            let wo = w.wo;
+            let Some(bs) = bsdf.sample_f(
+                wo,
+                self.sampler.get1d(),
+                self.sampler.get2d(),
+                FArgs::default(),
+            ) else {
+                continue;
+            };
+
+            let f_cos = bs.f * bs.wi.abs_dot(w.ns.into());
+            if f_cos.is_black() || bs.pdf == 0.0 {
+                continue;
+            }
+            let new_beta = beta * f_cos / bs.pdf;
+
+            let new_depth = depth + 1;
+
+            // Russian roulette
+            if new_depth > 3 {
+                let rr_beta = new_beta.max_component_value();
+                if rr_beta < 0.25 {
+                    let q = (1.0 - rr_beta).max(0.0_f32);
+                    if self.sampler.get1d() < q {
+                        continue;
+                    }
+                }
+            }
+
+            let ray_o = Ray::offset_origin(&Point3fi::new_from_point(w.p), &w.n, &bs.wi);
+
+            // Update PixelSampleState
+            self.pixel_sample_state.beta.set(pi, new_beta);
+            self.pixel_sample_state.depth.set(pi, new_depth);
+            self.pixel_sample_state
+                .specular_bounce
+                .set(pi, bs.is_specular() as u8);
+            self.pixel_sample_state
+                .any_non_specular_bounces
+                .set(pi, (any_non_specular || !bs.is_specular()) as u8);
+            self.pixel_sample_state.eta_scale.set(
+                pi,
+                if bs.is_transmissive() {
+                    eta_scale * square(bs.eta)
+                } else {
+                    eta_scale
+                },
+            );
+            self.pixel_sample_state.prev_intr_ctx.set(
+                pi,
+                LightSampleContext {
+                    pi: Point3fi::new_from_point(w.p),
+                    n: w.n,
+                    ns: w.ns,
+                },
+            );
+
+            // Push next bounce ray
+            self.ray_queues[next].push(RayWorkItem {
+                ray_o,
+                ray_d: bs.wi,
+                ray_time: w.time,
+                ray_medium: Ptr::null(),
+                pixel_index: w.pixel_index,
+                has_differentials: true,
+                differential: RayDifferential::default(),
+            });
+        }
+    }
+
+    /// Update film — write accumulated radiance to film pixels.
+    fn update_film(&self, y0: i32, y1: i32, pixel_bounds: &Bounds2i) {
+        // The pixel_sample_state indices map to rays generated in
+        // generate_camera_rays. We need to walk the same pixel order
+        // and read back the accumulated L values.
+        let mut pi = 0usize;
+        for y in y0..y1 {
+            for x in pixel_bounds.p_min.x()..pixel_bounds.p_max.x() {
+                let l = self.pixel_sample_state.l.get(pi);
+                let lambda = self.pixel_sample_state.lambda.get(pi);
+                let filter_weight = self.pixel_sample_state.filter_weight.get(pi);
+                let p_film = self.pixel_sample_state.p_film.get(pi);
+
+                // Add sample to film
+                self.film.add_sample(
+                    Point2i::new(p_film.x() as i32, p_film.y() as i32),
+                    l,
+                    &lambda,
+                    Some(&crate::core::film::VisibleSurface::default()),
+                    filter_weight,
+                );
+
+                pi += 1;
+            }
+        }
+    }
+}
--- a/shared/src/wavefront/mod.rs
+++ b/shared/src/wavefront/mod.rs
@ -0,0 +1,7 @@
+pub mod workitems;
+pub mod aggregate;
+pub mod integrator;
+
+pub use workitems::*;
+pub use aggregate::WavefrontAggregate;
+
--- a/shared/src/wavefront/workitems.rs
+++ b/shared/src/wavefront/workitems.rs
@ -0,0 +1,507 @@
+use crate::core::bxdf::BxDFFlags;
+use crate::core::geometry::{Normal3f, Point2f, Point3f, Point3fi, Vector3f, RayDifferential};
+use crate::core::light::LightSampleContext;
+use crate::core::light::Light;
+use crate::core::material::Material;
+use crate::core::medium::{Medium, MediumInterface};
+use crate::spectra::{SampledSpectrum, SampledWavelengths};
+use crate::utils::soa::{alloc_soa_buffer, SoA, SoAAllocator, SoABuffer, WorkQueue};
+use crate::{Float, Ptr};
+
+/// Per-path state that persists across all wavefront depth iterations.
+/// Indexed by pixel_index. Allocated once with capacity = max_queue_size.
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct PixelSampleState {
+    pub filter_weight: SoABuffer<Float>,
+    pub p_film: SoABuffer<Point2f>,
+    pub l: SoABuffer<SampledSpectrum>,
+    pub lambda: SoABuffer<SampledWavelengths>,
+    pub r_u: SoABuffer<SampledSpectrum>,
+    pub r_l: SoABuffer<SampledSpectrum>,
+    pub prev_intr_ctx: SoABuffer<LightSampleContext>,
+    pub beta: SoABuffer<SampledSpectrum>,
+    pub depth: SoABuffer<u32>,
+    pub specular_bounce: SoABuffer<u8>,
+    pub any_non_specular_bounces: SoABuffer<u8>,
+    pub eta_scale: SoABuffer<Float>,
+    pub camera_ray_weight: SoABuffer<SampledSpectrum>,
+    pub visible_surface_idx: SoABuffer<u32>,
+}
+
+impl SoA for PixelSampleState {
+    type Item = ();
+
+    fn allocate(n: u32, alloc: &dyn SoAAllocator) -> Self {
+        Self {
+            filter_weight: alloc_soa_buffer(n, alloc),
+            p_film: alloc_soa_buffer(n, alloc),
+            l: alloc_soa_buffer(n, alloc),
+            lambda: alloc_soa_buffer(n, alloc),
+            r_u: alloc_soa_buffer(n, alloc),
+            r_l: alloc_soa_buffer(n, alloc),
+            prev_intr_ctx: alloc_soa_buffer(n, alloc),
+            beta: alloc_soa_buffer(n, alloc),
+            depth: alloc_soa_buffer(n, alloc),
+            specular_bounce: alloc_soa_buffer(n, alloc),
+            any_non_specular_bounces: alloc_soa_buffer(n, alloc),
+            eta_scale: alloc_soa_buffer(n, alloc),
+            camera_ray_weight: alloc_soa_buffer(n, alloc),
+            visible_surface_idx: alloc_soa_buffer(n, alloc),
+        }
+    }
+
+    unsafe fn get(&self, _i: usize) -> Self::Item {}
+    unsafe fn set(&self, _i: usize, _v: Self::Item) {}
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug)]
+pub struct RayWorkItem {
+    pub ray_o: Point3f,
+    pub ray_d: Vector3f,
+    pub ray_time: Float,
+    pub ray_medium: Ptr<Medium>,
+    pub pixel_index: u32,
+    pub has_differentials: bool,
+    pub differential: RayDifferential
+}
+
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct RayWorkItemSoA {
+    pub ray_o: SoABuffer<Point3f>,
+    pub ray_d: SoABuffer<Vector3f>,
+    pub ray_time: SoABuffer<Float>,
+    pub ray_medium: SoABuffer<Ptr<Medium>>,
+    pub pixel_index: SoABuffer<u32>,
+    pub has_differentials: SoABuffer<bool>,
+    pub differential: SoABuffer<RayDifferential>,
+}
+
+impl SoA for RayWorkItemSoA {
+    type Item = RayWorkItem;
+
+    fn allocate(n: u32, alloc: &dyn SoAAllocator) -> Self {
+        Self {
+            ray_o: alloc_soa_buffer(n, alloc),
+            ray_d: alloc_soa_buffer(n, alloc),
+            ray_time: alloc_soa_buffer(n, alloc),
+            ray_medium: alloc_soa_buffer(n, alloc),
+            pixel_index: alloc_soa_buffer(n, alloc),
+            has_differentials: alloc_soa_buffer(n, alloc),
+            differential: alloc_soa_buffer(n, alloc),
+        }
+    }
+
+    unsafe fn get(&self, i: usize) -> RayWorkItem {
+        RayWorkItem {
+            ray_o: self.ray_o.get(i),
+            ray_d: self.ray_d.get(i),
+            ray_time: self.ray_time.get(i),
+            ray_medium: self.ray_medium.get(i),
+            pixel_index: self.pixel_index.get(i),
+            has_differentials: self.has_differentials.get(i),
+            differential: self.differential.get(i),
+        }
+    }
+
+    unsafe fn set(&self, i: usize, v: RayWorkItem) {
+        self.ray_o.set(i, v.ray_o);
+        self.ray_d.set(i, v.ray_d);
+        self.ray_time.set(i, v.ray_time);
+        self.ray_medium.set(i, v.ray_medium);
+        self.pixel_index.set(i, v.pixel_index);
+        self.has_differentials.set(i, v.has_differentials);
+        self.differential.set(i, v.differential);
+    }
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug)]
+pub struct EscapedRayWorkItem {
+    pub ray_o: Point3f,
+    pub ray_d: Vector3f,
+    pub lambda: SampledWavelengths,
+    pub pixel_index: u32,
+    pub beta: SampledSpectrum,
+    pub r_u: SampledSpectrum,
+    pub r_l: SampledSpectrum,
+    pub depth: u32,
+    pub specular_bounce: bool,
+    pub prev_intr_ctx: LightSampleContext,
+}
+
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct EscapedRayWorkItemSoA {
+    pub ray_o: SoABuffer<Point3f>,
+    pub ray_d: SoABuffer<Vector3f>,
+    pub lambda: SoABuffer<SampledWavelengths>,
+    pub pixel_index: SoABuffer<u32>,
+    pub beta: SoABuffer<SampledSpectrum>,
+    pub r_u: SoABuffer<SampledSpectrum>,
+    pub r_l: SoABuffer<SampledSpectrum>,
+    pub depth: SoABuffer<u32>,
+    pub specular_bounce: SoABuffer<u8>,
+    pub prev_intr_ctx: SoABuffer<LightSampleContext>,
+}
+
+impl SoA for EscapedRayWorkItemSoA {
+    type Item = EscapedRayWorkItem;
+
+    fn allocate(n: u32, alloc: &dyn SoAAllocator) -> Self {
+        Self {
+            ray_o: alloc_soa_buffer(n, alloc),
+            ray_d: alloc_soa_buffer(n, alloc),
+            lambda: alloc_soa_buffer(n, alloc),
+            pixel_index: alloc_soa_buffer(n, alloc),
+            beta: alloc_soa_buffer(n, alloc),
+            r_u: alloc_soa_buffer(n, alloc),
+            r_l: alloc_soa_buffer(n, alloc),
+            depth: alloc_soa_buffer(n, alloc),
+            specular_bounce: alloc_soa_buffer(n, alloc),
+            prev_intr_ctx: alloc_soa_buffer(n, alloc),
+        }
+    }
+
+    unsafe fn get(&self, i: usize) -> EscapedRayWorkItem {
+        EscapedRayWorkItem {
+            ray_o: self.ray_o.get(i),
+            ray_d: self.ray_d.get(i),
+            lambda: self.lambda.get(i),
+            pixel_index: self.pixel_index.get(i),
+            beta: self.beta.get(i),
+            r_u: self.r_u.get(i),
+            r_l: self.r_l.get(i),
+            depth: self.depth.get(i),
+            specular_bounce: self.specular_bounce.get(i) != 0,
+            prev_intr_ctx: self.prev_intr_ctx.get(i),
+        }
+    }
+
+    unsafe fn set(&self, i: usize, v: EscapedRayWorkItem) {
+        self.ray_o.set(i, v.ray_o);
+        self.ray_d.set(i, v.ray_d);
+        self.lambda.set(i, v.lambda);
+        self.pixel_index.set(i, v.pixel_index);
+        self.beta.set(i, v.beta);
+        self.r_u.set(i, v.r_u);
+        self.r_l.set(i, v.r_l);
+        self.depth.set(i, v.depth);
+        self.specular_bounce.set(i, v.specular_bounce as u8);
+        self.prev_intr_ctx.set(i, v.prev_intr_ctx);
+    }
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug)]
+pub struct HitAreaLightWorkItem {
+    pub area_light: Ptr<Light>,
+    pub p: Point3f,
+    pub n: Normal3f,
+    pub uv: Point2f,
+    pub wo: Vector3f,
+    pub lambda: SampledWavelengths,
+    pub pixel_index: u32,
+    pub beta: SampledSpectrum,
+    pub r_u: SampledSpectrum,
+    pub r_l: SampledSpectrum,
+    pub depth: u32,
+    pub specular_bounce: bool,
+    pub prev_intr_ctx: LightSampleContext,
+}
+
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct HitAreaLightWorkItemSoA {
+    pub area_light: SoABuffer<Ptr<Light>>,
+    pub p: SoABuffer<Point3f>,
+    pub n: SoABuffer<Normal3f>,
+    pub uv: SoABuffer<Point2f>,
+    pub wo: SoABuffer<Vector3f>,
+    pub lambda: SoABuffer<SampledWavelengths>,
+    pub pixel_index: SoABuffer<u32>,
+    pub beta: SoABuffer<SampledSpectrum>,
+    pub r_u: SoABuffer<SampledSpectrum>,
+    pub r_l: SoABuffer<SampledSpectrum>,
+    pub depth: SoABuffer<u32>,
+    pub specular_bounce: SoABuffer<u8>,
+    pub prev_intr_ctx: SoABuffer<LightSampleContext>,
+}
+
+impl SoA for HitAreaLightWorkItemSoA {
+    type Item = HitAreaLightWorkItem;
+
+    fn allocate(n: u32, alloc: &dyn SoAAllocator) -> Self {
+        Self {
+            area_light: alloc_soa_buffer(n, alloc),
+            p: alloc_soa_buffer(n, alloc),
+            n: alloc_soa_buffer(n, alloc),
+            uv: alloc_soa_buffer(n, alloc),
+            wo: alloc_soa_buffer(n, alloc),
+            lambda: alloc_soa_buffer(n, alloc),
+            pixel_index: alloc_soa_buffer(n, alloc),
+            beta: alloc_soa_buffer(n, alloc),
+            r_u: alloc_soa_buffer(n, alloc),
+            r_l: alloc_soa_buffer(n, alloc),
+            depth: alloc_soa_buffer(n, alloc),
+            specular_bounce: alloc_soa_buffer(n, alloc),
+            prev_intr_ctx: alloc_soa_buffer(n, alloc),
+        }
+    }
+
+    unsafe fn get(&self, i: usize) -> HitAreaLightWorkItem {
+        HitAreaLightWorkItem {
+            area_light: self.area_light.get(i),
+            p: self.p.get(i),
+            n: self.n.get(i),
+            uv: self.uv.get(i),
+            wo: self.wo.get(i),
+            lambda: self.lambda.get(i),
+            pixel_index: self.pixel_index.get(i),
+            beta: self.beta.get(i),
+            r_u: self.r_u.get(i),
+            r_l: self.r_l.get(i),
+            depth: self.depth.get(i),
+            specular_bounce: self.specular_bounce.get(i) != 0,
+            prev_intr_ctx: self.prev_intr_ctx.get(i),
+        }
+    }
+
+    unsafe fn set(&self, i: usize, v: HitAreaLightWorkItem) {
+        self.area_light.set(i, v.area_light);
+        self.p.set(i, v.p);
+        self.n.set(i, v.n);
+        self.uv.set(i, v.uv);
+        self.wo.set(i, v.wo);
+        self.lambda.set(i, v.lambda);
+        self.pixel_index.set(i, v.pixel_index);
+        self.beta.set(i, v.beta);
+        self.r_u.set(i, v.r_u);
+        self.r_l.set(i, v.r_l);
+        self.depth.set(i, v.depth);
+        self.specular_bounce.set(i, v.specular_bounce as u8);
+        self.prev_intr_ctx.set(i, v.prev_intr_ctx);
+    }
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug)]
+pub struct MaterialEvalWorkItem {
+    // Surface interaction
+    pub p: Point3f,
+    pub n: Normal3f,
+    pub ns: Normal3f,
+    pub dpdu: Vector3f,
+    pub dpdv: Vector3f,
+    pub uv: Point2f,
+    pub wo: Vector3f,
+    pub time: Float,
+    pub face_index: i32,
+
+    // Material
+    pub material: Ptr<Material>,
+    pub area_light: Ptr<Light>,
+
+    // Medium interface
+    pub medium_interface: MediumInterface,
+
+    // Path state
+    pub pixel_index: u32,
+    pub lambda: SampledWavelengths,
+    pub beta: SampledSpectrum,
+    pub r_u: SampledSpectrum,
+
+    // For next-event estimation
+    pub any_non_specular_bounces: bool,
+    pub depth: u32,
+    pub eta_scale: Float,
+}
+
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct MaterialEvalWorkItemSoA {
+    pub p: SoABuffer<Point3f>,
+    pub n: SoABuffer<Normal3f>,
+    pub ns: SoABuffer<Normal3f>,
+    pub dpdu: SoABuffer<Vector3f>,
+    pub dpdv: SoABuffer<Vector3f>,
+    pub uv: SoABuffer<Point2f>,
+    pub wo: SoABuffer<Vector3f>,
+    pub time: SoABuffer<Float>,
+    pub face_index: SoABuffer<i32>,
+    pub material: SoABuffer<Ptr<Material>>,
+    pub area_light: SoABuffer<Ptr<Light>>,
+    pub medium_interface: SoABuffer<MediumInterface>,
+    pub pixel_index: SoABuffer<u32>,
+    pub lambda: SoABuffer<SampledWavelengths>,
+    pub beta: SoABuffer<SampledSpectrum>,
+    pub r_u: SoABuffer<SampledSpectrum>,
+    pub any_non_specular_bounces: SoABuffer<u8>,
+    pub depth: SoABuffer<u32>,
+    pub eta_scale: SoABuffer<Float>,
+}
+
+impl SoA for MaterialEvalWorkItemSoA {
+    type Item = MaterialEvalWorkItem;
+
+    fn allocate(n: u32, alloc: &dyn SoAAllocator) -> Self {
+        Self {
+            p: alloc_soa_buffer(n, alloc),
+            n: alloc_soa_buffer(n, alloc),
+            ns: alloc_soa_buffer(n, alloc),
+            dpdu: alloc_soa_buffer(n, alloc),
+            dpdv: alloc_soa_buffer(n, alloc),
+            uv: alloc_soa_buffer(n, alloc),
+            wo: alloc_soa_buffer(n, alloc),
+            time: alloc_soa_buffer(n, alloc),
+            face_index: alloc_soa_buffer(n, alloc),
+            material: alloc_soa_buffer(n, alloc),
+            area_light: alloc_soa_buffer(n, alloc),
+            medium_interface: alloc_soa_buffer(n, alloc),
+            pixel_index: alloc_soa_buffer(n, alloc),
+            lambda: alloc_soa_buffer(n, alloc),
+            beta: alloc_soa_buffer(n, alloc),
+            r_u: alloc_soa_buffer(n, alloc),
+            any_non_specular_bounces: alloc_soa_buffer(n, alloc),
+            depth: alloc_soa_buffer(n, alloc),
+            eta_scale: alloc_soa_buffer(n, alloc),
+        }
+    }
+
+    unsafe fn get(&self, i: usize) -> MaterialEvalWorkItem {
+        MaterialEvalWorkItem {
+            p: self.p.get(i),
+            n: self.n.get(i),
+            ns: self.ns.get(i),
+            dpdu: self.dpdu.get(i),
+            dpdv: self.dpdv.get(i),
+            uv: self.uv.get(i),
+            wo: self.wo.get(i),
+            time: self.time.get(i),
+            face_index: self.face_index.get(i),
+            material: self.material.get(i),
+            area_light: self.area_light.get(i),
+            medium_interface: self.medium_interface.get(i),
+            pixel_index: self.pixel_index.get(i),
+            lambda: self.lambda.get(i),
+            beta: self.beta.get(i),
+            r_u: self.r_u.get(i),
+            any_non_specular_bounces: self.any_non_specular_bounces.get(i) != 0,
+            depth: self.depth.get(i),
+            eta_scale: self.eta_scale.get(i),
+        }
+    }
+
+    unsafe fn set(&self, i: usize, v: MaterialEvalWorkItem) {
+        self.p.set(i, v.p);
+        self.n.set(i, v.n);
+        self.ns.set(i, v.ns);
+        self.dpdu.set(i, v.dpdu);
+        self.dpdv.set(i, v.dpdv);
+        self.uv.set(i, v.uv);
+        self.wo.set(i, v.wo);
+        self.time.set(i, v.time);
+        self.face_index.set(i, v.face_index);
+        self.material.set(i, v.material);
+        self.area_light.set(i, v.area_light);
+        self.medium_interface.set(i, v.medium_interface);
+        self.pixel_index.set(i, v.pixel_index);
+        self.lambda.set(i, v.lambda);
+        self.beta.set(i, v.beta);
+        self.r_u.set(i, v.r_u);
+        self.any_non_specular_bounces
+            .set(i, v.any_non_specular_bounces as u8);
+        self.depth.set(i, v.depth);
+        self.eta_scale.set(i, v.eta_scale);
+    }
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug)]
+pub struct ShadowRayWorkItem {
+    pub ray_o: Point3f,
+    pub ray_d: Vector3f,
+    pub ray_time: Float,
+    pub t_max: Float,
+    pub lambda: SampledWavelengths,
+    pub l_d: SampledSpectrum,
+    pub pixel_index: u32,
+}
+
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct ShadowRayWorkItemSoA {
+    pub ray_o: SoABuffer<Point3f>,
+    pub ray_d: SoABuffer<Vector3f>,
+    pub ray_time: SoABuffer<Float>,
+    pub t_max: SoABuffer<Float>,
+    pub lambda: SoABuffer<SampledWavelengths>,
+    pub l_d: SoABuffer<SampledSpectrum>,
+    pub pixel_index: SoABuffer<u32>,
+}
+
+impl SoA for ShadowRayWorkItemSoA {
+    type Item = ShadowRayWorkItem;
+
+    fn allocate(n: u32, alloc: &dyn SoAAllocator) -> Self {
+        Self {
+            ray_o: alloc_soa_buffer(n, alloc),
+            ray_d: alloc_soa_buffer(n, alloc),
+            ray_time: alloc_soa_buffer(n, alloc),
+            t_max: alloc_soa_buffer(n, alloc),
+            lambda: alloc_soa_buffer(n, alloc),
+            l_d: alloc_soa_buffer(n, alloc),
+            pixel_index: alloc_soa_buffer(n, alloc),
+        }
+    }
+
+    unsafe fn get(&self, i: usize) -> ShadowRayWorkItem {
+        ShadowRayWorkItem {
+            ray_o: self.ray_o.get(i),
+            ray_d: self.ray_d.get(i),
+            ray_time: self.ray_time.get(i),
+            t_max: self.t_max.get(i),
+            lambda: self.lambda.get(i),
+            l_d: self.l_d.get(i),
+            pixel_index: self.pixel_index.get(i),
+        }
+    }
+
+    unsafe fn set(&self, i: usize, v: ShadowRayWorkItem) {
+        self.ray_o.set(i, v.ray_o);
+        self.ray_d.set(i, v.ray_d);
+        self.ray_time.set(i, v.ray_time);
+        self.t_max.set(i, v.t_max);
+        self.lambda.set(i, v.lambda);
+        self.l_d.set(i, v.l_d);
+        self.pixel_index.set(i, v.pixel_index);
+    }
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug)]
+pub struct MediumSampleWorkItem {
+    pub ray_o: Point3f,
+    pub ray_d: Vector3f,
+    pub ray_time: Float,
+    pub t_max: Float,
+    pub lambda: SampledWavelengths,
+    pub beta: SampledSpectrum,
+    pub r_u: SampledSpectrum,
+    pub r_l: SampledSpectrum,
+    pub medium: Ptr<Medium>,
+    pub pixel_index: u32,
+    pub depth: u32,
+    pub specular_bounce: bool,
+    pub any_non_specular_bounces: bool,
+    pub eta_scale: Float,
+    pub prev_intr_ctx: LightSampleContext,
+}
+
+pub type RayQueue = WorkQueue<RayWorkItemSoA>;
+pub type EscapedRayQueue = WorkQueue<EscapedRayWorkItemSoA>;
+pub type HitAreaLightQueue = WorkQueue<HitAreaLightWorkItemSoA>;
+pub type MaterialEvalQueue = WorkQueue<MaterialEvalWorkItemSoA>;
+pub type ShadowRayQueue = WorkQueue<ShadowRayWorkItemSoA>;