Spaces: Running on Zero

Commit · 8f04a1a
Parent(s): 133857a

Initial commit

Browse files
- gslrm/model/gslrm.py +3 -917
- gslrm/model/utils_losses.py +0 -309
- splat_viewer.html +0 -277
gslrm/model/gslrm.py CHANGED
@@ -22,11 +22,8 @@ Classes:
 """
 
 import copy
-import os
-import time
-from typing import Dict, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple
 
-import cv2
 import lpips
 import numpy as np
 import torch
@@ -35,17 +32,13 @@ import torch.nn.functional as F
 from easydict import EasyDict as edict
 from einops import rearrange
 from einops.layers.torch import Rearrange
-from PIL import Image
 
 # Local imports
 from .utils_losses import PerceptualLoss, SsimLoss
 from .gaussians_renderer import (
     GaussianModel,
-    RGB2SH,
     deferred_gaussian_render,
-    imageseq2video,
     render_opencv_cam,
-    render_turntable,
 )
 from .transform_data import SplitData, TransformInput, TransformTarget
 from .utils_transformer import (
@@ -225,238 +218,6 @@ class GaussiansUpsampler(nn.Module):
 
         return xyz, features, scaling, rotation, opacity
 
-
-class LossComputer(nn.Module):
-    """
-    Computes various loss functions for training the GSLRM model.
-
-    Supports multiple loss types:
-    - L2 (MSE) loss
-    - LPIPS perceptual loss
-    - Custom perceptual loss
-    - SSIM loss
-    - Pixel alignment loss
-    - Point distance regularization loss
-    """
-
-    def __init__(self, config: edict):
-        super().__init__()
-        self.config = config
-
-        # Initialize loss modules based on config
-        self._init_loss_modules()
-
-    def _init_loss_modules(self):
-        """Initialize the various loss computation modules."""
-        # LPIPS loss
-        if self.config.training.losses.lpips_loss_weight > 0.0:
-            self.lpips_loss_module = lpips.LPIPS(net="vgg")
-            self.lpips_loss_module.eval()
-            # Freeze LPIPS parameters
-            for param in self.lpips_loss_module.parameters():
-                param.requires_grad = False
-
-        # Perceptual loss
-        if self.config.training.losses.perceptual_loss_weight > 0.0:
-            self.perceptual_loss_module = PerceptualLoss()
-            self.perceptual_loss_module.eval()
-            # Freeze perceptual loss parameters
-            for param in self.perceptual_loss_module.parameters():
-                param.requires_grad = False
-
-        # SSIM loss
-        if self.config.training.losses.ssim_loss_weight > 0.0:
-            self.ssim_loss_module = SsimLoss()
-            self.ssim_loss_module.eval()
-            # Freeze SSIM parameters
-            for param in self.ssim_loss_module.parameters():
-                param.requires_grad = False
-
-    def forward(
-        self,
-        rendering: torch.Tensor,  # [b, v, 3, h, w]
-        target: torch.Tensor,  # [b, v, 3, h, w]
-        img_aligned_xyz: torch.Tensor,  # [b, v, 3, h, w]
-        input: edict,
-        result_softpa: Optional[edict] = None,
-        create_visual: bool = False,
-    ) -> edict:
-        """
-        Compute all losses between rendered and target images.
-
-        Args:
-            rendering: Rendered images in range [0, 1]
-            target: Target images in range [0, 1]
-            img_aligned_xyz: Image-aligned 3D positions
-            input: Input data containing ray information
-            result_softpa: Additional results (unused)
-            create_visual: Whether to create visualization images
-
-        Returns:
-            Dictionary containing all loss values and metrics
-        """
-        b, v, _, h, w = rendering.size()
-        rendering_flat = rendering.reshape(b * v, -1, h, w)
-        target_flat = target.reshape(b * v, -1, h, w)
-
-        # Handle alpha channel if present
-        mask = None
-        if target_flat.size(1) == 4:
-            target_flat, mask = target_flat.split([3, 1], dim=1)
-
-        # Compute individual losses
-        losses = self._compute_all_losses(
-            rendering_flat, target_flat, img_aligned_xyz, input, mask, b, v, h, w
-        )
-
-        # Compute total weighted loss
-        total_loss = self._compute_total_loss(losses)
-
-        # Create visualization if requested
-        visual = self._create_visual(rendering_flat, target_flat, v) if create_visual else None
-
-        # Compile loss metrics
-        return self._compile_loss_metrics(losses, total_loss, visual)
-
-    def _compute_all_losses(self, rendering, target, img_aligned_xyz, input, mask, b, v, h, w):
-        """Compute all individual loss components."""
-        losses = {}
-
-        # L2 (MSE) loss
-        losses['l2'] = self._compute_l2_loss(rendering, target)
-        losses['psnr'] = -10.0 * torch.log10(losses['l2'])
-
-        # LPIPS loss
-        losses['lpips'] = self._compute_lpips_loss(rendering, target)
-
-        # Perceptual loss
-        losses['perceptual'] = self._compute_perceptual_loss(rendering, target)
-
-        # SSIM loss
-        losses['ssim'] = self._compute_ssim_loss(rendering, target)
-
-        # Pixel alignment loss
-        losses['pixelalign'] = self._compute_pixelalign_loss(
-            img_aligned_xyz, input, mask, b, v, h, w
-        )
-
-        # Point distance loss
-        losses['pointsdist'] = self._compute_pointsdist_loss(
-            img_aligned_xyz, input, b, v, h, w
-        )
-
-        return losses
-
-    def _compute_l2_loss(self, rendering, target):
-        """Compute L2 (MSE) loss."""
-        if self.config.training.losses.l2_loss_weight > 0.0:
-            return F.mse_loss(rendering, target)
-        return torch.tensor(1e-8, device=rendering.device)
-
-    def _compute_lpips_loss(self, rendering, target):
-        """Compute LPIPS perceptual loss."""
-        if self.config.training.losses.lpips_loss_weight > 0.0:
-            # LPIPS expects inputs in range [-1, 1]
-            return self.lpips_loss_module(
-                rendering * 2.0 - 1.0, target * 2.0 - 1.0
-            ).mean()
-        return torch.tensor(0.0, device=rendering.device)
-
-    def _compute_perceptual_loss(self, rendering, target):
-        """Compute custom perceptual loss."""
-        if self.config.training.losses.perceptual_loss_weight > 0.0:
-            return self.perceptual_loss_module(rendering, target)
-        return torch.tensor(0.0, device=rendering.device)
-
-    def _compute_ssim_loss(self, rendering, target):
-        """Compute SSIM loss."""
-        if self.config.training.losses.ssim_loss_weight > 0.0:
-            return self.ssim_loss_module(rendering, target)
-        return torch.tensor(0.0, device=rendering.device)
-
-    def _compute_pixelalign_loss(self, img_aligned_xyz, input, mask, b, v, h, w):
-        """Compute pixel alignment loss."""
-        if self.config.training.losses.pixelalign_loss_weight > 0.0:
-            # Compute orthogonal component to ray direction
-            xyz_vec = img_aligned_xyz - input.ray_o
-            ortho_vec = (
-                xyz_vec
-                - torch.sum(xyz_vec.detach() * input.ray_d, dim=2, keepdim=True)
-                * input.ray_d
-            )
-
-            # Apply mask if enabled
-            if self.config.training.losses.get("masked_pixelalign_loss", False):
-                assert mask is not None, "mask is None but masked_pixelalign_loss is enabled"
-                mask_reshaped = mask.view(b, v, 1, h, w)
-                ortho_vec = ortho_vec * mask_reshaped
-
-            return torch.mean(ortho_vec.norm(dim=2, p=2))
-
-        return torch.tensor(0.0, device=img_aligned_xyz.device)
-
-    def _compute_pointsdist_loss(self, img_aligned_xyz, input, b, v, h, w):
-        """Compute point distance regularization loss."""
-        if self.config.training.losses.pointsdist_loss_weight > 0.0:
-            # Target mean distance (distance from origin to ray origin)
-            target_mean_dist = torch.norm(input.ray_o, dim=2, p=2, keepdim=True)
-            target_std_dist = 0.5
-
-            # Predicted distance
-            pred_dist = (img_aligned_xyz - input.ray_o).norm(dim=2, p=2, keepdim=True)
-
-            # Normalize to target distribution
-            pred_dist_detach = pred_dist.detach()
-            pred_mean = pred_dist_detach.mean(dim=(2, 3, 4), keepdim=True)
-            pred_std = pred_dist_detach.std(dim=(2, 3, 4), keepdim=True)
-
-            target_dist = (pred_dist_detach - pred_mean) / (pred_std + 1e-8) * target_std_dist + target_mean_dist
-
-            return torch.mean((pred_dist - target_dist) ** 2)
-
-        return torch.tensor(0.0, device=img_aligned_xyz.device)
-
-    def _compute_total_loss(self, losses):
-        """Compute weighted sum of all losses."""
-        weights = self.config.training.losses
-        return (
-            weights.l2_loss_weight * losses['l2']
-            + weights.lpips_loss_weight * losses['lpips']
-            + weights.perceptual_loss_weight * losses['perceptual']
-            + weights.ssim_loss_weight * losses['ssim']
-            + weights.pixelalign_loss_weight * losses['pixelalign']
-            + weights.pointsdist_loss_weight * losses['pointsdist']
-        )
-
-    def _create_visual(self, rendering, target, v):
-        """Create visualization by concatenating target and rendering."""
-        visual = torch.cat((target, rendering), dim=3).detach().cpu()  # [b*v, c, h, w*2]
-        visual = rearrange(visual, "(b v) c h (m w) -> (b h) (v m w) c", v=v, m=2)
-        return (visual.numpy() * 255.0).clip(0.0, 255.0).astype(np.uint8)
-
-    def _compile_loss_metrics(self, losses, total_loss, visual):
-        """Compile all loss metrics into a dictionary."""
-        l2_loss = losses['l2']
-
-        return edict(
-            loss=total_loss,
-            l2_loss=l2_loss,
-            psnr=losses['psnr'],
-            lpips_loss=losses['lpips'],
-            perceptual_loss=losses['perceptual'],
-            ssim_loss=losses['ssim'],
-            pixelalign_loss=losses['pixelalign'],
-            pointsdist_loss=losses['pointsdist'],
-            visual=visual,
-            # Normalized losses for logging
-            norm_perceptual_loss=losses['perceptual'] / l2_loss,
-            norm_lpips_loss=losses['lpips'] / l2_loss,
-            norm_ssim_loss=losses['ssim'] / l2_loss,
-            norm_pixelalign_loss=losses['pixelalign'] / l2_loss,
-            norm_pointsdist_loss=losses['pointsdist'] / l2_loss,
-        )
-
-
 class GSLRM(nn.Module):
     """
     Gaussian Splatting Large Reconstruction Model.
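For readers tracing what this deleted class expected, here is a minimal sketch (an illustration, not part of the commit) of the config the removed LossComputer consumed. The weight field names are copied from the diff above; the tensor shapes follow the `[b, v, 3, h, w]` comments on the forward signature.

```python
# Hypothetical usage sketch for the removed LossComputer; not part of this commit.
import torch
from easydict import EasyDict as edict

config = edict(
    training=edict(
        losses=edict(
            l2_loss_weight=1.0,
            lpips_loss_weight=0.0,        # 0.0 skips building the LPIPS VGG module
            perceptual_loss_weight=0.0,
            ssim_loss_weight=0.0,
            pixelalign_loss_weight=0.0,
            pointsdist_loss_weight=0.0,
        )
    )
)
# loss_computer = LossComputer(config)  # the class this commit removes

# With only the L2 weight active, the total loss reduces to the MSE term,
# and the logged PSNR is -10 * log10(mse), as in _compute_all_losses above.
b, v, h, w = 1, 2, 8, 8
rendering = torch.rand(b, v, 3, h, w)
target = torch.rand(b, v, 3, h, w)
mse = torch.nn.functional.mse_loss(rendering, target)
psnr = -10.0 * torch.log10(mse)
print(f"mse={mse.item():.4f}, psnr={psnr.item():.2f} dB")
```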
@@ -575,7 +336,6 @@ class GSLRM(nn.Module):
     def _init_rendering_modules(self, config: edict) -> None:
         """Initialize rendering and loss computation modules."""
         self.gaussian_renderer = Renderer(config)
-        self.loss_calculator = LossComputer(config)
 
     def _init_training_state(self, config: edict) -> None:
         """Initialize training state management variables."""
@@ -584,101 +344,6 @@
         self.training_max_step = None
         self.original_config = copy.deepcopy(config)
 
-    def set_training_step(self, current_step: int, start_step: int, max_step: int) -> None:
-        """
-        Update training step and dynamically adjust configuration based on training phase.
-
-        Args:
-            current_step: Current training step
-            start_step: Starting step of training
-            max_step: Maximum training steps
-        """
-        self.training_step = current_step
-        self.training_start_step = start_step
-        self.training_max_step = max_step
-
-        # Determine if config modification is needed based on warmup settings
-        needs_config_modification = self._should_modify_config_for_warmup(current_step)
-
-        if needs_config_modification:
-            # Always use original config as base for modifications
-            self.config = copy.deepcopy(self.original_config)
-            self._apply_warmup_modifications(current_step)
-        else:
-            # Restore original configuration
-            self.config = self.original_config
-
-        # Update loss calculator with current config
-        self.loss_calculator.config = self.config
-
-    def _should_modify_config_for_warmup(self, current_step: int) -> bool:
-        """Check if configuration should be modified for warmup phases."""
-        pointsdist_warmup = (
-            self.config.training.losses.get("warmup_pointsdist", False)
-            and current_step < 1000
-        )
-        l2_warmup = (
-            self.config.training.schedule.get("l2_warmup_steps", 0) > 0
-            and current_step < self.config.training.schedule.l2_warmup_steps
-        )
-        return pointsdist_warmup or l2_warmup
-
-    def _apply_warmup_modifications(self, current_step: int) -> None:
-        """Apply configuration modifications for warmup phases."""
-        # Point distance warmup phase
-        if (self.config.training.losses.get("warmup_pointsdist", False)
-                and current_step < 1000):
-            self.config.training.losses.l2_loss_weight = 0.0
-            self.config.training.losses.perceptual_loss_weight = 0.0
-            self.config.training.losses.pointsdist_loss_weight = 0.1
-            self.config.model.clip_xyz = False  # Disable xyz clipping during warmup
-
-        # L2 loss warmup phase
-        if (self.config.training.schedule.get("l2_warmup_steps", 0) > 0
-                and current_step < self.config.training.schedule.l2_warmup_steps):
-            self.config.training.losses.perceptual_loss_weight = 0.0
-            self.config.training.losses.lpips_loss_weight = 0.0
-
-    def set_current_step(self, current_step: int, start_step: int, max_step: int) -> None:
-        """Backward compatibility wrapper for set_training_step."""
-        self.set_training_step(current_step, start_step, max_step)
-
-    def train(self, mode: bool = True) -> None:
-        """
-        Override train method to keep frozen modules in eval mode.
-
-        Args:
-            mode: Whether to set training mode (True) or evaluation mode (False)
-        """
-        super().train(mode)
-        # Keep loss calculator in eval mode to prevent training of frozen components
-        if self.loss_calculator is not None:
-            self.loss_calculator.eval()
-
-    def get_parameter_overview(self) -> edict:
-        """
-        Get overview of trainable parameters in each module.
-
-        Returns:
-            Dictionary containing parameter counts for each major component
-        """
-        def count_trainable_params(module: nn.Module) -> int:
-            return sum(p.numel() for p in module.parameters() if p.requires_grad)
-
-        return edict(
-            patch_embedder=count_trainable_params(self.patch_embedder),
-            gaussian_position_embeddings=self.gaussian_position_embeddings.data.numel(),
-            transformer_total=(
-                count_trainable_params(self.transformer_layers) +
-                count_trainable_params(self.input_layer_norm)
-            ),
-            gaussian_upsampler=count_trainable_params(self.gaussian_upsampler),
-            pixel_gaussian_decoder=count_trainable_params(self.pixel_gaussian_decoder),
-        )
-
-    def get_overview(self) -> edict:
-        """Backward compatibility wrapper for get_parameter_overview."""
-        return self.get_parameter_overview()
 
     def _create_transformer_layer_runner(self, start_layer: int, end_layer: int):
         """
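To make the warmup gating removed above easier to follow, here is a standalone sketch (an illustration, not code from the repository) of the same decision logic; the 1000-step point-distance threshold and the `l2_warmup_steps` key come straight from the deleted methods.

```python
# Standalone re-statement of the deleted _should_modify_config_for_warmup logic.
def should_modify_config_for_warmup(losses: dict, schedule: dict, current_step: int) -> bool:
    # Point-distance warmup runs for the first 1000 steps when enabled.
    pointsdist_warmup = losses.get("warmup_pointsdist", False) and current_step < 1000
    # L2 warmup runs until l2_warmup_steps when that key is set and positive.
    l2_warmup = (
        schedule.get("l2_warmup_steps", 0) > 0
        and current_step < schedule.get("l2_warmup_steps", 0)
    )
    return pointsdist_warmup or l2_warmup

assert should_modify_config_for_warmup({"warmup_pointsdist": True}, {}, 500)
assert not should_modify_config_for_warmup({"warmup_pointsdist": True}, {}, 1500)
assert should_modify_config_for_warmup({}, {"l2_warmup_steps": 2000}, 1999)
assert not should_modify_config_for_warmup({}, {"l2_warmup_steps": 2000}, 2500)
```

Note that during either warmup phase the deleted code rebuilt `self.config` from a deep copy of `original_config` before applying the weight overrides, so modifications never accumulated across steps.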
@@ -843,149 +508,6 @@
 
         return aligned_positions
 
-    @staticmethod
-    def translate_legacy_state_dict(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
-        """
-        Translate legacy model parameter names to new parameter names.
-
-        This function allows loading models saved with the old variable names
-        by mapping them to the new, cleaner variable names.
-
-        Args:
-            state_dict: Dictionary containing model parameters with old names
-
-        Returns:
-            Dictionary with parameters mapped to new names
-        """
-        # Define the mapping from old names to new names
-        name_mapping = {
-            # Data processors
-            'split_data.': 'data_splitter.',
-            'transform_input.': 'input_transformer.',
-            'transform_target.': 'target_transformer.',
-
-            # Tokenizer
-            'image_tokenizer.': 'patch_embedder.',
-
-            # Positional embeddings
-            'refsrc_marker': 'view_type_embeddings',
-            'gaussians_pos_embedding': 'gaussian_position_embeddings',
-
-            # Transformer
-            'transformer_input_layernorm.': 'input_layer_norm.',
-            'transformer.': 'transformer_layers.',
-
-            # Gaussian modules
-            'upsampler.': 'gaussian_upsampler.',
-            'image_token_decoder.': 'pixel_gaussian_decoder.',
-
-            # Rendering modules
-            'renderer.': 'gaussian_renderer.',
-            'loss_computer.': 'loss_calculator.',
-        }
-
-        # Create new state dict with translated names
-        new_state_dict = {}
-
-        for old_key, value in state_dict.items():
-            new_key = old_key
-
-            # Apply name mappings
-            for old_pattern, new_pattern in name_mapping.items():
-                if old_key.startswith(old_pattern):
-                    new_key = old_key.replace(old_pattern, new_pattern, 1)
-                    break
-
-            # Fix specific key naming issues
-            # Change loss_computer.perceptual_loss_module.Net to loss_computer.perceptual_loss_module.net
-            if "loss_computer.perceptual_loss_module.Net" in new_key:
-                old_net_key = new_key
-                new_key = new_key.replace("loss_computer.perceptual_loss_module.Net", "loss_computer.perceptual_loss_module.net")
-                print(f"Renamed checkpoint key: {old_net_key} -> {new_key}")
-            # Also handle the new naming convention
-            elif "loss_calculator.perceptual_loss_module.Net" in new_key:
-                old_net_key = new_key
-                new_key = new_key.replace("loss_calculator.perceptual_loss_module.Net", "loss_calculator.perceptual_loss_module.net")
-                print(f"Renamed checkpoint key: {old_net_key} -> {new_key}")
-
-            new_state_dict[new_key] = value
-
-        return new_state_dict
-
-    def load_state_dict(self, state_dict: Dict[str, torch.Tensor], strict: bool = True):
-        """
-        Load model state dict with automatic legacy name translation.
-
-        Args:
-            state_dict: Model state dictionary (potentially with old parameter names)
-            strict: Whether to strictly enforce parameter name matching
-        """
-        # Check if this is a legacy state dict by looking for old parameter names
-        legacy_indicators = [
-            'image_tokenizer.',
-            'refsrc_marker',
-            'gaussians_pos_embedding',
-            'transformer_input_layernorm.',
-            'upsampler.',
-            'image_token_decoder.',
-            'renderer.',
-            'loss_computer.'
-        ]
-
-        is_legacy = any(
-            any(key.startswith(indicator) for key in state_dict.keys())
-            for indicator in legacy_indicators
-        )
-
-        if is_legacy:
-            print("Detected legacy model format. Translating parameter names...")
-            state_dict = self.translate_legacy_state_dict(state_dict)
-            print("Parameter name translation completed.")
-
-        # Load the (potentially translated) state dict
-        return super().load_state_dict(state_dict, strict=strict)
-
-    @classmethod
-    def load_from_checkpoint(
-        cls,
-        checkpoint_path: str,
-        config: edict,
-        map_location: Optional[str] = None
-    ) -> 'GSLRM':
-        """
-        Load model from checkpoint with automatic legacy name translation.
-
-        Args:
-            checkpoint_path: Path to the checkpoint file
-            config: Model configuration
-            map_location: Device to map tensors to (e.g., 'cpu', 'cuda:0')
-
-        Returns:
-            Loaded GSLRM model
-        """
-        # Create model instance
-        model = cls(config)
-
-        # Load checkpoint
-        checkpoint = torch.load(checkpoint_path, map_location=map_location)
-
-        # Extract state dict (handle different checkpoint formats)
-        if isinstance(checkpoint, dict):
-            if 'model_state_dict' in checkpoint:
-                state_dict = checkpoint['model_state_dict']
-            elif 'state_dict' in checkpoint:
-                state_dict = checkpoint['state_dict']
-            else:
-                state_dict = checkpoint
-        else:
-            state_dict = checkpoint
-
-        # Load state dict with automatic translation
-        model.load_state_dict(state_dict)
-
-        print(f"Successfully loaded model from {checkpoint_path}")
-        return model
-
     def _create_gaussian_models_and_stats(
         self,
         xyz: torch.Tensor,
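The deleted checkpoint-translation path is essentially a first-match prefix rewrite. Below is a trimmed, self-contained sketch of that mechanism; only three of the mapping entries are reproduced, all copied from the diff above.

```python
# Trimmed sketch of the deleted translate_legacy_state_dict mechanism.
# Order matters: Python dicts preserve insertion order, and the original
# listed 'transformer_input_layernorm.' before 'transformer.', so the more
# specific prefix wins.
NAME_MAPPING = {
    "image_tokenizer.": "patch_embedder.",
    "transformer_input_layernorm.": "input_layer_norm.",
    "transformer.": "transformer_layers.",
}

def translate(state_dict: dict) -> dict:
    translated = {}
    for old_key, value in state_dict.items():
        new_key = old_key
        for old_prefix, new_prefix in NAME_MAPPING.items():
            if old_key.startswith(old_prefix):
                new_key = old_key.replace(old_prefix, new_prefix, 1)
                break  # first matching prefix only, as in the deleted code
        translated[new_key] = value
    return translated

print(translate({"image_tokenizer.proj.weight": "w", "transformer.0.attn.bias": "b"}))
# {'patch_embedder.proj.weight': 'w', 'transformer_layers.0.attn.bias': 'b'}
```

After the prefix pass, the deleted implementation also special-cased the `perceptual_loss_module.Net` to `perceptual_loss_module.net` rename for the perceptual-loss submodule.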
@@ -1180,7 +702,6 @@
         )
 
         # Perform rendering and loss computation if target data is available
-        loss_metrics = None
         rendered_images = None
 
         if target_data is not None:
@@ -1193,17 +714,6 @@
                 C2W=target_data.c2w,
                 fxfycxcy=target_data.fxfycxcy,
             )
-
-            # Compute losses if rendered and target have matching dimensions
-            if rendered_images.shape[1] == target_data.image.shape[1]:
-                loss_metrics = self.loss_calculator(
-                    rendered_images,
-                    target_data.image,
-                    pixel_aligned_xyz,
-                    input_data,
-                    create_visual=create_visual,
-                    result_softpa=gaussian_splat_result,
-                )
 
         # Create Gaussian models for each batch item and compute usage statistics
         gaussian_models, pixel_aligned_positions, usage_statistics = self._create_gaussian_models_and_stats(
@@ -1211,12 +721,6 @@
             num_pixel_aligned_gaussians, num_views, height, width, patch_size
        )
 
-        # Add usage statistics to loss metrics for logging
-        if loss_metrics is not None:
-            loss_metrics.gaussians_usage = torch.tensor(
-                np.mean(np.array(usage_statistics))
-            ).float()
-
         # Compile final results
         return edict(
             input=input_data,
@@ -1224,424 +728,6 @@
             gaussians=gaussian_models,
             pixelalign_xyz=pixel_aligned_positions,
             img_tokens=image_patch_tokens,
-            loss_metrics=loss_metrics,
+            loss_metrics=None,
             render=rendered_images,
-        )
-
-    @torch.no_grad()
-    def save_visualization_outputs(
-        self,
-        output_directory: str,
-        model_results: edict,
-        batch_data: edict,
-        save_all_items: bool = False
-    ) -> None:
-        """
-        Save visualization outputs including rendered images and Gaussian models.
-
-        Args:
-            output_directory: Directory to save outputs
-            model_results: Results from model forward pass
-            batch_data: Original batch data
-            save_all_items: Whether to save all batch items or just the first
-        """
-        os.makedirs(output_directory, exist_ok=True)
-
-        input_data, target_data = model_results.input, model_results.target
-
-        # Save supervision visualization if available
-        if (model_results.loss_metrics is not None and
-                model_results.loss_metrics.visual is not None):
-
-            batch_uids = [
-                target_data.index[b, 0, -1].item()
-                for b in range(target_data.index.size(0))
-            ]
-
-            uid_range = f"{batch_uids[0]:08}_{batch_uids[-1]:08}"
-
-            # Save supervision comparison image
-            Image.fromarray(model_results.loss_metrics.visual).save(
-                os.path.join(output_directory, f"supervision_{uid_range}.jpg")
-            )
-
-            # Save UIDs for reference
-            with open(os.path.join(output_directory, "uids.txt"), "w") as f:
-                uid_string = "_".join([f"{uid:08}" for uid in batch_uids])
-                f.write(uid_string)
-
-            # Save input images
-            input_visualization = rearrange(
-                input_data.image, "batch views channels height width -> (batch height) (views width) channels"
-            )
-            input_visualization = (
-                (input_visualization.cpu().numpy() * 255.0).clip(0.0, 255.0).astype(np.uint8)
-            )
-            Image.fromarray(input_visualization[..., :3]).save(
-                os.path.join(output_directory, f"input_{uid_range}.jpg")
-            )
-
-        # Process each batch item individually
-        batch_size = input_data.image.size(0)
-        for batch_idx in range(batch_size):
-            item_uid = input_data.index[batch_idx, 0, -1].item()
-
-            # Render turntable visualization
-            turntable_image = render_turntable(model_results.gaussians[batch_idx])
-            Image.fromarray(turntable_image).save(
-                os.path.join(output_directory, f"turntable_{item_uid}.jpg")
-            )
-
-            # Save individual input images during inference
-            if self.config.inference:
-                individual_input = rearrange(
-                    input_data.image[batch_idx], "views channels height width -> height (views width) channels"
-                )
-                individual_input = (
-                    (individual_input.cpu().numpy() * 255.0).clip(0.0, 255.0).astype(np.uint8)
-                )
-                Image.fromarray(individual_input[..., :3]).save(
-                    os.path.join(output_directory, f"input_{item_uid}.jpg")
-                )
-
-            # Extract image dimensions and create opacity/depth visualizations
-            _, num_views, _, img_height, img_width = input_data.image.size()
-            patch_size = self.config.model.image_tokenizer.patch_size
-
-            # Get opacity values for pixel-aligned Gaussians
-            gaussian_opacity = model_results.gaussians[batch_idx].get_opacity
-            pixel_opacity = gaussian_opacity[-num_views * img_height * img_width:]
-
-            # Reshape opacity to image format
-            opacity_visualization = rearrange(
-                pixel_opacity,
-                "(views height width patch_h patch_w) channels -> (height patch_h) (views width patch_w) channels",
-                views=num_views,
-                height=img_height // patch_size,
-                width=img_width // patch_size,
-                patch_h=patch_size,
-                patch_w=patch_size,
-            ).squeeze(-1).cpu().numpy()
-            opacity_visualization = (opacity_visualization * 255.0).clip(0.0, 255.0).astype(np.uint8)
-
-            # Get 3D positions and compute depth visualization
-            gaussian_positions = model_results.gaussians[batch_idx].get_xyz
-            pixel_positions = gaussian_positions[-num_views * img_height * img_width:]
-
-            # Reshape positions to image format
-            pixel_positions_reshaped = rearrange(
-                pixel_positions,
-                "(views height width patch_h patch_w) coords -> views coords (height patch_h) (width patch_w)",
-                views=num_views,
-                height=img_height // patch_size,
-                width=img_width // patch_size,
-                patch_h=patch_size,
-                patch_w=patch_size,
-            )
-
-            # Compute distances from ray origins
-            ray_distances = (pixel_positions_reshaped - input_data.ray_o[batch_idx]).norm(dim=1, p=2)
-            distance_visualization = rearrange(ray_distances, "views height width -> height (views width)")
-            distance_visualization = distance_visualization.cpu().numpy()
-
-            # Normalize distances for visualization
-            dist_min, dist_max = distance_visualization.min(), distance_visualization.max()
-            distance_visualization = (distance_visualization - dist_min) / (dist_max - dist_min)
-            distance_visualization = (distance_visualization * 255.0).clip(0.0, 255.0).astype(np.uint8)
-
-            # Combine opacity and depth visualizations
-            combined_visualization = np.concatenate([opacity_visualization, distance_visualization], axis=0)
-            Image.fromarray(combined_visualization).save(
-                os.path.join(output_directory, f"aligned_gs_opacity_depth_{item_uid}.jpg")
-            )
-
-            # Save unfiltered Gaussian model for small images during early training
-            if (self.config.model.image_tokenizer.image_size <= 256 and
-                    self.training_step is not None and self.training_step <= 5000):
-                model_results.gaussians[batch_idx].save_ply(
-                    os.path.join(output_directory, f"gaussians_{item_uid}_unfiltered.ply")
-                )
-
-            # Save filtered Gaussian model
-            camera_origins = None  # Could use input_data.ray_o[batch_idx, :, :, 0, 0] if needed
-            default_crop_box = [-0.91, 0.91, -0.91, 0.91, -0.91, 0.91]
-
-            model_results.gaussians[batch_idx].apply_all_filters(
-                opacity_thres=0.02,
-                crop_bbx=default_crop_box,
-                cam_origins=camera_origins,
-                nearfar_percent=(0.0001, 1.0),
-            ).save_ply(os.path.join(output_directory, f"gaussians_{item_uid}.ply"))
-
-            print(f"Saved visualization for UID: {item_uid}")
-
-            # Break after first item unless saving all
-            if not save_all_items:
-                break
-
-    @torch.no_grad()
-    def save_visuals(self, out_dir: str, result: edict, batch: edict, save_all: bool = False) -> None:
-        """Backward compatibility wrapper for save_visualization_outputs."""
-        self.save_visualization_outputs(out_dir, result, batch, save_all)
-
-    @torch.no_grad()
-    def save_evaluation_results(
-        self,
-        output_directory: str,
-        model_results: edict,
-        batch_data: edict,
-        dataset
-    ) -> None:
-        """Save comprehensive evaluation results including metrics, visualizations, and 3D models."""
-        from .utils_metrics import compute_psnr, compute_lpips, compute_ssim
-
-        os.makedirs(output_directory, exist_ok=True)
-        input_data, target_data = model_results.input, model_results.target
-
-        for batch_idx in range(input_data.image.size(0)):
-            item_uid = input_data.index[batch_idx, 0, -1].item()
-            item_output_dir = os.path.join(output_directory, f"{item_uid:08d}")
-            os.makedirs(item_output_dir, exist_ok=True)
-
-            # Save input image
-            input_image = rearrange(
-                input_data.image[batch_idx], "views channels height width -> height (views width) channels"
-            )
-            input_image = (input_image.cpu().numpy() * 255.0).clip(0.0, 255.0).astype(np.uint8)
-            Image.fromarray(input_image[..., :3]).save(os.path.join(item_output_dir, "input.png"))
-
-            # Save ground truth vs prediction comparison
-            comparison_image = torch.stack((target_data.image[batch_idx], model_results.render[batch_idx]), dim=0)
-            num_views = comparison_image.size(1)
-            if num_views > 10:
-                comparison_image = comparison_image[:, ::num_views // 10, :, :, :]
-            comparison_image = rearrange(
-                comparison_image, "comparison_type views channels height width -> (comparison_type height) (views width) channels"
-            )
-            comparison_image = (comparison_image.cpu().numpy() * 255.0).clip(0.0, 255.0).astype(np.uint8)
-            Image.fromarray(comparison_image).save(os.path.join(item_output_dir, "gt_vs_pred.png"))
-
-            # Compute and save metrics
-            per_view_psnr = compute_psnr(target_data.image[batch_idx], model_results.render[batch_idx])
-            per_view_lpips = compute_lpips(target_data.image[batch_idx], model_results.render[batch_idx])
-            per_view_ssim = compute_ssim(target_data.image[batch_idx], model_results.render[batch_idx])
-
-            # Save per-view metrics
-            view_ids = target_data.index[batch_idx, :, 0].cpu().numpy()
-            with open(os.path.join(item_output_dir, "perview_metrics.txt"), "w") as f:
-                for i in range(per_view_psnr.size(0)):
-                    f.write(
-                        f"view {view_ids[i]:0>6}, psnr: {per_view_psnr[i].item():.4f}, "
-                        f"lpips: {per_view_lpips[i].item():.4f}, ssim: {per_view_ssim[i].item():.4f}\n"
-                    )
-
-            # Save average metrics
-            avg_psnr = per_view_psnr.mean().item()
-            avg_lpips = per_view_lpips.mean().item()
-            avg_ssim = per_view_ssim.mean().item()
-
-            with open(os.path.join(item_output_dir, "metrics.txt"), "w") as f:
-                f.write(f"psnr: {avg_psnr:.4f}\nlpips: {avg_lpips:.4f}\nssim: {avg_ssim:.4f}\n")
-
-            print(f"UID {item_uid}: PSNR={avg_psnr:.4f}, LPIPS={avg_lpips:.4f}, SSIM={avg_ssim:.4f}")
-
-            # Save Gaussian model
-            crop_box = None
-            if self.config.model.get("clip_xyz", False):
-                if self.config.model.get("half_bbx_size", None) is not None:
-                    half_size = self.config.model.half_bbx_size
-                    crop_box = [-half_size, half_size, -half_size, half_size, -half_size, half_size]
-                else:
-                    crop_box = [-0.91, 0.91, -0.91, 0.91, -0.91, 0.91]
-
-            model_results.gaussians[batch_idx].apply_all_filters(
-                opacity_thres=0.02, crop_bbx=crop_box, cam_origins=None, nearfar_percent=(0.0001, 1.0)
-            ).save_ply(os.path.join(item_output_dir, "gaussians.ply"))
-
-            # Create turntable visualization
-            num_turntable_views = 150
-            render_resolution = input_image.shape[0]
-
-            turntable_frames = render_turntable(
-                model_results.gaussians[batch_idx], rendering_resolution=render_resolution, num_views=num_turntable_views
-            )
-            turntable_frames = rearrange(
-                turntable_frames, "height (views width) channels -> views height width channels", views=num_turntable_views
-            )
-            turntable_frames = np.ascontiguousarray(turntable_frames)
-
-            # Save basic turntable video
-            imageseq2video(turntable_frames, os.path.join(item_output_dir, "turntable.mp4"), fps=30)
-
-            # Save description and preview if available
-            try:
-                description = dataset.get_description(item_uid)["prompt"]
-                if len(description) > 0:
-                    with open(os.path.join(item_output_dir, "description.txt"), "w") as f:
-                        f.write(description)
-
-                    # Create preview image (subsample to 10 views)
-                    preview_frames = turntable_frames[::num_turntable_views // 10]
-                    preview_image = rearrange(preview_frames, "views height width channels -> height (views width) channels")
-                    Image.fromarray(preview_image).save(os.path.join(item_output_dir, "turntable_preview.png"))
-            except (AttributeError, KeyError):
-                pass
-
-            # Create turntable with input overlay
-            border_width = 2
-            target_width = render_resolution
-            target_height = int(input_image.shape[0] / input_image.shape[1] * target_width)
-
-            resized_input = cv2.resize(
-                input_image, (target_width - border_width * 2, target_height - border_width * 2), interpolation=cv2.INTER_AREA
-            )
-            bordered_input = np.pad(
-                resized_input, ((border_width, border_width), (border_width, border_width), (0, 0)),
-                mode="constant", constant_values=200
-            )
-
-            input_sequence = np.tile(bordered_input[None], (turntable_frames.shape[0], 1, 1, 1))
-            combined_frames = np.concatenate((turntable_frames, input_sequence), axis=1)
-
-            imageseq2video(combined_frames, os.path.join(item_output_dir, "turntable_with_input.mp4"), fps=30)
-
-    @torch.no_grad()
-    def save_evaluations(self, out_dir: str, result: edict, batch: edict, dataset) -> None:
-        """Backward compatibility wrapper for save_evaluation_results."""
-        self.save_evaluation_results(out_dir, result, batch, dataset)
-
-    @torch.no_grad()
-    def save_validation_results(
-        self,
-        output_directory: str,
-        model_results: edict,
-        batch_data: edict,
-        dataset,
-        save_visualizations: bool = False
-    ) -> Dict[str, float]:
-        """Save validation results and compute aggregated metrics."""
-        from .utils_metrics import compute_psnr, compute_lpips, compute_ssim
-
-        os.makedirs(output_directory, exist_ok=True)
-        input_data, target_data = model_results.input, model_results.target
-        validation_metrics = {"psnr": [], "lpips": [], "ssim": []}
-
-        for batch_idx in range(input_data.image.size(0)):
-            item_uid = input_data.index[batch_idx, 0, -1].item()
-            should_save_visuals = (batch_idx == 0) and save_visualizations
-
-            # Compute metrics (RGB only)
-            target_image = target_data.image[batch_idx][:, :3, ...]
-            per_view_psnr = compute_psnr(target_image, model_results.render[batch_idx])
-            per_view_lpips = compute_lpips(target_image, model_results.render[batch_idx])
-            per_view_ssim = compute_ssim(target_image, model_results.render[batch_idx])
-
-            avg_psnr = per_view_psnr.mean().item()
-            avg_lpips = per_view_lpips.mean().item()
-            avg_ssim = per_view_ssim.mean().item()
-
-            validation_metrics["psnr"].append(avg_psnr)
-            validation_metrics["lpips"].append(avg_lpips)
-            validation_metrics["ssim"].append(avg_ssim)
-
-            # Save visualizations only for first item if requested
-            if should_save_visuals:
-                item_output_dir = os.path.join(output_directory, f"{item_uid:08d}")
-                os.makedirs(item_output_dir, exist_ok=True)
-
-                # Save input image
-                input_image = rearrange(
-                    input_data.image[batch_idx][:, :3, ...], "views channels height width -> height (views width) channels"
-                )
-                input_image = (input_image.cpu().numpy() * 255.0).clip(0.0, 255.0).astype(np.uint8)
-                Image.fromarray(input_image).save(os.path.join(item_output_dir, "input.png"))
-
-                # Save ground truth vs prediction comparison
-                comparison_image = torch.stack((target_image, model_results.render[batch_idx]), dim=0)
-                num_views = comparison_image.size(1)
-                if num_views > 10:
-                    comparison_image = comparison_image[:, ::num_views // 10, :, :, :]
-                comparison_image = rearrange(
-                    comparison_image, "comparison_type views channels height width -> (comparison_type height) (views width) channels"
-                )
-                comparison_image = (comparison_image.cpu().numpy() * 255.0).clip(0.0, 255.0).astype(np.uint8)
-                Image.fromarray(comparison_image).save(os.path.join(item_output_dir, "gt_vs_pred.png"))
-
-                # Save per-view metrics
-                view_ids = target_data.index[batch_idx, :, 0].cpu().numpy()
-                with open(os.path.join(item_output_dir, "perview_metrics.txt"), "w") as f:
-                    for i in range(per_view_psnr.size(0)):
-                        f.write(
-                            f"view {view_ids[i]:0>6}, psnr: {per_view_psnr[i].item():.4f}, "
-                            f"lpips: {per_view_lpips[i].item():.4f}, ssim: {per_view_ssim[i].item():.4f}\n"
-                        )
-
-                # Save averaged metrics
-                with open(os.path.join(item_output_dir, "metrics.txt"), "w") as f:
-                    f.write(f"psnr: {avg_psnr:.4f}\nlpips: {avg_lpips:.4f}\nssim: {avg_ssim:.4f}\n")
-
-                print(f"Validation UID {item_uid}: PSNR={avg_psnr:.4f}, LPIPS={avg_lpips:.4f}, SSIM={avg_ssim:.4f}")
-
-                # Save Gaussian model
-                crop_box = None
-                if self.config.model.get("clip_xyz", False):
-                    if self.config.model.get("half_bbx_size", None) is not None:
-                        half_size = self.config.model.half_bbx_size
-                        crop_box = [-half_size, half_size, -half_size, half_size, -half_size, half_size]
-                    else:
-                        crop_box = [-0.91, 0.91, -0.91, 0.91, -0.91, 0.91]
-
-                model_results.gaussians[batch_idx].apply_all_filters(
-                    opacity_thres=0.02, crop_bbx=crop_box, cam_origins=None, nearfar_percent=(0.0001, 1.0)
-                ).save_ply(os.path.join(item_output_dir, "gaussians.ply"))
-
-                # Create turntable visualization
-                num_turntable_views = 150
-                render_resolution = input_image.shape[0]
-
-                turntable_frames = render_turntable(
-                    model_results.gaussians[batch_idx], rendering_resolution=render_resolution, num_views=num_turntable_views
-                )
-                turntable_frames = rearrange(
-                    turntable_frames, "height (views width) channels -> views height width channels", views=num_turntable_views
-                )
-                turntable_frames = np.ascontiguousarray(turntable_frames)
-
-                imageseq2video(turntable_frames, os.path.join(item_output_dir, "turntable.mp4"), fps=30)
-
-                # Create turntable with input overlay
-                border_width = 2
-                target_width = render_resolution
-                target_height = int(input_image.shape[0] / input_image.shape[1] * target_width)
-
-                resized_input = cv2.resize(
-                    input_image, (target_width - border_width * 2, target_height - border_width * 2), interpolation=cv2.INTER_AREA
-                )
-                bordered_input = np.pad(
-                    resized_input, ((border_width, border_width), (border_width, border_width), (0, 0)),
-                    mode="constant", constant_values=200
-                )
-
-                input_sequence = np.tile(bordered_input[None], (turntable_frames.shape[0], 1, 1, 1))
-                combined_frames = np.concatenate((turntable_frames, input_sequence), axis=1)
-
-                imageseq2video(combined_frames, os.path.join(item_output_dir, "turntable_with_input.mp4"), fps=30)
-
-        # Return averaged metrics
-        return {
-            "psnr": torch.tensor(validation_metrics["psnr"]).mean().item(),
-            "lpips": torch.tensor(validation_metrics["lpips"]).mean().item(),
-            "ssim": torch.tensor(validation_metrics["ssim"]).mean().item(),
-        }
-
-    @torch.no_grad()
-    def save_validations(
-        self,
-        out_dir: str,
-        result: edict,
-        batch: edict,
-        dataset,
-        save_img: bool = False
-    ) -> Dict[str, float]:
-        """Backward compatibility wrapper for save_validation_results."""
-        return self.save_validation_results(out_dir, result, batch, dataset, save_img)
+        )
gslrm/model/utils_losses.py
DELETED
@@ -1,309 +0,0 @@
# Copyright (C) 2025, FaceLift Research Group
# https://github.com/weijielyu/FaceLift
#
# This software is free for non-commercial, research and evaluation use
# under the terms of the LICENSE.md file.
#
# For inquiries contact: wlyu3@ucmerced.edu

"""
Perceptual Loss Implementation using VGG19 and SSIM Loss Implementation.

Adapted from https://github.com/zhengqili/Crowdsampling-the-Plenoptic-Function/blob/f5216f312cf82d77f8d20454b5eeb3930324630a/models/networks.py#L1478
"""
import os
from typing import List, Optional, Tuple, Union

import scipy.io
import torch
import torch.nn as nn
from pytorch_msssim import SSIM

# VGG19 ImageNet normalization constants (per-channel mean on the [0, 255] scale)
IMAGENET_MEAN = [123.6800, 116.7790, 103.9390]

# VGG19 layer configuration
VGG19_LAYER_INDICES = [0, 2, 5, 7, 10, 12, 14, 16, 19, 21, 23, 25, 28, 30, 32, 34]
VGG19_LAYER_NAMES = [
    "conv1", "conv2", "conv3", "conv4", "conv5", "conv6", "conv7", "conv8",
    "conv9", "conv10", "conv11", "conv12", "conv13", "conv14", "conv15", "conv16",
]
VGG19_CHANNEL_SIZES = [64, 64, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 512]

# Perceptual loss weighting factors (raw image term first, then one per feature scale)
LAYER_WEIGHTS = [1.0, 1 / 2.6, 1 / 4.8, 1 / 3.7, 1 / 5.6, 10 / 1.5]


class VGG19(nn.Module):
    """
    VGG19 network implementation for perceptual loss computation.

    This class implements the VGG19 architecture with specific layer outputs
    used for computing perceptual losses at different scales.
    """

    def __init__(self) -> None:
        """Initialize VGG19 network layers."""
        super().__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=True)
        self.relu1 = nn.ReLU(inplace=True)

        # NOTE: the "max*" layers intentionally use average pooling,
        # following the reference implementation this file was adapted from.
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=True)
        self.relu2 = nn.ReLU(inplace=True)
        self.max1 = nn.AvgPool2d(kernel_size=2, stride=2)

        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1, bias=True)
        self.relu3 = nn.ReLU(inplace=True)

        self.conv4 = nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=True)
        self.relu4 = nn.ReLU(inplace=True)
        self.max2 = nn.AvgPool2d(kernel_size=2, stride=2)

        self.conv5 = nn.Conv2d(128, 256, kernel_size=3, padding=1, bias=True)
        self.relu5 = nn.ReLU(inplace=True)

        self.conv6 = nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=True)
        self.relu6 = nn.ReLU(inplace=True)

        self.conv7 = nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=True)
        self.relu7 = nn.ReLU(inplace=True)

        self.conv8 = nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=True)
        self.relu8 = nn.ReLU(inplace=True)
        self.max3 = nn.AvgPool2d(kernel_size=2, stride=2)

        self.conv9 = nn.Conv2d(256, 512, kernel_size=3, padding=1, bias=True)
        self.relu9 = nn.ReLU(inplace=True)

        self.conv10 = nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=True)
        self.relu10 = nn.ReLU(inplace=True)

        self.conv11 = nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=True)
        self.relu11 = nn.ReLU(inplace=True)

        self.conv12 = nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=True)
        self.relu12 = nn.ReLU(inplace=True)
        self.max4 = nn.AvgPool2d(kernel_size=2, stride=2)

        self.conv13 = nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=True)
        self.relu13 = nn.ReLU(inplace=True)

        self.conv14 = nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=True)
        self.relu14 = nn.ReLU(inplace=True)

        self.conv15 = nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=True)
        self.relu15 = nn.ReLU(inplace=True)

        self.conv16 = nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=True)
        self.relu16 = nn.ReLU(inplace=True)
        self.max5 = nn.AvgPool2d(kernel_size=2, stride=2)

    def forward(self, x: torch.Tensor, return_style: int) -> Union[List[torch.Tensor], Tuple[torch.Tensor, ...]]:
        """
        Forward pass through VGG19 network.

        Args:
            x: Input tensor of shape [B, 3, H, W]
            return_style: If > 0, return style features as a list; otherwise return content features as a tuple

        Returns:
            Either a list of style features or a tuple of content features from different layers
        """
        out1 = self.conv1(x)
        out2 = self.relu1(out1)

        out3 = self.conv2(out2)
        out4 = self.relu2(out3)
        out5 = self.max1(out4)

        out6 = self.conv3(out5)
        out7 = self.relu3(out6)
        out8 = self.conv4(out7)
        out9 = self.relu4(out8)
        out10 = self.max2(out9)
        out11 = self.conv5(out10)
        out12 = self.relu5(out11)
        out13 = self.conv6(out12)
        out14 = self.relu6(out13)
        out15 = self.conv7(out14)
        out16 = self.relu7(out15)
        out17 = self.conv8(out16)
        out18 = self.relu8(out17)
        out19 = self.max3(out18)
        out20 = self.conv9(out19)
        out21 = self.relu9(out20)
        out22 = self.conv10(out21)
        out23 = self.relu10(out22)
        out24 = self.conv11(out23)
        out25 = self.relu11(out24)
        out26 = self.conv12(out25)
        out27 = self.relu12(out26)
        out28 = self.max4(out27)
        out29 = self.conv13(out28)
        out30 = self.relu13(out29)
        out31 = self.conv14(out30)
        out32 = self.relu14(out31)

        if return_style > 0:
            return [out2, out7, out12, out21, out30]
        else:
            return out4, out9, out14, out23, out32


class PerceptualLoss(nn.Module):
    """
    Perceptual Loss module using pre-trained VGG19.

    This class implements perceptual loss by comparing features extracted from
    different layers of a pre-trained VGG19 network. It computes weighted
    differences across multiple scales to capture both low-level and high-level
    visual differences between images.
    """

    def __init__(self, device: str = "cpu", weight_file: Optional[str] = None) -> None:
        """
        Initialize PerceptualLoss module.

        Args:
            device: Device to run computations on ('cpu' or 'cuda')
            weight_file: Path to the VGG19 weight file. If None, falls back to the
                VGG19_WEIGHTS_PATH environment variable, then to a default path.

        Raises:
            FileNotFoundError: If the weight file is not found
            RuntimeError: If the weight file cannot be loaded
        """
        super().__init__()
        self.device = device
        self.net = VGG19()

        # Determine weight file path
        if weight_file is None:
            # Check environment variable first
            weight_file = os.environ.get("VGG19_WEIGHTS_PATH")
            if weight_file is None:
                # Fall back to the default path
                weight_file = "/sensei-fs/users/kaiz/repos/weight-collections/imagenet-vgg-verydeep-19.mat"

        # Load VGG19 weights
        if not os.path.isfile(weight_file):
            raise FileNotFoundError(
                f"VGG19 weight file not found: {weight_file}\n"
                f"Download it from: https://www.vlfeat.org/matconvnet/models/imagenet-vgg-verydeep-19.mat\n"
                f"Expected MD5: 106118b7cf60435e6d8e04f6a6dc3657"
            )

        try:
            vgg_rawnet = scipy.io.loadmat(weight_file)
            vgg_layers = vgg_rawnet["layers"][0]
        except Exception as e:
            raise RuntimeError(f"Failed to load VGG19 weights from {weight_file}: {e}")

        # Load pre-trained weights into the network
        self._load_pretrained_weights(vgg_layers)

        # Set network to evaluation mode and freeze parameters
        self.net = self.net.eval().to(device)
        for param in self.net.parameters():
            param.requires_grad = False

    def _load_pretrained_weights(self, vgg_layers) -> None:
        """Load pre-trained VGG19 weights into the network."""
        for layer_idx in range(len(VGG19_LAYER_NAMES)):
            layer_name = VGG19_LAYER_NAMES[layer_idx]
            mat_layer_idx = VGG19_LAYER_INDICES[layer_idx]
            channel_size = VGG19_CHANNEL_SIZES[layer_idx]

            # Extract weights and biases from the MATLAB (MatConvNet) format
            layer_weights = torch.from_numpy(
                vgg_layers[mat_layer_idx][0][0][2][0][0]
            ).permute(3, 2, 0, 1)
            layer_biases = torch.from_numpy(
                vgg_layers[mat_layer_idx][0][0][2][0][1]
            ).view(channel_size)

            # Assign to network
            getattr(self.net, layer_name).weight = nn.Parameter(layer_weights)
            getattr(self.net, layer_name).bias = nn.Parameter(layer_biases)

    def _compute_l1_error(self, truth: torch.Tensor, pred: torch.Tensor) -> torch.Tensor:
        """
        Compute L1 (Mean Absolute Error) between two tensors.

        Args:
            truth: Ground truth tensor
            pred: Predicted tensor

        Returns:
            L1 error as a scalar tensor
        """
        return torch.mean(torch.abs(truth - pred))

    def forward(self, pred_img: torch.Tensor, real_img: torch.Tensor) -> torch.Tensor:
        """
        Compute perceptual loss between predicted and real images.

        Args:
            pred_img: Predicted image tensor of shape [B, 3, H, W] in range [0, 1]
            real_img: Real image tensor of shape [B, 3, H, W] in range [0, 1]

        Returns:
            Perceptual loss as a scalar tensor
        """
        # Normalize inputs the way the MatConvNet VGG19 expects: scale to
        # [0, 255] and subtract the per-channel ImageNet mean. (Channels are
        # kept in RGB order; no BGR reordering is applied here.)
        imagenet_mean = torch.tensor(IMAGENET_MEAN, dtype=torch.float32, device=pred_img.device)
        imagenet_mean = imagenet_mean.view(1, 3, 1, 1)

        real_img_normalized = real_img * 255.0 - imagenet_mean
        pred_img_normalized = pred_img * 255.0 - imagenet_mean

        # Extract features from both images
        real_features = self.net(real_img_normalized, return_style=0)
        pred_features = self.net(pred_img_normalized, return_style=0)

        # Compute weighted L1 losses at different scales
        losses = []

        # Raw image loss
        raw_loss = self._compute_l1_error(real_img_normalized, pred_img_normalized)
        losses.append(raw_loss * LAYER_WEIGHTS[0])

        # Feature losses at different VGG layers
        for i, (real_feat, pred_feat) in enumerate(zip(real_features, pred_features)):
            feature_loss = self._compute_l1_error(real_feat, pred_feat)
            losses.append(feature_loss * LAYER_WEIGHTS[i + 1])

        # Combine all losses and normalize
        total_loss = sum(losses) / 255.0
        return total_loss


class SsimLoss(nn.Module):
    """
    SSIM Loss module that computes 1 - SSIM for image similarity.

    Args:
        data_range: Range of input data (default: 1.0 for [0, 1] range)
    """

    def __init__(self, data_range: float = 1.0) -> None:
        super().__init__()
        self.data_range = data_range
        self.ssim_module = SSIM(
            win_size=11,
            win_sigma=1.5,
            data_range=self.data_range,
            size_average=True,
            channel=3,
        )

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """
        Compute SSIM loss between two image tensors.

        Args:
            x: Image tensor of shape (N, C, H, W)
            y: Image tensor of shape (N, C, H, W)

        Returns:
            SSIM loss (1 - SSIM similarity)
        """
        return 1.0 - self.ssim_module(x, y)
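Both removed modules are self-contained, so they can still be exercised on their own. The sketch below is a hypothetical usage example, not part of the commit: the input tensors and weight path are made up, and the import path reflects the pre-deletion file location. It shows the intended call pattern: RGB images in [0, 1] with NCHW layout go in, scalar losses come out. SsimLoss only needs pytorch_msssim; PerceptualLoss additionally needs the MatConvNet .mat weights referenced in the error message above.

import torch

from gslrm.model.utils_losses import PerceptualLoss, SsimLoss  # pre-deletion import path

# Hypothetical inputs: batches of RGB images in [0, 1], shape [B, 3, H, W].
pred = torch.rand(2, 3, 256, 256)
target = torch.rand(2, 3, 256, 256)

ssim = SsimLoss()
print(float(ssim(pred, target)))  # 1 - SSIM; close to 1.0 for uncorrelated noise

# Hypothetical local path; alternatively set the VGG19_WEIGHTS_PATH env var.
perceptual = PerceptualLoss(weight_file="imagenet-vgg-verydeep-19.mat")
print(float(perceptual(pred, target)))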
splat_viewer.html
DELETED
@@ -1,277 +0,0 @@
<!DOCTYPE html>
<html lang="en" dir="ltr">
  <head>
    <title>WebGL Gaussian Splat Viewer</title>
    <meta charset="utf-8" />
    <meta
      name="viewport"
      content="width=device-width, initial-scale=1, minimum-scale=1, maximum-scale=1, user-scalable=no"
    />
    <meta name="apple-mobile-web-app-capable" content="yes" />
    <meta
      name="apple-mobile-web-app-status-bar-style"
      content="black-translucent"
    />
    <style>
      body {
        overflow: hidden;
        margin: 0;
        height: 100vh;
        width: 100vw;
        font-family: sans-serif;
        background: black;
        text-shadow: 0 0 3px black;
      }
      a, body {
        color: white;
      }
      #info {
        z-index: 100;
        position: absolute;
        top: 10px;
        left: 15px;
      }
      h3 {
        margin: 5px 0;
      }
      p {
        margin: 5px 0;
        font-size: small;
      }

      .cube-wrapper {
        transform-style: preserve-3d;
      }

      .cube {
        transform-style: preserve-3d;
        transform: rotateX(45deg) rotateZ(45deg);
        animation: rotation 2s infinite;
      }

      .cube-faces {
        transform-style: preserve-3d;
        height: 80px;
        width: 80px;
        position: relative;
        transform-origin: 0 0;
        transform: translateX(0) translateY(0) translateZ(-40px);
      }

      .cube-face {
        position: absolute;
        inset: 0;
        background: #0017ff;
        border: solid 1px #ffffff;
      }
      .cube-face.top {
        transform: translateZ(80px);
      }
      .cube-face.front {
        transform-origin: 0 50%;
        transform: rotateY(-90deg);
      }
      .cube-face.back {
        transform-origin: 0 50%;
        transform: rotateY(-90deg) translateZ(-80px);
      }
      .cube-face.right {
        transform-origin: 50% 0;
        transform: rotateX(-90deg) translateY(-80px);
      }
      .cube-face.left {
        transform-origin: 50% 0;
        transform: rotateX(-90deg) translateY(-80px) translateZ(80px);
      }

      @keyframes rotation {
        0% {
          transform: rotateX(45deg) rotateY(0) rotateZ(45deg);
          animation-timing-function: cubic-bezier(0.17, 0.84, 0.44, 1);
        }
        50% {
          transform: rotateX(45deg) rotateY(0) rotateZ(225deg);
          animation-timing-function: cubic-bezier(0.76, 0.05, 0.86, 0.06);
        }
        100% {
          transform: rotateX(45deg) rotateY(0) rotateZ(405deg);
          animation-timing-function: cubic-bezier(0.17, 0.84, 0.44, 1);
        }
      }

      .scene,
      #message {
        position: absolute;
        display: flex;
        top: 0;
        right: 0;
        left: 0;
        bottom: 0;
        z-index: 2;
        height: 100%;
        width: 100%;
        align-items: center;
        justify-content: center;
      }
      #message {
        font-weight: bold;
        font-size: large;
        color: red;
        pointer-events: none;
      }

      details {
        font-size: small;
      }

      #progress {
        position: absolute;
        top: 0;
        height: 5px;
        background: blue;
        z-index: 99;
        transition: width 0.1s ease-in-out;
      }

      #quality {
        position: absolute;
        bottom: 10px;
        z-index: 999;
        right: 10px;
      }

      #caminfo {
        position: absolute;
        top: 10px;
        z-index: 999;
        right: 10px;
      }
      #canvas {
        display: block;
        position: absolute;
        top: 0;
        left: 0;
        width: 100%;
        height: 100%;
        touch-action: none;
      }

      #instructions {
        background: rgba(0, 0, 0, 0.6);
        white-space: pre-wrap;
        padding: 10px;
        border-radius: 10px;
        font-size: x-small;
      }
      body.nohf .nohf {
        display: none;
      }
      body.nohf #progress, body.nohf .cube-face {
        background: #ff9d0d;
      }
    </style>
  </head>
  <body>
    <script>
      if (location.host.includes('hf.space')) document.body.classList.add('nohf');
    </script>
    <div id="info">
      <h3 class="nohf">WebGL 3D Gaussian Splat Viewer</h3>
      <p>
        <small class="nohf">
          By <a href="https://twitter.com/antimatter15">Kevin Kwok</a>.
          Code on
          <a href="https://github.com/antimatter15/splat">Github</a>.
        </small>
      </p>

      <details>
        <summary>Use mouse or arrow keys to navigate.</summary>

        <div id="instructions">movement (arrow keys)
- left/right arrow keys to strafe side to side
- up/down arrow keys to move forward/back
- space to jump

camera angle (wasd)
- a/d to turn camera left/right
- w/s to tilt camera up/down
- q/e to roll camera counterclockwise/clockwise
- i/k and j/l to orbit

trackpad
- scroll up/down/left/right to orbit
- pinch to move forward/back
- ctrl key + scroll to move forward/back
- shift + scroll to move up/down or strafe

mouse
- click and drag to orbit
- right click (or ctrl/cmd key) and drag up/down to move

touch (mobile)
- one finger to orbit
- two finger pinch to move forward/back
- two finger rotate to rotate camera clockwise/counterclockwise
- two finger pan to move side-to-side and up-down

gamepad
- if you have a game controller connected it should work

other
- press 0-9 to switch to one of the pre-loaded camera views
- press '-' or '+' key to cycle loaded cameras
- press p to resume default animation
- drag and drop .ply file to convert to .splat
- drag and drop cameras.json to load cameras
        </div>

      </details>

    </div>

    <div id="progress"></div>

    <div id="message"></div>
    <div class="scene" id="spinner">
      <div class="cube-wrapper">
        <div class="cube">
          <div class="cube-faces">
            <div class="cube-face bottom"></div>
            <div class="cube-face top"></div>
            <div class="cube-face left"></div>
            <div class="cube-face right"></div>
            <div class="cube-face back"></div>
            <div class="cube-face front"></div>
          </div>
        </div>
      </div>
    </div>
    <canvas id="canvas"></canvas>

    <div id="quality">
      <span id="fps"></span>
    </div>
    <div id="caminfo">
      <span id="camid"></span>
    </div>
    <script src="main.js"></script>
  </body>
</html>
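The deleted page is Kevin Kwok's antimatter15/splat viewer. It is not self-contained: it loads a sibling main.js, and browsers typically restrict fetch() on file:// pages, so a local copy is best served over HTTP rather than opened directly. The following is a minimal sketch (not part of the commit) using only the Python standard library; the directory name is hypothetical and assumes it holds splat_viewer.html plus main.js.

import functools
import http.server

# Hypothetical folder holding splat_viewer.html, main.js, and any .splat scenes.
VIEWER_DIR = "viewer"

Handler = functools.partial(http.server.SimpleHTTPRequestHandler, directory=VIEWER_DIR)
with http.server.ThreadingHTTPServer(("127.0.0.1", 8000), Handler) as httpd:
    print("Serving the splat viewer at http://127.0.0.1:8000/splat_viewer.html")
    httpd.serve_forever()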