# arcisvlm/data/camera_augment.py
# Last commit: 7a564e3 by Hardik Sanghvi
#   feat: integrate Gemma 4 E2B backbone for production-quality VLM inference
# File size at that commit: 8.12 kB
"""
Camera/Scene Augmentation — synthetic diversity for prototype LoRA training.
Generates diverse camera perspectives from a single image by simulating
different camera types, scenes, and conditions. Used in Stage 5 (prototype
LoRA training) to create varied conditioning inputs so each prototype
specializes on a different camera/scene combination.
Augmentations:
- Crop position: simulate PTZ pan/tilt (random crop of original)
- Brightness/contrast: simulate day/night/indoor/outdoor lighting
- Color temperature: simulate warm (indoor) vs cool (outdoor) cameras
- Noise: simulate low-light camera sensor noise
- Blur: simulate focus/motion blur from cheap cameras
- Resolution: simulate different camera resolutions
Rationale: training on augmented data lets the hypernetwork learn to
differentiate camera/scene contexts and generate specialized adapters.
"""
from __future__ import annotations
import random
from typing import Optional
import torch
import torch.nn.functional as F
class CameraAugmentor:
    """
    Applies random camera-specific augmentations to simulate diverse cameras.

    One set of random parameters is drawn per call and applied to every image
    in the batch. Parameter sampling uses Python's ``random`` module and the
    pixel noise uses ``torch.randn_like``, so both generators must be seeded
    for reproducible output.

    Args:
        img_size: Target output image size (square)
        crop_range: Min/max fraction of image to crop (simulates zoom level)
        brightness_range: Min/max brightness multiplier
        contrast_range: Min/max contrast multiplier
        noise_range: Min/max Gaussian noise std
        blur_prob: Probability of applying Gaussian blur
        color_temp_range: Min/max color temperature shift
    """

    def __init__(
        self,
        img_size: int = 448,
        crop_range: tuple[float, float] = (0.5, 1.0),
        brightness_range: tuple[float, float] = (0.5, 1.5),
        contrast_range: tuple[float, float] = (0.7, 1.3),
        noise_range: tuple[float, float] = (0.0, 0.05),
        blur_prob: float = 0.3,
        color_temp_range: tuple[float, float] = (-0.1, 0.1),
    ):
        self.img_size = img_size
        self.crop_range = crop_range
        self.brightness_range = brightness_range
        self.contrast_range = contrast_range
        self.noise_range = noise_range
        self.blur_prob = blur_prob
        self.color_temp_range = color_temp_range

    def __call__(self, image: torch.Tensor) -> tuple[torch.Tensor, dict]:
        """
        Apply random camera augmentation.

        Args:
            image: [3, H, W] or [B, 3, H, W] image tensor (0-1 range)

        Returns:
            augmented_image: [3, img_size, img_size] for a 3-D input, otherwise
                [B, 3, img_size, img_size]; values are clamped to [0, 1]
            aug_params: dict with the sampled parameters under the keys
                "crop", "brightness", "contrast", "color_temp", "noise_std"
                and "blur" ("blur" is None when no blur was applied)
        """
        squeeze = image.dim() == 3
        if squeeze:
            image = image.unsqueeze(0)
        _, _, H, W = image.shape
        aug_params = {}

        # 1. Random crop (simulates PTZ zoom/position)
        crop_frac = random.uniform(*self.crop_range)
        # Clamp the crop size to [1, full size]: int() truncation can yield 0
        # on small images (an empty crop cannot be resized), and a fraction
        # above 1 would give randint a negative upper bound.
        crop_h = max(1, min(H, int(H * crop_frac)))
        crop_w = max(1, min(W, int(W * crop_frac)))
        top = random.randint(0, H - crop_h)
        left = random.randint(0, W - crop_w)
        image = image[:, :, top:top + crop_h, left:left + crop_w]
        image = F.interpolate(
            image,
            size=(self.img_size, self.img_size),
            mode="bilinear",
            align_corners=False,
        )
        aug_params["crop"] = {"frac": crop_frac, "top": top, "left": left}

        # 2. Brightness
        brightness = random.uniform(*self.brightness_range)
        image = image * brightness
        aug_params["brightness"] = brightness

        # 3. Contrast (scale deviations around each image/channel's spatial mean)
        contrast = random.uniform(*self.contrast_range)
        mean = image.mean(dim=[2, 3], keepdim=True)
        image = (image - mean) * contrast + mean
        aug_params["contrast"] = contrast

        # 4. Color temperature shift
        temp = random.uniform(*self.color_temp_range)
        if abs(temp) > 0.01:
            # Warm: boost red, reduce blue. Cool: opposite.
            # In-place is safe here: `image` was rebuilt by the steps above,
            # so the caller's tensor is never modified.
            image[:, 0] += temp  # Red channel
            image[:, 2] -= temp  # Blue channel
        aug_params["color_temp"] = temp

        # 5. Gaussian noise (sensor noise)
        noise_std = random.uniform(*self.noise_range)
        if noise_std > 0.001:
            image = image + torch.randn_like(image) * noise_std
        aug_params["noise_std"] = noise_std

        # 6. Gaussian blur (focus/motion blur)
        if random.random() < self.blur_prob:
            kernel_size = random.choice([3, 5])
            sigma = random.uniform(0.5, 2.0)
            image = _gaussian_blur(image, kernel_size, sigma)
            aug_params["blur"] = {"kernel": kernel_size, "sigma": sigma}
        else:
            aug_params["blur"] = None

        # Clamp to valid range
        image = image.clamp(0, 1)
        if squeeze:
            image = image.squeeze(0)
        return image, aug_params
class ScenePresetAugmentor:
    """
    Applies pre-defined scene augmentation presets for prototype diversity.

    Each preset simulates a specific camera installation scenario and fixes
    the centre-crop fraction, brightness, contrast, color temperature shift
    and Gaussian noise std. Only the noise itself is random.

    Args:
        img_size: Target output image size (square)
    """

    PRESETS = {
        "lobby_daytime": {"brightness": 1.2, "contrast": 1.0, "color_temp": 0.05, "noise": 0.01, "crop_frac": 0.8},
        "lobby_nighttime": {"brightness": 0.5, "contrast": 0.8, "color_temp": -0.05, "noise": 0.04, "crop_frac": 0.8},
        "parking_day": {"brightness": 1.3, "contrast": 1.2, "color_temp": 0.0, "noise": 0.01, "crop_frac": 0.6},
        "parking_night": {"brightness": 0.3, "contrast": 0.7, "color_temp": -0.08, "noise": 0.05, "crop_frac": 0.6},
        "gate_clear": {"brightness": 1.1, "contrast": 1.1, "color_temp": 0.02, "noise": 0.01, "crop_frac": 0.9},
        "gate_rain": {"brightness": 0.7, "contrast": 0.9, "color_temp": -0.03, "noise": 0.03, "crop_frac": 0.9},
        "corridor_empty": {"brightness": 0.9, "contrast": 1.0, "color_temp": 0.08, "noise": 0.02, "crop_frac": 1.0},
        "corridor_crowded": {"brightness": 0.9, "contrast": 1.0, "color_temp": 0.08, "noise": 0.02, "crop_frac": 0.7},
        "rooftop_twilight": {"brightness": 0.4, "contrast": 1.1, "color_temp": -0.1, "noise": 0.03, "crop_frac": 0.5},
        "rooftop_noon": {"brightness": 1.4, "contrast": 1.3, "color_temp": 0.0, "noise": 0.01, "crop_frac": 0.5},
    }

    def __init__(self, img_size: int = 448):
        self.img_size = img_size

    def augment(self, image: torch.Tensor, preset_name: str) -> torch.Tensor:
        """
        Apply a named scene preset augmentation.

        Args:
            image: [3, H, W] or [B, 3, H, W] image tensor
            preset_name: Key into PRESETS; an unknown name silently falls
                back to "lobby_daytime"

        Returns:
            [3, img_size, img_size] for a 3-D input, otherwise
            [B, 3, img_size, img_size]; values are clamped to [0, 1]
        """
        preset = self.PRESETS.get(preset_name, self.PRESETS["lobby_daytime"])

        # Remember whether the batch dim was added here so that a genuine
        # batch of one keeps its leading dimension on the way out.
        squeeze = image.dim() == 3
        if squeeze:
            image = image.unsqueeze(0)
        _, _, H, W = image.shape

        # Centre crop; keep at least one pixel so tiny inputs stay resizable.
        frac = preset["crop_frac"]
        ch = max(1, min(H, int(H * frac)))
        cw = max(1, min(W, int(W * frac)))
        top = (H - ch) // 2
        left = (W - cw) // 2
        image = image[:, :, top:top + ch, left:left + cw]
        image = F.interpolate(
            image,
            size=(self.img_size, self.img_size),
            mode="bilinear",
            align_corners=False,
        )

        # Brightness + contrast (contrast scales around the spatial mean)
        mean = image.mean(dim=[2, 3], keepdim=True)
        image = ((image - mean) * preset["contrast"] + mean) * preset["brightness"]

        # Color temperature: positive shifts red up and blue down.
        # In-place is safe: `image` is a fresh tensor at this point.
        image[:, 0] += preset["color_temp"]
        image[:, 2] -= preset["color_temp"]

        # Noise
        if preset["noise"] > 0:
            image = image + torch.randn_like(image) * preset["noise"]

        image = image.clamp(0, 1)
        if squeeze:
            image = image.squeeze(0)
        return image

    @classmethod
    def list_presets(cls) -> list[str]:
        """Return the available preset names in definition order."""
        return list(cls.PRESETS.keys())
def _gaussian_blur(image: torch.Tensor, kernel_size: int, sigma: float) -> torch.Tensor:
    """
    Apply Gaussian blur to image tensor.

    The same normalized 2D Gaussian kernel is applied to every channel
    independently (depthwise convolution). Borders are zero-padded, so pixels
    near the edge are darkened slightly.

    Args:
        image: [B, C, H, W] floating-point image tensor
        kernel_size: Side length of the square kernel; use an odd value,
            since an even size with padding ``kernel_size // 2`` makes the
            output one pixel larger than the input in each spatial dimension
        sigma: Standard deviation of the Gaussian, in pixels (must be non-zero)

    Returns:
        Blurred tensor with the same dtype and device as ``image``
    """
    channels = image.shape[1]

    # Build the kernel in float32 (float64 for double images) for accuracy,
    # then cast to the image dtype below.
    kernel_dtype = torch.float64 if image.dtype == torch.float64 else torch.float32

    # Create 1D Gaussian kernel centred on the middle tap
    coords = torch.arange(kernel_size, dtype=kernel_dtype) - kernel_size // 2
    kernel_1d = torch.exp(-0.5 * (coords / sigma) ** 2)
    kernel_1d = kernel_1d / kernel_1d.sum()

    # Create 2D kernel as the outer product of the 1D kernel with itself
    kernel_2d = kernel_1d.unsqueeze(0) * kernel_1d.unsqueeze(1)
    kernel_2d = kernel_2d.unsqueeze(0).unsqueeze(0)  # [1, 1, K, K]

    # conv2d requires the weight to match the input's dtype as well as its
    # device; without the dtype cast, non-float32 images raise a RuntimeError.
    kernel_2d = kernel_2d.to(device=image.device, dtype=image.dtype)
    kernel_2d = kernel_2d.expand(channels, -1, -1, -1)  # [C, 1, K, K]

    pad = kernel_size // 2
    return F.conv2d(image, kernel_2d, padding=pad, groups=channels)