arcisvlm / data /camera_augment.py

Hardik Sanghvi

feat: integrate Gemma 4 E2B backbone for production-quality VLM inference

7a564e3 3 months ago

8.12 kB

	"""
	Camera/Scene Augmentation — synthetic diversity for prototype LoRA training.

	Generates diverse camera perspectives from a single image by simulating
	different camera types, scenes, and conditions. Used in Stage 5 (prototype
	LoRA training) to create varied conditioning inputs so each prototype
	specializes on a different camera/scene combination.

	Augmentations:
	- Crop position: simulate PTZ pan/tilt (random crop of original)
	- Brightness/contrast: simulate day/night/indoor/outdoor lighting
	- Color temperature: simulate warm (indoor) vs cool (outdoor) cameras
	- Noise: simulate low-light camera sensor noise
	- Blur: simulate focus/motion blur from cheap cameras
	- Resolution: simulate different camera resolutions

	Rationale: training on augmented data lets the hypernetwork learn to
	differentiate camera/scene contexts and generate specialized adapters.
	"""

	from __future__ import annotations

	import random
	from typing import Optional

	import torch
	import torch.nn.functional as F


	class CameraAugmentor:
	    """
	    Apply random camera-style augmentations to simulate diverse cameras.

	    One set of augmentation parameters is sampled per call and applied to
	    every image in the batch.

	    Args:
	        img_size: Target output image size (square).
	        crop_range: Min/max fraction of each side kept by the random crop
	            (simulates zoom level).
	        brightness_range: Min/max brightness multiplier.
	        contrast_range: Min/max contrast multiplier.
	        noise_range: Min/max Gaussian noise std.
	        blur_prob: Probability of applying Gaussian blur.
	        color_temp_range: Min/max color temperature shift.
	    """

	    def __init__(
	        self,
	        img_size: int = 448,
	        crop_range: tuple[float, float] = (0.5, 1.0),
	        brightness_range: tuple[float, float] = (0.5, 1.5),
	        contrast_range: tuple[float, float] = (0.7, 1.3),
	        noise_range: tuple[float, float] = (0.0, 0.05),
	        blur_prob: float = 0.3,
	        color_temp_range: tuple[float, float] = (-0.1, 0.1),
	    ):
	        self.img_size = img_size
	        self.crop_range = crop_range
	        self.brightness_range = brightness_range
	        self.contrast_range = contrast_range
	        self.noise_range = noise_range
	        self.blur_prob = blur_prob
	        self.color_temp_range = color_temp_range

	    def __call__(self, image: torch.Tensor) -> tuple[torch.Tensor, dict]:
	        """
	        Apply one randomly sampled camera augmentation.

	        Args:
	            image: [3, H, W] or [B, 3, H, W] image tensor (0-1 range).

	        Returns:
	            augmented_image: [3, img_size, img_size] for unbatched input,
	                [B, 3, img_size, img_size] for batched input, clamped to [0, 1].
	            aug_params: dict with the sampled values under the keys "crop",
	                "brightness", "contrast", "color_temp", "noise_std" and "blur"
	                ("blur" is None when no blur was applied).

	        Raises:
	            ValueError: if ``image`` is neither 3- nor 4-dimensional.
	        """
	        if image.dim() not in (3, 4):
	            raise ValueError(
	                "expected image of shape [3, H, W] or [B, 3, H, W], "
	                f"got {tuple(image.shape)}"
	            )

	        squeeze = image.dim() == 3
	        if squeeze:
	            image = image.unsqueeze(0)

	        H, W = image.shape[-2:]
	        aug_params = {}

	        # 1. Random crop (simulates PTZ zoom/position).
	        # Clamp the crop to [1, side] pixels: a tiny image (or a crop_range
	        # outside (0, 1]) must not yield an empty or oversized window.
	        crop_frac = random.uniform(*self.crop_range)
	        crop_h = min(H, max(1, int(H * crop_frac)))
	        crop_w = min(W, max(1, int(W * crop_frac)))
	        top = random.randint(0, H - crop_h)
	        left = random.randint(0, W - crop_w)
	        image = image[:, :, top:top + crop_h, left:left + crop_w]
	        image = F.interpolate(
	            image,
	            size=(self.img_size, self.img_size),
	            mode="bilinear",
	            align_corners=False,
	        )
	        aug_params["crop"] = {"frac": crop_frac, "top": top, "left": left}

	        # 2. Brightness
	        brightness = random.uniform(*self.brightness_range)
	        image = image * brightness
	        aug_params["brightness"] = brightness

	        # 3. Contrast (scale deviations around the per-image, per-channel mean)
	        contrast = random.uniform(*self.contrast_range)
	        mean = image.mean(dim=[2, 3], keepdim=True)
	        image = (image - mean) * contrast + mean
	        aug_params["contrast"] = contrast

	        # 4. Color temperature shift. Negligible shifts are skipped, but the
	        # sampled value is still recorded.
	        temp = random.uniform(*self.color_temp_range)
	        if abs(temp) > 0.01:
	            # Warm: boost red, reduce blue. Cool: opposite.
	            # In-place is safe: `image` is a fresh tensor at this point,
	            # not the caller's input.
	            image[:, 0] += temp  # Red channel
	            image[:, 2] -= temp  # Blue channel
	        aug_params["color_temp"] = temp

	        # 5. Gaussian noise (sensor noise)
	        noise_std = random.uniform(*self.noise_range)
	        if noise_std > 0.001:
	            image = image + torch.randn_like(image) * noise_std
	        aug_params["noise_std"] = noise_std

	        # 6. Gaussian blur (focus/motion blur)
	        if random.random() < self.blur_prob:
	            kernel_size = random.choice([3, 5])
	            sigma = random.uniform(0.5, 2.0)
	            image = _gaussian_blur(image, kernel_size, sigma)
	            aug_params["blur"] = {"kernel": kernel_size, "sigma": sigma}
	        else:
	            aug_params["blur"] = None

	        # Clamp to valid range
	        image = image.clamp(0, 1)

	        if squeeze:
	            image = image.squeeze(0)

	        return image, aug_params


	class ScenePresetAugmentor:
	    """
	    Apply pre-defined scene augmentation presets for prototype diversity.

	    Each preset simulates a specific camera installation scenario and fixes
	    the brightness, contrast, color temperature shift, noise std and
	    center-crop fraction.

	    Args:
	        img_size: Target output image size (square).
	    """

	    PRESETS = {
	        "lobby_daytime": {"brightness": 1.2, "contrast": 1.0, "color_temp": 0.05, "noise": 0.01, "crop_frac": 0.8},
	        "lobby_nighttime": {"brightness": 0.5, "contrast": 0.8, "color_temp": -0.05, "noise": 0.04, "crop_frac": 0.8},
	        "parking_day": {"brightness": 1.3, "contrast": 1.2, "color_temp": 0.0, "noise": 0.01, "crop_frac": 0.6},
	        "parking_night": {"brightness": 0.3, "contrast": 0.7, "color_temp": -0.08, "noise": 0.05, "crop_frac": 0.6},
	        "gate_clear": {"brightness": 1.1, "contrast": 1.1, "color_temp": 0.02, "noise": 0.01, "crop_frac": 0.9},
	        "gate_rain": {"brightness": 0.7, "contrast": 0.9, "color_temp": -0.03, "noise": 0.03, "crop_frac": 0.9},
	        "corridor_empty": {"brightness": 0.9, "contrast": 1.0, "color_temp": 0.08, "noise": 0.02, "crop_frac": 1.0},
	        "corridor_crowded": {"brightness": 0.9, "contrast": 1.0, "color_temp": 0.08, "noise": 0.02, "crop_frac": 0.7},
	        "rooftop_twilight": {"brightness": 0.4, "contrast": 1.1, "color_temp": -0.1, "noise": 0.03, "crop_frac": 0.5},
	        "rooftop_noon": {"brightness": 1.4, "contrast": 1.3, "color_temp": 0.0, "noise": 0.01, "crop_frac": 0.5},
	    }

	    def __init__(self, img_size: int = 448):
	        self.img_size = img_size

	    def augment(self, image: torch.Tensor, preset_name: str) -> torch.Tensor:
	        """
	        Apply a named scene preset augmentation.

	        Args:
	            image: [3, H, W] or [B, 3, H, W] image tensor.
	            preset_name: Key into ``PRESETS``. Unknown names fall back to
	                "lobby_daytime".

	        Returns:
	            Augmented tensor clamped to [0, 1]: [3, img_size, img_size] for
	            unbatched input, [B, 3, img_size, img_size] for batched input.
	        """
	        preset = self.PRESETS.get(preset_name, self.PRESETS["lobby_daytime"])

	        # Remember whether a batch dim was added, so a genuine batch of one
	        # is not collapsed on the way out.
	        squeeze = image.dim() == 3
	        if squeeze:
	            image = image.unsqueeze(0)

	        H, W = image.shape[-2:]

	        # Center crop, clamped to [1, side] pixels so tiny images still
	        # produce a non-empty window.
	        frac = preset["crop_frac"]
	        ch = min(H, max(1, int(H * frac)))
	        cw = min(W, max(1, int(W * frac)))
	        top = (H - ch) // 2
	        left = (W - cw) // 2
	        image = image[:, :, top:top + ch, left:left + cw]
	        image = F.interpolate(
	            image,
	            size=(self.img_size, self.img_size),
	            mode="bilinear",
	            align_corners=False,
	        )

	        # Brightness + contrast
	        mean = image.mean(dim=[2, 3], keepdim=True)
	        image = ((image - mean) * preset["contrast"] + mean) * preset["brightness"]

	        # Color temperature: positive shifts red up and blue down.
	        # In-place is safe: `image` is a fresh tensor at this point.
	        image[:, 0] += preset["color_temp"]
	        image[:, 2] -= preset["color_temp"]

	        # Noise
	        if preset["noise"] > 0:
	            image = image + torch.randn_like(image) * preset["noise"]

	        image = image.clamp(0, 1)
	        if squeeze:
	            image = image.squeeze(0)
	        return image

	    @classmethod
	    def list_presets(cls) -> list[str]:
	        """Return the names of all available presets."""
	        return list(cls.PRESETS.keys())


	def _gaussian_blur(image: torch.Tensor, kernel_size: int, sigma: float) -> torch.Tensor:
	    """
	    Apply a Gaussian blur to an image tensor, one channel at a time.

	    Args:
	        image: [B, C, H, W] floating-point tensor.
	        kernel_size: Side length of the square kernel. Odd sizes keep the
	            spatial size unchanged (padding is ``kernel_size // 2``).
	        sigma: Standard deviation of the Gaussian, in pixels. Must be > 0.

	    Returns:
	        Blurred tensor with the same dtype and device as ``image``.

	    Raises:
	        ValueError: if ``sigma`` is not positive (the kernel would be NaN).
	    """
	    if sigma <= 0:
	        raise ValueError(f"sigma must be positive, got {sigma}")

	    channels = image.shape[1]

	    # Build the normalized 1D Gaussian in float32 on the image's device.
	    coords = torch.arange(kernel_size, dtype=torch.float32, device=image.device)
	    coords = coords - kernel_size // 2
	    kernel_1d = torch.exp(-0.5 * (coords / sigma) ** 2)
	    kernel_1d = kernel_1d / kernel_1d.sum()

	    # Outer product gives the separable 2D kernel. Cast it to the image
	    # dtype because conv2d requires input and weight dtypes to match.
	    kernel_2d = kernel_1d.unsqueeze(0) * kernel_1d.unsqueeze(1)
	    kernel_2d = kernel_2d.to(dtype=image.dtype)
	    kernel_2d = kernel_2d.unsqueeze(0).unsqueeze(0)  # [1, 1, K, K]
	    kernel_2d = kernel_2d.expand(channels, -1, -1, -1)  # [C, 1, K, K]

	    # groups=C applies the same kernel to each channel independently.
	    pad = kernel_size // 2
	    return F.conv2d(image, kernel_2d, padding=pad, groups=channels)