raheebhassan committed on
Commit 4fec4e4 · 1 Parent(s): fa7e75c

Initial Commit

This view is limited to 50 files because it contains too many changes. See raw diff.
.gitattributes CHANGED
@@ -1,35 +1,8 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ checkpoints/tic_lambda_0.0932.pth.tar filter=lfs diff=lfs merge=lfs -text
+ checkpoints/tic_lambda_0.0035.pth.tar filter=lfs diff=lfs merge=lfs -text
+ checkpoints/tic_lambda_0.013.pth.tar filter=lfs diff=lfs merge=lfs -text
+ checkpoints/tic_lambda_0.025.pth.tar filter=lfs diff=lfs merge=lfs -text
+ checkpoints/tic_lambda_0.0483.pth.tar filter=lfs diff=lfs merge=lfs -text
+ images/*/*.jpg filter=lfs diff=lfs merge=lfs -text
+ images/*/*.jpeg filter=lfs diff=lfs merge=lfs -text
+ images/*/*.png filter=lfs diff=lfs merge=lfs -text
.github/copilot-instructions.md ADDED
@@ -0,0 +1,417 @@
+ # ROI-VAE Image Compression - Copilot Instructions
+
+ ## Project Overview
+ ROI-based VAE image compression using TIC (Transformer-based Image Compression). The system preserves quality in Regions of Interest (ROI) while aggressively compressing backgrounds using configurable quality factors.
+
+ ## Architecture
+
+ ### Core Pipeline
+ 1. **Segmentation** (`segmentation/` module) → 2. **Compression** (`vae/` module) → 3. **Output**
+ - Segmentation creates binary masks (1=ROI, 0=background)
+ - Compression applies variable quality based on mask using `sigma` parameter
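+
+ A minimal end-to-end sketch wiring these stages together with the module APIs documented below (illustrative, not a tested script):
+
+ ```python
+ from PIL import Image
+ from segmentation import create_segmenter
+ from vae import load_checkpoint, compress_image
+
+ image = Image.open('input.jpg')
+
+ # 1. Segmentation: binary mask (1=ROI, 0=background)
+ segmenter = create_segmenter('yolo', device='cuda')
+ mask = segmenter(image, target_classes=['car', 'person'])
+
+ # 2. Compression: ROI at full quality, background scaled by sigma
+ model = load_checkpoint('checkpoints/tic_lambda_0.0483.pth.tar', device='cuda')
+ result = compress_image(image, mask, model, sigma=0.3, device='cuda')
+
+ # 3. Output
+ result['compressed'].save('compressed.jpg')
+ print(f"{result['bpp']:.4f} bpp")
+ ```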
+
+ ### Key Components
+
+ **Segmentation Module** (`segmentation/`):
+ - Abstract base class `BaseSegmenter` defines common interface
+ - Implementations:
+   - `SegFormerSegmenter` - Cityscapes semantic segmentation (19 classes: road, car, building, person, etc.)
+   - `YOLOSegmenter` - COCO instance segmentation (80 classes)
+   - `Mask2FormerSegmenter` - Swin Transformer-based panoptic/semantic segmentation (COCO: 133 classes, ADE20K: 150 classes)
+   - `MaskRCNNSegmenter` - ResNet50-FPN instance segmentation (COCO: 80 classes)
+   - `SAM3Segmenter` - Prompt-based segmentation (natural language prompt → mask via text-conditioned detector + SAM)
+   - `FakeSegmenter` - Detection + tracking → bbox masks (fast, non-pixel-perfect)
+ - **Fake Segmentation** (NEW): Detection-based segmentation for speed
+   - Creates rectangular masks from detection bounding boxes
+   - Uses object tracking for temporal consistency (ByteTrack, BoTSORT, SimpleTracker)
+   - Available methods: `fake_yolo` (default, ByteTrack), `fake_yolo_botsort`, `fake_detr`, `fake_fasterrcnn`, `fake_retinanet`, `fake_fcos`, `fake_deformable_detr`, `fake_grounding_dino`
+   - Much faster than pixel-perfect segmentation (~60-100 fps vs 10-30 fps)
+   - Memory estimates in `gpu_memory.py`: 120-200 MB per frame (vs 180-500 MB for full segmentation)
+ - Factory pattern: `create_segmenter('yolo', device='cuda')` or `create_segmenter('fake_yolo', device='cuda')`
+ - Extensible for future models
+ - Utils: `visualize_mask()`, `save_mask()`, `calculate_roi_stats()`
+
+ **Compression Module** (`vae/`):
+ - `tic_model.py`: Base `TIC` class - Transformer-based VAE with encoder, decoder, hyperprior
+ - `RSTB.py`: Residual Swin Transformer Blocks and attention modules
+ - `transformer_layers.py`: Generic transformer components (MLP, attention, drop path)
+ - `roi_tic.py`: `ModifiedTIC` class extending base TIC with ROI-aware forward pass
+ - `utils.py`: `compress_image()`, `compute_padding()` for image processing
+ - `visualization.py`: `highlight_roi()`, `create_comparison_grid()` for results
+ - Handles checkpoint loading with compressai version compatibility fixes
+
+ **Detection Module** (`detection/`):
+ - Abstract base class `BaseDetector` defines common interface
+ - Factory pattern: `create_detector('yolo', device='cuda')`
+ - Implementations:
+   - `YOLODetector` - Ultralytics YOLO (closed-vocabulary COCO weights)
+   - Torchvision: Faster R-CNN, RetinaNet, SSD, FCOS
+   - Transformers: DETR, Deformable DETR
+   - `EfficientDetDetector` - optional via `effdet`
+   - `YOLOWorldDetector` - open-vocabulary detection (Ultralytics YOLO-World; requires prompts)
+   - `GroundingDINODetector` - open-vocabulary detection (Transformers; requires prompts)
+ - CLI: `roi_detection_eval.py` evaluates detection retention before vs after ROI compression
+
+ **TIC Model** (`vae/tic_model.py`):
+ - Transformer-based VAE with encoder (`g_a`), decoder (`g_s`), and hyperprior (`h_a`, `h_s`)
+ - Uses RSTB (Residual Swin Transformer Blocks) for feature extraction
+ - Channels: N=192, M=192 (expansion layer)
+ - Critical: Images must be padded to multiples of 256 (use `compute_padding()`)
+
+ **ModifiedTIC** (`vae/roi_tic.py`):
+ - Extends base TIC with ROI-aware forward pass
+ - Takes mask + sigma parameter to create quality factors
+ - Applies `similarity_loss` tensor: 1.0 for ROI pixels, sigma for background
+ - Integrates mask through `simi_net` and `sub_impor_net` branches
+
+ ## Critical Conventions
+
+ ### Model Cache Locations
+ - By default, auto-downloaded model artifacts are kept inside `checkpoints/`:
+   - Hugging Face cache: `checkpoints/hf/`
+   - Torch/torchvision cache: `checkpoints/torch/`
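+
+ To reproduce this redirection outside the app, the standard environment variables are `HF_HOME` and `TORCH_HOME` (a sketch; whether the app already sets these itself is an assumption to verify in `app.py`):
+
+ ```python
+ import os
+
+ # Must run before importing transformers / torchvision so downloads land in checkpoints/
+ os.environ.setdefault("HF_HOME", "checkpoints/hf")
+ os.environ.setdefault("TORCH_HOME", "checkpoints/torch")
+ ```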
+
+ ### Checkpoint Loading Pattern
+ ```python
+ from vae import load_checkpoint
+
+ # Automatically handles compressai version mismatch
+ model = load_checkpoint('checkpoints/tic_lambda_0.0483.pth.tar', N=192, M=192, device='cuda')
+ # Note: model.update(force=True) is called automatically
+ ```
+
+ Manual loading:
+ ```python
+ # Fix compressai version mismatch - required for all checkpoint loading.
+ # Old checkpoints name entropy-bottleneck parameters `_matrix`/`_bias`/`_factor`;
+ # newer compressai expects `matrices.`/`biases.`/`factors.`.
+ state_dict = checkpoint["state_dict"]
+ new_state_dict = {}
+ for k, v in state_dict.items():
+     new_key = k
+     for old, new in [("_matrix", "matrices."), ("_bias", "biases."), ("_factor", "factors.")]:
+         new_key = new_key.replace(f"entropy_bottleneck.{old}", f"entropy_bottleneck.{new}")
+     new_state_dict[new_key] = v
+ ```
+ Always call `model.update(force=True)` after loading checkpoints.
+
+ ### Image Preprocessing
+ 1. Convert PIL to torch tensor: `x = torch.from_numpy(np.array(img)).float() / 255.0`
+ 2. Permute to [B, C, H, W]: `x = x.permute(2, 0, 1).unsqueeze(0)`
+ 3. Pad to 256 multiples using `compute_padding(h, w, min_div=256)`
+ 4. Apply mask at same resolution as input image
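+
+ The same sequence as a runnable sketch (assuming `compute_padding` follows the compressai convention of returning `(pad, unpad)` tuples usable with `F.pad`):
+
+ ```python
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from PIL import Image
+ from vae.utils import compute_padding
+
+ img = Image.open('input.jpg').convert('RGB')
+ x = torch.from_numpy(np.array(img)).float() / 255.0  # [H, W, C], values in [0, 1]
+ x = x.permute(2, 0, 1).unsqueeze(0)                  # [1, C, H, W]
+
+ h, w = x.shape[2], x.shape[3]
+ pad, unpad = compute_padding(h, w, min_div=256)      # assumed return convention
+ x_padded = F.pad(x, pad, mode='constant', value=0)
+
+ # The mask is supplied at the original (unpadded) image resolution
+ mask = torch.zeros(1, 1, h, w)
+ ```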
+
+ ### Sigma Parameter
+ - Range: 0.01 - 1.0 (lower = more background compression)
+ - Default: 0.3
+ - ROI pixels always get quality factor 1.0
+ - Applied via `torch.where(mask > 0.5, 1.0, sigma)`
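+
+ As a self-contained illustration of that rule (the mask here is a hypothetical square ROI):
+
+ ```python
+ import torch
+
+ sigma = 0.3
+ mask = torch.zeros(1, 1, 256, 256)
+ mask[:, :, 64:192, 64:192] = 1.0               # ROI region
+ # Scalar form as quoted above; on older PyTorch wrap the scalars in torch.tensor()
+ quality = torch.where(mask > 0.5, 1.0, sigma)  # 1.0 inside ROI, sigma elsewhere
+ print(quality.unique())                        # tensor([0.3000, 1.0000])
+ ```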
+
+ ### Available Checkpoints
+ Located in `checkpoints/` directory with different lambda (rate-distortion) values:
+ - `tic_lambda_0.0035.pth.tar` - Lowest bitrate (highest compression)
+ - `tic_lambda_0.013.pth.tar` - Low bitrate (N=128, M=192)
+ - `tic_lambda_0.025.pth.tar` - Medium-low bitrate
+ - `tic_lambda_0.0483.pth.tar` - **Default** - Medium bitrate
+ - `tic_lambda_0.0932.pth.tar` - High bitrate (better quality)
+ - `yolo26x-seg.pt` - YOLO segmentation model
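+
+ These lambdas line up with the 1-5 quality levels documented in `API.md`; a hypothetical helper for resolving a level to a checkpoint path:
+
+ ```python
+ LAMBDA_BY_QUALITY = {1: 0.0035, 2: 0.013, 3: 0.025, 4: 0.0483, 5: 0.0932}
+
+ def checkpoint_path(quality: int) -> str:
+     """Map an API quality level (1-5) to its checkpoint file."""
+     return f"checkpoints/tic_lambda_{LAMBDA_BY_QUALITY[quality]}.pth.tar"
+ ```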
+
+ ## Development Workflows
+
+ ### Using Segmentation Module (New)
+ ```python
+ from segmentation import create_segmenter
+
+ # Available methods: segformer, yolo, mask2former, maskrcnn, sam3
+ # Fake methods: fake_yolo, fake_yolo_botsort, fake_detr, fake_fasterrcnn, etc.
+ segmenter = create_segmenter('mask2former', device='cuda', model_type='coco')
+
+ # Segment image
+ mask = segmenter(image, target_classes=['car', 'person'])
+
+ # Fast segmentation with detection + tracking (non-pixel-perfect)
+ fake_seg = create_segmenter('fake_yolo', device='cuda')
+ mask = fake_seg(image, target_classes=['person'])  # Uses ByteTrack tracking
+ # Much faster: ~60-100 fps vs 10-30 fps for pixel-perfect segmentation
+
+ # Add new segmentation method
+ from segmentation import register_segmenter, BaseSegmenter
+
+ class MySegmenter(BaseSegmenter):
+     def load_model(self): ...
+     def segment(self, image, target_classes, **kwargs): ...
+     def get_available_classes(self): ...
+
+ register_segmenter('my_method', MySegmenter)
+ ```
+
+ ### Using Compression Module (New)
+ ```python
+ from vae import load_checkpoint, compress_image
+ from PIL import Image
+ import numpy as np
+
+ # Load model
+ model = load_checkpoint('checkpoints/tic_lambda_0.0483.pth.tar', device='cuda')
+
+ # Compress image with mask
+ image = Image.open('input.jpg')
+ mask = np.zeros((image.height, image.width))  # Your mask here
+
+ result = compress_image(image, mask, model, sigma=0.3, device='cuda')
+ compressed = result['compressed']  # PIL Image
+ bpp = result['bpp']  # Bits per pixel
+
+ # Visualize results
+ from vae import create_comparison_grid
+ grid = create_comparison_grid(image, compressed, mask, bpp, sigma=0.3, lambda_val=0.0483)
+ grid.save('comparison.jpg')
+ ```
+
+ ### Using Detection Module (New)
+ ```python
+ from detection import create_detector
+
+ # Closed-vocabulary
+ det = create_detector('yolo', device='cuda', model_path='checkpoints/yolo26x.pt')
+ dets = det(image, conf_threshold=0.25)
+
+ # Open-vocabulary (must pass prompts/classes)
+ det_ov = create_detector('yolo_world', device='cuda')
+ dets_ov = det_ov(image, conf_threshold=0.25, classes='person,car')
+ ```
+
+ ### Detection Eval (CLI)
+ ```bash
+ # Compare before vs after (already-compressed)
+ python roi_detection_eval.py \
+     --before images/car/0016cf15fa4d4e16.jpg \
+     --after results/compressed.jpg \
+     --detectors yolo detr \
+     --viz-dir results/det_viz
+
+ # Open-vocabulary eval (YOLO-World requires prompts)
+ python roi_detection_eval.py \
+     --before images/person/kodim04.png \
+     --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
+     --sigma 0.3 \
+     --seg-method yolo --seg-classes person \
+     --detectors yolo_world \
+     --open-vocab-classes "person,car" \
+     --viz-dir results/det_viz
+ ```
+
+ ### Running Compression (CLI)
+ ```bash
+ # Basic compression with segmentation
+ python roi_compressor.py \
+     --input images/car/0016cf15fa4d4e16.jpg \
+     --output results/compressed.jpg \
+     --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
+     --sigma 0.3 \
+     --seg-classes car \
+     --seg-method yolo
+
+ # Fast compression with detection-based fake segmentation (~3x faster)
+ python roi_compressor.py \
+     --input images/car/0016cf15fa4d4e16.jpg \
+     --output results/compressed.jpg \
+     --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
+     --sigma 0.3 \
+     --seg-classes car \
+     --seg-method fake_yolo
+
+ # With comparison grid (original, compressed, ROI highlighted)
+ python roi_compressor.py ... --highlight
+ ```
+
+ ### Standalone Segmentation (CLI)
+ ```bash
+ # Using Mask2Former with COCO panoptic
+ python roi_segmenter.py \
+     --input images/car/0016cf15fa4d4e16.jpg \
+     --output results/mask.png \
+     --method mask2former \
+     --classes car building person \
+     --visualize
+
+ # Fast segmentation with detection + ByteTrack tracking
+ python roi_segmenter.py \
+     --input data/videos/Person_doing_handstand.mp4 \
+     --output results/masks.mp4 \
+     --method fake_yolo \
+     --classes person \
+     --resize-height 480 \
+     --smooth-patience 10 \
+     --visualize
+
+ # Other fake methods (detection + tracking)
+ # fake_yolo_botsort (YOLO + BoTSORT)
+ # fake_detr (DETR + SimpleTracker)
+ # fake_fasterrcnn, fake_retinanet, fake_fcos, etc.
+ ```
+
+ ### Adding New Segmentation Models
+ 1. Create new file in `segmentation/` (e.g., `sam.py`)
+ 2. Extend `BaseSegmenter` and implement abstract methods:
+    - `load_model()`: Load model weights
+    - `segment()`: Generate mask from image
+    - `get_available_classes()`: Return supported classes/capabilities
+ 3. Register in `segmentation/__init__.py` or use `register_segmenter()`
+ 4. Use via `create_segmenter('your_method', ...)`
+
+ ### Testing Examples
+ - `roi_segmenter.py`: CLI tool for standalone segmentation
+ - `roi_compressor.py`: CLI tool for ROI-based image compression
+
+ ## Dependencies
+ - PyTorch + torchvision for model
+ - compressai for entropy models (version sensitive - see checkpoint loading)
+ - transformers for SegFormer + DETR/Deformable DETR + Grounding DINO
+ - ultralytics for YOLO + YOLO-World
+ - effdet (optional) for EfficientDet detector
+ - timm for model layers
+
+ ## Common Pitfalls
+ 1. **Padding**: Forgetting to pad images to 256 multiples causes dimension mismatches
+ 2. **Checkpoint keys**: Old checkpoints use `_matrix/_bias/_factor` naming that must be converted
+ 3. **Mask resolution**: Mask must match input image size; it's automatically downsampled in forward pass
+ 4. **Mask downsampling**: In ModifiedTIC, mask is downsampled to 1/2 resolution before simi_net (which further downsamples 8x to match 16x16 latent)
+ 5. **Device mismatch**: Ensure mask, sigma tensor, and model are on same device
+ 6. **Model update**: Must call `model.update(force=True)` after loading for entropy models
+
+ ## Project Structure
+
+ - `.github/copilot-instructions.md`: This file - comprehensive development guide
+ - `examples.sh`: Example commands for running compression and segmentation
+ - `README.md`: Project overview and quick start guide
+ - `requirements.txt`: Python dependencies
+
+ **CLI Tools:**
+ - `roi_segmenter.py`: CLI tool for standalone segmentation
+ - `roi_compressor.py`: CLI tool for ROI-based image compression
+ - `app.py`: Gradio demo with Image and Video tabs
+
+ **Core Modules:**
+ - `segmentation/`: Modular segmentation with abstract base class
+   - `base.py`: `BaseSegmenter` abstract class
+   - `segformer.py`: Cityscapes semantic segmentation (19 classes)
+   - `yolo.py`: COCO instance segmentation (80 classes)
+   - `mask2former.py`: Swin-based panoptic/semantic (COCO: 133, ADE20K: 150 classes)
+   - `maskrcnn.py`: ResNet50-FPN instance segmentation (COCO: 80 classes)
+   - `sam3.py`: Prompt-based segmentation
+   - `factory.py`: Factory pattern for creating segmenters
+   - `utils.py`: Visualization and I/O utilities
+ - `vae/`: Modular compression with ROI support
+   - `tic_model.py`: Base `TIC` class (Transformer-based VAE)
+   - `RSTB.py`: Residual Swin Transformer Blocks
+   - `transformer_layers.py`: Generic transformer components
+   - `roi_tic.py`: `ModifiedTIC` class and checkpoint loading
+   - `utils.py`: `compress_image()`, `compute_padding()`
+   - `visualization.py`: `highlight_roi()`, `create_comparison_grid()`
+ - `video/`: Video compression with streaming support
+   - `video_processor.py`: `VideoProcessor` class for video compression
+   - `motion_analyzer.py`: `MotionAnalyzer` for scene complexity estimation
+   - `chunk_compressor.py`: `ChunkCompressor` and `BandwidthController`
+ - `detection/`: Object detection and tracking
+   - `tracker.py`: `SimpleTracker` IoU-based multi-object tracker
+   - `utils.py`: `draw_detections()`, `draw_tracks()`
+
+ **Legacy:**
+ - `vae_compress.py`: Legacy ROI compression script (updated to use modules)
+ - `*.bak`: Backup files from pre-modularization (tic_model, RSTB, etc.)
+
+ ## Video Processing
+
+ ### Video Module Usage
+ ```python
+ from video import VideoProcessor, CompressionSettings
+
+ # Create processor
+ processor = VideoProcessor(device='cuda')
+ processor.load_models(
+     quality_level=4,
+     segmentation_method='sam3',
+     detection_method='yolo',
+     enable_tracking=True,
+ )
+
+ # Static mode (fixed settings)
+ settings = CompressionSettings(
+     mode='static',
+     quality_level=4,
+     sigma=0.3,
+     output_fps=15.0,
+     target_classes=['person', 'car'],
+ )
+
+ for chunk in processor.process_static('input.mp4', settings):
+     # Stream chunks in real-time
+     print(f"Chunk {chunk.chunk_index}: {len(chunk.frames)} frames at {chunk.fps} FPS")
+
+ # Dynamic mode (bandwidth-adaptive)
+ settings = CompressionSettings(
+     mode='dynamic',
+     target_bandwidth_kbps=500,
+     min_fps=5,
+     max_fps=30,
+     chunk_duration_sec=1.0,
+     target_classes=['person', 'car'],
+ )
+
+ for chunk in processor.process_dynamic('input.mp4', settings):
+     # Adaptive FPS and quality per chunk based on motion
+     print(f"Chunk {chunk.chunk_index}: fps={chunk.fps:.1f}, quality={chunk.quality_level}")
+ ```
+
+ ### Motion-Adaptive Compression
+ The dynamic mode analyzes each chunk for:
+ - **Motion magnitude**: Mean pixel change between frames
+ - **Motion coverage**: Fraction of pixels with significant motion
+ - **Scene complexity**: Edge density and texture variance
+ - **Scene changes**: Large global differences
+
+ High-motion scenes get:
+ - More frames (higher FPS)
+ - Higher spatial compression (lower quality/sigma) to stay within bandwidth
+
+ Low-motion scenes get:
+ - Fewer frames (lower FPS)
+ - Better spatial quality (higher quality/sigma)
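+
+ A toy sketch of that trade-off (purely illustrative; the real logic lives in `MotionAnalyzer` and `ChunkCompressor`, whose interfaces may differ):
+
+ ```python
+ def adapt_chunk_settings(motion: float, min_fps=5.0, max_fps=30.0,
+                          min_sigma=0.1, max_sigma=0.5):
+     """motion in [0, 1] -> (fps, sigma) under a fixed bandwidth budget."""
+     fps = min_fps + motion * (max_fps - min_fps)          # more motion -> more frames
+     sigma = max_sigma - motion * (max_sigma - min_sigma)  # more motion -> cheaper background
+     return fps, sigma
+ ```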
+
+ ### Object Tracking
+ ```python
+ from detection import SimpleTracker, draw_tracks
+
+ tracker = SimpleTracker(iou_threshold=0.3, max_age=30)
+
+ for frame_detections in frame_by_frame_detections:
+     tracks = tracker.update(frame_detections)
+     # tracks contains track_id, label, bbox, history
+
+     # Draw tracks with trails
+     img = draw_tracks(frame, tracks, show_id=True, show_trail=True)
+ ```
+
+ ## Coding Guidelines
+ - Don't create unnecessary files; focus on core functionality.
+ - Ensure all scripts have clear argument parsing and help messages.
+ - Maintain consistent coding style and comments for clarity.
+ - Validate inputs (image paths, checkpoint paths, segmentation classes).
+ - Include error handling for common issues (file not found, dimension mismatches).
+ - Document all functions and classes with docstrings.
+ - Write modular code to facilitate testing and future extensions.
+ - Use ipynb files for prototyping but keep main logic in .py files.
.gitignore ADDED
@@ -0,0 +1,69 @@
+ # --- Python ---
+ __pycache__/
+ *.py[cod]
+ *.pyd
+ *.so
+ *.egg-info/
+ dist/
+ build/
+ .eggs/
+ .pytest_cache/
+ .mypy_cache/
+ .ruff_cache/
+ .pytype/
+ .coverage
+ coverage.xml
+ htmlcov/
+
+ # --- Virtual environments ---
+ venv/
+ .venv/
+ env/
+ ENV/
+
+ # --- Jupyter ---
+ .ipynb_checkpoints/
+
+ # --- OS / editor ---
+ .DS_Store
+ Thumbs.db
+ .vscode/
+ .idea/
+
+ # --- Secrets / local config ---
+ .env
+ .env.*
+ *.key
+ *.pem
+
+ # --- Logs / temp ---
+ *.log
+ logs/
+ tmp/
+ .cache/
+
+ # --- Gradio / HF Spaces artifacts ---
+ flagged/
+ gradio_cached_examples/
+ .gradio/
+
+ # --- Data / media ---
+ data/
+
+ # --- Model + framework caches (keep your curated checkpoints, ignore auto-download caches) ---
+ checkpoints/hf/
+ checkpoints/torch/
+ checkpoints/yolo*.pt
+
+ # --- Common ML experiment outputs ---
+ runs/
+ wandb/
+ outputs/
+ results/
+
+ # --- Large artifacts (uncomment if you don't want binaries tracked) ---
+ # *.pth
+ # *.pt
+ # *.pth.tar
+ # *.onnx
+ # *.ckpt
API.md ADDED
@@ -0,0 +1,1029 @@
+ # API Documentation
+
+ This document describes the Gradio API endpoints exposed by the ROI-VAE image and video compression application. The API allows programmatic access to segmentation, compression, detection, and full pipeline processing for both images and videos.
+
+ **Live Demo:** https://biaslab2025-contextual-communication-demo.hf.space
+
+ ## Table of Contents
+
+ - [Quick Start](#quick-start)
+ - [Important Notes](#important-notes)
+ - [Image API Endpoints](#image-api-endpoints)
+   - [/segment](#1-segment---generate-roi-mask)
+   - [/compress](#2-compress---compress-image)
+   - [/detect](#3-detect---object-detection)
+   - [/detect_overlay](#31-detect_overlay---detection-with-visualization)
+   - [/process](#4-process---full-image-pipeline)
+ - [Video API Endpoints](#video-api-endpoints)
+   - [/segment_video](#1-segment_video---segment-video)
+   - [/compress_video](#2-compress_video---compress-video)
+   - [/detect_video](#3-detect_video---video-detection)
+   - [/process_video](#4-process_video---full-video-pipeline)
+ - [Class Reference](#class-reference)
+ - [Error Handling](#error-handling)
+ - [GPU Quota Handling](#handling-gpu-quota-on-hf-spaces)
+ - [cURL Examples](#using-with-curl)
+ - [Example Scripts](#example-scripts)
+
+ ---
+
+ ## Quick Start
+
+ ### Installation
+
+ ```bash
+ pip install gradio_client
+ ```
+
+ ### Image Processing
+
+ ```python
+ from gradio_client import Client, handle_file
+
+ # Connect to the API
+ client = Client("https://biaslab2025-contextual-communication-demo.hf.space")
+ # Or local: client = Client("http://localhost:7860")
+
+ # Full pipeline: segment → compress → detect
+ compressed, mask, bpp, ratio, coverage, detections_json = client.predict(
+     handle_file("path/to/image.jpg"),
+     "car, person",  # segmentation prompt
+     "sam3",         # segmentation method
+     4,              # quality level (1-5)
+     0.3,            # sigma (background compression)
+     True,           # run detection
+     "yolo",         # detection method
+     "",             # detection classes (empty for closed-vocab)
+     api_name="/process"
+ )
+
+ print(f"Compression: {bpp:.4f} bpp ({ratio:.2f}x)")
+ ```
+
+ ### Video Processing
+
+ ```python
+ from gradio_client import Client, handle_file
+ import json
+
+ client = Client("http://localhost:7860")
+
+ # Full pipeline with static settings
+ output_video, stats_json = client.predict(
+     handle_file("path/to/video.mp4"),
+     "person, car",  # segmentation classes
+     "sam3",         # segmentation method
+     "static",       # mode: "static" or "dynamic"
+     4,              # quality level (1-5)
+     0.3,            # sigma
+     15.0,           # output FPS
+     500,            # bandwidth (dynamic mode)
+     5,              # min_fps (dynamic mode)
+     30,             # max_fps (dynamic mode)
+     0.5,            # aggressiveness (dynamic mode)
+     False,          # run detection
+     "yolo",         # detection method
+     None,           # mask_file_path (optional)
+     api_name="/process_video"
+ )
+
+ stats = json.loads(stats_json)
+ print(f"Compressed video: {output_video}")
+ print(f"Total frames: {stats['total_frames']}")
+ ```
+
+ ---
+
+ ## Important Notes
+
+ ### File Handling
+
+ Always wrap file paths with `handle_file()` when using `gradio_client`:
+
+ ```python
+ from gradio_client import handle_file
+
+ # ✅ Correct
+ client.predict(handle_file("image.jpg"), ...)
+
+ # ❌ Incorrect - will fail with validation error
+ client.predict("image.jpg", ...)
+ ```
+
+ ### Detection Output Format
+
+ All detection endpoints return JSON strings with this structure:
+
+ ```python
+ import json
+
+ detections = json.loads(detections_json)
+ # Each detection has:
+ # - label: str (class name)
+ # - score: float (confidence 0-1)
+ # - bbox_xyxy: list[float] (bounding box [x1, y1, x2, y2])
+ ```
+
+ ### Open-Vocabulary Detectors
+
+ The following detectors require a `classes` parameter:
+ - `yolo_world` - YOLO-World
+ - `grounding_dino` - Grounding DINO
+
+ Closed-vocabulary detectors (`yolo`, `detr`, `faster_rcnn`, etc.) use pretrained COCO classes and ignore the `classes` parameter.
+
+ ---
+
+ ## Image API Endpoints
+
+ ### 1. `/segment` - Generate ROI Mask
+
+ Segments an image to create a Region of Interest (ROI) mask.
+
+ **Parameters:**
+
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `image` | Image | required | Input image file |
+ | `prompt` | str | `"object"` | Comma-separated classes or natural language prompt |
+ | `method` | str | `"sam3"` | Segmentation method (see [methods](#segmentation-methods)) |
+ | `return_overlay` | bool | `False` | If `True`, returns image with ROI highlighted instead of mask |
+
+ **Returns:**
+
+ | Output | Type | Description |
+ |--------|------|-------------|
+ | `result_image` | Image | Grayscale mask OR image with ROI overlay (if `return_overlay=True`) |
+ | `roi_coverage` | float | Fraction of image covered by ROI (0.0-1.0) |
+ | `classes_used` | str | JSON list of classes/prompts used |
+
+ **Example:**
+
+ ```python
+ # Get binary mask (default)
+ mask, coverage, classes = client.predict(
+     handle_file("car_scene.jpg"),
+     "car, road",
+     "sam3",
+     False,  # return_overlay
+     api_name="/segment"
+ )
+ print(f"ROI covers {coverage*100:.2f}% of image")
+
+ # Get image with ROI highlighted
+ highlighted, coverage, classes = client.predict(
+     handle_file("car_scene.jpg"),
+     "car, road",
+     "sam3",
+     True,  # return_overlay=True
+     api_name="/segment"
+ )
+ ```
+
+ ---
+
+ ### 2. `/compress` - Compress Image
+
+ Compresses an image using TIC VAE, optionally with an ROI mask for variable quality.
+
+ **Parameters:**
+
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `image` | Image | required | Input image file |
+ | `mask_image` | Image | `None` | ROI mask (white=ROI, black=background) |
+ | `quality` | int | `4` | Quality level 1-5 |
+ | `sigma` | float | `0.3` | Background preservation (0.01-1.0) |
+
+ **Quality Levels:**
+
+ | Level | Lambda | Description |
+ |-------|--------|-------------|
+ | 1 | 0.0035 | Smallest file |
+ | 2 | 0.013 | Smaller file |
+ | 3 | 0.025 | Balanced |
+ | 4 | 0.0483 | Higher quality (default) |
+ | 5 | 0.0932 | Best quality |
+
+ **Returns:**
+
+ | Output | Type | Description |
+ |--------|------|-------------|
+ | `compressed_image` | Image | Compressed output image |
+ | `bpp` | float | Bits per pixel |
+ | `compression_ratio` | float | Compression ratio (24/bpp) |
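+
+ For example, an output of 0.24 bpp corresponds to a ratio of 24 / 0.24 = 100x relative to uncompressed 24-bit RGB.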
+
+ **Example:**
+
+ ```python
+ # Compress without mask (uniform quality)
+ compressed, bpp, ratio = client.predict(
+     handle_file("image.jpg"),
+     None,  # no mask
+     4,     # quality
+     0.3,   # sigma (ignored without mask)
+     api_name="/compress"
+ )
+
+ # Compress with ROI mask
+ mask, _, _ = client.predict(handle_file("image.jpg"), "person", "yolo", False, api_name="/segment")
+
+ compressed, bpp, ratio = client.predict(
+     handle_file("image.jpg"),
+     handle_file(mask),
+     4,
+     0.2,  # aggressive background compression
+     api_name="/compress"
+ )
+ ```
+
+ ---
+
+ ### 3. `/detect` - Object Detection
+
+ Runs object detection on an image and returns detection results as JSON.
+
+ **Parameters:**
+
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `image` | Image | required | Input image file |
+ | `method` | str | `"yolo"` | Detection method (see [methods](#detection-methods)) |
+ | `classes` | str | `""` | Comma-separated classes (required for open-vocab detectors) |
+ | `confidence` | float | `0.25` | Confidence threshold (0.0-1.0) |
+
+ **Returns:**
+
+ | Output | Type | Description |
+ |--------|------|-------------|
+ | `detections_json` | str | JSON string of detection results |
+
+ **Example - Closed-Vocabulary:**
+
+ ```python
+ import json
+
+ # YOLO detection (COCO classes)
+ dets_json = client.predict(
+     handle_file("street_scene.jpg"),
+     "yolo",
+     "",  # no classes needed
+     0.25,
+     api_name="/detect"
+ )
+
+ detections = json.loads(dets_json)
+ for det in detections:
+     print(f"{det['label']}: {det['score']:.2f}")
+ ```
+
+ **Example - Open-Vocabulary:**
+
+ ```python
+ # YOLO-World with custom classes
+ dets_json = client.predict(
+     handle_file("image.jpg"),
+     "yolo_world",
+     "hat, backpack, umbrella",  # custom classes required
+     0.25,
+     api_name="/detect"
+ )
+ ```
+
+ ---
+
+ ### 3.1. `/detect_overlay` - Detection with Visualization
+
+ Runs object detection and returns the image with bounding boxes drawn.
+
+ **Parameters:**
+
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `image` | Image | required | Input image file |
+ | `method` | str | `"yolo"` | Detection method (see [methods](#detection-methods)) |
+ | `classes` | str | `""` | Comma-separated classes (required for open-vocab detectors) |
+ | `confidence` | float | `0.25` | Confidence threshold (0.0-1.0) |
+
+ **Returns:**
+
+ | Output | Type | Description |
+ |--------|------|-------------|
+ | `result_image` | Image | Image with detection bounding boxes |
+ | `detections_json` | str | JSON string of detection results |
+
+ **Example:**
+
+ ```python
+ import json
+
+ # Get image with detection boxes
+ result_img, dets_json = client.predict(
+     handle_file("street_scene.jpg"),
+     "yolo",
+     "",
+     0.25,
+     api_name="/detect_overlay"
+ )
+
+ # result_img is a file path to the image with boxes drawn
+ print(f"Image with boxes: {result_img}")
+ detections = json.loads(dets_json)
+ ```
+
+ ---
+
+ ### 4. `/process` - Full Image Pipeline
+
+ Runs the complete pipeline: segmentation → compression → optional detection.
+
+ **Parameters:**
+
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `image` | Image | required | Input image file |
+ | `prompt` | str | `"object"` | Segmentation prompt/classes |
+ | `segmentation_method` | str | `"sam3"` | ROI segmentation method |
+ | `quality` | int | `4` | Compression quality (1-5) |
+ | `sigma` | float | `0.3` | Background preservation (0.01-1.0) |
+ | `run_detection` | bool | `False` | Whether to run detection on output |
+ | `detection_method` | str | `"yolo"` | Detector to use |
+ | `detection_classes` | str | `""` | Classes for open-vocab detectors |
+
+ **Returns:**
+
+ | Output | Type | Description |
+ |--------|------|-------------|
+ | `compressed_image` | Image | Compressed output image |
+ | `mask_image` | Image | Generated ROI mask |
+ | `bpp` | float | Bits per pixel |
+ | `compression_ratio` | float | Compression ratio |
+ | `roi_coverage` | float | Fraction of image covered by ROI (0.0-1.0) |
+ | `detections_json` | str | JSON detections (empty list if `run_detection=False`) |
+
+ **Example:**
+
+ ```python
+ import json
+
+ compressed, mask, bpp, ratio, coverage, dets_json = client.predict(
+     handle_file("street.jpg"),
+     "car, person, road",
+     "sam3",
+     4,
+     0.3,
+     True,  # run detection
+     "yolo",
+     "",
+     api_name="/process"
+ )
+
+ print(f"ROI Coverage: {coverage*100:.2f}%")
+ print(f"Compression: {bpp:.4f} bpp ({ratio:.2f}x)")
+ print(f"Detections: {len(json.loads(dets_json))}")
+ ```
+
+ ---
+
+ ## Video API Endpoints
+
+ ### 1. `/segment_video` - Segment Video
+
+ Segments a video to find ROI regions, returning either a mask file or overlay video.
+
+ **Parameters:**
+
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `video_path` | Video | required | Input video file |
+ | `prompt` | str | `"object"` | Comma-separated classes or natural language prompt |
+ | `method` | str | `"sam3"` | Segmentation method |
+ | `return_overlay` | bool | `False` | If `True`, returns video with ROI highlighted |
+ | `output_fps` | float | `15.0` | Output framerate (max 30) |
+
+ **Returns:**
+
+ | Output | Type | Description |
+ |--------|------|-------------|
+ | `result_path` | File/Video | Mask file (NPZ) OR video with ROI overlay |
+ | `stats_json` | str | JSON with frame count, coverage, and classes |
+
+ **Example:**
+
+ ```python
+ import json
+
+ # Get mask file for reuse in compression
+ mask_file, stats_json = client.predict(
+     handle_file("video.mp4"),
+     "person, car",
+     "sam3",
+     False,  # return masks file
+     15.0,   # fps
+     api_name="/segment_video"
+ )
+
+ stats = json.loads(stats_json)
+ print(f"Processed {stats['total_frames']} frames")
+ print(f"Avg ROI coverage: {stats['avg_roi_coverage']*100:.2f}%")
+
+ # Get video with ROI overlay for visualization
+ overlay_video, _ = client.predict(
+     handle_file("video.mp4"),
+     "person, car",
+     "sam3",
+     True,  # return overlay video
+     15.0,
+     api_name="/segment_video"
+ )
+ ```
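+
+ The returned mask file is an NPZ archive; a quick way to inspect it locally (the array key names are not specified in this doc, so read `data.files` rather than assuming them):
+
+ ```python
+ import numpy as np
+
+ data = np.load(mask_file)   # NpzFile, behaves like a dict of arrays
+ print(data.files)           # list the stored array names
+ first = data[data.files[0]]
+ print(first.shape, first.dtype)
+ ```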
+
+ ---
+
+ ### 2. `/compress_video` - Compress Video
+
+ Compresses a video with optional ROI mask preservation.
+
+ **Parameters:**
+
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `video_path` | Video | required | Input video file |
+ | `mask_file_path` | str | `None` | Path to pre-computed masks (from `/segment_video`) |
+ | `quality` | int | `4` | Quality level (1-5) |
+ | `sigma` | float | `0.3` | Background preservation (0.01-1.0) |
+ | `output_fps` | float | `15.0` | Target output framerate |
+
+ **Returns:**
+
+ | Output | Type | Description |
+ |--------|------|-------------|
+ | `compressed_video` | Video | Compressed output video |
+ | `stats_json` | str | JSON with compression statistics |
+
+ **Example:**
+
+ ```python
+ import json
+
+ # First, segment to get masks
+ mask_file, _ = client.predict(
+     handle_file("video.mp4"), "person", "sam3", False, 15.0,
+     api_name="/segment_video"
+ )
+
+ # Then compress with cached masks (3-5x faster!)
+ compressed, stats_json = client.predict(
+     handle_file("video.mp4"),
+     mask_file,  # reuse masks
+     4,          # quality
+     0.3,        # sigma
+     15.0,       # fps
+     api_name="/compress_video"
+ )
+
+ stats = json.loads(stats_json)
+ print(f"Compression ratio: {stats['compression_ratio']}x")
+ print(f"Total size: {stats['total_size_kb']} KB")
+ ```
+
+ ---
+
+ ### 3. `/detect_video` - Video Detection
+
+ Runs object detection on each frame of a video.
+
+ **Parameters:**
+
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `video_path` | Video | required | Input video file |
+ | `method` | str | `"yolo"` | Detection method |
+ | `classes` | str | `""` | Comma-separated classes (required for open-vocab) |
+ | `confidence` | float | `0.25` | Confidence threshold (0.0-1.0) |
+ | `return_overlay` | bool | `False` | If `True`, returns video with detection boxes |
+ | `output_fps` | float | `15.0` | Output framerate (max 30) |
+
+ **Returns:**
+
+ | Output | Type | Description |
+ |--------|------|-------------|
+ | `result_video` | Video | Video with detection boxes (if `return_overlay=True`), None otherwise |
+ | `detections_json` | str | JSON with per-frame detections |
+
+ **Example:**
+
+ ```python
+ import json
+
+ # Get per-frame detections JSON
+ _, dets_json = client.predict(
+     handle_file("video.mp4"),
+     "yolo",
+     "",
+     0.25,
+     False,  # return JSON only
+     15.0,
+     api_name="/detect_video"
+ )
+
+ data = json.loads(dets_json)
+ print(f"Total detections: {data['total_detections']}")
+ print(f"Avg per frame: {data['avg_detections_per_frame']}")
+
+ # Get video with detection overlays
+ det_video, _ = client.predict(
+     handle_file("video.mp4"),
+     "yolo",
+     "",
+     0.25,
+     True,  # return overlay video
+     15.0,
+     api_name="/detect_video"
+ )
+ ```
+
+ ---
+
+ ### 4. `/process_video` - Full Video Pipeline
+
+ Processes a video with ROI-based compression (segment → compress), with optional detection.
+
+ **Parameters:**
+
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `video_path` | Video | required | Input video file |
+ | `prompt` | str | `"object"` | Segmentation prompt/classes |
+ | `segmentation_method` | str | `"sam3"` | ROI segmentation method |
+ | `mode` | str | `"static"` | `"static"` or `"dynamic"` |
+ | `quality` | int | `4` | Quality level 1-5 (static mode) |
+ | `sigma` | float | `0.3` | Background preservation (static mode) |
+ | `output_fps` | float | `15.0` | Target framerate (static mode) |
+ | `bandwidth_kbps` | float | `500.0` | Target bandwidth (dynamic mode) |
+ | `min_fps` | float | `5.0` | Minimum framerate (dynamic mode) |
+ | `max_fps` | float | `30.0` | Maximum framerate (dynamic mode) |
+ | `aggressiveness` | float | `0.5` | Bandwidth savings strategy (dynamic mode): `0.0` = use full bandwidth (high FPS always), `0.5` = moderate savings, `1.0` = maximum savings (aggressive FPS reduction for low motion) |
+ | `run_detection` | bool | `False` | Whether to run detection/tracking |
+ | `detection_method` | str | `"yolo"` | Detector to use |
+ | `mask_file_path` | str | `None` | Path to pre-computed masks (skips segmentation) |
+
+ **Returns:**
+
+ | Output | Type | Description |
+ |--------|------|-------------|
+ | `output_video` | Video | Compressed video |
+ | `stats_json` | str | JSON with detailed statistics |
+
+ **Example - Static Mode:**
+
+ ```python
+ import json
+
+ output, stats_json = client.predict(
+     handle_file("video.mp4"),
+     "person, car",
+     "sam3",
+     "static",
+     4, 0.3, 15.0,     # static: quality, sigma, fps
+     500, 5, 30, 0.5,  # dynamic: bandwidth, min_fps, max_fps, aggressiveness (ignored)
+     False, "yolo", None,
+     api_name="/process_video"
+ )
+
+ stats = json.loads(stats_json)
+ print(f"Processed {stats['total_frames']} frames")
+ ```
+
+ **Example - Dynamic Mode:**
+
+ ```python
+ output, stats_json = client.predict(
+     handle_file("video.mp4"),
+     "person",
+     "yolo",
+     "dynamic",
+     4, 0.3, 15.0,  # static settings (ignored)
+     750,           # target bandwidth 750 kbps
+     8,             # min FPS
+     30,            # max FPS
+     0.5,           # aggressiveness
+     True, "yolo", None,
+     api_name="/process_video"
+ )
+ ```
+
+ ---
+
+ ## Class Reference
+
+ ### Segmentation Methods
+
+ **Pixel-Perfect Segmentation:**
+
+ | Method | Description | Classes |
+ |--------|-------------|---------|
+ | `sam3` | Prompt-based (natural language) | Any text prompt |
+ | `yolo` | YOLO instance segmentation | 80 COCO classes |
+ | `segformer` | Cityscapes semantic segmentation | 19 classes |
+ | `mask2former` | Swin-based panoptic/semantic | 133 COCO / 150 ADE20K |
+ | `maskrcnn` | ResNet50-FPN instance segmentation | 80 COCO classes |
+
+ **Fast Detection-Based Segmentation:**
+
+ | Method | Description | Classes |
+ |--------|-------------|---------|
+ | `fake_yolo` | Fast bbox-based (YOLO + ByteTrack) | 80 COCO classes |
+ | `fake_yolo_botsort` | Fast bbox-based (YOLO + BoTSORT) | 80 COCO classes |
+ | `fake_detr` | Fast bbox-based (DETR + ByteTrack) | 80 COCO classes |
+ | `fake_fasterrcnn` | Fast bbox-based (Faster R-CNN + ByteTrack) | 80 COCO classes |
+ | `fake_retinanet` | Fast bbox-based (RetinaNet + ByteTrack) | 80 COCO classes |
+ | `fake_fcos` | Fast bbox-based (FCOS + ByteTrack) | 80 COCO classes |
+ | `fake_deformable_detr` | Fast bbox-based (Deformable DETR + ByteTrack) | 80 COCO classes |
+ | `fake_grounding_dino` | Fast bbox-based (Grounding DINO + ByteTrack) | Requires prompt |
+
+ **Note:** `fake_*` methods create rectangular masks from detection bounding boxes with object tracking. Faster than pixel-perfect segmentation, suitable for video when precise boundaries aren't critical.
+
+ ### Detection Methods
+
+ **Closed-Vocabulary (COCO pretrained):**
+
+ | Method | Description |
+ |--------|-------------|
+ | `yolo` | Ultralytics YOLO |
+ | `detr` | Facebook DETR |
+ | `faster_rcnn` | Faster R-CNN |
+ | `retinanet` | RetinaNet |
+ | `fcos` | FCOS |
+ | `ssd` | SSD300 |
+ | `deformable_detr` | Deformable DETR |
+
+ **Open-Vocabulary (requires `classes` parameter):**
+
+ | Method | Description |
+ |--------|-------------|
+ | `yolo_world` | YOLO-World |
+ | `grounding_dino` | Grounding DINO |
+
+ ### COCO Classes (80)
+
+ ```
+ person, bicycle, car, motorcycle, airplane, bus, train, truck, boat,
+ traffic light, fire hydrant, stop sign, parking meter, bench, bird, cat,
+ dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella,
+ handbag, tie, suitcase, frisbee, skis, snowboard, sports ball, kite,
+ baseball bat, baseball glove, skateboard, surfboard, tennis racket, bottle,
+ wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange,
+ broccoli, carrot, hot dog, pizza, donut, cake, chair, couch, potted plant,
+ bed, dining table, toilet, tv, laptop, mouse, remote, keyboard, cell phone,
+ microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors,
+ teddy bear, hair drier, toothbrush
+ ```
+
+ ### Cityscapes Classes (19)
+
+ ```
+ road, sidewalk, building, wall, fence, pole, traffic light, traffic sign,
+ vegetation, terrain, sky, person, rider, car, truck, bus, train, motorcycle,
+ bicycle
+ ```
+
+ ---
+
+ ## Error Handling
+
+ ```python
+ try:
+     result = client.predict(
+         handle_file("image.jpg"),
+         ...,
+         api_name="/endpoint"
+     )
+ except Exception as e:
+     print(f"API Error: {e}")
+ ```
+
+ **Common Errors:**
+
+ | Error | Cause | Solution |
+ |-------|-------|----------|
+ | Validation error for ImageData | Missing `handle_file()` | Wrap file paths with `handle_file()` |
+ | File does not exist | Invalid path | Check file path is correct |
+ | Empty detection classes | Open-vocab detector without classes | Provide classes for `yolo_world`, `grounding_dino` |
+ | GPU quota exceeded | HF Spaces limit | Wait and retry (see below) |
+
+ ---
+
+ ## Handling GPU Quota on HF Spaces
+
+ When using Hugging Face Spaces with ZeroGPU, you may encounter quota limits:
+
+ ```
+ You have exceeded your GPU quota (60s requested vs. 0s left). Try again in 0:05:30
+ ```
+
+ ### Automatic Retry with Backoff
+
+ ```python
+ import time
+ import re
+
+ def extract_wait_time(error_msg):
+     """Extract wait time from GPU quota error message."""
+     match = re.search(r'Try again in (\d+):(\d+)(?::(\d+))?', error_msg)
+     if match:
+         if match.group(3):  # HH:MM:SS
+             return int(match.group(1)) * 3600 + int(match.group(2)) * 60 + int(match.group(3))
+         else:  # MM:SS
+             return int(match.group(1)) * 60 + int(match.group(2))
+     return 60
+
+ def call_with_retry(client, *args, api_name, max_retries=5):
+     """Call API with exponential backoff retry."""
+     delay = 10
+
+     for attempt in range(max_retries):
+         try:
+             return client.predict(*args, api_name=api_name)
+         except Exception as e:
+             error_msg = str(e)
+             if "exceeded your GPU quota" in error_msg:
+                 wait_time = extract_wait_time(error_msg)
+                 actual_delay = max(delay, wait_time + 5)
+                 print(f"⏳ GPU quota exhausted. Waiting {actual_delay}s... (attempt {attempt + 1})")
+                 time.sleep(actual_delay)
+                 delay *= 2
+             else:
+                 raise
+     raise Exception("Max retries reached")
+
+ # Usage (eight positional args, matching the /process signature)
+ result = call_with_retry(
+     client,
+     handle_file("image.jpg"),
+     "car", "sam3", 4, 0.3, False, "yolo", "",
+     api_name="/process"
+ )
+ ```
+
+ ---
+
+ ## Using with cURL
+
+ ### Upload File First
+
+ ```bash
+ # Upload image
+ FILE_URL=$(curl -s -X POST http://localhost:7860/upload \
+   -F "files=@image.jpg" | \
+   python3 -c "import sys, json; print(json.load(sys.stdin)[0])")
+ ```
+
+ ### Call Endpoints
+
+ ```bash
+ # Segment
+ curl -X POST http://localhost:7860/api/segment \
+   -H "Content-Type: application/json" \
+   -d "{\"data\": [\"$FILE_URL\", \"car, person\", \"sam3\", false]}"
+
+ # Compress (no mask)
+ curl -X POST http://localhost:7860/api/compress \
+   -H "Content-Type: application/json" \
+   -d "{\"data\": [\"$FILE_URL\", null, 4, 0.3]}"
+
+ # Detect
+ curl -X POST http://localhost:7860/api/detect \
+   -H "Content-Type: application/json" \
+   -d "{\"data\": [\"$FILE_URL\", \"yolo\", \"\", 0.25]}"
+
+ # Full pipeline
+ curl -X POST http://localhost:7860/api/process \
+   -H "Content-Type: application/json" \
+   -d "{\"data\": [\"$FILE_URL\", \"car, person\", \"sam3\", 4, 0.3, true, \"yolo\", \"\"]}"
+ ```
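+
+ The same call can be made from Python without `gradio_client`, mirroring the JSON shape used above (a sketch; the exact response format depends on the Gradio version, so inspect the returned JSON before relying on its fields):
+
+ ```python
+ import requests
+
+ resp = requests.post(
+     "http://localhost:7860/api/detect",
+     json={"data": [file_url, "yolo", "", 0.25]},  # file_url from the upload step
+     timeout=300,
+ )
+ resp.raise_for_status()
+ print(resp.json())
+ ```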
+
+ ---
+
+ ## Performance Guide
+
+ ### Choosing Segmentation Methods
+
+ **Use Pixel-Perfect Segmentation when:**
+ - You need precise object boundaries
+ - Working with single images or small videos
+ - Quality is more important than speed
+ - Computing time/power is not constrained
+
+ **Use Fast Segmentation (fake_*) when:**
+ - Processing large videos or real-time streams
+ - Speed is critical (2-3x faster)
+ - Rectangular masks are acceptable
+ - Need temporal consistency (tracking maintains object IDs)
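+
+ A tiny helper encoding those rules of thumb (purely illustrative; it just returns a method name for the `segmentation_method` parameter):
+
+ ```python
+ def choose_seg_method(realtime: bool, precise_boundaries: bool,
+                       prompt_based: bool = False) -> str:
+     """Heuristic from the guidance above, not an official API."""
+     if realtime and not precise_boundaries:
+         return "fake_yolo"   # bbox masks + ByteTrack, fastest
+     if prompt_based:
+         return "sam3"        # natural-language prompts
+     return "yolo"            # pixel-perfect instance masks
+ ```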
+
+ ### Performance Benchmarks
+
+ **Video Processing (480p, 30 frames):**
+
+ | Method | Speed | Use Case |
+ |--------|-------|----------|
+ | `fake_yolo` | ~70 fps | Real-time video, fastest |
+ | `fake_yolo_botsort` | ~65 fps | Real-time with robust tracking |
+ | `fake_detr` | ~40 fps | Good speed + accuracy balance |
+ | `fake_fasterrcnn` | ~30 fps | Accurate detection |
+ | `yolo` (pixel-perfect) | ~30 fps | Instance segmentation |
+ | `sam3` | ~15 fps | Prompt-based, highest flexibility |
+ | `mask2former` | ~20 fps | Panoptic segmentation |
+
+ **Detection Performance (with batch support):**
+
+ | Detector | Single-Frame | Batch (30 frames) | Speedup |
+ |----------|--------------|-------------------|---------|
+ | YOLO26x | ~40 fps | ~70 fps | 1.75x |
+ | DETR | ~15 fps | ~40 fps | 2.67x |
+ | Faster R-CNN | ~12 fps | ~30 fps | 2.50x |
+
+ ### Example: Fast Video Processing
+
+ ```python
+ from gradio_client import Client, handle_file
+ import json
+ import time
+
+ client = Client("http://localhost:7860")
+
+ # Method 1: Fast fake segmentation (recommended for video)
+ start = time.time()
+ output1, stats1 = client.predict(
+     handle_file("long_video.mp4"),
+     "person, car",
+     "fake_yolo",  # Fast detection + tracking
+     "static",
+     4,
+     0.3,
+     15.0,
+     500, 5, 30, 0.5, False, "yolo", None,
+     api_name="/process_video"
+ )
+ fast_time = time.time() - start
+
+ # Method 2: Pixel-perfect segmentation
+ start = time.time()
+ output2, stats2 = client.predict(
+     handle_file("long_video.mp4"),
+     "person, car",
+     "yolo",  # Pixel-perfect YOLO26x-seg
+     "static",
+     4,
+     0.3,
+     15.0,
+     500, 5, 30, 0.5, False, "yolo", None,
+     api_name="/process_video"
+ )
+ perfect_time = time.time() - start
+
+ stats1_data = json.loads(stats1)
+ stats2_data = json.loads(stats2)
+
+ print(f"Fast segmentation: {fast_time:.2f}s")
+ print(f"Pixel-perfect: {perfect_time:.2f}s")
+ print(f"Speedup: {perfect_time/fast_time:.2f}x faster")
+ print(f"Compression ratio (fast): {stats1_data['compression_ratio']:.2f}x")
+ print(f"Compression ratio (perfect): {stats2_data['compression_ratio']:.2f}x")
+ ```
+
+ ### Example: Tracker Comparison
+
+ ```python
+ # Test different trackers with same detector
+ trackers = {
+     "ByteTrack (default)": "fake_yolo",
+     "BoTSORT": "fake_yolo_botsort",
+ }
+
+ for name, method in trackers.items():
+     output, stats = client.predict(
+         handle_file("test_video.mp4"),
+         "person",
+         method,
+         "static",
+         4, 0.3, 15.0,
+         500, 5, 30, 0.5, False, "yolo", None,
+         api_name="/process_video"
+     )
+
+     stats_data = json.loads(stats)
+     print(f"{name}: {stats_data['avg_roi_coverage']:.2f}% avg coverage")
+ ```
+
+ ---
921
+
922
+ ## Example Scripts
923
+
924
+ ### Batch Image Processing
925
+
926
+ ```python
927
+ from gradio_client import Client, handle_file
928
+ from pathlib import Path
929
+
930
+ client = Client("http://localhost:7860")
931
+ output_dir = Path("compressed_output")
932
+ output_dir.mkdir(exist_ok=True)
933
+
934
+ for img_path in Path("images").glob("*.jpg"):
935
+ print(f"Processing {img_path.name}...")
936
+
937
+ compressed, mask, bpp, ratio, coverage, _ = client.predict(
938
+ handle_file(str(img_path)),
939
+ "car, person",
940
+ "sam3",
941
+ 4, 0.3,
942
+ False, "", "",
943
+ api_name="/process"
944
+ )
945
+
946
+ # Save compressed image
947
+ output_path = output_dir / f"compressed_{img_path.name}"
948
+ with open(compressed, "rb") as src, open(output_path, "wb") as dst:
949
+ dst.write(src.read())
950
+
951
+ print(f" BPP: {bpp:.4f}, Ratio: {ratio:.2f}x, ROI: {coverage*100:.2f}%")
952
+ ```
953
+
954
+ ### Video Processing with Mask Caching
955
+
956
+ ```python
957
+ from gradio_client import Client, handle_file
958
+ import json
959
+
960
+ client = Client("http://localhost:7860")
961
+ video_path = "input_video.mp4"
962
+
963
+ # Step 1: Segment video (one-time cost)
964
+ mask_file, seg_stats = client.predict(
965
+ handle_file(video_path),
966
+ "person, car",
967
+ "sam3",
968
+ False, # return mask file
969
+ 15.0,
970
+ api_name="/segment_video"
971
+ )
972
+ print(f"Segmented video, masks saved to: {mask_file}")
973
+
974
+ # Step 2: Compress with different settings, reusing masks
975
+ for quality in [3, 4, 5]:
976
+ compressed, comp_stats = client.predict(
977
+ handle_file(video_path),
978
+ mask_file, # reuse cached masks
979
+ quality,
980
+ 0.3,
981
+ 15.0,
982
+ api_name="/compress_video"
983
+ )
984
+ stats = json.loads(comp_stats)
985
+ print(f"Quality {quality}: {stats['compression_ratio']}x compression")
986
+ ```
987
+
988
+ ### Detection Comparison (Original vs Compressed)
989
+
990
+ ```python
991
+ from gradio_client import Client, handle_file
992
+ import json
993
+
994
+ client = Client("http://localhost:7860")
995
+ image = "street_scene.jpg"
996
+
997
+ # Detect on original
998
+ _, dets_orig = client.predict(
999
+ handle_file(image), "yolo", "", 0.25, False,
1000
+ api_name="/detect"
1001
+ )
1002
+ orig_count = len(json.loads(dets_orig))
1003
+ print(f"Original: {orig_count} detections")
1004
+
1005
+ # Compress and detect
1006
+ compressed, _, bpp, ratio, _, dets_comp = client.predict(
1007
+ handle_file(image),
1008
+ "car, person, road",
1009
+ "sam3",
1010
+ 4, 0.3,
1011
+ True, "yolo", "",
1012
+ api_name="/process"
1013
+ )
1014
+ comp_count = len(json.loads(dets_comp))
1015
+
1016
+ retention = comp_count / orig_count * 100 if orig_count else 0
1017
+ print(f"Compressed ({ratio:.2f}x): {comp_count} detections")
1018
+ print(f"Detection retention: {retention:.1f}%")
1019
+ ```
1020
+
1021
+ ---
1022
+
1023
+ ## Additional Resources
1024
+
1025
+ - **Web UI**: Visit `http://localhost:7860` for the interactive interface
1026
+ - **GitHub**: See the repository for source code and examples
1027
+ - **Model Checkpoints**: Available in the `checkpoints/` directory
1028
+ - **Test Images**: Sample images in the `data/images/` directory
1029
+
README.md CHANGED
@@ -1,13 +1,513 @@
1
  ---
2
- title: Contextual Communication Demo 2
3
- emoji: 🦀
4
- colorFrom: purple
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
- python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Contextual Communication Demo
3
+ emoji: "📡"
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: "6.2.0"
 
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
198
+ # Contextual Communication Demo
199
+
200
+ An interactive demo for **contextual communication in bandwidth-degraded environments** (e.g., ISR collection from drones). The core idea is **context-aware compression**: transmit an extremely compact latent representation while ensuring the **decoded output remains useful for downstream decision-making** (e.g., object detection).
201
+
202
+ This repository implements **contextual spatial compression** for EO/IR-style imagery using an ROI-aware learned image compression model (TIC-style VAE) guided by segmentation masks.
203
+
204
+ ## Features
205
+
206
+ - **Contextual (ROI) compression**: Preserves fidelity in mission-relevant regions while aggressively compressing non-relevant background.
207
+ - **Mission-driven context extraction**: A mission prompt can be mapped to ROI masks via multiple segmentation strategies:
208
+ - **Class-based segmentation** (e.g., SegFormer / YOLO / Mask2Former / Mask R-CNN)
209
+ - **Prompt/referring segmentation** (SAM3)
210
+ - Optional **object detection overlays** to evaluate task retention on decoded outputs
211
+ - **Two operator knobs** for bandwidth adaptation (see the sketch after this list):
212
+ - **Background preservation** (`sigma`, 0.01–1.0): lower = more background degradation
213
+ - **Overall quality level** (checkpoint/lambda selection): higher = larger file / better reconstruction
214
+ - **Visualization**: Compare input vs decoded output and optionally highlight context regions.
215
+ - **CLI tools**: Scripts for segmentation, ROI compression, and before/after detection eval.
216
+
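+ A minimal sketch of how the two knobs map onto the compression call (this uses the `segmentation` and `vae` helpers documented under "Modular API" below; the image path is one of the bundled samples, and the PIL input type is an assumption):
+
+ ```python
+ from PIL import Image
+ from segmentation import create_segmenter
+ from vae import load_checkpoint, compress_image
+
+ image = Image.open("data/images/car/0016cf15fa4d4e16.jpg").convert("RGB")
+ mask = create_segmenter("yolo", device="cuda")(image, target_classes=["car"])
+
+ # Knob 1: overall quality level = which lambda checkpoint is loaded
+ model = load_checkpoint("checkpoints/tic_lambda_0.013.pth.tar", device="cuda")
+
+ # Knob 2: background preservation = sigma (lower = harsher background compression)
+ for sigma in (0.05, 0.3, 0.8):
+     out = compress_image(image, mask, model, sigma=sigma, device="cuda")
+     print(f"sigma={sigma}: bpp={out['bpp']:.4f}")
+ ```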
217
+ ## Setup
218
+
219
+ 1. **Install Dependencies**:
220
+ ```bash
221
+ pip install -r requirements.txt
222
+ ```
223
+
224
+ 2. **Model Checkpoints**:
225
+ Checkpoints are located in `checkpoints/` directory. Main checkpoint: `checkpoints/tic_lambda_0.0483.pth.tar`
226
+
227
+ By default, model weights/caches downloaded by detection/segmentation backends are also stored under `checkpoints/`
228
+ (Hugging Face models under `checkpoints/hf/`, torchvision weights under `checkpoints/torch/`).
229
+
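+ To sanity-check the setup, list the bundled LFS checkpoints (the five lambda values below are the ones shipped in this repo):
+
+ ```bash
+ ls -lh checkpoints/*.pth.tar
+ # tic_lambda_0.0035, tic_lambda_0.013, tic_lambda_0.025,
+ # tic_lambda_0.0483, tic_lambda_0.0932
+ ```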
230
+ ## Usage
231
+
232
+ ### Interactive Demo (Hugging Face Spaces / Local)
233
+
234
+ This repo includes a Gradio app intended for Hugging Face Spaces (`app_file: app.py`). To run locally:
235
+
236
+ ```bash
237
+ python app.py
238
+ ```
239
+
240
+ In the UI:
241
+
242
+ - Enter a **Mission** and choose a **Context Extraction Method (ROI)**.
243
+ - Tune the two knobs to match bandwidth constraints:
244
+ - **Transmission quality** (checkpoint selection)
245
+ - **Background preservation** ($\sigma$)
246
+ - Optionally enable **object detection overlays** to visualize task retention on the decoded image.
247
+
248
+ Note: the app includes a **Video** tab placeholder (inactive).
249
+
250
+ ### Contextual Spatial Compression (Images)
251
+
252
+ Run the compression script with an input image:
253
+
254
+ ```bash
255
+ python roi_compressor.py \
256
+ --input data/images/car/0016cf15fa4d4e16.jpg \
257
+ --output results/compressed.jpg \
258
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
259
+ --sigma 0.3 \
260
+ --seg-classes car \
261
+ --highlight
262
+ ```
263
+
264
+ **Arguments:**
265
+ - `--input`: Path to input image.
266
+ - `--output`: Path to save compressed image.
267
+ - `--checkpoint`: Path to model checkpoint.
268
+ - `--sigma`: Background quality factor (lower = more compression). Default: 0.3.
269
+ - `--lambda`: Rate-distortion tradeoff parameter (default: 0.0483).
270
+ - `--seg-method`: Segmentation method (`segformer`, `yolo`, `mask2former`, `maskrcnn`). Default: `segformer`.
271
+ - `--seg-classes`: List of classes to treat as ROI (e.g., `car`, `person`).
272
+ - `--highlight`: Save a comparison grid with ROI highlighted.
273
+
274
+ Tip: you can bypass segmentation by providing `--load-mask`.
275
+
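+ For example (a sketch reusing a mask previously written by `roi_segmenter.py`):
+
+ ```bash
+ python roi_compressor.py \
+   --input data/images/car/0016cf15fa4d4e16.jpg \
+   --output results/compressed.jpg \
+   --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
+   --sigma 0.3 \
+   --load-mask results/mask.png
+ ```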
276
+ ### Segmentation Only
277
+
278
+ Generate segmentation masks without compression:
279
+
280
+ ```bash
281
+ python roi_segmenter.py \
282
+ --input data/images/car/0016cf15fa4d4e16.jpg \
283
+ --output results/mask.png \
284
+ --method segformer \
285
+ --classes car \
286
+ --visualize
287
+ ```
288
+
289
+ Prompt-based segmentation (SAM3):
290
+
291
+ ```bash
292
+ python roi_segmenter.py \
293
+ --input data/images/car/0016cf15fa4d4e16.jpg \
294
+ --output results/mask.png \
295
+ --method sam3 \
296
+ --prompt "a car" \
297
+ --visualize
298
+ ```
299
+
300
+ ## Project Structure
301
+
302
+ ```
303
+ .
304
+ ├── app.py # Gradio demo (Hugging Face Spaces)
305
+ ├── README.md
306
+ ├── requirements.txt
307
+ ├── model_cache.py # Cache routing to `checkpoints/`
308
+ ├── examples.sh # Example CLI commands
309
+ ├── _segmentation_comparison.ipynb
310
+ ├── roi_compressor.py # CLI: contextual (ROI) image compression
311
+ ├── roi_segmenter.py # CLI: ROI mask generation
312
+ ├── roi_detection_eval.py # CLI: before/after detection retention
313
+ ├── checkpoints/ # Compression checkpoints + model caches
314
+ ├── data/images/ # Sample images
315
+ ├── segmentation/ # Segmenters + factory
316
+ ├── detection/ # Detectors + factory
317
+ └── vae/ # ROI-aware TIC model + compression utils
318
+ ```
319
+
320
+ ## Modular API
321
+
322
+ ### Using Segmentation Module
323
+
324
+ ```python
325
+ from segmentation import create_segmenter
326
+
327
+ # Create a segmenter
328
+ segmenter = create_segmenter('yolo', device='cuda', conf_threshold=0.3)
329
+
330
+ # Segment image
331
+ mask = segmenter(image, target_classes=['car', 'person'])
332
+ ```
333
+
334
+ ### Using Compression Module
335
+
336
+ ```python
337
+ from vae import load_checkpoint, compress_image
338
+ from PIL import Image
339
+
340
+ # Load model
341
+ model = load_checkpoint('checkpoints/tic_lambda_0.0483.pth.tar', device='cuda')
342
+
343
+ # Compress with ROI mask
344
+ result = compress_image(image, mask, model, sigma=0.3, device='cuda')
345
+ compressed_img = result['compressed']
346
+ bpp = result['bpp']
347
+ ```
+
348
+ ## Object Detection (New)
349
+
350
+ An extendable object detection module is available in `detection/` with multiple implemented backends:
351
+
352
+ - YOLO (Ultralytics)
353
+ - YOLO-World (Ultralytics, open-vocabulary)
354
+ - Faster R-CNN (torchvision)
355
+ - RetinaNet (torchvision)
356
+ - SSD (torchvision)
357
+ - FCOS (torchvision)
358
+ - DETR (transformers)
359
+ - Deformable DETR (transformers, if supported by your installed version)
360
+ - EfficientDet (optional, requires `effdet`)
361
+ - Grounding DINO (transformers, open-vocabulary)
362
+
363
+ Open-vocabulary detectors (YOLO-World / Grounding DINO) require text prompts/classes at runtime.
364
+
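+ A minimal usage sketch with the detector factory (it mirrors the docstrings in `detection/`; open-vocabulary backends take their classes as `prompts=` at call time):
+
+ ```python
+ from PIL import Image
+ from detection import create_detector, get_available_detectors
+
+ print(get_available_detectors())
+
+ image = Image.open("data/images/car/0016cf15fa4d4e16.jpg")
+
+ # Closed-vocabulary: the class list is fixed by the model
+ detector = create_detector("fasterrcnn", device="cuda")
+ for det in detector(image, conf_threshold=0.25):
+     print(det.label, det.score, det.bbox_xyxy)
+
+ # Open-vocabulary: classes come from text prompts
+ ovd = create_detector("grounding_dino", device="cuda")
+ dets = ovd(image, conf_threshold=0.25, prompts=["car", "person"])
+ ```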
365
+ ### Evaluate Detection Before/After ROI Compression
366
+
367
+ Compare an original image vs an already-compressed image:
368
+
369
+ ```bash
370
+ python roi_detection_eval.py \
371
+ --before data/images/car/0016cf15fa4d4e16.jpg \
372
+ --after results/compressed.jpg \
373
+ --detectors yolo fasterrcnn detr \
374
+ --viz-dir results/det_viz
375
+ ```
376
+
377
+ Or generate the "after" image via ROI compression and then evaluate:
378
+
379
+ ```bash
380
+ python roi_detection_eval.py \
381
+ --before data/images/car/0016cf15fa4d4e16.jpg \
382
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
383
+ --sigma 0.3 \
384
+ --seg-method yolo --seg-classes car \
385
+ --detectors yolo fasterrcnn \
386
+ --save-after results/after.jpg \
387
+ --viz-dir results/det_viz
389
+ ```
390
+
391
+ Open-vocabulary example (YOLO-World):
392
+
393
+ ```bash
394
+ python roi_detection_eval.py \
395
+ --before data/images/person/kodim04.png \
396
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
397
+ --sigma 0.3 \
398
+ --seg-method yolo --seg-classes person \
399
+ --detectors yolo_world \
400
+ --open-vocab-classes "person,car" \
401
+ --viz-dir results/det_viz
402
+ ```
403
+
404
+ Open-vocabulary example (Grounding DINO):
405
+
406
+ ```bash
407
+ python roi_detection_eval.py \
408
+ --before data/images/car/0016cf15fa4d4e16.jpg \
409
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
410
+ --sigma 0.3 \
411
+ --seg-method yolo --seg-classes car \
412
+ --detectors grounding_dino \
413
+ --open-vocab-classes "car,person" \
414
+ --viz-dir results/det_viz
415
+ ```
416
+
417
+ ## Programmatic API
418
+
419
+ The application exposes a Gradio API for programmatic access to all features:
420
+
421
+ ### Image API
422
+ - `/segment` - Segment image → mask or overlay
423
+ - `/compress` - Compress image with optional ROI mask
424
+ - `/detect` - Run object detection → JSON or overlay
425
+ - `/process` - Full pipeline: segment → compress → detect
426
+
427
+ ### Video API (Buffered)
428
+ - `/segment_video` - Segment video → mask file or overlay video
429
+ - `/compress_video` - Compress video with optional cached masks
430
+ - `/detect_video` - Run detection on video → JSON or overlay video
431
+ - `/process_video` - Full pipeline with static/dynamic modes
432
+
433
+ ### Video API (Streaming - NEW!)
434
+ - `/stream_process_video` - Stream compressed chunks progressively (HLS-style)
435
+ - `/stream_compress_video` - Stream chunks with pre-computed masks
436
+
437
+ **Key difference**: Streaming endpoints yield chunks as they're produced (low latency, ~1 second for first chunk) instead of buffering the entire video. Perfect for real-time streaming applications.
438
+
439
+ See [API.md](API.md) for complete documentation with examples.
440
+ See [STREAMING_API.md](STREAMING_API.md) for streaming API guide and comparison.
441
+
442
+ ### Quick Example
443
+
444
+ ```python
445
+ from gradio_client import Client, handle_file
+ import json
446
+
447
+ client = Client("http://localhost:7860")
448
+
449
+ # Image: segment → compress → detect
450
+ compressed, mask, bpp, ratio, coverage, detections = client.predict(
451
+ handle_file("image.jpg"),
452
+ "car, person", # mission prompt
453
+ "sam3", # ROI method
454
+ 4, # quality level (1-5)
455
+ 0.3, # sigma (background preservation)
456
+ True, # run detection
457
+ "yolo", # detection method
458
+ "", # detection classes
459
+ api_name="/process"
460
+ )
461
+
462
+ # Video: streaming compression (chunk-by-chunk)
463
+ chunk_stream = client.submit(
464
+ handle_file("video.mp4"),
465
+ "person, car",
466
+ "sam3", "static",
467
+ 4, 0.3, 15.0,
468
+ api_name="/stream_process_video"
469
+ )
470
+
471
+ for chunk_json in chunk_stream:
472
+ chunk = json.loads(chunk_json)
473
+ if chunk.get("status") == "complete":
474
+ break
475
+ print(f"Chunk {chunk['chunk_index']}: {len(chunk['frames'])} frames")
476
+ ```
477
+
478
+ ### JavaScript/Frontend Integration
479
+
480
+ **Yes, streaming works great with JavaScript!** The `@gradio/client` package fully supports async iterators for streaming:
481
+
482
+ ```javascript
483
+ import { Client } from "@gradio/client";
484
+
485
+ const client = await Client.connect("http://localhost:7860");
486
+ const stream = client.submit("/stream_process_video", {
487
+ video_path: videoFile,
488
+ prompt: "person, car",
489
+ segmentation_method: "sam3",
490
+ mode: "static",
491
+ quality: 4,
492
+ sigma: 0.3,
493
+ output_fps: 15.0,
494
+ frame_format: "jpeg",
495
+ frame_quality: 85
496
+ });
497
+
498
+ for await (const msg of stream) {
499
+ const chunk = JSON.parse(msg.data);
500
+ if (chunk.status === "complete") break;
501
+
502
+ // Display frames immediately
503
+ displayFrame(`data:image/jpeg;base64,${chunk.frames[0]}`);
504
+ }
505
+ ```
506
+
507
+ **Complete examples available:**
508
+ - [examples/streaming_demo.html](examples/streaming_demo.html) - Standalone HTML demo
509
+ - [examples/streaming_client.ts](examples/streaming_client.ts) - React/TypeScript/Vanilla JS examples
510
+
511
+ See [STREAMING_API.md](STREAMING_API.md) for a detailed streaming guide.
+
+ ## Notes
+
+ - OpenCV is included via `opencv-python-headless` (recommended for server/Spaces environments).
+ - Some backends download weights on first use; caches are routed under `checkpoints/`.
+ - Output directories like `results/` are created at runtime by the CLIs.
_segmentation_comparison.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/tic_lambda_0.0035.pth.tar ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03f2627ba49ddc117c2235868e179e48cd7fbb343b0ab637f6bac575f447b44f
3
+ size 93931400
checkpoints/tic_lambda_0.013.pth.tar ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:821df1b78110d8bb6d01809a55b64ea1da8f17ed5e893d33ed258ee180054385
3
+ size 93922614
checkpoints/tic_lambda_0.025.pth.tar ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:867f34e9949cc02800d1c8eff48c4f67ff7436d0e8c8208aec572fd78d83affb
3
+ size 168141319
checkpoints/tic_lambda_0.0483.pth.tar ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9686371eed833dc2a6fdd7c07cf3695290c482f53b75db5be662892a45a71430
3
+ size 168229460
checkpoints/tic_lambda_0.0932.pth.tar ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b7ddfa2f5f1b5082d087135a806f11ae65e8f8da5f724c5e510db264d3816c0
3
+ size 168141383
detection/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ """Object detection module.
2
+
3
+ Provides an extendable interface + factory similar to `segmentation/`.
4
+
5
+ Detectors:
6
+ - yolo (Ultralytics)
7
+ - yolo_world (Ultralytics YOLO-World; open-vocabulary)
8
+ - fasterrcnn, retinanet, ssd, fcos (torchvision)
9
+ - efficientdet (optional: effdet)
10
+ - detr, deformable_detr (transformers)
11
+ - grounding_dino (transformers; open-vocabulary)
12
+
13
+ Tracking:
14
+ - SimpleTracker (IoU-based multi-object tracking)
15
+ """
16
+
17
+ from .base import BaseDetector, Detection
18
+ from .factory import create_detector, register_detector, get_available_detectors
19
+ from .tracker import SimpleTracker, Track, draw_tracks
20
+
21
+ __all__ = [
22
+ "BaseDetector",
23
+ "Detection",
24
+ "create_detector",
25
+ "register_detector",
26
+ "get_available_detectors",
27
+ "SimpleTracker",
28
+ "Track",
29
+ "draw_tracks",
30
+ ]
detection/base.py ADDED
@@ -0,0 +1,83 @@
1
+ """Abstract base class for object detection models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass
7
+ from typing import Any, Dict, List, Optional, Sequence, Union
8
+
9
+ from PIL import Image
10
+
11
+
12
+ @dataclass(frozen=True)
13
+ class Detection:
14
+ """Single detection result."""
15
+
16
+ label: str
17
+ score: float
18
+ # [x1, y1, x2, y2] in pixel coordinates
19
+ bbox_xyxy: List[float]
20
+
21
+
22
+ class BaseDetector(ABC):
23
+ """Common interface for all object detectors."""
24
+
25
+ def __init__(self, device: str = "cuda", **kwargs: Any):
26
+ self.device = device
27
+ self.model = None
28
+ self._is_loaded = False
29
+
30
+ @abstractmethod
31
+ def load_model(self) -> None:
32
+ """Load weights and prepare for inference."""
33
+
34
+ @abstractmethod
35
+ def detect(
36
+ self,
37
+ image: Image.Image,
38
+ conf_threshold: float = 0.25,
39
+ **kwargs: Any,
40
+ ) -> List[Detection]:
41
+ """Run detection and return a list of detections."""
42
+
43
+ @abstractmethod
44
+ def get_available_classes(self) -> Union[List[str], Dict[str, int], None]:
45
+ """Return the class list (or mapping) supported by this detector.
46
+
47
+ For open-vocabulary / prompt-based detectors, return None.
48
+ """
49
+
50
+ def ensure_loaded(self) -> None:
51
+ if not self._is_loaded:
52
+ self.load_model()
53
+ self._is_loaded = True
54
+
55
+ def __call__(
56
+ self,
57
+ image: Image.Image,
58
+ conf_threshold: float = 0.25,
59
+ **kwargs: Any,
60
+ ) -> List[Detection]:
61
+ self.ensure_loaded()
62
+ return self.detect(image=image, conf_threshold=conf_threshold, **kwargs)
63
+
64
+ def _classes_to_list(self) -> List[str]:
65
+ avail = self.get_available_classes()
66
+ if avail is None:
67
+ return []
68
+ if isinstance(avail, dict):
69
+ return list(avail.keys())
70
+ if isinstance(avail, (list, tuple, set)):
71
+ return list(avail)
72
+ try:
73
+ return list(avail) # type: ignore[arg-type]
74
+ except Exception:
75
+ return []
76
+
77
+ def supports_label(self, label: str) -> bool:
78
+ """Best-effort check whether the detector has a given label."""
79
+
80
+ classes = [c.lower() for c in self._classes_to_list()]
81
+ if not classes:
82
+ return True
83
+ return label.lower() in classes
detection/bytetrack.py ADDED
@@ -0,0 +1,358 @@
1
+ """ByteTrack: Multi-object tracker using high/low confidence detection matching.
2
+
3
+ Based on: https://github.com/ifzhang/ByteTrack
4
+ Simple, fast, and strong multi-object tracker without ReID features.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from typing import Dict, List, Optional, Tuple
11
+
12
+ import numpy as np
13
+
14
+
15
+ def linear_assignment(cost_matrix: np.ndarray) -> Tuple[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
16
+ """Solve linear assignment problem using LAP (Hungarian algorithm).
17
+
18
+ Args:
19
+ cost_matrix: Cost matrix (N x M)
20
+
21
+ Returns:
22
+ matches: Array of (row, col) pairs
23
+ unmatched: Tuple of (unmatched_rows, unmatched_cols)
24
+ """
25
+ try:
26
+ import lap
27
+ _, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=1e6)
28
+ matches = np.array([[idx, x[idx]] for idx in range(len(x)) if x[idx] >= 0])
29
+ unmatched_a = np.where(x < 0)[0]
30
+ unmatched_b = np.where(y < 0)[0]
31
+ return matches, (unmatched_a, unmatched_b)
32
+ except ImportError:
33
+ # Fallback to greedy matching
34
+ return _greedy_assignment(cost_matrix)
35
+
36
+
37
+ def _greedy_assignment(cost_matrix: np.ndarray) -> Tuple[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
38
+ """Greedy assignment fallback."""
39
+ matches = []
40
+ matched_rows = set()
41
+ matched_cols = set()
42
+
43
+ n_rows, n_cols = cost_matrix.shape
44
+ pairs = []
45
+ for i in range(n_rows):
46
+ for j in range(n_cols):
47
+ pairs.append((cost_matrix[i, j], i, j))
48
+
49
+ pairs.sort(key=lambda x: x[0])
50
+
51
+ for cost, i, j in pairs:
52
+ if i not in matched_rows and j not in matched_cols:
53
+ matches.append([i, j])
54
+ matched_rows.add(i)
55
+ matched_cols.add(j)
56
+
57
+ unmatched_rows = np.array([i for i in range(n_rows) if i not in matched_rows])
58
+ unmatched_cols = np.array([j for j in range(n_cols) if j not in matched_cols])
59
+
60
+ return np.array(matches) if matches else np.empty((0, 2)), (unmatched_rows, unmatched_cols)
61
+
62
+
63
+ @dataclass
64
+ class STrack:
65
+ """Single track for ByteTrack."""
66
+
67
+ track_id: int
68
+ label: str
69
+ tlbr: np.ndarray # [x1, y1, x2, y2]
70
+ score: float
71
+
72
+ # State
73
+ state: str = "tracked" # tracked, lost, removed
74
+ frame_id: int = 0
75
+ tracklet_len: int = 0
76
+
77
+ # History
78
+ history: List[Tuple[int, np.ndarray, float]] = field(default_factory=list)
79
+
80
+ @property
81
+ def tlwh(self) -> np.ndarray:
82
+ """Top-left-width-height format."""
83
+ x1, y1, x2, y2 = self.tlbr
84
+ return np.array([x1, y1, x2 - x1, y2 - y1])
85
+
86
+ def activate(self, frame_id: int):
87
+ """Activate new track."""
88
+ self.track_id = self.next_id()
89
+ self.tracklet_len = 0
90
+ self.state = "tracked"
91
+ self.frame_id = frame_id
92
+ self.history = [(frame_id, self.tlbr.copy(), self.score)]
93
+
94
+ def re_activate(self, new_track: 'STrack', frame_id: int):
95
+ """Reactivate lost track."""
96
+ self.tlbr = new_track.tlbr
97
+ self.score = new_track.score
98
+ self.tracklet_len = 0
99
+ self.state = "tracked"
100
+ self.frame_id = frame_id
101
+ self.history.append((frame_id, self.tlbr.copy(), self.score))
102
+
103
+ def update(self, new_track: 'STrack', frame_id: int):
104
+ """Update with new detection."""
105
+ self.tlbr = new_track.tlbr
106
+ self.score = new_track.score
107
+ self.tracklet_len += 1
108
+ self.state = "tracked"
109
+ self.frame_id = frame_id
110
+ self.history.append((frame_id, self.tlbr.copy(), self.score))
111
+
112
+ def mark_lost(self):
113
+ """Mark as lost."""
114
+ self.state = "lost"
115
+
116
+ def mark_removed(self):
117
+ """Mark as removed."""
118
+ self.state = "removed"
119
+
120
+ _count = 0
121
+
122
+ @staticmethod
123
+ def next_id() -> int:
124
+ STrack._count += 1
125
+ return STrack._count
126
+
127
+ @staticmethod
128
+ def reset_id():
129
+ STrack._count = 0
130
+
131
+
132
+ class ByteTracker:
133
+ """ByteTrack: Multi-object tracker using high/low confidence matching.
134
+
135
+ Key features:
136
+ - Two-stage matching with high/low confidence detections
137
+ - Hungarian algorithm for optimal assignment
138
+ - No ReID features needed (fast and simple)
139
+ """
140
+
141
+ def __init__(
142
+ self,
143
+ track_thresh: float = 0.5,
144
+ match_thresh: float = 0.8,
145
+ track_buffer: int = 30,
146
+ frame_rate: int = 30,
147
+ ):
148
+ """
149
+ Args:
150
+ track_thresh: High confidence threshold for first matching
151
+ match_thresh: IoU threshold for matching
152
+ track_buffer: Frames to keep lost tracks
153
+ frame_rate: Video frame rate
154
+ """
155
+ self.track_thresh = track_thresh
156
+ self.match_thresh = match_thresh
157
+ self.track_buffer = track_buffer
158
+ self.frame_rate = frame_rate
159
+
160
+ self.tracked_stracks: List[STrack] = []
161
+ self.lost_stracks: List[STrack] = []
162
+ self.removed_stracks: List[STrack] = []
163
+
164
+ self.frame_id = 0
165
+ self.max_time_lost = int(frame_rate / 30.0 * track_buffer)
166
+
167
+ def reset(self):
168
+ """Reset tracker state."""
169
+ self.tracked_stracks = []
170
+ self.lost_stracks = []
171
+ self.removed_stracks = []
172
+ self.frame_id = 0
173
+ STrack.reset_id()
174
+
175
+ def update(self, detections: List[Dict]) -> List[Dict]:
176
+ """Update with new detections.
177
+
178
+ Args:
179
+ detections: List of dicts with label, score, bbox_xyxy
180
+
181
+ Returns:
182
+ List of track dicts with track_id, label, bbox_xyxy, score
183
+ """
184
+ self.frame_id += 1
185
+ activated_stracks = []
186
+ refind_stracks = []
187
+ lost_stracks = []
188
+ removed_stracks = []
189
+
190
+ # Separate high and low confidence detections
191
+ remain_high_inds = [i for i, d in enumerate(detections) if d["score"] >= self.track_thresh]
192
+ remain_low_inds = [i for i, d in enumerate(detections) if d["score"] < self.track_thresh]
193
+
194
+ dets_high = [detections[i] for i in remain_high_inds]
195
+ dets_low = [detections[i] for i in remain_low_inds]
196
+
197
+ # Convert to STrack format
198
+ detections_high = [self._det_to_strack(d) for d in dets_high]
199
+ detections_low = [self._det_to_strack(d) for d in dets_low]
200
+
201
+ # ---- Step 1: Match high-confidence detections with tracked tracks ----
202
+ strack_pool = self.tracked_stracks
203
+
204
+ # Compute IoU distance
205
+ dists = self._iou_distance(strack_pool, detections_high)
206
+
207
+ # Hungarian matching
208
+ matches, u_track, u_detection = self._matching(dists, strack_pool, detections_high)
209
+
210
+ # Update matched tracks
211
+ for itracked, idet in matches:
212
+ track = strack_pool[itracked]
213
+ det = detections_high[idet]
214
+ track.update(det, self.frame_id)
215
+ activated_stracks.append(track)
216
+
217
+ # ---- Step 2: Match unmatched tracks with low-confidence detections ----
218
+ # Second association uses all remaining low-confidence detections
219
+ r_tracked_stracks = [strack_pool[i] for i in u_track if strack_pool[i].state == "tracked"]
220
+
221
+ dists = self._iou_distance(r_tracked_stracks, detections_low)
222
+ matches, u_track, u_detection_low = self._matching(dists, r_tracked_stracks, detections_low)
223
+
224
+ for itracked, idet in matches:
225
+ track = r_tracked_stracks[itracked]
226
+ det = detections_low[idet]
227
+ track.update(det, self.frame_id)
228
+ activated_stracks.append(track)
229
+
230
+ # Unmatched tracks become lost
231
+ for it in u_track:
232
+ track = r_tracked_stracks[it]
233
+ track.mark_lost()
234
+ lost_stracks.append(track)
235
+
236
+ # ---- Step 3: Match unmatched high-confidence detections with lost tracks ----
237
+ detections_high_second = [detections_high[i] for i in u_detection]
238
+ dists = self._iou_distance(self.lost_stracks, detections_high_second)
239
+ matches, u_lost, u_detection = self._matching(dists, self.lost_stracks, detections_high_second)
240
+
241
+ for ilost, idet in matches:
242
+ track = self.lost_stracks[ilost]
243
+ det = detections_high_second[idet]
244
+ track.re_activate(det, self.frame_id)
245
+ refind_stracks.append(track)
246
+
247
+ # ---- Step 4: Initialize new tracks ----
248
+ for inew in u_detection:
249
+ track = detections_high_second[inew]
250
+ if track.score >= self.track_thresh:
251
+ track.activate(self.frame_id)
252
+ activated_stracks.append(track)
253
+
254
+ # ---- Step 5: Remove long-lost tracks ----
255
+ for track in self.lost_stracks:
256
+ if self.frame_id - track.frame_id > self.max_time_lost:
257
+ track.mark_removed()
258
+ removed_stracks.append(track)
259
+
260
+ # Update state lists
262
+ self.tracked_stracks = activated_stracks + refind_stracks
263
+ self.lost_stracks = [t for t in self.lost_stracks if t.state == "lost"]
264
+ self.lost_stracks.extend(lost_stracks)
265
+ self.lost_stracks = [t for t in self.lost_stracks if t not in removed_stracks]
266
+ self.removed_stracks.extend(removed_stracks)
267
+
268
+ # Convert to output format
269
+ return self._stracks_to_output(self.tracked_stracks)
270
+
271
+ def _det_to_strack(self, det: Dict) -> STrack:
272
+ """Convert detection dict to STrack."""
273
+ return STrack(
274
+ track_id=-1,
275
+ label=det["label"],
276
+ tlbr=np.array(det["bbox_xyxy"]),
277
+ score=det["score"],
278
+ )
279
+
280
+ def _iou_distance(self, atracks: List[STrack], btracks: List[STrack]) -> np.ndarray:
281
+ """Compute IoU distance matrix."""
282
+ if not atracks or not btracks:
283
+ return np.zeros((len(atracks), len(btracks)))
284
+
285
+ atlbrs = np.array([track.tlbr for track in atracks])
286
+ btlbrs = np.array([track.tlbr for track in btracks])
287
+
288
+ ious = self._batch_iou(atlbrs, btlbrs)
289
+ cost_matrix = 1 - ious
290
+
291
+ return cost_matrix
292
+
293
+ def _batch_iou(self, boxes_a: np.ndarray, boxes_b: np.ndarray) -> np.ndarray:
294
+ """Batch IoU computation."""
295
+ area_a = (boxes_a[:, 2] - boxes_a[:, 0]) * (boxes_a[:, 3] - boxes_a[:, 1])
296
+ area_b = (boxes_b[:, 2] - boxes_b[:, 0]) * (boxes_b[:, 3] - boxes_b[:, 1])
297
+
298
+ iw = np.minimum(boxes_a[:, None, 2], boxes_b[:, 2]) - np.maximum(boxes_a[:, None, 0], boxes_b[:, 0])
299
+ ih = np.minimum(boxes_a[:, None, 3], boxes_b[:, 3]) - np.maximum(boxes_a[:, None, 1], boxes_b[:, 1])
300
+
301
+ iw = np.maximum(iw, 0)
302
+ ih = np.maximum(ih, 0)
303
+
304
+ inter = iw * ih
305
+ union = area_a[:, None] + area_b - inter
306
+
307
+ ious = inter / np.maximum(union, 1e-6)
308
+ return ious
309
+
310
+ def _matching(
311
+ self,
312
+ dists: np.ndarray,
313
+ atracks: List[STrack],
314
+ btracks: List[STrack],
315
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
316
+ """Match tracks using Hungarian algorithm."""
317
+ if not atracks or not btracks:
318
+ return np.empty((0, 2), dtype=int), np.arange(len(atracks)), np.arange(len(btracks))
319
+
320
+ # Filter by threshold
321
+ dists[dists > 1 - self.match_thresh] = 1e6
322
+
323
+ matches, (u_track, u_detection) = linear_assignment(dists)
324
+
325
+ return matches, u_track, u_detection
326
+
327
+ def _stracks_to_output(self, stracks: List[STrack]) -> List[Dict]:
328
+ """Convert STracks to output dict format."""
329
+ result = []
330
+ for track in stracks:
331
+ result.append({
332
+ "track_id": track.track_id,
333
+ "label": track.label,
334
+ "bbox_xyxy": track.tlbr.tolist(),
335
+ "score": track.score,
336
+ "frame_id": track.frame_id,
337
+ "tracklet_len": track.tracklet_len,
338
+ })
339
+ return result
340
+
341
+
342
+ class BoTSORT(ByteTracker):
343
+ """BoTSORT: ByteTrack with camera motion compensation and ReID.
344
+
345
+ For simplicity, this is a lightweight version without ReID features.
346
+ Adds Kalman filter for state prediction over ByteTrack.
347
+ """
348
+
349
+ def __init__(
350
+ self,
351
+ track_thresh: float = 0.5,
352
+ match_thresh: float = 0.8,
353
+ track_buffer: int = 30,
354
+ frame_rate: int = 30,
355
+ ):
356
+ super().__init__(track_thresh, match_thresh, track_buffer, frame_rate)
357
+ # BoTSORT would add Kalman filter here, but for detection-based tracking
358
+ # we keep it simple and inherit ByteTrack behavior
detection/detr.py ADDED
@@ -0,0 +1,215 @@
1
+ """DETR and Deformable DETR via Hugging Face Transformers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Dict, List, Optional, Union
6
+
7
+ import torch
8
+ from PIL import Image
9
+
10
+ from .base import BaseDetector, Detection
11
+ from model_cache import hf_cache_dir, ensure_default_checkpoint_dirs
12
+
13
+
14
+ class DETRDetector(BaseDetector):
15
+ def __init__(self, device: str = "cuda", model_name: str = "facebook/detr-resnet-50", **kwargs):
16
+ super().__init__(device=device, **kwargs)
17
+ self.model_name = model_name
18
+ self._device = torch.device(device if torch.cuda.is_available() or device == "cpu" else "cpu")
19
+ self.processor = None
20
+ self._id2label: Dict[int, str] = {}
21
+
22
+ def load_model(self) -> None:
23
+ from transformers import DetrForObjectDetection, DetrImageProcessor
24
+
25
+ ensure_default_checkpoint_dirs()
26
+ cache_dir = str(hf_cache_dir())
27
+ self.processor = DetrImageProcessor.from_pretrained(self.model_name, cache_dir=cache_dir)
28
+ self.model = DetrForObjectDetection.from_pretrained(self.model_name, cache_dir=cache_dir).to(self._device).eval()
29
+ self._id2label = dict(getattr(self.model.config, "id2label", {}) or {})
30
+
31
+ def get_available_classes(self) -> Union[List[str], Dict[str, int], None]:
32
+ if not self._id2label:
33
+ return None
34
+ return {name: int(i) for i, name in self._id2label.items()}
35
+
36
+ def detect(self, image: Image.Image, conf_threshold: float = 0.25, **kwargs) -> List[Detection]:
37
+ assert self.model is not None and self.processor is not None
38
+
39
+ img = image.convert("RGB")
40
+ inputs = self.processor(images=img, return_tensors="pt").to(self._device)
41
+
42
+ with torch.no_grad():
43
+ outputs = self.model(**inputs)
44
+
45
+ target_sizes = torch.tensor([img.size[::-1]], device=self._device) # (h, w)
46
+ results = self.processor.post_process_object_detection(
47
+ outputs,
48
+ threshold=float(conf_threshold),
49
+ target_sizes=target_sizes,
50
+ )[0]
51
+
52
+ dets: List[Detection] = []
53
+ for score, label_id, box in zip(results["scores"], results["labels"], results["boxes"]):
54
+ lid = int(label_id)
55
+ label = self._id2label.get(lid, str(lid))
56
+ b = [float(v) for v in box.tolist()]
57
+ dets.append(Detection(label=label, score=float(score), bbox_xyxy=b))
58
+ return dets
59
+
60
+ def detect_batch(self, images: List[Image.Image], conf_threshold: float = 0.25, **kwargs) -> List[List[Detection]]:
61
+ """Batch detection for video processing - faster than frame-by-frame.
62
+
63
+ Args:
64
+ images: List of PIL Images
65
+ conf_threshold: Confidence threshold
66
+
67
+ Returns:
68
+ List of detection lists, one per image
69
+ """
70
+ assert self.model is not None and self.processor is not None
71
+
72
+ if not images:
73
+ return []
74
+
75
+ # Convert all images to RGB
76
+ imgs = [img.convert("RGB") for img in images]
77
+
78
+ # Batch process images
79
+ inputs = self.processor(images=imgs, return_tensors="pt").to(self._device)
80
+
81
+ with torch.no_grad():
82
+ outputs = self.model(**inputs)
83
+
84
+ # Target sizes for each image (h, w)
85
+ target_sizes = torch.tensor([img.size[::-1] for img in imgs], device=self._device)
86
+ results = self.processor.post_process_object_detection(
87
+ outputs,
88
+ threshold=float(conf_threshold),
89
+ target_sizes=target_sizes,
90
+ )
91
+
92
+ # Parse results for each image
93
+ all_detections = []
94
+ for result in results:
95
+ frame_dets = []
96
+ for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
97
+ lid = int(label_id)
98
+ label = self._id2label.get(lid, str(lid))
99
+ b = [float(v) for v in box.tolist()]
100
+ frame_dets.append(Detection(label=label, score=float(score), bbox_xyxy=b))
101
+ all_detections.append(frame_dets)
102
+
103
+ return all_detections
104
+
105
+
106
+ class DeformableDETRDetector(BaseDetector):
107
+ """Deformable DETR wrapper.
108
+
109
+ This relies on transformers' Deformable DETR implementations and checkpoints.
110
+ If your transformers build doesn't include Deformable DETR, this will raise.
111
+
112
+ Default checkpoint is `SenseTime/deformable-detr`.
113
+ """
114
+
115
+ def __init__(self, device: str = "cuda", model_name: str = "SenseTime/deformable-detr", **kwargs):
116
+ super().__init__(device=device, **kwargs)
117
+ self.model_name = model_name
118
+ self._device = torch.device(device if torch.cuda.is_available() or device == "cpu" else "cpu")
119
+ self.processor = None
120
+ self._id2label: Dict[int, str] = {}
121
+
122
+ def load_model(self) -> None:
123
+ try:
124
+ from transformers import AutoImageProcessor, AutoModelForObjectDetection
125
+ except Exception as e:
126
+ raise ImportError("transformers is required for Deformable DETR") from e
127
+
128
+ ensure_default_checkpoint_dirs()
129
+ cache_dir = str(hf_cache_dir())
130
+ self.processor = AutoImageProcessor.from_pretrained(self.model_name, cache_dir=cache_dir)
131
+ self.model = AutoModelForObjectDetection.from_pretrained(self.model_name, cache_dir=cache_dir).to(self._device).eval()
132
+ self._id2label = dict(getattr(self.model.config, "id2label", {}) or {})
133
+
134
+ def get_available_classes(self) -> Union[List[str], Dict[str, int], None]:
135
+ if not self._id2label:
136
+ return None
137
+ return {name: int(i) for i, name in self._id2label.items()}
138
+
139
+ def detect(self, image: Image.Image, conf_threshold: float = 0.25, **kwargs) -> List[Detection]:
140
+ assert self.model is not None and self.processor is not None
141
+
142
+ img = image.convert("RGB")
143
+ inputs = self.processor(images=img, return_tensors="pt").to(self._device)
144
+
145
+ with torch.no_grad():
146
+ outputs = self.model(**inputs)
147
+
148
+ # Prefer standard post_process if provided
149
+ if hasattr(self.processor, "post_process_object_detection"):
150
+ target_sizes = torch.tensor([img.size[::-1]], device=self._device)
151
+ results = self.processor.post_process_object_detection(
152
+ outputs,
153
+ threshold=float(conf_threshold),
154
+ target_sizes=target_sizes,
155
+ )[0]
156
+
157
+ dets: List[Detection] = []
158
+ for score, label_id, box in zip(results["scores"], results["labels"], results["boxes"]):
159
+ lid = int(label_id)
160
+ label = self._id2label.get(lid, str(lid))
161
+ b = [float(v) for v in box.tolist()]
162
+ dets.append(Detection(label=label, score=float(score), bbox_xyxy=b))
163
+ return dets
164
+
165
+ # Fallback: no post_process available
166
+ raise RuntimeError("This Deformable DETR processor does not support post_process_object_detection")
167
+
168
+ def detect_batch(self, images: List[Image.Image], conf_threshold: float = 0.25, **kwargs) -> List[List[Detection]]:
169
+ """Batch detection for video processing - faster than frame-by-frame.
170
+
171
+ Args:
172
+ images: List of PIL Images
173
+ conf_threshold: Confidence threshold
174
+
175
+ Returns:
176
+ List of detection lists, one per image
177
+ """
178
+ assert self.model is not None and self.processor is not None
179
+
180
+ if not images:
181
+ return []
182
+
183
+ # Convert all images to RGB
184
+ imgs = [img.convert("RGB") for img in images]
185
+
186
+ # Batch process images
187
+ inputs = self.processor(images=imgs, return_tensors="pt").to(self._device)
188
+
189
+ with torch.no_grad():
190
+ outputs = self.model(**inputs)
191
+
192
+ # Post-process requires method support
193
+ if not hasattr(self.processor, "post_process_object_detection"):
194
+ raise RuntimeError("This Deformable DETR processor does not support post_process_object_detection")
195
+
196
+ # Target sizes for each image (h, w)
197
+ target_sizes = torch.tensor([img.size[::-1] for img in imgs], device=self._device)
198
+ results = self.processor.post_process_object_detection(
199
+ outputs,
200
+ threshold=float(conf_threshold),
201
+ target_sizes=target_sizes,
202
+ )
203
+
204
+ # Parse results for each image
205
+ all_detections = []
206
+ for result in results:
207
+ frame_dets = []
208
+ for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
209
+ lid = int(label_id)
210
+ label = self._id2label.get(lid, str(lid))
211
+ b = [float(v) for v in box.tolist()]
212
+ frame_dets.append(Detection(label=label, score=float(score), bbox_xyxy=b))
213
+ all_detections.append(frame_dets)
214
+
215
+ return all_detections
detection/factory.py ADDED
@@ -0,0 +1,50 @@
1
+ """Factory + registry for object detectors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Dict, List, Type
6
+
7
+ from .base import BaseDetector
8
+ from .yolo import YOLODetector
9
+ from .torchvision_detectors import (
10
+ FasterRCNNDetector,
11
+ RetinaNetDetector,
12
+ SSDDetector,
13
+ EfficientDetDetector,
14
+ FCOSDetector,
15
+ )
16
+ from .detr import DETRDetector, DeformableDETRDetector
17
+ from .grounding_dino import GroundingDINODetector
18
+ from .yolo_world import YOLOWorldDetector
19
+
20
+
21
+ DETECTOR_REGISTRY: Dict[str, Type[BaseDetector]] = {
22
+ "yolo": YOLODetector,
23
+ "yolo_world": YOLOWorldDetector,
24
+ "fasterrcnn": FasterRCNNDetector,
25
+ "retinanet": RetinaNetDetector,
26
+ "ssd": SSDDetector,
27
+ "efficientdet": EfficientDetDetector,
28
+ "fcos": FCOSDetector,
29
+ "detr": DETRDetector,
30
+ "deformable_detr": DeformableDETRDetector,
31
+ "grounding_dino": GroundingDINODetector,
32
+ }
33
+
34
+
35
+ def register_detector(name: str, detector_class: Type[BaseDetector]) -> None:
36
+ if not issubclass(detector_class, BaseDetector):
37
+ raise ValueError(f"{detector_class} must extend BaseDetector")
38
+ DETECTOR_REGISTRY[name.lower()] = detector_class
39
+
40
+
41
+ def create_detector(method: str, device: str = "cuda", **kwargs) -> BaseDetector:
42
+ method_lower = method.lower()
43
+ if method_lower not in DETECTOR_REGISTRY:
44
+ available = ", ".join(sorted(DETECTOR_REGISTRY.keys()))
45
+ raise ValueError(f"Unknown detector: '{method}'. Available: {available}")
46
+ return DETECTOR_REGISTRY[method_lower](device=device, **kwargs)
47
+
48
+
49
+ def get_available_detectors() -> List[str]:
50
+ return sorted(DETECTOR_REGISTRY.keys())
detection/grounding_dino.py ADDED
@@ -0,0 +1,215 @@
1
+ """Grounding DINO open-vocabulary object detection via Hugging Face Transformers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, List, Optional, Sequence, Union
6
+
7
+ import torch
8
+ from PIL import Image
9
+
10
+ from .base import BaseDetector, Detection
11
+ from model_cache import hf_cache_dir, ensure_default_checkpoint_dirs
12
+
13
+
14
+ def _normalize_prompts(prompts: Optional[Union[str, Sequence[str]]]) -> Optional[List[str]]:
15
+ if prompts is None:
16
+ return None
17
+ if isinstance(prompts, str):
18
+ # allow comma-separated convenience
19
+ parts = [p.strip() for p in prompts.split(",")]
20
+ parts = [p for p in parts if p]
21
+ return parts or None
22
+ out = [str(p).strip() for p in prompts]
23
+ out = [p for p in out if p]
24
+ return out or None
25
+
26
+
27
+ class GroundingDINODetector(BaseDetector):
28
+ """Grounding DINO wrapper.
29
+
30
+ Usage:
31
+ detector = create_detector('grounding_dino', device='cuda')
32
+ dets = detector(image, prompts=['person', 'car'])
33
+
34
+ Notes:
35
+ - This is an open-vocabulary model; `get_available_classes()` returns None.
36
+ - Pass prompts via `prompts=` or `classes=`; you may also pass raw `text=`.
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ device: str = "cuda",
42
+ model_name: str = "IDEA-Research/grounding-dino-base",
43
+ **kwargs: Any,
44
+ ):
45
+ super().__init__(device=device, **kwargs)
46
+ self.model_name = model_name
47
+ self._device = torch.device(device if (torch.cuda.is_available() or device == "cpu") else "cpu")
48
+ self.processor = None
49
+
50
+ def load_model(self) -> None:
51
+ try:
52
+ from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
53
+ except Exception as e:
54
+ raise ImportError(
55
+ "transformers>=4.36 is required for Grounding DINO (AutoProcessor + AutoModelForZeroShotObjectDetection)"
56
+ ) from e
57
+
58
+ ensure_default_checkpoint_dirs()
59
+ cache_dir = str(hf_cache_dir())
60
+ self.processor = AutoProcessor.from_pretrained(self.model_name, cache_dir=cache_dir)
61
+ self.model = AutoModelForZeroShotObjectDetection.from_pretrained(self.model_name, cache_dir=cache_dir).to(self._device).eval()
62
+
63
+ def get_available_classes(self):
64
+ return None
65
+
66
+ def detect(self, image: Image.Image, conf_threshold: float = 0.25, **kwargs: Any) -> List[Detection]:
67
+ assert self.model is not None and self.processor is not None
68
+
69
+ img = image.convert("RGB")
70
+ prompts = _normalize_prompts(kwargs.get("prompts") or kwargs.get("classes") or kwargs.get("labels"))
71
+ text: Optional[str] = kwargs.get("text")
72
+
73
+ if text is None:
74
+ if not prompts:
75
+ raise ValueError("GroundingDINO requires `prompts`/`classes` (or raw `text`) for open-vocabulary detection")
76
+ # GroundingDINO expects a single string; period-delimited works well
77
+ text = " . ".join(prompts)
78
+ if not text.endswith("."):
79
+ text = text + " ."
80
+
81
+ threshold = float(kwargs.get("threshold", conf_threshold))
82
+
83
+ inputs = self.processor(images=img, text=text, return_tensors="pt")
84
+ inputs = {k: v.to(self._device) for k, v in inputs.items()}
85
+
86
+ with torch.no_grad():
87
+ outputs = self.model(**inputs)
88
+
89
+ target_sizes = torch.tensor([img.size[::-1]], device=self._device) # (h, w)
90
+
91
+ # Prefer the processor's helper if available.
92
+ if hasattr(self.processor, "post_process_grounded_object_detection"):
93
+ # Move input_ids to CPU for post-processing (some versions have device mismatch bugs)
94
+ input_ids_for_postprocess = inputs["input_ids"].cpu()
95
+ results = self.processor.post_process_grounded_object_detection(
96
+ outputs,
97
+ input_ids_for_postprocess,
98
+ threshold=threshold,
99
+ target_sizes=target_sizes.cpu(),
100
+ )[0]
101
+ else:
102
+ raise RuntimeError("This processor does not support post_process_grounded_object_detection")
103
+
104
+ boxes = results.get("boxes")
105
+ scores = results.get("scores")
106
+ text_labels = results.get("text_labels") or results.get("labels")
107
+
108
+ if boxes is None or scores is None:
109
+ return []
110
+
111
+ dets: List[Detection] = []
112
+ for i in range(int(scores.shape[0])):
113
+ score = float(scores[i])
114
+ box = boxes[i]
115
+ b = [float(v) for v in box.tolist()]
116
+
117
+ label: str
118
+ if isinstance(text_labels, (list, tuple)) and i < len(text_labels):
119
+ label = str(text_labels[i])
120
+ elif torch.is_tensor(text_labels) and i < int(text_labels.shape[0]):
121
+ lid = int(text_labels[i])
122
+ if prompts and 0 <= lid < len(prompts):
123
+ label = prompts[lid]
124
+ else:
125
+ label = str(lid)
126
+ else:
127
+ label = "object"
128
+
129
+ dets.append(Detection(label=label, score=score, bbox_xyxy=b))
130
+
131
+ return dets
132
+
133
+ def detect_batch(self, images: List[Image.Image], conf_threshold: float = 0.25, **kwargs: Any) -> List[List[Detection]]:
134
+ """Batch detection for video processing.
135
+
136
+ Note: Grounding DINO requires the same text prompt for all images in batch.
137
+
138
+ Args:
139
+ images: List of PIL Images
140
+ conf_threshold: Confidence threshold
141
+ **kwargs: Should include 'prompts'/'classes' or 'text'
142
+
143
+ Returns:
144
+ List of detection lists, one per image
145
+ """
146
+ assert self.model is not None and self.processor is not None
147
+
148
+ if not images:
149
+ return []
150
+
151
+ imgs = [img.convert("RGB") for img in images]
152
+ prompts = _normalize_prompts(kwargs.get("prompts") or kwargs.get("classes") or kwargs.get("labels"))
153
+ text: Optional[str] = kwargs.get("text")
154
+
155
+ if text is None:
156
+ if not prompts:
157
+ raise ValueError("GroundingDINO requires `prompts`/`classes` (or raw `text`) for open-vocabulary detection")
158
+ text = " . ".join(prompts)
159
+ if not text.endswith("."):
160
+ text = text + " ."
161
+
162
+ threshold = float(kwargs.get("threshold", conf_threshold))
163
+
164
+ # Batch process images with same text prompt
165
+ inputs = self.processor(images=imgs, text=[text] * len(imgs), return_tensors="pt")
166
+ inputs = {k: v.to(self._device) for k, v in inputs.items()}
167
+
168
+ with torch.no_grad():
169
+ outputs = self.model(**inputs)
170
+
171
+ target_sizes = torch.tensor([img.size[::-1] for img in imgs], device=self._device)
172
+
173
+ if not hasattr(self.processor, "post_process_grounded_object_detection"):
174
+ raise RuntimeError("This processor does not support post_process_grounded_object_detection")
175
+
176
+ # Post-process
177
+ input_ids_for_postprocess = inputs["input_ids"].cpu()
178
+ results = self.processor.post_process_grounded_object_detection(
179
+ outputs,
180
+ input_ids_for_postprocess,
181
+ threshold=threshold,
182
+ target_sizes=target_sizes.cpu(),
183
+ )
184
+
185
+ # Parse results for each image
186
+ all_detections = []
187
+ for result in results:
188
+ boxes = result.get("boxes")
189
+ scores = result.get("scores")
190
+ text_labels = result.get("text_labels") or result.get("labels")
191
+
192
+ frame_dets = []
193
+ if boxes is not None and scores is not None:
194
+ for i in range(int(scores.shape[0])):
195
+ score = float(scores[i])
196
+ box = boxes[i]
197
+ b = [float(v) for v in box.tolist()]
198
+
199
+ label: str
200
+ if isinstance(text_labels, (list, tuple)) and i < len(text_labels):
201
+ label = str(text_labels[i])
202
+ elif torch.is_tensor(text_labels) and i < int(text_labels.shape[0]):
203
+ lid = int(text_labels[i])
204
+ if prompts and 0 <= lid < len(prompts):
205
+ label = prompts[lid]
206
+ else:
207
+ label = str(lid)
208
+ else:
209
+ label = "object"
210
+
211
+ frame_dets.append(Detection(label=label, score=score, bbox_xyxy=b))
212
+
213
+ all_detections.append(frame_dets)
214
+
215
+ return all_detections
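One subtlety shared by detect() and detect_batch(): PIL reports image.size as (width, height), while the post-processor's target_sizes expects (height, width), hence the [::-1]. A quick check:

    from PIL import Image

    img = Image.new("RGB", (640, 480))  # width=640, height=480
    print(img.size)                     # (640, 480)
    print(img.size[::-1])               # (480, 640) -> the (h, w) order used for target_sizes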
detection/torchvision_detectors.py ADDED
@@ -0,0 +1,300 @@
1
+ """Torchvision-based object detectors: Faster R-CNN, RetinaNet, SSD, FCOS.
2
+
3
+ Also includes an EfficientDet wrapper (optional dependency: effdet).
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from model_cache import ensure_default_checkpoint_dirs
9
+
10
+ from typing import Dict, List, Optional, Union
11
+
12
+ import numpy as np
13
+ import torch
14
+ from PIL import Image
15
+ from torchvision.transforms.functional import to_tensor
16
+
17
+ from .base import BaseDetector, Detection
18
+
19
+
20
+ # Ensure torchvision/torch hub downloads land under `checkpoints/` by default.
21
+ ensure_default_checkpoint_dirs()
22
+
23
+
24
+ class _TorchvisionCOCODetector(BaseDetector):
25
+ _categories: Optional[List[str]] = None
26
+
27
+ def __init__(self, device: str = "cuda", **kwargs):
28
+ super().__init__(device=device, **kwargs)
29
+ self._device = torch.device(device if torch.cuda.is_available() or device == "cpu" else "cpu")
30
+
31
+ def get_available_classes(self) -> Union[List[str], Dict[str, int], None]:
32
+ if not self._categories:
33
+ return None
34
+ return {name: int(i) for i, name in enumerate(self._categories)}
35
+
36
+ def detect(self, image: Image.Image, conf_threshold: float = 0.25, **kwargs) -> List[Detection]:
37
+ assert self.model is not None
38
+
39
+ img = image.convert("RGB")
40
+ x = to_tensor(img).to(self._device)
41
+
42
+ with torch.no_grad():
43
+ out = self.model([x])[0]
44
+
45
+ boxes = out.get("boxes")
46
+ labels = out.get("labels")
47
+ scores = out.get("scores")
48
+
49
+ if boxes is None or labels is None or scores is None:
50
+ return []
51
+
52
+ boxes = boxes.detach().cpu().numpy()
53
+ labels = labels.detach().cpu().numpy().astype(int)
54
+ scores = scores.detach().cpu().numpy()
55
+
56
+ dets: List[Detection] = []
57
+ for b, l, s in zip(boxes, labels, scores):
58
+ if float(s) < float(conf_threshold):
59
+ continue
60
+ label_name = str(int(l))
61
+ if self._categories and 0 <= int(l) < len(self._categories):
62
+ label_name = self._categories[int(l)]
63
+ dets.append(
64
+ Detection(
65
+ label=label_name,
66
+ score=float(s),
67
+ bbox_xyxy=[float(v) for v in b.tolist()],
68
+ )
69
+ )
70
+ return dets
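For readers unfamiliar with torchvision's detection API: each element of the model's output list is a dict of tensors, which is what the parsing above consumes. A hand-built sketch (values are illustrative only):

    import torch

    out = {
        "boxes": torch.tensor([[10.0, 20.0, 110.0, 220.0]]),  # xyxy in absolute pixels
        "labels": torch.tensor([3]),                          # index into weights.meta["categories"]
        "scores": torch.tensor([0.92]),
    }
    keep = out["scores"] >= 0.25
    print(out["boxes"][keep], out["labels"][keep])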
71
+
72
+ def detect_batch(self, images: List[Image.Image], conf_threshold: float = 0.25, **kwargs) -> List[List[Detection]]:
73
+ """Batch detection for video processing - much faster than frame-by-frame.
74
+
75
+ Args:
76
+ images: List of PIL Images
77
+ conf_threshold: Confidence threshold
78
+
79
+ Returns:
80
+ List of detection lists, one per image
81
+ """
82
+ assert self.model is not None
83
+
84
+ if not images:
85
+ return []
86
+
87
+ # Convert all images to tensors
88
+ imgs = [img.convert("RGB") for img in images]
89
+ tensors = [to_tensor(img).to(self._device) for img in imgs]
90
+
91
+ # Batch inference - torchvision models accept list of tensors
92
+ with torch.no_grad():
93
+ outputs = self.model(tensors)
94
+
95
+ # Parse results for each image
96
+ all_detections = []
97
+ for out in outputs:
98
+ boxes = out.get("boxes")
99
+ labels = out.get("labels")
100
+ scores = out.get("scores")
101
+
102
+ frame_dets = []
103
+ if boxes is not None and labels is not None and scores is not None:
104
+ boxes = boxes.detach().cpu().numpy()
105
+ labels = labels.detach().cpu().numpy().astype(int)
106
+ scores = scores.detach().cpu().numpy()
107
+
108
+ for b, l, s in zip(boxes, labels, scores):
109
+ if float(s) < float(conf_threshold):
110
+ continue
111
+ label_name = str(int(l))
112
+ if self._categories and 0 <= int(l) < len(self._categories):
113
+ label_name = self._categories[int(l)]
114
+ frame_dets.append(
115
+ Detection(
116
+ label=label_name,
117
+ score=float(s),
118
+ bbox_xyxy=[float(v) for v in b.tolist()],
119
+ )
120
+ )
121
+
122
+ all_detections.append(frame_dets)
123
+
124
+ return all_detections
125
+
126
+
127
+ class FasterRCNNDetector(_TorchvisionCOCODetector):
128
+ def load_model(self) -> None:
129
+ from torchvision.models.detection import fasterrcnn_resnet50_fpn
130
+ from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
131
+
132
+ weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
133
+ self._categories = list(weights.meta.get("categories", []))
134
+ self.model = fasterrcnn_resnet50_fpn(weights=weights).to(self._device).eval()
135
+
136
+
137
+ class RetinaNetDetector(_TorchvisionCOCODetector):
138
+ def load_model(self) -> None:
139
+ from torchvision.models.detection import retinanet_resnet50_fpn
140
+ from torchvision.models.detection import RetinaNet_ResNet50_FPN_Weights
141
+
142
+ weights = RetinaNet_ResNet50_FPN_Weights.DEFAULT
143
+ self._categories = list(weights.meta.get("categories", []))
144
+ self.model = retinanet_resnet50_fpn(weights=weights).to(self._device).eval()
145
+
146
+
147
+ class SSDDetector(_TorchvisionCOCODetector):
148
+ def load_model(self) -> None:
149
+ from torchvision.models.detection import ssd300_vgg16
150
+ from torchvision.models.detection import SSD300_VGG16_Weights
151
+
152
+ weights = SSD300_VGG16_Weights.DEFAULT
153
+ self._categories = list(weights.meta.get("categories", []))
154
+ self.model = ssd300_vgg16(weights=weights).to(self._device).eval()
155
+
156
+
157
+ class FCOSDetector(_TorchvisionCOCODetector):
158
+ def load_model(self) -> None:
159
+ from torchvision.models.detection import fcos_resnet50_fpn
160
+ from torchvision.models.detection import FCOS_ResNet50_FPN_Weights
161
+
162
+ weights = FCOS_ResNet50_FPN_Weights.DEFAULT
163
+ self._categories = list(weights.meta.get("categories", []))
164
+ self.model = fcos_resnet50_fpn(weights=weights).to(self._device).eval()
165
+
166
+
167
+ class EfficientDetDetector(BaseDetector):
168
+ """EfficientDet via `effdet` (optional dependency).
169
+
170
+ This is implemented so the module is extendable, but requires:
171
+ - `pip install effdet`
172
+
173
+ We default to `tf_efficientdet_d0`.
174
+ """
175
+
176
+ def __init__(self, device: str = "cuda", model_name: str = "tf_efficientdet_d0", **kwargs):
177
+ super().__init__(device=device, **kwargs)
178
+ self.model_name = model_name
179
+ self._device = torch.device(device if torch.cuda.is_available() or device == "cpu" else "cpu")
180
+ self._categories: Optional[List[str]] = None
181
+ self._input_size_hw: Optional[tuple[int, int]] = None
182
+
183
+ def load_model(self) -> None:
184
+ try:
185
+ from effdet import create_model
186
+ from effdet.config import get_efficientdet_config
187
+ except Exception as e:
188
+ raise ImportError(
189
+ "EfficientDet requires the optional package 'effdet'. "
190
+ "Install it with: pip install effdet"
191
+ ) from e
192
+
193
+ self.model = create_model(
194
+ self.model_name,
195
+ pretrained=True,
196
+ bench_task="predict",
197
+ ).to(self._device).eval()
198
+
199
+ # effdet prediction bench expects a fixed input resolution.
200
+ cfg = get_efficientdet_config(self.model_name)
201
+ # cfg.image_size is [H, W]
202
+ self._input_size_hw = (int(cfg.image_size[0]), int(cfg.image_size[1]))
203
+
204
+ # effdet uses COCO labels by default for pretrained models
205
+ # Keep class list unknown here to avoid hard-coding.
206
+ self._categories = None
207
+
208
+ def get_available_classes(self) -> Union[List[str], Dict[str, int], None]:
209
+ return self._categories
210
+
211
+ def detect(self, image: Image.Image, conf_threshold: float = 0.25, **kwargs) -> List[Detection]:
212
+ assert self.model is not None
213
+
214
+ img = image.convert("RGB")
215
+ orig_w, orig_h = img.size
216
+
217
+ # Resize to the model's configured input size. This avoids internal shape mismatch
218
+ # issues in effdet when given arbitrary resolutions.
219
+ target_h, target_w = self._input_size_hw or (512, 512)
220
+ resized = img.resize((target_w, target_h))
221
+ x0 = to_tensor(resized)
222
+
223
+ in_h, in_w = target_h, target_w
224
+ x = x0.unsqueeze(0).to(self._device)
225
+
226
+ with torch.no_grad():
227
+ pred = self.model(x)
228
+
229
+ # With bench_task='predict', effdet returns a detections tensor of shape [B, max_det, 6]: (x1, y1, x2, y2, score, class); index the batch dim below
230
+ det = pred[0].detach().cpu().numpy()
231
+ out: List[Detection] = []
232
+ sx = orig_w / float(in_w) if in_w > 0 else 1.0
233
+ sy = orig_h / float(in_h) if in_h > 0 else 1.0
234
+ for row in det:
235
+ x1, y1, x2, y2, score, cls = row.tolist()
236
+ if float(score) < float(conf_threshold):
237
+ continue
238
+ out.append(
239
+ Detection(
240
+ label=str(int(cls)),
241
+ score=float(score),
242
+ bbox_xyxy=[float(x1) * sx, float(y1) * sy, float(x2) * sx, float(y2) * sy],
243
+ )
244
+ )
245
+ return out
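The back-scaling above is plain proportional mapping from the fixed effdet input resolution to the original frame. Worked numbers, assuming a 512x512 input and a 1920x1080 source:

    orig_w, orig_h, in_w, in_h = 1920, 1080, 512, 512
    sx, sy = orig_w / in_w, orig_h / in_h        # 3.75, 2.109375
    x1, y1, x2, y2 = 100.0, 100.0, 200.0, 300.0  # box in 512x512 space
    print([x1 * sx, y1 * sy, x2 * sx, y2 * sy])  # [375.0, 210.9375, 750.0, 632.8125]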
246
+
247
+ def detect_batch(self, images: List[Image.Image], conf_threshold: float = 0.25, **kwargs) -> List[List[Detection]]:
248
+ """Batch detection for video processing.
249
+
250
+ Args:
251
+ images: List of PIL Images
252
+ conf_threshold: Confidence threshold
253
+
254
+ Returns:
255
+ List of detection lists, one per image
256
+ """
257
+ assert self.model is not None
258
+
259
+ if not images:
260
+ return []
261
+
262
+ # Convert and resize all images
263
+ target_h, target_w = self._input_size_hw or (512, 512)
264
+ tensors = []
265
+ scales = []
266
+
267
+ for img in images:
268
+ img_rgb = img.convert("RGB")
269
+ orig_w, orig_h = img_rgb.size
270
+ resized = img_rgb.resize((target_w, target_h))
271
+ tensors.append(to_tensor(resized))
272
+
273
+ sx = orig_w / float(target_w) if target_w > 0 else 1.0
274
+ sy = orig_h / float(target_h) if target_h > 0 else 1.0
275
+ scales.append((sx, sy))
276
+
277
+ # Batch inference
278
+ batch = torch.stack(tensors).to(self._device)
279
+ with torch.no_grad():
280
+ preds = self.model(batch)
281
+
282
+ # Parse results for each image
283
+ all_detections = []
284
+ for pred, (sx, sy) in zip(preds, scales):
285
+ det = pred.detach().cpu().numpy()
286
+ frame_dets = []
287
+ for row in det:
288
+ x1, y1, x2, y2, score, cls = row.tolist()
289
+ if float(score) < float(conf_threshold):
290
+ continue
291
+ frame_dets.append(
292
+ Detection(
293
+ label=str(int(cls)),
294
+ score=float(score),
295
+ bbox_xyxy=[float(x1) * sx, float(y1) * sy, float(x2) * sx, float(y2) * sy],
296
+ )
297
+ )
298
+ all_detections.append(frame_dets)
299
+
300
+ return all_detections
detection/tracker.py ADDED
@@ -0,0 +1,387 @@
1
+ """Simple IoU-based object tracker for video processing.
2
+
3
+ Provides basic multi-object tracking using detection-to-track association
4
+ based on IoU overlap and optional appearance features.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from typing import Dict, List, Optional, Sequence, Tuple
11
+
12
+ import numpy as np
13
+
14
+
15
+ @dataclass
16
+ class Track:
17
+ """A tracked object across frames."""
18
+
19
+ track_id: int
20
+ label: str
21
+
22
+ # History of bounding boxes [(frame_idx, bbox_xyxy, score)]
23
+ history: List[Tuple[int, List[float], float]] = field(default_factory=list)
24
+
25
+ # Current state
26
+ last_bbox: List[float] = field(default_factory=list)
27
+ last_score: float = 0.0
28
+ last_frame: int = -1
29
+
30
+ # Track status
31
+ age: int = 0 # Frames since first detection
32
+ hits: int = 0 # Total detection matches
33
+ time_since_update: int = 0 # Frames since last match
34
+
35
+ # Color for visualization (RGB)
36
+ color: Tuple[int, int, int] = (0, 255, 0)
37
+
38
+
39
+ class SimpleTracker:
40
+ """IoU-based multi-object tracker.
41
+
42
+ Associates detections to existing tracks using IoU overlap.
43
+ Creates new tracks for unmatched detections.
44
+ Removes tracks that haven't been updated for too long.
45
+ """
46
+
47
+ def __init__(
48
+ self,
49
+ iou_threshold: float = 0.3,
50
+ max_age: int = 30,
51
+ min_hits: int = 3,
52
+ label_match: bool = True,
53
+ ):
54
+ """
55
+ Args:
56
+ iou_threshold: Minimum IoU for detection-track association
57
+ max_age: Maximum frames a track survives without update
58
+ min_hits: Minimum detections before track is confirmed
59
+ label_match: Require label match for association
60
+ """
61
+ self.iou_threshold = iou_threshold
62
+ self.max_age = max_age
63
+ self.min_hits = min_hits
64
+ self.label_match = label_match
65
+
66
+ self._tracks: Dict[int, Track] = {}
67
+ self._next_id: int = 1
68
+ self._frame_idx: int = 0
69
+
70
+ # Color palette for tracks
71
+ self._colors = [
72
+ (255, 0, 0), (0, 255, 0), (0, 0, 255),
73
+ (255, 255, 0), (255, 0, 255), (0, 255, 255),
74
+ (255, 128, 0), (128, 0, 255), (0, 255, 128),
75
+ (255, 0, 128), (128, 255, 0), (0, 128, 255),
76
+ ]
77
+
78
+ def reset(self):
79
+ """Reset tracker state."""
80
+ self._tracks = {}
81
+ self._next_id = 1
82
+ self._frame_idx = 0
83
+
84
+ def update(
85
+ self,
86
+ detections: List[Dict],
87
+ frame_idx: Optional[int] = None,
88
+ ) -> List[Dict]:
89
+ """Update tracks with new detections.
90
+
91
+ Args:
92
+ detections: List of detection dicts with label, score, bbox_xyxy
93
+ frame_idx: Optional frame index (auto-incremented if None)
94
+
95
+ Returns:
96
+ List of track dicts with track_id, label, bbox_xyxy, score
97
+ """
98
+ if frame_idx is None:
99
+ self._frame_idx += 1
100
+ else:
101
+ self._frame_idx = frame_idx
102
+
103
+ # Age every track: one more frame lived, one more frame since last update
104
+ for track in self._tracks.values():
105
+ track.age += 1
+ track.time_since_update += 1
106
+
107
+ if not detections:
108
+ # No detections - just age tracks
109
+ self._remove_old_tracks()
110
+ return self._get_active_tracks()
111
+
112
+ # Build cost matrix (negative IoU for Hungarian matching)
113
+ track_ids = list(self._tracks.keys())
114
+ det_indices = list(range(len(detections)))
115
+
116
+ if track_ids and det_indices:
117
+ # Compute IoU matrix
118
+ iou_matrix = self._compute_iou_matrix(
119
+ [self._tracks[tid].last_bbox for tid in track_ids],
120
+ [d["bbox_xyxy"] for d in detections],
121
+ )
122
+
123
+ # Greedy matching (could use Hungarian for optimal)
124
+ matches, unmatched_tracks, unmatched_dets = self._greedy_match(
125
+ iou_matrix,
126
+ track_ids,
127
+ det_indices,
128
+ detections,
129
+ )
130
+ else:
131
+ matches = []
132
+ unmatched_tracks = track_ids
133
+ unmatched_dets = det_indices
134
+
135
+ # Update matched tracks
136
+ for track_id, det_idx in matches:
137
+ det = detections[det_idx]
138
+ track = self._tracks[track_id]
139
+ track.last_bbox = det["bbox_xyxy"]
140
+ track.last_score = det["score"]
141
+ track.last_frame = self._frame_idx
142
+ track.hits += 1
143
+ track.time_since_update = 0
144
+ track.history.append((self._frame_idx, det["bbox_xyxy"], det["score"]))
145
+
146
+ # Create new tracks for unmatched detections
147
+ for det_idx in unmatched_dets:
148
+ det = detections[det_idx]
149
+ color = self._colors[self._next_id % len(self._colors)]
150
+
151
+ track = Track(
152
+ track_id=self._next_id,
153
+ label=det["label"],
154
+ last_bbox=det["bbox_xyxy"],
155
+ last_score=det["score"],
156
+ last_frame=self._frame_idx,
157
+ age=1,
158
+ hits=1,
159
+ time_since_update=0,
160
+ color=color,
161
+ )
162
+ track.history.append((self._frame_idx, det["bbox_xyxy"], det["score"]))
163
+
164
+ self._tracks[self._next_id] = track
165
+ self._next_id += 1
166
+
167
+ # Remove old tracks
168
+ self._remove_old_tracks()
169
+
170
+ return self._get_active_tracks()
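A minimal usage sketch (assuming this file is importable as detection.tracker). With min_hits=1 a track is reported from its first frame, and the IoU match carries the same track_id across frames:

    from detection.tracker import SimpleTracker

    tracker = SimpleTracker(iou_threshold=0.3, max_age=30, min_hits=1)
    frame1 = [{"label": "car", "score": 0.9, "bbox_xyxy": [0.0, 0.0, 10.0, 10.0]}]
    frame2 = [{"label": "car", "score": 0.8, "bbox_xyxy": [1.0, 0.0, 11.0, 10.0]}]
    print(tracker.update(frame1))  # one confirmed track, track_id == 1
    print(tracker.update(frame2))  # same track_id: shifted box still overlaps (IoU ~ 0.82)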
171
+
172
+ def update_batch(
173
+ self,
174
+ detections_list: List[List[Dict]],
175
+ ) -> List[Dict]:
176
+ """Update tracks with a batch of frames.
177
+
178
+ Args:
179
+ detections_list: List of detection lists, one per frame
180
+
181
+ Returns:
182
+ List of all track dicts with full history
183
+ """
184
+ for dets in detections_list:
185
+ self.update(dets)
186
+
187
+ return self._get_all_tracks_with_history()
188
+
189
+ def _compute_iou_matrix(
190
+ self,
191
+ boxes_a: List[List[float]],
192
+ boxes_b: List[List[float]],
193
+ ) -> np.ndarray:
194
+ """Compute IoU matrix between two sets of boxes."""
195
+ n_a = len(boxes_a)
196
+ n_b = len(boxes_b)
197
+
198
+ if n_a == 0 or n_b == 0:
199
+ return np.zeros((n_a, n_b))
200
+
201
+ iou_matrix = np.zeros((n_a, n_b))
202
+
203
+ for i, box_a in enumerate(boxes_a):
204
+ for j, box_b in enumerate(boxes_b):
205
+ iou_matrix[i, j] = self._box_iou(box_a, box_b)
206
+
207
+ return iou_matrix
208
+
209
+ def _box_iou(self, a: List[float], b: List[float]) -> float:
210
+ """Compute IoU between two boxes."""
211
+ ax1, ay1, ax2, ay2 = a
212
+ bx1, by1, bx2, by2 = b
213
+
214
+ inter_x1 = max(ax1, bx1)
215
+ inter_y1 = max(ay1, by1)
216
+ inter_x2 = min(ax2, bx2)
217
+ inter_y2 = min(ay2, by2)
218
+
219
+ inter_w = max(0.0, inter_x2 - inter_x1)
220
+ inter_h = max(0.0, inter_y2 - inter_y1)
221
+ inter = inter_w * inter_h
222
+
223
+ area_a = max(0.0, ax2 - ax1) * max(0.0, ay2 - ay1)
224
+ area_b = max(0.0, bx2 - bx1) * max(0.0, by2 - by1)
225
+ union = area_a + area_b - inter
226
+
227
+ return float(inter / union) if union > 0 else 0.0
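The IoU arithmetic at work, on two 10x10 boxes overlapping by half:

    a = [0.0, 0.0, 10.0, 10.0]
    b = [5.0, 0.0, 15.0, 10.0]
    inter_w = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))  # 5.0
    inter_h = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))  # 10.0
    inter = inter_w * inter_h                              # 50.0
    union = 100.0 + 100.0 - inter                          # 150.0
    print(inter / union)                                   # 0.333...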
228
+
229
+ def _greedy_match(
230
+ self,
231
+ iou_matrix: np.ndarray,
232
+ track_ids: List[int],
233
+ det_indices: List[int],
234
+ detections: List[Dict],
235
+ ) -> Tuple[List[Tuple[int, int]], List[int], List[int]]:
236
+ """Greedy matching based on IoU."""
237
+ matches: List[Tuple[int, int]] = []
238
+ matched_tracks = set()
239
+ matched_dets = set()
240
+
241
+ # Sort by IoU (descending)
242
+ n_tracks, n_dets = iou_matrix.shape
243
+ pairs = []
244
+ for i in range(n_tracks):
245
+ for j in range(n_dets):
246
+ if iou_matrix[i, j] >= self.iou_threshold:
247
+ pairs.append((iou_matrix[i, j], i, j))
248
+
249
+ pairs.sort(reverse=True, key=lambda x: x[0])
250
+
251
+ for iou_val, track_idx, det_idx in pairs:
252
+ if track_idx in matched_tracks or det_idx in matched_dets:
253
+ continue
254
+
255
+ track_id = track_ids[track_idx]
256
+ det = detections[det_idx]
257
+
258
+ # Check label match if required
259
+ if self.label_match:
260
+ if self._tracks[track_id].label != det["label"]:
261
+ continue
262
+
263
+ matches.append((track_id, det_idx))
264
+ matched_tracks.add(track_idx)
265
+ matched_dets.add(det_idx)
266
+
267
+ unmatched_tracks = [track_ids[i] for i in range(n_tracks) if i not in matched_tracks]
268
+ unmatched_dets = [j for j in range(n_dets) if j not in matched_dets]
269
+
270
+ return matches, unmatched_tracks, unmatched_dets
271
+
272
+ def _remove_old_tracks(self):
273
+ """Remove tracks that haven't been updated recently."""
274
+ to_remove = []
275
+ for track_id, track in self._tracks.items():
276
+ if track.time_since_update > self.max_age:
277
+ to_remove.append(track_id)
278
+
279
+ for track_id in to_remove:
280
+ del self._tracks[track_id]
281
+
282
+ def _get_active_tracks(self) -> List[Dict]:
283
+ """Get currently active (confirmed) tracks."""
284
+ result = []
285
+ for track in self._tracks.values():
286
+ # Only return confirmed tracks
287
+ if track.hits >= self.min_hits:
288
+ result.append({
289
+ "track_id": track.track_id,
290
+ "label": track.label,
291
+ "bbox_xyxy": track.last_bbox,
292
+ "score": track.last_score,
293
+ "age": track.age,
294
+ "color": track.color,
295
+ })
296
+ return result
297
+
298
+ def _get_all_tracks_with_history(self) -> List[Dict]:
299
+ """Get all tracks with full history."""
300
+ result = []
301
+ for track in self._tracks.values():
302
+ if track.hits >= self.min_hits:
303
+ result.append({
304
+ "track_id": track.track_id,
305
+ "label": track.label,
306
+ "bbox_xyxy": track.last_bbox,
307
+ "score": track.last_score,
308
+ "age": track.age,
309
+ "hits": track.hits,
310
+ "color": track.color,
311
+ "history": [
312
+ {"frame": h[0], "bbox_xyxy": h[1], "score": h[2]}
313
+ for h in track.history
314
+ ],
315
+ })
316
+ return result
317
+
318
+
319
+ def draw_tracks(
320
+ image,
321
+ tracks: List[Dict],
322
+ show_id: bool = True,
323
+ show_trail: bool = False,
324
+ trail_length: int = 10,
325
+ ):
326
+ """Draw tracks on image.
327
+
328
+ Args:
329
+ image: PIL Image
330
+ tracks: List of track dicts
331
+ show_id: Show track ID
332
+ show_trail: Show movement trail
333
+ trail_length: Number of trail points
334
+
335
+ Returns:
336
+ Image with tracks drawn (PIL Image)
337
+ """
338
+ from PIL import Image, ImageDraw, ImageFont
339
+
340
+ img = image.copy()
341
+ draw = ImageDraw.Draw(img)
342
+
343
+ try:
344
+ font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 14)
345
+ except Exception:
346
+ font = ImageFont.load_default()
347
+
348
+ for track in tracks:
349
+ bbox = track["bbox_xyxy"]
350
+ color = track.get("color", (0, 255, 0))
351
+ track_id = track.get("track_id", 0)
352
+ label = track.get("label", "")
353
+ score = track.get("score", 0.0)
354
+
355
+ # Draw bounding box
356
+ x1, y1, x2, y2 = [int(c) for c in bbox]
357
+ draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
358
+
359
+ # Draw label with track ID
360
+ if show_id:
361
+ text = f"#{track_id} {label}"
362
+ else:
363
+ text = f"{label} {score:.2f}"
364
+
365
+ # Text background
366
+ text_bbox = draw.textbbox((x1, y1 - 20), text, font=font)
367
+ draw.rectangle(text_bbox, fill=color)
368
+ draw.text((x1, y1 - 20), text, fill=(255, 255, 255), font=font)
369
+
370
+ # Draw trail if history available
371
+ if show_trail and "history" in track:
372
+ history = track["history"][-trail_length:]
373
+ if len(history) > 1:
374
+ centers = []
375
+ for h in history:
376
+ hbbox = h["bbox_xyxy"]
377
+ cx = (hbbox[0] + hbbox[2]) / 2
378
+ cy = (hbbox[1] + hbbox[3]) / 2
379
+ centers.append((int(cx), int(cy)))
380
+
381
+ for i in range(len(centers) - 1):
382
+ # Fade trail
383
+ alpha = (i + 1) / len(centers)
384
+ trail_color = tuple(int(c * alpha) for c in color)
385
+ draw.line([centers[i], centers[i + 1]], fill=trail_color, width=2)
386
+
387
+ return img
detection/utils.py ADDED
@@ -0,0 +1,124 @@
1
+ """Detection utilities: IoU, matching, drawing, metrics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import asdict
6
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
7
+
8
+ import numpy as np
9
+ from PIL import Image, ImageDraw, ImageFont
10
+
11
+ from .base import Detection
12
+
13
+
14
+ def box_iou_xyxy(a: Sequence[float], b: Sequence[float]) -> float:
15
+ ax1, ay1, ax2, ay2 = a
16
+ bx1, by1, bx2, by2 = b
17
+
18
+ inter_x1 = max(ax1, bx1)
19
+ inter_y1 = max(ay1, by1)
20
+ inter_x2 = min(ax2, bx2)
21
+ inter_y2 = min(ay2, by2)
22
+
23
+ inter_w = max(0.0, inter_x2 - inter_x1)
24
+ inter_h = max(0.0, inter_y2 - inter_y1)
25
+ inter = inter_w * inter_h
26
+
27
+ area_a = max(0.0, ax2 - ax1) * max(0.0, ay2 - ay1)
28
+ area_b = max(0.0, bx2 - bx1) * max(0.0, by2 - by1)
29
+ union = area_a + area_b - inter
30
+
31
+ return float(inter / union) if union > 0 else 0.0
32
+
33
+
34
+ def greedy_match_detections(
35
+ before: List[Detection],
36
+ after: List[Detection],
37
+ iou_threshold: float = 0.5,
38
+ require_same_label: bool = True,
39
+ ) -> List[Tuple[int, int, float]]:
40
+ """Greedy match: for each `before` detection (sorted by score), pick best IoU `after`.
41
+
42
+ Returns list of (before_idx, after_idx, iou).
43
+ """
44
+
45
+ before_order = sorted(range(len(before)), key=lambda i: before[i].score, reverse=True)
46
+ used_after = set()
47
+ matches: List[Tuple[int, int, float]] = []
48
+
49
+ for bi in before_order:
50
+ best = None
51
+ best_iou = 0.0
52
+ for ai in range(len(after)):
53
+ if ai in used_after:
54
+ continue
55
+ if require_same_label and before[bi].label != after[ai].label:
56
+ continue
57
+ iou = box_iou_xyxy(before[bi].bbox_xyxy, after[ai].bbox_xyxy)
58
+ if iou >= iou_threshold and iou > best_iou:
59
+ best = ai
60
+ best_iou = iou
61
+ if best is not None:
62
+ used_after.add(best)
63
+ matches.append((bi, best, float(best_iou)))
64
+
65
+ return matches
66
+
67
+
68
+ def summarize_before_after(
69
+ before: List[Detection],
70
+ after: List[Detection],
71
+ iou_threshold: float = 0.5,
72
+ ) -> Dict:
73
+ matches = greedy_match_detections(before, after, iou_threshold=iou_threshold, require_same_label=True)
74
+
75
+ matched_before = {bi for (bi, _, _) in matches}
76
+ matched_after = {ai for (_, ai, _) in matches}
77
+
78
+ retention = (len(matches) / len(before)) if before else 1.0
79
+ emergence = (len(after) - len(matches))
80
+
81
+ ious = [iou for (_, _, iou) in matches]
82
+ score_before = [before[bi].score for (bi, _, _) in matches]
83
+ score_after = [after[ai].score for (_, ai, _) in matches]
84
+
85
+ return {
86
+ "num_before": len(before),
87
+ "num_after": len(after),
88
+ "matched": len(matches),
89
+ "retention": float(retention),
90
+ "new_after": int(emergence),
91
+ "mean_iou_matched": float(np.mean(ious)) if ious else None,
92
+ "mean_score_before_matched": float(np.mean(score_before)) if score_before else None,
93
+ "mean_score_after_matched": float(np.mean(score_after)) if score_after else None,
94
+ }
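Worked numbers for the summary fields: with 5 detections before compression, 4 after, and 3 matched, retention is 3/5 and one post-compression detection counts as new.

    num_before, num_after, matched = 5, 4, 3
    retention = matched / num_before  # 0.6
    new_after = num_after - matched   # 1
    print(retention, new_after)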
95
+
96
+
97
+ def detections_to_dict(dets: List[Detection]) -> List[Dict]:
98
+ return [asdict(d) for d in dets]
99
+
100
+
101
+ def draw_detections(
102
+ image: Image.Image,
103
+ detections: List[Detection],
104
+ max_dets: Optional[int] = 50,
105
+ color: Tuple[int, int, int] = (0, 255, 0),
106
+ ) -> Image.Image:
107
+ img = image.copy()
108
+ draw = ImageDraw.Draw(img)
109
+
110
+ try:
111
+ font = ImageFont.load_default()
112
+ except Exception:
113
+ font = None
114
+
115
+ dets = detections[: max_dets or len(detections)]
116
+ for d in dets:
117
+ x1, y1, x2, y2 = d.bbox_xyxy
118
+ draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
119
+ label = f"{d.label} {d.score:.2f}"
120
+ if font is not None:
121
+ draw.text((x1 + 2, y1 + 2), label, fill=color, font=font)
122
+ else:
123
+ draw.text((x1 + 2, y1 + 2), label, fill=color)
124
+ return img
detection/yolo.py ADDED
@@ -0,0 +1,98 @@
1
+ """YOLO detector via Ultralytics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Dict, List, Optional, Union
6
+
7
+ import numpy as np
8
+ from PIL import Image
9
+
10
+ from .base import BaseDetector, Detection
11
+ from model_cache import default_checkpoint_path, ensure_default_checkpoint_dirs
12
+
13
+
14
+ class YOLODetector(BaseDetector):
15
+ def __init__(self, device: str = "cuda", model_path: str = default_checkpoint_path("yolo26x.pt"), **kwargs):
16
+ super().__init__(device=device, **kwargs)
17
+ self.model_path = model_path
18
+ self._names: Dict[int, str] = {}
19
+
20
+ def load_model(self) -> None:
21
+ ensure_default_checkpoint_dirs()
22
+ from ultralytics import YOLO
23
+
24
+ self.model = YOLO(self.model_path)
25
+ # ultralytics takes a GPU index (0 for the first GPU) or the string 'cpu' as its device argument
26
+ self._device_arg = 0 if self.device.startswith("cuda") else "cpu"
27
+ self._names = dict(getattr(self.model, "names", {}) or {})
28
+
29
+ def get_available_classes(self) -> Union[List[str], Dict[str, int], None]:
30
+ if not self._names:
31
+ return None
32
+ # map name->id
33
+ return {name: int(idx) for idx, name in self._names.items()}
34
+
35
+ def detect(self, image: Image.Image, conf_threshold: float = 0.25, **kwargs) -> List[Detection]:
36
+ assert self.model is not None
37
+
38
+ img = np.asarray(image.convert("RGB"))
39
+ res = self.model.predict(source=img, conf=float(conf_threshold), device=self._device_arg, verbose=False)
40
+ if not res:
41
+ return []
42
+
43
+ r0 = res[0]
44
+ if getattr(r0, "boxes", None) is None:
45
+ return []
46
+
47
+ boxes = r0.boxes
48
+ xyxy = boxes.xyxy.detach().cpu().numpy()
49
+ conf = boxes.conf.detach().cpu().numpy()
50
+ cls = boxes.cls.detach().cpu().numpy().astype(int)
51
+
52
+ out: List[Detection] = []
53
+ for (b, s, c) in zip(xyxy, conf, cls):
54
+ label = self._names.get(int(c), str(int(c)))
55
+ out.append(Detection(label=label, score=float(s), bbox_xyxy=[float(x) for x in b.tolist()]))
56
+ return out
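A minimal usage sketch. Constructing the class directly and calling load_model() first mirrors what the base class presumably does lazily; the import path and the sample image path are assumptions based on this commit's layout.

    from PIL import Image
    from detection.yolo import YOLODetector  # import path assumed from this file's location

    detector = YOLODetector(device="cpu")
    detector.load_model()
    image = Image.open("data/images/car/0016cf15fa4d4e16.jpg")
    for d in detector.detect(image, conf_threshold=0.25):
        print(d.label, round(d.score, 2), d.bbox_xyxy)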
57
+
58
+ def detect_batch(self, images: List[Image.Image], conf_threshold: float = 0.25, **kwargs) -> List[List[Detection]]:
59
+ """Batch detection for video processing - much faster than frame-by-frame.
60
+
61
+ Args:
62
+ images: List of PIL Images
63
+ conf_threshold: Confidence threshold
64
+
65
+ Returns:
66
+ List of detection lists, one per image
67
+ """
68
+ assert self.model is not None
69
+
70
+ if not images:
71
+ return []
72
+
73
+ # Convert all images to numpy arrays
74
+ imgs = [np.asarray(img.convert("RGB")) for img in images]
75
+
76
+ # TRUE batch inference - YOLO processes all frames together
77
+ results = self.model.predict(source=imgs, conf=float(conf_threshold), device=self._device_arg, verbose=False)
78
+
79
+ # Parse results for each frame
80
+ all_detections = []
81
+ for result in results:
82
+ if not result or getattr(result, "boxes", None) is None:
83
+ all_detections.append([])
84
+ continue
85
+
86
+ boxes = result.boxes
87
+ xyxy = boxes.xyxy.detach().cpu().numpy()
88
+ conf = boxes.conf.detach().cpu().numpy()
89
+ cls = boxes.cls.detach().cpu().numpy().astype(int)
90
+
91
+ frame_dets = []
92
+ for (b, s, c) in zip(xyxy, conf, cls):
93
+ label = self._names.get(int(c), str(int(c)))
94
+ frame_dets.append(Detection(label=label, score=float(s), bbox_xyxy=[float(x) for x in b.tolist()]))
95
+
96
+ all_detections.append(frame_dets)
97
+
98
+ return all_detections
detection/yolo_world.py ADDED
@@ -0,0 +1,188 @@
1
+ """YOLO-World open-vocabulary object detection via Ultralytics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Optional, Sequence, Union
6
+
7
+ import numpy as np
8
+ from PIL import Image
9
+
10
+ from .base import BaseDetector, Detection
11
+ from model_cache import default_checkpoint_path, ensure_default_checkpoint_dirs
12
+
13
+
14
+ def _normalize_classes(classes: Optional[Union[str, Sequence[str]]]) -> Optional[List[str]]:
15
+ if classes is None:
16
+ return None
17
+ if isinstance(classes, str):
18
+ parts = [c.strip() for c in classes.split(",")]
19
+ parts = [c for c in parts if c]
20
+ return parts or None
21
+ out = [str(c).strip() for c in classes]
22
+ out = [c for c in out if c]
23
+ return out or None
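What _normalize_classes accepts and returns, at a glance (runnable once this module is importable; the import path is an assumption):

    from detection.yolo_world import _normalize_classes

    print(_normalize_classes("car, person"))   # ['car', 'person']
    print(_normalize_classes(["", " boat "]))  # ['boat']
    print(_normalize_classes(None))            # None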
24
+
25
+
26
+ class YOLOWorldDetector(BaseDetector):
27
+ """YOLO-World wrapper.
28
+
29
+ Usage:
30
+ detector = create_detector('yolo_world', device='cuda')
31
+ dets = detector(image, classes=['person', 'car'])
32
+
33
+ Notes:
34
+ - This is open-vocabulary; `get_available_classes()` returns None.
35
+ - Requires the Ultralytics `YOLOWorld` class (available in ultralytics>=8).
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ device: str = "cuda",
41
+ model_path: str = default_checkpoint_path("yolo26s-world.pt"),
42
+ **kwargs: Any,
43
+ ):
44
+ super().__init__(device=device, **kwargs)
45
+ self.model_path = model_path
46
+ self._device_arg = "cpu"
47
+ self._names: Dict[int, str] = {}
48
+
49
+ def load_model(self) -> None:
50
+ ensure_default_checkpoint_dirs()
51
+ try:
52
+ from ultralytics import YOLOWorld
53
+ except Exception as e:
54
+ raise ImportError("ultralytics is required for YOLO-World detection") from e
55
+
56
+ self.model = YOLOWorld(self.model_path)
57
+ self._device_arg = 0 if self.device.startswith("cuda") else "cpu"
58
+
59
+ def get_available_classes(self):
60
+ return None
61
+
62
+ def detect(self, image: Image.Image, conf_threshold: float = 0.25, **kwargs: Any) -> List[Detection]:
63
+ assert self.model is not None
64
+
65
+ classes = _normalize_classes(kwargs.get("classes") or kwargs.get("prompts") or kwargs.get("labels"))
66
+ if not classes:
67
+ raise ValueError("YOLOWorld requires `classes`/`prompts` (comma-separated string or list) for open-vocabulary detection")
68
+
69
+ # Tell the model which classes to look for.
70
+ # Note: set_classes triggers CLIP text encoding which can have device mismatch issues.
71
+ # We work around this by temporarily moving to CPU for set_classes, then back to GPU for inference.
72
+ if hasattr(self.model, "set_classes"):
73
+ import torch
74
+ try:
75
+ # Try setting classes directly first
76
+ self.model.set_classes(classes)
77
+ except RuntimeError as e:
78
+ if "device" in str(e).lower():
79
+ # Device mismatch - try moving model to CPU temporarily
80
+ if hasattr(self.model, 'model'):
81
+ original_device = next(self.model.model.parameters()).device
82
+ self.model.model.to('cpu')
83
+ self.model.set_classes(classes)
84
+ self.model.model.to(original_device)
85
+ else:
86
+ raise
87
+
88
+ img = np.asarray(image.convert("RGB"))
89
+ res = self.model.predict(source=img, conf=float(conf_threshold), device=self._device_arg, verbose=False)
90
+ if not res:
91
+ return []
92
+
93
+ r0 = res[0]
94
+ # Handle names attribute - can be dict, list, or other types
95
+ raw_names = getattr(r0, "names", {})
96
+ if isinstance(raw_names, dict):
97
+ names = raw_names
98
+ elif isinstance(raw_names, (list, tuple)):
99
+ names = {i: str(n) for i, n in enumerate(raw_names)}
100
+ else:
101
+ names = {}
102
+
103
+ if getattr(r0, "boxes", None) is None:
104
+ return []
105
+
106
+ boxes = r0.boxes
107
+ xyxy = boxes.xyxy.detach().cpu().numpy()
108
+ conf = boxes.conf.detach().cpu().numpy()
109
+ cls = boxes.cls.detach().cpu().numpy().astype(int)
110
+
111
+ out: List[Detection] = []
112
+ for (b, s, c) in zip(xyxy, conf, cls):
113
+ label = names.get(int(c))
114
+ if not label:
115
+ # fallback: index into the user-provided classes if the mapping is missing
116
+ label = classes[int(c)] if 0 <= int(c) < len(classes) else str(int(c))
117
+ out.append(Detection(label=str(label), score=float(s), bbox_xyxy=[float(x) for x in b.tolist()]))
118
+ return out
119
+
120
+ def detect_batch(self, images: List[Image.Image], conf_threshold: float = 0.25, **kwargs: Any) -> List[List[Detection]]:
121
+ """Batch detection for video processing - much faster than frame-by-frame.
122
+
123
+ Args:
124
+ images: List of PIL Images
125
+ conf_threshold: Confidence threshold
126
+ **kwargs: Should include 'classes'/'prompts' for open-vocabulary detection
127
+
128
+ Returns:
129
+ List of detection lists, one per image
130
+ """
131
+ assert self.model is not None
132
+
133
+ if not images:
134
+ return []
135
+
136
+ classes = _normalize_classes(kwargs.get("classes") or kwargs.get("prompts") or kwargs.get("labels"))
137
+ if not classes:
138
+ raise ValueError("YOLOWorld requires `classes`/`prompts` (comma-separated string or list) for open-vocabulary detection")
139
+
140
+ # Set classes for the model
141
+ if hasattr(self.model, "set_classes"):
142
+ import torch
143
+ try:
144
+ self.model.set_classes(classes)
145
+ except RuntimeError as e:
146
+ if "device" in str(e).lower():
147
+ if hasattr(self.model, 'model'):
148
+ original_device = next(self.model.model.parameters()).device
149
+ self.model.model.to('cpu')
150
+ self.model.set_classes(classes)
151
+ self.model.model.to(original_device)
152
+ else:
153
+ raise
154
+
155
+ # Convert all images to numpy arrays
156
+ imgs = [np.asarray(img.convert("RGB")) for img in images]
157
+
158
+ # TRUE batch inference
159
+ results = self.model.predict(source=imgs, conf=float(conf_threshold), device=self._device_arg, verbose=False)
160
+
161
+ # Parse results for each frame
162
+ all_detections = []
163
+ for result in results:
164
+ # Handle names attribute
165
+ raw_names = getattr(result, "names", {})
166
+ if isinstance(raw_names, dict):
167
+ names = raw_names
168
+ elif isinstance(raw_names, (list, tuple)):
169
+ names = {i: str(n) for i, n in enumerate(raw_names)}
170
+ else:
171
+ names = {}
172
+
173
+ frame_dets = []
174
+ if getattr(result, "boxes", None) is not None:
175
+ boxes = result.boxes
176
+ xyxy = boxes.xyxy.detach().cpu().numpy()
177
+ conf = boxes.conf.detach().cpu().numpy()
178
+ cls = boxes.cls.detach().cpu().numpy().astype(int)
179
+
180
+ for (b, s, c) in zip(xyxy, conf, cls):
181
+ label = names.get(int(c))
182
+ if not label:
183
+ label = classes[int(c)] if 0 <= int(c) < len(classes) else str(int(c))
184
+ frame_dets.append(Detection(label=str(label), score=float(s), bbox_xyxy=[float(x) for x in b.tolist()]))
185
+
186
+ all_detections.append(frame_dets)
187
+
188
+ return all_detections
examples.sh ADDED
@@ -0,0 +1,272 @@
1
+ #!/bin/bash
2
+ # Example commands for ROI-VAE compression
3
+ # Covers: Car, Building, Person, Boat
4
+
5
+ # Make sure you're in the roi-vae directory
6
+ cd "$(dirname "$0")"
7
+
8
+ # Create results directory if it doesn't exist
9
+ mkdir -p results
10
+
11
+ echo "ROI-VAE Compression Examples"
12
+ echo "============================="
13
+ echo
14
+
15
+ # ---------------------------------------------------------
16
+ # 1. CAR
17
+ # ---------------------------------------------------------
18
+ echo "1. CAR (YOLO)"
19
+ echo "-------------"
20
+
21
+ # 1a. Only Image
22
+ echo "a) Compressing (Image Only)..."
23
+ python roi_compressor.py \
24
+ --input data/images/car/0016cf15fa4d4e16.jpg \
25
+ --output results/car_compressed.jpg \
26
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
27
+ --sigma 0.3 \
28
+ --seg-method yolo \
29
+ --seg-classes car
30
+
31
+ # 1b. With Comparison
32
+ echo "b) Compressing (With Comparison)..."
33
+ python roi_compressor.py \
34
+ --input data/images/car/0016cf15fa4d4e16.jpg \
35
+ --output results/car_comparison.jpg \
36
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
37
+ --sigma 0.3 \
38
+ --seg-method yolo \
39
+ --seg-classes car \
40
+ --highlight
41
+
42
+ echo
43
+ echo "---------------------------------------------------------"
44
+ echo
45
+
46
+ # ---------------------------------------------------------
47
+ # 2. BUILDING
48
+ # ---------------------------------------------------------
49
+ echo "2. BUILDING (SegFormer)"
50
+ echo "-----------------------"
51
+
52
+ # 2a. Only Image
53
+ echo "a) Compressing (Image Only)..."
54
+ python roi_compressor.py \
55
+ --input data/images/building/000571767ec7a593.jpg \
56
+ --output results/building_compressed.jpg \
57
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
58
+ --sigma 0.3 \
59
+ --seg-classes building
60
+
61
+ # 2b. With Comparison
62
+ echo "b) Compressing (With Comparison)..."
63
+ python roi_compressor.py \
64
+ --input data/images/building/000571767ec7a593.jpg \
65
+ --output results/building_comparison.jpg \
66
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
67
+ --sigma 0.3 \
68
+ --seg-classes building \
69
+ --highlight
70
+
71
+ echo
72
+ echo "---------------------------------------------------------"
73
+ echo
74
+
75
+ # ---------------------------------------------------------
76
+ # 3. PERSON
77
+ # ---------------------------------------------------------
78
+ echo "3. PERSON (YOLO)"
79
+ echo "----------------"
80
+
81
+ # 3a. Only Image
82
+ echo "a) Compressing (Image Only)..."
83
+ python roi_compressor.py \
84
+ --input data/images/person/kodim04.png \
85
+ --output results/person_compressed.jpg \
86
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
87
+ --sigma 0.3 \
88
+ --seg-method yolo \
89
+ --seg-classes person
90
+
91
+ # 3b. With Comparison
92
+ echo "b) Compressing (With Comparison)..."
93
+ python roi_compressor.py \
94
+ --input data/images/person/kodim04.png \
95
+ --output results/person_comparison.jpg \
96
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
97
+ --sigma 0.3 \
98
+ --seg-method yolo \
99
+ --seg-classes person \
100
+ --highlight
101
+
102
+ echo
103
+ echo "---------------------------------------------------------"
104
+ echo
105
+
106
+ # ---------------------------------------------------------
107
+ # 4. BOAT
108
+ # ---------------------------------------------------------
109
+ echo "4. BOAT (YOLO)"
110
+ echo "--------------"
111
+
112
+ # 4a. Only Image
113
+ echo "a) Compressing (Image Only)..."
114
+ python roi_compressor.py \
115
+ --input data/images/boat/kodim06.png \
116
+ --output results/boat_compressed.jpg \
117
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
118
+ --sigma 0.3 \
119
+ --seg-method yolo \
120
+ --seg-classes boat
121
+
122
+ # 4b. With Comparison
123
+ echo "b) Compressing (With Comparison)..."
124
+ python roi_compressor.py \
125
+ --input data/images/boat/kodim06.png \
126
+ --output results/boat_comparison.jpg \
127
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
128
+ --sigma 0.3 \
129
+ --seg-method yolo \
130
+ --seg-classes boat \
131
+ --highlight
132
+
133
+ echo
134
+ echo "---------------------------------------------------------"
135
+ echo
136
+
137
+ # ---------------------------------------------------------
138
+ # 5. PARAMETER COMPARISON
139
+ # ---------------------------------------------------------
140
+ echo "5. PARAMETER COMPARISON"
141
+ echo "-----------------------"
142
+
143
+ # 5a. Sigma Comparison (Background Quality)
144
+ echo "a) Sigma Comparison (Background Quality)..."
145
+ # Low Sigma (High Compression)
146
+ echo " - Low Sigma (0.1)"
147
+ python roi_compressor.py \
148
+ --input data/images/car/0016cf15fa4d4e16.jpg \
149
+ --output results/sigma_low.jpg \
150
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
151
+ --sigma 0.1 \
152
+ --seg-method yolo \
153
+ --seg-classes car \
154
+ --highlight
155
+
156
+ # High Sigma (Low Compression)
157
+ echo " - High Sigma (0.9)"
158
+ python roi_compressor.py \
159
+ --input data/images/car/0016cf15fa4d4e16.jpg \
160
+ --output results/sigma_high.jpg \
161
+ --checkpoint checkpoints/tic_lambda_0.0483.pth.tar \
162
+ --sigma 0.9 \
163
+ --seg-method yolo \
164
+ --seg-classes car \
165
+ --highlight
166
+
167
+ # 5b. Lambda Comparison (Rate-Distortion)
168
+ echo "b) Lambda Comparison (Rate-Distortion)..."
169
+ # Low Lambda
170
+ echo " - Low Lambda (0.013)"
171
+ python roi_compressor.py \
172
+ --input data/images/car/0016cf15fa4d4e16.jpg \
173
+ --output results/lambda_low.jpg \
174
+ --checkpoint checkpoints/tic_lambda_0.013.pth.tar \
175
+ --sigma 0.3 \
179
+ --seg-method yolo \
180
+ --seg-classes car \
181
+ --highlight
182
+
183
+ # High Lambda
184
+ echo " - High Lambda (0.0932)"
185
+ python roi_compressor.py \
186
+ --input data/images/car/0016cf15fa4d4e16.jpg \
187
+ --output results/lambda_high.jpg \
188
+ --checkpoint checkpoints/tic_lambda_0.0932.pth.tar \
189
+ --sigma 0.3 \
191
+ --seg-method yolo \
192
+ --seg-classes car \
193
+ --highlight
194
+
195
+ echo
196
+ echo "---------------------------------------------------------"
197
+ echo
198
+
199
+ # -------------------------------------------------
200
+ # 6. Standalone Segmentation (Mask R-CNN)
201
+ # -------------------------------------------------
202
+ echo "--- Running Standalone Segmentation (Mask R-CNN) ---"
203
+ python roi_segmenter.py \
204
+ --input images/car/0016cf15fa4d4e16.jpg \
205
+ --output results/mask_rcnn.png \
206
+ --method maskrcnn \
207
+ --classes car \
208
+ --visualize
209
+
210
+ # -------------------------------------------------
211
+ # 7. Video Compression (Static Mode)
212
+ # -------------------------------------------------
213
+ echo "--- Running Video Compression (Static Mode) ---"
214
+ python roi_compressor.py \
215
+ --input data/videos/traffic.mp4 \
216
+ --output results/video_static.mp4 \
217
+ --quality-level 4 \
218
+ --sigma 0.4 \
219
+ --seg-method yolo \
220
+ --seg-classes car person \
221
+ --video-mode static \
222
+ --output-fps 10
223
+
224
+ # -------------------------------------------------
225
+ # 8. Video Compression (Dynamic Mode)
226
+ # -------------------------------------------------
227
+ echo "--- Running Video Compression (Dynamic Mode) ---"
228
+ python roi_compressor.py \
229
+ --input data/videos/traffic.mp4 \
230
+ --output results/video_dynamic.mp4 \
231
+ --quality-level 4 \
232
+ --seg-method yolo \
233
+ --seg-classes car person \
234
+ --video-mode dynamic \
235
+ --target-bandwidth-kbps 800 \
236
+ --min-fps 5 \
237
+ --max-fps 20
238
+
239
+ # -------------------------------------------------
240
+ # 9. Video Detection Evaluation (Static Mode)
241
+ # -------------------------------------------------
242
+ echo "--- Running Video Detection Evaluation (Static Mode) ---"
243
+ python roi_detection_eval.py \
244
+ --before data/videos/traffic.mp4 \
245
+ --video-mode static \
246
+ --sigma 0.3 \
247
+ --output-fps 10 \
248
+ --quality-level 4 \
249
+ --seg-method yolo \
250
+ --seg-classes car person \
251
+ --detectors yolo \
252
+ --max-video-frames 30 \
253
+ --video-sample-interval 3 \
254
+ --save-after results/video_eval_compressed.mp4 \
255
+ --out results/video_detection_eval.json \
256
+ --viz-dir results/video_detection_viz
257
+
258
+ # -------------------------------------------------
259
+ # 10. Video Detection Evaluation (Comparing Two Videos)
260
+ # -------------------------------------------------
261
+ echo "--- Running Video Detection Evaluation (Compare Two Videos) ---"
262
+ python roi_detection_eval.py \
263
+ --before data/videos/traffic.mp4 \
264
+ --after results/video_static.mp4 \
265
+ --detectors yolo \
266
+ --max-video-frames 30 \
267
+ --video-sample-interval 3 \
268
+ --out results/video_compare_eval.json
269
+
270
+ echo
271
+ echo "============================="
272
+ echo "All examples complete! Check results/ directory"
model_cache.py ADDED
@@ -0,0 +1,62 @@
1
+ """Centralized defaults for model checkpoint/cache locations.
2
+
3
+ Goal: keep all auto-downloaded model artifacts inside this repo's `checkpoints/`
4
+ directory by default (instead of user-wide cache dirs or repo root).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ from pathlib import Path
11
+
12
+
13
+ PROJECT_ROOT = Path(__file__).resolve().parent
14
+ CHECKPOINTS_DIR = PROJECT_ROOT / "checkpoints"
15
+
16
+ # Hugging Face will create subfolders like `hub/`, `datasets/`, etc under HF_HOME.
17
+ HF_HOME_DIR = CHECKPOINTS_DIR / "hf"
18
+
19
+ # Torchvision uses torch.hub.load_state_dict_from_url which respects TORCH_HOME.
20
+ TORCH_HOME_DIR = CHECKPOINTS_DIR / "torch"
21
+
22
+
23
+ def ensure_default_checkpoint_dirs() -> None:
24
+ """Ensure checkpoint dirs exist and set cache-related env vars.
25
+
26
+ This is intentionally a best-effort helper. If the user has explicitly set
27
+ env vars already, we do not override them.
28
+ """
29
+
30
+ CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)
31
+ HF_HOME_DIR.mkdir(parents=True, exist_ok=True)
32
+ TORCH_HOME_DIR.mkdir(parents=True, exist_ok=True)
33
+
34
+ # Hugging Face
35
+ os.environ.setdefault("HF_HOME", str(HF_HOME_DIR))
36
+ # Compatibility env vars across transformers/huggingface-hub versions.
37
+ os.environ.setdefault("TRANSFORMERS_CACHE", str(HF_HOME_DIR / "hub"))
38
+ os.environ.setdefault("HUGGINGFACE_HUB_CACHE", str(HF_HOME_DIR / "hub"))
39
+
40
+ # Torch / torchvision
41
+ os.environ.setdefault("TORCH_HOME", str(TORCH_HOME_DIR))
42
+
43
+
44
+ def hf_cache_dir() -> Path:
45
+ ensure_default_checkpoint_dirs()
46
+ return HF_HOME_DIR
47
+
48
+
49
+ def torch_home_dir() -> Path:
50
+ ensure_default_checkpoint_dirs()
51
+ return TORCH_HOME_DIR
52
+
53
+
54
+ def checkpoints_dir() -> Path:
55
+ ensure_default_checkpoint_dirs()
56
+ return CHECKPOINTS_DIR
57
+
58
+
59
+ def default_checkpoint_path(filename: str) -> str:
60
+ """Return an absolute path under `checkpoints/` for a given filename."""
61
+
62
+ return str(checkpoints_dir() / filename)
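A quick sketch of the intended effect: after calling the helper, Hugging Face and torch caches resolve under checkpoints/ unless the user already set the corresponding env vars.

    import os
    from model_cache import ensure_default_checkpoint_dirs, default_checkpoint_path

    ensure_default_checkpoint_dirs()
    print(os.environ["HF_HOME"])                  # <repo>/checkpoints/hf (unless pre-set)
    print(os.environ["TORCH_HOME"])               # <repo>/checkpoints/torch (unless pre-set)
    print(default_checkpoint_path("yolo26x.pt"))  # <repo>/checkpoints/yolo26x.pt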
requirements.txt ADDED
@@ -0,0 +1,26 @@
1
+ # Core dependencies
2
+ torch>=2.0.0
3
+ torchvision>=0.15.0
4
+ numpy>=1.24.0
5
+ pillow>=9.5.0
6
+ matplotlib>=3.7.0
7
+ # Prefer headless OpenCV for servers / Hugging Face Spaces.
8
+ opencv-python-headless>=4.7.0
9
+
10
+ # Model dependencies
11
+ compressai>=1.2.4
12
+ timm>=0.9.0
13
+
14
+ # Segmentation
15
+ transformers>=4.36.0
16
+ ultralytics>=8.0.0
17
+
18
+ # Detection (optional)
19
+ effdet>=0.4.1
20
+
21
+ # Demo app (Hugging Face Spaces)
22
+ gradio>=6.2.0,<7
23
+ openai>=1.0.0
24
+ gradio_client
25
+ scipy
26
+ tqdm
roi_compressor.py ADDED
@@ -0,0 +1,183 @@
1
+ """
2
+ CLI for ROI-based image/video compression using modular compression framework.
3
+ """
+
+ import argparse
+ import os
+ import sys
+ from pathlib import Path
+
+ import numpy as np
+ from PIL import Image
+ import torch
+
+ from segmentation import create_segmenter
+ from vae import load_checkpoint, compress_image
+ from vae.visualization import create_comparison_grid
+ from video import VideoProcessor, CompressionSettings
+ from video.video_processor import frames_to_video_bytes
+
+
+ # Command-line interface
+ def main():
+     parser = argparse.ArgumentParser(description="ROI-based Image/Video Compressor.")
+     # I/O
+     parser.add_argument("--input", required=True, help="Path to input image or video file.")
+     parser.add_argument("--output", required=True, help="Path to save compressed output file.")
+     parser.add_argument("--checkpoint", default="checkpoints/tic_lambda_0.0483.pth.tar", help="Path to VAE model checkpoint.")
+     parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="Device to run on.")
+
+     # General Compression Settings
+     parser.add_argument("--quality-level", type=int, default=4, choices=range(7), help="Base quality level (0-6, higher is better). Affects VAE model selection.")
+     parser.add_argument("--sigma", type=float, default=0.3, help="Background quality factor (0.01-1.0). Lower means more background compression.")
+
+     # Segmentation Settings
+     parser.add_argument("--seg-method", default="yolo", help="Segmentation method to use.")
+     parser.add_argument("--seg-classes", nargs="+", required=True, help="Classes to segment as ROI.")
+     parser.add_argument("--seg-model", help="Path to a specific segmentation model checkpoint (optional).")
+
+     # Video-Specific Settings
+     parser.add_argument("--video-mode", choices=['static', 'dynamic'], default='static', help="Video compression mode: 'static' for fixed settings, 'dynamic' for adaptive.")
+     parser.add_argument("--target-bandwidth-kbps", type=int, default=1000, help="[Dynamic Mode] Target bandwidth in kbps.")
+     parser.add_argument("--output-fps", type=float, default=15.0, help="[Static Mode] Output framerate.")
+     parser.add_argument("--min-fps", type=float, default=5.0, help="[Dynamic Mode] Minimum framerate.")
+     parser.add_argument("--max-fps", type=float, default=30.0, help="[Dynamic Mode] Maximum framerate.")
+     parser.add_argument("--chunk-duration-sec", type=float, default=1.0, help="[Dynamic Mode] Duration of video chunks for analysis.")
+
+     # Detection/Tracking Settings (for video)
+     parser.add_argument("--detection-method", default="yolo", help="Object detection method for video.")
+     parser.add_argument("--enable-tracking", action="store_true", help="Enable object tracking in video.")
+
+     # Visualization
+     parser.add_argument("--highlight", action="store_true", help="Create a comparison grid image (for image input only).")
+     parser.add_argument("--viz-dir", help="Directory to save visualization artifacts (e.g., masks).")
+
+     args = parser.parse_args()
+
+     # --- Input Type Check ---
+     input_path = args.input.lower()
+     is_video = any(input_path.endswith(ext) for ext in ['.mp4', '.avi', '.mov', '.mkv'])
+
+     if is_video:
+         print("Processing video input...")
+         process_video(args)
+     else:
+         print("Processing image input...")
+         process_image(args)
+
+
+ def process_image(args):
+     """Compresses a single image."""
+     print(f"Loading VAE model from {args.checkpoint}...")
+     model = load_checkpoint(args.checkpoint, device=args.device)
+     model.eval()
+
+     print(f"Loading segmenter '{args.seg_method}'...")
+     seg_kwargs = dict(device=args.device)
+     if args.seg_model:
+         seg_kwargs['model_path'] = args.seg_model
+     segmenter = create_segmenter(args.seg_method, **seg_kwargs)
+
+     print(f"Loading image from {args.input}...")
+     image = Image.open(args.input).convert("RGB")
+
+     print(f"Segmenting image for classes: {args.seg_classes}...")
+     mask = segmenter(image, target_classes=args.seg_classes)
+
+     if args.viz_dir:
+         if not os.path.exists(args.viz_dir):
+             os.makedirs(args.viz_dir)
+         mask_path = os.path.join(args.viz_dir, "mask.png")
+         Image.fromarray((mask * 255).astype(np.uint8)).save(mask_path)
+         print(f"Saved segmentation mask to {mask_path}")
+
+     print(f"Compressing image with sigma={args.sigma}...")
+     result = compress_image(
+         image,
+         mask,
+         model,
+         sigma=args.sigma,
+         device=args.device
+     )
+     compressed_img = result['compressed']
+     bpp = result['bpp']
+
+     print(f"Saving compressed image to {args.output} (BPP: {bpp:.4f})")
+     compressed_img.save(args.output)
+
+     if args.highlight:
+         print("Creating comparison grid...")
+         lambda_val = float(os.path.basename(args.checkpoint).split('_')[-1].replace('.pth.tar', ''))
+         grid = create_comparison_grid(image, compressed_img, mask, bpp, args.sigma, lambda_val)
+         # Build the grid path from the output stem (avoids str.replace hitting
+         # the extension string elsewhere in the path)
+         grid_path = os.path.splitext(args.output)[0] + "_comparison.jpg"
+         grid.save(grid_path)
+         print(f"Saved comparison grid to {grid_path}")
+
+
+ def process_video(args):
+     """Compresses a video using static or dynamic settings."""
+     processor = VideoProcessor(device=args.device)
+     print("Loading models for video processing...")
+     processor.load_models(
+         quality_level=args.quality_level,
+         segmentation_method=args.seg_method,
+         detection_method=args.detection_method,
+         enable_tracking=args.enable_tracking,
+     )
+
+     # Simple progress callback
+     def progress_callback(current, total, message):
+         percent = int(100 * current / max(1, total))
+         print(f"[{percent:3d}%] {message}")
+
+     if args.video_mode == 'static':
+         settings = CompressionSettings(
+             mode='static',
+             quality_level=args.quality_level,
+             sigma=args.sigma,
+             output_fps=args.output_fps,
+             target_classes=args.seg_classes,
+         )
+         print(f"Starting STATIC video compression with FPS={settings.output_fps}, Sigma={settings.sigma}...")
+         print("Using offline batch processing (GPU memory optimized)...")
+         chunks = processor.process_static_offline(args.input, settings, progress_callback=progress_callback)
+     else:  # dynamic
+         settings = CompressionSettings(
+             mode='dynamic',
+             target_bandwidth_kbps=args.target_bandwidth_kbps,
+             min_fps=args.min_fps,
+             max_fps=args.max_fps,
+             chunk_duration_sec=args.chunk_duration_sec,
+             target_classes=args.seg_classes,
+             quality_level=args.quality_level,
+         )
+         print(f"Starting DYNAMIC video compression with Target Bandwidth={settings.target_bandwidth_kbps} kbps...")
+         print("Using offline batch processing (GPU memory optimized)...")
+         chunks = processor.process_dynamic_offline(args.input, settings, progress_callback=progress_callback)
+
+     if not chunks:
+         print("No frames were processed. Exiting.")
+         return
+
+     # Collect all frames from chunks
+     all_frames = []
+     for chunk in chunks:
+         all_frames.extend(chunk.frames)
+
+     # Determine the final video's FPS
+     if args.video_mode == 'static':
+         final_fps = args.output_fps
+     else:
+         # For dynamic, use weighted average FPS from chunks
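+         # e.g. (illustrative numbers): a 30-frame chunk at 15 fps (2 s) plus a
+         # 30-frame chunk at 5 fps (6 s) gives 60 frames / 8 s = 7.5 fps overall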
+         total_frames = sum(len(c.frames) for c in chunks)
+         total_duration = sum(len(c.frames) / c.fps for c in chunks)
+         final_fps = total_frames / total_duration if total_duration > 0 else args.max_fps
+
+     print(f"\nRe-encoding {len(all_frames)} frames into final video at ~{final_fps:.2f} FPS...")
+     video_bytes = frames_to_video_bytes(all_frames, fps=final_fps)
+
+     print(f"Saving compressed video to {args.output}...")
+     with open(args.output, "wb") as f:
+         f.write(video_bytes)
+     print("Done.")
+
+
+ if __name__ == "__main__":
+     main()
roi_detection_eval.py ADDED
@@ -0,0 +1,639 @@
+ """CLI: evaluate object detection before vs after ROI compression.
+
+ Supports both images and videos.
+
+ Image modes:
+ 1) Compare two images:
+    python roi_detection_eval.py --before img.jpg --after img_compressed.jpg --detectors yolo fasterrcnn
+
+ 2) Create the "after" image via ROI compression, then evaluate:
+    python roi_detection_eval.py --before img.jpg \
+        --checkpoint checkpoints/tic_lambda_0.0483.pth.tar --sigma 0.3 \
+        --seg-method yolo --seg-classes car person \
+        --detectors yolo fasterrcnn
+
+ Video modes:
+ 3) Compare two videos:
+    python roi_detection_eval.py --before video.mp4 --after video_compressed.mp4 --detectors yolo
+
+ 4) Create the "after" video via ROI compression, then evaluate:
+    python roi_detection_eval.py --before video.mp4 \
+        --video-mode static --sigma 0.3 --output-fps 10 \
+        --seg-method yolo --seg-classes car person \
+        --detectors yolo
+
+ Outputs JSON summary + optional visualization images/videos.
+ """
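+ # Shape of the JSON summary this script writes (abridged sketch; keys taken
+ # from the results dicts built below, values illustrative):
+ # {
+ #   "type": "image",
+ #   "det_conf": 0.25,
+ #   "iou_threshold": 0.5,
+ #   "detectors": {
+ #     "yolo": {
+ #       "summary": {...},              # from summarize_before_after()
+ #       "before_detections": [...],
+ #       "after_detections": [...]
+ #     }
+ #   }
+ # }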
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import tempfile
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple
+
+ import numpy as np
+ from PIL import Image
+
+ from detection import create_detector, get_available_detectors
+ from detection.utils import (
+     detections_to_dict,
+     draw_detections,
+     summarize_before_after,
+ )
+
+
+ VIDEO_EXTENSIONS = {'.mp4', '.avi', '.mov', '.mkv', '.webm'}
+
+
+ def _is_video(path: str) -> bool:
+     """Check if file is a video based on extension."""
+     return Path(path).suffix.lower() in VIDEO_EXTENSIONS
+
+
+ def _load_image(path: str) -> Image.Image:
+     return Image.open(path).convert("RGB")
+
+
+ def _normalize_open_vocab_classes(raw: Optional[str]) -> Optional[str]:
+     if raw is None:
+         return None
+     s = str(raw).strip()
+     return s or None
+
+
+ def _maybe_make_after_image(
+     before_img: Image.Image,
+     args,
+ ) -> Image.Image:
+     """Create after image via ROI compression if --after not provided."""
+     if args.after:
+         return _load_image(args.after)
+
+     if not args.checkpoint:
+         raise SystemExit("Provide either --after or --checkpoint (to generate after via ROI compression).")
+
+     from vae import load_checkpoint, compress_image
+     from segmentation import create_segmenter
+     from segmentation.utils import load_mask
+
+     if args.mask:
+         mask = load_mask(args.mask)
+     else:
+         if not args.seg_method:
+             raise SystemExit("When not using --mask, provide --seg-method.")
+
+         seg_kwargs = {}
+         if args.seg_method == "yolo":
+             seg_kwargs["conf_threshold"] = args.seg_conf
+         if args.seg_method == "mask2former":
+             seg_kwargs["model_type"] = args.seg_model_type
+
+         segmenter = create_segmenter(args.seg_method, device=args.device, **seg_kwargs)
+
+         if args.seg_method == "sam3":
+             if not args.seg_prompt:
+                 raise SystemExit("For --seg-method sam3, provide --seg-prompt.")
+             mask = segmenter(before_img, target_classes=[args.seg_prompt])
+         else:
+             if not args.seg_classes:
+                 raise SystemExit("Provide --seg-classes (or use --seg-method sam3 + --seg-prompt).")
+             mask = segmenter(before_img, target_classes=args.seg_classes)
+
+     model = load_checkpoint(args.checkpoint, N=args.N, M=args.M, device=args.device)
+     out = compress_image(before_img, mask, model, sigma=float(args.sigma), device=args.device)
+     after_img = out["compressed"]
+
+     if args.save_after:
+         Path(args.save_after).parent.mkdir(parents=True, exist_ok=True)
+         after_img.save(args.save_after)
+
+     return after_img
+
+
+ def _extract_video_frames(
+     video_path: str,
+     max_frames: Optional[int] = None,
+     sample_interval: int = 1,
+ ) -> Tuple[List[Image.Image], float]:
+     """Extract frames from a video file.
+
+     Args:
+         video_path: Path to video file
+         max_frames: Max frames to extract (None = all)
+         sample_interval: Extract every Nth frame
+
+     Returns:
+         (frames, fps)
+     """
+     try:
+         import cv2
+     except ImportError:
+         raise ImportError("opencv-python required for video processing")
+
+     cap = cv2.VideoCapture(video_path)
+     if not cap.isOpened():
+         raise ValueError(f"Cannot open video: {video_path}")
+
+     fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
+     frames = []
+     frame_idx = 0
+
+     while True:
+         ret, frame = cap.read()
+         if not ret:
+             break
+
+         if frame_idx % sample_interval == 0:
+             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+             frames.append(Image.fromarray(frame_rgb))
+
+         frame_idx += 1
+
+         if max_frames is not None and len(frames) >= max_frames:
+             break
+
+     cap.release()
+     return frames, fps
+
+
+ def _maybe_make_after_video(
+     before_path: str,
+     args,
+ ) -> Tuple[str, List[Image.Image], List[Image.Image]]:
+     """Create after video via ROI compression if --after not provided.
+
+     Returns:
+         (after_video_path, before_frames, after_frames)
+     """
+     from video import VideoProcessor, CompressionSettings
+     from video.video_processor import frames_to_video_bytes
+
+     # Extract frames from before video
+     sample_interval = max(1, int(args.video_sample_interval))
+     before_frames, fps = _extract_video_frames(
+         before_path,
+         max_frames=args.max_video_frames,
+         sample_interval=sample_interval,
+     )
+     print(f"Extracted {len(before_frames)} frames from before video (sampled every {sample_interval} frames)")
+
+     if args.after:
+         # Load after video frames
+         after_frames, _ = _extract_video_frames(
+             args.after,
+             max_frames=args.max_video_frames,
+             sample_interval=sample_interval,
+         )
+         print(f"Extracted {len(after_frames)} frames from after video")
+         return args.after, before_frames, after_frames
+
+     # Create compressed video
+     print("Creating compressed video via ROI compression...")
+
+     processor = VideoProcessor(device=args.device)
+     processor.load_models(
+         quality_level=args.quality_level,
+         segmentation_method=args.seg_method or "yolo",
+         detection_method="yolo",
+         enable_tracking=False,
+     )
+
+     if args.video_mode == "static":
+         settings = CompressionSettings(
+             mode="static",
+             quality_level=args.quality_level,
+             sigma=args.sigma,
+             output_fps=args.output_fps,
+             target_classes=args.seg_classes or [],
+             enable_tracking=False,
+         )
+         chunks = processor.process_static(before_path, settings)
+     else:  # dynamic
+         settings = CompressionSettings(
+             mode="dynamic",
+             target_bandwidth_kbps=args.target_bandwidth_kbps,
+             min_fps=args.min_fps,
+             max_fps=args.max_fps,
+             chunk_duration_sec=args.chunk_duration_sec,
+             target_classes=args.seg_classes or [],
+             quality_level=args.quality_level,
+             enable_tracking=False,
+         )
+         chunks = processor.process_dynamic(before_path, settings)
+
+     # Collect compressed frames
+     all_compressed_frames = []
+     for chunk in chunks:
+         all_compressed_frames.extend(chunk.frames)
+         if args.max_video_frames and len(all_compressed_frames) >= args.max_video_frames:
+             all_compressed_frames = all_compressed_frames[:args.max_video_frames]
+             break
+
+     print(f"Compressed {len(all_compressed_frames)} frames")
+
+     # Sample the after frames at the same interval as before frames
+     # Note: The compression may change frame count, so we align by time proportion
+     after_frames = []
+     if len(all_compressed_frames) >= len(before_frames):
+         # Sample at regular intervals
+         step = len(all_compressed_frames) / len(before_frames)
+         for i in range(len(before_frames)):
+             idx = min(int(i * step), len(all_compressed_frames) - 1)
+             after_frames.append(all_compressed_frames[idx])
+     else:
+         # Use all available frames
+         after_frames = all_compressed_frames
+         # Truncate before_frames to match
+         before_frames = before_frames[:len(after_frames)]
+
+     # Save compressed video if requested
+     after_path = args.save_after
+     if after_path:
+         video_bytes = frames_to_video_bytes(all_compressed_frames, fps=args.output_fps)
+         Path(after_path).parent.mkdir(parents=True, exist_ok=True)
+         with open(after_path, "wb") as f:
+             f.write(video_bytes)
+         print(f"Saved compressed video to {after_path}")
+     else:
+         # Create temp file
+         with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
+             after_path = tmp.name
+         video_bytes = frames_to_video_bytes(all_compressed_frames, fps=args.output_fps)
+         with open(after_path, "wb") as f:
+             f.write(video_bytes)
+
+     return after_path, before_frames, after_frames
+
+
+ def _evaluate_video_detections(
+     before_frames: List[Image.Image],
+     after_frames: List[Image.Image],
+     detector,
+     det_kwargs: Dict,
+     iou_threshold: float,
+ ) -> Dict:
+     """Evaluate detection across video frames.
+
+     Returns:
+         Summary dict with per-frame and aggregate stats
+     """
+     frame_results = []
+     total_before_dets = 0
+     total_after_dets = 0
+     total_matched = 0
+     total_lost = 0
+     total_new = 0
+
+     for i, (before_frame, after_frame) in enumerate(zip(before_frames, after_frames)):
+         before_dets = detector(before_frame, **det_kwargs)
+         after_dets = detector(after_frame, **det_kwargs)
+
+         summary = summarize_before_after(before_dets, after_dets, iou_threshold=iou_threshold)
+
+         # Extract values using correct keys from summarize_before_after
+         num_before = summary["num_before"]
+         num_after = summary["num_after"]
+         matched = summary["matched"]
+         lost = num_before - matched
+         new_dets = summary["new_after"]
+
+         frame_results.append({
+             "frame_index": i,
+             "before_count": num_before,
+             "after_count": num_after,
+             "matched": matched,
+             "lost": lost,
+             "new_detections": new_dets,
+         })
+
+         total_before_dets += num_before
+         total_after_dets += num_after
+         total_matched += matched
+         total_lost += lost
+         total_new += new_dets
+
+     retention_rate = total_matched / max(total_before_dets, 1)
+
+     return {
+         "total_frames": len(before_frames),
+         "total_before_detections": total_before_dets,
+         "total_after_detections": total_after_dets,
+         "total_matched": total_matched,
+         "total_lost": total_lost,
+         "total_new": total_new,
+         "retention_rate": retention_rate,
+         "avg_before_per_frame": total_before_dets / max(len(before_frames), 1),
+         "avg_after_per_frame": total_after_dets / max(len(after_frames), 1),
+         "frame_results": frame_results,
+     }
+
+
+ def _create_video_visualization(
+     before_frames: List[Image.Image],
+     after_frames: List[Image.Image],
+     before_dets_list: List[List],
+     after_dets_list: List[List],
+     output_dir: Path,
+     method: str,
+     fps: float = 10.0,
+ ) -> None:
+     """Create visualization videos with detections drawn."""
+     from video.video_processor import frames_to_video_bytes
+
+     # Draw detections on frames
+     before_viz = []
+     after_viz = []
+
+     for i, (bf, af) in enumerate(zip(before_frames, after_frames)):
+         b_dets = before_dets_list[i] if i < len(before_dets_list) else []
+         a_dets = after_dets_list[i] if i < len(after_dets_list) else []
+
+         before_viz.append(draw_detections(bf, b_dets, color=(0, 255, 0)))
+         after_viz.append(draw_detections(af, a_dets, color=(255, 0, 0)))
+
+     # Save visualization videos
+     before_bytes = frames_to_video_bytes(before_viz, fps=fps)
+     after_bytes = frames_to_video_bytes(after_viz, fps=fps)
+
+     with open(output_dir / f"{method}_before.mp4", "wb") as f:
+         f.write(before_bytes)
+     with open(output_dir / f"{method}_after.mp4", "wb") as f:
+         f.write(after_bytes)
+
+
+ def main_image(args) -> None:
+     """Run detection evaluation on images."""
+     before_img = _load_image(args.before)
+     after_img = _maybe_make_after_image(before_img, args)
+
+     det_methods = args.detectors
+     if len(det_methods) == 1 and det_methods[0].lower() == "all":
+         det_methods = get_available_detectors()
+
+     results: Dict = {
+         "type": "image",
+         "before": str(Path(args.before)),
+         "after": str(Path(args.after)) if args.after else None,
+         "det_conf": float(args.det_conf),
+         "iou_threshold": float(args.iou),
+         "open_vocab_classes": _normalize_open_vocab_classes(args.open_vocab_classes),
+         "detectors": {},
+     }
+
+     viz_dir = Path(args.viz_dir) if args.viz_dir else None
+     if viz_dir:
+         viz_dir.mkdir(parents=True, exist_ok=True)
+
+     for method in det_methods:
+         det_kwargs = _get_detector_kwargs(method, args)
+         detector = create_detector(method, device=args.device, **det_kwargs)
+
+         call_kwargs = {"conf_threshold": args.det_conf}
+         if method in {"yolo_world", "grounding_dino"}:
+             ov = _normalize_open_vocab_classes(args.open_vocab_classes)
+             if not ov:
+                 raise SystemExit(
+                     f"Detector '{method}' is open-vocabulary; provide --open-vocab-classes (e.g. 'person,car')."
+                 )
+             call_kwargs["classes"] = ov
+
+         before_dets = detector(before_img, **call_kwargs)
+         after_dets = detector(after_img, **call_kwargs)
+
+         summary = summarize_before_after(before_dets, after_dets, iou_threshold=args.iou)
+
+         results["detectors"][method] = {
+             "summary": summary,
+             "before_detections": detections_to_dict(before_dets),
+             "after_detections": detections_to_dict(after_dets),
+         }
+
+         if viz_dir:
+             b = draw_detections(before_img, before_dets, color=(0, 255, 0))
+             a = draw_detections(after_img, after_dets, color=(255, 0, 0))
+             b.save(viz_dir / f"{method}_before.png")
+             a.save(viz_dir / f"{method}_after.png")
+
+     out_path = Path(args.out)
+     out_path.parent.mkdir(parents=True, exist_ok=True)
+     out_path.write_text(json.dumps(results, indent=2))
+
+     print(f"Wrote: {out_path}")
+     if viz_dir:
+         print(f"Wrote visualizations to: {viz_dir}")
+
+
+ def main_video(args) -> None:
+     """Run detection evaluation on videos."""
+     print(f"Video detection evaluation: {args.before}")
+
+     after_path, before_frames, after_frames = _maybe_make_after_video(args.before, args)
+
+     det_methods = args.detectors
+     if len(det_methods) == 1 and det_methods[0].lower() == "all":
+         det_methods = get_available_detectors()
+
+     results: Dict = {
+         "type": "video",
+         "before": str(Path(args.before)),
+         "after": str(Path(after_path)),
+         "det_conf": float(args.det_conf),
+         "iou_threshold": float(args.iou),
+         "open_vocab_classes": _normalize_open_vocab_classes(args.open_vocab_classes),
+         "video_settings": {
+             "mode": args.video_mode,
+             "sigma": args.sigma,
+             "output_fps": args.output_fps,
+             "quality_level": args.quality_level,
+             "frames_evaluated": len(before_frames),
+         },
+         "detectors": {},
+     }
+
+     viz_dir = Path(args.viz_dir) if args.viz_dir else None
+     if viz_dir:
+         viz_dir.mkdir(parents=True, exist_ok=True)
+
+     for method in det_methods:
+         print(f"Evaluating with detector: {method}")
+         det_kwargs = _get_detector_kwargs(method, args)
+         detector = create_detector(method, device=args.device, **det_kwargs)
+
+         call_kwargs = {"conf_threshold": args.det_conf}
+         if method in {"yolo_world", "grounding_dino"}:
+             ov = _normalize_open_vocab_classes(args.open_vocab_classes)
+             if not ov:
+                 raise SystemExit(
+                     f"Detector '{method}' is open-vocabulary; provide --open-vocab-classes (e.g. 'person,car')."
+                 )
+             call_kwargs["classes"] = ov
+
+         # Evaluate across all frames
+         video_summary = _evaluate_video_detections(
+             before_frames,
+             after_frames,
+             detector,
+             call_kwargs,
+             args.iou,
+         )
+
+         results["detectors"][method] = {
+             "summary": {
+                 "total_frames": video_summary["total_frames"],
+                 "retention_rate": video_summary["retention_rate"],
+                 "total_before_detections": video_summary["total_before_detections"],
+                 "total_after_detections": video_summary["total_after_detections"],
+                 "total_matched": video_summary["total_matched"],
+                 "total_lost": video_summary["total_lost"],
+                 "avg_before_per_frame": video_summary["avg_before_per_frame"],
+                 "avg_after_per_frame": video_summary["avg_after_per_frame"],
+             },
+             "per_frame_results": video_summary["frame_results"],
+         }
+
+         print(f" Retention rate: {video_summary['retention_rate']:.2%}")
+         print(f" Avg detections: {video_summary['avg_before_per_frame']:.1f} before, {video_summary['avg_after_per_frame']:.1f} after")
+
+         if viz_dir:
+             # Create visualization videos
+             print(" Creating visualization videos...")
+             before_dets_list = []
+             after_dets_list = []
+             for bf, af in zip(before_frames, after_frames):
+                 before_dets_list.append(detector(bf, **call_kwargs))
+                 after_dets_list.append(detector(af, **call_kwargs))
+
+             _create_video_visualization(
+                 before_frames,
+                 after_frames,
+                 before_dets_list,
+                 after_dets_list,
+                 viz_dir,
+                 method,
+                 fps=args.output_fps,
+             )
+
+     out_path = Path(args.out)
+     out_path.parent.mkdir(parents=True, exist_ok=True)
+     out_path.write_text(json.dumps(results, indent=2))
+
+     print(f"\nWrote: {out_path}")
+     if viz_dir:
+         print(f"Wrote visualizations to: {viz_dir}")
+
+
+ def _get_detector_kwargs(method: str, args) -> Dict:
+     """Get detector-specific kwargs."""
+     det_kwargs = {}
+     if method == "yolo":
+         det_kwargs["model_path"] = args.yolo_weights
+     if method == "yolo_world":
+         det_kwargs["model_path"] = args.yolo_world_weights
+     if method == "efficientdet":
+         det_kwargs["model_name"] = args.efficientdet_name
+     if method == "detr":
+         det_kwargs["model_name"] = args.detr_model
+     if method == "deformable_detr":
+         det_kwargs["model_name"] = args.deformable_detr_model
+     if method == "grounding_dino":
+         det_kwargs["model_name"] = args.grounding_dino_model
+     return det_kwargs
+
+
+ def main() -> None:
+     import argparse
+
+     from model_cache import default_checkpoint_path
+
+     parser = argparse.ArgumentParser(
+         description="Evaluate object detection before vs after ROI compression (images or videos)"
+     )
+
+     # Input/Output
+     parser.add_argument("--before", required=True, help="Path to original (before) image or video")
+     parser.add_argument("--after", help="Path to already-compressed (after) image or video")
+     parser.add_argument("--out", default="results/detection_eval.json", help="Where to write JSON results")
+     parser.add_argument("--viz-dir", default=None, help="If set, write visualization images/videos here")
+     parser.add_argument("--save-after", help="Save generated after image/video here")
+
+     # ROI compression (if --after is not provided)
+     parser.add_argument("--checkpoint", help="TIC checkpoint to generate after image (images only)")
+     parser.add_argument("--sigma", type=float, default=0.3, help="Background quality for ROI compression")
+     parser.add_argument("--mask", help="Optional mask path to use for ROI compression (images only)")
+     parser.add_argument("--seg-method", default=None, help="Segmentation method to build mask")
+     parser.add_argument("--seg-classes", nargs="+", default=None, help="Segmentation classes")
+     parser.add_argument("--seg-prompt", default=None, help="Segmentation prompt (sam3)")
+     parser.add_argument("--seg-conf", type=float, default=0.25, help="Segmentation conf for yolo")
+     parser.add_argument("--seg-model-type", default="coco", help="mask2former model_type")
+
+     # Video-specific settings
+     parser.add_argument("--video-mode", choices=["static", "dynamic"], default="static",
+                         help="Video compression mode")
+     parser.add_argument("--quality-level", type=int, default=4, help="Quality level (1-5)")
+     parser.add_argument("--output-fps", type=float, default=10.0, help="Output FPS for compressed video")
+     parser.add_argument("--target-bandwidth-kbps", type=float, default=500.0,
+                         help="Target bandwidth for dynamic mode")
+     parser.add_argument("--min-fps", type=float, default=5.0, help="Min FPS for dynamic mode")
+     parser.add_argument("--max-fps", type=float, default=30.0, help="Max FPS for dynamic mode")
+     parser.add_argument("--chunk-duration-sec", type=float, default=1.0, help="Chunk duration for dynamic mode")
+     parser.add_argument("--max-video-frames", type=int, default=100,
+                         help="Max frames to evaluate from video (for efficiency)")
+     parser.add_argument("--video-sample-interval", type=int, default=5,
+                         help="Sample every Nth frame from videos")
+
+     # Detection settings
+     parser.add_argument(
+         "--detectors",
+         nargs="+",
+         default=["yolo"],
+         help=f"Detectors to run (or 'all'). Available: {', '.join(get_available_detectors())}",
+     )
+     parser.add_argument("--det-conf", type=float, default=0.25, help="Detection confidence threshold")
+     parser.add_argument("--iou", type=float, default=0.5, help="IoU threshold for matching before↔after detections")
+
+     # Open-vocabulary detection
+     parser.add_argument(
+         "--open-vocab-classes",
+         default=None,
+         help="Comma-separated class prompts for open-vocabulary detectors",
+     )
+
+     parser.add_argument("--device", default="cuda", choices=["cuda", "cpu"], help="Device")
+     parser.add_argument("--N", type=int, default=192)
+     parser.add_argument("--M", type=int, default=192)
+
+     # Per-detector config
+     parser.add_argument(
+         "--yolo-weights",
+         default=default_checkpoint_path("yolo26x.pt"),
+         help="Ultralytics YOLO weights path/name",
+     )
+     parser.add_argument(
+         "--yolo-world-weights",
+         default=default_checkpoint_path("yolo26s-world.pt"),
+         help="Ultralytics YOLO-World weights path/name",
+     )
+     parser.add_argument("--efficientdet-name", default="tf_efficientdet_d0", help="EfficientDet model name")
+     parser.add_argument("--detr-model", default="facebook/detr-resnet-50", help="DETR model name")
+     parser.add_argument("--deformable-detr-model", default="SenseTime/deformable-detr", help="Deformable DETR model")
+     parser.add_argument(
+         "--grounding-dino-model",
+         default="IDEA-Research/grounding-dino-base",
+         help="GroundingDINO model name",
+     )
+
+     args = parser.parse_args()
+
+     # Determine if input is image or video
+     if _is_video(args.before):
+         main_video(args)
+     else:
+         main_image(args)
+
+
+ if __name__ == "__main__":
+     main()
roi_segmenter.py ADDED
@@ -0,0 +1,355 @@
+ """
+ CLI for ROI segmentation using modular segmentation framework.
+ Supports both image and video input with batched processing.
+ """
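+ # Example invocations (file names are illustrative):
+ #   python roi_segmenter.py --input street.jpg --output mask.png \
+ #       --method yolo --classes car person --visualize
+ #   python roi_segmenter.py --input drive.mp4 --output masks.mp4 \
+ #       --method yolo --classes car --resize-height 480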
+
+ import sys
+ import time
+ from pathlib import Path
+ import numpy as np
+ from PIL import Image
+ import torch
+
+ from segmentation import create_segmenter
+ from segmentation.utils import visualize_mask, save_mask, calculate_roi_stats
+ from video import estimate_batch_sizes, smooth_masks_sdf
+
+
+ VIDEO_EXTENSIONS = {'.mp4', '.avi', '.mov', '.mkv', '.webm'}
+
+
+ def is_video(path: str) -> bool:
+     """Check if file is a video based on extension."""
+     return Path(path).suffix.lower() in VIDEO_EXTENSIONS
+
+
+ def extract_video_frames(video_path: str, target_height: int = None) -> tuple[list[Image.Image], float]:
+     """Extract frames from video file.
+
+     Args:
+         video_path: Path to video file
+         target_height: Optional height to resize frames to (maintains aspect ratio)
+
+     Returns:
+         (frames, fps)
+     """
+     try:
+         import cv2
+     except ImportError:
+         raise ImportError("opencv-python required for video processing. Install with: pip install opencv-python")
+
+     cap = cv2.VideoCapture(video_path)
+     if not cap.isOpened():
+         raise ValueError(f"Cannot open video: {video_path}")
+
+     fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
+     frames = []
+
+     while True:
+         ret, frame = cap.read()
+         if not ret:
+             break
+
+         if target_height:
+             h, w = frame.shape[:2]
+             scale = target_height / h
+             new_w = max(1, int(w * scale))
+             frame = cv2.resize(frame, (new_w, target_height))
+
+         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         frames.append(Image.fromarray(frame_rgb))
+
+     cap.release()
+     return frames, fps
+
+
+ def save_masks_as_video(masks: list[np.ndarray], output_path: str, fps: float = 30.0) -> None:
+     """Save mask sequence as video file."""
+     try:
+         import cv2
+     except ImportError:
+         raise ImportError("opencv-python required. Install with: pip install opencv-python")
+
+     if not masks:
+         raise ValueError("No masks to save")
+
+     h, w = masks[0].shape
+     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+     writer = cv2.VideoWriter(output_path, fourcc, fps, (w, h), isColor=False)
+
+     for mask in masks:
+         mask_uint8 = (mask * 255).astype(np.uint8)
+         writer.write(mask_uint8)
+
+     writer.release()
+
+
+ def process_image(args):
+     """Process single image input."""
+     print(f"Segmenting {args.input}...")
+     print(f" Method: {args.method}")
+     if args.method == 'sam3':
+         print(f" Prompt: {args.prompt or ' '.join(args.classes)}")
+     else:
+         print(f" Classes: {args.classes}")
+
+     # Load image
+     image = Image.open(args.input).convert('RGB')
+     print(f" Image size: {image.size}")
+
+     # Create segmenter and mask
+     seg_kwargs = {}
+     if args.method == 'yolo':
+         seg_kwargs['conf_threshold'] = args.conf_threshold
+
+     segmenter = create_segmenter(args.method, device=args.device, **seg_kwargs)
+
+     targets = args.classes
+     if args.method == 'sam3' and args.prompt:
+         targets = [args.prompt]
+
+     mask = segmenter(image, target_classes=targets)
+
+     # Calculate statistics
+     stats = calculate_roi_stats(mask)
+     print(f" ROI coverage: {stats['roi_percentage']:.2f}% "
+           f"({stats['roi_pixels']}/{stats['total_pixels']} pixels)")
+
+     # Save mask
+     save_mask(mask, args.output)
+     print(f" Mask saved: {args.output}")
+
+     # Save visualization if requested
+     if args.visualize:
+         # Insert "_overlay" before the file extension (a plain str.replace on '.'
+         # would also rewrite dots elsewhere in the path)
+         out_path = Path(args.output)
+         viz_path = str(out_path.with_name(f"{out_path.stem}_overlay{out_path.suffix}"))
+         viz_img = visualize_mask(image, mask, alpha=0.5, color=(255, 0, 0))
+         viz_img.save(viz_path)
+         print(f" Visualization saved: {viz_path}")
+
+     print("Done!")
+
+
+ def process_video(args):
+     """Process video input with batched segmentation."""
+     print(f"Processing video: {args.input}")
+     print(f" Method: {args.method}")
+     print(f" Classes: {args.classes}")
+
+     # Extract frames
+     print("Extracting frames...")
+     frames, fps = extract_video_frames(args.input, target_height=args.resize_height)
+     print(f" Extracted {len(frames)} frames at {fps:.2f} FPS")
+
+     # Create segmenter
+     seg_kwargs = {}
+     if args.method == 'yolo':
+         seg_kwargs['conf_threshold'] = args.conf_threshold
+
+     segmenter = create_segmenter(args.method, device=args.device, **seg_kwargs)
+
+     targets = args.classes
+     if args.method == 'sam3' and args.prompt:
+         targets = [args.prompt]
+
+     # Check if batched segmentation is supported
+     supports_batch = getattr(segmenter, 'supports_batch', False)
+
+     print(f" Segmenter supports batching: {supports_batch}")
+
+     # Estimate optimal batch size if GPU is being used
+     if args.device == 'cuda' and supports_batch:
+         try:
+             # Get first frame dimensions for estimation
+             sample_w, sample_h = frames[0].size
+             batch_info = estimate_batch_sizes(
+                 device=args.device,
+                 seg_method=args.method,
+                 frame_width=sample_w,
+                 frame_height=sample_h,
+                 total_frames=len(frames),
+             )
+             recommended_batch = batch_info.seg_batch_size
+             print(" GPU Memory Estimation:")
+             print(f" Free VRAM: {batch_info.free_vram_bytes / (1024**3):.2f} GB")
+             print(f" Recommended batch size: {recommended_batch}")
+             if batch_info.notes:
+                 print(f" {batch_info.notes}")
+         except Exception as e:
+             print(f" Warning: Could not estimate GPU batch size: {e}")
+             recommended_batch = None
+     else:
+         recommended_batch = None
+
+     # Segment frames with OOM retry logic
+     t0 = time.perf_counter()
+     if supports_batch and hasattr(segmenter, 'segment_batch'):
+         print(f" Segmenting {len(frames)} frames in batches...")
+
+         # OOM retry logic
+         max_retries = 7
+         retry_count = 0
+         masks = None
+         current_batch_size = recommended_batch  # None means use segmenter's default
+
+         while retry_count <= max_retries:
+             try:
+                 if current_batch_size is not None:
+                     # Segment in manual batches
+                     masks = []
+                     for i in range(0, len(frames), current_batch_size):
+                         batch = frames[i:i + current_batch_size]
+                         print(f" Batch {i//current_batch_size + 1}: frames {i}-{i+len(batch)-1} (batch_size={len(batch)})")
+                         batch_masks = segmenter.segment_batch(batch, target_classes=targets)
+                         masks.extend(batch_masks)
+                 else:
+                     # Let segmenter handle batching internally
+                     masks = segmenter.segment_batch(frames, target_classes=targets)
+
+                 # Success - break retry loop
+                 break
+
+             except torch.cuda.OutOfMemoryError:
+                 retry_count += 1
+                 if retry_count > max_retries:
+                     print(f" ERROR: Out of memory after {max_retries} retries. Try reducing --resize-height.")
+                     raise
+
+                 # Halve the batch size
+                 if current_batch_size is None:
+                     # Initial OOM with default batching - start with a reasonable size
+                     current_batch_size = max(1, len(frames) // 4)
+                 else:
+                     current_batch_size = max(1, current_batch_size // 2)
+
+                 print(f" Out of memory! Retry {retry_count}/{max_retries} with batch_size={current_batch_size}")
+                 torch.cuda.empty_cache()
+                 masks = None  # Reset
+
+         if masks is None:
+             raise RuntimeError("Segmentation failed after all retries")
+     else:
+         print(f" Segmenting {len(frames)} frames sequentially...")
+         masks = []
+         for i, frame in enumerate(frames):
+             if (i + 1) % 10 == 0 or i == 0:
+                 print(f" Frame {i+1}/{len(frames)}")
+             masks.append(segmenter(frame, target_classes=targets))
+
+     t1 = time.perf_counter()
+     total_time = t1 - t0
+
+     print(f" Total segmentation time: {total_time:.3f} s")
+     print(f" Average per-frame: {total_time/len(frames):.4f} s ({len(frames)/total_time:.2f} fps)")
+
+     # Apply SDF temporal smoothing to reduce jitter
+     if not args.no_smooth and len(masks) > 2:
+         print(f" Applying SDF temporal smoothing (alpha={args.smooth_alpha}, patience={args.smooth_patience})...")
+         t_smooth_start = time.perf_counter()
+         masks = smooth_masks_sdf(
+             masks,
+             alpha=args.smooth_alpha,
+             empty_thresh=10,
+             patience=args.smooth_patience,
+         )
+         t_smooth_end = time.perf_counter()
+         print(f" Smoothing time: {t_smooth_end - t_smooth_start:.3f} s")
+
+     # Calculate aggregate statistics
+     total_roi_pixels = sum(m.sum() for m in masks)
+     total_pixels = sum(m.size for m in masks)
+     avg_coverage = (total_roi_pixels / total_pixels * 100) if total_pixels > 0 else 0.0
+     print(f" Average ROI coverage: {avg_coverage:.2f}%")
+
+     # Save output
+     output_path = Path(args.output)
+     if args.save_frames:
+         # Save individual mask frames
+         output_dir = output_path.parent / f"{output_path.stem}_frames"
+         output_dir.mkdir(parents=True, exist_ok=True)
+         print(f" Saving {len(masks)} mask frames to {output_dir}/")
+         for i, mask in enumerate(masks):
+             frame_path = output_dir / f"mask_{i:06d}.png"
+             save_mask(mask, str(frame_path))
+         print(f" Saved {len(masks)} frames")
+     else:
+         # Save as video
+         print(f" Saving mask video to {args.output}")
+         save_masks_as_video(masks, str(output_path), fps=fps)
+         print(" Saved mask video")
+
+     # Save visualization if requested
+     if args.visualize:
+         viz_dir = output_path.parent / f"{output_path.stem}_viz"
+         viz_dir.mkdir(parents=True, exist_ok=True)
+         print(" Creating visualization video...")
+
+         viz_frames = [visualize_mask(frame, mask, alpha=0.5, color=(255, 0, 0))
+                       for frame, mask in zip(frames, masks)]
+
+         # Save as video
+         try:
+             import cv2
+             viz_path = viz_dir / "overlay.mp4"
+             w, h = viz_frames[0].size  # PIL size is (width, height)
+             fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+             writer = cv2.VideoWriter(str(viz_path), fourcc, fps, (w, h), isColor=True)
+
+             for vf in viz_frames:
+                 frame_np = np.array(vf)
+                 frame_bgr = cv2.cvtColor(frame_np, cv2.COLOR_RGB2BGR)
+                 writer.write(frame_bgr)
+
+             writer.release()
+             print(f" Visualization saved: {viz_path}")
+         except Exception as e:
+             print(f" Warning: Could not save visualization video: {e}")
+
+     print("Done!")
+
+
+ # Command-line interface
+ if __name__ == '__main__':
+     import argparse
+     from segmentation import get_available_methods
+
+     # Get available segmentation methods dynamically
+     available_methods = get_available_methods()
+
+     parser = argparse.ArgumentParser(description='Segment objects in images or videos')
+     parser.add_argument('--input', required=True, help='Input image or video path')
+     parser.add_argument('--output', required=True, help='Output mask path (image/video) or directory')
+     parser.add_argument('--method', default='yolo',
+                         choices=available_methods,
+                         help='Segmentation method (including fake_* methods for detection+tracking)')
+     parser.add_argument('--classes', nargs='+', default=['car'],
+                         help='Target classes to segment')
+     parser.add_argument('--prompt', type=str, default=None,
+                         help='Natural language prompt (use with --method sam3)')
+     parser.add_argument('--conf-threshold', type=float, default=0.25,
+                         help='Confidence threshold')
+     parser.add_argument('--visualize', action='store_true',
+                         help='Save visualization with overlay')
+     parser.add_argument('--device', default='cuda', choices=['cuda', 'cpu'],
+                         help='Device to run on')
+
+     # Video-specific options
+     parser.add_argument('--resize-height', type=int, default=None,
+                         help='Resize video frames to this height (maintains aspect ratio)')
+     parser.add_argument('--save-frames', action='store_true',
+                         help='For videos: save masks as individual frames instead of video')
+
+     # Temporal smoothing options
+     parser.add_argument('--no-smooth', action='store_true',
+                         help='Disable temporal smoothing (may cause jitter)')
+     parser.add_argument('--smooth-alpha', type=float, default=0.5,
+                         help='SDF smoothing factor (0.1=slow/viscous, 0.9=fast/reactive)')
+     parser.add_argument('--smooth-patience', type=int, default=5,
+                         help='Frames to tolerate dropouts before decay (0=immediate, 5=conservative, 15=aggressive)')
+
+     args = parser.parse_args()
+
+     # Determine if input is image or video
+     if is_video(args.input):
+         process_video(args)
+     else:
+         process_image(args)
segmentation/__init__.py ADDED
@@ -0,0 +1,33 @@
+ """
+ ROI Segmentation Module
+
+ Provides abstract base class and concrete implementations for various
+ segmentation models (YOLO, SegFormer, Mask2Former, Mask R-CNN, SAM3, etc.) used in ROI-based compression.
+ """
+
+ from .base import BaseSegmenter
+ from .segformer import SegFormerSegmenter
+ from .yolo import YOLOSegmenter
+ from .mask2former import Mask2FormerSegmenter
+ from .maskrcnn import MaskRCNNSegmenter
+ from .sam3 import SAM3Segmenter
+ from .fake import FakeSegmenter
+ from .factory import create_segmenter, get_available_methods, register_segmenter
+ from .utils import visualize_mask, save_mask, load_mask, calculate_roi_stats
+
+ __all__ = [
+     'BaseSegmenter',
+     'SegFormerSegmenter',
+     'YOLOSegmenter',
+     'Mask2FormerSegmenter',
+     'MaskRCNNSegmenter',
+     'SAM3Segmenter',
+     'FakeSegmenter',
+     'create_segmenter',
+     'get_available_methods',
+     'register_segmenter',
+     'visualize_mask',
+     'save_mask',
+     'load_mask',
+     'calculate_roi_stats',
+ ]
segmentation/base.py ADDED
@@ -0,0 +1,169 @@
+ """
+ Abstract base class for segmentation models.
+ """
+
+ from abc import ABC, abstractmethod
+ from typing import List, Optional, Union, Dict, Any
+ import numpy as np
+ from PIL import Image
+
+
+ class BaseSegmenter(ABC):
+     """
+     Abstract base class for all segmentation models.
+
+     This class defines the common interface that all segmentation models
+     must implement. Subclasses can handle different types of inputs:
+     - Class-based segmentation (YOLO, SegFormer): List of class names
+     - Natural language segmentation (SAM, CLIP-based): Text prompts
+     - Point/box-based segmentation (SAM): Coordinates
+     """
+
+     def __init__(self, device: str = 'cuda', **kwargs):
+         """
+         Initialize the segmenter.
+
+         Args:
+             device: Device to run inference on ('cuda' or 'cpu')
+             **kwargs: Model-specific parameters
+         """
+         self.device = device
+         self.model = None
+         self._is_loaded = False
+
+     @abstractmethod
+     def load_model(self):
+         """
+         Load the segmentation model.
+
+         This method should load the model weights and prepare the model
+         for inference. Called automatically before first use.
+         """
+         pass
+
+     @abstractmethod
+     def segment(
+         self,
+         image: Image.Image,
+         target_classes: Optional[List[str]] = None,
+         **kwargs
+     ) -> np.ndarray:
+         """
+         Create binary segmentation mask for ROI.
+
+         Args:
+             image: PIL Image to segment
+             target_classes: List of target classes or text prompts
+             **kwargs: Model-specific parameters (e.g., confidence threshold)
+
+         Returns:
+             Binary mask as numpy array (H, W) with values 0 or 1
+             - 1: Region of Interest (ROI)
+             - 0: Background
+         """
+         pass
+
+     @abstractmethod
+     def get_available_classes(self) -> Union[List[str], Dict[str, int]]:
+         """
+         Get list or mapping of classes this model can segment.
+
+         Returns:
+             List of class names or dict mapping class names to IDs
+         """
+         pass
+
+     def validate_classes(self, target_classes: Optional[List[str]]) -> List[str]:
+         """
+         Validate and filter target classes against available classes.
+
+         Args:
+             target_classes: List of requested class names
+
+         Returns:
+             List of valid class names
+         """
+         if target_classes is None:
+             return self.get_default_classes()
+
+         available_classes = self.get_available_classes()
+         if isinstance(available_classes, dict):
+             available_classes = list(available_classes.keys())
+
+         valid_classes = []
+         for cls in target_classes:
+             cls_lower = cls.lower()
+             if cls_lower in [c.lower() for c in available_classes]:
+                 valid_classes.append(cls)
+             else:
+                 print(f"Warning: '{cls}' not in {self.__class__.__name__} classes.")
+
+         if not valid_classes:
+             print("Warning: No valid classes found. Using defaults.")
+             valid_classes = self.get_default_classes()
+
+         return valid_classes
+
+     def segment_batch(
+         self,
+         images: List[Image.Image],
+         target_classes: Optional[List[str]] = None,
+         **kwargs
+     ) -> List[np.ndarray]:
+         """
+         Segment a batch of images.
+
+         Default implementation processes images sequentially. Subclasses
+         should override this with a true batched forward pass when the
+         underlying model supports it.
+
+         Args:
+             images: List of PIL Images (ideally same resolution)
+             target_classes: List of target classes or text prompts
+             **kwargs: Model-specific parameters
+
+         Returns:
+             List of binary masks (H, W) - one per input image
+         """
+         self.ensure_loaded()
+         return [self.segment(img, target_classes, **kwargs) for img in images]
+
+     # Whether the model supports true GPU-batched inference.
+     # Subclasses should set this to True if segment_batch uses a
+     # single forward pass rather than a loop.
+     supports_batch: bool = False
+
+     def get_default_classes(self) -> List[str]:
+         """
+         Get default classes to segment if none specified.
+
+         Returns:
+             List of default class names
+         """
+         return ['car']  # Default fallback
+
+     def ensure_loaded(self):
+         """Ensure model is loaded before use."""
+         if not self._is_loaded:
+             self.load_model()
+             self._is_loaded = True
+
+     def __call__(
+         self,
+         image: Image.Image,
+         target_classes: Optional[List[str]] = None,
+         **kwargs
+     ) -> np.ndarray:
+         """
+         Convenience method to call segment().
+
+         Args:
+             image: PIL Image to segment
+             target_classes: List of target classes or text prompts
+             **kwargs: Model-specific parameters
+
+         Returns:
+             Binary mask as numpy array (H, W)
+         """
+         self.ensure_loaded()
+         return self.segment(image, target_classes, **kwargs)
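+
+
+ # Minimal subclass sketch (illustrative only; ThresholdSegmenter is not a
+ # shipped segmenter, just a demonstration of the required interface):
+ #
+ #     class ThresholdSegmenter(BaseSegmenter):
+ #         def load_model(self):
+ #             pass  # nothing to load
+ #
+ #         def segment(self, image, target_classes=None, **kwargs):
+ #             gray = np.array(image.convert('L'), dtype=np.float32) / 255.0
+ #             return (gray > 0.5).astype(np.float32)  # bright pixels as ROI
+ #
+ #         def get_available_classes(self):
+ #             return ['foreground']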
segmentation/factory.py ADDED
@@ -0,0 +1,124 @@
+ """
+ Factory for creating segmentation models.
+ """
+
+ from typing import Dict, Type, Optional, List, Callable, Union
+ from .base import BaseSegmenter
+ from .segformer import SegFormerSegmenter
+ from .yolo import YOLOSegmenter
+ from .mask2former import Mask2FormerSegmenter
+ from .maskrcnn import MaskRCNNSegmenter
+ from .sam3 import SAM3Segmenter
+ from .fake import FakeSegmenter
+
+
+ # Registry of available segmentation methods
+ # Can be either a class or a factory function
+ SEGMENTER_REGISTRY: Dict[str, Union[Type[BaseSegmenter], Callable]] = {
+     'segformer': SegFormerSegmenter,
+     'yolo': YOLOSegmenter,
+     'mask2former': Mask2FormerSegmenter,
+     'maskrcnn': MaskRCNNSegmenter,
+     'sam3': SAM3Segmenter,
+ }
+
+
+ def _register_fake_methods():
+     """Register fake segmentation methods (detection + tracking → bbox masks)."""
+     fake_configs = [
+         ('fake_yolo', 'yolo', 'bytetrack'),
+         ('fake_yolo_botsort', 'yolo', 'botsort'),
+         ('fake_detr', 'detr', 'bytetrack'),
+         ('fake_deformable_detr', 'deformable_detr', 'bytetrack'),
+         ('fake_fasterrcnn', 'fasterrcnn', 'bytetrack'),
+         ('fake_retinanet', 'retinanet', 'bytetrack'),
+         ('fake_fcos', 'fcos', 'bytetrack'),
+         ('fake_grounding_dino', 'grounding_dino', 'bytetrack'),
+     ]
+
+     for name, detector, tracker in fake_configs:
+         # Create a factory function for each config
+         # Use default arguments to capture values properly in closure
+         def make_factory(det=detector, default_tracker=tracker):
+             def factory(**kwargs):
+                 # Allow overriding tracker_type, otherwise use default
+                 if 'tracker_type' not in kwargs:
+                     kwargs['tracker_type'] = default_tracker
+                 return FakeSegmenter(detector_name=det, **kwargs)
+             return factory
+
+         SEGMENTER_REGISTRY[name] = make_factory()
+
+
+ # Register fake methods on import
+ _register_fake_methods()
+
+
+ def register_segmenter(name: str, segmenter_class: Type[BaseSegmenter]):
+     """
+     Register a new segmentation method.
+
+     Args:
+         name: Method name (e.g., 'sam', 'drone_detector')
+         segmenter_class: Segmenter class that extends BaseSegmenter
+     """
+     if not issubclass(segmenter_class, BaseSegmenter):
+         raise ValueError(f"{segmenter_class} must extend BaseSegmenter")
+     SEGMENTER_REGISTRY[name.lower()] = segmenter_class
+
+
+ def create_segmenter(
+     method: str,
+     device: str = 'cuda',
+     **kwargs
+ ) -> BaseSegmenter:
+     """
+     Factory function to create a segmentation model.
+
+     Args:
+         method: Segmentation method name ('segformer', 'yolo', 'fake_yolo', etc.)
+         device: Device to run on ('cuda' or 'cpu')
+         **kwargs: Method-specific parameters
+
+     Returns:
+         Instance of the requested segmenter
+
+     Raises:
+         ValueError: If method is not recognized
+
+     Example:
+         >>> segmenter = create_segmenter('yolo', device='cuda', conf_threshold=0.3)
+         >>> mask = segmenter(image, target_classes=['car', 'person'])
+
+         >>> # Use detection-based fake segmentation with tracking
+         >>> fake_seg = create_segmenter('fake_yolo', device='cuda')
+         >>> mask = fake_seg(image, target_classes=['person'])
+     """
+     method_lower = method.lower()
+
+     if method_lower not in SEGMENTER_REGISTRY:
+         available = ', '.join(sorted(SEGMENTER_REGISTRY.keys()))
+         raise ValueError(
+             f"Unknown segmentation method: '{method}'. "
+             f"Available methods: {available}"
+         )
+
+     factory = SEGMENTER_REGISTRY[method_lower]
+
+     # Handle both class constructors and factory functions
+     if callable(factory) and not isinstance(factory, type):
+         # It's a factory function (for fake segmenters)
+         return factory(device=device, **kwargs)
+     else:
+         # It's a class constructor
+         return factory(device=device, **kwargs)
+
+
+ def get_available_methods() -> List[str]:
+     """
+     Get list of available segmentation methods.
+
+     Returns:
+         List of method names
+     """
+     return list(SEGMENTER_REGISTRY.keys())
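+
+
+ # Example: plugging in a custom segmenter (illustrative; MySegmenter is a
+ # hypothetical class that extends BaseSegmenter):
+ #   from segmentation import register_segmenter, create_segmenter
+ #   register_segmenter('my_method', MySegmenter)
+ #   seg = create_segmenter('my_method', device='cpu')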
segmentation/fake.py ADDED
@@ -0,0 +1,412 @@
1
+ """Detection-based 'fake' segmentation using bounding boxes + object tracking.
2
+
3
+ Creates rectangular masks from detection bounding boxes and maintains object
4
+ identity across frames using tracking (ByteTrack, BoTSORT, or SimpleTracker).
5
+
6
+ Now supports batch detection for efficiency: detects all frames in batches,
7
+ then runs tracking sequentially on the results.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import numpy as np
13
+ from PIL import Image
14
+ from typing import List, Optional, Dict, Any
15
+
16
+ from .base import BaseSegmenter
17
+
18
+
19
+ class FakeSegmenter(BaseSegmenter):
20
+ """
21
+ Detection-based segmentation that creates rectangular masks from bboxes.
22
+
23
+ Uses object tracking to maintain consistent masks across video frames:
24
+ - ByteTrack (default for YOLO)
25
+ - BoTSORT (available for YOLO)
26
+ - SimpleTracker (fallback for non-YOLO detectors)
27
+
28
+ This is useful for fast ROI extraction when pixel-perfect masks aren't needed.
29
+ """
30
+
31
+ def __init__(
32
+ self,
33
+ device: str = 'cuda',
34
+ detector_name: str = 'yolo',
35
+ tracker_type: str = 'bytetrack',
36
+ conf_threshold: float = 0.25,
37
+ model_path: Optional[str] = None,
38
+ **kwargs
39
+ ):
40
+ """
41
+ Initialize fake segmenter.
42
+
43
+ Args:
44
+ device: Device to run on ('cuda' or 'cpu')
45
+ detector_name: Detection method ('yolo', 'detr', 'faster_rcnn', etc.)
46
+ tracker_type: Tracker type ('bytetrack', 'botsort', 'simple')
47
+ conf_threshold: Confidence threshold for detections
48
+ model_path: Optional path to detector weights
49
+ **kwargs: Additional parameters
50
+ """
51
+ super().__init__(device=device, **kwargs)
52
+ self.detector_name = detector_name.lower()
53
+ self.tracker_type = tracker_type.lower()
54
+ self.conf_threshold = conf_threshold
55
+ self.model_path = model_path
56
+
57
+ # Detector (set in load_model())
58
+ self.detector = None
59
+
60
+ # State for tracking
61
+ self._tracker = None
62
+ self._frame_count = 0
63
+
64
+ # Batch support
65
+ self.supports_batch = True
66
+
67
+ def load_model(self):
68
+ """Load detector model and initialize tracker."""
69
+ from detection import create_detector
70
+
71
+ # Create detector
72
+ if self.model_path:
73
+ self.detector = create_detector(
74
+ self.detector_name,
75
+ device=self.device,
76
+ model_path=self.model_path,
77
+ )
78
+ else:
79
+ self.detector = create_detector(
80
+ self.detector_name,
81
+ device=self.device,
82
+ )
83
+
84
+ # Ensure detector model is loaded
85
+ if not self.detector._is_loaded:
86
+ self.detector.load_model()
87
+
88
+ # Initialize tracker (now all trackers work with any detector!)
89
+ if self.tracker_type == 'bytetrack':
90
+ from detection.bytetrack import ByteTracker
91
+ self._tracker = ByteTracker(
92
+ track_thresh=0.5,
93
+ match_thresh=0.8,
94
+ track_buffer=30,
95
+ frame_rate=30,
96
+ )
97
+ elif self.tracker_type == 'botsort':
98
+ from detection.bytetrack import BoTSORT
99
+ self._tracker = BoTSORT(
100
+ track_thresh=0.5,
101
+ match_thresh=0.8,
102
+ track_buffer=30,
103
+ frame_rate=30,
104
+ )
105
+ else: # simple
106
+ from detection.tracker import SimpleTracker
107
+ self._tracker = SimpleTracker(
108
+ iou_threshold=0.3,
109
+ max_age=30,
110
+ min_hits=1,
111
+ label_match=True,
112
+ )
113
+
114
+ print(f"Loaded FakeSegmenter: {self.detector_name} + {self.tracker_type} tracking")
115
+
116
+ def reset_tracking(self):
117
+ """Reset tracker state (call between videos)."""
118
+ self._frame_count = 0
119
+ if self._tracker is not None:
120
+ self._tracker.reset()
121
+
122
+ def _create_bbox_mask(
123
+ self,
124
+ width: int,
125
+ height: int,
126
+ detections: List[Dict],
127
+ ) -> np.ndarray:
128
+ """Create binary mask from bounding boxes.
129
+
130
+ Args:
131
+ width: Image width
132
+ height: Image height
133
+ detections: List of detections with 'bbox_xyxy' key
134
+
135
+ Returns:
136
+ Binary mask (H, W) with 1.0 where bboxes are
137
+ """
138
+ mask = np.zeros((height, width), dtype=np.float32)
139
+
140
+ for det in detections:
141
+ bbox = det.get('bbox_xyxy', det.get('bbox'))
142
+ if bbox is None:
143
+ continue
144
+
145
+ x1, y1, x2, y2 = bbox
146
+ x1 = int(max(0, min(x1, width - 1)))
147
+ y1 = int(max(0, min(y1, height - 1)))
148
+ x2 = int(max(0, min(x2, width - 1)))
149
+ y2 = int(max(0, min(y2, height - 1)))
150
+
151
+ if x2 > x1 and y2 > y1:
152
+ mask[y1:y2, x1:x2] = 1.0
153
+
154
+ return mask
155
+
156
+ def segment(
157
+ self,
158
+ image: Image.Image,
159
+ target_classes: Optional[List[str]] = None,
160
+ conf_threshold: Optional[float] = None,
161
+ **kwargs
162
+ ) -> np.ndarray:
163
+ """
164
+ Create segmentation mask from detections.
165
+
166
+ Args:
167
+ image: Input PIL Image
168
+ target_classes: Classes to detect (None = all classes)
169
+ conf_threshold: Override default confidence threshold
170
+ **kwargs: Additional parameters
171
+
172
+ Returns:
173
+ Binary mask (H, W) as float32
174
+ """
175
+ if conf_threshold is None:
176
+ conf_threshold = self.conf_threshold
177
+
178
+ width, height = image.size
179
+
180
+ # Run detection
181
+ # Pass classes for open-vocabulary detectors (Grounding DINO, YOLO-World)
182
+ detect_kwargs = {"conf_threshold": conf_threshold}
183
+ if target_classes:
184
+ detect_kwargs["classes"] = target_classes
185
+ detections = self.detector.detect(image, **detect_kwargs)
186
+
187
+ # Convert to dict format
188
+ det_dicts = [
189
+ {
190
+ 'label': d.label,
191
+ 'score': d.score,
192
+ 'bbox_xyxy': d.bbox_xyxy,
193
+ }
194
+ for d in detections
195
+ ]
196
+
197
+ # Update tracker
198
+ if self._tracker is not None:
199
+ tracks = self._tracker.update(det_dicts)
200
+ # Convert tracks to detection format
201
+ detections = track_dicts_to_detections(tracks)
202
+ else:
203
+ detections = det_dicts
204
+
205
+ # Filter by target classes if specified
206
+ if target_classes:
207
+ target_lower = [tc.lower() for tc in target_classes]
208
+ detections = [
209
+ d for d in detections
210
+ if any(tc in d['label'].lower() for tc in target_lower)
211
+ ]
212
+
213
+ # Create mask from bboxes
214
+ mask = self._create_bbox_mask(width, height, detections)
215
+ self._frame_count += 1
216
+
217
+ return mask
218
+
219
+ def _detect_with_yolo_tracking(
220
+ self,
221
+ image: Image.Image,
222
+ conf_threshold: float,
223
+ ) -> List[Dict]:
224
+ """Optional helper: run YOLO detection with ultralytics' built-in tracking.
225
+
226
+ Returns:
227
+ List of detections with track IDs
228
+ """
229
+ img = np.asarray(image.convert('RGB'))
230
+
231
+ # Determine device argument
232
+ device_arg = 0 if self.detector.device.startswith('cuda') else 'cpu'
233
+
234
+ # Use YOLO's .track() method instead of .predict()
235
+ results = self.detector.model.track(
236
+ source=img,
237
+ conf=conf_threshold,
238
+ device=device_arg,
239
+ verbose=False,
240
+ tracker=f'{self.tracker_type}.yaml', # bytetrack.yaml or botsort.yaml
241
+ persist=True, # Persist tracks between frames
242
+ )
243
+
244
+ if not results:
245
+ return []
246
+
247
+ r0 = results[0]
248
+ if not hasattr(r0, 'boxes') or r0.boxes is None:
249
+ return []
250
+
251
+ boxes = r0.boxes
252
+ xyxy = boxes.xyxy.detach().cpu().numpy()
253
+ conf = boxes.conf.detach().cpu().numpy()
254
+ cls = boxes.cls.detach().cpu().numpy().astype(int)
255
+
256
+ # Get track IDs if available
257
+ track_ids = None
258
+ if hasattr(boxes, 'id') and boxes.id is not None:
259
+ track_ids = boxes.id.detach().cpu().numpy().astype(int)
260
+
261
+ detections = []
262
+ for i, (bbox, score, class_id) in enumerate(zip(xyxy, conf, cls)):
263
+ label = self.detector._names.get(int(class_id), str(int(class_id)))
264
+ det = {
265
+ 'label': label,
266
+ 'score': float(score),
267
+ 'bbox_xyxy': [float(x) for x in bbox.tolist()],
268
+ }
269
+ if track_ids is not None:
270
+ det['track_id'] = int(track_ids[i])
271
+ detections.append(det)
272
+
273
+ return detections
274
+
275
+ def segment_batch(
276
+ self,
277
+ images: List[Image.Image],
278
+ target_classes: Optional[List[str]] = None,
279
+ conf_threshold: Optional[float] = None,
280
+ **kwargs
281
+ ) -> List[np.ndarray]:
282
+ """
283
+ Batch segmentation for video processing using offline batch detection + sequential tracking.
284
+
285
+ This is much more efficient:
286
+ 1. Batch detect all frames at once (or in batches if memory limited)
287
+ 2. Run tracker sequentially on detection results
288
+ 3. Create masks from tracked detections
289
+
290
+ Args:
291
+ images: List of PIL Images
292
+ target_classes: Classes to detect
293
+ conf_threshold: Confidence threshold
294
+ **kwargs: Additional parameters
295
+
296
+ Returns:
297
+ List of binary masks
298
+ """
299
+ # Ensure model is loaded
300
+ self.ensure_loaded()
301
+
302
+ if conf_threshold is None:
303
+ conf_threshold = self.conf_threshold
304
+
305
+ if not images:
306
+ return []
307
+
308
+ # Step 1: Batch detect all frames (TRUE batch inference for speed)
309
+ all_detections = []
310
+ # Prepare detection kwargs (classes for open-vocabulary detectors)
311
+ detect_kwargs = {"conf_threshold": conf_threshold}
312
+ if target_classes:
313
+ detect_kwargs["classes"] = target_classes
314
+
315
+ if hasattr(self.detector, 'detect_batch'):
316
+ # Use batch detection for efficiency (GPU parallelization)
317
+ batch_dets = self.detector.detect_batch(images, **detect_kwargs)
318
+ for dets in batch_dets:
319
+ det_dicts = [
320
+ {
321
+ 'label': d.label,
322
+ 'score': d.score,
323
+ 'bbox_xyxy': d.bbox_xyxy,
324
+ }
325
+ for d in dets
326
+ ]
327
+ all_detections.append(det_dicts)
328
+ else:
329
+ # Fallback to frame-by-frame for detectors without batch support
330
+ for image in images:
331
+ dets = self.detector.detect(image, **detect_kwargs)
332
+ det_dicts = [
333
+ {
334
+ 'label': d.label,
335
+ 'score': d.score,
336
+ 'bbox_xyxy': d.bbox_xyxy,
337
+ }
338
+ for d in dets
339
+ ]
340
+ all_detections.append(det_dicts)
341
+
342
+ # Step 2: Run tracker sequentially on all detections
343
+ tracked_detections = []
344
+ if self._tracker is not None:
345
+ for frame_dets in all_detections:
346
+ tracks = self._tracker.update(frame_dets)
347
+ # Convert tracks to detection format
348
+ frame_tracked = track_dicts_to_detections(tracks)
349
+ tracked_detections.append(frame_tracked)
350
+ else:
351
+ tracked_detections = all_detections
352
+
353
+ # Step 3: Filter by target classes and create masks
354
+ masks = []
355
+ for i, (image, detections) in enumerate(zip(images, tracked_detections)):
356
+ width, height = image.size
357
+
358
+ # Filter by target classes if specified
359
+ if target_classes:
360
+ target_lower = [tc.lower() for tc in target_classes]
361
+ detections = [
362
+ d for d in detections
363
+ if any(tc in d['label'].lower() for tc in target_lower)
364
+ ]
365
+
366
+ # Create mask from bboxes
367
+ mask = self._create_bbox_mask(width, height, detections)
368
+ masks.append(mask)
369
+ self._frame_count += 1
370
+
371
+ return masks
372
+
373
+ def get_available_classes(self) -> List[str]:
374
+ """Get list of classes the detector can detect."""
375
+ classes = self.detector.get_available_classes()
376
+
377
+ if isinstance(classes, dict):
378
+ return sorted(classes.keys())
379
+ elif isinstance(classes, list):
380
+ return classes
381
+ else:
382
+ return []
383
+
384
+ def get_default_classes(self) -> List[str]:
385
+ """Get default classes for common use cases."""
386
+ # Common COCO classes
387
+ return ['person', 'car', 'truck', 'bus', 'bicycle', 'motorcycle']
388
+
389
+
390
+ def track_dicts_to_detections(tracks: List[Dict]) -> List[Dict]:
391
+ """Convert tracker output to detection format.
392
+
393
+ Args:
394
+ tracks: List of track dicts from tracker
395
+
396
+ Returns:
397
+ List of detection dicts
398
+ """
399
+ detections = []
400
+ for track in tracks:
401
+ det = {
402
+ 'label': track.get('label', ''),
403
+ 'score': track.get('score', track.get('last_score', 0.0)),
404
+ 'bbox_xyxy': track.get('bbox_xyxy', track.get('last_bbox', [])),
405
+ }
406
+ # Add track_id if available
407
+ if 'track_id' in track:
408
+ det['track_id'] = track['track_id']
409
+
410
+ detections.append(det)
411
+
412
+ return detections
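
A minimal sketch of running `FakeSegmenter` over a short frame sequence, assuming the package layout shown in this commit and that the frame files exist; the detector itself is created by the underlying `detection` factory.

```python
from PIL import Image

from segmentation.fake import FakeSegmenter

seg = FakeSegmenter(device='cpu', detector_name='yolo', tracker_type='simple')
seg.load_model()

# Batch path: detect all frames first, then track sequentially over the results.
frames = [Image.open(f'frames/{i:04d}.jpg') for i in range(8)]  # illustrative paths
masks = seg.segment_batch(frames, target_classes=['person', 'car'])
print(len(masks), masks[0].shape)

seg.reset_tracking()  # call between videos so track IDs do not carry over
```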
segmentation/mask2former.py ADDED
@@ -0,0 +1,310 @@
1
+ """
2
+ Mask2Former segmentation using Swin Transformer backbone.
3
+ Supports both COCO and ADE20K pre-trained models.
4
+ """
5
+
6
+ import torch
7
+ import numpy as np
9
+ from PIL import Image
10
+ from typing import List, Optional, Dict
11
+ from .base import BaseSegmenter
12
+ from model_cache import hf_cache_dir, ensure_default_checkpoint_dirs
13
+
14
+
15
+ class Mask2FormerSegmenter(BaseSegmenter):
16
+ """
17
+ Mask2Former segmentation with Swin Transformer backbone.
18
+
19
+ Supports both COCO (133 classes) and ADE20K (150 classes) datasets.
20
+ """
21
+
22
+ # COCO panoptic categories (133 classes including stuff)
23
+ COCO_CLASSES = {
24
+ 'person': 0, 'bicycle': 1, 'car': 2, 'motorcycle': 3, 'airplane': 4,
25
+ 'bus': 5, 'train': 6, 'truck': 7, 'boat': 8, 'traffic light': 9,
26
+ 'fire hydrant': 10, 'stop sign': 11, 'parking meter': 12, 'bench': 13,
27
+ 'bird': 14, 'cat': 15, 'dog': 16, 'horse': 17, 'sheep': 18, 'cow': 19,
28
+ 'elephant': 20, 'bear': 21, 'zebra': 22, 'giraffe': 23, 'backpack': 24,
29
+ 'umbrella': 25, 'handbag': 26, 'tie': 27, 'suitcase': 28, 'frisbee': 29,
30
+ 'skis': 30, 'snowboard': 31, 'sports ball': 32, 'kite': 33, 'baseball bat': 34,
31
+ 'baseball glove': 35, 'skateboard': 36, 'surfboard': 37, 'tennis racket': 38,
32
+ 'bottle': 39, 'wine glass': 40, 'cup': 41, 'fork': 42, 'knife': 43,
33
+ 'spoon': 44, 'bowl': 45, 'banana': 46, 'apple': 47, 'sandwich': 48,
34
+ 'orange': 49, 'broccoli': 50, 'carrot': 51, 'hot dog': 52, 'pizza': 53,
35
+ 'donut': 54, 'cake': 55, 'chair': 56, 'couch': 57, 'potted plant': 58,
36
+ 'bed': 59, 'dining table': 60, 'toilet': 61, 'tv': 62, 'laptop': 63,
37
+ 'mouse': 64, 'remote': 65, 'keyboard': 66, 'cell phone': 67, 'microwave': 68,
38
+ 'oven': 69, 'toaster': 70, 'sink': 71, 'refrigerator': 72, 'book': 73,
39
+ 'clock': 74, 'vase': 75, 'scissors': 76, 'teddy bear': 77, 'hair drier': 78,
40
+ 'toothbrush': 79, 'banner': 80, 'blanket': 81, 'bridge': 82, 'cardboard': 83,
41
+ 'counter': 84, 'curtain': 85, 'door-stuff': 86, 'floor-wood': 87,
42
+ 'flower': 88, 'fruit': 89, 'gravel': 90, 'house': 91, 'light': 92,
43
+ 'mirror-stuff': 93, 'net': 94, 'pillow': 95, 'platform': 96, 'playingfield': 97,
44
+ 'railroad': 98, 'river': 99, 'road': 100, 'roof': 101, 'sand': 102,
45
+ 'sea': 103, 'shelf': 104, 'snow': 105, 'stairs': 106, 'tent': 107,
46
+ 'towel': 108, 'wall-brick': 109, 'wall-stone': 110, 'wall-tile': 111,
47
+ 'wall-wood': 112, 'water': 113, 'window-blind': 114, 'window': 115,
48
+ 'tree': 116, 'fence': 117, 'ceiling': 118, 'sky': 119, 'cabinet': 120,
49
+ 'table': 121, 'floor': 122, 'pavement': 123, 'mountain': 124, 'grass': 125,
50
+ 'dirt': 126, 'paper': 127, 'food': 128, 'building': 129, 'rock': 130,
51
+ 'wall': 131, 'rug': 132
52
+ }
53
+
54
+ # Common ADE20K classes (subset of 150)
55
+ ADE20K_CLASSES = {
56
+ 'wall': 0, 'building': 1, 'sky': 2, 'floor': 3, 'tree': 4,
57
+ 'ceiling': 5, 'road': 6, 'bed': 7, 'windowpane': 8, 'grass': 9,
58
+ 'cabinet': 10, 'sidewalk': 11, 'person': 12, 'earth': 13, 'door': 14,
59
+ 'table': 15, 'mountain': 16, 'plant': 17, 'curtain': 18, 'chair': 19,
60
+ 'car': 20, 'water': 21, 'painting': 22, 'sofa': 23, 'shelf': 24,
61
+ 'house': 25, 'sea': 26, 'mirror': 27, 'rug': 28, 'field': 29,
62
+ 'armchair': 30, 'seat': 31, 'fence': 32, 'desk': 33, 'rock': 34,
63
+ 'wardrobe': 35, 'lamp': 36, 'bathtub': 37, 'railing': 38, 'cushion': 39,
64
+ 'base': 40, 'box': 41, 'column': 42, 'signboard': 43, 'chest of drawers': 44,
65
+ 'counter': 45, 'sand': 46, 'sink': 47, 'skyscraper': 48, 'fireplace': 49,
66
+ }
67
+
68
+ def __init__(
69
+ self,
70
+ device: str = 'cuda',
71
+ conf_threshold: float = 0.5,
72
+ model_type: str = 'coco', # 'coco' or 'ade20k'
73
+ **kwargs
74
+ ):
75
+ """
76
+ Initialize Mask2Former segmenter.
77
+
78
+ Args:
79
+ device: Device to run model on
80
+ conf_threshold: Confidence threshold for predictions
81
+ model_type: Which pre-trained model to use ('coco' or 'ade20k')
82
+ **kwargs: Additional arguments
83
+ """
84
+ super().__init__(device, **kwargs)
85
+ self.conf_threshold = conf_threshold
86
+ self.model_type = model_type.lower()
87
+
88
+ if self.model_type not in ['coco', 'ade20k']:
89
+ raise ValueError(f"model_type must be 'coco' or 'ade20k', got {self.model_type}")
90
+
91
+ self.class_map = self.COCO_CLASSES if self.model_type == 'coco' else self.ADE20K_CLASSES
92
+ self.model = None
93
+ self.processor = None
94
+
95
+ def load_model(self):
96
+ """Load Mask2Former model and processor from HuggingFace."""
97
+ try:
98
+ from transformers import Mask2FormerForUniversalSegmentation, AutoImageProcessor
99
+ except ImportError:
100
+ raise ImportError(
101
+ "Mask2Former requires transformers. Install with: pip install transformers"
102
+ )
103
+
104
+ if self.model_type == 'coco':
105
+ model_name = "facebook/mask2former-swin-large-coco-panoptic"
106
+ else: # ade20k
107
+ model_name = "facebook/mask2former-swin-large-ade-semantic"
108
+
109
+ print(f"Loading Mask2Former ({self.model_type}) model...")
110
+ ensure_default_checkpoint_dirs()
111
+ cache_dir = str(hf_cache_dir())
112
+ self.processor = AutoImageProcessor.from_pretrained(model_name, cache_dir=cache_dir)
113
+ self.model = Mask2FormerForUniversalSegmentation.from_pretrained(model_name, cache_dir=cache_dir)
114
+ self.model = self.model.to(self.device)
115
+ self.model.eval()
116
+ print(f"✓ Mask2Former loaded: {model_name}")
117
+
118
+ def segment(
119
+ self,
120
+ image: Image.Image,
121
+ target_classes: Optional[List[str]] = None,
122
+ **kwargs
123
+ ) -> np.ndarray:
124
+ """
125
+ Segment image using Mask2Former.
126
+
127
+ Args:
128
+ image: PIL Image
129
+ target_classes: List of class names to segment (None for all)
130
+ **kwargs: Additional arguments (unused)
131
+
132
+ Returns:
133
+ Binary mask as numpy array [H, W] with 1 for ROI, 0 for background
134
+ """
135
+ if self.model is None:
136
+ self.load_model()
137
+
138
+ # Prepare image
139
+ inputs = self.processor(images=image, return_tensors="pt")
140
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
141
+
142
+ # Run inference
143
+ with torch.no_grad():
144
+ outputs = self.model(**inputs)
145
+
146
+ # Post-process
147
+ if self.model_type == 'coco':
148
+ # Panoptic segmentation
149
+ result = self.processor.post_process_panoptic_segmentation(
150
+ outputs,
151
+ target_sizes=[image.size[::-1]]
152
+ )[0]
153
+ segmentation = result['segmentation'].cpu().numpy()
154
+ segments_info = result['segments_info']
155
+ else:
156
+ # Semantic segmentation
157
+ result = self.processor.post_process_semantic_segmentation(
158
+ outputs,
159
+ target_sizes=[image.size[::-1]]
160
+ )[0]
161
+ segmentation = result.cpu().numpy()
162
+ segments_info = None
163
+
164
+ # Create binary mask
165
+ mask = np.zeros(segmentation.shape, dtype=np.uint8)
166
+
167
+ if target_classes is None:
168
+ # Return all detected objects
169
+ mask = (segmentation > 0).astype(np.uint8)
170
+ else:
171
+ # Filter by target classes
172
+ target_ids = []
173
+ for class_name in target_classes:
174
+ class_lower = class_name.lower()
175
+ if class_lower in self.class_map:
176
+ target_ids.append(self.class_map[class_lower])
177
+ else:
178
+ # Try fuzzy matching
179
+ for key, val in self.class_map.items():
180
+ if class_lower in key or key in class_lower:
181
+ target_ids.append(val)
182
+ break
183
+
184
+ if self.model_type == 'coco' and segments_info:
185
+ # Use segment info for panoptic
186
+ for segment in segments_info:
187
+ if segment['label_id'] in target_ids:
188
+ mask[segmentation == segment['id']] = 1
189
+ else:
190
+ # Use class IDs directly for semantic
191
+ for target_id in target_ids:
192
+ mask[segmentation == target_id] = 1
193
+
194
+ return mask
195
+
196
+ def get_available_classes(self) -> List[str]:
197
+ """
198
+ Get list of available class names.
199
+
200
+ Returns:
201
+ List of class names supported by the model
202
+ """
203
+ return sorted(self.class_map.keys())
204
+
205
+ # Mask2Former can be batched through the HF processor for semantic mode.
206
+ # Panoptic post-processing is per-image, but the forward pass is batched.
207
+ supports_batch: bool = True
208
+
209
+ def segment_batch(
210
+ self,
211
+ images: List[Image.Image],
212
+ target_classes: Optional[List[str]] = None,
213
+ **kwargs,
214
+ ) -> List[np.ndarray]:
215
+ """Segment a batch of images via Mask2Former.
216
+
217
+ The HuggingFace processor accepts a list of images. The forward
218
+ pass runs on the full batch; post-processing is per-image.
219
+
220
+ Args:
221
+ images: List of PIL Images
222
+ target_classes: Class names to include in ROI
223
+ **kwargs: unused
224
+
225
+ Returns:
226
+ List of binary masks (H, W) float32
227
+ """
228
+ if not images:
229
+ return []
230
+
231
+ if self.model is None:
232
+ self.load_model()
233
+
234
+ # Resolve target class IDs
235
+ target_ids = []
236
+ if target_classes:
237
+ for cn in target_classes:
238
+ cl = cn.lower()
239
+ if cl in self.class_map:
240
+ target_ids.append(self.class_map[cl])
241
+ else:
242
+ for key, val in self.class_map.items():
243
+ if cl in key or key in cl:
244
+ target_ids.append(val)
245
+ break
246
+
247
+ # Batch preprocess
248
+ inputs = self.processor(images=images, return_tensors="pt", padding=True)
249
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
250
+
251
+ with torch.no_grad():
252
+ outputs = self.model(**inputs)
253
+
254
+ # Collect all target sizes for batch post-processing
255
+ target_sizes = [img.size[::-1] for img in images] # [(H1, W1), (H2, W2), ...]
256
+
257
+ masks: List[np.ndarray] = []
258
+
259
+ if self.model_type == 'coco':
260
+ # Post-process entire batch at once for panoptic segmentation
261
+ results = self.processor.post_process_panoptic_segmentation(
262
+ outputs,
263
+ target_sizes=target_sizes,
264
+ )
265
+
266
+ for i, (result, img) in enumerate(zip(results, images)):
267
+ segmentation = result['segmentation'].cpu().numpy()
268
+ segments_info = result['segments_info']
269
+
270
+ mask = np.zeros(segmentation.shape, dtype=np.float32)
271
+ if target_classes is None:
272
+ mask = (segmentation > 0).astype(np.float32)
273
+ else:
274
+ for seg in segments_info:
275
+ if seg['label_id'] in target_ids:
276
+ mask[segmentation == seg['id']] = 1.0
277
+
278
+ masks.append(mask)
279
+ else:
280
+ # Post-process entire batch at once for semantic segmentation
281
+ results = self.processor.post_process_semantic_segmentation(
282
+ outputs,
283
+ target_sizes=target_sizes,
284
+ )
285
+
286
+ for i, (result, img) in enumerate(zip(results, images)):
287
+ segmentation = result.cpu().numpy()
288
+ mask = np.zeros(segmentation.shape, dtype=np.float32)
289
+ if target_classes is None:
290
+ mask = (segmentation > 0).astype(np.float32)
291
+ else:
292
+ for tid in target_ids:
293
+ mask[segmentation == tid] = 1.0
294
+
295
+ masks.append(mask)
296
+
297
+ return masks
298
+
299
+ def get_class_info(self) -> Dict:
300
+ """
301
+ Get detailed class information.
302
+
303
+ Returns:
304
+ Dictionary with model_type, num_classes, and the class name to ID mapping
305
+ """
306
+ return {
307
+ 'model_type': self.model_type,
308
+ 'num_classes': len(self.class_map),
309
+ 'classes': self.class_map.copy()
310
+ }
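
A minimal sketch of Mask2Former usage, assuming the weights download from HuggingFace on first use (slow on CPU) and an illustrative input path.

```python
from PIL import Image

from segmentation.mask2former import Mask2FormerSegmenter

img = Image.open('street.jpg')  # illustrative path

# COCO panoptic mode: the mask is built from per-segment instances.
seg = Mask2FormerSegmenter(device='cpu', model_type='coco')
mask = seg.segment(img, target_classes=['person', 'car'])
print(mask.shape, int(mask.max()))  # (H, W), 1 where a person/car was found
```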
segmentation/maskrcnn.py ADDED
@@ -0,0 +1,310 @@
1
+ """
2
+ Mask R-CNN segmentation using torchvision pre-trained models.
3
+ Supports COCO-trained instance segmentation.
4
+ """
5
+
6
+ from model_cache import ensure_default_checkpoint_dirs
7
+
8
+ import torch
9
+ import numpy as np
10
+ import cv2
11
+ from PIL import Image
12
+ from typing import List, Optional, Dict
13
+ from .base import BaseSegmenter
14
+
15
+
16
+ # Ensure torchvision/torch hub downloads land under `checkpoints/` by default.
17
+ ensure_default_checkpoint_dirs()
18
+
19
+
20
+ class MaskRCNNSegmenter(BaseSegmenter):
21
+ """
22
+ Mask R-CNN instance segmentation from torchvision.
23
+
24
+ Uses pre-trained ResNet50-FPN backbone on COCO dataset (80 classes).
25
+ """
26
+
27
+ # COCO class names (80 classes)
28
+ COCO_CLASSES = [
29
+ '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
30
+ 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
31
+ 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
32
+ 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
33
+ 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
34
+ 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
35
+ 'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
36
+ 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
37
+ 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
38
+ 'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
39
+ 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
40
+ 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
41
+ ]
42
+
43
+ def __init__(
44
+ self,
45
+ device: str = 'cuda',
46
+ conf_threshold: float = 0.5,
47
+ backbone: str = 'resnet50', # 'resnet50' or 'mobilenet'
48
+ **kwargs
49
+ ):
50
+ """
51
+ Initialize Mask R-CNN segmenter.
52
+
53
+ Args:
54
+ device: Device to run model on
55
+ conf_threshold: Confidence threshold for detections
56
+ backbone: Model backbone ('resnet50' or 'mobilenet')
57
+ **kwargs: Additional arguments
58
+ """
59
+ super().__init__(device, **kwargs)
60
+ self.conf_threshold = conf_threshold
61
+ self.backbone = backbone.lower()
62
+
63
+ if self.backbone not in ['resnet50', 'mobilenet']:
64
+ raise ValueError(f"backbone must be 'resnet50' or 'mobilenet', got {self.backbone}")
65
+
66
+ self.model = None
67
+
68
+ def load_model(self):
69
+ """Load Mask R-CNN model from torchvision."""
70
+ try:
71
+ # Only the ResNet50 V2 weights are used here; the MobileNet variant is imported lazily below.
73
+ from torchvision.models.detection import maskrcnn_resnet50_fpn_v2, MaskRCNN_ResNet50_FPN_V2_Weights
74
+ except ImportError:
75
+ raise ImportError(
76
+ "Mask R-CNN requires torchvision. Install with: pip install torchvision"
77
+ )
78
+
79
+ print(f"Loading Mask R-CNN ({self.backbone}) model...")
80
+
81
+ if self.backbone == 'resnet50':
82
+ # Use newer V2 weights for better performance
83
+ self.model = maskrcnn_resnet50_fpn_v2(weights=MaskRCNN_ResNet50_FPN_V2_Weights.DEFAULT)
84
+ else:
85
+ # MobileNet version (lighter but less accurate)
86
+ from torchvision.models.detection import maskrcnn_mobilenet_v3_large_fpn
87
+ from torchvision.models.detection import MaskRCNN_MobileNet_V3_Large_FPN_Weights
88
+ self.model = maskrcnn_mobilenet_v3_large_fpn(weights=MaskRCNN_MobileNet_V3_Large_FPN_Weights.DEFAULT)
89
+
90
+ self.model = self.model.to(self.device)
91
+ self.model.eval()
92
+ print(f"✓ Mask R-CNN loaded: {self.backbone}")
93
+
94
+ def segment(
95
+ self,
96
+ image: Image.Image,
97
+ target_classes: Optional[List[str]] = None,
98
+ **kwargs
99
+ ) -> np.ndarray:
100
+ """
101
+ Segment image using Mask R-CNN.
102
+
103
+ Args:
104
+ image: PIL Image
105
+ target_classes: List of class names to segment (None for all)
106
+ **kwargs: Additional arguments (can override conf_threshold)
107
+
108
+ Returns:
109
+ Binary mask as numpy array [H, W] with 1 for ROI, 0 for background
110
+ """
111
+ if self.model is None:
112
+ self.load_model()
113
+
114
+ # Get confidence threshold from kwargs or use default
115
+ conf_threshold = kwargs.get('conf_threshold', self.conf_threshold)
116
+
117
+ # Prepare image
118
+ img_array = np.array(image)
119
+ if len(img_array.shape) == 2: # Grayscale
120
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
121
+ elif img_array.shape[2] == 4: # RGBA
122
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_RGBA2RGB)
123
+
124
+ # Convert to tensor [3, H, W] and normalize
125
+ img_tensor = torch.from_numpy(img_array).float().permute(2, 0, 1) / 255.0
126
+ img_tensor = img_tensor.to(self.device)
127
+
128
+ # Run inference
129
+ with torch.no_grad():
130
+ predictions = self.model([img_tensor])[0]
131
+
132
+ # Get predictions
133
+ boxes = predictions['boxes'].cpu().numpy()
134
+ labels = predictions['labels'].cpu().numpy()
135
+ scores = predictions['scores'].cpu().numpy()
136
+ masks = predictions['masks'].cpu().numpy()
137
+
138
+ # Filter by confidence
139
+ keep_indices = scores >= conf_threshold
140
+ boxes = boxes[keep_indices]
141
+ labels = labels[keep_indices]
142
+ scores = scores[keep_indices]
143
+ masks = masks[keep_indices]
144
+
145
+ # Create combined mask
146
+ h, w = img_array.shape[:2]
147
+ combined_mask = np.zeros((h, w), dtype=np.uint8)
148
+
149
+ if target_classes is None:
150
+ # Combine all high-confidence masks
151
+ for mask in masks:
152
+ binary_mask = (mask[0] > 0.5).astype(np.uint8)
153
+ combined_mask = np.maximum(combined_mask, binary_mask)
154
+ else:
155
+ # Filter by target classes
156
+ target_indices = []
157
+ for class_name in target_classes:
158
+ class_lower = class_name.lower()
159
+ for idx, coco_class in enumerate(self.COCO_CLASSES):
160
+ if coco_class.lower() == class_lower or class_lower in coco_class.lower():
161
+ target_indices.append(idx)
162
+
163
+ # Combine masks for target classes
164
+ for i, label in enumerate(labels):
165
+ if label in target_indices:
166
+ binary_mask = (masks[i][0] > 0.5).astype(np.uint8)
167
+ combined_mask = np.maximum(combined_mask, binary_mask)
168
+
169
+ return combined_mask
170
+
171
+ def get_available_classes(self) -> List[str]:
172
+ """
173
+ Get list of available class names.
174
+
175
+ Returns:
176
+ List of COCO class names (excluding 'N/A' and '__background__')
177
+ """
178
+ return [cls for cls in self.COCO_CLASSES if cls not in ['N/A', '__background__']]
179
+
180
+ # torchvision Mask R-CNN natively accepts a list of tensors.
181
+ supports_batch: bool = True
182
+
183
+ def segment_batch(
184
+ self,
185
+ images: List[Image.Image],
186
+ target_classes: Optional[List[str]] = None,
187
+ **kwargs,
188
+ ) -> List[np.ndarray]:
189
+ """Segment a batch of images via Mask R-CNN.
190
+
191
+ torchvision's Mask R-CNN forward accepts ``List[Tensor]`` so we
192
+ can pass all images in one call. Post-processing is per-image.
193
+
194
+ Args:
195
+ images: List of PIL Images
196
+ target_classes: COCO class names to include in mask
197
+ **kwargs: May include ``conf_threshold``
198
+
199
+ Returns:
200
+ List of binary masks (H, W) float32
201
+ """
202
+ if not images:
203
+ return []
204
+
205
+ if self.model is None:
206
+ self.load_model()
207
+
208
+ conf_threshold = kwargs.get('conf_threshold', self.conf_threshold)
209
+
210
+ # Resolve target class indices
211
+ target_indices: Optional[List[int]] = None
212
+ if target_classes is not None:
213
+ target_indices = []
214
+ for cn in target_classes:
215
+ cl = cn.lower()
216
+ for idx, cc in enumerate(self.COCO_CLASSES):
217
+ if cc.lower() == cl or cl in cc.lower():
218
+ target_indices.append(idx)
219
+
220
+ # Build list of tensors (varying sizes are ok for torchvision)
221
+ tensors = []
222
+ for img in images:
223
+ arr = np.array(img)
224
+ if len(arr.shape) == 2:
225
+ arr = cv2.cvtColor(arr, cv2.COLOR_GRAY2RGB)
226
+ elif arr.shape[2] == 4:
227
+ arr = cv2.cvtColor(arr, cv2.COLOR_RGBA2RGB)
228
+ t = torch.from_numpy(arr).float().permute(2, 0, 1) / 255.0
229
+ tensors.append(t.to(self.device))
230
+
231
+ with torch.no_grad():
232
+ predictions_list = self.model(tensors)
233
+
234
+ masks_out: List[np.ndarray] = []
235
+ for i, preds in enumerate(predictions_list):
236
+ h, w = images[i].height, images[i].width
237
+ combined = np.zeros((h, w), dtype=np.float32)
238
+
239
+ labels = preds['labels'].cpu().numpy()
240
+ scores = preds['scores'].cpu().numpy()
241
+ pred_masks = preds['masks'].cpu().numpy()
242
+
243
+ keep = scores >= conf_threshold
244
+ labels = labels[keep]
245
+ pred_masks = pred_masks[keep]
246
+
247
+ for j, lbl in enumerate(labels):
248
+ if target_indices is not None and int(lbl) not in target_indices:
249
+ continue
250
+ binary = (pred_masks[j][0] > 0.5).astype(np.float32)
251
+ combined = np.maximum(combined, binary)
252
+
253
+ masks_out.append(combined)
254
+
255
+ return masks_out
256
+
257
+ def get_detection_info(
258
+ self,
259
+ image: Image.Image,
260
+ conf_threshold: Optional[float] = None
261
+ ) -> List[Dict]:
262
+ """
263
+ Get detailed detection information for all objects.
264
+
265
+ Args:
266
+ image: PIL Image
267
+ conf_threshold: Confidence threshold (uses default if None)
268
+
269
+ Returns:
270
+ List of dictionaries with detection info (class, score, bbox, mask)
271
+ """
272
+ if self.model is None:
273
+ self.load_model()
274
+
275
+ if conf_threshold is None:
276
+ conf_threshold = self.conf_threshold
277
+
278
+ # Prepare image
279
+ img_array = np.array(image)
280
+ if len(img_array.shape) == 2:
281
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
282
+ elif img_array.shape[2] == 4:
283
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_RGBA2RGB)
284
+
285
+ img_tensor = torch.from_numpy(img_array).float().permute(2, 0, 1) / 255.0
286
+ img_tensor = img_tensor.to(self.device)
287
+
288
+ # Run inference
289
+ with torch.no_grad():
290
+ predictions = self.model([img_tensor])[0]
291
+
292
+ # Get predictions
293
+ boxes = predictions['boxes'].cpu().numpy()
294
+ labels = predictions['labels'].cpu().numpy()
295
+ scores = predictions['scores'].cpu().numpy()
296
+ masks = predictions['masks'].cpu().numpy()
297
+
298
+ # Filter and format results
299
+ detections = []
300
+ for i in range(len(scores)):
301
+ if scores[i] >= conf_threshold:
302
+ detections.append({
303
+ 'class': self.COCO_CLASSES[labels[i]],
304
+ 'class_id': int(labels[i]),
305
+ 'score': float(scores[i]),
306
+ 'bbox': boxes[i].tolist(),
307
+ 'mask': (masks[i][0] > 0.5).astype(np.uint8)
308
+ })
309
+
310
+ return detections
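
A minimal sketch of per-object inspection via `get_detection_info()`; the input path is illustrative and torchvision downloads the weights on first use.

```python
from PIL import Image

from segmentation.maskrcnn import MaskRCNNSegmenter

seg = MaskRCNNSegmenter(device='cpu', conf_threshold=0.6)
for d in seg.get_detection_info(Image.open('street.jpg')):  # illustrative path
    x1, y1, x2, y2 = d['bbox']
    print(f"{d['class']:<12} score={d['score']:.2f} "
          f"box=({x1:.0f}, {y1:.0f}, {x2:.0f}, {y2:.0f})")
```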
segmentation/sam3.py ADDED
@@ -0,0 +1,187 @@
1
+ """SAM3-style promptable segmentation.
2
+
3
+ This integrates a prompt-driven segmentation method into the existing
4
+ class-based segmentation interface. Instead of relying on a fixed class
5
+ vocabulary, it accepts natural-language prompts (e.g., "a red car", "the person").
6
+
7
+ Implementation approach (lightweight, no custom training):
8
+ - Use a text-conditioned detector (OWL-ViT) to propose bounding boxes from text.
9
+ - Use SAM (Segment Anything) to convert boxes into masks.
10
+
11
+ Notes:
12
+ - This is not "SAM 3" in the sense of an official model release; it is a
13
+ prompt-to-mask pipeline exposed as a single segmenter named "sam3".
14
+ - If required dependencies/models are missing, this segmenter raises a clear
15
+ error message.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from dataclasses import dataclass
21
+ from typing import List, Optional, Union
22
+
23
+ import numpy as np
24
+ import torch
25
+ from PIL import Image
26
+
27
+ from .base import BaseSegmenter
28
+ from model_cache import hf_cache_dir, ensure_default_checkpoint_dirs
29
+
30
+
31
+ @dataclass
32
+ class _SAM3Config:
33
+ detector_model: str = "google/owlvit-base-patch32"
34
+ sam_model: str = "facebook/sam-vit-base"
35
+ box_threshold: float = 0.02
36
+ max_boxes: int = 5
37
+
38
+
39
+ class SAM3Segmenter(BaseSegmenter):
40
+ """Prompt-driven segmentation via (text detector → SAM).
41
+
42
+ Use `target_classes` to pass natural language prompts:
43
+ - `['car']`, `['a car']`, `['the person']`, etc.
44
+
45
+ Returns a binary mask (H, W) with 1 for predicted ROI.
46
+ """
47
+
48
+ def __init__(
49
+ self,
50
+ device: str = "cuda",
51
+ detector_model: str = _SAM3Config.detector_model,
52
+ sam_model: str = _SAM3Config.sam_model,
53
+ box_threshold: float = _SAM3Config.box_threshold,
54
+ max_boxes: int = _SAM3Config.max_boxes,
55
+ **kwargs,
56
+ ):
57
+ super().__init__(device=device, **kwargs)
58
+ self.detector_model_name = detector_model
59
+ self.sam_model_name = sam_model
60
+ self.box_threshold = float(box_threshold)
61
+ self.max_boxes = int(max_boxes)
62
+
63
+ self._detector = None
64
+ self._sam_model = None
65
+ self._sam_processor = None
66
+
67
+ def load_model(self):
68
+ try:
69
+ from transformers import pipeline, SamModel, SamProcessor
70
+ except Exception as e: # pragma: no cover
71
+ raise ImportError(
72
+ "SAM3Segmenter requires `transformers` with SAM support. "
73
+ "Try: pip install -U transformers"
74
+ ) from e
75
+
76
+ # Make sure any HF downloads (including pipeline internals) land under `checkpoints/`.
77
+ ensure_default_checkpoint_dirs()
78
+
79
+ # Configure device for HF pipeline
80
+ if self.device.startswith("cuda") and torch.cuda.is_available():
81
+ pipeline_device = 0
82
+ else:
83
+ pipeline_device = -1
84
+
85
+ self._detector = pipeline(
86
+ task="zero-shot-object-detection",
87
+ model=self.detector_model_name,
88
+ device=pipeline_device,
89
+ )
90
+
91
+ cache_dir = str(hf_cache_dir())
92
+
93
+ self._sam_processor = SamProcessor.from_pretrained(self.sam_model_name, cache_dir=cache_dir)
94
+ self._sam_model = SamModel.from_pretrained(self.sam_model_name, cache_dir=cache_dir)
95
+ self._sam_model = self._sam_model.to(self.device)
96
+ self._sam_model.eval()
97
+
98
+ # Keep BaseSegmenter.model set for consistency
99
+ self.model = self._sam_model
100
+
101
+ def segment(
102
+ self,
103
+ image: Image.Image,
104
+ target_classes: Optional[List[str]] = None,
105
+ **kwargs,
106
+ ) -> np.ndarray:
107
+ self.ensure_loaded()
108
+
109
+ prompts: List[str]
110
+ if target_classes is None or len(target_classes) == 0:
111
+ prompts = ["object"]
112
+ else:
113
+ # Treat provided "classes" as free-form text prompts.
114
+ prompts = [str(p).strip() for p in target_classes if str(p).strip()]
115
+ if not prompts:
116
+ prompts = ["object"]
117
+
118
+ box_threshold = float(kwargs.get("box_threshold", self.box_threshold))
119
+ max_boxes = int(kwargs.get("max_boxes", self.max_boxes))
120
+
121
+ detections = self._detector(image, candidate_labels=prompts)
122
+
123
+ # HF pipeline may return dict (single) or list
124
+ if isinstance(detections, dict):
125
+ detections = [detections]
126
+
127
+ boxes: List[List[float]] = []
128
+ for det in detections:
129
+ score = float(det.get("score", 0.0))
130
+ if score < box_threshold:
131
+ continue
132
+ b = det.get("box") or {}
133
+ xmin = float(b.get("xmin", 0.0))
134
+ ymin = float(b.get("ymin", 0.0))
135
+ xmax = float(b.get("xmax", 0.0))
136
+ ymax = float(b.get("ymax", 0.0))
137
+ # Sanity clamp
138
+ xmin, ymin = max(0.0, xmin), max(0.0, ymin)
139
+ xmax, ymax = max(xmin + 1.0, xmax), max(ymin + 1.0, ymax)
140
+ boxes.append([xmin, ymin, xmax, ymax])
141
+
142
+ if not boxes:
143
+ return np.zeros((image.height, image.width), dtype=np.float32)
144
+
145
+ boxes = boxes[:max_boxes]
146
+
147
+ # SAM expects a batch; provide one image with N boxes
148
+ inputs = self._sam_processor(
149
+ image,
150
+ input_boxes=[boxes],
151
+ return_tensors="pt",
152
+ )
153
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
154
+
155
+ with torch.no_grad():
156
+ outputs = self._sam_model(**inputs)
157
+
158
+ # Post-process masks back to original image size
159
+ # Returns list (batch) of tensors: [num_boxes, H, W]
160
+ post = self._sam_processor.image_processor.post_process_masks(
161
+ outputs.pred_masks.detach().cpu(),
162
+ inputs["original_sizes"].detach().cpu(),
163
+ inputs["reshaped_input_sizes"].detach().cpu(),
164
+ )
165
+
166
+ masks0 = post[0]
167
+ if isinstance(masks0, (list, tuple)):
168
+ # Defensive: some versions may nest
169
+ masks0 = torch.stack([m.squeeze(0) if m.ndim == 3 else m for m in masks0], dim=0)
170
+
171
+ # masks0: [num_boxes, H, W] or [num_boxes, 1, H, W]
172
+ if masks0.ndim == 4:
173
+ masks0 = masks0[:, 0]
174
+
175
+ combined = (masks0 > 0.5).any(dim=0).to(torch.float32)
176
+ return combined.numpy()
177
+
178
+ def get_available_classes(self) -> Union[List[str], dict]:
179
+ # Prompt-based model: not a fixed class list.
180
+ return []
181
+
182
+ def get_default_classes(self) -> List[str]:
183
+ return ["object"]
184
+
185
+ # SAM3 (OWL-ViT detector → SAM masks) is inherently sequential;
186
+ # the two-stage pipeline does not support batched inference.
187
+ supports_batch: bool = False
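
A minimal sketch of the prompt-driven path, assuming the OWL-ViT and SAM weights download on first use and an illustrative input path. Unlike the class-based segmenters, `target_classes` here is free-form text.

```python
from PIL import Image

from segmentation.sam3 import SAM3Segmenter

seg = SAM3Segmenter(device='cpu', box_threshold=0.05, max_boxes=3)
mask = seg.segment(Image.open('street.jpg'), target_classes=['a red car'])
print(int(mask.sum()), 'ROI pixels')  # 0 if no box cleared the threshold
```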
segmentation/segformer.py ADDED
@@ -0,0 +1,193 @@
1
+ """
2
+ SegFormer-based segmentation implementation.
3
+ Uses Cityscapes-trained model for semantic segmentation.
4
+ """
5
+
6
+ import torch
7
+ import numpy as np
8
+ import cv2
9
+ from PIL import Image
10
+ from typing import List, Optional, Dict
11
+ from .base import BaseSegmenter
12
+ from model_cache import hf_cache_dir, ensure_default_checkpoint_dirs
13
+
14
+
15
+ class SegFormerSegmenter(BaseSegmenter):
16
+ """
17
+ SegFormer segmentation using Cityscapes classes.
18
+
19
+ Supports semantic segmentation of 19 urban scene classes including
20
+ vehicles, pedestrians, buildings, and road infrastructure.
21
+ """
22
+
23
+ # Cityscapes class mapping
24
+ CITYSCAPES_CLASSES = {
25
+ 'road': 0, 'sidewalk': 1, 'building': 2, 'wall': 3, 'fence': 4,
26
+ 'pole': 5, 'traffic light': 6, 'traffic sign': 7, 'vegetation': 8,
27
+ 'terrain': 9, 'sky': 10, 'person': 11, 'rider': 12, 'car': 13,
28
+ 'truck': 14, 'bus': 15, 'train': 16, 'motorcycle': 17, 'bicycle': 18
29
+ }
30
+
31
+ def __init__(
32
+ self,
33
+ device: str = 'cuda',
34
+ model_name: str = "nvidia/segformer-b4-finetuned-cityscapes-1024-1024",
35
+ **kwargs
36
+ ):
37
+ """
38
+ Initialize SegFormer segmenter.
39
+
40
+ Args:
41
+ device: Device to run on ('cuda' or 'cpu')
42
+ model_name: HuggingFace model identifier
43
+ **kwargs: Additional parameters
44
+ """
45
+ super().__init__(device=device, **kwargs)
46
+ self.model_name = model_name
47
+ self.processor = None
48
+
49
+ def load_model(self):
50
+ """Load SegFormer model from HuggingFace."""
51
+ from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
52
+
53
+ print(f"Loading SegFormer model: {self.model_name}")
54
+ ensure_default_checkpoint_dirs()
55
+ cache_dir = str(hf_cache_dir())
56
+ self.processor = SegformerImageProcessor.from_pretrained(self.model_name, cache_dir=cache_dir)
57
+ self.model = SegformerForSemanticSegmentation.from_pretrained(
58
+ self.model_name,
59
+ cache_dir=cache_dir,
60
+ ).to(self.device)
61
+ self.model.eval()
62
+ print("SegFormer model loaded successfully")
63
+
64
+ def segment(
65
+ self,
66
+ image: Image.Image,
67
+ target_classes: Optional[List[str]] = None,
68
+ **kwargs
69
+ ) -> np.ndarray:
70
+ """
71
+ Create segmentation mask using SegFormer.
72
+
73
+ Args:
74
+ image: PIL Image
75
+ target_classes: List of class names (e.g., ['car', 'building', 'person'])
76
+ **kwargs: Additional parameters (unused)
77
+
78
+ Returns:
79
+ Binary mask (H, W) with 1 for target classes, 0 for background
80
+ """
81
+ # Validate classes
82
+ if target_classes is None:
83
+ target_classes = self.get_default_classes()
84
+
85
+ # Get target class IDs
86
+ target_ids = []
87
+ for cls in target_classes:
88
+ cls_lower = cls.lower()
89
+ if cls_lower in self.CITYSCAPES_CLASSES:
90
+ target_ids.append(self.CITYSCAPES_CLASSES[cls_lower])
91
+ else:
92
+ print(f"Warning: '{cls}' not in Cityscapes classes. "
93
+ f"Available: {list(self.CITYSCAPES_CLASSES.keys())}")
94
+
95
+ if not target_ids:
96
+ print("Warning: No valid classes found. Using 'car' as default.")
97
+ target_ids = [self.CITYSCAPES_CLASSES['car']]
98
+
99
+ # Process image
100
+ orig_size = image.size
101
+ inputs = self.processor(images=image, return_tensors="pt")
102
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
103
+
104
+ with torch.no_grad():
105
+ outputs = self.model(**inputs)
106
+ logits = outputs.logits
107
+
108
+ # Get segmentation map
109
+ seg_map = torch.argmax(logits, dim=1)[0].cpu().numpy()
110
+
111
+ # Resize to original size
112
+ seg_map_resized = cv2.resize(
113
+ seg_map.astype(np.uint8),
114
+ orig_size,
115
+ interpolation=cv2.INTER_NEAREST
116
+ )
117
+
118
+ # Create binary mask for target classes
119
+ mask = np.zeros_like(seg_map_resized, dtype=np.float32)
120
+ for class_id in target_ids:
121
+ mask[seg_map_resized == class_id] = 1.0
122
+
123
+ return mask
124
+
125
+ def get_available_classes(self) -> Dict[str, int]:
126
+ """Get Cityscapes class mapping."""
127
+ return self.CITYSCAPES_CLASSES
128
+
129
+ def get_default_classes(self) -> List[str]:
130
+ """Default to car segmentation."""
131
+ return ['car']
132
+
133
+ # SegFormer supports batched inference via the HF processor.
134
+ supports_batch: bool = True
135
+
136
+ def segment_batch(
137
+ self,
138
+ images: List[Image.Image],
139
+ target_classes: Optional[List[str]] = None,
140
+ **kwargs,
141
+ ) -> List[np.ndarray]:
142
+ """Segment a batch of images in a single forward pass.
143
+
144
+ The HuggingFace SegFormer preprocessor natively accepts a list of
145
+ PIL images and returns a batched tensor.
146
+
147
+ Args:
148
+ images: List of PIL Images (should be same resolution for padding)
149
+ target_classes: Cityscapes class names to include in ROI mask
150
+ **kwargs: unused
151
+
152
+ Returns:
153
+ List of binary masks (H, W) float32
154
+ """
155
+ if not images:
156
+ return []
157
+
158
+ self.ensure_loaded()
159
+
160
+ if target_classes is None:
161
+ target_classes = self.get_default_classes()
162
+
163
+ target_ids = []
164
+ for cls in target_classes:
165
+ cls_lower = cls.lower()
166
+ if cls_lower in self.CITYSCAPES_CLASSES:
167
+ target_ids.append(self.CITYSCAPES_CLASSES[cls_lower])
168
+
169
+ if not target_ids:
170
+ target_ids = [self.CITYSCAPES_CLASSES['car']]
171
+
172
+ # Batch preprocess
173
+ inputs = self.processor(images=images, return_tensors="pt", padding=True)
174
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
175
+
176
+ with torch.no_grad():
177
+ outputs = self.model(**inputs)
178
+ logits = outputs.logits # (B, num_classes, h', w')
179
+
180
+ masks: List[np.ndarray] = []
181
+ for i, img in enumerate(images):
182
+ seg_map = torch.argmax(logits[i], dim=0).cpu().numpy()
183
+ seg_resized = cv2.resize(
184
+ seg_map.astype(np.uint8),
185
+ img.size,
186
+ interpolation=cv2.INTER_NEAREST,
187
+ )
188
+ mask = np.zeros_like(seg_resized, dtype=np.float32)
189
+ for cid in target_ids:
190
+ mask[seg_resized == cid] = 1.0
191
+ masks.append(mask)
192
+
193
+ return masks
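
A minimal sketch of Cityscapes semantic segmentation on a driving frame, assuming the HuggingFace model downloads on first use and an illustrative input path.

```python
from PIL import Image

from segmentation.segformer import SegFormerSegmenter

seg = SegFormerSegmenter(device='cpu')
mask = seg.segment(Image.open('dashcam.jpg'), target_classes=['car', 'person'])
print(mask.shape)  # same (H, W) as the input image, 1.0 on car/person pixels
```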
segmentation/utils.py ADDED
@@ -0,0 +1,90 @@
1
+ """
2
+ Utility functions for segmentation visualization and I/O.
3
+ """
4
+
5
+ import numpy as np
6
+ import cv2
7
+ from PIL import Image
8
+ from typing import Tuple
9
+
10
+
11
+ def visualize_mask(
12
+ image: Image.Image,
13
+ mask: np.ndarray,
14
+ alpha: float = 0.5,
15
+ color: Tuple[int, int, int] = (255, 0, 0)
16
+ ) -> Image.Image:
17
+ """
18
+ Overlay segmentation mask on image.
19
+
20
+ Args:
21
+ image: PIL Image
22
+ mask: Binary mask (H, W)
23
+ alpha: Transparency (0-1)
24
+ color: RGB color tuple for mask
25
+
26
+ Returns:
27
+ Image with mask overlay
28
+ """
29
+ # Convert to numpy
30
+ img_array = np.array(image)
31
+
32
+ # Create colored mask
33
+ colored_mask = np.zeros_like(img_array)
34
+ colored_mask[mask > 0.5] = color
35
+
36
+ # Blend
37
+ result = cv2.addWeighted(img_array, 1.0, colored_mask, alpha, 0)
38
+
39
+ return Image.fromarray(result)
40
+
41
+
42
+ def save_mask(mask: np.ndarray, output_path: str):
43
+ """
44
+ Save mask as image (white for ROI, black for background).
45
+
46
+ Args:
47
+ mask: Binary mask array (H, W)
48
+ output_path: Path to save mask image
49
+ """
50
+ mask_img = (mask * 255).astype(np.uint8)
51
+ Image.fromarray(mask_img).save(output_path)
52
+
53
+
54
+ def load_mask(mask_path: str) -> np.ndarray:
55
+ """
56
+ Load mask from image file.
57
+
58
+ Args:
59
+ mask_path: Path to mask image
60
+
61
+ Returns:
62
+ Binary mask as numpy array (H, W) with values 0 or 1
63
+ """
64
+ mask_img = Image.open(mask_path).convert('L')
65
+ mask = np.array(mask_img).astype(np.float32) / 255.0
66
+ return mask
67
+
68
+
69
+ def calculate_roi_stats(mask: np.ndarray) -> dict:
70
+ """
71
+ Calculate statistics about ROI coverage.
72
+
73
+ Args:
74
+ mask: Binary mask (H, W)
75
+
76
+ Returns:
77
+ Dictionary with statistics:
78
+ - roi_pixels: Number of ROI pixels
79
+ - total_pixels: Total number of pixels
80
+ - roi_percentage: Percentage of image covered by ROI
81
+ """
82
+ roi_pixels = int(np.sum(mask > 0.5))
83
+ total_pixels = int(mask.size)
84
+ roi_percentage = (roi_pixels / total_pixels) * 100
85
+
86
+ return {
87
+ 'roi_pixels': roi_pixels,
88
+ 'total_pixels': total_pixels,
89
+ 'roi_percentage': roi_percentage
90
+ }
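
A minimal sketch tying the utilities together; the image path is illustrative and the mask here is synthetic.

```python
import numpy as np
from PIL import Image

from segmentation.utils import visualize_mask, save_mask, calculate_roi_stats

img = Image.open('street.jpg')  # illustrative path
mask = np.zeros((img.height, img.width), dtype=np.float32)
mask[100:300, 200:500] = 1.0  # synthetic ROI block for demonstration

visualize_mask(img, mask, alpha=0.4, color=(0, 255, 0)).save('overlay.png')
save_mask(mask, 'mask.png')
print(calculate_roi_stats(mask))  # roi_pixels / total_pixels / roi_percentage
```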
segmentation/yolo.py ADDED
@@ -0,0 +1,188 @@
1
+ """
2
+ YOLO-based instance segmentation implementation.
3
+ Uses YOLO26 (or YOLOv8 fallback) for COCO object detection and segmentation.
4
+ Supports true batch inference for video processing.
5
+ """
6
+
7
+ import numpy as np
8
+ import cv2
9
+ from PIL import Image
10
+ from typing import List, Optional
11
+ from .base import BaseSegmenter
12
+
13
+
14
+ class YOLOSegmenter(BaseSegmenter):
15
+ """
16
+ YOLO instance segmentation using COCO classes.
17
+
18
+ Defaults to YOLO26x-seg for best accuracy.
19
+ Supports batch inference via ``segment_batch()`` for video pipelines.
20
+ """
21
+
22
+ def __init__(
23
+ self,
24
+ device: str = 'cuda',
25
+ model_path: str = 'checkpoints/yolo26x-seg.pt',
26
+ conf_threshold: float = 0.25,
27
+ **kwargs
28
+ ):
29
+ """
30
+ Initialize YOLO segmenter.
31
+
32
+ Args:
33
+ device: Device to run on ('cuda' or 'cpu')
34
+ model_path: Path to YOLO weights file (yolo26x-seg.pt default)
35
+ conf_threshold: Confidence threshold for detections
36
+ **kwargs: Additional parameters
37
+ """
38
+ super().__init__(device=device, **kwargs)
39
+ self.model_path = model_path
40
+ self.conf_threshold = conf_threshold
41
+
42
+ def load_model(self):
43
+ """Load YOLO model."""
44
+ from ultralytics import YOLO
45
+
46
+ print(f"Loading YOLO model: {self.model_path}")
47
+ self.model = YOLO(self.model_path)
48
+ print("YOLO model loaded successfully")
49
+
50
+ def _extract_mask(
51
+ self,
52
+ result,
53
+ width: int,
54
+ height: int,
55
+ target_classes: List[str],
56
+ ) -> np.ndarray:
57
+ """Extract combined binary mask from a single YOLO result object.
58
+
59
+ Args:
60
+ result: Single ultralytics Results object
61
+ width: Target mask width
62
+ height: Target mask height
63
+ target_classes: Classes to include
64
+
65
+ Returns:
66
+ Binary mask (H, W) float32
67
+ """
68
+ mask = np.zeros((height, width), dtype=np.float32)
69
+
70
+ if result.masks is None:
71
+ return mask
72
+
73
+ masks_data = result.masks.data
74
+ boxes = result.boxes
75
+
76
+ for idx, box in enumerate(boxes):
77
+ class_id = int(box.cls[0])
78
+ class_name = self.model.names[class_id].lower()
79
+
80
+ if any(target.lower() in class_name for target in target_classes):
81
+ instance_mask = masks_data[idx].cpu().numpy()
82
+ instance_mask_resized = cv2.resize(
83
+ instance_mask,
84
+ (width, height),
85
+ interpolation=cv2.INTER_LINEAR,
86
+ )
87
+ mask = np.maximum(mask, instance_mask_resized)
88
+
89
+ return (mask > 0.5).astype(np.float32)
90
+
91
+ def segment(
92
+ self,
93
+ image: Image.Image,
94
+ target_classes: Optional[List[str]] = None,
95
+ conf_threshold: Optional[float] = None,
96
+ **kwargs
97
+ ) -> np.ndarray:
98
+ """
99
+ Create segmentation mask using YOLO.
100
+
101
+ Args:
102
+ image: PIL Image
103
+ target_classes: List of class names (e.g., ['car', 'person'])
104
+ conf_threshold: Override default confidence threshold
105
+ **kwargs: Additional parameters
106
+
107
+ Returns:
108
+ Binary mask (H, W) with 1 for target instances, 0 for background
109
+ """
110
+ self.ensure_loaded()  # lazy-load YOLO weights on first use
+ threshold = conf_threshold if conf_threshold is not None else self.conf_threshold
111
+ if target_classes is None:
112
+ target_classes = self.get_default_classes()
113
+
114
+ results = self.model(image, verbose=False, conf=threshold, device=self.device)
115
+
116
+ if not results:
117
+ return np.zeros((image.height, image.width), dtype=np.float32)
118
+
119
+ return self._extract_mask(results[0], image.width, image.height, target_classes)
120
+
121
+ def segment_batch(
122
+ self,
123
+ images: List[Image.Image],
124
+ target_classes: Optional[List[str]] = None,
125
+ conf_threshold: Optional[float] = None,
126
+ **kwargs,
127
+ ) -> List[np.ndarray]:
128
+ """Segment a batch of images in a single YOLO forward pass.
129
+
130
+ Args:
131
+ images: List of PIL Images (should be the same resolution)
132
+ target_classes: Class names to include in ROI mask
133
+ conf_threshold: Override default confidence threshold
134
+
135
+ Returns:
136
+ List of binary masks (H, W), one per input image
137
+ """
138
+ if not images:
139
+ return []
140
+
141
+ self.ensure_loaded()
142
+
143
+ threshold = conf_threshold if conf_threshold is not None else self.conf_threshold
144
+ if target_classes is None:
145
+ target_classes = self.get_default_classes()
146
+
147
+ # Ultralytics accepts a list of PIL images for batch inference
148
+ results = self.model(images, verbose=False, conf=threshold, device=self.device)
149
+
150
+ masks = []
151
+ for i, result in enumerate(results):
152
+ img = images[i]
153
+ masks.append(self._extract_mask(result, img.width, img.height, target_classes))
154
+
155
+ return masks
156
+
157
+ def get_available_classes(self) -> List[str]:
158
+ """
159
+ Get COCO class names.
160
+
161
+ Returns:
162
+ List of COCO class names (80 classes)
163
+ """
164
+ if self.model is None:
165
+ # Return common COCO classes if model not loaded
166
+ return [
167
+ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
168
+ 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
169
+ 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
170
+ 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
171
+ 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
172
+ 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
173
+ 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
174
+ 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
175
+ 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
176
+ 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
177
+ 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
178
+ 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
179
+ 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
180
+ ]
181
+ return list(self.model.names.values())
182
+
183
+ def get_default_classes(self) -> List[str]:
184
+ """Default to car segmentation."""
185
+ return ['car']
186
+
187
+ # YOLO natively supports batched inference via ultralytics.
188
+ supports_batch: bool = True
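
A minimal sketch contrasting the single-image and batched paths, assuming the segmentation weights referenced by `model_path` are present under `checkpoints/` and the frame paths exist.

```python
from PIL import Image

from segmentation.yolo import YOLOSegmenter

seg = YOLOSegmenter(device='cpu', conf_threshold=0.3)
frames = [Image.open(f'frames/{i:04d}.jpg') for i in range(4)]  # illustrative

single = seg.segment(frames[0], target_classes=['car'])
batched = seg.segment_batch(frames, target_classes=['car'])
assert single.shape == batched[0].shape  # both (H, W) binary masks
```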
vae/RSTB.py ADDED
@@ -0,0 +1,813 @@
1
+ # Copyright (c) 2021-2022, InterDigital Communications, Inc
2
+ # All rights reserved.
3
+
4
+ # Redistribution and use in source and binary forms, with or without
5
+ # modification, are permitted (subject to the limitations in the disclaimer
6
+ # below) provided that the following conditions are met:
7
+
8
+ # * Redistributions of source code must retain the above copyright notice,
9
+ # this list of conditions and the following disclaimer.
10
+ # * Redistributions in binary form must reproduce the above copyright notice,
11
+ # this list of conditions and the following disclaimer in the documentation
12
+ # and/or other materials provided with the distribution.
13
+ # * Neither the name of InterDigital Communications, Inc nor the names of its
14
+ # contributors may be used to endorse or promote products derived from this
15
+ # software without specific prior written permission.
16
+
17
+ # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
18
+ # THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
19
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
20
+ # NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21
+ # PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
22
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25
+ # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26
+ # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27
+ # OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28
+ # ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ from typing import Any
31
+
32
+ import torch
33
+ import torch.nn as nn
34
+ import torch.nn.functional as F
35
+ from torch import Tensor
36
+ from torch.autograd import Function
37
+ import torch.utils.checkpoint as checkpoint
38
+
39
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
40
+
41
+ from compressai.layers import GDN
42
+
43
+ __all__ = [
44
+ "AttentionBlock",
45
+ "MaskedConv2d",
46
+ "ResidualBlock",
47
+ "ResidualBlockUpsample",
48
+ "ResidualBlockWithStride",
49
+ "conv3x3",
50
+ "subpel_conv3x3",
51
+ "QReLU",
52
+ "RSTB",
53
+ "CausalAttentionModule",
54
+ ]
55
+
56
+
57
+ class MaskedConv2d(nn.Conv2d):
58
+ r"""Masked 2D convolution implementation, mask future "unseen" pixels.
59
+ Useful for building auto-regressive network components.
60
+
61
+ Introduced in `"Conditional Image Generation with PixelCNN Decoders"
62
+ <https://arxiv.org/abs/1606.05328>`_.
63
+
64
+ Inherits the same arguments as a `nn.Conv2d`. Use `mask_type='A'` for the
65
+ first layer (which also masks the "current pixel"), `mask_type='B'` for the
66
+ following layers.
67
+ """
68
+
69
+ def __init__(self, *args: Any, mask_type: str = "A", **kwargs: Any):
70
+ super().__init__(*args, **kwargs)
71
+
72
+ if mask_type not in ("A", "B"):
73
+ raise ValueError(f'Invalid "mask_type" value "{mask_type}"')
74
+
75
+ self.register_buffer("mask", torch.ones_like(self.weight.data))
76
+ _, _, h, w = self.mask.size()
77
+ self.mask[:, :, h // 2, w // 2 + (mask_type == "B"):] = 0
78
+ self.mask[:, :, h // 2 + 1:] = 0
79
+
80
+ def forward(self, x: Tensor) -> Tensor:
81
+ # TODO(begaintj): weight assigment is not supported by torchscript
82
+ self.weight.data *= self.mask
83
+ return super().forward(x)
84
+
85
+
86
+ def conv3x3(in_ch: int, out_ch: int, stride: int = 1) -> nn.Module:
87
+ """3x3 convolution with padding."""
88
+ return nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=stride, padding=1)
89
+
90
+
91
+ def subpel_conv3x3(in_ch: int, out_ch: int, r: int = 1) -> nn.Sequential:
92
+ """3x3 sub-pixel convolution for up-sampling."""
93
+ return nn.Sequential(
94
+ nn.Conv2d(in_ch, out_ch * r ** 2, kernel_size=3, padding=1), nn.PixelShuffle(r)
95
+ )
96
+
97
+
98
+ def conv1x1(in_ch: int, out_ch: int, stride: int = 1) -> nn.Module:
99
+ """1x1 convolution."""
100
+ return nn.Conv2d(in_ch, out_ch, kernel_size=1, stride=stride)
101
+
102
+
103
+ class ResidualBlockWithStride(nn.Module):
104
+ """Residual block with a stride on the first convolution.
105
+
106
+ Args:
107
+ in_ch (int): number of input channels
108
+ out_ch (int): number of output channels
109
+ stride (int): stride value (default: 2)
110
+ """
111
+
112
+ def __init__(self, in_ch: int, out_ch: int, stride: int = 2):
113
+ super().__init__()
114
+ self.conv1 = conv3x3(in_ch, out_ch, stride=stride)
115
+ self.leaky_relu = nn.LeakyReLU(inplace=True)
116
+ self.conv2 = conv3x3(out_ch, out_ch)
117
+ self.gdn = GDN(out_ch)
118
+ if stride != 1 or in_ch != out_ch:
119
+ self.skip = conv1x1(in_ch, out_ch, stride=stride)
120
+ else:
121
+ self.skip = None
122
+
123
+ def forward(self, x: Tensor) -> Tensor:
124
+ identity = x
125
+ out = self.conv1(x)
126
+ out = self.leaky_relu(out)
127
+ out = self.conv2(out)
128
+ out = self.gdn(out)
129
+
130
+ if self.skip is not None:
131
+ identity = self.skip(x)
132
+
133
+ out += identity
134
+ return out
135
+
136
+
137
+ class ResidualBlockUpsample(nn.Module):
138
+ """Residual block with sub-pixel upsampling on the last convolution.
139
+
140
+ Args:
141
+ in_ch (int): number of input channels
142
+ out_ch (int): number of output channels
143
+ upsample (int): upsampling factor (default: 2)
144
+ """
145
+
146
+ def __init__(self, in_ch: int, out_ch: int, upsample: int = 2):
147
+ super().__init__()
148
+ self.subpel_conv = subpel_conv3x3(in_ch, out_ch, upsample)
149
+ self.leaky_relu = nn.LeakyReLU(inplace=True)
150
+ self.conv = conv3x3(out_ch, out_ch)
151
+ self.igdn = GDN(out_ch, inverse=True)
152
+ self.upsample = subpel_conv3x3(in_ch, out_ch, upsample)
153
+
154
+ def forward(self, x: Tensor) -> Tensor:
155
+ identity = x
156
+ out = self.subpel_conv(x)
157
+ out = self.leaky_relu(out)
158
+ out = self.conv(out)
159
+ out = self.igdn(out)
160
+ identity = self.upsample(x)
161
+ out += identity
162
+ return out
163
+
164
+
165
+ class ResidualBlock(nn.Module):
166
+ """Simple residual block with two 3x3 convolutions.
167
+
168
+ Args:
169
+ in_ch (int): number of input channels
170
+ out_ch (int): number of output channels
171
+ """
172
+
173
+ def __init__(self, in_ch: int, out_ch: int):
174
+ super().__init__()
175
+ self.conv1 = conv3x3(in_ch, out_ch)
176
+ self.leaky_relu = nn.LeakyReLU(inplace=True)
177
+ self.conv2 = conv3x3(out_ch, out_ch)
178
+ if in_ch != out_ch:
179
+ self.skip = conv1x1(in_ch, out_ch)
180
+ else:
181
+ self.skip = None
182
+
183
+ def forward(self, x: Tensor) -> Tensor:
184
+ identity = x
185
+
186
+ out = self.conv1(x)
187
+ out = self.leaky_relu(out)
188
+ out = self.conv2(out)
189
+ out = self.leaky_relu(out)
190
+
191
+ if self.skip is not None:
192
+ identity = self.skip(x)
193
+
194
+ out = out + identity
195
+ return out
196
+
197
+
198
+ class AttentionBlock(nn.Module):
199
+ """Self attention block.
200
+
201
+ Simplified variant from `"Learned Image Compression with
202
+ Discretized Gaussian Mixture Likelihoods and Attention Modules"
203
+ <https://arxiv.org/abs/2001.01568>`_, by Zhengxue Cheng, Heming Sun, Masaru
204
+ Takeuchi, Jiro Katto.
205
+
206
+ Args:
207
+ N (int): Number of channels
208
+ """
209
+
210
+ def __init__(self, N: int):
211
+ super().__init__()
212
+
213
+ class ResidualUnit(nn.Module):
214
+ """Simple residual unit."""
215
+
216
+ def __init__(self):
217
+ super().__init__()
218
+ self.conv = nn.Sequential(
219
+ conv1x1(N, N // 2),
220
+ nn.ReLU(inplace=True),
221
+ conv3x3(N // 2, N // 2),
222
+ nn.ReLU(inplace=True),
223
+ conv1x1(N // 2, N),
224
+ )
225
+ self.relu = nn.ReLU(inplace=True)
226
+
227
+ def forward(self, x: Tensor) -> Tensor:
228
+ identity = x
229
+ out = self.conv(x)
230
+ out += identity
231
+ out = self.relu(out)
232
+ return out
233
+
234
+ self.conv_a = nn.Sequential(ResidualUnit(), ResidualUnit(), ResidualUnit())
235
+
236
+ self.conv_b = nn.Sequential(
237
+ ResidualUnit(),
238
+ ResidualUnit(),
239
+ ResidualUnit(),
240
+ conv1x1(N, N),
241
+ )
242
+
243
+ def forward(self, x: Tensor) -> Tensor:
244
+ identity = x
245
+ a = self.conv_a(x)
246
+ b = self.conv_b(x)
247
+ out = a * torch.sigmoid(b)
248
+ out += identity
249
+ return out
250
+
251
+
252
+ class QReLU(Function):
253
+ """QReLU
254
+
255
+ Clamps the input to the given bit-depth range. The input is assumed to be
+ integer-valued (as produced by an integer network); non-integer inputs are
+ simply clamped, without any rounding.
259
+
260
+ Pre-computed scale with gamma function is used for backward computation.
261
+
262
+ More details can be found in
263
+ `"Integer networks for data compression with latent-variable models"
264
+ <https://openreview.net/pdf?id=S1zz2i0cY7>`_,
265
+ by Johannes Ballé, Nick Johnston and David Minnen, ICLR in 2019
266
+
267
+ Args:
268
+ input: a tensor data
269
+ bit_depth: source bit-depth (used for clamping)
270
+ beta: a parameter for modeling the gradient during backward computation
271
+ """
272
+
273
+ @staticmethod
274
+ def forward(ctx, input, bit_depth, beta):
275
+ # TODO(choih): allow to use adaptive scale instead of
276
+ # pre-computed scale with gamma function
277
+ ctx.alpha = 0.9943258522851727
278
+ ctx.beta = beta
279
+ ctx.max_value = 2 ** bit_depth - 1
280
+ ctx.save_for_backward(input)
281
+
282
+ return input.clamp(min=0, max=ctx.max_value)
283
+
284
+ @staticmethod
285
+ def backward(ctx, grad_output):
286
+ grad_input = None
287
+ (input,) = ctx.saved_tensors
288
+
289
+ grad_input = grad_output.clone()
290
+ grad_sub = (
291
+ torch.exp(
292
+ (-ctx.alpha ** ctx.beta)
293
+ * torch.abs(2.0 * input / ctx.max_value - 1) ** ctx.beta
294
+ )
295
+ * grad_output.clone()
296
+ )
297
+
298
+ grad_input[input < 0] = grad_sub[input < 0]
299
+ grad_input[input > ctx.max_value] = grad_sub[input > ctx.max_value]
300
+
301
+ return grad_input, None, None
302
+
303
+
304
+ class PatchEmbed(nn.Module):
305
+ def __init__(self):
306
+ super().__init__()
307
+
308
+ def forward(self, x):
309
+ x = x.flatten(2).transpose(1, 2) # B Ph*Pw C
310
+ return x
311
+
312
+ def flops(self):
313
+ flops = 0
314
+ return flops
315
+
316
+
317
+ class PatchUnEmbed(nn.Module):
318
+ def __init__(self):
319
+ super().__init__()
320
+
321
+ def forward(self, x, x_size):
322
+ B, HW, C = x.shape
323
+ x = x.transpose(1, 2).view(B, -1, x_size[0], x_size[1])
324
+ return x
325
+
326
+ def flops(self):
327
+ flops = 0
328
+ return flops
329
+
330
+
331
+ class Mlp(nn.Module):
332
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
333
+ super().__init__()
334
+ out_features = out_features or in_features
335
+ hidden_features = hidden_features or in_features
336
+ self.fc1 = nn.Linear(in_features, hidden_features)
337
+ self.act = act_layer()
338
+ self.fc2 = nn.Linear(hidden_features, out_features)
339
+ self.drop = nn.Dropout(drop)
340
+
341
+ def forward(self, x):
342
+ x = self.fc1(x)
343
+ x = self.act(x)
344
+ x = self.drop(x)
345
+ x = self.fc2(x)
346
+ x = self.drop(x)
347
+ return x
348
+
349
+
350
+ def window_partition(x, window_size):
351
+ """
352
+ Args:
353
+ x: (B, H, W, C)
354
+ window_size (int): window size
355
+ Returns:
356
+ windows: (num_windows*B, window_size, window_size, C)
357
+ """
358
+ B, H, W, C = x.shape
359
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
360
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
361
+ return windows
362
+
363
+
364
+ def window_reverse(windows, window_size, H, W):
365
+ """
366
+ Args:
367
+ windows: (num_windows*B, window_size, window_size, C)
368
+ window_size (int): Window size
369
+ H (int): Height of image
370
+ W (int): Width of image
371
+ Returns:
372
+ x: (B, H, W, C)
373
+ """
374
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
375
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
376
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
377
+ return x
378
+
379
+
380
+ class WindowAttention(nn.Module):
381
+ r""" Window based multi-head self attention (W-MSA) module with relative position bias.
382
+ It supports both of shifted and non-shifted window.
383
+ Args:
384
+ dim (int): Number of input channels.
385
+ window_size (tuple[int]): The height and width of the window.
386
+ num_heads (int): Number of attention heads.
387
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
388
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
389
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
390
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
391
+ """
392
+
393
+ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
394
+
395
+ super().__init__()
396
+ self.dim = dim
397
+ self.window_size = window_size # Wh, Ww
398
+ self.num_heads = num_heads
399
+ head_dim = dim // num_heads
400
+ self.scale = qk_scale or head_dim ** -0.5
401
+
402
+ # define a parameter table of relative position bias
403
+ self.relative_position_bias_table = nn.Parameter(
404
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
405
+
406
+ # get pair-wise relative position index for each token inside the window
407
+ coords_h = torch.arange(self.window_size[0])
408
+ coords_w = torch.arange(self.window_size[1])
409
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
410
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
411
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
412
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
413
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
414
+ relative_coords[:, :, 1] += self.window_size[1] - 1
415
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
416
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
417
+ self.register_buffer("relative_position_index", relative_position_index)
418
+
419
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
420
+ self.attn_drop = nn.Dropout(attn_drop)
421
+ self.proj = nn.Linear(dim, dim)
422
+
423
+ self.proj_drop = nn.Dropout(proj_drop)
424
+
425
+ trunc_normal_(self.relative_position_bias_table, std=.02)
426
+ self.softmax = nn.Softmax(dim=-1)
427
+
428
+ def forward(self, x, mask=None):
429
+ """
430
+ Args:
431
+ x: input features with shape of (num_windows*B, N, C)
432
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
433
+ """
434
+ B_, N, C = x.shape
435
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
436
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
437
+
438
+ q = q * self.scale
439
+ attn = (q @ k.transpose(-2, -1))
440
+
441
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
442
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
443
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
444
+ attn = attn + relative_position_bias.unsqueeze(0)
445
+
446
+ if mask is not None:
447
+ nW = mask.shape[0]
448
+ attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
449
+ attn = attn.view(-1, self.num_heads, N, N)
450
+ attn = self.softmax(attn)
451
+ else:
452
+ attn = self.softmax(attn)
453
+
454
+ attn = self.attn_drop(attn)
455
+
456
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
457
+ x = self.proj(x)
458
+ x = self.proj_drop(x)
459
+ return x
460
+
461
+ def extra_repr(self) -> str:
462
+ return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
463
+
464
+ def flops(self, N):
465
+ # calculate flops for 1 window with token length of N
466
+ flops = 0
467
+ # qkv = self.qkv(x)
468
+ flops += N * self.dim * 3 * self.dim
469
+ # attn = (q @ k.transpose(-2, -1))
470
+ flops += self.num_heads * N * (self.dim // self.num_heads) * N
471
+ # x = (attn @ v)
472
+ flops += self.num_heads * N * N * (self.dim // self.num_heads)
473
+ # x = self.proj(x)
474
+ flops += N * self.dim * self.dim
475
+ return flops
476
+
477
+
478
+ class SwinTransformerBlock(nn.Module):
479
+ r""" Swin Transformer Block.
480
+ Args:
481
+ dim (int): Number of input channels.
482
+ input_resolution (tuple[int]): Input resolution.
483
+ num_heads (int): Number of attention heads.
484
+ window_size (int): Window size.
485
+ shift_size (int): Shift size for SW-MSA.
486
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
487
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
488
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
489
+ drop (float, optional): Dropout rate. Default: 0.0
490
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
491
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
492
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
493
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
494
+ """
495
+
496
+ def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
497
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
498
+ act_layer=nn.GELU, norm_layer=nn.LayerNorm):
499
+ super().__init__()
500
+ self.dim = dim
501
+ self.input_resolution = input_resolution
502
+ self.num_heads = num_heads
503
+ self.window_size = window_size
504
+ self.shift_size = shift_size
505
+ self.mlp_ratio = mlp_ratio
506
+ if min(self.input_resolution) <= self.window_size:
507
+ # if window size is larger than input resolution, we don't partition windows
508
+ self.shift_size = 0
509
+ self.window_size = min(self.input_resolution)
510
+ assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
511
+
512
+ self.norm1 = norm_layer(dim)
513
+ self.attn = WindowAttention(
514
+ dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
515
+ qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
516
+
517
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
518
+ self.norm2 = norm_layer(dim)
519
+ mlp_hidden_dim = int(dim * mlp_ratio)
520
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
521
+
522
+ if self.shift_size > 0:
523
+ attn_mask = self.calculate_mask(self.input_resolution)
524
+ else:
525
+ attn_mask = None
526
+
527
+ self.register_buffer("attn_mask", attn_mask)
528
+
529
+ def calculate_mask(self, x_size):
530
+ # calculate attention mask for SW-MSA
531
+ H, W = x_size
532
+ img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
533
+ h_slices = (slice(0, -self.window_size),
534
+ slice(-self.window_size, -self.shift_size),
535
+ slice(-self.shift_size, None))
536
+ w_slices = (slice(0, -self.window_size),
537
+ slice(-self.window_size, -self.shift_size),
538
+ slice(-self.shift_size, None))
539
+ cnt = 0
540
+ for h in h_slices:
541
+ for w in w_slices:
542
+ img_mask[:, h, w, :] = cnt
543
+ cnt += 1
544
+
545
+ mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
546
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
547
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
548
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
549
+
550
+ return attn_mask
551
+
552
+ def forward(self, x, x_size):
553
+ H, W = x_size
554
+ B, L, C = x.shape
555
+ # assert L == H * W, "input feature has wrong size"
556
+
557
+ shortcut = x
558
+ x = self.norm1(x)
559
+ x = x.view(B, H, W, C)
560
+
561
+ # cyclic shift
562
+ if self.shift_size > 0:
563
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
564
+ else:
565
+ shifted_x = x
566
+
567
+ # partition windows
568
+ x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
569
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
570
+
571
+ # W-MSA/SW-MSA (recompute the attention mask when the runtime input size differs from the resolution the block was built for, e.g. when testing on images whose shapes are not multiples of the window size)
572
+ if self.input_resolution == x_size:
573
+ attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C
574
+ else:
575
+ attn_windows = self.attn(x_windows, mask=self.calculate_mask(x_size).to(x.device))
576
+
577
+ # merge windows
578
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
579
+ shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
580
+
581
+ # reverse cyclic shift
582
+ if self.shift_size > 0:
583
+ x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
584
+ else:
585
+ x = shifted_x
586
+ x = x.view(B, H * W, C)
587
+
588
+ # FFN
589
+ x = shortcut + self.drop_path(x)
590
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
591
+
592
+ return x
593
+
594
+ def extra_repr(self) -> str:
595
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
596
+ f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
597
+
598
+ def flops(self):
599
+ flops = 0
600
+ H, W = self.input_resolution
601
+ # norm1
602
+ flops += self.dim * H * W
603
+ # W-MSA/SW-MSA
604
+ nW = H * W / self.window_size / self.window_size
605
+ flops += nW * self.attn.flops(self.window_size * self.window_size)
606
+ # mlp
607
+ flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
608
+ # norm2
609
+ flops += self.dim * H * W
610
+ return flops
611
+
612
+
613
+ class BasicLayer(nn.Module):
614
+ """ A basic Swin Transformer layer for one stage.
615
+ Args:
616
+ dim (int): Number of input channels.
617
+ input_resolution (tuple[int]): Input resolution.
618
+ depth (int): Number of blocks.
619
+ num_heads (int): Number of attention heads.
620
+ window_size (int): Local window size.
621
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
622
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
623
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
624
+ drop (float, optional): Dropout rate. Default: 0.0
625
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
626
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
627
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
628
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
629
+ """
630
+
631
+ def __init__(self, dim, input_resolution, depth, num_heads, window_size,
632
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
633
+ drop_path=0., norm_layer=nn.LayerNorm, use_checkpoint=False):
634
+
635
+ super().__init__()
636
+ self.dim = dim
637
+ self.input_resolution = input_resolution
638
+ self.depth = depth
639
+ self.use_checkpoint = use_checkpoint
640
+
641
+ # build blocks
642
+ self.blocks = nn.ModuleList([
643
+ SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
644
+ num_heads=num_heads, window_size=window_size,
645
+ shift_size=0 if (i % 2 == 0) else window_size // 2,
646
+ mlp_ratio=mlp_ratio,
647
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
648
+ drop=drop, attn_drop=attn_drop,
649
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
650
+ norm_layer=norm_layer)
651
+ for i in range(depth)])
652
+
653
+ def forward(self, x, x_size):
654
+ for blk in self.blocks:
655
+ if self.use_checkpoint:
656
+ x = checkpoint.checkpoint(blk, x, x_size)  # x_size must be forwarded when checkpointing
657
+ else:
658
+ x = blk(x, x_size)
659
+ return x
660
+
661
+ def extra_repr(self) -> str:
662
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
663
+
664
+ def flops(self):
665
+ flops = 0
666
+ for blk in self.blocks:
667
+ flops += blk.flops()
668
+ return flops
669
+
670
+
671
+ class RSTB(nn.Module):
672
+ """Residual Swin Transformer Block (RSTB).
673
+ Args:
674
+ dim (int): Number of input channels.
675
+ input_resolution (tuple[int]): Input resolution.
676
+ depth (int): Number of blocks.
677
+ num_heads (int): Number of attention heads.
678
+ window_size (int): Local window size.
679
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
680
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
681
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
682
+ drop (float, optional): Dropout rate. Default: 0.0
683
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
684
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
685
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
686
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
687
+ """
688
+
689
+ def __init__(self, dim, input_resolution, depth, num_heads, window_size,
690
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
691
+ drop_path=0., norm_layer=nn.LayerNorm, use_checkpoint=False):
692
+ super(RSTB, self).__init__()
693
+
694
+ self.dim = dim
695
+ self.input_resolution = input_resolution
696
+
697
+ self.residual_group = BasicLayer(dim=dim,
698
+ input_resolution=input_resolution,
699
+ depth=depth,
700
+ num_heads=num_heads,
701
+ window_size=window_size,
702
+ mlp_ratio=mlp_ratio,
703
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
704
+ drop=drop, attn_drop=attn_drop,
705
+ drop_path=drop_path,
706
+ norm_layer=norm_layer,
707
+ use_checkpoint=use_checkpoint
708
+ )
709
+
710
+ self.patch_embed = PatchEmbed()
711
+ self.patch_unembed = PatchUnEmbed()
712
+
713
+ def forward(self, x, x_size):
714
+ return self.patch_unembed(self.residual_group(self.patch_embed(x), x_size), x_size) + x
715
+
716
+ def flops(self):
717
+ flops = 0
718
+ flops += self.residual_group.flops()
719
+ flops += self.patch_embed.flops()
720
+ flops += self.patch_unembed.flops()
721
+
722
+ return flops
723
+
724
+
725
+ class CausalAttentionModule(nn.Module):
726
+ r""" Causal multi-head self attention module.
727
+
728
+ Args:
729
+ dim (int): Number of input channels.
730
+ window_size (tuple[int]): The height and width of the window.
731
+ num_heads (int): Number of attention heads.
732
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
733
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
734
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
735
+ """
736
+
737
+ def __init__(self, dim, out_dim, block_len=5, num_heads=16, mlp_ratio=4., qkv_bias=True, qk_scale=None,
738
+ attn_drop=0.):
739
+ super().__init__()
740
+ assert dim % num_heads == 0
741
+ self.dim = dim
742
+ self.num_heads = num_heads
743
+ head_dim = dim // num_heads
744
+ self.block_size = block_len * block_len
745
+ self.scale = qk_scale or head_dim ** -0.5
746
+ self.attn_drop = nn.Dropout(attn_drop)
747
+
748
+ self.norm1 = nn.LayerNorm(dim)
749
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
750
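+ # Causal mask over the flattened 5x5 neighborhood (block_len=5): the first 12
+ # entries (the pixels preceding the center in raster order) are kept, while the
+ # center and everything after it are zeroed out.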
+ self.mask = torch.Tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).view(1, self.block_size, 1)
753
+
754
+ # define a parameter table of relative position bias
755
+ self.relative_position_bias_table = nn.Parameter(
756
+ torch.zeros((2 * block_len - 1) * (2 * block_len - 1), num_heads)) # 2*P-1 * 2*P-1, num_heads
757
+
758
+ # get pair-wise relative position index for each token inside the window
759
+ coords_h = torch.arange(block_len)
760
+ coords_w = torch.arange(block_len)
761
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, P, P
762
+ coords_flatten = torch.flatten(coords, 1) # 2, P*P
763
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, PP, PP
764
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # PP, PP, 2
765
+ relative_coords[:, :, 0] += block_len - 1 # shift to start from 0
766
+ relative_coords[:, :, 1] += block_len - 1
767
+ relative_coords[:, :, 0] *= 2 * block_len - 1
768
+ relative_position_index = relative_coords.sum(-1) # PP, PP
769
+ self.register_buffer("relative_position_index", relative_position_index)
770
+
771
+ self.softmax = nn.Softmax(dim=-1)
772
+
773
+ self.norm2 = nn.LayerNorm(dim)
774
+ mlp_hidden_dim = int(dim * mlp_ratio)
775
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=nn.GELU, drop=attn_drop)
776
+ self.proj = nn.Linear(dim, out_dim)
777
+
778
+ def forward(self, x):
779
+ B, C, H, W = x.shape
780
+ x_unfold = F.unfold(x, kernel_size=(5, 5), padding=2) # B, CPP, HW
781
+ x_unfold = x_unfold.reshape(B, C, self.block_size, H * W).permute(0, 3, 2, 1).contiguous().view(-1, self.block_size, C)  # BHW, PP, C
784
+
785
+ x_masked = x_unfold * self.mask.to(x_unfold.device)
786
+ out = self.norm1(x_masked)
787
+ qkv = self.qkv(out).reshape(B * H * W, self.block_size, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)  # 3, BHW, num_heads, PP, C
790
+ q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple); each is BHW, num_heads, PP, C//num_heads
792
+ q = q * self.scale
793
+ attn = (q @ k.transpose(-2, -1)) # BHW, num_heads, PP, PP
794
+
795
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
796
+ self.block_size, self.block_size, -1) # PP, PP, num_heads
797
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # num_heads, PP, PP
798
+ attn = attn + relative_position_bias.unsqueeze(0)
799
+
800
+ attn = self.softmax(attn)
801
+ attn = self.attn_drop(attn)
802
+ out = (attn @ v).transpose(1, 2).reshape(B * H * W, self.block_size, C)  # [BHW, num_heads, PP, PP] @ [BHW, num_heads, PP, C//num_heads]
804
+ out += x_masked
805
+ out_sumed = torch.sum(out, dim=1).reshape(B, H * W, C)
806
+ out = self.norm2(out_sumed)
807
+ out = self.mlp(out)
808
+ out += out_sumed
809
+
810
+ out = self.proj(out)
811
+ out = out.reshape(B, H, W, -1).permute(0, 3, 1, 2) # B, C_out, H, W
812
+
813
+ return out
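As a quick sanity check of the shape contracts in this file, a small sketch (assuming the package layout of this commit, with torch/timm/compressai installed):

```python
import torch

from vae.RSTB import RSTB, window_partition, window_reverse

# window_partition / window_reverse are exact inverses whenever H and W are
# multiples of the window size.
x = torch.randn(2, 32, 32, 96)            # B, H, W, C
windows = window_partition(x, 8)          # (2 * 16, 8, 8, 96)
assert torch.equal(window_reverse(windows, 8, 32, 32), x)

# RSTB takes a (B, C, H, W) feature map plus an explicit spatial size,
# flattens it to tokens internally, and returns the same (B, C, H, W) shape,
# so it can sit directly between strided conv stages.
block = RSTB(dim=96, input_resolution=(32, 32), depth=2, num_heads=4,
             window_size=8)
feat = torch.randn(2, 96, 32, 32)
out = block(feat, (32, 32))
assert out.shape == feat.shape
```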
vae/__init__.py ADDED
@@ -0,0 +1,57 @@
+ """Local ROI-VAE compression package.
+
+ This package was previously named `compression`, but was renamed to `vae` to
+ avoid colliding with optional third-party imports (e.g. via Gradio/fsspec).
+
+ It intentionally uses lazy attribute loading to keep import-time overhead low.
+ """
+
+ from __future__ import annotations
+
+ from importlib import import_module
+ from typing import TYPE_CHECKING
+
+
+ __all__ = [
+     "TIC",
+     "ModifiedTIC",
+     "load_checkpoint",
+     "compute_padding",
+     "compress_image",
+     "highlight_roi",
+     "create_comparison_grid",
+     "RSTB",
+     "CausalAttentionModule",
+ ]
+
+
+ _EXPORTS: dict[str, tuple[str, str]] = {
+     "TIC": (".tic_model", "TIC"),
+     "ModifiedTIC": (".roi_tic", "ModifiedTIC"),
+     "load_checkpoint": (".roi_tic", "load_checkpoint"),
+     "compute_padding": (".utils", "compute_padding"),
+     "compress_image": (".utils", "compress_image"),
+     "highlight_roi": (".visualization", "highlight_roi"),
+     "create_comparison_grid": (".visualization", "create_comparison_grid"),
+     "RSTB": (".RSTB", "RSTB"),
+     "CausalAttentionModule": (".RSTB", "CausalAttentionModule"),
+ }
+
+
+ def __getattr__(name: str):
+     if name not in _EXPORTS:
+         raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+     module_name, attr_name = _EXPORTS[name]
+     module = import_module(module_name, package=__name__)
+     value = getattr(module, attr_name)
+     globals()[name] = value
+     return value
+
+
+ if TYPE_CHECKING:
+     from .RSTB import CausalAttentionModule, RSTB
+     from .roi_tic import ModifiedTIC, load_checkpoint
+     from .tic_model import TIC
+     from .utils import compress_image, compute_padding
+     from .visualization import create_comparison_grid, highlight_roi
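The module-level `__getattr__` above is the PEP 562 lazy-import pattern: `import vae` stays cheap because torch/compressai are only pulled in when one of the exported names is first accessed. A quick sketch of the observable behavior (assuming the package is importable):

```python
import sys

import vae  # fast: does not import torch or any submodule yet

assert "vae.tic_model" not in sys.modules
_ = vae.TIC                            # first access triggers the real import
assert "vae.tic_model" in sys.modules  # ...and the value is cached in globals()
```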
vae/roi_tic.py ADDED
@@ -0,0 +1,132 @@
+ """
+ ROI-aware TIC model for region-based compression.
+ """
+
+ import torch
+ import torch.nn.functional as F
+ from .tic_model import TIC
+
+
+ class ModifiedTIC(TIC):
+     """Modified TIC that uses a pre-computed binary mask with a sigma parameter for ROI-based compression."""
+
+     def __init__(self, N=192, M=192):
+         super().__init__(N=N, M=M)
+         # Cache for pre-allocated constant tensors (device -> tensor)
+         self._ones_cache = {}
+         self._sigma_cache = {}
+
+     def forward(self, x, mask, sigma=0.3):
+         """
+         Forward pass with ROI mask and quality factor.
+
+         Args:
+             x: input image tensor [B, C, H, W]
+             mask: ROI mask (1 for ROI, 0 for background) [B, 1, H, W]
+             sigma: quality factor for the background (0.01-1.0, lower = more compression).
+                 Can be a scalar (same for all frames) or a tensor [B] for per-frame values.
+
+         Returns:
+             dict with compression outputs (y_hat, y, similarity, x_hat, likelihoods)
+         """
+         x_size = (x.shape[2], x.shape[3])
+         batch_size = x.shape[0]
+         device = mask.device
+
+         # Get or create the cached ones tensor for this device
+         if device not in self._ones_cache:
+             self._ones_cache[device] = torch.tensor(1.0, device=device)
+         ones_tensor = self._ones_cache[device]
+
+         # Convert sigma to a tensor suitable for broadcasting
+         if isinstance(sigma, (int, float)):
+             # Scalar sigma - use the cache
+             cache_key = (device, sigma)
+             if cache_key not in self._sigma_cache:
+                 self._sigma_cache[cache_key] = torch.tensor(sigma, device=device)
+             sigma_tensor = self._sigma_cache[cache_key]
+         else:
+             # Batched sigma - reshape to [B, 1, 1, 1]
+             if sigma.dim() == 1:
+                 sigma_tensor = sigma.view(batch_size, 1, 1, 1)
+             else:
+                 sigma_tensor = sigma
+
+         # Convert the binary mask to quality factors (broadcasting handles per-frame sigma)
+         similarity_loss = torch.where(mask > 0.5, ones_tensor, sigma_tensor)
+         similarity_imp = torch.where(mask > 0.5, ones_tensor, sigma_tensor)
+
+         # Downsample the mask to 1/2 resolution for simi_net.
+         # simi_net has 3 stride-2 convolutions (8x downsampling), so an input at 1/2
+         # yields an output at 1/16, which matches the y_codec_a6 dimensions
+         # (g_a's three stride-2 stages plus g_a6's stride-2 stage = 16x).
+         # Nearest-neighbor is used for binary masks (faster, no quality loss for binary data).
+         similarity_down = F.interpolate(similarity_imp, scale_factor=0.5, mode='nearest')
+
+         similarity_up = F.interpolate(similarity_loss, scale_factor=2, mode='nearest')
+         similarity_up_repeated = similarity_up.repeat(1, 3, 1, 1)
+
+         # simi_net downsamples by 8x: 128x128 -> 16x16 to match y_codec_a6
+         similarities_channel = self.simi_net(similarity_down)
+         similarities_sigmoid = torch.sigmoid(similarities_channel)
+
+         y_codec = self.g_a(x, x_size)
+         y_codec_a6 = self.g_a6(y_codec)
+
+         y_import = self.sub_impor_net(y_codec)
+         y_tanh = self.tanh(y_import)
+         y_soft = self.softsign(y_tanh)
+
+         y_imp = y_soft + similarities_sigmoid
+         y = y_codec_a6 * y_imp
+
+         z = self.h_a(y, x_size)
+         z_hat, z_likelihoods = self.entropy_bottleneck(z)
+         params = self.h_s(z_hat, x_size)
+
+         y_hat = self.gaussian_conditional.quantize(
+             y, "noise" if self.training else "dequantize"
+         )
+         ctx_params = self.context_prediction(y_hat)
+         gaussian_params = self.entropy_parameters(
+             torch.cat((params, ctx_params), dim=1)
+         )
+         scales_hat, means_hat = gaussian_params.chunk(2, 1)
+         _, y_likelihoods = self.gaussian_conditional(y, scales_hat, means=means_hat)
+         x_hat = self.g_s(y_hat, x_size)
+
+         return {
+             "y_hat": y_hat,
+             "y": y,
+             "similarity": similarity_up_repeated,
+             "x_hat": x_hat,
+             "likelihoods": {"y": y_likelihoods, "z": z_likelihoods},
+         }
+
+
+ def load_checkpoint(checkpoint_path: str, N: int = 192, M: int = 192, device: str = 'cuda') -> ModifiedTIC:
+     """
+     Load a TIC model from a checkpoint.
+
+     Args:
+         checkpoint_path: Path to the .pth.tar checkpoint
+         N: Number of channels (default 192)
+         M: Number of channels in the expansion layers (default 192)
+         device: 'cuda' or 'cpu'
+
+     Returns:
+         Loaded ModifiedTIC model in eval mode
+     """
+     model = ModifiedTIC(N=N, M=M).to(device)
+     checkpoint = torch.load(checkpoint_path, map_location=device)
+
+     # The checkpoint is already stored in the (older compressai) format that the
+     # model expects, so the state dict is used as-is without key conversion.
+     state_dict = checkpoint["state_dict"]
+
+     model.load_state_dict(state_dict)
+     model.eval()
+     model.update(force=True)
+     return model
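Putting the pieces together, a sketch of ROI-aware inference with this class (untested here: the checkpoint name matches the LFS entries tracked in `.gitattributes`, while the 256x256 input and the hand-drawn center mask are illustrative assumptions):

```python
import math

import torch

from vae import load_checkpoint

device = "cuda" if torch.cuda.is_available() else "cpu"
model = load_checkpoint("checkpoints/tic_lambda_0.013.pth.tar", device=device)

x = torch.rand(1, 3, 256, 256, device=device)      # image in [0, 1]
mask = torch.zeros(1, 1, 256, 256, device=device)  # 1 = ROI, 0 = background
mask[..., 64:192, 64:192] = 1.0                    # keep the center region sharp

with torch.no_grad():
    out = model(x, mask, sigma=0.05)               # aggressive background compression

x_hat = out["x_hat"].clamp(0, 1)                   # reconstruction
bpp = sum(
    t.log().sum() / (-math.log(2) * 256 * 256)     # -log2(likelihood) per pixel
    for t in out["likelihoods"].values()
)
print(f"estimated bpp: {float(bpp):.4f}")
```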
vae/tic_model.py ADDED
@@ -0,0 +1,989 @@
1
+ import math
2
+ import time
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from compressai.entropy_models import EntropyBottleneck, GaussianConditional
9
+ from .RSTB import RSTB, CausalAttentionModule
10
+ from compressai.ans import BufferedRansEncoder, RansDecoder
11
+ from timm.models.layers import trunc_normal_
12
+ from compressai.models.utils import conv, deconv, update_registered_buffers
13
+ from compressai.layers import AttentionBlock
14
+ from PIL import Image
15
+ import numpy as np
16
+ import matplotlib
17
+ import matplotlib.pyplot as plt
18
+ from matplotlib.colors import ListedColormap
19
+
20
+ # from lseg.lseg_net import LSegNet
21
+ # import cv2
22
+ # import random
23
+ import itertools
24
+
25
+
26
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
27
+
28
+ # From Balle's tensorflow compression examples
29
+ SCALES_MIN = 0.11
30
+ SCALES_MAX = 256
31
+ SCALES_LEVELS = 64
32
+
33
+ device = "cuda" if torch.cuda.is_available() else "cpu"
34
+
35
+ def get_scale_table(min=SCALES_MIN, max=SCALES_MAX, levels=SCALES_LEVELS):
36
+ return torch.exp(torch.linspace(math.log(min), math.log(max), levels))
37
+
38
+
39
+ class Binarizer(torch.autograd.Function):
40
+ """
41
+ An elementwise straight-through function. Despite the name, the output is
+ not strictly binary: values above a threshold of 0.9 map to 1.0 and all
+ other values map to 0.2.
+
+ Input: a tensor with values in the range (0, 1)
+
+ Returns: a tensor containing only the values 1.0 and 0.2
+
+ Equation (1) in the paper
51
+ """
52
+ @staticmethod
53
+ def forward(ctx, i):
54
+ result = torch.where(i > 0.9, torch.ones_like(i), torch.full_like(i, 0.2))  # *_like keeps the constants on the input's device
55
+ return result
56
+
57
+ @staticmethod
58
+ def backward(ctx, grad_output):
59
+ return grad_output
60
+
61
+ def bin_values(x):
62
+ return Binarizer.apply(x)
63
+
64
+
65
+
66
+
67
+ class TIC(nn.Module):
68
+ """Neural image compression framework from
69
+ Lu Ming and Guo, Peiyao and Shi, Huiqing and Cao, Chuntong and Ma, Zhan:
70
+ `"Transformer-based Image Compression" <https://arxiv.org/abs/2111.06707>`, (DCC 2022).
71
+
72
+ Args:
73
+ N (int): Number of channels
74
+ M (int): Number of channels in the expansion layers (last layer of the
75
+ encoder and last layer of the hyperprior decoder)
76
+ input_resolution (int): Just used for window partition decision
77
+ """
78
+
79
+ def __init__(self, N=192, M=192):
80
+ super().__init__()
81
+
82
+ depths = [1, 2, 3, 1, 1]
83
+ num_heads = [4, 8, 16, 16, 16]
84
+ window_size = 8
85
+ mlp_ratio = 4.
86
+ qkv_bias = True
87
+ qk_scale = None
88
+ drop_rate = 0.
89
+ attn_drop_rate = 0.
90
+ drop_path_rate = 0.2
91
+ norm_layer = nn.LayerNorm
92
+ use_checkpoint = False
93
+
94
+ # stochastic depth
95
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
96
+ self.align_corners = True
97
+
98
+
99
+ self.g_a0 = conv(3, N, kernel_size=5, stride=2)
100
+ self.g_a1 = RSTB(dim=N,
101
+ input_resolution=(128, 128),
102
+ depth=depths[0],
103
+ num_heads=num_heads[0],
104
+ window_size=window_size,
105
+ mlp_ratio=mlp_ratio,
106
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
107
+ drop=drop_rate, attn_drop=attn_drop_rate,
108
+ drop_path=dpr[sum(depths[:0]):sum(depths[:1])],
109
+ norm_layer=norm_layer,
110
+ use_checkpoint=use_checkpoint,
111
+ )
112
+ self.g_a2 = conv(N, N, kernel_size=3, stride=2)
113
+ self.g_a3 = RSTB(dim=N,
114
+ input_resolution=(64, 64),
115
+ depth=depths[1],
116
+ num_heads=num_heads[1],
117
+ window_size=window_size,
118
+ mlp_ratio=mlp_ratio,
119
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
120
+ drop=drop_rate, attn_drop=attn_drop_rate,
121
+ drop_path=dpr[sum(depths[:1]):sum(depths[:2])],
122
+ norm_layer=norm_layer,
123
+ use_checkpoint=use_checkpoint,
124
+ )
125
+ self.g_a4 = conv(N, N, kernel_size=3, stride=2)
126
+ self.g_a5 = RSTB(dim=N,
127
+ input_resolution=(32, 32),
128
+ depth=depths[2],
129
+ num_heads=num_heads[2],
130
+ window_size=window_size,
131
+ mlp_ratio=mlp_ratio,
132
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
133
+ drop=drop_rate, attn_drop=attn_drop_rate,
134
+ drop_path=dpr[sum(depths[:2]):sum(depths[:3])],
135
+ norm_layer=norm_layer,
136
+ use_checkpoint=use_checkpoint,
137
+ )
138
+
139
+ self.g_a6 = conv(N, M, kernel_size=3, stride=2)
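+ # Cumulative stride of the analysis transform: g_a0/g_a2/g_a4 each halve the
+ # resolution (8x together) and g_a6 halves it once more, so y is 16x smaller
+ # than the input image.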
140
+
141
+ self.h_a0 = conv(M, N, kernel_size=3, stride=1)
142
+ self.h_a1 = RSTB(dim=N,
143
+ input_resolution=(16, 16),
144
+ depth=depths[3],
145
+ num_heads=num_heads[3],
146
+ window_size=window_size // 2,
147
+ mlp_ratio=mlp_ratio,
148
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
149
+ drop=drop_rate, attn_drop=attn_drop_rate,
150
+ drop_path=dpr[sum(depths[:3]):sum(depths[:4])],
151
+ norm_layer=norm_layer,
152
+ use_checkpoint=use_checkpoint,
153
+ )
154
+ self.h_a2 = conv(N, N, kernel_size=3, stride=2)
155
+ self.h_a3 = RSTB(dim=N,
156
+ input_resolution=(8, 8),
157
+ depth=depths[4],
158
+ num_heads=num_heads[4],
159
+ window_size=window_size // 2,
160
+ mlp_ratio=mlp_ratio,
161
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
162
+ drop=drop_rate, attn_drop=attn_drop_rate,
163
+ drop_path=dpr[sum(depths[:4]):sum(depths[:5])],
164
+ norm_layer=norm_layer,
165
+ use_checkpoint=use_checkpoint,
166
+ )
167
+ self.h_a4 = conv(N, N, kernel_size=3, stride=2)
168
+
169
+ depths = depths[::-1]
170
+ num_heads = num_heads[::-1]
171
+ self.h_s0 = deconv(N, N, kernel_size=3, stride=2)
172
+ self.h_s1 = RSTB(dim=N,
173
+ input_resolution=(8, 8),
174
+ depth=depths[0],
175
+ num_heads=num_heads[0],
176
+ window_size=window_size // 2,
177
+ mlp_ratio=mlp_ratio,
178
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
179
+ drop=drop_rate, attn_drop=attn_drop_rate,
180
+ drop_path=dpr[sum(depths[:0]):sum(depths[:1])],
181
+ norm_layer=norm_layer,
182
+ use_checkpoint=use_checkpoint,
183
+ )
184
+ self.h_s2 = deconv(N, N, kernel_size=3, stride=2)
185
+ self.h_s3 = RSTB(dim=N,
186
+ input_resolution=(16, 16),
187
+ depth=depths[1],
188
+ num_heads=num_heads[1],
189
+ window_size=window_size // 2,
190
+ mlp_ratio=mlp_ratio,
191
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
192
+ drop=drop_rate, attn_drop=attn_drop_rate,
193
+ drop_path=dpr[sum(depths[:1]):sum(depths[:2])],
194
+ norm_layer=norm_layer,
195
+ use_checkpoint=use_checkpoint,
196
+ )
197
+ self.h_s4 = conv(N, M * 2, kernel_size=3, stride=1)
198
+
199
+ self.g_s0 = deconv(M, N, kernel_size=3, stride=2)
200
+ self.g_s1 = RSTB(dim=N,
201
+ input_resolution=(32, 32),
202
+ depth=depths[2],
203
+ num_heads=num_heads[2],
204
+ window_size=window_size,
205
+ mlp_ratio=mlp_ratio,
206
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
207
+ drop=drop_rate, attn_drop=attn_drop_rate,
208
+ drop_path=dpr[sum(depths[:2]):sum(depths[:3])],
209
+ norm_layer=norm_layer,
210
+ use_checkpoint=use_checkpoint,
211
+ )
212
+ self.g_s2 = deconv(N, N, kernel_size=3, stride=2)
213
+ self.g_s3 = RSTB(dim=N,
214
+ input_resolution=(64, 64),
215
+ depth=depths[3],
216
+ num_heads=num_heads[3],
217
+ window_size=window_size,
218
+ mlp_ratio=mlp_ratio,
219
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
220
+ drop=drop_rate, attn_drop=attn_drop_rate,
221
+ drop_path=dpr[sum(depths[:3]):sum(depths[:4])],
222
+ norm_layer=norm_layer,
223
+ use_checkpoint=use_checkpoint,
224
+ )
225
+ self.g_s4 = deconv(N, N, kernel_size=3, stride=2)
226
+ self.g_s5 = RSTB(dim=N,
227
+ input_resolution=(128, 128),
228
+ depth=depths[4],
229
+ num_heads=num_heads[4],
230
+ window_size=window_size,
231
+ mlp_ratio=mlp_ratio,
232
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
233
+ drop=drop_rate, attn_drop=attn_drop_rate,
234
+ drop_path=dpr[sum(depths[:4]):sum(depths[:5])],
235
+ norm_layer=norm_layer,
236
+ use_checkpoint=use_checkpoint,
237
+ )
238
+ self.g_s6 = deconv(N, 3, kernel_size=5, stride=2)
239
+
240
+ self.entropy_bottleneck = EntropyBottleneck(N)
241
+ self.gaussian_conditional = GaussianConditional(None)
242
+ self.context_prediction = CausalAttentionModule(M, M * 2)
243
+ # self.attetionmap = AttentionBlock(M)
244
+
245
+ self.entropy_parameters = nn.Sequential(
246
+ nn.Conv2d(M * 12 // 3, M * 10 // 3, 1),
247
+ nn.GELU(),
248
+ nn.Conv2d(M * 10 // 3, M * 8 // 3, 1),
249
+ nn.GELU(),
250
+ nn.Conv2d(M * 8 // 3, M * 6 // 3, 1),
251
+ )
252
+
253
+ self.sub_net_leaky = nn.Sequential(
254
+ conv(N,N,kernel_size=3,stride=2),
255
+ nn.LeakyReLU()
256
+ )
257
+
258
+ self.sub_net0 = nn.Sequential(
259
+ conv(N,64,kernel_size=1,stride=1),
260
+ nn.ReLU()
261
+ )
262
+ self.sub_net1 = nn.Sequential(
263
+ conv(64,64,kernel_size=3,stride=1),
264
+ nn.ReLU()
265
+ )
266
+ self.sub_net2 = conv(64,N,kernel_size=1,stride=1)
267
+
268
+ self.sub_net_channel = conv(N,M,kernel_size=1,stride=1)
269
+
270
+ self.simi_net = nn.Sequential(
271
+ conv(1,64,kernel_size=3,stride=2),
272
+ nn.ReLU(),
273
+ conv(64,128,kernel_size=3,stride=2),
274
+ nn.ReLU(),
275
+ conv(128, M, kernel_size=3, stride=2),
276
+ )
277
+
278
+ # self.net_lseg = LSegNet(
279
+ # backbone="clip_vitl16_384",
280
+ # features=256,
281
+ # crop_size=256,
282
+ # arch_option=0,
283
+ # block_depth=0,
284
+ # activation="lrelu",
285
+ # )
286
+
287
+ self.cosine_similarity = torch.nn.CosineSimilarity(dim=1)
288
+
289
+
290
+
291
+
292
+ # self.con3_3 = conv(192,192,kernel_size=3,stride=1)
293
+
294
+ self.tanh = nn.Tanh()
295
+ self.softsign = nn.Softsign()
296
+ self.relu = nn.ReLU()
297
+ self.sigmoid = nn.Sigmoid()
298
+
299
+
300
+ self.apply(self._init_weights)
301
+
302
+
303
+
304
+ def g_a(self, x, x_size=None):
305
+ if x_size is None:
306
+ x_size = x.shape[2:4]
307
+ x = self.g_a0(x)
308
+ x = self.g_a1(x, (x_size[0] // 2, x_size[1] // 2))
309
+ x = self.g_a2(x)
310
+ x = self.g_a3(x, (x_size[0] // 4, x_size[1] // 4))
311
+ x = self.g_a4(x)
312
+ x = self.g_a5(x, (x_size[0] // 8, x_size[1] // 8))
313
+ # x = self.g_a6(x)
314
+ return x
315
+
316
+ def g_s(self, x, x_size=None):
317
+ if x_size is None:
318
+ x_size = (x.shape[2] * 16, x.shape[3] * 16)
319
+ x = self.g_s0(x)
320
+ x = self.g_s1(x, (x_size[0] // 8, x_size[1] // 8))
321
+ x = self.g_s2(x)
322
+ x = self.g_s3(x, (x_size[0] // 4, x_size[1] // 4))
323
+ x = self.g_s4(x)
324
+ x = self.g_s5(x, (x_size[0] // 2, x_size[1] // 2))
325
+ x = self.g_s6(x)
326
+ return x
327
+
328
+ def h_a(self, x, x_size=None):
329
+ if x_size is None:
330
+ x_size = (x.shape[2] * 16, x.shape[3] * 16)
331
+ x = self.h_a0(x)
332
+ x = self.h_a1(x, (x_size[0] // 16, x_size[1] // 16))
333
+ x = self.h_a2(x)
334
+ x = self.h_a3(x, (x_size[0] // 32, x_size[1] // 32))
335
+ x = self.h_a4(x)
336
+ return x
337
+
338
+ def h_s(self, x, x_size=None):
339
+ if x_size is None:
340
+ x_size = (x.shape[2] * 64, x.shape[3] * 64)
341
+ x = self.h_s0(x)
342
+ x = self.h_s1(x, (x_size[0] // 32, x_size[1] // 32))
343
+ x = self.h_s2(x)
344
+ x = self.h_s3(x, (x_size[0] // 16, x_size[1] // 16))
345
+ x = self.h_s4(x)
346
+ return x
347
+
348
+
349
+ def sub_impor_net(self,x): # important map
350
+ x1 = self.sub_net_leaky(x)
351
+
352
+ x2 = self.sub_net0(x1)
353
+ x2 = self.sub_net1(x2)
354
+ x2 = self.sub_net2(x2)
355
+
356
+ x2 = x1 + x2
357
+ x3 = self.sub_net0(x2)
358
+ x3 = self.sub_net1(x3)
359
+ x3 = self.sub_net2(x3)
360
+
361
+ x3 = x2 + x3
362
+ x4 = self.sub_net0(x3)
363
+ x4 = self.sub_net1(x4)
364
+ x4 = self.sub_net2(x4)
365
+
366
+ x_out = x4 + x3
367
+ x_out = self.sub_net_channel(x_out)
368
+
369
+ return x_out
370
+
371
+
372
+
373
+ def aux_loss(self):
374
+ """Return the aggregated loss over the auxiliary entropy bottleneck
375
+ module(s).
376
+ """
377
+ aux_loss = sum(
378
+ m.loss() for m in self.modules() if isinstance(m, EntropyBottleneck)
379
+ )
380
+ return aux_loss
381
+
382
+ def _init_weights(self, m):
383
+ if isinstance(m, nn.Linear):
384
+ trunc_normal_(m.weight, std=.02)
385
+ if isinstance(m, nn.Linear) and m.bias is not None:
386
+ nn.init.constant_(m.bias, 0)
387
+ elif isinstance(m, nn.LayerNorm):
388
+ nn.init.constant_(m.bias, 0)
389
+ nn.init.constant_(m.weight, 1.0)
390
+
391
+
392
+ @torch.jit.ignore
393
+ def no_weight_decay_keywords(self):
394
+ return {'relative_position_bias_table'}
395
+
396
+ def forward(self, x, similarity):
397
+
398
+ x_size = (x.shape[2], x.shape[3])
399
+
400
+ h, w = x.size(2), x.size(3)
401
+
402
+ similarity_loss = torch.where(similarity > 0.85, torch.ones_like(similarity), torch.full_like(similarity, 0.01))
+ similarity_imp = torch.where(similarity > 0.85, torch.ones_like(similarity), torch.full_like(similarity, 0.01))
404
+
405
+ similarity_up = F.interpolate(similarity_loss, scale_factor=2, mode='bilinear')
406
+ similarity_up_repeated = similarity_up.repeat(1, 3, 1, 1)
407
+
408
+ similarities_channel = self.simi_net(similarity_imp)
409
+ similarities_sigmoid = torch.sigmoid(similarities_channel)
410
+
411
+
412
+ y_codec = self.g_a(x, x_size) # y
413
+ y_codec_a6 = self.g_a6(y_codec)
414
+
415
+
416
+ y_import = self.sub_impor_net(y_codec)
417
+ y_tanh = self.tanh(y_import)
418
+ y_soft = self.softsign(y_tanh)
419
+
420
+
421
+
422
+ y_imp = y_soft + similarities_sigmoid
423
+ y = y_codec_a6 * y_imp
424
+
425
+
426
+ z = self.h_a(y, x_size)
427
+ z_hat, z_likelihoods = self.entropy_bottleneck(z)
428
+ params = self.h_s(z_hat, x_size)
429
+
430
+ y_hat = self.gaussian_conditional.quantize(
431
+ y, "noise" if self.training else "dequantize"
432
+ )
433
+ ctx_params = self.context_prediction(y_hat)
434
+ gaussian_params = self.entropy_parameters(
435
+ torch.cat((params, ctx_params), dim=1)
436
+ )
437
+ scales_hat, means_hat = gaussian_params.chunk(2, 1)
438
+ _, y_likelihoods = self.gaussian_conditional(y, scales_hat, means=means_hat)
439
+ x_hat = self.g_s(y_hat, x_size)
440
+
441
+ return {
442
+ "y_hat": y_hat,
443
+ "y": y,
444
+ "similarity":similarity_up_repeated,
445
+ "x_hat": x_hat,
446
+ "likelihoods": {"y": y_likelihoods, "z": z_likelihoods},
447
+ }
448
+
449
+
450
+ def update(self, scale_table=None, force=False):
451
+ """Updates the entropy bottleneck(s) CDF values.
452
+
453
+ Needs to be called once after training to be able to later perform the
454
+ evaluation with an actual entropy coder.
455
+
456
+ Args:
457
+ scale_table (bool): (default: None)
458
+ force (bool): overwrite previous values (default: False)
459
+
460
+ Returns:
461
+ updated (bool): True if one of the EntropyBottlenecks was updated.
462
+
463
+ """
464
+ if scale_table is None:
465
+ scale_table = get_scale_table()
466
+ self.gaussian_conditional.update_scale_table(scale_table, force=force)
467
+
468
+ updated = False
469
+ for m in self.children():
470
+ if not isinstance(m, EntropyBottleneck):
471
+ continue
472
+ rv = m.update(force=force)
473
+ updated |= rv
474
+ return updated
475
+
476
+ def load_state_dict(self, state_dict, strict=True):
477
+ # Dynamically update the entropy bottleneck buffers related to the CDFs
478
+ update_registered_buffers(
479
+ self.entropy_bottleneck,
480
+ "entropy_bottleneck",
481
+ ["_quantized_cdf", "_offset", "_cdf_length"],
482
+ state_dict,
483
+ )
484
+ update_registered_buffers(
485
+ self.gaussian_conditional,
486
+ "gaussian_conditional",
487
+ ["_quantized_cdf", "_offset", "_cdf_length", "scale_table"],
488
+ state_dict,
489
+ )
490
+ super().load_state_dict(state_dict, strict=strict)
491
+
492
+ @classmethod
493
+ def from_state_dict(cls, state_dict):
494
+ """Return a new model instance from `state_dict`."""
495
+ N = state_dict["g_a0.weight"].size(0)
496
+ M = state_dict["g_a6.weight"].size(0)
497
+ net = cls(N, M)
498
+ net.load_state_dict(state_dict)
499
+ return net
500
+
501
+ # def compress(self, x,similarity):
502
+ def compress(self, x):
503
+ x = x.cuda()
504
+ # similarity = similarity.to(device)
505
+ x_size = (x.shape[2], x.shape[3])
506
+
507
+ # start_1 = time.time()
508
+ #
509
+ # img_feat = self.net_lseg.forward(x)
510
+ # img_feat_norm = torch.nn.functional.normalize(img_feat, dim=1)
511
+ # #
512
+ # prompt = clip.tokenize(similarity).cuda()
513
+ # text_feat = self.net_lseg.clip_pretrained.encode_text(prompt) # 1, 512
514
+ # text_feat_norm = torch.nn.functional.normalize(text_feat, dim=1)
515
+ # #
516
+ # similarity = self.cosine_similarity(
517
+ # img_feat_norm, text_feat_norm.unsqueeze(-1).unsqueeze(-1)
518
+ # )
519
+ # similarity = similarity.unsqueeze(0)
520
+ #
521
+ # torch.cuda.synchronize()
522
+ #
523
+ # inf_time = time.time() - start_1
524
+ #
525
+ # print(inf_time)
526
+
527
+ # ##### here
528
+
529
+ start = time.time()
530
+ # similarity_down_1 = torch.where(similarity > 0.9, torch.tensor(1.0), torch.tensor(1.0))
531
+ # similarities_repeated = self.simi_net(similarity_down_1)
532
+ # similarities_repeated = torch.sigmoid(similarities_repeated)
533
+
534
+
535
+
536
+
537
+
538
+ y_codec = self.g_a(x, x_size) # y
539
+
540
+ # y_import = self.sub_impor_net(y_codec)
541
+ # y_tanh = self.tanh(y_import)
542
+ #
543
+ # y_soft = self.softsign(y_tanh)
544
+
545
+
546
+ y_codec_a6 = self.g_a6(y_codec)
547
+
548
+
549
+ # y_imp = y_soft + similarities_repeated # 相似度* important map
550
+ # y = y_codec_a6 * y_imp
551
+ y = y_codec_a6
552
+ # y = y_imp * y_codec_a6
553
+ # y = self.sub_net_channel(y)
554
+
555
+ # y = y_codec_a6 * similarities_repeated
556
+
557
+ z = self.h_a(y)
558
+
559
+ z_strings = self.entropy_bottleneck.compress(z)
560
+ z_hat = self.entropy_bottleneck.decompress(z_strings, z.size()[-2:])
561
+
562
+ params = self.h_s(z_hat)
563
+
564
+ s = 4 # scaling factor between z and y
565
+ kernel_size = 5 # context prediction kernel size
566
+ padding = (kernel_size - 1) // 2
567
+
568
+ y_height = z_hat.size(2) * s
569
+ y_width = z_hat.size(3) * s
570
+
571
+ y_hat = F.pad(y, (padding, padding, padding, padding))
572
+
573
+ # pylint: disable=protected-access
574
+ cdf = self.gaussian_conditional._quantized_cdf.tolist()
575
+ cdf_lengths = self.gaussian_conditional._cdf_length.reshape(-1).int().tolist()
576
+ offsets = self.gaussian_conditional._offset.reshape(-1).int().tolist()
577
+ # pylint: enable=protected-access
578
+ # print(cdf, cdf_lengths, offsets)
579
+ y_strings = []
580
+ for i in range(y.size(0)):
581
+ encoder = BufferedRansEncoder()
582
+ # Warning, this is slow...
583
+ # TODO: profile the calls to the bindings...
584
+ symbols_list = []
585
+ indexes_list = []
586
+ y_q_ = torch.zeros_like(y)
587
+ indexes_ = torch.zeros_like(y)
588
+ for h in range(y_height):
589
+ for w in range(y_width):
590
+ y_crop = y_hat[
591
+ i: i + 1, :, h: h + kernel_size, w: w + kernel_size
592
+ ]
593
+ ctx_p = self.context_prediction(y_crop)
594
+ # 1x1 conv for the entropy parameters prediction network, so
595
+ # we only keep the elements in the "center"
596
+ p = params[i: i + 1, :, h: h + 1, w: w + 1]
597
+ gaussian_params = self.entropy_parameters(
598
+ torch.cat((p, ctx_p[i: i + 1, :, 2: 3, 2: 3]), dim=1)
599
+ )
600
+ scales_hat, means_hat = gaussian_params.chunk(2, 1)
601
+
602
+ indexes = self.gaussian_conditional.build_indexes(scales_hat)
603
+ y_q = torch.round(y_crop - means_hat)
604
+ y_hat[i, :, h + padding, w + padding] = (y_q + means_hat)[
605
+ i, :, padding, padding
606
+ ]
607
+ y_q_[i,:, h, w] = y_q[i, :, padding, padding]
608
+ indexes_[i,:, h, w] = indexes[i, :,0,0]
609
+
610
+ flag = np.array(np.zeros(y_q_.shape[1]))
611
+ for idx in range(y_q_.shape[1]):
612
+ if torch.sum(torch.abs(y_q_[:, idx, :, :])) > 0: # 全部大于0就设置标志位是1
613
+ flag[idx] = 1
614
+ y_q_ = y_q_[:,np.nonzero(flag),...].squeeze()
615
+ indexes_ = indexes_[:,np.nonzero(flag),...].squeeze()
616
+ for h in range(y_height):
617
+ for w in range(y_width):
618
+ # encoder.encode_with_indexes(
619
+ # y_q_[:,np.nonzero(flag),h,w].squeeze().int().tolist(),
620
+ # indexes_[:,np.nonzero(flag),h,w].squeeze().int().tolist(), cdf, cdf_lengths, offsets
621
+ # )
622
+ symbols_list.extend(y_q_[:,h,w].int().tolist())
623
+ indexes_list.extend(indexes_[:,h,w].squeeze().int().tolist())
624
+ encoder.encode_with_indexes(
625
+ symbols_list, indexes_list, cdf, cdf_lengths, offsets
626
+ )
627
+ string = encoder.flush()
628
+ y_strings.append(string)
629
+ print(flag.sum())
630
+
631
+ torch.cuda.synchronize() # 确保 model2 真正跑完
632
+ t2 = time.time() - start
633
+ # print(t2)
634
+
635
+ return {"strings": [y_strings, z_strings], "shape": z.size()[-2:],"flag":flag}
636
+
637
+ # return {"test":similarity}
638
+
639
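A pure-NumPy toy sketch of the channel-pruning trick above (hypothetical shapes): only channels with at least one nonzero quantized symbol are entropy-coded, and `flag` tells the decoder where to scatter them back:

```python
import numpy as np

# Hypothetical latent: 4 channels, 2x2 spatial; channels 1 and 3 are all-zero.
y_q = np.zeros((1, 4, 2, 2))
y_q[0, 0] = 1
y_q[0, 2] = -2

# Flag channels that carry at least one nonzero symbol.
flag = (np.abs(y_q).sum(axis=(0, 2, 3)) > 0).astype(int)  # [1, 0, 1, 0]

# The encoder transmits only the flagged channels...
kept = y_q[:, np.nonzero(flag)[0], ...]                   # shape (1, 2, 2, 2)

# ...and the decoder scatters them back into a zero tensor.
restored = np.zeros_like(y_q)
restored[:, np.nonzero(flag)[0], ...] = kept
assert (restored == y_q).all()
```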
+ def compress_1(self, x,similarity):
640
+ # def compress_1(self, x):
641
+ x = x.cuda()
642
+ x_size = (x.shape[2], x.shape[3])
643
+
644
+ similarity = similarity.cuda()
645
+ # # #
646
+ similarity_down_1 = torch.where(similarity == 0, torch.tensor(1e-4), torch.tensor(1.0))
647
+ #
648
+ #
649
+
650
+
651
+ # similarity_down_1 = torch.where(similarity > 0.9, torch.tensor(1.0), torch.tensor(1e-4))
652
+ #
653
+ similarity_down_1 = F.interpolate(similarity_down_1, scale_factor=0.5, mode='bilinear')
654
+ similarities_repeated = self.simi_net(similarity_down_1)
655
+ similarities_repeated = torch.sigmoid(similarities_repeated)
656
+
657
+
658
+ y_codec = self.g_a(x, x_size) # y
659
+
660
+ y_import = self.sub_impor_net(y_codec)
661
+ y_tanh = self.tanh(y_import)
662
+ y_soft = self.softsign(y_tanh) # important2
663
+ # y_soft = self.sigmoid(y_soft)
664
+
665
+ y_codec_a6 = self.g_a6(y_codec)
666
+ # y_codec_a6 = self.attetionmap(y_codec_a6)
667
+
668
+
669
+ y_imp = similarities_repeated + y_soft # similarity map + importance map
670
+
671
+
672
+ y = y_codec_a6 * y_imp
673
+ #
674
+ # y= y_codec_a6 * y_tanh
675
+
676
+ # cmap = ListedColormap(['yellow'])
677
+ #
678
+ # similarity_image = torch.where(similarity > 0.9, torch.tensor(1.0), torch.tensor(0.1))
679
+ # similarity_image = F.interpolate(similarity_image, scale_factor=2, mode='bilinear')
680
+ # abs = torch.abs(similarity_image)
681
+ # mean = torch.mean(abs, axis=1, keepdims=True)
682
+ # viz = mean.detach().cpu().numpy()
683
+ # viz = viz[0]
684
+ # viz = viz.squeeze()
685
+ # plt.imshow(viz)
686
+ # # # save the visualization image
687
+ # plt.imsave('/mnt/disk10T/xfx/CLIP/bird.png', viz)
688
+
689
+ z = self.h_a(y)
690
+
691
+ z_strings = self.entropy_bottleneck.compress(z)
692
+ z_hat = self.entropy_bottleneck.decompress(z_strings, z.size()[-2:])
693
+
694
+ params = self.h_s(z_hat)
695
+
696
+ s = 4 # scaling factor between z and y
697
+ kernel_size = 5 # context prediction kernel size
698
+ padding = (kernel_size - 1) // 2
699
+
700
+ y_height = z_hat.size(2) * s
701
+ y_width = z_hat.size(3) * s
702
+
703
+ y_hat = F.pad(y, (padding, padding, padding, padding))
704
+
705
+ # pylint: disable=protected-access
706
+ cdf = self.gaussian_conditional._quantized_cdf.tolist()
707
+ cdf_lengths = self.gaussian_conditional._cdf_length.reshape(-1).int().tolist()
708
+ offsets = self.gaussian_conditional._offset.reshape(-1).int().tolist()
709
+ # pylint: enable=protected-access
710
+ # print(cdf, cdf_lengths, offsets)
711
+
712
+ y_strings = []
713
+ for i in range(y.size(0)):
714
+ encoder = BufferedRansEncoder()
715
+ # Warning, this is slow...
716
+ # TODO: profile the calls to the bindings...
717
+ symbols_list = []
718
+ indexes_list = []
719
+ for h in range(y_height):
720
+ for w in range(y_width):
721
+ y_crop = y_hat[
722
+ i: i + 1, :, h: h + kernel_size, w: w + kernel_size
723
+ ]
724
+ ctx_p = self.context_prediction(y_crop)
725
+ # 1x1 conv for the entropy parameters prediction network, so
726
+ # we only keep the elements in the "center"
727
+ p = params[i: i + 1, :, h: h + 1, w: w + 1]
728
+ gaussian_params = self.entropy_parameters(
729
+ torch.cat((p, ctx_p[i: i + 1, :, 2: 3, 2: 3]), dim=1)
730
+ )
731
+ scales_hat, means_hat = gaussian_params.chunk(2, 1)
732
+
733
+ indexes = self.gaussian_conditional.build_indexes(scales_hat)
734
+ y_q = torch.round(y_crop - means_hat)
735
+ y_hat[i, :, h + padding, w + padding] = (y_q + means_hat)[
736
+ i, :, padding, padding
737
+ ]
738
+
739
+ symbols_list.extend(y_q[i, :, padding, padding].int().tolist())
740
+ indexes_list.extend(indexes[i, :].squeeze().int().tolist())
741
+
742
+ encoder.encode_with_indexes(
743
+ symbols_list, indexes_list, cdf, cdf_lengths, offsets
744
+ )
745
+
746
+ string = encoder.flush()
747
+ y_strings.append(string)
748
+
749
+ return {"strings": [y_strings, z_strings], "shape": z.size()[-2:]}
750
+
751
+ def compress_2(self, x,similarity):
752
+ # def compress_1(self, x):
753
+ x = x.cuda()
754
+ x_size = (x.shape[2], x.shape[3])
755
+
756
+ ##### ROI similarity weighting starts here
757
+ similarity_down_1 = torch.where(similarity > 0.9, torch.tensor(1.0), torch.tensor(0.1))
758
+ similarities_repeated = self.simi_net(similarity_down_1)
759
+ similarities_repeated = torch.sigmoid(similarities_repeated)
760
+
761
+ y_codec = self.g_a(x, x_size) # y
762
+
763
+ y_import = self.sub_impor_net(y_codec)
764
+ y_tanh = self.tanh(y_import)
765
+
766
+ y_codec_a6 = self.g_a6(y_codec)
767
+
768
+
769
+
770
+ y_imp = similarities_repeated + y_tanh # similarity map + importance map
771
+
772
+ y = y_codec_a6 * y_imp
773
+
774
+ z = self.h_a(y)
775
+
776
+ z_strings = self.entropy_bottleneck.compress(z)
777
+ z_hat = self.entropy_bottleneck.decompress(z_strings, z.size()[-2:])
778
+
779
+ params = self.h_s(z_hat)
780
+
781
+ s = 4 # scaling factor between z and y
782
+ kernel_size = 5 # context prediction kernel size
783
+ padding = (kernel_size - 1) // 2
784
+
785
+ y_height = z_hat.size(2) * s
786
+ y_width = z_hat.size(3) * s
787
+
788
+ y_hat = F.pad(y, (padding, padding, padding, padding))
789
+
790
+ # pylint: disable=protected-access
791
+ cdf = self.gaussian_conditional._quantized_cdf.tolist()
792
+ cdf_lengths = self.gaussian_conditional._cdf_length.reshape(-1).int().tolist()
793
+ offsets = self.gaussian_conditional._offset.reshape(-1).int().tolist()
794
+ # pylint: enable=protected-access
795
+ # print(cdf, cdf_lengths, offsets)
796
+
797
+ y_strings = []
798
+ for i in range(y.size(0)):
799
+ encoder = BufferedRansEncoder()
800
+ # Warning, this is slow...
801
+ # TODO: profile the calls to the bindings...
802
+ symbols_list = []
803
+ indexes_list = []
804
+ for h in range(y_height):
805
+ for w in range(y_width):
806
+ y_crop = y_hat[
807
+ i: i + 1, :, h: h + kernel_size, w: w + kernel_size
808
+ ]
809
+ ctx_p = self.context_prediction(y_crop)
810
+ # 1x1 conv for the entropy parameters prediction network, so
811
+ # we only keep the elements in the "center"
812
+ p = params[i: i + 1, :, h: h + 1, w: w + 1]
813
+ gaussian_params = self.entropy_parameters(
814
+ torch.cat((p, ctx_p[i: i + 1, :, 2: 3, 2: 3]), dim=1)
815
+ )
816
+ scales_hat, means_hat = gaussian_params.chunk(2, 1)
817
+
818
+ indexes = self.gaussian_conditional.build_indexes(scales_hat)
819
+ y_q = torch.round(y_crop - means_hat)
820
+ y_hat[i, :, h + padding, w + padding] = (y_q + means_hat)[
821
+ i, :, padding, padding
822
+ ]
823
+
824
+ symbols_list.extend(y_q[i, :, padding, padding].int().tolist())
825
+ indexes_list.extend(indexes[i, :].squeeze().int().tolist())
826
+
827
+ encoder.encode_with_indexes(
828
+ symbols_list, indexes_list, cdf, cdf_lengths, offsets
829
+ )
830
+
831
+ string = encoder.flush()
832
+ y_strings.append(string)
833
+
834
+ return {"strings": [y_strings, z_strings], "shape": z.size()[-2:]}
835
+
836
+ def decompress(self, strings, shape, flag):
837
+ # def decompress(self, strings, shape):
838
+ flag = np.nonzero(flag)
839
+ assert isinstance(strings, list) and len(strings) == 2
840
+ # FIXME: we don't respect the default entropy coder and directly call the
841
+ # range ANS decoder
842
+
843
+ z_hat = self.entropy_bottleneck.decompress(strings[1], shape)
844
+ params = self.h_s(z_hat)
845
+
846
+ s = 4 # scaling factor between z and y
847
+ kernel_size = 5 # context prediction kernel size
848
+ padding = (kernel_size - 1) // 2
849
+
850
+ y_height = z_hat.size(2) * s
851
+ y_width = z_hat.size(3) * s
852
+
853
+ # initialize y_hat to zeros, and pad it so we can directly work with
854
+ # sub-tensors of size (N, C, kernel size, kernel_size)
855
+ y_hat = torch.zeros(
856
+ (z_hat.size(0), 192, y_height + 2 * padding, y_width + 2 * padding),
857
+ device=z_hat.device,
858
+ )
859
+ decoder = RansDecoder()
860
+
861
+ # pylint: disable=protected-access
862
+ cdf = self.gaussian_conditional._quantized_cdf.tolist()
863
+ cdf_lengths = self.gaussian_conditional._cdf_length.reshape(-1).int().tolist()
864
+ offsets = self.gaussian_conditional._offset.reshape(-1).int().tolist()
865
+
866
+ # Warning: this is slow due to the auto-regressive nature of the
867
+ # decoding... See more recent publication where they use an
868
+ # auto-regressive module on chunks of channels for faster decoding...
869
+ for i, y_string in enumerate(strings[0]):
870
+ decoder.set_stream(y_string)
871
+
872
+ for h in range(y_height):
873
+ for w in range(y_width):
874
+ # only perform the 5x5 convolution on a cropped tensor
875
+ # centered in (h, w)
876
+ y_crop = y_hat[
877
+ i: i + 1, :, h: h + kernel_size, w: w + kernel_size
878
+ ]
879
+ ctx_p = self.context_prediction(y_crop)
880
+ # 1x1 conv for the entropy parameters prediction network, so
881
+ # we only keep the elements in the "center"
882
+ p = params[i: i + 1, :, h: h + 1, w: w + 1]
883
+ gaussian_params = self.entropy_parameters(
884
+ torch.cat((p, ctx_p[i: i + 1, :, 2: 3, 2: 3]), dim=1)
885
+ )
886
+ scales_hat, means_hat = gaussian_params.chunk(2, 1)
887
+
888
+ indexes = self.gaussian_conditional.build_indexes(scales_hat)
889
+ rv = decoder.decode_stream(
890
+ indexes[i, flag].squeeze().int().tolist(),
891
+ # indexes[i, :].squeeze().int().tolist(),
892
+ cdf,
893
+ cdf_lengths,
894
+ offsets,
895
+ )
896
+ # rv = torch.Tensor(rv).reshape(1, -1, 1, 1)
+ # Scatter the decoded (pruned) channels back into a full 192-channel tensor,
+ # keeping everything on the same device as the hyperprior output so the
+ # dequantization below does not mix CPU and CUDA tensors.
+ rv = torch.Tensor(rv).reshape(1, -1, 1, 1).to(z_hat.device)
+ tmp = torch.zeros((1, 192, 1, 1), device=z_hat.device)
+ tmp[:, flag, ...] = rv
+ rv = self.gaussian_conditional._dequantize(tmp, means_hat)
+ # rv = self.gaussian_conditional._dequantize(rv, means_hat)
902
+
903
+ y_hat[
904
+ i,
905
+ :,
906
+ h + padding: h + padding + 1,
907
+ w + padding: w + padding + 1,
908
+ ] = rv
909
+
910
+
911
+ y_hat = y_hat[:, :, padding:-padding, padding:-padding]
912
+ # pylint: enable=protected-access
913
+
914
+ x_hat = self.g_s(y_hat).clamp_(0, 1)
915
+ return {"x_hat": x_hat,}
916
+
917
+ def decompress_1(self, strings, shape):
918
+ assert isinstance(strings, list) and len(strings) == 2
919
+ # FIXME: we don't respect the default entropy coder and directly call the
920
+ # range ANS decoder
921
+
922
+ z_hat = self.entropy_bottleneck.decompress(strings[1], shape)
923
+ params = self.h_s(z_hat)
924
+
925
+ s = 4 # scaling factor between z and y
926
+ kernel_size = 5 # context prediction kernel size
927
+ padding = (kernel_size - 1) // 2
928
+
929
+ y_height = z_hat.size(2) * s
930
+ y_width = z_hat.size(3) * s
931
+
932
+ # initialize y_hat to zeros, and pad it so we can directly work with
933
+ # sub-tensors of size (N, C, kernel size, kernel_size)
934
+ y_hat = torch.zeros(
935
+ (z_hat.size(0), 192, y_height + 2 * padding, y_width + 2 * padding),
936
+ device=z_hat.device,
937
+ )
938
+ decoder = RansDecoder()
939
+
940
+ # pylint: disable=protected-access
941
+ cdf = self.gaussian_conditional._quantized_cdf.tolist()
942
+ cdf_lengths = self.gaussian_conditional._cdf_length.reshape(-1).int().tolist()
943
+ offsets = self.gaussian_conditional._offset.reshape(-1).int().tolist()
944
+
945
+ # Warning: this is slow due to the auto-regressive nature of the
946
+ # decoding... See more recent publication where they use an
947
+ # auto-regressive module on chunks of channels for faster decoding...
948
+ for i, y_string in enumerate(strings[0]):
949
+ decoder.set_stream(y_string)
950
+
951
+ for h in range(y_height):
952
+ for w in range(y_width):
953
+ # only perform the 5x5 convolution on a cropped tensor
954
+ # centered in (h, w)
955
+ y_crop = y_hat[
956
+ i: i + 1, :, h: h + kernel_size, w: w + kernel_size
957
+ ]
958
+ ctx_p = self.context_prediction(y_crop)
959
+ # 1x1 conv for the entropy parameters prediction network, so
960
+ # we only keep the elements in the "center"
961
+ p = params[i: i + 1, :, h: h + 1, w: w + 1]
962
+ gaussian_params = self.entropy_parameters(
963
+ torch.cat((p, ctx_p[i: i + 1, :, 2: 3, 2: 3]), dim=1)
964
+ )
965
+ scales_hat, means_hat = gaussian_params.chunk(2, 1)
966
+
967
+ indexes = self.gaussian_conditional.build_indexes(scales_hat)
968
+
969
+ rv = decoder.decode_stream(
970
+ indexes[i, :].squeeze().int().tolist(),
971
+ cdf,
972
+ cdf_lengths,
973
+ offsets,
974
+ )
975
+ rv = torch.Tensor(rv).reshape(1, -1, 1, 1)
976
+
977
+ rv = self.gaussian_conditional._dequantize(rv, means_hat)
978
+
979
+ y_hat[
980
+ i,
981
+ :,
982
+ h + padding: h + padding + 1,
983
+ w + padding: w + padding + 1,
984
+ ] = rv
985
+ y_hat = y_hat[:, :, padding:-padding, padding:-padding]
986
+ # pylint: enable=protected-access
987
+
988
+ x_hat = self.g_s(y_hat).clamp_(0, 1)
989
+ return {"x_hat": x_hat}
vae/transformer_layers.py ADDED
@@ -0,0 +1,250 @@
+ import torch.nn as nn
+ import torch
+ import numpy as np
+
+
+ def create_look_ahead_mask(size):
+ """Creates a lookahead mask for autoregressive masking."""
+ mask = np.triu(np.ones((size, size), np.float32), 1)
+ return torch.Tensor(mask)
+
+
+ class StochasticDepth(nn.Module):
+ """Creates a stochastic depth layer."""
+
+ def __init__(self, stochastic_depth_drop_rate):
+ """Initializes a stochastic depth layer.
+
+ Args:
+ stochastic_depth_drop_rate: A `float` drop rate.
+
+ Returns:
+ An output tensor with the same shape as the input.
+ """
+ super().__init__()
+ self._drop_rate = stochastic_depth_drop_rate
+
+ def forward(self, inputs):
+ if not self.training or self._drop_rate == 0.:
+ return inputs
+ keep_prob = 1.0 - self._drop_rate
+ batch_size = inputs.shape[0]
+ # Per-sample Bernoulli keep mask, broadcast over all non-batch dims.
+ # (The TensorFlow-style `torch.rand_like([...])` / `inputs.shape.rank`
+ # calls did not exist in PyTorch and would have raised at runtime.)
+ shape = [batch_size] + [1] * (inputs.dim() - 1)
+ random_tensor = keep_prob + torch.rand(
+ shape, dtype=inputs.dtype, device=inputs.device
+ )
+ binary_tensor = torch.floor(random_tensor)
+ output = torch.div(inputs, keep_prob) * binary_tensor
+ return output
+
+
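For intuition, a quick sketch of the mask produced for `size=4`; a 1 marks a future position to be blocked in the masked self-attention:

```python
import numpy as np
import torch

mask = torch.Tensor(np.triu(np.ones((4, 4), np.float32), 1))
print(mask)
# tensor([[0., 1., 1., 1.],
#         [0., 0., 1., 1.],
#         [0., 0., 0., 1.],
#         [0., 0., 0., 0.]])
```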
+ class MLP(nn.Module):
42
+ """MLP head for transformer."""
43
+
44
+ def __init__(self, n_channel,expansion_rate, act, dropout_rate):
45
+ super().__init__()
46
+ self._expansion_rate = expansion_rate
47
+ self._act = act
48
+ self._dropout_rate = dropout_rate
49
+ self._fc1 = nn.Linear(
50
+ n_channel,
51
+ self._expansion_rate * n_channel)
52
+ self.act1 = self._act()
53
+ self._fc2 = nn.Linear(
54
+ self._expansion_rate * n_channel,
55
+ n_channel)
56
+ self.act2 = self._act()
57
+ self._drop = nn.Dropout(self._dropout_rate)
58
+
59
+ def forward(self, features):
60
+ """Forward pass."""
61
+ features = self.act1(self._fc1(features))
62
+ features = self._drop(features)
63
+ features = self.act2(self._fc2(features))
64
+ features = self._drop(features)
65
+ return features
66
+
67
+
68
+ class TransformerBlock(nn.Module):
69
+ """Transformer block that is similar to the Swin encoder block.
70
+
71
+ However, an important difference is that we _do not_ shift the windows
72
+ for the second Attention layer. Instead, we _feed the encoder outputs_
73
+ as Keys and Values. This allows for autoregressive applications.
74
+
75
+ If `style == "encoder"`, no autoregression is happening.
76
+
77
+ Also, this class operates on windowed tensor, see `call` docstring.
78
+ """
79
+
80
+ def __init__(
81
+ self,
82
+ *,
83
+ d_model,
84
+ seq_len,
85
+ num_head = 4,
86
+ mlp_expansion = 4,
87
+ mlp_act = nn.GELU,
88
+ drop_out_rate = 0.1,
89
+ drop_path_rate = 0.1,
90
+ style = "decoder",
91
+ ):
92
+ super().__init__()
93
+ self._style = style
94
+ if style == "decoder":
95
+ # Register as a buffer so moving the module moves the mask too.
96
+ self.register_buffer(
97
+ "look_ahead_mask",
98
+ create_look_ahead_mask(seq_len),
99
+ persistent=False,
100
+ )
101
+ elif style == "encoder":
102
+ self.look_ahead_mask = None
103
+ else:
104
+ raise ValueError(f"Invalid style: {style}")
105
+
106
+ # self._norm1a = nn.LayerNorm(
107
+ # axis=-1, epsilon=1e-5, name="mhsa_normalization1")
108
+ self._norm1a = nn.LayerNorm(d_model)
109
+ # self._norm1b = tf.keras.layers.LayerNormalization(
110
+ # axis=-1, epsilon=1e-5, name="ffn_normalization1")
111
+ self._norm1b = nn.LayerNorm(d_model,eps=1e-5)
112
+
113
+ # self._norm2a = tf.keras.layers.LayerNormalization(
114
+ # axis=-1, epsilon=1e-5, name="mhsa_normalization2")
115
+ self._norm2a = nn.LayerNorm(d_model, eps=1e-5)
116
+ # self._norm2b = tf.keras.layers.LayerNormalization(
117
+ # axis=-1, epsilon=1e-5, name="ffn_normalization2")
118
+ self._norm2b = nn.LayerNorm(d_model, eps=1e-5)
119
+ self._attn1 = nn.MultiheadAttention(
120
+ d_model,
121
+ num_head,
122
+ dropout=drop_out_rate
123
+ )
124
+
125
+ self._attn2 = nn.MultiheadAttention(
126
+ d_model,
127
+ num_head,
128
+ dropout=drop_out_rate
129
+ )
130
+
131
+ self._mlp1 = MLP(
132
+ d_model,
133
+ expansion_rate=mlp_expansion,
134
+ act=mlp_act,
135
+ dropout_rate=drop_out_rate)
136
+ self._mlp2 = MLP(
137
+ d_model,
138
+ expansion_rate=mlp_expansion,
139
+ act=mlp_act,
140
+ dropout_rate=drop_out_rate)
141
+
142
+ # No weights, so we share for both blocks.
143
+ self._drop_path = StochasticDepth(drop_path_rate)
144
+
145
+ def forward(self, features, enc_output):
146
+ if enc_output is None:
147
+ if self._style == "decoder":
148
+ raise ValueError("Need `enc_output` when running decoder.")
149
+ else:
150
+ assert enc_output.shape[0] == features.shape[0] and enc_output.shape[2] == features.shape[2]
151
+
152
+ # First Block ---
153
+ shortcut = features
154
+ features = self._norm1a(features)
155
+ # Masked self-attention.
156
+ features = features.permute(1, 0, 2) # NLD -> LND
157
+ features, _ = self._attn1(
158
+ value=features,
159
+ key=features,
160
+ query=features,
161
+ attn_mask=self.look_ahead_mask)
162
+ features = features.permute(1, 0, 2) # LND -> NLD
163
+
164
+ assert features.shape == shortcut.shape
165
+ features = shortcut + self._drop_path(features)
166
+
167
+ features = features + self._drop_path(
168
+ self._mlp1(self._norm1b(features)))
169
+
170
+ # Second Block ---
171
+ shortcut = features
172
+ features = self._norm2a(features)
173
+ # Unmasked "lookup" into enc_output, no need for mask.
174
+
175
+ features = features.permute(1, 0, 2) # NLD -> LND
176
+ if enc_output is not None:
177
+ enc_output = enc_output.permute(1, 0, 2) # NLD -> LND
178
+ features, _ = self._attn2( # pytype: disable=wrong-arg-types # dynamic-method-lookup
179
+ value=enc_output if enc_output is not None else features,
180
+ key=enc_output if enc_output is not None else features,
181
+ query=features,
182
+ attn_mask=None)
183
+ features = features.permute(1, 0, 2) # LND -> NLD
184
+
185
+ features = shortcut + self._drop_path(features)
186
+ output = features + self._drop_path(
187
+ self._mlp2(self._norm2b(features)))
188
+
189
+ return output
190
+
191
+
192
+ class Transformer(nn.Module):
193
+ """A stack of transformer blocks, useable for encoding or decoding."""
194
+
195
+ def __init__(
196
+ self,
197
+ is_decoder,
198
+ num_layers = 4,
199
+ d_model = 192,
200
+ seq_len = 16,
201
+ num_head = 4,
202
+ mlp_expansion = 4,
203
+ drop_out = 0.1
204
+ ):
205
+ super().__init__()
206
+ self.is_decoder = is_decoder
207
+
208
+ # IMPORTANT: use ModuleList so parameters/buffers are registered and moved
209
+ # correctly with `.to(device)`.
210
+ self.layers = nn.ModuleList(
211
+ [
212
+ TransformerBlock(
213
+ d_model=d_model,
214
+ seq_len=seq_len,
215
+ num_head=num_head,
216
+ mlp_expansion=mlp_expansion,
217
+ drop_out_rate=drop_out,
218
+ drop_path_rate=drop_out,
219
+ style="decoder" if is_decoder else "encoder",
220
+ )
221
+ for _ in range(num_layers)
222
+ ]
223
+ )
224
+
225
+ def forward(
226
+ self, latent, enc_output
227
+ ):
228
+ """Forward pass.
229
+
230
+ For decoder, this predicts distribution of `latent` given `enc_output`.
231
+
232
+ We assume that `latent` has already been embedded in a d_model-dimensional
233
+ space.
234
+
235
+ Args:
236
+ latent: (B', seq_len, C) latent.
237
+ enc_output: (B', seq_len_enc, C) concatenated encoder output.
239
+
240
+ Returns:
241
+ Decoder output of shape (B', seq_len, C).
242
+ """
243
+ assert len(latent.shape) == 3, latent.shape
244
+ if enc_output is not None:
245
+ assert latent.shape[-1] == enc_output.shape[-1], (latent.shape,
246
+ enc_output.shape)
247
+ for layer in self.layers:
248
+ latent = layer(features=latent, enc_output=enc_output)
249
+ return latent
250
+
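A minimal usage sketch for the stack; the shapes below echo the constructor defaults and are illustrative, not prescriptive:

```python
import torch
from vae.transformer_layers import Transformer

enc = Transformer(is_decoder=False, num_layers=2, d_model=192, seq_len=16)
dec = Transformer(is_decoder=True, num_layers=2, d_model=192, seq_len=16)

latent = torch.rand(8, 16, 192)            # (B', seq_len, C)
enc_out = enc(latent, enc_output=None)     # encoder: no cross-attention input
dec_out = dec(latent, enc_output=enc_out)  # decoder: causal over latent
print(dec_out.shape)                       # torch.Size([8, 16, 192])
```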
vae/utils.py ADDED
@@ -0,0 +1,102 @@
+ """
+ Utility functions for image compression.
+ """
+
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ from PIL import Image
+ from typing import Tuple, Dict
+ from .roi_tic import ModifiedTIC
+
+
+ def compute_padding(in_h: int, in_w: int, min_div: int = 256) -> Tuple[Tuple[int, int, int, int], Tuple[int, int, int, int]]:
+ """
+ Compute padding to make dimensions divisible by min_div.
+
+ Args:
+ in_h: input height
+ in_w: input width
+ min_div: minimum divisor (default 256 for TIC)
+
+ Returns:
+ pad: (left, right, top, bottom) padding
+ unpad: negative padding for cropping back
+ """
+ out_h = (in_h + min_div - 1) // min_div * min_div
+ out_w = (in_w + min_div - 1) // min_div * min_div
+
+ left = (out_w - in_w) // 2
+ right = out_w - in_w - left
+ top = (out_h - in_h) // 2
+ bottom = out_h - in_h - top
+
+ pad = (left, right, top, bottom)
+ unpad = (-left, -right, -top, -bottom)
+
+ return pad, unpad
+
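A quick worked example of the padding math:

```python
# A 375x500 (W x H) input with min_div=256 is padded up to 512x512:
pad, unpad = compute_padding(in_h=500, in_w=375, min_div=256)
print(pad)    # (68, 69, 6, 6)    -> (left, right, top, bottom)
print(unpad)  # (-68, -69, -6, -6)
```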
+
+
+ def compress_image(
+ image: Image.Image,
+ mask: np.ndarray,
+ model: ModifiedTIC,
+ sigma: float = 0.3,
+ device: str = 'cuda'
+ ) -> Dict:
+ """
+ Compress image with ROI-based quality control.
+
+ Args:
+ image: PIL Image (RGB)
+ mask: Binary mask (H, W) with 1 for ROI, 0 for background
+ model: Loaded ModifiedTIC model
+ sigma: Background quality (0.01-1.0, lower = more compression)
+ device: 'cuda' or 'cpu'
+
+ Returns:
+ dict with:
+ - compressed: PIL Image of compressed result
+ - bpp: Bits per pixel
+ - original_size: Original image dimensions
+ - mask_used: The mask that was used
+ """
+ # Convert image to tensor
+ img_array = np.array(image).astype(np.float32) / 255.0
+ img_tensor = torch.from_numpy(img_array).permute(2, 0, 1).unsqueeze(0).to(device)
+
+ # Pad image
+ _, _, h, w = img_tensor.shape
+ pad, unpad = compute_padding(h, w, min_div=256)
+ img_padded = F.pad(img_tensor, pad, mode='constant', value=0)
+
+ # Prepare mask (cast to float32 in case the caller passed a float64 array)
+ mask_tensor = torch.from_numpy(mask).float().unsqueeze(0).unsqueeze(0).to(device)
+ mask_padded = F.pad(mask_tensor, pad, mode='constant', value=0)
+
+ # Compress
+ with torch.no_grad():
+ # NOTE: `ModifiedTIC.forward()` handles mask downsampling internally.
+ out = model(img_padded, mask_padded, sigma=sigma)
+
+ # Unpad result
+ x_hat = F.pad(out['x_hat'], unpad)
+
+ # Convert back to image
+ x_hat_np = x_hat.squeeze(0).permute(1, 2, 0).cpu().numpy()
+ x_hat_np = np.clip(x_hat_np * 255, 0, 255).astype(np.uint8)
+ compressed_img = Image.fromarray(x_hat_np)
+
+ # Calculate BPP from the likelihoods: bpp = -sum(log2 p) / num_pixels
+ num_pixels = h * w
+ likelihoods = out['likelihoods']
+ bpp_y = torch.log(likelihoods['y']).sum() / (-np.log(2) * num_pixels)
+ bpp_z = torch.log(likelihoods['z']).sum() / (-np.log(2) * num_pixels)
+ bpp = (bpp_y + bpp_z).item()
+
+ return {
+ 'compressed': compressed_img,
+ 'bpp': bpp,
+ 'original_size': (w, h),
+ 'mask_used': mask
+ }
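And a hedged end-to-end sketch; the image path is hypothetical, and `model` is assumed to be a `ModifiedTIC` restored from one of the bundled checkpoints:

```python
import numpy as np
from PIL import Image
from vae.utils import compress_image

image = Image.open("images/example/input.jpg").convert("RGB")  # hypothetical path
mask = np.zeros((image.height, image.width), dtype=np.float32)
mask[100:300, 150:400] = 1.0  # mark a rectangular ROI

result = compress_image(image, mask, model, sigma=0.1, device="cuda")
print(f"BPP: {result['bpp']:.3f}")
result["compressed"].save("compressed.png")
```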
vae/visualization.py ADDED
@@ -0,0 +1,94 @@
1
+ """
2
+ Visualization utilities for compression results.
3
+ """
4
+
5
+ import numpy as np
6
+ import cv2
7
+ from PIL import Image
8
+ from typing import Tuple
9
+ import matplotlib.pyplot as plt
10
+
11
+
12
+ def highlight_roi(
13
+ image: Image.Image,
14
+ mask: np.ndarray,
15
+ alpha: float = 0.3,
16
+ color: Tuple[int, int, int] = (0, 255, 0)
17
+ ) -> Image.Image:
18
+ """
19
+ Highlight ROI regions in image with colored overlay.
20
+
21
+ Args:
22
+ image: PIL Image
23
+ mask: Binary mask (H, W)
24
+ alpha: Overlay transparency (0-1)
25
+ color: RGB color tuple for ROI highlight
26
+
27
+ Returns:
28
+ Image with ROI highlighted
29
+ """
30
+ img_array = np.array(image)
31
+
32
+ # Create colored overlay
33
+ overlay = img_array.copy()
34
+ overlay[mask > 0.5] = color
35
+
36
+ # Blend
37
+ result = cv2.addWeighted(img_array, 1 - alpha, overlay, alpha, 0)
38
+
39
+ return Image.fromarray(result)
40
+
41
+
42
+ def create_comparison_grid(
43
+ original: Image.Image,
44
+ compressed: Image.Image,
45
+ mask: np.ndarray,
46
+ bpp: float,
47
+ sigma: float,
48
+ lambda_val: float,
49
+ highlight: bool = True
50
+ ) -> Image.Image:
51
+ """
52
+ Create side-by-side comparison of original and compressed images.
53
+
54
+ Args:
55
+ original: Original PIL Image
56
+ compressed: Compressed PIL Image
57
+ mask: Binary mask used
58
+ bpp: Bits per pixel
59
+ sigma: Sigma value used
60
+ lambda_val: Lambda value used
61
+ highlight: Whether to show ROI overlay
62
+
63
+ Returns:
64
+ Combined comparison image
65
+ """
66
+ fig, axes = plt.subplots(1, 3 if highlight else 2, figsize=(15 if highlight else 10, 5))
67
+
68
+ # Original
69
+ axes[0].imshow(original)
70
+ axes[0].set_title('Original', fontsize=14)
71
+ axes[0].axis('off')
72
+
73
+ # Compressed
74
+ axes[1].imshow(compressed)
75
+ axes[1].set_title(f'Compressed (σ={sigma:.2f}, λ={lambda_val}, BPP={bpp:.3f})', fontsize=14)
76
+ axes[1].axis('off')
77
+
78
+ # ROI overlay
79
+ if highlight:
80
+ highlighted = highlight_roi(original, mask, alpha=0.4, color=(0, 255, 0))
81
+ axes[2].imshow(highlighted)
82
+ axes[2].set_title('ROI Mask (green)', fontsize=14)
83
+ axes[2].axis('off')
84
+
85
+ plt.tight_layout()
86
+
87
+ # Convert to PIL Image
88
+ fig.canvas.draw()
89
+ img_array = np.frombuffer(fig.canvas.buffer_rgba(), dtype=np.uint8)
90
+ img_array = img_array.reshape(fig.canvas.get_width_height()[::-1] + (4,))
91
+ img_array = img_array[:, :, :3] # Remove alpha channel
92
+ plt.close(fig)
93
+
94
+ return Image.fromarray(img_array)
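A short sketch of how the two helpers compose; the inputs reuse the names from the `compress_image` sketch above:

```python
from vae.visualization import create_comparison_grid

grid = create_comparison_grid(
    original=image,
    compressed=result["compressed"],
    mask=mask,
    bpp=result["bpp"],
    sigma=0.1,
    lambda_val=0.013,
    highlight=True,
)
grid.save("comparison.png")
```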
video/__init__.py ADDED
@@ -0,0 +1,55 @@
1
+ """Video processing module for ROI-based video compression.
2
+
3
+ Provides:
4
+ - VideoProcessor: Frame extraction, motion analysis, adaptive compression
5
+ - MotionAnalyzer: Optical flow and scene complexity estimation
6
+ - ChunkCompressor: Chunk-by-chunk compression with bandwidth targeting
7
+ - Temporal smoothing utilities for mask stabilization
8
+ - Mask caching for video segmentation reuse
9
+ """
10
+
11
+ from .video_processor import (
12
+ VideoProcessor,
13
+ VideoFrame,
14
+ CompressedChunk,
15
+ CompressionSettings,
16
+ ChunkPlan,
17
+ )
18
+ from .motion_analyzer import MotionAnalyzer
19
+ from .chunk_compressor import (
20
+ ChunkCompressor,
21
+ BandwidthController,
22
+ smooth_masks_temporal,
23
+ smooth_masks_temporal_fast,
24
+ smooth_masks_sdf,
25
+ )
26
+ from .mask_cache import (
27
+ save_video_masks,
28
+ load_video_masks,
29
+ get_mask_info,
30
+ )
31
+ from .gpu_memory import (
32
+ estimate_batch_sizes,
33
+ BatchSizeEstimate,
34
+ )
35
+ from .sdf_smoother import SDFSmoother
36
+
37
+ __all__ = [
38
+ "VideoProcessor",
39
+ "VideoFrame",
40
+ "CompressedChunk",
41
+ "CompressionSettings",
42
+ "ChunkPlan",
43
+ "MotionAnalyzer",
44
+ "ChunkCompressor",
45
+ "BandwidthController",
46
+ "smooth_masks_temporal",
47
+ "smooth_masks_temporal_fast",
48
+ "smooth_masks_sdf",
49
+ "save_video_masks",
50
+ "load_video_masks",
51
+ "get_mask_info",
52
+ "estimate_batch_sizes",
53
+ "BatchSizeEstimate",
54
+ "SDFSmoother",
55
+ ]
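The re-exports above make pipeline wiring a one-liner; a minimal sketch, where the `model` and `segmenter` objects are assumed to be constructed elsewhere:

```python
from video import BandwidthController, ChunkCompressor, MotionAnalyzer

controller = BandwidthController(target_bandwidth_kbps=500.0, base_fps=30.0)
analyzer = MotionAnalyzer()
compressor = ChunkCompressor(
    compression_model=model,   # loaded TIC model (assumption)
    segmenter=segmenter,       # optional ROI segmenter (assumption)
    target_classes=["person"],
)
```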
video/chunk_compressor.py ADDED
@@ -0,0 +1,877 @@
1
+ """Chunk-based video compression with bandwidth targeting.
2
+
3
+ Implements dynamic compression that balances framerate and spatial quality
4
+ to meet bandwidth constraints while prioritizing motion-heavy scenes.
5
+
6
+ Includes temporal smoothing for segmentation masks to reduce flickering.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass, field
12
+ from typing import List, Optional, Tuple, Generator, Dict, Any
13
+ import math
14
+
15
+ import numpy as np
16
+ from PIL import Image
17
+ from scipy import ndimage
18
+
19
+ from .motion_analyzer import MotionAnalyzer, MotionMetrics
20
+ from .sdf_smoother import SDFSmoother
21
+
22
+
23
+ def smooth_masks_temporal(
24
+ masks: List[np.ndarray],
25
+ window_size: int = 5,
26
+ threshold_appear: float = 0.4,
27
+ threshold_disappear: float = 0.2,
28
+ spatial_smooth: bool = True,
29
+ spatial_kernel_size: int = 5,
30
+ ) -> List[np.ndarray]:
31
+ """Apply temporal smoothing to a sequence of segmentation masks.
32
+
33
+ Reduces flickering by:
34
+ 1. Applying temporal median/mean filtering across frames
35
+ 2. Using hysteresis thresholding (different thresholds for appearing vs disappearing)
36
+ 3. Optional spatial smoothing to clean up edges
37
+
38
+ Args:
39
+ masks: List of binary/float masks (H, W), values in [0, 1]
40
+ window_size: Number of frames to consider for temporal filtering (odd number)
41
+ threshold_appear: Confidence threshold for a pixel to become ROI (higher = stricter)
42
+ threshold_disappear: Confidence threshold for a pixel to stop being ROI (lower = stickier)
43
+ spatial_smooth: Whether to apply spatial Gaussian smoothing
44
+ spatial_kernel_size: Size of spatial smoothing kernel
45
+
46
+ Returns:
47
+ List of smoothed masks
48
+ """
49
+ if not masks or len(masks) < 2:
50
+ return masks
51
+
52
+ # Convert to numpy array for efficient processing: (T, H, W)
53
+ h, w = masks[0].shape
54
+ mask_stack = np.stack([m.astype(np.float32) for m in masks], axis=0)
55
+ num_frames = mask_stack.shape[0]
56
+
57
+ # Pad temporally for filtering
58
+ half_window = window_size // 2
59
+ padded = np.pad(mask_stack, ((half_window, half_window), (0, 0), (0, 0)), mode='edge')
60
+
61
+ # Apply temporal filtering using a sliding window
62
+ smoothed = np.zeros_like(mask_stack)
63
+ for t in range(num_frames):
64
+ # Get window of frames
65
+ window = padded[t:t + window_size] # (window_size, H, W)
66
+ # Use weighted mean - center frame gets more weight. A triangular
+ # window generalizes the previously hard-coded [1, 2, 3, 2, 1]
+ # (which only covered window_size <= 5) to any odd window_size.
+ weights = window_size // 2 + 1 - np.abs(np.arange(window_size) - window_size // 2)
+ weights = weights / weights.sum()
69
+ weighted_mean = np.average(window, axis=0, weights=weights)
70
+ smoothed[t] = weighted_mean
71
+
72
+ # Apply hysteresis thresholding
73
+ # A pixel becomes ROI if confidence > threshold_appear
74
+ # A pixel stays ROI until confidence < threshold_disappear
75
+ result = np.zeros_like(smoothed)
76
+
77
+ # Initialize with first frame using appear threshold
78
+ result[0] = (smoothed[0] > threshold_appear).astype(np.float32)
79
+
80
+ for t in range(1, num_frames):
81
+ # Pixels that were ROI in previous frame
82
+ was_roi = result[t - 1] > 0.5
83
+
84
+ # New ROI: high confidence (above appear threshold)
85
+ new_roi = smoothed[t] > threshold_appear
86
+
87
+ # Continuing ROI: was ROI and still above disappear threshold
88
+ continuing_roi = was_roi & (smoothed[t] > threshold_disappear)
89
+
90
+ # Combine: either new or continuing
91
+ result[t] = (new_roi | continuing_roi).astype(np.float32)
92
+
93
+ # Optional: apply spatial smoothing to clean up edges
94
+ if spatial_smooth:
95
+ for t in range(num_frames):
96
+ # Gaussian blur then threshold
97
+ blurred = ndimage.gaussian_filter(result[t], sigma=spatial_kernel_size / 4)
98
+ result[t] = (blurred > 0.5).astype(np.float32)
99
+
100
+ # Convert back to list
101
+ return [result[t] for t in range(num_frames)]
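To see the hysteresis in action, a toy sketch on 1x1 "masks": a single-frame confidence dip does not switch an ROI pixel off, because the disappear threshold is stickier than the appear threshold:

```python
import numpy as np
from video.chunk_compressor import smooth_masks_temporal

# Steady ROI with one noisy dip, then steady background.
confidences = [1.0, 1.0, 0.3, 1.0, 1.0, 0.0, 0.0, 0.0]
masks = [np.full((1, 1), c, dtype=np.float32) for c in confidences]

smoothed = smooth_masks_temporal(masks, window_size=3, spatial_smooth=False)
print([float(m[0, 0]) for m in smoothed])  # the dip at frame 2 stays ROI
```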
102
+
103
+
104
+ def smooth_masks_temporal_fast(
105
+ masks: List[np.ndarray],
106
+ alpha: float = 0.3,
107
+ threshold: float = 0.5,
108
+ ) -> List[np.ndarray]:
109
+ """Fast temporal smoothing using exponential moving average.
110
+
111
+ Simpler and faster than full temporal smoothing, good for real-time.
112
+
113
+ Args:
114
+ masks: List of binary/float masks (H, W)
115
+ alpha: Smoothing factor (0-1). Lower = more smoothing, more lag.
116
+ threshold: Threshold for final binary mask
117
+
118
+ Returns:
119
+ List of smoothed masks
120
+ """
121
+ if not masks or len(masks) < 2:
122
+ return masks
123
+
124
+ result = []
125
+ ema = masks[0].astype(np.float32).copy()
126
+ result.append((ema > threshold).astype(np.float32))
127
+
128
+ for i in range(1, len(masks)):
129
+ current = masks[i].astype(np.float32)
130
+ # Exponential moving average
131
+ ema = alpha * current + (1 - alpha) * ema
132
+ result.append((ema > threshold).astype(np.float32))
133
+
134
+ return result
135
+
136
+
137
+ def smooth_masks_sdf(
138
+ masks: List[np.ndarray],
139
+ alpha: float = 0.5,
140
+ empty_thresh: int = 10,
141
+ patience: int = 5,
142
+ ) -> List[np.ndarray]:
143
+ """Smooth masks using Signed Distance Field temporal filtering.
144
+
145
+ Uses SDF representation for fluid, jitter-free transitions while
146
+ preserving sharp boundaries. More sophisticated than simple EMA.
147
+
148
+ Args:
149
+ masks: List of binary/float masks (H, W)
150
+ alpha: Smoothing factor (0.1 = slow/viscous, 0.9 = fast/reactive)
151
+ empty_thresh: Min pixel count to consider mask "valid"
152
+ patience: Frames to tolerate empty masks before decay
153
+ (0 = immediate, 5 = conservative, 15 = aggressive)
154
+
155
+ Returns:
156
+ List of smoothed masks
157
+ """
158
+ if not masks or len(masks) < 2:
159
+ return masks
160
+
161
+ smoother = SDFSmoother(alpha=alpha, empty_thresh=empty_thresh, patience=patience)
162
+ result = []
163
+
164
+ for mask in masks:
165
+ smoothed = smoother.update(mask.astype(np.float32))
166
+ result.append(smoothed)
167
+
168
+ return result
169
+
170
+
171
+ @dataclass
172
+ class CompressionResult:
173
+ """Result of compressing a single frame."""
174
+
175
+ compressed_image: Image.Image
176
+ bpp: float
177
+ original_size: Tuple[int, int]
178
+ roi_coverage: float
179
+
180
+
181
+ @dataclass
182
+ class ChunkResult:
183
+ """Result of compressing a video chunk."""
184
+
185
+ # Compressed frames for this chunk
186
+ frames: List[Image.Image]
187
+
188
+ # Frame indices from original video that were kept
189
+ frame_indices: List[int]
190
+
191
+ # Effective framerate for this chunk
192
+ effective_fps: float
193
+
194
+ # Quality level used (1-5)
195
+ quality_level: int
196
+
197
+ # Sigma (background preservation) used
198
+ sigma: float
199
+
200
+ # Average bits per pixel
201
+ avg_bpp: float
202
+
203
+ # Estimated chunk size in bytes
204
+ estimated_bytes: int
205
+
206
+ # Motion metrics for this chunk
207
+ motion_metrics: Optional[MotionMetrics] = None
208
+
209
+ # Chunk index
210
+ chunk_index: int = 0
211
+
212
+ # Total number of original frames in chunk
213
+ original_frame_count: int = 0
214
+
215
+
216
+ class BandwidthController:
217
+ """Controls compression parameters to meet bandwidth targets.
218
+
219
+ Dynamically adjusts:
220
+ - Frame sampling rate (effective FPS)
221
+ - Spatial compression quality
222
+ - Background preservation (sigma)
223
+
224
+ Based on:
225
+ - Target bandwidth constraint
226
+ - Motion complexity of current chunk
227
+ - Smooth transitions between settings
228
+ """
229
+
230
+ # Quality level presets (lambda, expected_bpp_range)
231
+ QUALITY_PRESETS = [
232
+ (1, 0.05, 0.15), # Lowest quality: ~0.05-0.15 bpp
233
+ (2, 0.10, 0.25), # Low quality: ~0.10-0.25 bpp
234
+ (3, 0.15, 0.40), # Medium quality: ~0.15-0.40 bpp
235
+ (4, 0.25, 0.60), # High quality: ~0.25-0.60 bpp
236
+ (5, 0.40, 1.00), # Best quality: ~0.40-1.00 bpp
237
+ ]
238
+
239
+ def __init__(
240
+ self,
241
+ target_bandwidth_kbps: float = 500.0,
242
+ base_fps: float = 30.0,
243
+ min_fps: float = 5.0,
244
+ max_fps: float = 60.0,
245
+ chunk_duration_sec: float = 1.0,
246
+ smoothing_factor: float = 0.3,
247
+ aggressiveness: float = 0.5,
248
+ ):
249
+ """
250
+ Args:
251
+ target_bandwidth_kbps: Target bandwidth in kilobits per second
252
+ base_fps: Original video framerate
253
+ min_fps: Minimum allowed effective framerate
254
+ max_fps: Maximum allowed effective framerate
255
+ chunk_duration_sec: Duration of each chunk in seconds
256
+ smoothing_factor: How much to smooth parameter transitions (0-1)
257
+ aggressiveness: Bandwidth savings strategy (0.0=use full bandwidth, 1.0=maximum savings)
258
+ """
259
+ self.target_bandwidth_kbps = target_bandwidth_kbps
260
+ self.base_fps = base_fps
261
+ self.min_fps = min_fps
262
+ self.max_fps = max_fps
263
+ self.chunk_duration_sec = chunk_duration_sec
264
+ self.smoothing_factor = smoothing_factor
265
+ self.aggressiveness = max(0.0, min(1.0, aggressiveness))
266
+
267
+ # State for smooth transitions
268
+ self._prev_fps: Optional[float] = None
269
+ self._prev_quality: Optional[int] = None
270
+ self._prev_sigma: Optional[float] = None
271
+
272
+ def reset(self):
273
+ """Reset state for new video."""
274
+ self._prev_fps = None
275
+ self._prev_quality = None
276
+ self._prev_sigma = None
277
+
278
+ def compute_settings(
279
+ self,
280
+ frame_width: int,
281
+ frame_height: int,
282
+ motion_metrics: MotionMetrics,
283
+ roi_coverage: float = 0.3,
284
+ ) -> Tuple[float, int, float]:
285
+ """Compute compression settings for a chunk.
286
+
287
+ Args:
288
+ frame_width: Frame width in pixels
289
+ frame_height: Frame height in pixels
290
+ motion_metrics: Motion analysis results
291
+ roi_coverage: Fraction of frame covered by ROI
292
+
293
+ Returns:
294
+ Tuple of (effective_fps, quality_level, sigma)
295
+ """
296
+ num_pixels = frame_width * frame_height
297
+
298
+ # Target bits per chunk
299
+ target_bits_per_chunk = self.target_bandwidth_kbps * 1000 * self.chunk_duration_sec
300
+
301
+ # Framerate adjustment based on motion
302
+ # High motion -> more frames, lower quality per frame
303
+ # Low motion -> fewer frames, higher quality per frame
304
+ fr_factor = motion_metrics.framerate_factor
305
+
306
+ # For dynamic mode with high aggressiveness, use a higher baseline
307
+ # to ensure motion-based variation actually produces meaningful FPS range
308
+ if self.aggressiveness > 0.5:
309
+ # Use average of min/max as effective base for motion scaling
310
+ effective_base_fps = (self.min_fps + self.max_fps) / 2
311
+ else:
312
+ # Use video's native FPS
313
+ effective_base_fps = self.base_fps
314
+
315
+ # Base framerate adjusted by motion
316
+ motion_fps = effective_base_fps * fr_factor
317
+ motion_fps = np.clip(motion_fps, self.min_fps, self.max_fps)
318
+
319
+ # Estimate frames in chunk at this FPS
320
+ frames_in_chunk = int(motion_fps * self.chunk_duration_sec)
321
+ frames_in_chunk = max(1, frames_in_chunk)
322
+
323
+ # Target bits per frame
324
+ target_bits_per_frame = target_bits_per_chunk / frames_in_chunk
325
+ target_bpp = target_bits_per_frame / num_pixels
326
+
327
+ # Find quality level that matches target BPP
328
+ quality_level = self._find_quality_for_bpp(target_bpp, roi_coverage)
329
+
330
+ # Compute sigma based on motion and quality
331
+ # Higher motion -> lower sigma (more background compression to save bits for motion)
332
+ # Lower quality -> lower sigma (aggressive background compression)
333
+ base_sigma = 0.3
334
+ motion_adjustment = (1.0 - motion_metrics.motion_magnitude) * 0.4
335
+ quality_adjustment = (quality_level - 3) * 0.1
336
+ sigma = np.clip(base_sigma + motion_adjustment + quality_adjustment, 0.05, 0.8)
337
+
338
+ # Final FPS adjustment based on actual quality/bpp achieved
339
+ expected_bpp = self._estimate_bpp(quality_level, sigma, roi_coverage)
340
+ expected_bits_per_frame = expected_bpp * num_pixels
341
+
342
+ current_bandwidth = (motion_fps * expected_bits_per_frame) / 1000 # kbps
343
+ bandwidth_ratio = current_bandwidth / self.target_bandwidth_kbps if self.target_bandwidth_kbps > 0 else 1.0
344
+
345
+ # At high aggressiveness, prioritize motion over bandwidth constraints
346
+ # At low aggressiveness, prioritize bandwidth target
347
+ if self.aggressiveness > 0.7:
348
+ # Very aggressive: trust motion analysis, ignore bandwidth (may exceed target)
349
+ # Only enforce absolute max_fps constraint, not bandwidth
350
+ final_fps = motion_fps
351
+ elif self.aggressiveness > 0.5:
352
+ # Moderately aggressive: allow bandwidth excursions for high motion
353
+ # Only reduce FPS if significantly over budget (>1.5x)
354
+ if bandwidth_ratio > 1.5:
355
+ reduction_factor = 1.5 / bandwidth_ratio
356
+ final_fps = motion_fps * reduction_factor
357
+ else:
358
+ final_fps = motion_fps
359
+ else:
360
+ # Conservative mode: enforce bandwidth target strictly
361
+ override_threshold = 1.2 + self.aggressiveness * 0.5 # 1.2 at agg=0, 1.45 at agg=0.5
362
+
363
+ if bandwidth_ratio > override_threshold:
364
+ # Bandwidth too high, reduce FPS
365
+ fps_reduction = (bandwidth_ratio - 1.0) * 0.5
366
+ final_fps = max(self.min_fps, motion_fps / (1 + fps_reduction))
367
+ else:
368
+ # Keep motion-based FPS
369
+ final_fps = motion_fps
370
+
371
+ final_fps = np.clip(final_fps, self.min_fps, self.max_fps)
372
+
373
+ # Smooth transitions (disable smoothing at high aggressiveness for dramatic variation)
374
+ # At agg>0.7, no FPS smoothing - follow motion exactly
375
+ # At agg≤0.7, apply smoothing
376
+ if self.aggressiveness > 0.7:
377
+ # No FPS smoothing - allow dramatic jumps
378
+ smoothing_fps = 0.0
379
+ smoothing_other = 0.1 # Minimal smoothing for quality/sigma
380
+ else:
381
+ smoothing_fps = 0.3 - self.aggressiveness * 0.3 # 0.3 at agg=0, 0.09 at agg=0.7
382
+ smoothing_other = smoothing_fps
383
+
384
+ quality_change_limit = 1 + int(self.aggressiveness * 2) # 1 at agg=0, 3 at agg=1.0
385
+
386
+ if self._prev_fps is not None and smoothing_fps > 0:
387
+ final_fps = self._smooth(final_fps, self._prev_fps, smoothing_fps)
388
+ if self._prev_quality is not None:
389
+ # Quality can change by quality_change_limit steps at a time
390
+ quality_level = int(np.clip(
391
+ quality_level,
392
+ self._prev_quality - quality_change_limit,
393
+ self._prev_quality + quality_change_limit
394
+ ))
395
+ if self._prev_sigma is not None and smoothing_other > 0:
396
+ sigma = self._smooth(sigma, self._prev_sigma, smoothing_other)
397
+
398
+ # Update state
399
+ self._prev_fps = final_fps
400
+ self._prev_quality = quality_level
401
+ self._prev_sigma = sigma
402
+
403
+ return float(final_fps), int(quality_level), float(sigma)
404
+
405
+ def _find_quality_for_bpp(self, target_bpp: float, roi_coverage: float) -> int:
406
+ """Find quality level that approximately matches target BPP."""
407
+ # ROI coverage affects effective BPP (more ROI = higher BPP for same quality)
408
+ adjusted_target = target_bpp / (0.5 + roi_coverage * 0.5)
409
+
410
+ for level, min_bpp, max_bpp in self.QUALITY_PRESETS:
411
+ mid_bpp = (min_bpp + max_bpp) / 2
412
+ if adjusted_target <= mid_bpp:
413
+ return level
414
+
415
+ return 5 # Max quality if target is very high
416
+
417
+ def _estimate_bpp(self, quality_level: int, sigma: float, roi_coverage: float) -> float:
418
+ """Estimate BPP for given settings."""
419
+ _, min_bpp, max_bpp = self.QUALITY_PRESETS[quality_level - 1]
420
+ base_bpp = (min_bpp + max_bpp) / 2
421
+
422
+ # Sigma affects background compression
423
+ # Lower sigma = more compression = lower BPP
424
+ sigma_factor = 0.5 + sigma * 0.5
425
+
426
+ # ROI coverage affects total BPP
427
+ # More ROI = more high-quality pixels = higher BPP
428
+ roi_factor = 0.7 + roi_coverage * 0.3
429
+
430
+ return base_bpp * sigma_factor * roi_factor
431
+
432
+ def _smooth(self, new_value: float, prev_value: float, factor: Optional[float] = None) -> float:
433
+ """Apply exponential smoothing.
434
+
435
+ Args:
436
+ new_value: Target value
437
+ prev_value: Previous value
438
+ factor: Smoothing factor (uses self.smoothing_factor if None)
439
+ """
440
+ if factor is None:
441
+ factor = self.smoothing_factor
442
+ return prev_value + factor * (new_value - prev_value)
443
+
444
+
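For a feel of the budget math in `compute_settings` above, a back-of-the-envelope sketch (the numbers are illustrative):

```python
# 500 kbps target, 1 s chunks, 640x480 frames, 30 fps after motion scaling:
target_bits_per_chunk = 500.0 * 1000 * 1.0             # 500_000 bits
target_bpp = target_bits_per_chunk / 30 / (640 * 480)
print(f"{target_bpp:.3f} bpp")                          # ~0.054
# With roi_coverage=0.3, _find_quality_for_bpp adjusts this to
# 0.054 / 0.65 ~= 0.083, which lands in quality level 1 of QUALITY_PRESETS.
```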
445
+ class ChunkCompressor:
446
+ """Compresses video chunks with adaptive settings.
447
+
448
+ Each chunk can have different:
449
+ - Frame sampling rate
450
+ - Compression quality
451
+ - Background preservation
452
+
453
+ Based on motion analysis and bandwidth constraints.
454
+ """
455
+
456
+ def __init__(
457
+ self,
458
+ compression_model,
459
+ segmenter=None,
460
+ target_classes: Optional[List[str]] = None,
461
+ device: str = "cuda",
462
+ ):
463
+ """
464
+ Args:
465
+ compression_model: Loaded TIC compression model
466
+ segmenter: Optional segmenter for ROI extraction
467
+ target_classes: Classes to segment as ROI
468
+ device: Compute device
469
+ """
470
+ self.compression_model = compression_model
471
+ self.segmenter = segmenter
472
+ self.target_classes = target_classes or []
473
+ self.device = device
474
+
475
+ self.motion_analyzer = MotionAnalyzer()
476
+
477
+ def reset(self):
478
+ """Reset state for new video."""
479
+ self.motion_analyzer.reset()
480
+
481
+ def compress_frame(
482
+ self,
483
+ frame: Image.Image,
484
+ mask: Optional[np.ndarray],
485
+ sigma: float,
486
+ ) -> CompressionResult:
487
+ """Compress a single frame.
488
+
489
+ Args:
490
+ frame: PIL Image frame
491
+ mask: ROI mask (or None for no ROI)
492
+ sigma: Background preservation factor
493
+
494
+ Returns:
495
+ CompressionResult with compressed image and stats
496
+ """
497
+ import vae
498
+
499
+ if mask is None:
500
+ mask = np.zeros((frame.height, frame.width), dtype=np.float32)
501
+
502
+ result = vae.compress_image(
503
+ image=frame,
504
+ mask=mask,
505
+ model=self.compression_model,
506
+ sigma=sigma,
507
+ device=self.device,
508
+ )
509
+
510
+ roi_coverage = float(mask.mean()) if mask is not None else 0.0
511
+
512
+ return CompressionResult(
513
+ compressed_image=result["compressed"],
514
+ bpp=result["bpp"],
515
+ original_size=(frame.width, frame.height),
516
+ roi_coverage=roi_coverage,
517
+ )
518
+
519
+ def compress_chunk(
520
+ self,
521
+ frames: List[Image.Image],
522
+ chunk_index: int,
523
+ effective_fps: float,
524
+ base_fps: float,
525
+ quality_level: int,
526
+ sigma: float,
527
+ roi_masks: Optional[List[np.ndarray]] = None,
528
+ ) -> ChunkResult:
529
+ """Compress a chunk of frames with given settings.
530
+
531
+ Args:
532
+ frames: List of PIL Image frames
533
+ chunk_index: Index of this chunk
534
+ effective_fps: Target effective framerate
535
+ base_fps: Original video framerate
536
+ quality_level: Compression quality 1-5
537
+ sigma: Background preservation factor
538
+ roi_masks: Optional list of ROI masks
539
+
540
+ Returns:
541
+ ChunkResult with compressed frames and stats
542
+ """
543
+ if not frames:
544
+ return ChunkResult(
545
+ frames=[],
546
+ frame_indices=[],
547
+ effective_fps=0.0,
548
+ quality_level=quality_level,
549
+ sigma=sigma,
550
+ avg_bpp=0.0,
551
+ estimated_bytes=0,
552
+ chunk_index=chunk_index,
553
+ original_frame_count=0,
554
+ )
555
+
556
+ # Compute frame sampling to achieve target FPS
557
+ frame_step = max(1, int(base_fps / effective_fps))
558
+ sampled_indices = list(range(0, len(frames), frame_step))
559
+
560
+ # Ensure at least one frame
561
+ if not sampled_indices:
562
+ sampled_indices = [0]
563
+
564
+ # Compress sampled frames
565
+ compressed_frames: List[Image.Image] = []
566
+ bpps: List[float] = []
567
+
568
+ for idx in sampled_indices:
569
+ frame = frames[idx]
570
+ mask = roi_masks[idx] if roi_masks and idx < len(roi_masks) else None
571
+
572
+ result = self.compress_frame(frame, mask, sigma)
573
+ compressed_frames.append(result.compressed_image)
574
+ bpps.append(result.bpp)
575
+
576
+ # Compute stats
577
+ avg_bpp = float(np.mean(bpps)) if bpps else 0.0
578
+ frame_pixels = frames[0].width * frames[0].height
579
+ total_bits = avg_bpp * frame_pixels * len(compressed_frames)
580
+ estimated_bytes = int(total_bits / 8)
581
+
582
+ actual_fps = len(sampled_indices) / (len(frames) / base_fps) if len(frames) > 0 else 0
583
+
584
+ return ChunkResult(
585
+ frames=compressed_frames,
586
+ frame_indices=sampled_indices,
587
+ effective_fps=float(actual_fps),
588
+ quality_level=quality_level,
589
+ sigma=sigma,
590
+ avg_bpp=avg_bpp,
591
+ estimated_bytes=estimated_bytes,
592
+ chunk_index=chunk_index,
593
+ original_frame_count=len(frames),
594
+ )
595
+
596
+ def segment_frames(
597
+ self,
598
+ frames: List[Image.Image],
599
+ temporal_smoothing: bool = True,
600
+ smoothing_alpha: float = 0.3,
601
+ ) -> List[np.ndarray]:
602
+ """Segment multiple frames to get ROI masks (sequential).
603
+
604
+ Args:
605
+ frames: List of PIL Images
606
+ temporal_smoothing: Whether to apply temporal smoothing
607
+ smoothing_alpha: Alpha for fast EMA smoothing (0-1, lower=smoother)
608
+
609
+ Returns:
610
+ List of ROI masks (temporally smoothed if enabled)
611
+ """
612
+ if self.segmenter is None:
613
+ return [np.zeros((f.height, f.width), dtype=np.float32) for f in frames]
614
+
615
+ masks = []
616
+ for frame in frames:
617
+ mask = self.segmenter(frame, target_classes=self.target_classes)
618
+ masks.append(mask.astype(np.float32))
619
+
620
+ # Apply temporal smoothing to reduce flickering
621
+ if temporal_smoothing and len(masks) > 2:
622
+ masks = smooth_masks_sdf(
623
+ masks,
624
+ alpha=smoothing_alpha,
625
+ empty_thresh=10,
626
+ patience=5,
627
+ )
628
+
629
+ return masks
630
+
+     def segment_frames_batch(
+         self,
+         frames: List[Image.Image],
+         batch_size: int = 4,
+         temporal_smoothing: bool = True,
+         smoothing_window: int = 5,
+         smoothing_alpha: float = 0.3,
+     ) -> List[np.ndarray]:
+         """Segment multiple frames with batch processing and temporal smoothing.
+
+         Processes frames in batches for better GPU utilization.
+         Falls back to sequential processing if batching is not supported.
+
+         Optionally applies temporal smoothing to reduce mask flickering.
+
+         Args:
+             frames: List of PIL Images
+             batch_size: Number of frames to process at once
+             temporal_smoothing: Whether to apply temporal smoothing
+             smoothing_window: Window size for windowed smoothing (odd number;
+                 unused by the SDF smoother below, kept for API compatibility)
+             smoothing_alpha: Alpha for fast EMA smoothing (0-1, lower=smoother)
+
+         Returns:
+             List of ROI masks (temporally smoothed if enabled)
+         """
+         if self.segmenter is None:
+             return [np.zeros((f.height, f.width), dtype=np.float32) for f in frames]
+
+         import torch
+
+         max_retries = 7
+         bs = batch_size
+         masks = None
+         for attempt in range(max_retries + 1):
+             try:
+                 masks = []
+                 # Use the batch API when the segmenter advertises support
+                 if hasattr(self.segmenter, 'segment_batch') and getattr(self.segmenter, 'supports_batch', False):
+                     for i in range(0, len(frames), bs):
+                         batch = frames[i:i + bs]
+                         batch_masks = self.segmenter.segment_batch(batch, target_classes=self.target_classes)
+                         masks.extend([m.astype(np.float32) for m in batch_masks])
+                 else:
+                     for frame in frames:
+                         mask = self.segmenter(frame, target_classes=self.target_classes)
+                         masks.append(mask.astype(np.float32))
+                 break  # success
+             except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
+                 if 'out of memory' in str(e).lower() and attempt < max_retries:
+                     bs = max(1, bs // 2)
+                     # Aggressive memory cleanup before retrying
+                     masks = None
+                     import gc
+                     gc.collect()
+                     torch.cuda.empty_cache()
+                     torch.cuda.synchronize()
+                     print(f"segment_frames_batch: OOM, retrying batch_size={bs} (attempt {attempt + 1}/{max_retries})")
+                     continue
+                 raise
+         if masks is None:
+             raise RuntimeError("Segmentation failed after OOM retries")
+
+         # Apply temporal smoothing to reduce flickering
+         if temporal_smoothing and len(masks) > 2:
+             # Use SDF smoothing for jitter-free, fluid transitions
+             masks = smooth_masks_sdf(
+                 masks,
+                 alpha=smoothing_alpha,
+                 empty_thresh=10,
+                 patience=5,
+             )
+
+         return masks
+
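The OOM handling above (halve the batch, clear the CUDA cache, retry) is a generic pattern; factored into a standalone helper it looks roughly like this (a sketch, the name `run_with_oom_backoff` is hypothetical and not part of this repo):

```python
import gc
from typing import Callable, TypeVar

import torch

T = TypeVar("T")

def run_with_oom_backoff(fn: Callable[[int], T], batch_size: int, max_retries: int = 7) -> T:
    """Call fn(batch_size); on CUDA OOM, halve the batch size and retry."""
    for attempt in range(max_retries + 1):
        try:
            return fn(batch_size)
        except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
            # On recent PyTorch, OutOfMemoryError is a RuntimeError subclass;
            # re-raise anything that is not an OOM, or once retries run out.
            if "out of memory" not in str(e).lower() or attempt == max_retries:
                raise
            batch_size = max(1, batch_size // 2)
            gc.collect()
            torch.cuda.empty_cache()
    raise RuntimeError("unreachable")
```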
+     def compress_frames_batch(
+         self,
+         frames: List[Image.Image],
+         masks: List[np.ndarray],
+         sigma: "float | List[float]",
+         batch_size: int = 4,
+     ) -> List[CompressionResult]:
+         """Compress multiple frames with true batch processing.
+
+         Processes frames in batches for better GPU utilization.
+         Batches padding operations and CPU transfers for efficiency.
+
+         Args:
+             frames: List of PIL Images
+             masks: List of ROI masks
+             sigma: Background preservation factor – a single float (uniform)
+                 or a per-frame list of floats.
+             batch_size: Number of frames to process at once
+
+         Returns:
+             List of CompressionResult
+         """
+         import torch
+
+         if not frames:
+             return []
+
+         # Normalise sigma to a per-frame list
+         if isinstance(sigma, (int, float)):
+             sigmas = [float(sigma)] * len(frames)
+         else:
+             sigmas = [float(s) for s in sigma]
+             if len(sigmas) != len(frames):
+                 raise ValueError(f"sigma list length {len(sigmas)} != frame count {len(frames)}")
+
+         results = []
+         bs = batch_size
+
+         # Process in batches; on CUDA OOM, halve the batch size and retry.
+         # The retry loop is bounded: once bs reaches 1, a further OOM re-raises.
+         pos = 0
+         while pos < len(frames):
+             batch_end = min(pos + bs, len(frames))
+             batch_frames = frames[pos:batch_end]
+             batch_masks = masks[pos:batch_end]
+
+             try:
+                 batch_results = self._compress_batch_inner(
+                     batch_frames, batch_masks, sigmas, pos, pad_cache=None,
+                 )
+                 results.extend(batch_results)
+                 pos = batch_end
+             except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
+                 if 'out of memory' in str(e).lower() and bs > 1:
+                     bs = max(1, bs // 2)
+                     torch.cuda.empty_cache()
+                     print(f"compress_frames_batch: OOM, retrying with batch_size={bs}")
+                     continue
+                 raise
+
+         # Clear GPU memory after all batches
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+         return results
+
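Putting the two batch stages together at a call site (the `pipeline`, `frames`, and batch-size values here are illustrative):

```python
# Hypothetical call site: segment once, then compress with a uniform sigma.
masks = pipeline.segment_frames_batch(frames, batch_size=8)
results = pipeline.compress_frames_batch(frames, masks, sigma=0.05, batch_size=4)

avg_bpp = sum(r.bpp for r in results) / len(results)
print(f"avg bpp: {avg_bpp:.3f}")
```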
+     def _compress_batch_inner(
+         self,
+         batch_frames: List[Image.Image],
+         batch_masks: List[np.ndarray],
+         sigmas: List[float],
+         global_offset: int,
+         pad_cache: Optional[Any] = None,
+     ) -> List[CompressionResult]:
+         """Compress a single batch of frames (inner helper).
+
+         Args:
+             batch_frames: Frames in this batch.
+             batch_masks: Masks in this batch.
+             sigmas: Per-frame sigma values (full list, indexed by global_offset + i).
+             global_offset: Start index into the full sigmas list.
+             pad_cache: Unused, reserved for future padding reuse.
+
+         Returns:
+             List of CompressionResult for this batch.
+         """
+         import torch
+         import torch.nn.functional as F
+         from vae.utils import compute_padding
+
+         batch_results = []
+         batch_tensors = []
+         batch_mask_tensors = []
+         original_sizes = []
+
+         for frame, mask in zip(batch_frames, batch_masks):
+             img_array = np.array(frame).astype(np.float32) / 255.0
+             img_tensor = torch.from_numpy(img_array).permute(2, 0, 1)
+
+             if mask is None:
+                 mask = np.zeros((frame.height, frame.width), dtype=np.float32)
+             mask_tensor = torch.from_numpy(mask).unsqueeze(0)
+
+             batch_tensors.append(img_tensor)
+             batch_mask_tensors.append(mask_tensor)
+             original_sizes.append((frame.width, frame.height))
+
+         # Padding is computed from the first frame: all frames in a batch are
+         # assumed to share the same resolution.
+         _, h, w = batch_tensors[0].shape
+         pad, unpad = compute_padding(h, w, min_div=256)
+
+         padded_batch = torch.stack([
+             F.pad(img_t, pad, mode='constant', value=0) for img_t in batch_tensors
+         ]).to(self.device)
+
+         padded_masks = torch.stack([
+             F.pad(mask_t, pad, mode='constant', value=0) for mask_t in batch_mask_tensors
+         ]).to(self.device)
+
+         with torch.no_grad():
+             # Extract per-frame sigma values for this batch
+             batch_sigmas = [sigmas[global_offset + i] for i in range(len(batch_frames))]
+
+             # Call the model once with the entire batch (true batching)
+             if all(s == batch_sigmas[0] for s in batch_sigmas):
+                 # All frames have the same sigma - use a scalar
+                 out = self.compression_model(
+                     padded_batch,
+                     padded_masks,
+                     sigma=batch_sigmas[0],
+                 )
+             else:
+                 # Different sigma values - pass as a tensor
+                 sigma_tensor = torch.tensor(batch_sigmas, device=self.device, dtype=torch.float32)
+                 out = self.compression_model(
+                     padded_batch,
+                     padded_masks,
+                     sigma=sigma_tensor,
+                 )
+
+             # Unpad all frames at once (unpad holds negative pads, i.e. a crop)
+             x_hat_padded = out['x_hat']
+             x_hat_batch = F.pad(x_hat_padded, unpad)
+
+             # Move to CPU and convert to numpy once
+             x_hat_cpu = x_hat_batch.cpu().numpy()
+
+             # Extract per-frame results
+             for i in range(len(batch_frames)):
+                 x_hat_np = x_hat_cpu[i].transpose(1, 2, 0)
+                 x_hat_np = np.clip(x_hat_np * 255, 0, 255).astype(np.uint8)
+                 compressed_img = Image.fromarray(x_hat_np)
+
+                 # Calculate BPP for this frame from the entropy-model likelihoods
+                 num_pixels = h * w
+                 likelihoods = out['likelihoods']
+                 bpp_y = torch.log(likelihoods['y'][i:i + 1]).sum() / (-np.log(2) * num_pixels)
+                 bpp_z = torch.log(likelihoods['z'][i:i + 1]).sum() / (-np.log(2) * num_pixels)
+                 bpp = (bpp_y + bpp_z).item()
+
+                 mask_for_coverage = batch_masks[i] if i < len(batch_masks) else None
+                 roi_coverage = float(mask_for_coverage.mean()) if mask_for_coverage is not None else 0.0
+
+                 batch_results.append(CompressionResult(
+                     compressed_image=compressed_img,
+                     bpp=bpp,
+                     original_size=original_sizes[i],
+                     roi_coverage=roi_coverage,
+                 ))
+
+         del padded_batch, padded_masks, x_hat_batch, x_hat_cpu
+         return batch_results
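For reference, the per-frame bpp computed above is the standard entropy-model rate estimate over both latents, evaluated with natural logs and rescaled by $-\ln 2$:

$$
\mathrm{bpp} \;=\; \frac{-\sum_i \log_2 p_y(y_i) \;-\; \sum_j \log_2 p_z(z_j)}{H\,W}
\;=\; \frac{\sum_i \ln p_y(y_i) + \sum_j \ln p_z(z_j)}{-\ln 2 \cdot H\,W},
$$

with $H\,W$ the unpadded pixel count of the frame.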
video/gpu_memory.py ADDED
@@ -0,0 +1,194 @@
+ """GPU memory estimation and automatic batch-size selection.
+
+ Queries free VRAM and uses per-model heuristics to pick the largest safe
+ batch size for both segmentation and compression stages. Falls back to
+ conservative defaults (compression batch = 1) on CPU or when CUDA info
+ is unavailable.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Optional
+
+
+ # ---------------------------------------------------------------------------
+ # Per-frame memory cost heuristics (bytes, float32, conservative)
+ # These are empirical estimates measured on a mix of 480p frames.
+ # ---------------------------------------------------------------------------
+
+ # Segmentation models: model key -> per-frame activation bytes at 480p.
+ # Parameter/weight overhead is listed separately in _SEG_MODEL_OVERHEAD.
+ _SEG_MEMORY_PER_FRAME: dict[str, float] = {
+     # YOLO-X seg – large model (71M params) with conv layers
+     "yolo": 180 * 1024**2,  # ~180 MB per frame at 480p (X model)
+     # SegFormer – transformer encoder, moderate
+     "segformer": 200 * 1024**2,  # ~200 MB per frame
+     # Mask2Former – Swin-Large + mask decoder, heavier
+     "mask2former": 350 * 1024**2,  # ~350 MB per frame
+     # Mask R-CNN – ResNet50-FPN, moderate
+     "maskrcnn": 250 * 1024**2,  # ~250 MB per frame
+     # SAM3 (OWL-ViT + SAM) – not truly batchable, treat as sequential
+     "sam3": 500 * 1024**2,  # ~500 MB (single image pipeline)
+     # Fake segmentation (detection + tracking → bbox masks) – much lighter
+     "fake_yolo": 120 * 1024**2,  # ~120 MB per frame (YOLO detection)
+     "fake_yolo_botsort": 120 * 1024**2,  # Same as fake_yolo
+     "fake_detr": 150 * 1024**2,  # ~150 MB per frame (DETR transformer)
+     "fake_deformable_detr": 170 * 1024**2,  # ~170 MB per frame
+     "fake_fasterrcnn": 140 * 1024**2,  # ~140 MB per frame (ResNet50 backbone)
+     "fake_retinanet": 140 * 1024**2,  # ~140 MB per frame
+     "fake_fcos": 130 * 1024**2,  # ~130 MB per frame
+     "fake_grounding_dino": 800 * 1024**2,  # ~800 MB per frame (very large: BERT text + Swin-T vision + cross-attention)
+ }
+
+ _SEG_MODEL_OVERHEAD: dict[str, float] = {
+     "yolo": 450 * 1024**2,  # YOLO-X model overhead (~71M params)
+     "segformer": 400 * 1024**2,
+     "mask2former": 800 * 1024**2,
+     "maskrcnn": 300 * 1024**2,
+     "sam3": 600 * 1024**2,
+     # Fake segmentation overhead (detector weights + tracker state)
+     "fake_yolo": 350 * 1024**2,  # YOLO-X detector weights
+     "fake_yolo_botsort": 350 * 1024**2,
+     "fake_detr": 200 * 1024**2,  # DETR weights
+     "fake_deformable_detr": 250 * 1024**2,
+     "fake_fasterrcnn": 160 * 1024**2,  # Faster R-CNN weights
+     "fake_retinanet": 160 * 1024**2,
+     "fake_fcos": 150 * 1024**2,
+     "fake_grounding_dino": 1200 * 1024**2,  # Grounding DINO weights (very large: ~700M params)
+ }
+
+ # TIC transformer compression: very heavy activations (attention is O(N²)).
+ # Measured empirically on 854×480 frames:
+ #   - Model params + entropy structures ≈ 1–3 GB
+ #   - Per-frame activations ≈ 2–3 GB (float32)
+ _TIC_MODEL_OVERHEAD: float = 2.0 * 1024**3  # ~2 GB for params + entropy buffers
+ _TIC_PER_FRAME_480P: float = 2.5 * 1024**3  # ~2.5 GB per frame
+
+ # Reference resolution for the estimates above
+ _REF_HEIGHT = 480
+ _REF_WIDTH = 854
+ _REF_PIXELS = _REF_HEIGHT * _REF_WIDTH
+
+ # Safety margin – leave at least this fraction of free memory unused
+ _SAFETY_MARGIN = 0.15  # 15%
+
+
+ @dataclass
+ class BatchSizeEstimate:
+     """Result of automatic batch-size estimation."""
+
+     seg_batch_size: int
+     compress_batch_size: int
+     free_vram_bytes: int
+     device: str
+     notes: str = ""
+
+
+ def _get_free_vram(device: str = "cuda") -> Optional[int]:
+     """Return free VRAM in bytes, or None if unavailable."""
+     try:
+         import torch
+
+         if not torch.cuda.is_available() or device == "cpu":
+             return None
+         dev_idx = 0
+         if ":" in device:
+             dev_idx = int(device.split(":")[1])
+         free, _total = torch.cuda.mem_get_info(dev_idx)
+         return int(free)
+     except Exception:
+         return None
+
+
+ def _scale_memory(base_bytes: float, frame_h: int, frame_w: int) -> float:
+     """Scale a 480p memory estimate to an arbitrary resolution."""
+     pixels = frame_h * frame_w
+     # Feature-map memory is roughly O(pixels); attention matrices add an
+     # O(tokens²) term. Use a slightly super-linear exponent to cover both.
+     ratio = pixels / _REF_PIXELS
+     return base_bytes * (ratio ** 1.2)
+
+
+ def estimate_batch_sizes(
+     frame_height: int = 480,
+     frame_width: int = 854,
+     seg_method: str = "yolo",
+     device: str = "cuda",
+     total_frames: int = 300,
+ ) -> BatchSizeEstimate:
+     """Estimate optimal batch sizes for segmentation and compression.
+
+     The function queries free VRAM and computes how many frames can be
+     processed in a single batch for each stage, independently.
+
+     Args:
+         frame_height: Height of (pre-processed) frames.
+         frame_width: Width of (pre-processed) frames.
+         seg_method: Segmentation method key (yolo, segformer, mask2former, etc.)
+         device: Torch device string.
+         total_frames: Total number of frames in the video.
+
+     Returns:
+         BatchSizeEstimate with recommended batch sizes.
+     """
+     free = _get_free_vram(device)
+     notes_parts: list[str] = []
+
+     if free is None:
+         # CPU fallback
+         return BatchSizeEstimate(
+             seg_batch_size=min(16, total_frames),
+             compress_batch_size=1,
+             free_vram_bytes=0,
+             device=device,
+             notes="CPU mode – using sequential compression, modest seg batches.",
+         )
+
+     usable = int(free * (1.0 - _SAFETY_MARGIN))
+
+     # --- Segmentation batch size ---
+     seg_key = seg_method.lower()
+     per_frame_seg = _scale_memory(
+         _SEG_MEMORY_PER_FRAME.get(seg_key, 200 * 1024**2),
+         frame_height,
+         frame_width,
+     )
+     model_overhead_seg = _SEG_MODEL_OVERHEAD.get(seg_key, 400 * 1024**2)
+     available_for_seg_frames = usable - model_overhead_seg
+     if available_for_seg_frames < per_frame_seg:
+         seg_batch = 1
+     else:
+         seg_batch = int(available_for_seg_frames / per_frame_seg)
+
+     # SAM3 is not truly batchable – cap to 1
+     if seg_key == "sam3":
+         seg_batch = 1
+         notes_parts.append("SAM3 is sequential (OWL-ViT+SAM pipeline).")
+
+     seg_batch = max(1, min(seg_batch, total_frames))
+
+     # --- Compression batch size ---
+     per_frame_compress = _scale_memory(_TIC_PER_FRAME_480P, frame_height, frame_width)
+     available_for_compress = usable - _TIC_MODEL_OVERHEAD
+     if available_for_compress < per_frame_compress:
+         compress_batch = 1
+     else:
+         compress_batch = int(available_for_compress / per_frame_compress)
+
+     compress_batch = max(1, min(compress_batch, total_frames))
+
+     notes_parts.append(
+         f"Free VRAM: {free / 1024**3:.1f} GB, "
+         f"usable: {usable / 1024**3:.1f} GB. "
+         f"Seg per-frame est: {per_frame_seg / 1024**2:.0f} MB, "
+         f"compress per-frame est: {per_frame_compress / 1024**2:.0f} MB."
+     )
+
+     return BatchSizeEstimate(
+         seg_batch_size=seg_batch,
+         compress_batch_size=compress_batch,
+         free_vram_bytes=free,
+         device=device,
+         notes=" ".join(notes_parts),
+     )
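A typical call, e.g. before processing a 720p video with the SegFormer segmenter (values illustrative; the import path assumes the `video/` layout above). Note the super-linear resolution scaling in `_scale_memory`: going from 480p (854×480) to 1080p multiplies the per-frame estimates by (2073600 / 409920)^1.2 ≈ 7.0×.

```python
from video.gpu_memory import estimate_batch_sizes

est = estimate_batch_sizes(
    frame_height=720,
    frame_width=1280,
    seg_method="segformer",
    device="cuda",
    total_frames=600,
)
print(est.seg_batch_size, est.compress_batch_size)
print(est.notes)
```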
video/mask_cache.py ADDED
@@ -0,0 +1,69 @@
+ """Utilities for saving and loading video segmentation masks."""
+
+ import os
+ import tempfile
+ from typing import List, Optional
+
+ import numpy as np
+
+
+ def save_video_masks(masks: List[np.ndarray], output_path: Optional[str] = None) -> str:
+     """Save video segmentation masks to a file.
+
+     Args:
+         masks: List of mask arrays (H, W) for each frame
+         output_path: Optional output path. If None, creates a temp file.
+
+     Returns:
+         Path to the saved mask file
+     """
+     if output_path is None:
+         # Create a temporary file; close the fd since savez_compressed reopens by path
+         fd, output_path = tempfile.mkstemp(suffix='.masks.npz', prefix='video_masks_')
+         os.close(fd)
+
+     # Stack masks and save
+     mask_array = np.stack(masks, axis=0)  # (T, H, W)
+     np.savez_compressed(output_path, masks=mask_array)
+
+     return output_path
+
+
+ def load_video_masks(mask_path: str) -> List[np.ndarray]:
+     """Load video segmentation masks from a file.
+
+     Args:
+         mask_path: Path to the saved mask file
+
+     Returns:
+         List of mask arrays (H, W) for each frame
+     """
+     data = np.load(mask_path)
+     mask_array = data['masks']  # (T, H, W)
+
+     # Convert back to a list of per-frame arrays
+     masks = [mask_array[i] for i in range(mask_array.shape[0])]
+
+     return masks
+
+
+ def get_mask_info(mask_path: str) -> dict:
+     """Get shape/dtype information about saved masks.
+
+     Note: this still reads the mask array from disk; the compressed npz
+     format does not expose array metadata without decompression.
+
+     Args:
+         mask_path: Path to the saved mask file
+
+     Returns:
+         Dictionary with mask metadata
+     """
+     data = np.load(mask_path)
+     mask_array = data['masks']
+
+     return {
+         'num_frames': mask_array.shape[0],
+         'height': mask_array.shape[1],
+         'width': mask_array.shape[2],
+         'dtype': str(mask_array.dtype),
+         'size_mb': mask_array.nbytes / (1024 * 1024),
+     }
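Round-trip usage of the cache helpers (shapes illustrative):

```python
import numpy as np
from video.mask_cache import save_video_masks, load_video_masks, get_mask_info

masks = [np.random.rand(480, 854).astype(np.float32) for _ in range(10)]
path = save_video_masks(masks)   # temp *.masks.npz when no output path is given
print(get_mask_info(path))       # {'num_frames': 10, 'height': 480, 'width': 854, ...}

restored = load_video_masks(path)
assert np.allclose(masks[0], restored[0])
```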