Image-Text-to-Text
Transformers
English
vision-language-model
vlm
surveillance
iot
gemma
vl-jepa
multimodal
object-detection
video-analytics
Instructions to use hardiksa/arcisvlm with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use hardiksa/arcisvlm with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="hardiksa/arcisvlm")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("hardiksa/arcisvlm", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use hardiksa/arcisvlm with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "hardiksa/arcisvlm"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "hardiksa/arcisvlm",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/hardiksa/arcisvlm
- SGLang
How to use hardiksa/arcisvlm with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "hardiksa/arcisvlm" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "hardiksa/arcisvlm",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "hardiksa/arcisvlm" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "hardiksa/arcisvlm",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use hardiksa/arcisvlm with Docker Model Runner:
docker model run hf.co/hardiksa/arcisvlm
| """ | |
| Camera/Scene Augmentation — synthetic diversity for prototype LoRA training. | |
| Generates diverse camera perspectives from a single image by simulating | |
| different camera types, scenes, and conditions. Used in Stage 5 (prototype | |
| LoRA training) to create varied conditioning inputs so each prototype | |
| specializes on a different camera/scene combination. | |
| Augmentations: | |
| - Crop position: simulate PTZ pan/tilt (random crop of original) | |
| - Brightness/contrast: simulate day/night/indoor/outdoor lighting | |
| - Color temperature: simulate warm (indoor) vs cool (outdoor) cameras | |
| - Noise: simulate low-light camera sensor noise | |
| - Blur: simulate focus/motion blur from cheap cameras | |
| - Resolution: simulate different camera resolutions | |
| Reference: Train on augmented data so hypernetwork learns to differentiate | |
| camera/scene contexts and generate specialized adapters. | |
| """ | |
| from __future__ import annotations | |
| import random | |
| from typing import Optional | |
| import torch | |
| import torch.nn.functional as F | |
class CameraAugmentor:
    """
    Applies random camera-specific augmentations to simulate diverse cameras.

    Every call draws a fresh set of parameters from Python's global ``random``
    module (plus ``torch.randn_like`` for the noise field). One parameter set
    is shared by all images in a batch.

    Args:
        img_size: Target output image size (square)
        crop_range: Min/max fraction of image to crop (simulates zoom level)
        brightness_range: Min/max brightness multiplier
        contrast_range: Min/max contrast multiplier
        noise_range: Min/max Gaussian noise std
        blur_prob: Probability of applying Gaussian blur
        color_temp_range: Min/max color temperature shift
    """

    def __init__(
        self,
        img_size: int = 448,
        crop_range: tuple[float, float] = (0.5, 1.0),
        brightness_range: tuple[float, float] = (0.5, 1.5),
        contrast_range: tuple[float, float] = (0.7, 1.3),
        noise_range: tuple[float, float] = (0.0, 0.05),
        blur_prob: float = 0.3,
        color_temp_range: tuple[float, float] = (-0.1, 0.1),
    ):
        self.img_size = img_size
        self.crop_range = crop_range
        self.brightness_range = brightness_range
        self.contrast_range = contrast_range
        self.noise_range = noise_range
        self.blur_prob = blur_prob
        self.color_temp_range = color_temp_range

    def __call__(self, image: torch.Tensor) -> tuple[torch.Tensor, dict]:
        """
        Apply random camera augmentation.

        Args:
            image: [3, H, W] or [B, 3, H, W] image tensor (0-1 range)

        Returns:
            augmented_image: [3, img_size, img_size] for unbatched input,
                [B, 3, img_size, img_size] for batched input, clamped to 0-1
            aug_params: dict describing which augmentations were applied
                (keys: "crop", "brightness", "contrast", "color_temp",
                "noise_std", "blur")

        Raises:
            ValueError: if ``image`` is neither 3- nor 4-dimensional.
        """
        squeeze = image.dim() == 3
        if squeeze:
            image = image.unsqueeze(0)
        if image.dim() != 4:
            raise ValueError(
                f"expected a [3, H, W] or [B, 3, H, W] tensor, got shape {tuple(image.shape)}"
            )

        _, _, H, W = image.shape
        aug_params: dict = {}

        # 1. Random crop (simulates PTZ zoom/position)
        crop_frac = random.uniform(*self.crop_range)
        # Keep at least one pixel per axis: int() truncation would otherwise
        # yield an empty crop for tiny inputs / small fractions, which
        # F.interpolate rejects.
        crop_h = max(1, int(H * crop_frac))
        crop_w = max(1, int(W * crop_frac))
        top = random.randint(0, H - crop_h)
        left = random.randint(0, W - crop_w)
        image = image[:, :, top:top + crop_h, left:left + crop_w]
        image = F.interpolate(
            image,
            size=(self.img_size, self.img_size),
            mode="bilinear",
            align_corners=False,
        )
        aug_params["crop"] = {"frac": crop_frac, "top": top, "left": left}

        # 2. Brightness
        brightness = random.uniform(*self.brightness_range)
        image = image * brightness
        aug_params["brightness"] = brightness

        # 3. Contrast (scaled around each image's per-channel spatial mean)
        contrast = random.uniform(*self.contrast_range)
        mean = image.mean(dim=[2, 3], keepdim=True)
        image = (image - mean) * contrast + mean
        aug_params["contrast"] = contrast

        # 4. Color temperature shift
        temp = random.uniform(*self.color_temp_range)
        if abs(temp) > 0.01:
            # Warm: boost red, reduce blue. Cool: opposite.
            # In-place is safe here: `image` is a fresh tensor produced by the
            # arithmetic above, never the caller's input.
            image[:, 0] += temp  # Red channel
            image[:, 2] -= temp  # Blue channel
        aug_params["color_temp"] = temp

        # 5. Gaussian noise (sensor noise)
        noise_std = random.uniform(*self.noise_range)
        if noise_std > 0.001:
            image = image + torch.randn_like(image) * noise_std
        aug_params["noise_std"] = noise_std

        # 6. Gaussian blur (focus/motion blur)
        if random.random() < self.blur_prob:
            kernel_size = random.choice([3, 5])
            sigma = random.uniform(0.5, 2.0)
            image = _gaussian_blur(image, kernel_size, sigma)
            aug_params["blur"] = {"kernel": kernel_size, "sigma": sigma}
        else:
            aug_params["blur"] = None

        # Clamp to valid range
        image = image.clamp(0, 1)
        if squeeze:
            image = image.squeeze(0)
        return image, aug_params
class ScenePresetAugmentor:
    """
    Applies pre-defined scene augmentation presets for prototype diversity.

    Each preset simulates a specific camera installation scenario and fixes
    the brightness, contrast, color temperature shift, noise std and
    center-crop fraction to use.

    Args:
        img_size: Target output image size (square)
    """

    PRESETS = {
        "lobby_daytime": {"brightness": 1.2, "contrast": 1.0, "color_temp": 0.05, "noise": 0.01, "crop_frac": 0.8},
        "lobby_nighttime": {"brightness": 0.5, "contrast": 0.8, "color_temp": -0.05, "noise": 0.04, "crop_frac": 0.8},
        "parking_day": {"brightness": 1.3, "contrast": 1.2, "color_temp": 0.0, "noise": 0.01, "crop_frac": 0.6},
        "parking_night": {"brightness": 0.3, "contrast": 0.7, "color_temp": -0.08, "noise": 0.05, "crop_frac": 0.6},
        "gate_clear": {"brightness": 1.1, "contrast": 1.1, "color_temp": 0.02, "noise": 0.01, "crop_frac": 0.9},
        "gate_rain": {"brightness": 0.7, "contrast": 0.9, "color_temp": -0.03, "noise": 0.03, "crop_frac": 0.9},
        "corridor_empty": {"brightness": 0.9, "contrast": 1.0, "color_temp": 0.08, "noise": 0.02, "crop_frac": 1.0},
        "corridor_crowded": {"brightness": 0.9, "contrast": 1.0, "color_temp": 0.08, "noise": 0.02, "crop_frac": 0.7},
        "rooftop_twilight": {"brightness": 0.4, "contrast": 1.1, "color_temp": -0.1, "noise": 0.03, "crop_frac": 0.5},
        "rooftop_noon": {"brightness": 1.4, "contrast": 1.3, "color_temp": 0.0, "noise": 0.01, "crop_frac": 0.5},
    }

    def __init__(self, img_size: int = 448):
        self.img_size = img_size

    def augment(self, image: torch.Tensor, preset_name: str) -> torch.Tensor:
        """
        Apply a named scene preset augmentation.

        Args:
            image: [3, H, W] or [B, 3, H, W] image tensor (0-1 range)
            preset_name: Key into ``PRESETS``. Unknown names fall back to
                "lobby_daytime".

        Returns:
            Augmented image clamped to 0-1: [3, img_size, img_size] for
            unbatched input, [B, 3, img_size, img_size] for batched input
            (the batch dimension is kept even when B == 1).
        """
        preset = self.PRESETS.get(preset_name, self.PRESETS["lobby_daytime"])

        # Only drop the batch dim on the way out if we added it ourselves, so
        # a [1, 3, H, W] input comes back as a batch of one.
        squeeze = image.dim() == 3
        if squeeze:
            image = image.unsqueeze(0)
        _, _, H, W = image.shape

        # Center crop; keep at least one pixel per axis so tiny inputs do not
        # produce an empty tensor that F.interpolate would reject.
        frac = preset["crop_frac"]
        ch = max(1, int(H * frac))
        cw = max(1, int(W * frac))
        top = (H - ch) // 2
        left = (W - cw) // 2
        image = image[:, :, top:top + ch, left:left + cw]
        image = F.interpolate(
            image,
            size=(self.img_size, self.img_size),
            mode="bilinear",
            align_corners=False,
        )

        # Brightness + contrast (contrast around the per-channel spatial mean)
        mean = image.mean(dim=[2, 3], keepdim=True)
        image = ((image - mean) * preset["contrast"] + mean) * preset["brightness"]

        # Color temperature: positive shifts add to red and subtract from blue.
        # In-place is safe: `image` is a fresh tensor from the line above.
        image[:, 0] += preset["color_temp"]
        image[:, 2] -= preset["color_temp"]

        # Noise
        if preset["noise"] > 0:
            image = image + torch.randn_like(image) * preset["noise"]

        image = image.clamp(0, 1)
        return image.squeeze(0) if squeeze else image

    @classmethod
    def list_presets(cls) -> list[str]:
        """Return the names of all available scene presets."""
        return list(cls.PRESETS.keys())
def _gaussian_blur(image: torch.Tensor, kernel_size: int, sigma: float) -> torch.Tensor:
    """
    Apply Gaussian blur to image tensor.

    The same normalized 2D Gaussian kernel is applied to every channel
    independently (depthwise convolution) with zero padding of
    ``kernel_size // 2`` on each side.

    Args:
        image: [B, C, H, W] floating-point tensor
        kernel_size: Side length of the square kernel. Use an odd value; with
            an even value the padding makes the output one pixel larger than
            the input in each spatial dimension.
        sigma: Standard deviation of the Gaussian, in pixels

    Returns:
        Blurred tensor with the same dtype and device as ``image``.
    """
    channels = image.shape[1]

    # Build the 1D Gaussian in float32 and cast afterwards, so the kernel is
    # computed the same way whatever dtype the image uses.
    coords = torch.arange(kernel_size, dtype=torch.float32) - kernel_size // 2
    kernel_1d = torch.exp(-0.5 * (coords / sigma) ** 2)
    kernel_1d = kernel_1d / kernel_1d.sum()

    # Outer product gives the separable 2D kernel.
    kernel_2d = kernel_1d[:, None] * kernel_1d[None, :]

    # conv2d requires weight and input to share dtype as well as device.
    weight = kernel_2d.to(device=image.device, dtype=image.dtype)
    weight = weight[None, None].expand(channels, -1, -1, -1)  # [C, 1, K, K]

    pad = kernel_size // 2
    return F.conv2d(image, weight, padding=pad, groups=channels)