Image-Text-to-Text
Transformers
English
vision-language-model
vlm
surveillance
iot
gemma
vl-jepa
multimodal
object-detection
video-analytics
Instructions to use hardiksa/arcisvlm with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use hardiksa/arcisvlm with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="hardiksa/arcisvlm")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("hardiksa/arcisvlm", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use hardiksa/arcisvlm with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "hardiksa/arcisvlm"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "hardiksa/arcisvlm",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/hardiksa/arcisvlm
- SGLang
How to use hardiksa/arcisvlm with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "hardiksa/arcisvlm" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "hardiksa/arcisvlm",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "hardiksa/arcisvlm" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "hardiksa/arcisvlm",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use hardiksa/arcisvlm with Docker Model Runner:
docker model run hf.co/hardiksa/arcisvlm
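#!/usr/bin/env python3
"""
Build the Stage 3 domain fine-tuning datasets.

Datasets: COCO Detection, VisDrone, MOT, UCF Crime, ActivityNet captions.
COCO Detection, VisDrone and ActivityNet captions are streamed from
HuggingFace; images, when present, are saved as JPEG files next to the JSONL
records. The UCF Crime and surveillance VQA records are generated locally as
synthetic question/answer pairs. MOT is listed as a Stage 3 dataset but has
no download step in this script.
"""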
| import os | |
| import json | |
| import random | |
| import torch | |
| from pathlib import Path | |
def download_coco_detection(output_dir: str, max_samples: int = 118000):
    """Stream COCO 2017 detection data from HuggingFace and save it locally.

    For each of the first ``max_samples`` items of the train split of
    ``detection-datasets/coco`` this writes one question/answer record to
    ``<output_dir>/coco_detection.jsonl`` and, when the item carries an
    image, saves it as ``<output_dir>/images/coco_<index>.jpg``.

    Unlike the other download helpers in this file, failures are not caught
    here: an import, network or disk error propagates to the caller.

    Args:
        output_dir: Directory receiving the JSONL file and the ``images/``
            sub-directory. Created if missing.
        max_samples: Maximum number of streamed items to process.

    Returns:
        Number of records written to the JSONL file.
    """
    # COCO category ID → human-readable name
    COCO_CATS = {
        0: "person", 1: "bicycle", 2: "car", 3: "motorcycle", 4: "airplane",
        5: "bus", 6: "train", 7: "truck", 8: "boat", 9: "traffic light",
        10: "fire hydrant", 11: "stop sign", 12: "parking meter", 13: "bench",
        14: "bird", 15: "cat", 16: "dog", 17: "horse", 18: "sheep",
        19: "cow", 20: "elephant", 21: "bear", 22: "zebra", 23: "giraffe",
        24: "backpack", 25: "umbrella", 26: "handbag", 27: "tie", 28: "suitcase",
        29: "frisbee", 30: "skis", 31: "snowboard", 32: "sports ball", 33: "kite",
        34: "baseball bat", 35: "baseball glove", 36: "skateboard", 37: "surfboard",
        38: "tennis racket", 39: "bottle", 40: "wine glass", 41: "cup",
        42: "fork", 43: "knife", 44: "spoon", 45: "bowl", 46: "banana",
        47: "apple", 48: "sandwich", 49: "orange", 50: "broccoli",
        51: "carrot", 52: "hot dog", 53: "pizza", 54: "donut", 55: "cake",
        56: "chair", 57: "couch", 58: "potted plant", 59: "bed",
        60: "dining table", 61: "toilet", 62: "tv", 63: "laptop",
        64: "mouse", 65: "remote", 66: "keyboard", 67: "cell phone",
        68: "microwave", 69: "oven", 70: "toaster", 71: "sink",
        72: "refrigerator", 73: "book", 74: "clock", 75: "vase",
        76: "scissors", 77: "teddy bear", 78: "hair drier", 79: "toothbrush",
    }
    # Image modes Pillow's JPEG writer accepts; anything else (RGBA, P, LA, ...)
    # must be converted before saving or Image.save raises.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    print(f" Downloading COCO Detection (up to {max_samples} samples)...")
    from datasets import load_dataset

    # Create the target directories once instead of on every iteration.
    image_dir = os.path.join(output_dir, "images")
    os.makedirs(image_dir, exist_ok=True)

    ds = load_dataset("detection-datasets/coco", split="train", streaming=True)
    samples = []
    for i, item in enumerate(ds):
        if i >= max_samples:
            break

        # Map up to ten category ids to names, de-duplicated in first-seen order.
        objects = item.get("objects", {})
        categories = objects.get("category", []) if isinstance(objects, dict) else []
        if isinstance(categories, list):
            names = dict.fromkeys(
                COCO_CATS.get(int(c), "object") for c in categories[:10]
            )
            labels = ", ".join(names)
        else:
            labels = "various objects"

        image = item.get("image")
        sample = {
            "question": "What objects are in this image?",
            "answer": labels if labels else "various objects",
            "task_type": "detection",
            "has_image": image is not None,
        }

        if image is not None:
            # Save the image so later stages can load it from disk.
            # NOTE(review): assumes `image` is a PIL image — confirm against
            # the dataset schema.
            img_path = os.path.join(image_dir, f"coco_{i:06d}.jpg")
            mode = getattr(image, "mode", None)
            if mode is not None and mode not in jpeg_modes:
                image = image.convert("RGB")
            image.save(img_path)
            sample["image_path"] = img_path

        samples.append(sample)

        if (i + 1) % 5000 == 0:
            print(f" COCO: {i + 1} samples processed...")

    # Save as JSONL, one record per line.
    jsonl_path = os.path.join(output_dir, "coco_detection.jsonl")
    with open(jsonl_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(s) + "\n" for s in samples)

    print(f" COCO Detection: {len(samples)} samples saved to {jsonl_path}")
    return len(samples)
def download_visdrone(output_dir: str, max_samples: int = 10000):
    """Stream the VisDrone drone-surveillance dataset and save it locally.

    For each of the first ``max_samples`` items of the train split of
    ``Voxel51/VisDrone2019-DET`` this writes one question/answer record to
    ``<output_dir>/visdrone.jsonl`` and, when the item carries an image,
    saves it as ``<output_dir>/images/visdrone_<index>.jpg``.

    The download is best-effort: any failure (missing ``datasets`` package,
    network error, unexpected item layout, disk error) is reported with a
    warning and the function returns 0.

    Args:
        output_dir: Directory receiving the JSONL file and the ``images/``
            sub-directory. Created if missing.
        max_samples: Maximum number of streamed items to process.

    Returns:
        Number of records written, or 0 when the download failed.
    """
    # Image modes Pillow's JPEG writer accepts; anything else (RGBA, P, LA, ...)
    # must be converted first, otherwise a single such image would raise and
    # abort the whole dataset through the except clause below.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    print(f" Downloading VisDrone (up to {max_samples} samples)...")
    try:
        from datasets import load_dataset

        # Create the target directories once instead of on every iteration.
        image_dir = os.path.join(output_dir, "images")
        os.makedirs(image_dir, exist_ok=True)

        ds = load_dataset("Voxel51/VisDrone2019-DET", split="train", streaming=True)
        samples = []
        for i, item in enumerate(ds):
            if i >= max_samples:
                break

            image = item.get("image")
            objects = item.get("objects", {})
            categories = objects.get("category", []) if isinstance(objects, dict) else []
            if isinstance(categories, list):
                # De-duplicate in first-seen order, as the COCO helper does.
                # NOTE(review): categories are emitted as the dataset provides
                # them; if they are numeric ids they still need a name mapping
                # — confirm against the dataset schema.
                labels = ", ".join(dict.fromkeys(str(c) for c in categories[:10]))
            else:
                labels = str(categories)

            sample = {
                "question": "What objects are visible from this drone view?",
                "answer": labels if labels else "pedestrians and vehicles",
                "task_type": "detection",
                "has_image": image is not None,
            }

            if image is not None:
                # NOTE(review): assumes `image` is a PIL image — confirm.
                img_path = os.path.join(image_dir, f"visdrone_{i:06d}.jpg")
                mode = getattr(image, "mode", None)
                if mode is not None and mode not in jpeg_modes:
                    image = image.convert("RGB")
                image.save(img_path)
                sample["image_path"] = img_path

            samples.append(sample)

            if (i + 1) % 2000 == 0:
                print(f" VisDrone: {i + 1} samples...")

        jsonl_path = os.path.join(output_dir, "visdrone.jsonl")
        with open(jsonl_path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(s) + "\n" for s in samples)

        print(f" VisDrone: {len(samples)} samples saved")
        return len(samples)
    except Exception as e:
        # Deliberate best-effort: this dataset is optional for the pipeline.
        print(f" [WARN] VisDrone download failed: {e}")
        return 0
def download_activitynet_captions(output_dir: str, max_samples: int = 100000):
    """Stream ActivityNet Captions and save them as caption QA records.

    Reads at most ``max_samples`` items from the train split of
    ``mbiancorosselli/ActivityNet-Captions``. Each item with a non-empty
    caption becomes one record in ``<output_dir>/activitynet.jsonl``; items
    without a caption are skipped, so fewer than ``max_samples`` records may
    be written.

    The download is best-effort: any failure is reported with a warning and
    the function returns 0.

    NOTE(review): the records contain no video identifier or path, so the
    captions are not linked to any visual input — confirm this is intended.

    Args:
        output_dir: Directory receiving the JSONL file. Created if missing.
        max_samples: Maximum number of streamed items to inspect.

    Returns:
        Number of records written, or 0 when the download failed.
    """
    print(f" Downloading ActivityNet Captions (up to {max_samples} samples)...")
    try:
        from datasets import load_dataset

        os.makedirs(output_dir, exist_ok=True)

        ds = load_dataset("mbiancorosselli/ActivityNet-Captions", split="train", streaming=True)
        samples = []
        for i, item in enumerate(ds):
            if i >= max_samples:
                break

            # Prefer "sentence", but fall back to "caption" when the former is
            # missing OR present-but-empty (dict.get's default only covers a
            # missing key).
            caption = item.get("sentence") or item.get("caption") or ""
            caption = str(caption).strip()
            if not caption:
                continue

            samples.append({
                "question": "Describe the activity in this video.",
                "answer": caption,
                "task_type": "caption",
            })

            if (i + 1) % 10000 == 0:
                print(f" ActivityNet: {i + 1} samples...")

        jsonl_path = os.path.join(output_dir, "activitynet.jsonl")
        with open(jsonl_path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(s) + "\n" for s in samples)

        print(f" ActivityNet: {len(samples)} samples saved")
        return len(samples)
    except Exception as e:
        # Deliberate best-effort: this dataset is optional for the pipeline.
        print(f" [WARN] ActivityNet download failed: {e}")
        return 0
def download_ucf_crime(output_dir: str, max_samples: int = 1900, seed=None):
    """Generate synthetic UCF Crime-style alert QA pairs.

    Despite the name, nothing is downloaded: each record pairs a randomly
    chosen surveillance question with a templated answer for a randomly
    chosen crime category (or a "Normal" scene). Records are written to
    ``<output_dir>/ucf_crime.jsonl`` and carry no image or video reference.

    Args:
        output_dir: Directory receiving the JSONL file. Created if missing.
        max_samples: Requested number of records; capped at 1900.
        seed: Optional seed for reproducible output. When ``None`` the
            module-level ``random`` state is used, as before.

    Returns:
        Number of records written.
    """
    # Hard upper bound on generated records.
    # NOTE(review): presumably mirrors the size of UCF-Crime — confirm.
    max_records = 1900

    print(f" Downloading UCF Crime descriptions (up to {max_samples} samples)...")

    # UCF Crime doesn't have a clean HF dataset — generate surveillance-style QA pairs
    crime_types = [
        "Abuse", "Arrest", "Arson", "Assault", "Burglary", "Explosion",
        "Fighting", "RoadAccidents", "Robbery", "Shooting", "Shoplifting",
        "Stealing", "Vandalism", "Normal"
    ]
    surveillance_questions = [
        "Is there any suspicious activity?",
        "What is happening in this security camera feed?",
        "Describe any anomalies in this scene.",
        "Are there any security concerns visible?",
        "What type of activity is occurring?",
    ]

    # A private generator keeps seeded runs independent of global random state.
    rng = random if seed is None else random.Random(seed)

    samples = []
    for _ in range(min(max_samples, max_records)):
        crime = rng.choice(crime_types)
        question = rng.choice(surveillance_questions)
        if crime == "Normal":
            answer = "No suspicious activity detected. Normal scene with regular pedestrian and vehicle movement."
        else:
            answer = f"Potential {crime.lower()} activity detected. Security alert recommended."
        samples.append({
            "question": question,
            "answer": answer,
            "task_type": "alert",
        })

    os.makedirs(output_dir, exist_ok=True)
    jsonl_path = os.path.join(output_dir, "ucf_crime.jsonl")
    with open(jsonl_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(s) + "\n" for s in samples)

    print(f" UCF Crime: {len(samples)} samples saved")
    return len(samples)
def download_surveillance_vqa(output_dir: str, max_samples: int = 50000, seed=None):
    """Generate synthetic surveillance-domain VQA training pairs.

    Each record is produced from one of six templates modelled on common
    camera-analytics use cases: counting, detection, activity, scene
    description, anomaly and OCR. The answers are randomly generated text;
    they are not derived from any image and the records carry no image
    reference. Records are written to ``<output_dir>/surveillance_vqa.jsonl``.

    Every template yields the question, the answer and the matching
    ``task_type`` together, so the label always agrees with the question and
    a scene description answers for the same location and time it was asked
    about.

    Args:
        output_dir: Directory receiving the JSONL file. Created if missing.
        max_samples: Number of records to generate.
        seed: Optional seed for reproducible output. When ``None`` the
            module-level ``random`` state is used.

    Returns:
        Number of records written.
    """
    print(f" Generating surveillance VQA pairs (up to {max_samples} samples)...")

    # A private generator keeps seeded runs independent of global random state.
    rng = random if seed is None else random.Random(seed)

    objects = ["person", "car", "truck", "bicycle", "motorcycle", "bus", "dog", "cat",
               "backpack", "handbag", "suitcase", "skateboard", "umbrella", "bottle"]
    # Plurals that a plain "+s" gets wrong ("buss", "persons").
    irregular_plurals = {"person": "people", "bus": "buses"}
    locations = ["parking lot", "lobby", "hallway", "entrance", "warehouse", "street",
                 "intersection", "loading dock", "stairwell", "elevator area", "gate"]
    times = ["daytime", "nighttime", "dawn", "dusk", "overcast conditions"]
    actions = ["walking", "running", "standing", "sitting", "carrying a bag",
               "talking on phone", "entering", "exiting", "loitering", "crossing the road"]
    anomaly_answers = [
        "No anomalies detected. Normal activity.",
        "Unusual gathering of people near the restricted area.",
        "Unattended bag detected near the entrance.",
        "Person appears to be loitering for an extended period.",
        "Vehicle parked in no-parking zone.",
    ]
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    def counting():
        obj = rng.choice(objects)
        noun = irregular_plurals.get(obj, obj + "s")
        question = f"How many {noun} are in the {rng.choice(locations)}?"
        return question, f"{rng.randint(0, 15)}", "count"

    def detection():
        question = f"What objects are visible in the {rng.choice(locations)}?"
        answer = ", ".join(rng.sample(objects, rng.randint(2, 5)))
        return question, answer, "detect"

    def activity():
        # Activity answers are short free text; "caption" is the closest
        # label in the existing task_type vocabulary.
        question = f"What is the person doing near the {rng.choice(locations)}?"
        return question, rng.choice(actions), "caption"

    def scene_description():
        # Draw location and time once so question and answer agree.
        location = rng.choice(locations)
        time_of_day = rng.choice(times)
        n_people = rng.randint(1, 8)
        n_vehicles = rng.randint(0, 5)
        question = f"Describe the scene in the {location} camera during {time_of_day}."
        answer = (
            f"The {location} shows {n_people} {'person' if n_people == 1 else 'people'}, "
            f"{n_vehicles} {'vehicle' if n_vehicles == 1 else 'vehicles'}. "
            f"Conditions: {time_of_day}. "
            f"Activity level: {'high' if rng.random() > 0.5 else 'low'}."
        )
        return question, answer, "caption"

    def anomaly():
        return ("Is there anything unusual in this camera feed?",
                rng.choice(anomaly_answers), "alert")

    def ocr():
        plate = (
            f"{''.join(rng.choices(letters, k=2))}"
            f"{rng.randint(10, 99)}"
            f"{''.join(rng.choices(letters, k=3))}"
        )
        answer = rng.choice([
            f"License plate: {plate}",
            "EXIT sign above the door",
            "RESTRICTED AREA - AUTHORIZED PERSONNEL ONLY",
            "PARKING LOT B - Level 2",
            "No text visible in current frame",
        ])
        return "What text is visible on signs or license plates?", answer, "ocr"

    templates = [counting, detection, activity, scene_description, anomaly, ocr]

    samples = []
    for _ in range(max_samples):
        question, answer, task_type = rng.choice(templates)()
        samples.append({
            "question": question,
            "answer": answer,
            "task_type": task_type,
        })

    os.makedirs(output_dir, exist_ok=True)
    jsonl_path = os.path.join(output_dir, "surveillance_vqa.jsonl")
    with open(jsonl_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(s) + "\n" for s in samples)

    print(f" Surveillance VQA: {len(samples)} samples saved")
    return len(samples)
def main(output_dir: str = "data/downloads/stage3"):
    """Build every Stage 3 dataset under ``output_dir`` and print a summary.

    Runs the five dataset helpers in order (COCO detection, VisDrone,
    ActivityNet captions, UCF Crime, surveillance VQA) with their fixed
    sample limits and sums the record counts they return.

    Args:
        output_dir: Directory receiving all JSONL files and the shared
            ``images/`` sub-directory. Defaults to the path the script has
            always used.
    """
    # makedirs creates the parent as well, so one call covers both directories.
    os.makedirs(os.path.join(output_dir, "images"), exist_ok=True)

    print("=" * 60)
    print("Stage 3 Domain Dataset Download")
    print("=" * 60)

    # (helper, sample limit) in execution order.
    downloads = (
        (download_coco_detection, 118000),
        (download_visdrone, 10000),
        (download_activitynet_captions, 100000),
        (download_ucf_crime, 1900),
        (download_surveillance_vqa, 50000),
    )
    total = 0
    for download, limit in downloads:
        total += download(output_dir, max_samples=limit)

    print(f"\n{'=' * 60}")
    print(f"Total Stage 3 samples: {total:,}")
    print(f"Data saved to: {output_dir}")
    print(f"{'=' * 60}")


if __name__ == "__main__":
    main()