Image-Text-to-Text
Transformers
English
vision-language-model
vlm
surveillance
iot
gemma
vl-jepa
multimodal
object-detection
video-analytics
Instructions to use hardiksa/arcisvlm with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use hardiksa/arcisvlm with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="hardiksa/arcisvlm")
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("hardiksa/arcisvlm", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use hardiksa/arcisvlm with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "hardiksa/arcisvlm"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "hardiksa/arcisvlm",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/hardiksa/arcisvlm
- SGLang
How to use hardiksa/arcisvlm with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "hardiksa/arcisvlm" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "hardiksa/arcisvlm",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "hardiksa/arcisvlm" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "hardiksa/arcisvlm",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use hardiksa/arcisvlm with Docker Model Runner:
docker model run hf.co/hardiksa/arcisvlm
| #!/usr/bin/env python3 | |
| """ | |
| Standalone inference test for ArcisVLM. | |
| Tests that the model can generate coherent text given an image + question. | |
| This is the FIRST thing to run after training to verify the model works. | |
| Usage: | |
| python3 scripts/test_inference.py --ckpt checkpoints/v4_stage3_final.pt --device cuda | |
| python3 scripts/test_inference.py --ckpt checkpoints/v4_stage3_final.pt --device cpu | |
| """ | |
| import argparse | |
| import os | |
| import sys | |
| import time | |
| import torch | |
| import yaml | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from model.vlm import VLJEPAModel | |
| from model.tokenizer_utils import load_tokenizer, validate_tokenizer_model_match | |
# Fixed set of prompts that main() sends to the model, one generation call
# per question, all against the same dummy image.
TEST_QUESTIONS = [
    "What do you see in this image?",
    "How many people are in the scene?",
    "Describe the objects in the image.",
    "Is there a car in this image?",
    "What color is the main object?",
    "What is happening in this scene?",
    "Are there any people?",
    "Count the vehicles.",
    "What text is visible?",
    "Describe the weather conditions.",
]
def _strip_ddp_prefix(state_dict):
    """Return a copy of *state_dict* with a leading ``module.`` dropped from each key.

    Only the prefix is removed. A blanket ``str.replace`` would also rewrite
    keys that merely contain ``module.`` somewhere in the middle, and those
    keys would then silently fail to match under ``strict=False``.
    """
    prefix = "module."
    return {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }


def _load_checkpoint(model, ckpt_path):
    """Load the weights stored at *ckpt_path* into *model* and print a report.

    Accepts either a training checkpoint (a mapping holding the weights under
    ``"model_state_dict"``) or a bare state dict. Both go through the same
    prefix clean-up and the same missing/unexpected key report, so a badly
    mismatched checkpoint is visible instead of silently leaving the model
    with its constructor weights.
    """
    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only point this script at checkpoints from a trusted source.
    # map_location="cpu": the model is still on the CPU here and is moved to
    # the target device afterwards, so loading straight onto the GPU would
    # only hold a second copy of the weights there.
    ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)

    is_training_ckpt = "model_state_dict" in ckpt
    state_dict = ckpt["model_state_dict"] if is_training_ckpt else ckpt

    missing, unexpected = model.load_state_dict(
        _strip_ddp_prefix(state_dict), strict=False
    )

    if is_training_ckpt:
        print(f" Loaded: {ckpt_path}")
        print(f" Epoch: {ckpt.get('epoch', '?')}, Loss: {ckpt.get('loss', '?')}")
    else:
        print(f" Loaded raw state dict: {ckpt_path}")
    if missing:
        print(f" Missing keys: {len(missing)} (e.g. {missing[:3]})")
    if unexpected:
        print(f" Unexpected keys: {len(unexpected)} (e.g. {unexpected[:3]})")


def main():
    """Run the ArcisVLM inference smoke test from the command line.

    Parses ``--ckpt``, ``--config``, ``--device``, ``--max-tokens`` and
    ``--temperature``, loads the YAML config, the tokenizer and the model,
    then asks every entry of ``TEST_QUESTIONS`` against a single random-noise
    image. Each decoded answer is printed with its generation time, followed
    by timing totals and a short troubleshooting guide.

    A missing checkpoint file is not fatal: a warning is printed and the
    model runs with the weights it was constructed with. A failing query is
    reported and skipped; the remaining questions still run.

    Raises:
        SystemExit: via ``argparse`` on bad command-line arguments, or when
            the config file is empty or not a YAML mapping.
    """
    parser = argparse.ArgumentParser(description="ArcisVLM Inference Test")
    parser.add_argument("--ckpt", required=True, help="Checkpoint path")
    parser.add_argument("--config", default="configs/scale_1.3b.yaml")
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
    parser.add_argument("--max-tokens", type=int, default=64)
    parser.add_argument("--temperature", type=float, default=0.7)
    args = parser.parse_args()

    print("=" * 70)
    print("ArcisVLM Inference Test")
    print("=" * 70)

    # Load config
    with open(args.config, encoding="utf-8") as f:
        config = yaml.safe_load(f)
    # safe_load returns None for an empty file and may return a list or
    # scalar; fail with a clear message instead of an AttributeError below.
    if not isinstance(config, dict):
        parser.error(f"Config {args.config} is empty or not a YAML mapping")
    img_size = config.get("vision", {}).get("img_size", 448)

    # Load tokenizer FIRST
    print("\n--- Tokenizer ---")
    ckpt_dir = os.path.dirname(args.ckpt)
    tokenizer = load_tokenizer(config, checkpoint_dir=ckpt_dir)

    # Load model
    print("\n--- Model ---")
    model = VLJEPAModel(config)
    if os.path.exists(args.ckpt):
        _load_checkpoint(model, args.ckpt)
    else:
        print(f" [WARN] Checkpoint not found: {args.ckpt}")
        print(f" Running with random weights (sanity check only)")
    model = model.to(args.device)
    model.eval()
    params = sum(p.numel() for p in model.parameters())
    print(f" Parameters: {params:,} ({params/1e9:.2f}B)")

    # Validate tokenizer-model match
    print("\n--- Validation ---")
    match = validate_tokenizer_model_match(tokenizer, model)
    if not match:
        print(" [FATAL] Tokenizer-model mismatch! Results will be garbage.")
        print(" Fix: download correct tokenizer from HuggingFace")

    # Run inference on test questions
    print(f"\n--- Inference ({len(TEST_QUESTIONS)} questions) ---")
    print(f" Device: {args.device}")
    print(f" Max tokens: {args.max_tokens}")
    print(f" Temperature: {args.temperature}")
    print()

    # Create a dummy image (random noise — not ideal but tests generation)
    dummy_image = torch.randn(1, 3, img_size, img_size).to(args.device)

    total_time = 0.0
    completed = 0  # queries whose generate() call returned without raising
    for idx, question in enumerate(TEST_QUESTIONS, start=1):
        print(f" Q{idx}: {question}")

        # Tokenize question
        q_ids = tokenizer.encode(question)
        q_tensor = torch.tensor([q_ids], dtype=torch.long, device=args.device)

        # Generate
        start = time.perf_counter()
        with torch.no_grad():
            try:
                output_ids = model.generate(
                    dummy_image, q_tensor,
                    max_new_tokens=args.max_tokens,
                    temperature=args.temperature,
                )
                elapsed = time.perf_counter() - start
                total_time += elapsed
                completed += 1

                # Decode
                if output_ids is not None and output_ids.numel() > 0:
                    pred_text = tokenizer.decode(output_ids[0].cpu().tolist())
                    # Clean up
                    pred_text = pred_text.replace("<pad>", "").replace("<eos>", "").strip()
                    print(f" A{idx}: {pred_text[:200]}")
                    print(f" [{elapsed:.2f}s, {output_ids.shape[-1]} tokens]")
                else:
                    print(f" A{idx}: [EMPTY OUTPUT]")
                    print(f" [{elapsed:.2f}s]")
            except Exception as e:
                # Best-effort by design: report the failure and keep going so
                # one bad query does not hide the results of the others.
                elapsed = time.perf_counter() - start
                print(f" A{idx}: [ERROR] {e}")
                print(f" [{elapsed:.2f}s]")
        print()

    print("=" * 70)
    print(f"Total inference time: {total_time:.2f}s")
    # Average over the queries that were actually timed; dividing by the full
    # question count would understate the per-query time after any failure.
    if completed:
        print(f"Average per query: {total_time / completed:.2f}s")
    else:
        print("Average per query: n/a (no query completed)")
    failed = len(TEST_QUESTIONS) - completed
    if failed:
        print(f"Failed queries: {failed}/{len(TEST_QUESTIONS)}")
    print("=" * 70)

    # Summary
    print("\n--- Diagnosis ---")
    print("If ALL answers are empty or garbage (random tokens):")
    print(" → Tokenizer mismatch between training and inference")
    print(" → Check: tokenizer vocab == model decoder vocab")
    print()
    print("If answers are repetitive (same word repeated):")
    print(" → Model collapsed during training (mode collapse)")
    print(" → Check: training loss was actually decreasing")
    print()
    print("If answers are coherent but wrong:")
    print(" → Model needs more/better training data")
    print(" → Architecture is working, just needs scale")
# Run the inference test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()