Image-Text-to-Text
Transformers
English
vision-language-model
vlm
surveillance
iot
gemma
vl-jepa
multimodal
object-detection
video-analytics
Instructions to use hardiksa/arcisvlm with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use hardiksa/arcisvlm with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="hardiksa/arcisvlm")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("hardiksa/arcisvlm", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use hardiksa/arcisvlm with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "hardiksa/arcisvlm"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "hardiksa/arcisvlm",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/hardiksa/arcisvlm
- SGLang
How to use hardiksa/arcisvlm with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "hardiksa/arcisvlm" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "hardiksa/arcisvlm",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "hardiksa/arcisvlm" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "hardiksa/arcisvlm",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use hardiksa/arcisvlm with Docker Model Runner:
docker model run hf.co/hardiksa/arcisvlm
| #!/usr/bin/env python3 | |
| """ | |
| Profile ArcisVLM inference to find bottleneck kernels. | |
| Runs torch.profiler on N dummy inference passes and reports: | |
| - Top-10 CUDA kernels by GPU time % | |
| - Total inference time, tokens/sec, peak memory | |
| - Saves Chrome-compatible profiler trace to profiling/trace.json | |
| Usage: | |
| python3 scripts/profile_model.py --config configs/default.yaml --device cpu --num-samples 10 | |
| python3 scripts/profile_model.py --ckpt checkpoints/stage2_final.pt --config configs/scale_1.3b.yaml --device cuda | |
| """ | |
| import argparse | |
| import os | |
| import sys | |
| import time | |
| import torch | |
| import yaml | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from model.vlm import VLJEPAModel | |
| from model.tokenizer import BPETokenizer | |
def load_model_and_config(config_path: str, ckpt_path: str | None, device: str):
    """Build the model described by a YAML config and optionally load weights.

    Args:
        config_path: Path to the YAML file. Its parsed content is passed
            unchanged to ``VLJEPAModel``.
        ckpt_path: Optional checkpoint file. If the file holds a mapping with
            a ``"model_state_dict"`` entry, that entry is loaded; otherwise
            the loaded object itself is treated as the state dict.
        device: Device string, used both as ``map_location`` for the
            checkpoint and as the target of ``model.to``.

    Returns:
        A ``(model, config)`` tuple. The model has been moved to ``device``
        and switched to eval mode.

    Raises:
        ValueError: If the YAML file does not parse to a mapping (for
            example, when it is empty).
    """
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)
    if not isinstance(config, dict):
        raise ValueError(
            f"Config {config_path!r} must contain a YAML mapping, "
            f"got {type(config).__name__}"
        )

    model = VLJEPAModel(config)

    if ckpt_path and os.path.exists(ckpt_path):
        # NOTE(security): weights_only=False lets torch.load unpickle arbitrary
        # Python objects, so only point --ckpt at files from a trusted source.
        ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)
        state = ckpt.get("model_state_dict", ckpt)
        load_result = model.load_state_dict(state, strict=False)
        print(f"[profile] Loaded checkpoint: {ckpt_path}", file=sys.stderr)

        # strict=False tolerates key mismatches; surface them so a partially
        # loaded (partly random) model is never profiled unnoticed.
        missing = list(getattr(load_result, "missing_keys", []) or [])
        unexpected = list(getattr(load_result, "unexpected_keys", []) or [])
        if missing or unexpected:
            print(
                f"[profile] WARNING: non-strict load left {len(missing)} missing "
                f"and {len(unexpected)} unexpected keys",
                file=sys.stderr,
            )
    elif ckpt_path:
        # A checkpoint was requested but is not on disk: say so explicitly
        # instead of reporting that no checkpoint was given.
        print(
            f"[profile] WARNING: checkpoint not found: {ckpt_path} — profiling random init",
            file=sys.stderr,
        )
    else:
        print("[profile] No checkpoint — profiling random init", file=sys.stderr)

    model = model.to(device)
    model.eval()
    return model, config
def make_dummy_inputs(config: dict, device: str, batch_size: int = 1):
    """Build one batch of random image and query tensors for profiling.

    Reads ``config["vision"]["img_size"]`` and
    ``config["decoder"]["vocab_size"]``. The query length is the optional
    ``config["predictor"]["max_query_len"]`` (default 64), capped at 32 to
    keep profiling queries short.

    Returns:
        ``(images, query_ids, query_mask)`` where ``images`` is a float tensor
        of shape ``(batch_size, 3, img_size, img_size)``, ``query_ids`` holds
        integers in ``[1, vocab_size)`` and ``query_mask`` is an all-ones
        ``torch.long`` tensor of the same shape as ``query_ids``.
    """
    side = config["vision"]["img_size"]
    n_vocab = config["decoder"]["vocab_size"]
    predictor_cfg = config.get("predictor", {})
    seq_len = min(32, predictor_cfg.get("max_query_len", 64))
    token_shape = (batch_size, seq_len)

    # Draw the image tensor before the token ids so the RNG stream is consumed
    # in a fixed order.
    images = torch.randn((batch_size, 3, side, side), device=device)
    query_ids = torch.randint(low=1, high=n_vocab, size=token_shape, device=device)
    query_mask = torch.ones(token_shape, dtype=torch.long, device=device)
    return images, query_ids, query_mask
def warmup_model(model, config, device, n_warmup: int = 3):
    """Run ``n_warmup`` untimed generation passes ahead of profiling.

    A single dummy batch is built once and reused for every pass. Each pass
    calls ``model.generate`` under ``torch.no_grad()`` with 16 new tokens and
    temperature 0.8. On a CUDA device the function waits for queued work to
    finish before returning.
    """
    images, q_ids, q_mask = make_dummy_inputs(config, device)
    gen_kwargs = {"max_new_tokens": 16, "temperature": 0.8}

    with torch.no_grad():
        for _ in range(n_warmup):
            model.generate(images, q_ids, q_mask, **gen_kwargs)

    if device.startswith("cuda"):
        torch.cuda.synchronize()
def _event_time_us(evt, *attr_names):
    """Return the first timing attribute present on a profiler event, else 0."""
    for attr in attr_names:
        value = getattr(evt, attr, None)
        if value is not None:
            return value
    return 0


def profile_inference(model, config: dict, device: str, num_samples: int,
                      trace_path: str, max_new_tokens: int = 32):
    """Profile ``num_samples`` generation passes and summarise the hot kernels.

    Args:
        model: Object exposing ``generate(images, query_ids, query_mask,
            max_new_tokens=..., temperature=...)``.
        config: Parsed config, forwarded to ``make_dummy_inputs``.
        device: Device string; CUDA-specific steps run when it starts with
            ``"cuda"``.
        num_samples: Number of generation passes to profile.
        trace_path: Destination of the Chrome trace; its parent directory is
            created if needed.
        max_new_tokens: Forwarded to ``model.generate``.

    Returns:
        A dict with keys ``ranked_kernels``, ``wall_time_s``, ``total_tokens``,
        ``tokens_per_sec``, ``peak_memory_gb``, ``num_samples`` and
        ``trace_path``. Each ``ranked_kernels`` entry has ``name``, ``calls``,
        ``time_us``, ``pct``, ``cpu_time_us`` and ``cuda_time_us``. Entries are
        ordered by ``time_us`` descending, where ``time_us`` is the event's
        *self* time and ``pct`` is its share of the summed self time.
        ``cpu_time_us`` and ``cuda_time_us`` remain inclusive totals.
    """
    is_cuda = device.startswith("cuda")

    # Start from a clean slate so the peak-memory figure covers only this run.
    if is_cuda:
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    activities = [torch.profiler.ProfilerActivity.CPU]
    if is_cuda:
        activities.append(torch.profiler.ProfilerActivity.CUDA)

    os.makedirs(os.path.dirname(trace_path) or ".", exist_ok=True)

    total_tokens = 0
    with torch.profiler.profile(
        activities=activities,
        record_shapes=True,
        profile_memory=True,
        with_stack=False,
    ) as prof:
        # Time only the inference loop: profiler start-up/teardown and the
        # directory creation above are excluded. Per-op profiler overhead is
        # still included, so throughput is a lower bound.
        t_start = time.perf_counter()
        with torch.no_grad():
            for _ in range(num_samples):
                images, q_ids, q_mask = make_dummy_inputs(config, device)
                generated = model.generate(
                    images, q_ids, q_mask,
                    max_new_tokens=max_new_tokens,
                    temperature=0.8,
                )
                # NOTE(review): counts the returned sequence length per sample;
                # assumes batch size 1 and that generate() returns only the
                # tokens to be counted — confirm against the model.
                total_tokens += generated.shape[1]
        if is_cuda:
            # Wait for queued GPU work before stopping the clock.
            torch.cuda.synchronize()
        t_end = time.perf_counter()

    # Save Chrome trace
    prof.export_chrome_trace(trace_path)

    # Rank by *self* time. Inclusive totals count a parent op and every child
    # beneath it, so summing them double-counts and deflates every percentage.
    # Both the newer ``device`` and the older ``cuda`` attribute names are
    # tried, because a bare getattr(..., 0) would silently report zero.
    if is_cuda:
        self_attrs = ("self_device_time_total", "self_cuda_time_total")
    else:
        self_attrs = ("self_cpu_time_total",)
    device_total_attrs = ("device_time_total", "cuda_time_total")

    timed_events = [(_event_time_us(evt, *self_attrs), evt) for evt in prof.key_averages()]
    total_kernel_time = sum(self_time for self_time, _ in timed_events)
    timed_events.sort(key=lambda pair: pair[0], reverse=True)

    ranked = [
        {
            "name": evt.key,
            "calls": evt.count,
            "time_us": self_time,
            "pct": (self_time / total_kernel_time * 100) if total_kernel_time > 0 else 0.0,
            "cpu_time_us": evt.cpu_time_total,
            "cuda_time_us": _event_time_us(evt, *device_total_attrs),
        }
        for self_time, evt in timed_events
    ]

    wall_time = t_end - t_start
    tokens_per_sec = total_tokens / wall_time if wall_time > 0 else 0
    peak_mem_gb = (torch.cuda.max_memory_allocated() / 1e9) if is_cuda else 0.0

    return {
        "ranked_kernels": ranked,
        "wall_time_s": wall_time,
        "total_tokens": total_tokens,
        "tokens_per_sec": tokens_per_sec,
        "peak_memory_gb": peak_mem_gb,
        "num_samples": num_samples,
        "trace_path": trace_path,
    }
def print_report(results: dict):
    """Write the profiling summary and the ten top-ranked kernels to stdout.

    Args:
        results: Mapping with the keys ``num_samples``, ``wall_time_s``,
            ``total_tokens``, ``tokens_per_sec``, ``peak_memory_gb``,
            ``trace_path`` and ``ranked_kernels``. Each kernel entry supplies
            ``pct``, ``time_us``, ``calls`` and ``name``. The peak-memory line
            is printed only when ``peak_memory_gb`` is greater than zero.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    def report_lines():
        # Lines are produced lazily, one per print call, in display order.
        yield heavy_rule
        yield "ArcisVLM Inference Profiling Report"
        yield heavy_rule
        yield f" Samples: {results['num_samples']}"
        yield f" Wall time: {results['wall_time_s']:.2f}s"
        yield f" Total tokens: {results['total_tokens']}"
        yield f" Tokens/sec: {results['tokens_per_sec']:.1f}"
        if results["peak_memory_gb"] > 0:
            yield f" Peak GPU mem: {results['peak_memory_gb']:.2f} GB"
        yield f" Trace saved: {results['trace_path']}"
        yield ""
        yield "Top-10 Kernels by GPU/CPU Time:"
        yield light_rule
        yield f" {'Rank':>4} {'%':>6} {'Time(us)':>10} {'Calls':>6} {'Kernel'}"
        yield light_rule
        for rank, kernel in enumerate(results["ranked_kernels"][:10], start=1):
            yield (
                f" {rank:>4} {kernel['pct']:>5.1f}% {kernel['time_us']:>10.0f}"
                f" {kernel['calls']:>6} {kernel['name']}"
            )
        yield light_rule

    for line in report_lines():
        print(line)
def _build_arg_parser():
    """Create the command-line parser for the profiling script."""
    default_device = "cuda" if torch.cuda.is_available() else "cpu"
    parser = argparse.ArgumentParser(description="Profile ArcisVLM inference")
    parser.add_argument("--ckpt", type=str, default=None, help="Checkpoint path")
    parser.add_argument("--config", type=str, required=True, help="YAML config path")
    parser.add_argument("--device", type=str, default=default_device)
    parser.add_argument("--num-samples", type=int, default=100, help="Number of inference samples")
    parser.add_argument("--max-new-tokens", type=int, default=32, help="Tokens to generate per sample")
    parser.add_argument("--trace-dir", type=str, default="profiling", help="Directory for trace output")
    parser.add_argument("--warmup", type=int, default=3, help="Warmup iterations before profiling")
    return parser


def main():
    """Parse CLI arguments, then load, warm up and profile the model.

    Progress messages go to stderr; the final report is printed to stdout.
    The trace is written to ``trace.json`` inside ``--trace-dir``.
    """
    opts = _build_arg_parser().parse_args()
    trace_file = os.path.join(opts.trace_dir, "trace.json")

    model, config = load_model_and_config(opts.config, opts.ckpt, opts.device)

    # Warmup
    print(f"[profile] Warming up ({opts.warmup} iters)...", file=sys.stderr)
    warmup_model(model, config, opts.device, opts.warmup)

    # Profile
    print(f"[profile] Profiling {opts.num_samples} samples on {opts.device}...", file=sys.stderr)
    report = profile_inference(
        model,
        config,
        opts.device,
        opts.num_samples,
        trace_file,
        opts.max_new_tokens,
    )
    print_report(report)


if __name__ == "__main__":
    main()