| """ |
| dispatchAI core module — model loading, inference, and utilities. |
| """ |
|
|
| import json |
| import os |
| from typing import Optional, List, Dict, Any |
|
|
|
|
| |
|
|
# Hugging Face organisation under which every model repository lives;
# repo ids are built as f"{_ORG}/{model name}".
_ORG = "dispatchAI"


# Column order of the rows in the registry table below.
_MODEL_FIELDS = ("name", "params", "size_mb", "ram_mb", "task", "quant", "speed_tps")

# Registry of published models: one dict per model, keyed by _MODEL_FIELDS.
# "params" is a label such as "135M" / "1.5B", sizes are in MB, and
# "speed_tps" is reported to users as tokens per second ("t/s").
_MODELS = [
    dict(zip(_MODEL_FIELDS, row))
    for row in (
        ("SmolLM2-135M-Instruct-mobile", "135M", 270, 400, "chat", "FP16", 25.5),
        ("SmolLM2-360M-Instruct-mobile", "360M", 720, 700, "chat", "FP16", 21.0),
        ("Qwen2.5-0.5B-Instruct-mobile-int4", "500M", 350, 550, "chat", "INT4", 20.0),
        ("Qwen2.5-0.5B-Coder-mobile", "500M", 1000, 1500, "code", "FP16", 20.0),
        ("Llama-3.2-1B-Instruct-mobile", "1B", 2500, 3000, "chat", "FP16", 12.0),
        ("Llama-3.2-1B-Instruct-Q4-mobile", "1B", 700, 1100, "chat", "Q4", 18.2),
        ("Llama-3.2-1B-Instruct-Q6-mobile", "1B", 1100, 1300, "chat", "Q6", 16.8),
        ("Llama-3.2-1B-FunctionCall-mobile", "1B", 2500, 3000, "function_call", "FP16", 12.0),
        ("TinyLlama-1.1B-Chat-Q5-mobile", "1.1B", 800, 1200, "chat", "Q5", 17.5),
        ("MiniCPM5-1B-mobile", "1B", 2500, 3000, "chat", "FP16", 12.0),
        ("Qwen2.5-Coder-1.5B-mobile", "1.5B", 3000, 4000, "code", "FP16", 10.5),
        ("Qwen2.5-Math-1.5B-mobile", "1.5B", 3000, 4000, "math", "FP16", 10.5),
        ("Qwen2.5-1.5B-Instruct-Q5-mobile", "1.5B", 1100, 1700, "chat", "Q5", 14.5),
        ("Qwen2.5-1.5B-Instruct-Q8-mobile", "1.5B", 1600, 2200, "chat", "Q8", 13.0),
        ("Gemma-2-2B-IT-Q5-mobile", "2B", 1500, 2200, "chat", "Q5", 12.0),
        ("Gemma-2B-Arabic-mobile", "2B", 5000, 5500, "arabic", "FP16", 8.0),
        ("Llama-3.2-3B-Instruct-Q5-mobile", "3B", 2100, 2700, "chat", "Q5", 8.5),
        ("Llama-3.2-3B-FunctionCall-mobile", "3B", 6000, 7000, "function_call", "FP16", 5.5),
        ("Phi-3.5-mini-instruct-Q5-mobile", "3.8B", 2800, 3200, "chat", "Q5", 7.5),
        ("Moondream2-Vision-Q5-mobile", "1.9B", 1400, 2000, "vision", "Q5", 8.5),
        ("EmbeddingGemma-300M-Q8-mobile", "300M", 300, 500, "embedding", "Q8", 22.0),
        ("Qwen3-Embedding-0.6B-Q8-mobile", "600M", 600, 800, "embedding", "Q8", 18.0),
    )
]


# Baseline throughput table: params label -> quantization -> tokens per second.
# estimate_latency() scales these numbers by a per-hardware multiplier, where
# "snapdragon_865" is the 1.0 baseline.
# NOTE(review): "1.9B"/"Q5" (8.5) is lower than "1.9B"/"Q5_K_M" (12.5), unlike
# the other rows where the two agree or are close — confirm this is intended.
_LATENCY_DB = {
    "135M": {"FP16": 25.5, "Q8_0": 28.2, "Q5_K_M": 30.1, "Q4_K_M": 32.0, "Q2_K": 35.0},
    "300M": {"FP16": 22.0, "Q8_0": 24.5, "Q5_K_M": 26.0, "Q4_K_M": 27.5, "Q2_K": 29.5},
    "500M": {"FP16": 20.0, "Q8_0": 24.0, "Q5_K_M": 25.5, "Q4_K_M": 26.8, "INT4": 20.0},
    "600M": {"FP16": 18.0, "Q8_0": 21.0, "Q5_K_M": 22.5, "Q4_K_M": 23.8},
    "1B": {
        "FP16": 12.0, "Q8_0": 15.5, "Q5_K_M": 17.5, "Q4_K_M": 18.2,
        "Q5": 17.5, "Q4": 18.2, "Q6": 16.8,
    },
    "1.1B": {"FP16": 11.5, "Q8_0": 14.8, "Q5_K_M": 17.0, "Q5": 17.5},
    "1.5B": {"FP16": 10.5, "Q8_0": 13.0, "Q5_K_M": 14.5, "Q5": 14.5, "Q8": 13.0},
    "1.9B": {"FP16": 8.5, "Q8_0": 11.0, "Q5_K_M": 12.5, "Q5": 8.5},
    "2B": {"FP16": 8.0, "Q8_0": 10.5, "Q5_K_M": 12.0, "Q5": 12.0},
    "3B": {"FP16": 5.5, "Q8_0": 7.0, "Q5_K_M": 8.5, "Q5": 8.5},
    "3.8B": {"FP16": 4.5, "Q8_0": 6.0, "Q5_K_M": 7.5, "Q5": 7.5},
    "7B": {"FP16": 2.5, "Q8_0": 3.5, "Q5_K_M": 4.5},
}
|
|
|
|
| |
|
|
class DispatchModel:
    """A dispatchAI model that is loaded lazily and then used for inference.

    Nothing is downloaded or imported at construction time; the weights are
    fetched on the first call to :meth:`chat` or :meth:`generate`.

    Args:
        model_name: Model name without the organisation prefix.
        repo_id: Full Hugging Face repository id ("org/name").
        backend: "transformers" (default) or "gguf" (llama.cpp).
        token: Optional Hugging Face access token, forwarded to the download
            calls for private/gated repositories.

    Example:
        from dispatchai import load_model
        model = load_model("SmolLM2-135M-Instruct-mobile")
        print(model.chat("Hello!"))
    """

    # Backends that _load() knows how to initialise.
    _BACKENDS = ("transformers", "gguf")

    def __init__(self, model_name: str, repo_id: str, backend: str = "transformers",
                 token: Optional[str] = None):
        self.model_name = model_name
        self.repo_id = repo_id
        self.backend = backend
        self.token = token
        self._model = None
        self._tokenizer = None
        self._loaded = False

    def _detect_chat_format(self, model_name: str) -> Optional[str]:
        """Pick the llama.cpp chat format name from the model name.

        Only Llama-3 and Gemma models get a dedicated format; everything else
        (Qwen, Phi, TinyLlama, MiniCPM, Moondream, SmolLM, unknown names) is
        treated as ChatML.
        """
        lower = model_name.lower()
        # SmolLM instruct models use a ChatML template, not the Llama-3 header
        # format, so they intentionally fall through to the default below.
        if "llama-3" in lower:
            return "llama-3"
        if "gemma" in lower:
            return "gemma"
        return "chatml"

    @staticmethod
    def _build_messages(message: str, system: str = "") -> List[Dict[str, str]]:
        """Build the role/content message list shared by both backends."""
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": message})
        return messages

    def _auth_kwargs(self) -> Dict[str, str]:
        """Return download kwargs carrying the access token, if one was given."""
        # Only pass the keyword when a token exists so that library versions
        # without a ``token`` parameter keep working for public models.
        return {"token": self.token} if self.token else {}

    def _load_transformers(self) -> None:
        """Load tokenizer and model through Hugging Face transformers."""
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM
            import torch
        except ImportError as exc:
            raise ImportError(
                "transformers backend requires: pip install dispatchai[torch]\n"
                "Or use GGUF backend: load_model(..., backend='gguf')"
            ) from exc

        auth = self._auth_kwargs()
        use_cuda = torch.cuda.is_available()
        self._tokenizer = AutoTokenizer.from_pretrained(self.repo_id, **auth)
        self._model = AutoModelForCausalLM.from_pretrained(
            self.repo_id,
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="auto" if use_cuda else None,
            **auth,
        )

    def _load_gguf(self) -> None:
        """Download ``model.gguf`` from the repo and open it with llama.cpp."""
        try:
            from llama_cpp import Llama
            from huggingface_hub import hf_hub_download
        except ImportError as exc:
            raise ImportError(
                "GGUF backend requires: pip install dispatchai[gguf]"
            ) from exc

        gguf_path = hf_hub_download(self.repo_id, "model.gguf", **self._auth_kwargs())

        chat_format = self._detect_chat_format(self.model_name)
        kwargs = dict(model_path=gguf_path, n_ctx=512, n_threads=4, verbose=False)
        if chat_format:
            kwargs["chat_format"] = chat_format
        self._model = Llama(**kwargs)

    def _load(self):
        """Lazily load the model on first use.

        Raises:
            ValueError: If ``backend`` is not one of the supported backends.
            ImportError: If the packages required by the backend are missing.
        """
        # Validate before the early return so that an unsupported backend is
        # reported instead of silently producing empty responses.
        if self.backend not in self._BACKENDS:
            raise ValueError(
                f"Unknown backend {self.backend!r}; expected one of {list(self._BACKENDS)}"
            )
        if self._loaded:
            return

        if self.backend == "transformers":
            self._load_transformers()
        else:
            self._load_gguf()

        self._loaded = True

    def _generate_transformers(self, text: str, max_tokens: int, temperature: float,
                               add_special_tokens: bool = True) -> str:
        """Run ``generate`` on *text* and return only the newly generated text."""
        import torch

        inputs = self._tokenizer(
            text, return_tensors="pt", add_special_tokens=add_special_tokens
        )
        # Follow the model's own device rather than assuming CUDA placement.
        device = self._model.device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        gen_kwargs = dict(
            max_new_tokens=max_tokens,
            pad_token_id=self._tokenizer.eos_token_id,
        )
        if temperature > 0:
            gen_kwargs.update(do_sample=True, temperature=temperature)
        else:
            # Greedy decoding: do not forward a temperature that would be ignored.
            gen_kwargs["do_sample"] = False

        with torch.no_grad():
            outputs = self._model.generate(**inputs, **gen_kwargs)

        # Slice by token count: decoding the whole sequence and cutting
        # len(text) characters is wrong whenever the decoded prompt does not
        # round-trip to exactly the same string.
        prompt_len = inputs["input_ids"].shape[1]
        return self._tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()

    def chat(self, message: str, system: str = "", max_tokens: int = 256, temperature: float = 0.7) -> str:
        """Send a chat message and get a response.

        Args:
            message: User message
            system: Optional system prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature; with the transformers backend a
                value of 0 (or less) selects greedy decoding

        Returns:
            Model response text, stripped of surrounding whitespace

        Raises:
            ValueError: If the backend is not supported.
            ImportError: If the backend's dependencies are not installed.
        """
        self._load()
        messages = self._build_messages(message, system)

        if self.backend == "transformers":
            prompt_text = self._tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            # The template already contains the special tokens; adding them
            # again while tokenizing would duplicate e.g. the BOS token.
            return self._generate_transformers(
                prompt_text, max_tokens, temperature, add_special_tokens=False
            )

        # _load() guarantees the only other backend is "gguf".
        response = self._model.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response["choices"][0]["message"]["content"].strip()

    def generate(self, prompt: str, max_tokens: int = 256, temperature: float = 0.7) -> str:
        """Generate text from a raw prompt (no chat template).

        Args:
            prompt: Raw text prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature; with the transformers backend a
                value of 0 (or less) selects greedy decoding

        Returns:
            Generated continuation only (the prompt is not echoed), stripped

        Raises:
            ValueError: If the backend is not supported.
            ImportError: If the backend's dependencies are not installed.
        """
        self._load()

        if self.backend == "transformers":
            return self._generate_transformers(prompt, max_tokens, temperature)

        # _load() guarantees the only other backend is "gguf".
        response = self._model(prompt, max_tokens=max_tokens, temperature=temperature, echo=False)
        return response["choices"][0]["text"].strip()

    def __repr__(self):
        # The access token is deliberately left out of the representation.
        return f"DispatchModel(name={self.model_name!r}, repo={self.repo_id!r}, backend={self.backend!r}, loaded={self._loaded})"


def load_model(model_name: str, backend: str = "transformers", token: Optional[str] = None) -> DispatchModel:
    """Create a lazily-loaded dispatchAI mobile model.

    No download happens here; the model is fetched on the first
    ``.chat()`` / ``.generate()`` call.

    Args:
        model_name: Model name, with or without the organisation prefix
            (e.g., "SmolLM2-135M-Instruct-mobile")
        backend: "transformers" (default) or "gguf" for llama.cpp
        token: Optional HuggingFace token for private/gated models

    Returns:
        DispatchModel ready for .chat() or .generate()

    Example:
        >>> from dispatchai import load_model
        >>> model = load_model("SmolLM2-135M-Instruct-mobile")
        >>> print(model.chat("What is 2+2?"))

        For GGUF/llama.cpp:
        >>> model = load_model("Llama-3.2-1B-Instruct-Q4-mobile", backend="gguf")
    """
    prefix = f"{_ORG}/"
    if model_name.startswith(prefix):
        repo_id = model_name
        # Strip only the leading prefix, not every occurrence in the name.
        model_name = model_name[len(prefix):]
    else:
        repo_id = f"{prefix}{model_name}"

    return DispatchModel(model_name, repo_id, backend=backend, token=token)
|
|
|
|
| |
|
|
def list_models(task: Optional[str] = None) -> List[Dict[str, Any]]:
    """List all available dispatchAI mobile models, smallest download first.

    Args:
        task: Optional filter — "chat", "code", "math", "arabic", "function_call",
            "vision", "embedding". Matching is case-insensitive and "-" is
            treated as "_" (so "function-call" works). A falsy value (None or
            "") disables filtering.

    Returns:
        List of model dicts with name, params, size_mb, ram_mb, task, quant and
        speed_tps, sorted by ascending size_mb. The dicts are copies, so
        callers may modify them without affecting the registry.

    Example:
        >>> from dispatchai import list_models
        >>> for m in list_models("chat"):
        ...     print(f"{m['name']}: {m['size_mb']}MB, {m['speed_tps']} t/s")
    """
    # Normalise the filter once instead of once per registry entry.
    wanted = task.strip().lower().replace("-", "_") if task else None

    # Copy each entry: handing out the registry's own dicts would let a caller
    # mutate the shared module-level data.
    selected = [
        dict(entry)
        for entry in _MODELS
        if wanted is None or entry["task"] == wanted
    ]
    selected.sort(key=lambda entry: entry["size_mb"])
    return selected
|
|
|
|
def _params_to_count(label: str) -> float:
    """Convert a parameter label such as "135M" or "1.5B" into a number."""
    units = {"K": 1e3, "M": 1e6, "B": 1e9}
    text = str(label).strip().upper()
    try:
        if text and text[-1] in units:
            return float(text[:-1]) * units[text[-1]]
        return float(text)
    except ValueError:
        # An unparsable label sorts last under "quality" instead of crashing.
        return 0.0


def recommend(ram_mb: int = 2048, task: str = "chat", priority: str = "size") -> Dict[str, Any]:
    """Get a model recommendation for your device.

    Args:
        ram_mb: Available RAM in MB (e.g., 2048 for 2GB phone)
        task: Primary task — "chat", "code", "math", "arabic", "function_call"
            (or "function-call"), "vision", "embedding". "any" — and any
            unrecognised value — applies no task filter.
        priority: "size" (smallest download), "speed" (highest speed_tps), or
            "quality" (most parameters). Any other value keeps registry order.

    Returns:
        On success, a dict with "recommended" (name, repo_id, url, params,
        size_mb, ram_mb, quant, speed_tps) and "alternatives" (up to three
        next-best models with name, size_mb, speed_tps). If no model fits,
        a dict with a single "error" message.

    Example:
        >>> from dispatchai import recommend
        >>> rec = recommend(ram_mb=2048, task="chat")
        >>> print(f"Best: {rec['recommended']['name']} ({rec['recommended']['size_mb']}MB)")
    """
    # A fresh list, so the in-place sorts below never reorder the registry.
    candidates = [m for m in _MODELS if m["ram_mb"] <= ram_mb]

    task_map = {
        "chat": "chat", "code": "code", "math": "math",
        "arabic": "arabic", "function_call": "function_call",
        "function-call": "function_call", "vision": "vision",
        "embedding": "embedding", "any": None,
    }
    task_key = task_map.get(task.lower())
    if task_key:
        candidates = [m for m in candidates if m["task"] == task_key]

    if not candidates:
        return {"error": f"No models fit in {ram_mb}MB RAM for task '{task}'"}

    if priority == "size":
        candidates.sort(key=lambda m: m["size_mb"])
    elif priority == "speed":
        candidates.sort(key=lambda m: m["speed_tps"], reverse=True)
    elif priority == "quality":
        # Compare parameter counts numerically: sorting the raw labels as
        # strings would rank "500M" above "3.8B" and "1B".
        candidates.sort(key=lambda m: _params_to_count(m["params"]), reverse=True)

    best = candidates[0]
    return {
        "recommended": {
            "name": best["name"],
            "repo_id": f"{_ORG}/{best['name']}",
            "url": f"https://huggingface.co/{_ORG}/{best['name']}",
            "params": best["params"],
            "size_mb": best["size_mb"],
            "ram_mb": best["ram_mb"],
            "quant": best["quant"],
            "speed_tps": best["speed_tps"],
        },
        "alternatives": [
            {"name": m["name"], "size_mb": m["size_mb"], "speed_tps": m["speed_tps"]}
            for m in candidates[1:4]
        ],
    }
|
|
|
|
def estimate_latency(params: str, quant: str = "Q4_K_M", hardware: str = "snapdragon_865") -> Dict[str, Any]:
    """Estimate on-device inference latency.

    Args:
        params: Parameter count — "135M", "500M", "1B", "1.5B", "3B", etc.
            (case-insensitive).
        quant: Quantization — "FP16", "Q4_K_M", "Q5_K_M", "Q8_0", "INT4"
            (case-insensitive). If the table has no entry for this
            params/quant pair, a generic 10.0 tokens/sec baseline is used and
            "quant_known" is False in the result.
        hardware: Target hardware — "snapdragon_865", "snapdragon_8_gen_2",
            "apple_a17", etc. (case-insensitive). Unrecognised hardware uses a
            1.0 multiplier and "hardware_known" is False in the result.

    Returns:
        Dict with params, quant, hardware (as passed in), tokens_per_sec,
        latency_ms_per_token, suitable_for_realtime, suitable_for_phone,
        quant_known and hardware_known. For an unknown params value, a dict
        with a single "error" message instead.

    Example:
        >>> from dispatchai import estimate_latency
        >>> lat = estimate_latency("1B", "Q4_K_M")
        >>> print(f"{lat['tokens_per_sec']} t/s, {lat['latency_ms_per_token']}ms/token")
    """
    # Throughput relative to the snapdragon_865 baseline of the latency table.
    hw_multipliers = {
        "snapdragon_865": 1.0,
        "snapdragon_8_gen_2": 1.8,
        "snapdragon_8_gen_3": 2.2,
        "apple_a17": 2.5,
        "apple_m2": 3.0,
        "snapdragon_778g": 0.7,
    }

    params_key = params.strip().upper()
    quant_key = quant.strip().upper()
    # Hardware ids are lower-case, so "Apple_A17" must not silently fall back.
    hardware_key = hardware.strip().lower()

    if params_key not in _LATENCY_DB:
        return {"error": f"Unknown params: {params}. Valid: {list(_LATENCY_DB.keys())}"}

    by_quant = _LATENCY_DB[params_key]
    quant_known = quant_key in by_quant
    hardware_known = hardware_key in hw_multipliers

    # Fallbacks are kept for compatibility, but flagged in the result so a
    # caller can tell a table lookup from a generic guess.
    base_tps = by_quant.get(quant_key, 10.0)
    hw_mult = hw_multipliers.get(hardware_key, 1.0)
    actual_tps = base_tps * hw_mult

    return {
        "params": params,
        "quant": quant,
        "hardware": hardware,
        "tokens_per_sec": round(actual_tps, 1),
        "latency_ms_per_token": round(1000 / actual_tps, 0),
        "suitable_for_realtime": actual_tps > 10,
        "suitable_for_phone": actual_tps > 2,
        "quant_known": quant_known,
        "hardware_known": hardware_known,
    }
|
|
|
|
def calculate_cost(daily_queries: int, cloud_cost_per_1k: float = 0.50, days: int = 365,
                   device_cost: float = 0.50) -> Dict[str, float]:
    """Compare cloud API vs on-device inference costs.

    Args:
        daily_queries: Number of AI queries per day
        cloud_cost_per_1k: Cloud API cost per 1000 queries
        days: Time period in days
        device_cost: Total on-device cost for the whole period. Defaults to
            0.50, the flat figure this function has always used.

    Returns:
        Dict with cloud_cost, device_cost, savings, savings_pct and
        daily_cloud_cost. Monetary values are rounded to 2 decimals and
        savings_pct to 1 decimal; savings_pct is 0 when the cloud cost is
        not positive.

    Example:
        >>> from dispatchai import calculate_cost
        >>> result = calculate_cost(daily_queries=10000, cloud_cost_per_1k=0.50)
        >>> print(f"Save ${result['savings']:.0f}/year with on-device")
    """
    daily_cloud = (daily_queries / 1000) * cloud_cost_per_1k
    cloud_total = daily_cloud * days
    device_total = device_cost

    # Guard the division: with no cloud spend there is nothing to save a
    # percentage of.
    if cloud_total > 0:
        savings_pct = round((1 - device_total / cloud_total) * 100, 1)
    else:
        savings_pct = 0

    return {
        "cloud_cost": round(cloud_total, 2),
        "device_cost": round(device_total, 2),
        "savings": round(cloud_total - device_total, 2),
        "savings_pct": savings_pct,
        "daily_cloud_cost": round(daily_cloud, 2),
    }
|
|