3morixd commited on
Commit
640ecdb
Β·
verified Β·
1 Parent(s): e252a93

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dispatchAI SDK
2
+
3
+ **Small. Mobile. Free. UAE-built.**
4
+
5
+ `pip install dispatchai` — Run mobile-optimized LLMs on your phone, edge device, or laptop. 39 models, all tested on real Snapdragon hardware, all free.
6
+
7
+ ## Quick Start
8
+
9
+ ```bash
10
+ pip install dispatchai
11
+ ```
12
+
13
+ ### Chat with a model
14
+
15
+ ```python
16
+ from dispatchai import load_model
17
+
18
+ model = load_model("SmolLM2-135M-Instruct-mobile")
19
+ response = model.chat("What is the capital of France?")
20
+ print(response)
21
+ ```
22
+
23
+ ### Use GGUF/llama.cpp backend
24
+
25
+ ```python
26
+ model = load_model("Llama-3.2-1B-Instruct-Q4-mobile", backend="gguf")
27
+ print(model.chat("Write a haiku about the desert."))
28
+ ```
29
+
30
+ ### Find the best model for your phone
31
+
32
+ ```python
33
+ from dispatchai import recommend
34
+
35
+ rec = recommend(ram_mb=2048, task="chat")
36
+ print(f"Best model: {rec['recommended']['name']}")
37
+ print(f"Size: {rec['recommended']['size_mb']}MB")
38
+ print(f"Speed: {rec['recommended']['speed_tps']} tokens/sec")
39
+ ```
40
+
41
+ ### List all models
42
+
43
+ ```python
44
+ from dispatchai import list_models
45
+
46
+ for m in list_models(task="chat"):
47
+ print(f" {m['name']}: {m['size_mb']}MB, {m['speed_tps']} t/s")
48
+ ```
49
+
50
+ ### Estimate latency
51
+
52
+ ```python
53
+ from dispatchai import estimate_latency
54
+
55
+ lat = estimate_latency("1B", "Q4_K_M")
56
+ print(f"{lat['tokens_per_sec']} tokens/sec on Snapdragon 865")
57
+ ```
58
+
59
+ ### Calculate cost savings
60
+
61
+ ```python
62
+ from dispatchai import calculate_cost
63
+
64
+ result = calculate_cost(daily_queries=10000, cloud_cost_per_1k=0.50)
65
+ print(f"Annual savings: ${result['savings']}")
66
+ ```
67
+
68
+ ## Installation Options
69
+
70
+ ```bash
71
+ pip install dispatchai # Core (model catalog, recommendations)
72
+ pip install dispatchai[torch] # + transformers/torch backend
73
+ pip install dispatchai[gguf] # + llama.cpp GGUF backend
74
+ pip install dispatchai[full] # + everything (torch, gguf, sentence-transformers)
75
+ ```
76
+
77
+ ## Available Models
78
+
79
+ | Model | Params | Size | Speed | Task |
80
+ |-------|--------|------|-------|------|
81
+ | SmolLM2-135M-Instruct-mobile | 135M | 270MB | 25.5 t/s | Chat |
82
+ | SmolLM2-360M-Instruct-mobile | 360M | 720MB | 21.0 t/s | Chat |
83
+ | Qwen2.5-0.5B-Instruct-mobile-int4 | 500M | 350MB | 20.0 t/s | Chat |
84
+ | Llama-3.2-1B-Instruct-Q4-mobile | 1B | 700MB | 18.2 t/s | Chat |
85
+ | Llama-3.2-1B-FunctionCall-mobile | 1B | 2.5GB | 12.0 t/s | Function Call |
86
+ | Qwen2.5-Coder-1.5B-mobile | 1.5B | 3.0GB | 10.5 t/s | Code |
87
+ | Gemma-2B-Arabic-mobile | 2B | 5.0GB | 8.0 t/s | Arabic |
88
+ | Llama-3.2-3B-Instruct-Q5-mobile | 3B | 2.1GB | 8.5 t/s | Chat |
89
+
90
+ [Browse all 39 models →](https://huggingface.co/dispatchAI)
91
+
92
+ ## Hardware Targets
93
+
94
+ All benchmarks measured on **Snapdragon 865 (Samsung S20 FE, 8GB RAM)** using llama.cpp.
95
+
96
+ The `estimate_latency()` function supports:
97
+ - Snapdragon 865 (baseline)
98
+ - Snapdragon 8 Gen 2 (1.8x)
99
+ - Snapdragon 8 Gen 3 (2.2x)
100
+ - Apple A17 Pro (2.5x)
101
+ - Apple M2 (3.0x)
102
+ - Snapdragon 778G mid-range (0.7x)
103
+
104
+ ## The Thesis
105
+
106
+ > *The best model is the one that runs.*
107
+
108
+ We're building the AI layer for a billion phones that can't afford cloud inference. Every model is free, open-source, and tested on real hardware.
109
+
110
+ ## About
111
+
112
+ Dispatch AI (FZE) — Sharjah Free Zone, UAE. License No. 10818.
113
+
114
+ 🌐 [dispatchai.ai](https://www.dispatchai.ai) | 🤗 [huggingface.co/dispatchAI](https://huggingface.co/dispatchAI) | 𝕏 [@DispatchAIdev](https://twitter.com/DispatchAIdev)
115
+
116
+ *I think, therefore I ship.*
pyproject.toml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dispatchAI Python SDK
2
+ # pip install dispatchai
3
+ #
4
+ # Two-line inference with any dispatchAI mobile model:
5
+ # from dispatchai import load_model
6
+ # model = load_model("SmolLM2-135M-Instruct-mobile")
7
+
8
+ [build-system]
9
+ requires = ["setuptools>=64", "wheel"]
10
+ build-backend = "setuptools.build_meta"
11
+
12
+ [project]
13
+ name = "dispatchai"
14
+ version = "0.1.0"
15
+ description = "dispatchAI — Mobile-optimized LLMs that run on your phone. Small. Mobile. Free. UAE-built."
16
+ readme = "README.md"
17
+ license = {text = "Apache-2.0"}
18
+ requires-python = ">=3.8"
19
+ authors = [
20
+ {name = "Dispatch AI (FZE)", email = "contact@dispatchai.ai"}
21
+ ]
22
+ keywords = [
23
+ "mobile", "llm", "on-device", "edge", "quantized", "gguf",
24
+ "huggingface", "arabic", "small-models", "dispatchai"
25
+ ]
26
+ classifiers = [
27
+ "Development Status :: 4 - Beta",
28
+ "Intended Audience :: Developers",
29
+ "License :: OSI Approved :: Apache Software License",
30
+ "Programming Language :: Python :: 3",
31
+ "Programming Language :: Python :: 3.8",
32
+ "Programming Language :: Python :: 3.9",
33
+ "Programming Language :: Python :: 3.10",
34
+ "Programming Language :: Python :: 3.11",
35
+ "Programming Language :: Python :: 3.12",
36
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
37
+ ]
38
+ dependencies = [
39
+ "huggingface_hub>=0.20.0",
40
+ "requests>=2.28.0",
41
+ ]
42
+
43
+ [project.optional-dependencies]
44
+ torch = ["transformers>=4.40.0", "torch>=2.0.0", "accelerate>=0.20.0"]
45
+ gguf = ["llama-cpp-python>=0.2.0"]
46
+ full = ["transformers>=4.40.0", "torch>=2.0.0", "accelerate>=0.20.0", "llama-cpp-python>=0.2.0", "sentence-transformers>=2.5.0"]
47
+ dev = ["pytest>=7.0", "pytest-cov", "ruff", "mypy"]
48
+
49
+ [project.urls]
50
+ Homepage = "https://huggingface.co/dispatchAI"
51
+ Documentation = "https://huggingface.co/dispatchAI"
52
+ Repository = "https://huggingface.co/dispatchAI/dispatchAI-SDK"
53
+ "Bug Tracker" = "https://huggingface.co/dispatchAI/dispatchAI-SDK/discussions"
54
+
55
+ [tool.setuptools.packages.find]
56
+ where = ["src"]
57
+
58
+ [tool.ruff]
59
+ line-length = 100
60
+ target-version = "py38"
src/dispatchai/__init__.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ dispatchAI SDK — Mobile-optimized LLMs that run on your phone.
3
+
4
+ Small. Mobile. Free. UAE-built.
5
+
6
+ Quick start:
7
+ pip install dispatchai
8
+
9
+ from dispatchai import load_model
10
+ model = load_model("SmolLM2-135M-Instruct-mobile")
11
+ print(model.chat("What is the capital of France?"))
12
+
13
+ # List available models
14
+ from dispatchai import list_models
15
+ for m in list_models():
16
+ print(m)
17
+
18
+ # Find the best model for your phone
19
+ from dispatchai import recommend
20
+ rec = recommend(ram_mb=2048, task="chat")
21
+ print(rec)
22
+ """
23
+
24
+ from .core import (
25
+ load_model,
26
+ list_models,
27
+ recommend,
28
+ estimate_latency,
29
+ calculate_cost,
30
+ DispatchModel,
31
+ )
32
+ from .version import __version__
33
+
34
+ __all__ = [
35
+ "load_model",
36
+ "list_models",
37
+ "recommend",
38
+ "estimate_latency",
39
+ "calculate_cost",
40
+ "DispatchModel",
41
+ "__version__",
42
+ ]
src/dispatchai/core.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ dispatchAI core module — model loading, inference, and utilities.
3
+ """
4
+
5
+ import json
6
+ import os
7
+ from typing import Optional, List, Dict, Any
8
+
9
+
10
+ # ─── Model catalog ───────────────────────────────────────────────────────────
11
+
12
# Hugging Face organisation name; prefixed to a model name to form the repo id
# ("<org>/<model name>") and the model URL.
_ORG = "dispatchAI"

# Static model catalog. Each entry is a dict with:
#   name      - model name without the organisation prefix
#   params    - parameter-count label as a string (e.g. "135M", "1.5B")
#   size_mb   - model size in MB
#   ram_mb    - RAM figure in MB that recommend() compares against the device RAM
#   task      - task tag used for filtering ("chat", "code", "math", ...)
#   quant     - quantization label
#   speed_tps - speed in tokens per second
_MODELS = [
    {"name": "SmolLM2-135M-Instruct-mobile", "params": "135M", "size_mb": 270, "ram_mb": 400, "task": "chat", "quant": "FP16", "speed_tps": 25.5},
    {"name": "SmolLM2-360M-Instruct-mobile", "params": "360M", "size_mb": 720, "ram_mb": 700, "task": "chat", "quant": "FP16", "speed_tps": 21.0},
    {"name": "Qwen2.5-0.5B-Instruct-mobile-int4", "params": "500M", "size_mb": 350, "ram_mb": 550, "task": "chat", "quant": "INT4", "speed_tps": 20.0},
    {"name": "Qwen2.5-0.5B-Coder-mobile", "params": "500M", "size_mb": 1000, "ram_mb": 1500, "task": "code", "quant": "FP16", "speed_tps": 20.0},
    {"name": "Llama-3.2-1B-Instruct-mobile", "params": "1B", "size_mb": 2500, "ram_mb": 3000, "task": "chat", "quant": "FP16", "speed_tps": 12.0},
    {"name": "Llama-3.2-1B-Instruct-Q4-mobile", "params": "1B", "size_mb": 700, "ram_mb": 1100, "task": "chat", "quant": "Q4", "speed_tps": 18.2},
    {"name": "Llama-3.2-1B-Instruct-Q6-mobile", "params": "1B", "size_mb": 1100, "ram_mb": 1300, "task": "chat", "quant": "Q6", "speed_tps": 16.8},
    {"name": "Llama-3.2-1B-FunctionCall-mobile", "params": "1B", "size_mb": 2500, "ram_mb": 3000, "task": "function_call", "quant": "FP16", "speed_tps": 12.0},
    {"name": "TinyLlama-1.1B-Chat-Q5-mobile", "params": "1.1B", "size_mb": 800, "ram_mb": 1200, "task": "chat", "quant": "Q5", "speed_tps": 17.5},
    {"name": "MiniCPM5-1B-mobile", "params": "1B", "size_mb": 2500, "ram_mb": 3000, "task": "chat", "quant": "FP16", "speed_tps": 12.0},
    {"name": "Qwen2.5-Coder-1.5B-mobile", "params": "1.5B", "size_mb": 3000, "ram_mb": 4000, "task": "code", "quant": "FP16", "speed_tps": 10.5},
    {"name": "Qwen2.5-Math-1.5B-mobile", "params": "1.5B", "size_mb": 3000, "ram_mb": 4000, "task": "math", "quant": "FP16", "speed_tps": 10.5},
    {"name": "Qwen2.5-1.5B-Instruct-Q5-mobile", "params": "1.5B", "size_mb": 1100, "ram_mb": 1700, "task": "chat", "quant": "Q5", "speed_tps": 14.5},
    {"name": "Qwen2.5-1.5B-Instruct-Q8-mobile", "params": "1.5B", "size_mb": 1600, "ram_mb": 2200, "task": "chat", "quant": "Q8", "speed_tps": 13.0},
    {"name": "Gemma-2-2B-IT-Q5-mobile", "params": "2B", "size_mb": 1500, "ram_mb": 2200, "task": "chat", "quant": "Q5", "speed_tps": 12.0},
    {"name": "Gemma-2B-Arabic-mobile", "params": "2B", "size_mb": 5000, "ram_mb": 5500, "task": "arabic", "quant": "FP16", "speed_tps": 8.0},
    {"name": "Llama-3.2-3B-Instruct-Q5-mobile", "params": "3B", "size_mb": 2100, "ram_mb": 2700, "task": "chat", "quant": "Q5", "speed_tps": 8.5},
    {"name": "Llama-3.2-3B-FunctionCall-mobile", "params": "3B", "size_mb": 6000, "ram_mb": 7000, "task": "function_call", "quant": "FP16", "speed_tps": 5.5},
    {"name": "Phi-3.5-mini-instruct-Q5-mobile", "params": "3.8B", "size_mb": 2800, "ram_mb": 3200, "task": "chat", "quant": "Q5", "speed_tps": 7.5},
    {"name": "Moondream2-Vision-Q5-mobile", "params": "1.9B", "size_mb": 1400, "ram_mb": 2000, "task": "vision", "quant": "Q5", "speed_tps": 8.5},
    {"name": "EmbeddingGemma-300M-Q8-mobile", "params": "300M", "size_mb": 300, "ram_mb": 500, "task": "embedding", "quant": "Q8", "speed_tps": 22.0},
    {"name": "Qwen3-Embedding-0.6B-Q8-mobile", "params": "600M", "size_mb": 600, "ram_mb": 800, "task": "embedding", "quant": "Q8", "speed_tps": 18.0},
]

# Baseline speed table used by estimate_latency():
#   parameter-count label -> quantization label -> tokens per second.
# estimate_latency() scales these values by a per-hardware multiplier in which
# "snapdragon_865" is 1.0, so the figures here are the Snapdragon 865 baseline.
# NOTE(review): not every size has a "Q4_K_M" entry, although "Q4_K_M" is the
# default quant of estimate_latency() - confirm whether entries are missing.
# NOTE(review): "1.9B" lists Q5 = 8.5, equal to its FP16 value and lower than
# its Q5_K_M value (12.5); "1.1B" lists Q5 = 17.5 vs Q5_K_M = 17.0 - verify.
_LATENCY_DB = {
    "135M": {"FP16": 25.5, "Q8_0": 28.2, "Q5_K_M": 30.1, "Q4_K_M": 32.0, "Q2_K": 35.0},
    "300M": {"FP16": 22.0, "Q8_0": 24.5, "Q5_K_M": 26.0, "Q4_K_M": 27.5, "Q2_K": 29.5},
    "500M": {"FP16": 20.0, "Q8_0": 24.0, "Q5_K_M": 25.5, "Q4_K_M": 26.8, "INT4": 20.0},
    "600M": {"FP16": 18.0, "Q8_0": 21.0, "Q5_K_M": 22.5, "Q4_K_M": 23.8},
    "1B": {"FP16": 12.0, "Q8_0": 15.5, "Q5_K_M": 17.5, "Q4_K_M": 18.2, "Q5": 17.5, "Q4": 18.2, "Q6": 16.8},
    "1.1B": {"FP16": 11.5, "Q8_0": 14.8, "Q5_K_M": 17.0, "Q5": 17.5},
    "1.5B": {"FP16": 10.5, "Q8_0": 13.0, "Q5_K_M": 14.5, "Q5": 14.5, "Q8": 13.0},
    "1.9B": {"FP16": 8.5, "Q8_0": 11.0, "Q5_K_M": 12.5, "Q5": 8.5},
    "2B": {"FP16": 8.0, "Q8_0": 10.5, "Q5_K_M": 12.0, "Q5": 12.0},
    "3B": {"FP16": 5.5, "Q8_0": 7.0, "Q5_K_M": 8.5, "Q5": 8.5},
    "3.8B": {"FP16": 4.5, "Q8_0": 6.0, "Q5_K_M": 7.5, "Q5": 7.5},
    "7B": {"FP16": 2.5, "Q8_0": 3.5, "Q5_K_M": 4.5},
}
53
+
54
+
55
+ # ─── Model loading & inference ───────────────────────────────────────────────
56
+
57
class DispatchModel:
    """A dispatchAI model handle that loads its weights lazily on first use.

    Constructing the object is cheap: nothing is downloaded or imported until
    the first call to :meth:`chat` or :meth:`generate`.

    Example:
        from dispatchai import load_model
        model = load_model("SmolLM2-135M-Instruct-mobile")
        print(model.chat("Hello!"))
    """

    # Backends this class knows how to load and run.
    _BACKENDS = ("transformers", "gguf")

    def __init__(
        self,
        model_name: str,
        repo_id: str,
        backend: str = "transformers",
        token: Optional[str] = None,
    ):
        """Create an unloaded model handle.

        Args:
            model_name: Model name without the organisation prefix.
            repo_id: Full Hugging Face repo id ("<org>/<model name>").
            backend: "transformers" (default) or "gguf" for llama.cpp.
            token: Optional Hugging Face token forwarded to the download calls.

        Raises:
            ValueError: If ``backend`` is not a supported backend.
        """
        # Fail fast: an unknown backend used to be accepted and made chat() /
        # generate() silently return an empty string.
        if backend not in self._BACKENDS:
            raise ValueError(
                f"Unknown backend {backend!r}; expected one of {list(self._BACKENDS)}"
            )
        self.model_name = model_name
        self.repo_id = repo_id
        self.backend = backend
        self.token = token
        self._model = None
        self._tokenizer = None
        self._loaded = False

    def _load(self):
        """Load the model for the configured backend; a no-op once loaded."""
        if self._loaded:
            return

        if self.backend == "transformers":
            self._load_transformers()
        elif self.backend == "gguf":
            self._load_gguf()
        else:
            # Only reachable if ``backend`` was reassigned after construction.
            raise ValueError(
                f"Unknown backend {self.backend!r}; expected one of {list(self._BACKENDS)}"
            )

        self._loaded = True

    def _load_transformers(self):
        """Load tokenizer and model through the transformers/torch stack."""
        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except ImportError as err:
            raise ImportError(
                "transformers backend requires: pip install dispatchai[torch]\n"
                "Or use GGUF backend: load_model(..., backend='gguf')"
            ) from err

        use_cuda = torch.cuda.is_available()
        self._tokenizer = AutoTokenizer.from_pretrained(self.repo_id, token=self.token)
        self._model = AutoModelForCausalLM.from_pretrained(
            self.repo_id,
            # Half precision only when a GPU is available; float32 on CPU.
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="auto" if use_cuda else None,
            token=self.token,
        )

    def _load_gguf(self):
        """Download ``model.gguf`` from the repo and open it with llama.cpp."""
        try:
            from llama_cpp import Llama
        except ImportError as err:
            raise ImportError(
                "GGUF backend requires: pip install dispatchai[gguf]"
            ) from err

        from huggingface_hub import hf_hub_download

        gguf_path = hf_hub_download(self.repo_id, "model.gguf", token=self.token)
        self._model = Llama(model_path=gguf_path, n_ctx=512, n_threads=4, verbose=False)

    def _generate_transformers(
        self,
        input_text: str,
        max_tokens: int,
        temperature: float,
        add_special_tokens: bool = True,
    ) -> str:
        """Run transformers generation and return only the newly generated text."""
        import torch

        encoded = self._tokenizer(
            input_text, return_tensors="pt", add_special_tokens=add_special_tokens
        )
        # Follow the model's actual device instead of assuming CUDA.
        inputs = {key: value.to(self._model.device) for key, value in encoded.items()}

        gen_kwargs = {
            "max_new_tokens": max_tokens,
            "pad_token_id": self._tokenizer.eos_token_id,
        }
        if temperature > 0:
            gen_kwargs["do_sample"] = True
            gen_kwargs["temperature"] = temperature
        else:
            # Greedy decoding: do not pass a temperature that sampling would reject.
            gen_kwargs["do_sample"] = False

        with torch.no_grad():
            outputs = self._model.generate(**inputs, **gen_kwargs)

        # Strip the prompt by token count; slicing the decoded string by
        # len(prompt) breaks whenever decoding does not reproduce the prompt.
        prompt_len = inputs["input_ids"].shape[1]
        return self._tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()

    def chat(self, message: str, system: str = "", max_tokens: int = 256, temperature: float = 0.7) -> str:
        """Send a chat message and get a response.

        Args:
            message: User message
            system: Optional system prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature; 0 selects greedy decoding

        Returns:
            Model response text
        """
        self._load()

        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": message})

        if self.backend == "transformers":
            input_text = self._tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            # The template output already contains the special tokens, so they
            # must not be added a second time when tokenizing.
            return self._generate_transformers(
                input_text, max_tokens, temperature, add_special_tokens=False
            )

        # GGUF: use the chat-completion API so the system prompt and the
        # model's chat format are applied instead of a raw text completion.
        response = self._model.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return (response["choices"][0]["message"]["content"] or "").strip()

    def generate(self, prompt: str, max_tokens: int = 256, temperature: float = 0.7) -> str:
        """Generate text from a raw prompt (no chat template).

        Args:
            prompt: Raw text prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature; 0 selects greedy decoding

        Returns:
            Generated text (the continuation only, without the prompt)
        """
        self._load()

        if self.backend == "transformers":
            return self._generate_transformers(prompt, max_tokens, temperature)

        response = self._model(prompt, max_tokens=max_tokens, temperature=temperature, echo=False)
        return response["choices"][0]["text"].strip()

    def __repr__(self):
        # The token is deliberately left out so it cannot leak into logs.
        return f"DispatchModel(name={self.model_name!r}, repo={self.repo_id!r}, backend={self.backend!r}, loaded={self._loaded})"


def load_model(model_name: str, backend: str = "transformers", token: Optional[str] = None) -> DispatchModel:
    """Load a dispatchAI mobile model for inference.

    The returned model is lazy: weights are fetched on the first call to
    ``.chat()`` or ``.generate()``.

    Args:
        model_name: Model name with or without the org prefix
            (e.g., "SmolLM2-135M-Instruct-mobile")
        backend: "transformers" (default) or "gguf" for llama.cpp
        token: Optional HuggingFace token for private/gated models

    Returns:
        DispatchModel ready for .chat() or .generate()

    Raises:
        ValueError: If ``backend`` is not supported.

    Example:
        >>> from dispatchai import load_model
        >>> model = load_model("SmolLM2-135M-Instruct-mobile")
        >>> print(model.chat("What is 2+2?"))

        For GGUF/llama.cpp:
        >>> model = load_model("Llama-3.2-1B-Instruct-Q4-mobile", backend="gguf")
    """
    # Allow full repo_id or just the name
    prefix = f"{_ORG}/"
    if model_name.startswith(prefix):
        repo_id = model_name
        # Strip only the leading prefix, not every occurrence in the name.
        model_name = model_name[len(prefix):]
    else:
        repo_id = f"{prefix}{model_name}"

    return DispatchModel(model_name, repo_id, backend=backend, token=token)
230
+
231
+
232
+ # ─── Catalog & utilities ─────────────────────────────────────────────────────
233
+
234
def list_models(task: Optional[str] = None) -> List[Dict[str, Any]]:
    """List all available dispatchAI mobile models, smallest first.

    Args:
        task: Optional filter — "chat", "code", "math", "arabic", "function_call",
            "vision", "embedding". Matching is case-insensitive and accepts
            "-" in place of "_".

    Returns:
        List of model dicts with name, params, size_mb, ram_mb, task, quant,
        speed_tps, sorted by size_mb ascending. Each dict is a copy, so
        callers may modify it without affecting the catalog.

    Example:
        >>> from dispatchai import list_models
        >>> for m in list_models("chat"):
        ...     print(f"{m['name']}: {m['size_mb']}MB, {m['speed_tps']} t/s")
    """
    wanted = task.lower().replace("-", "_") if task else None
    # Copy each entry: handing out the catalog's own dicts would let a caller
    # mutate the module-level catalog by accident.
    matches = [dict(m) for m in _MODELS if wanted is None or m["task"] == wanted]
    matches.sort(key=lambda m: m["size_mb"])
    return matches
252
+
253
+
254
def _param_count(params: str) -> float:
    """Convert a size label such as "135M" or "1.5B" into a parameter count."""
    scale = {"K": 1e3, "M": 1e6, "B": 1e9}
    label = params.strip().upper()
    if label and label[-1] in scale:
        return float(label[:-1]) * scale[label[-1]]
    return float(label)


def recommend(ram_mb: int = 2048, task: str = "chat", priority: str = "size") -> Dict[str, Any]:
    """Get a model recommendation for your device.

    Args:
        ram_mb: Available RAM in MB (e.g., 2048 for 2GB phone)
        task: Primary task — "chat", "code", "math", "arabic", "function_call",
            "vision", "embedding", or "any". An unrecognised task is treated
            like "any" (no task filter).
        priority: "size" (smallest), "speed" (fastest), or "quality" (largest params).
            Any other value leaves the candidates in catalog order.

    Returns:
        Dict with "recommended" (the best model) and "alternatives" (up to
        three runners-up), or a dict with a single "error" key when no model
        fits the RAM budget for the task.

    Example:
        >>> from dispatchai import recommend
        >>> rec = recommend(ram_mb=2048, task="chat")
        >>> print(f"Best: {rec['recommended']['name']} ({rec['recommended']['size_mb']}MB)")
    """
    filtered = [m for m in _MODELS if m["ram_mb"] <= ram_mb]

    task_map = {
        "chat": "chat", "code": "code", "math": "math",
        "arabic": "arabic", "function_call": "function_call",
        "function-call": "function_call", "vision": "vision",
        "embedding": "embedding", "any": None,
    }
    task_key = task_map.get(task.lower(), None)
    if task_key:
        filtered = [m for m in filtered if m["task"] == task_key]

    if not filtered:
        return {"error": f"No models fit in {ram_mb}MB RAM for task '{task}'"}

    if priority == "size":
        filtered.sort(key=lambda m: m["size_mb"])
    elif priority == "speed":
        filtered.sort(key=lambda m: m["speed_tps"], reverse=True)
    elif priority == "quality":
        # "params" is a label like "500M" or "3B"; compare numerically, because
        # a plain string sort would rank "500M" above "3B".
        filtered.sort(key=lambda m: _param_count(m["params"]), reverse=True)

    best = filtered[0]
    return {
        "recommended": {
            "name": best["name"],
            "repo_id": f"{_ORG}/{best['name']}",
            "url": f"https://huggingface.co/{_ORG}/{best['name']}",
            "params": best["params"],
            "size_mb": best["size_mb"],
            "ram_mb": best["ram_mb"],
            "quant": best["quant"],
            "speed_tps": best["speed_tps"],
        },
        "alternatives": [
            {"name": m["name"], "size_mb": m["size_mb"], "speed_tps": m["speed_tps"]}
            for m in filtered[1:4]
        ],
    }
309
+
310
+
311
def estimate_latency(params: str, quant: str = "Q4_K_M", hardware: str = "snapdragon_865") -> Dict[str, Any]:
    """Estimate on-device inference latency.

    Args:
        params: Parameter count — "135M", "500M", "1B", "1.5B", "3B", etc.
        quant: Quantization — "FP16", "Q4_K_M", "Q5_K_M", "Q8_0", "INT4".
            Not every quantization is available for every parameter count.
        hardware: Target hardware — "snapdragon_865", "snapdragon_8_gen_2",
            "snapdragon_8_gen_3", "apple_a17", "apple_m2", "snapdragon_778g".
            An unrecognised value is scored with the snapdragon_865 baseline
            multiplier (1.0).

    Returns:
        Dict with params, quant, hardware, tokens_per_sec, latency_ms_per_token,
        suitable_for_realtime and suitable_for_phone; or a dict with a single
        "error" key when the parameter count or the quantization has no entry
        in the latency table.

    Example:
        >>> from dispatchai import estimate_latency
        >>> lat = estimate_latency("1B", "Q4_K_M")
        >>> print(f"{lat['tokens_per_sec']} t/s, {lat['latency_ms_per_token']}ms/token")
    """
    hw_multipliers = {
        "snapdragon_865": 1.0,
        "snapdragon_8_gen_2": 1.8,
        "snapdragon_8_gen_3": 2.2,
        "apple_a17": 2.5,
        "apple_m2": 3.0,
        "snapdragon_778g": 0.7,
    }

    params_upper = params.upper()
    quant_upper = quant.upper()

    if params_upper not in _LATENCY_DB:
        return {"error": f"Unknown params: {params}. Valid: {list(_LATENCY_DB.keys())}"}

    by_quant = _LATENCY_DB[params_upper]
    if quant_upper not in by_quant:
        # Report the gap instead of inventing a figure: a fixed fallback speed
        # is unrelated to the model size and would mislead the caller.
        return {
            "error": f"Unknown quant: {quant} for params {params}. Valid: {list(by_quant.keys())}"
        }

    base_tps = by_quant[quant_upper]
    hw_mult = hw_multipliers.get(hardware, 1.0)
    actual_tps = base_tps * hw_mult

    return {
        "params": params,
        "quant": quant,
        "hardware": hardware,
        "tokens_per_sec": round(actual_tps, 1),
        "latency_ms_per_token": round(1000 / actual_tps, 0),
        "suitable_for_realtime": actual_tps > 10,
        "suitable_for_phone": actual_tps > 2,
    }
355
+
356
+
357
def calculate_cost(daily_queries: int, cloud_cost_per_1k: float = 0.50, days: int = 365) -> Dict[str, float]:
    """Compare what a cloud API would cost against running on-device.

    Args:
        daily_queries: Number of AI queries per day
        cloud_cost_per_1k: Cloud API cost per 1000 queries
        days: Time period in days

    Returns:
        Dict with cloud_cost, device_cost, savings, savings_pct and
        daily_cloud_cost. savings_pct is 0 when the cloud cost is not positive.

    Example:
        >>> from dispatchai import calculate_cost
        >>> result = calculate_cost(daily_queries=10000, cloud_cost_per_1k=0.50)
        >>> print(f"Save ${result['savings']:.0f}/year with on-device")
    """
    # One-time download cost
    on_device_spend = 0.50

    per_day_cloud = (daily_queries / 1000) * cloud_cost_per_1k
    cloud_spend = per_day_cloud * days

    if cloud_spend > 0:
        pct_saved = round((1 - on_device_spend / cloud_spend) * 100, 1)
    else:
        pct_saved = 0

    report = {}
    report["cloud_cost"] = round(cloud_spend, 2)
    report["device_cost"] = round(on_device_spend, 2)
    report["savings"] = round(cloud_spend - on_device_spend, 2)
    report["savings_pct"] = pct_saved
    report["daily_cloud_cost"] = round(per_day_cloud, 2)
    return report
src/dispatchai/version.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __version__ = "0.1.0"