Upload folder using huggingface_hub
Browse files- README.md +116 -0
- pyproject.toml +60 -0
- src/dispatchai/__init__.py +42 -0
- src/dispatchai/core.py +382 -0
- src/dispatchai/version.py +1 -0
README.md
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# dispatchAI SDK
|
| 2 |
+
|
| 3 |
+
**Small. Mobile. Free. UAE-built.**
|
| 4 |
+
|
| 5 |
+
`pip install dispatchai` — Run mobile-optimized LLMs on your phone, edge device, or laptop. 39 models, all tested on real Snapdragon hardware, all free.
|
| 6 |
+
|
| 7 |
+
## Quick Start
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
pip install dispatchai
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
### Chat with a model
|
| 14 |
+
|
| 15 |
+
```python
|
| 16 |
+
from dispatchai import load_model
|
| 17 |
+
|
| 18 |
+
model = load_model("SmolLM2-135M-Instruct-mobile")
|
| 19 |
+
response = model.chat("What is the capital of France?")
|
| 20 |
+
print(response)
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
### Use GGUF/llama.cpp backend
|
| 24 |
+
|
| 25 |
+
```python
|
| 26 |
+
model = load_model("Llama-3.2-1B-Instruct-Q4-mobile", backend="gguf")
|
| 27 |
+
print(model.chat("Write a haiku about the desert."))
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
### Find the best model for your phone
|
| 31 |
+
|
| 32 |
+
```python
|
| 33 |
+
from dispatchai import recommend
|
| 34 |
+
|
| 35 |
+
rec = recommend(ram_mb=2048, task="chat")
|
| 36 |
+
print(f"Best model: {rec['recommended']['name']}")
|
| 37 |
+
print(f"Size: {rec['recommended']['size_mb']}MB")
|
| 38 |
+
print(f"Speed: {rec['recommended']['speed_tps']} tokens/sec")
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
### List all models
|
| 42 |
+
|
| 43 |
+
```python
|
| 44 |
+
from dispatchai import list_models
|
| 45 |
+
|
| 46 |
+
for m in list_models(task="chat"):
|
| 47 |
+
    print(f" {m['name']}: {m['size_mb']}MB, {m['speed_tps']} t/s")
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
### Estimate latency
|
| 51 |
+
|
| 52 |
+
```python
|
| 53 |
+
from dispatchai import estimate_latency
|
| 54 |
+
|
| 55 |
+
lat = estimate_latency("1B", "Q4_K_M")
|
| 56 |
+
print(f"{lat['tokens_per_sec']} tokens/sec on Snapdragon 865")
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
### Calculate cost savings
|
| 60 |
+
|
| 61 |
+
```python
|
| 62 |
+
from dispatchai import calculate_cost
|
| 63 |
+
|
| 64 |
+
result = calculate_cost(daily_queries=10000, cloud_cost_per_1k=0.50)
|
| 65 |
+
print(f"Annual savings: ${result['savings']}")
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
## Installation Options
|
| 69 |
+
|
| 70 |
+
```bash
|
| 71 |
+
pip install dispatchai # Core (model catalog, recommendations)
|
| 72 |
+
pip install dispatchai[torch] # + transformers/torch backend
|
| 73 |
+
pip install dispatchai[gguf] # + llama.cpp GGUF backend
|
| 74 |
+
pip install dispatchai[full] # + everything (torch, gguf, sentence-transformers)
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
## Available Models
|
| 78 |
+
|
| 79 |
+
| Model | Params | Size | Speed | Task |
|
| 80 |
+
|-------|--------|------|-------|------|
|
| 81 |
+
| SmolLM2-135M-Instruct-mobile | 135M | 270MB | 25.5 t/s | Chat |
|
| 82 |
+
| SmolLM2-360M-Instruct-mobile | 360M | 720MB | 21.0 t/s | Chat |
|
| 83 |
+
| Qwen2.5-0.5B-Instruct-mobile-int4 | 500M | 350MB | 20.0 t/s | Chat |
|
| 84 |
+
| Llama-3.2-1B-Instruct-Q4-mobile | 1B | 700MB | 18.2 t/s | Chat |
|
| 85 |
+
| Llama-3.2-1B-FunctionCall-mobile | 1B | 2.5GB | 12.0 t/s | Function Call |
|
| 86 |
+
| Qwen2.5-Coder-1.5B-mobile | 1.5B | 3.0GB | 10.5 t/s | Code |
|
| 87 |
+
| Gemma-2B-Arabic-mobile | 2B | 5.0GB | 8.0 t/s | Arabic |
|
| 88 |
+
| Llama-3.2-3B-Instruct-Q5-mobile | 3B | 2.1GB | 8.5 t/s | Chat |
|
| 89 |
+
|
| 90 |
+
[Browse all 39 models →](https://huggingface.co/dispatchAI)
|
| 91 |
+
|
| 92 |
+
## Hardware Targets
|
| 93 |
+
|
| 94 |
+
All benchmarks measured on **Snapdragon 865 (Samsung S20 FE, 8GB RAM)** using llama.cpp.
|
| 95 |
+
|
| 96 |
+
The `estimate_latency()` function supports:
|
| 97 |
+
- Snapdragon 865 (baseline)
|
| 98 |
+
- Snapdragon 8 Gen 2 (1.8x)
|
| 99 |
+
- Snapdragon 8 Gen 3 (2.2x)
|
| 100 |
+
- Apple A17 Pro (2.5x)
|
| 101 |
+
- Apple M2 (3.0x)
|
| 102 |
+
- Snapdragon 778G mid-range (0.7x)
|
| 103 |
+
|
| 104 |
+
## The Thesis
|
| 105 |
+
|
| 106 |
+
> *The best model is the one that runs.*
|
| 107 |
+
|
| 108 |
+
We're building the AI layer for a billion phones that can't afford cloud inference. Every model is free, open-source, and tested on real hardware.
|
| 109 |
+
|
| 110 |
+
## About
|
| 111 |
+
|
| 112 |
+
Dispatch AI (FZE) — Sharjah Free Zone, UAE. License No. 10818.
|
| 113 |
+
|
| 114 |
+
🌐 [dispatchai.ai](https://www.dispatchai.ai) | 🤗 [huggingface.co/dispatchAI](https://huggingface.co/dispatchAI) | 𝕏 [@DispatchAIdev](https://twitter.com/DispatchAIdev)
|
| 115 |
+
|
| 116 |
+
*I think, therefore I ship.*
|
pyproject.toml
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# dispatchAI Python SDK
|
| 2 |
+
# pip install dispatchai
|
| 3 |
+
#
|
| 4 |
+
# Two-line inference with any dispatchAI mobile model:
|
| 5 |
+
# from dispatchai import load_model
|
| 6 |
+
# model = load_model("SmolLM2-135M-Instruct-mobile")
|
| 7 |
+
|
| 8 |
+
[build-system]
|
| 9 |
+
requires = ["setuptools>=64", "wheel"]
|
| 10 |
+
build-backend = "setuptools.build_meta"
|
| 11 |
+
|
| 12 |
+
[project]
|
| 13 |
+
name = "dispatchai"
|
| 14 |
+
version = "0.1.0"
|
| 15 |
+
description = "dispatchAI — Mobile-optimized LLMs that run on your phone. Small. Mobile. Free. UAE-built."
|
| 16 |
+
readme = "README.md"
|
| 17 |
+
license = {text = "Apache-2.0"}
|
| 18 |
+
requires-python = ">=3.8"
|
| 19 |
+
authors = [
|
| 20 |
+
{name = "Dispatch AI (FZE)", email = "contact@dispatchai.ai"}
|
| 21 |
+
]
|
| 22 |
+
keywords = [
|
| 23 |
+
"mobile", "llm", "on-device", "edge", "quantized", "gguf",
|
| 24 |
+
"huggingface", "arabic", "small-models", "dispatchai"
|
| 25 |
+
]
|
| 26 |
+
classifiers = [
|
| 27 |
+
"Development Status :: 4 - Beta",
|
| 28 |
+
"Intended Audience :: Developers",
|
| 29 |
+
"License :: OSI Approved :: Apache Software License",
|
| 30 |
+
"Programming Language :: Python :: 3",
|
| 31 |
+
"Programming Language :: Python :: 3.8",
|
| 32 |
+
"Programming Language :: Python :: 3.9",
|
| 33 |
+
"Programming Language :: Python :: 3.10",
|
| 34 |
+
"Programming Language :: Python :: 3.11",
|
| 35 |
+
"Programming Language :: Python :: 3.12",
|
| 36 |
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
| 37 |
+
]
|
| 38 |
+
dependencies = [
|
| 39 |
+
"huggingface_hub>=0.20.0",
|
| 40 |
+
"requests>=2.28.0",
|
| 41 |
+
]
|
| 42 |
+
|
| 43 |
+
[project.optional-dependencies]
|
| 44 |
+
torch = ["transformers>=4.40.0", "torch>=2.0.0", "accelerate>=0.20.0"]
|
| 45 |
+
gguf = ["llama-cpp-python>=0.2.0"]
|
| 46 |
+
full = ["transformers>=4.40.0", "torch>=2.0.0", "accelerate>=0.20.0", "llama-cpp-python>=0.2.0", "sentence-transformers>=2.5.0"]
|
| 47 |
+
dev = ["pytest>=7.0", "pytest-cov", "ruff", "mypy"]
|
| 48 |
+
|
| 49 |
+
[project.urls]
|
| 50 |
+
Homepage = "https://huggingface.co/dispatchAI"
|
| 51 |
+
Documentation = "https://huggingface.co/dispatchAI"
|
| 52 |
+
Repository = "https://huggingface.co/dispatchAI/dispatchAI-SDK"
|
| 53 |
+
"Bug Tracker" = "https://huggingface.co/dispatchAI/dispatchAI-SDK/discussions"
|
| 54 |
+
|
| 55 |
+
[tool.setuptools.packages.find]
|
| 56 |
+
where = ["src"]
|
| 57 |
+
|
| 58 |
+
[tool.ruff]
|
| 59 |
+
line-length = 100
|
| 60 |
+
target-version = "py38"
|
src/dispatchai/__init__.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
dispatchAI SDK — Mobile-optimized LLMs that run on your phone.
|
| 3 |
+
|
| 4 |
+
Small. Mobile. Free. UAE-built.
|
| 5 |
+
|
| 6 |
+
Quick start:
|
| 7 |
+
pip install dispatchai
|
| 8 |
+
|
| 9 |
+
from dispatchai import load_model
|
| 10 |
+
model = load_model("SmolLM2-135M-Instruct-mobile")
|
| 11 |
+
print(model.chat("What is the capital of France?"))
|
| 12 |
+
|
| 13 |
+
# List available models
|
| 14 |
+
from dispatchai import list_models
|
| 15 |
+
for m in list_models():
|
| 16 |
+
print(m)
|
| 17 |
+
|
| 18 |
+
# Find the best model for your phone
|
| 19 |
+
from dispatchai import recommend
|
| 20 |
+
rec = recommend(ram_mb=2048, task="chat")
|
| 21 |
+
print(rec)
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
from .core import (
|
| 25 |
+
load_model,
|
| 26 |
+
list_models,
|
| 27 |
+
recommend,
|
| 28 |
+
estimate_latency,
|
| 29 |
+
calculate_cost,
|
| 30 |
+
DispatchModel,
|
| 31 |
+
)
|
| 32 |
+
from .version import __version__
|
| 33 |
+
|
| 34 |
+
__all__ = [
|
| 35 |
+
"load_model",
|
| 36 |
+
"list_models",
|
| 37 |
+
"recommend",
|
| 38 |
+
"estimate_latency",
|
| 39 |
+
"calculate_cost",
|
| 40 |
+
"DispatchModel",
|
| 41 |
+
"__version__",
|
| 42 |
+
]
|
src/dispatchai/core.py
ADDED
|
@@ -0,0 +1,382 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
dispatchAI core module — model loading, inference, and utilities.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
from typing import Optional, List, Dict, Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# ─── Model catalog ────────────────────────────────────────────────────────────
|
| 11 |
+
|
| 12 |
+
# Hugging Face organisation that hosts every model repository in the catalog.
_ORG = "dispatchAI"

# Static model catalog. Each entry carries:
#   name      - repository name under _ORG
#   params    - parameter-count label (e.g. "135M", "1.5B")
#   size_mb   - size figure in MB
#   ram_mb    - RAM figure in MB; recommend() compares it against the caller's available RAM
#   task      - task tag used for filtering ("chat", "code", "math", ...)
#   quant     - quantization label
#   speed_tps - throughput figure in tokens/sec
_MODELS = [
    {"name": "SmolLM2-135M-Instruct-mobile", "params": "135M", "size_mb": 270, "ram_mb": 400, "task": "chat", "quant": "FP16", "speed_tps": 25.5},
    {"name": "SmolLM2-360M-Instruct-mobile", "params": "360M", "size_mb": 720, "ram_mb": 700, "task": "chat", "quant": "FP16", "speed_tps": 21.0},
    {"name": "Qwen2.5-0.5B-Instruct-mobile-int4", "params": "500M", "size_mb": 350, "ram_mb": 550, "task": "chat", "quant": "INT4", "speed_tps": 20.0},
    {"name": "Qwen2.5-0.5B-Coder-mobile", "params": "500M", "size_mb": 1000, "ram_mb": 1500, "task": "code", "quant": "FP16", "speed_tps": 20.0},
    {"name": "Llama-3.2-1B-Instruct-mobile", "params": "1B", "size_mb": 2500, "ram_mb": 3000, "task": "chat", "quant": "FP16", "speed_tps": 12.0},
    {"name": "Llama-3.2-1B-Instruct-Q4-mobile", "params": "1B", "size_mb": 700, "ram_mb": 1100, "task": "chat", "quant": "Q4", "speed_tps": 18.2},
    {"name": "Llama-3.2-1B-Instruct-Q6-mobile", "params": "1B", "size_mb": 1100, "ram_mb": 1300, "task": "chat", "quant": "Q6", "speed_tps": 16.8},
    {"name": "Llama-3.2-1B-FunctionCall-mobile", "params": "1B", "size_mb": 2500, "ram_mb": 3000, "task": "function_call", "quant": "FP16", "speed_tps": 12.0},
    {"name": "TinyLlama-1.1B-Chat-Q5-mobile", "params": "1.1B", "size_mb": 800, "ram_mb": 1200, "task": "chat", "quant": "Q5", "speed_tps": 17.5},
    {"name": "MiniCPM5-1B-mobile", "params": "1B", "size_mb": 2500, "ram_mb": 3000, "task": "chat", "quant": "FP16", "speed_tps": 12.0},
    {"name": "Qwen2.5-Coder-1.5B-mobile", "params": "1.5B", "size_mb": 3000, "ram_mb": 4000, "task": "code", "quant": "FP16", "speed_tps": 10.5},
    {"name": "Qwen2.5-Math-1.5B-mobile", "params": "1.5B", "size_mb": 3000, "ram_mb": 4000, "task": "math", "quant": "FP16", "speed_tps": 10.5},
    {"name": "Qwen2.5-1.5B-Instruct-Q5-mobile", "params": "1.5B", "size_mb": 1100, "ram_mb": 1700, "task": "chat", "quant": "Q5", "speed_tps": 14.5},
    {"name": "Qwen2.5-1.5B-Instruct-Q8-mobile", "params": "1.5B", "size_mb": 1600, "ram_mb": 2200, "task": "chat", "quant": "Q8", "speed_tps": 13.0},
    {"name": "Gemma-2-2B-IT-Q5-mobile", "params": "2B", "size_mb": 1500, "ram_mb": 2200, "task": "chat", "quant": "Q5", "speed_tps": 12.0},
    {"name": "Gemma-2B-Arabic-mobile", "params": "2B", "size_mb": 5000, "ram_mb": 5500, "task": "arabic", "quant": "FP16", "speed_tps": 8.0},
    {"name": "Llama-3.2-3B-Instruct-Q5-mobile", "params": "3B", "size_mb": 2100, "ram_mb": 2700, "task": "chat", "quant": "Q5", "speed_tps": 8.5},
    {"name": "Llama-3.2-3B-FunctionCall-mobile", "params": "3B", "size_mb": 6000, "ram_mb": 7000, "task": "function_call", "quant": "FP16", "speed_tps": 5.5},
    {"name": "Phi-3.5-mini-instruct-Q5-mobile", "params": "3.8B", "size_mb": 2800, "ram_mb": 3200, "task": "chat", "quant": "Q5", "speed_tps": 7.5},
    {"name": "Moondream2-Vision-Q5-mobile", "params": "1.9B", "size_mb": 1400, "ram_mb": 2000, "task": "vision", "quant": "Q5", "speed_tps": 8.5},
    {"name": "EmbeddingGemma-300M-Q8-mobile", "params": "300M", "size_mb": 300, "ram_mb": 500, "task": "embedding", "quant": "Q8", "speed_tps": 22.0},
    {"name": "Qwen3-Embedding-0.6B-Q8-mobile", "params": "600M", "size_mb": 600, "ram_mb": 800, "task": "embedding", "quant": "Q8", "speed_tps": 18.0},
]

# Baseline throughput (tokens/sec) keyed by parameter-count label, then by quantization label.
# estimate_latency() scales these figures by a per-hardware multiplier (1.0 = snapdragon_865).
# Not every size lists every quantization; lookups for a missing pair fall back to a default.
_LATENCY_DB = {
    "135M": {"FP16": 25.5, "Q8_0": 28.2, "Q5_K_M": 30.1, "Q4_K_M": 32.0, "Q2_K": 35.0},
    "300M": {"FP16": 22.0, "Q8_0": 24.5, "Q5_K_M": 26.0, "Q4_K_M": 27.5, "Q2_K": 29.5},
    "500M": {"FP16": 20.0, "Q8_0": 24.0, "Q5_K_M": 25.5, "Q4_K_M": 26.8, "INT4": 20.0},
    "600M": {"FP16": 18.0, "Q8_0": 21.0, "Q5_K_M": 22.5, "Q4_K_M": 23.8},
    "1B": {"FP16": 12.0, "Q8_0": 15.5, "Q5_K_M": 17.5, "Q4_K_M": 18.2, "Q5": 17.5, "Q4": 18.2, "Q6": 16.8},
    "1.1B": {"FP16": 11.5, "Q8_0": 14.8, "Q5_K_M": 17.0, "Q5": 17.5},
    "1.5B": {"FP16": 10.5, "Q8_0": 13.0, "Q5_K_M": 14.5, "Q5": 14.5, "Q8": 13.0},
    "1.9B": {"FP16": 8.5, "Q8_0": 11.0, "Q5_K_M": 12.5, "Q5": 8.5},
    "2B": {"FP16": 8.0, "Q8_0": 10.5, "Q5_K_M": 12.0, "Q5": 12.0},
    "3B": {"FP16": 5.5, "Q8_0": 7.0, "Q5_K_M": 8.5, "Q5": 8.5},
    "3.8B": {"FP16": 4.5, "Q8_0": 6.0, "Q5_K_M": 7.5, "Q5": 7.5},
    "7B": {"FP16": 2.5, "Q8_0": 3.5, "Q5_K_M": 4.5},
}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ─── Model loading & inference ────────────────────────────────────────────────
|
| 56 |
+
|
| 57 |
+
class DispatchModel:
    """A dispatchAI model handle that loads its weights lazily.

    Construction is cheap: nothing is downloaded or imported until the first
    call to :meth:`chat` or :meth:`generate`.

    Example:
        from dispatchai import load_model
        model = load_model("SmolLM2-135M-Instruct-mobile")
        print(model.chat("Hello!"))
    """

    # Backends this class knows how to load and run.
    _BACKENDS = ("transformers", "gguf")

    def __init__(
        self,
        model_name: str,
        repo_id: str,
        backend: str = "transformers",
        token: Optional[str] = None,
    ):
        """Record which model to load; no I/O happens here.

        Args:
            model_name: Model name without the organisation prefix.
            repo_id: Full Hugging Face repository id ("org/name").
            backend: "transformers" (default) or "gguf".
            token: Optional Hugging Face access token forwarded to the download calls.

        Raises:
            ValueError: If ``backend`` is not one of the supported backends.
        """
        # Fail fast: an unknown backend used to be accepted and made chat()/generate()
        # silently return an empty string.
        if backend not in self._BACKENDS:
            raise ValueError(
                f"Unknown backend {backend!r}; expected one of {list(self._BACKENDS)}"
            )
        self.model_name = model_name
        self.repo_id = repo_id
        self.backend = backend
        self.token = token
        self._model = None
        self._tokenizer = None
        self._loaded = False

    def _load(self):
        """Load the model on first use; later calls are no-ops."""
        if self._loaded:
            return

        if self.backend == "transformers":
            self._load_transformers()
        elif self.backend == "gguf":
            self._load_gguf()
        else:
            # Only reachable if ``backend`` was reassigned after construction.
            raise ValueError(
                f"Unknown backend {self.backend!r}; expected one of {list(self._BACKENDS)}"
            )

        self._loaded = True

    def _load_transformers(self):
        """Load tokenizer and causal-LM weights through the transformers library."""
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM
            import torch
        except ImportError as err:
            raise ImportError(
                "transformers backend requires: pip install dispatchai[torch]\n"
                "Or use GGUF backend: load_model(..., backend='gguf')"
            ) from err

        use_cuda = torch.cuda.is_available()
        self._tokenizer = AutoTokenizer.from_pretrained(self.repo_id, token=self.token)
        self._model = AutoModelForCausalLM.from_pretrained(
            self.repo_id,
            # Half precision only when a CUDA device is available; float32 otherwise.
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="auto" if use_cuda else None,
            token=self.token,
        )

    def _load_gguf(self):
        """Download ``model.gguf`` from the repository and open it with llama.cpp."""
        try:
            from llama_cpp import Llama
        except ImportError as err:
            raise ImportError(
                "GGUF backend requires: pip install dispatchai[gguf]"
            ) from err

        from huggingface_hub import hf_hub_download

        gguf_path = hf_hub_download(self.repo_id, "model.gguf", token=self.token)
        self._model = Llama(model_path=gguf_path, n_ctx=512, n_threads=4, verbose=False)

    def _generate_transformers(self, text: str, max_tokens: int, temperature: float) -> str:
        """Run transformers generation on ``text`` and return only the newly generated text."""
        import torch

        inputs = self._tokenizer(text, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        sampling = temperature > 0
        gen_kwargs = {
            "max_new_tokens": max_tokens,
            "do_sample": sampling,
            "pad_token_id": self._tokenizer.eos_token_id,
        }
        if sampling:
            # Temperature is a sampling-only setting; leave it out for greedy decoding.
            gen_kwargs["temperature"] = temperature

        with torch.no_grad():
            outputs = self._model.generate(**inputs, **gen_kwargs)

        # Drop the prompt by token count rather than by character count of the decoded
        # text: decoding does not always reproduce the prompt string exactly.
        prompt_len = inputs["input_ids"].shape[1]
        return self._tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    def chat(self, message: str, system: str = "", max_tokens: int = 256, temperature: float = 0.7) -> str:
        """Send a chat message and get a response.

        Args:
            message: User message
            system: Optional system prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature; 0 selects greedy decoding on the
                transformers backend

        Returns:
            Model response text, stripped of surrounding whitespace
        """
        self._load()

        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": message})

        if self.backend == "transformers":
            prompt = self._tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            return self._generate_transformers(prompt, max_tokens, temperature)

        # GGUF: use the chat-completion API so the system prompt and the chat
        # formatting are applied, instead of sending the bare message to plain completion.
        response = self._model.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return (response["choices"][0]["message"]["content"] or "").strip()

    def generate(self, prompt: str, max_tokens: int = 256, temperature: float = 0.7) -> str:
        """Generate text from a raw prompt (no chat template).

        Args:
            prompt: Raw text prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature; 0 selects greedy decoding on the
                transformers backend

        Returns:
            Generated continuation, without the prompt, stripped of surrounding whitespace
        """
        self._load()

        if self.backend == "transformers":
            return self._generate_transformers(prompt, max_tokens, temperature)

        response = self._model(prompt, max_tokens=max_tokens, temperature=temperature, echo=False)
        return response["choices"][0]["text"].strip()

    def __repr__(self):
        return f"DispatchModel(name={self.model_name!r}, repo={self.repo_id!r}, backend={self.backend!r}, loaded={self._loaded})"


def load_model(model_name: str, backend: str = "transformers", token: Optional[str] = None) -> DispatchModel:
    """Create a handle for a dispatchAI mobile model.

    The returned model is loaded lazily on its first ``.chat()`` / ``.generate()`` call.

    Args:
        model_name: Model name, with or without the org prefix
            (e.g., "SmolLM2-135M-Instruct-mobile")
        backend: "transformers" (default) or "gguf" for llama.cpp
        token: Optional HuggingFace token for private/gated models

    Returns:
        DispatchModel ready for .chat() or .generate()

    Raises:
        ValueError: If ``backend`` is not a supported backend.

    Example:
        >>> from dispatchai import load_model
        >>> model = load_model("SmolLM2-135M-Instruct-mobile")
        >>> print(model.chat("What is 2+2?"))

        For GGUF/llama.cpp:
        >>> model = load_model("Llama-3.2-1B-Instruct-Q4-mobile", backend="gguf")
    """
    # Allow full repo_id or just the name
    prefix = f"{_ORG}/"
    if model_name.startswith(prefix):
        repo_id = model_name
        # Strip only the leading prefix, not every occurrence of it.
        model_name = model_name[len(prefix):]
    else:
        repo_id = f"{_ORG}/{model_name}"

    return DispatchModel(model_name, repo_id, backend=backend, token=token)
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
# ─── Catalog & utilities ──────────────────────────────────────────────────────
|
| 233 |
+
|
| 234 |
+
def list_models(task: Optional[str] = None) -> List[Dict[str, Any]]:
    """List available dispatchAI mobile models, smallest first.

    Args:
        task: Optional filter: "chat", "code", "math", "arabic", "function_call",
            "vision", "embedding". Matching is case-insensitive and treats "-" as "_".
            ``None`` or an empty string returns every model.

    Returns:
        List of model dicts (name, params, size_mb, ram_mb, task, quant, speed_tps)
        sorted by ``size_mb`` ascending. The dicts are copies, so modifying them does
        not affect the catalog.

    Example:
        >>> from dispatchai import list_models
        >>> for m in list_models("chat"):
        ...     print(f"{m['name']}: {m['size_mb']}MB, {m['speed_tps']} t/s")
    """
    # Normalise the filter once instead of once per catalog entry.
    wanted = task.lower().replace("-", "_") if task else None

    # Copy every entry: handing out the catalog's own dicts would let a caller
    # mutate module-level state shared by all later calls.
    matches = [dict(m) for m in _MODELS if wanted is None or m["task"] == wanted]
    return sorted(matches, key=lambda m: m["size_mb"])
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def _params_to_millions(label: str) -> float:
    """Convert a parameter-count label such as "135M" or "1.5B" to millions of parameters."""
    scale = {"K": 1e-3, "M": 1.0, "B": 1e3}
    text = label.strip().upper()
    if text and text[-1] in scale:
        return float(text[:-1]) * scale[text[-1]]
    return float(text)


def recommend(ram_mb: int = 2048, task: str = "chat", priority: str = "size") -> Dict[str, Any]:
    """Get a model recommendation for your device.

    Args:
        ram_mb: Available RAM in MB (e.g., 2048 for 2GB phone)
        task: Primary task: "chat", "code", "math", "arabic", "function_call",
            "vision", "embedding", or "any" to skip task filtering. Matching is
            case-insensitive and treats "-" as "_".
        priority: "size" (smallest), "speed" (fastest), or "quality" (largest params).
            Any other value keeps catalog order.

    Returns:
        Dict with a "recommended" model and up to three "alternatives", or
        ``{"error": ...}`` when no catalog model matches the RAM limit and task
        (including a task name that no model carries).

    Example:
        >>> from dispatchai import recommend
        >>> rec = recommend(ram_mb=2048, task="chat")
        >>> print(f"Best: {rec['recommended']['name']} ({rec['recommended']['size_mb']}MB)")
    """
    task_key = task.lower().replace("-", "_")

    # An unrecognised task now yields no candidates (and therefore the error below)
    # instead of silently disabling the task filter.
    candidates = [
        m for m in _MODELS
        if m["ram_mb"] <= ram_mb and (task_key == "any" or m["task"] == task_key)
    ]

    if not candidates:
        return {"error": f"No models fit in {ram_mb}MB RAM for task '{task}'"}

    # (sort key, descending?) per priority.
    orderings = {
        "size": (lambda m: m["size_mb"], False),
        "speed": (lambda m: m["speed_tps"], True),
        # Compare parameter counts numerically: as plain strings "500M" sorts above "3B".
        "quality": (lambda m: _params_to_millions(m["params"]), True),
    }
    if priority in orderings:
        sort_key, descending = orderings[priority]
        candidates.sort(key=sort_key, reverse=descending)

    best = candidates[0]
    return {
        "recommended": {
            "name": best["name"],
            "repo_id": f"{_ORG}/{best['name']}",
            "url": f"https://huggingface.co/{_ORG}/{best['name']}",
            "params": best["params"],
            "size_mb": best["size_mb"],
            "ram_mb": best["ram_mb"],
            "quant": best["quant"],
            "speed_tps": best["speed_tps"],
        },
        "alternatives": [
            {"name": m["name"], "size_mb": m["size_mb"], "speed_tps": m["speed_tps"]}
            for m in candidates[1:4]
        ],
    }
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
def estimate_latency(params: str, quant: str = "Q4_K_M", hardware: str = "snapdragon_865") -> Dict[str, Any]:
    """Estimate on-device inference latency.

    Args:
        params: Parameter count: "135M", "500M", "1B", "1.5B", "3B", etc. (case-insensitive)
        quant: Quantization: "FP16", "Q4_K_M", "Q5_K_M", "Q8_0", "INT4" (case-insensitive)
        hardware: Target hardware: "snapdragon_865", "snapdragon_8_gen_2",
            "snapdragon_8_gen_3", "apple_a17", "apple_m2", "snapdragon_778g".
            Case-insensitive; spaces and "-" are treated as "_".

    Returns:
        On success, a dict with:
            params, quant, hardware: the arguments as passed in
            tokens_per_sec: estimated throughput, rounded to 1 decimal
            latency_ms_per_token: 1000 / throughput, rounded to a whole number
            suitable_for_realtime: True when throughput is above 10 tokens/sec
            suitable_for_phone: True when throughput is above 2 tokens/sec
            quant_known: False when ``quant`` is not listed for this size and the
                default baseline of 10.0 tokens/sec was used instead
            hardware_known: False when ``hardware`` is not listed and a 1.0x
                multiplier was used instead
        If ``params`` is not a known size, ``{"error": ...}`` listing the valid sizes.

    Example:
        >>> from dispatchai import estimate_latency
        >>> lat = estimate_latency("1B", "Q4_K_M")
        >>> print(f"{lat['tokens_per_sec']} t/s, {lat['latency_ms_per_token']}ms/token")
    """
    # Throughput multipliers relative to the snapdragon_865 baseline.
    hw_multipliers = {
        "snapdragon_865": 1.0,
        "snapdragon_8_gen_2": 1.8,
        "snapdragon_8_gen_3": 2.2,
        "apple_a17": 2.5,
        "apple_m2": 3.0,
        "snapdragon_778g": 0.7,
    }

    params_key = params.upper()
    quant_key = quant.upper()
    hardware_key = hardware.lower().replace("-", "_").replace(" ", "_")

    if params_key not in _LATENCY_DB:
        return {"error": f"Unknown params: {params}. Valid: {list(_LATENCY_DB.keys())}"}

    by_quant = _LATENCY_DB[params_key]
    # The fallbacks are kept for backward compatibility, but they are reported via
    # quant_known / hardware_known so callers can tell a guess from a table lookup.
    base_tps = by_quant.get(quant_key, 10.0)
    hw_mult = hw_multipliers.get(hardware_key, 1.0)
    actual_tps = base_tps * hw_mult

    return {
        "params": params,
        "quant": quant,
        "hardware": hardware,
        "tokens_per_sec": round(actual_tps, 1),
        "latency_ms_per_token": round(1000 / actual_tps, 0),
        "suitable_for_realtime": actual_tps > 10,
        "suitable_for_phone": actual_tps > 2,
        "quant_known": quant_key in by_quant,
        "hardware_known": hardware_key in hw_multipliers,
    }
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
def calculate_cost(
    daily_queries: int,
    cloud_cost_per_1k: float = 0.50,
    days: int = 365,
    device_cost: float = 0.50,
) -> Dict[str, float]:
    """Compare cloud API vs on-device inference costs.

    Args:
        daily_queries: Number of AI queries per day
        cloud_cost_per_1k: Cloud API cost per 1000 queries
        days: Time period in days
        device_cost: One-time on-device cost (e.g. the model download) for the
            whole period. Defaults to 0.50.

    Returns:
        Dict with:
            cloud_cost: total cloud cost over ``days``
            device_cost: the one-time on-device cost
            savings: cloud_cost minus device_cost (negative when cloud is cheaper)
            savings_pct: savings as a percentage of cloud cost; 0.0 when cloud cost is 0
            daily_cloud_cost: cloud cost for a single day
        Money values are rounded to 2 decimals, the percentage to 1.

    Raises:
        ValueError: If any argument is negative.

    Example:
        >>> from dispatchai import calculate_cost
        >>> result = calculate_cost(daily_queries=10000, cloud_cost_per_1k=0.50)
        >>> print(f"Save ${result['savings']:.0f}/year with on-device")
    """
    if daily_queries < 0 or cloud_cost_per_1k < 0 or days < 0 or device_cost < 0:
        raise ValueError(
            "daily_queries, cloud_cost_per_1k, days and device_cost must be non-negative"
        )

    daily_cloud = (daily_queries / 1000) * cloud_cost_per_1k
    cloud_total = daily_cloud * days

    # Guard the division: with no cloud spend there is nothing to express savings against.
    if cloud_total > 0:
        savings_pct = round((1 - device_cost / cloud_total) * 100, 1)
    else:
        savings_pct = 0.0

    return {
        "cloud_cost": round(cloud_total, 2),
        "device_cost": round(device_cost, 2),
        "savings": round(cloud_total - device_cost, 2),
        "savings_pct": savings_pct,
        "daily_cloud_cost": round(daily_cloud, 2),
    }
|
src/dispatchai/version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__version__ = "0.1.0"
|