Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

README.md +116 -0
pyproject.toml +60 -0
src/dispatchai/__init__.py +42 -0
src/dispatchai/core.py +382 -0
src/dispatchai/version.py +1 -0

README.md ADDED Viewed

	@@ -0,0 +1,116 @@

+# dispatchAI SDK
+**Small. Mobile. Free. UAE-built.**
+`pip install dispatchai` — Run mobile-optimized LLMs on your phone, edge device, or laptop. 39 models, all tested on real Snapdragon hardware, all free.
+## Quick Start
+```bash
+pip install dispatchai
+```
+### Chat with a model
+```python
+from dispatchai import load_model
+model = load_model("SmolLM2-135M-Instruct-mobile")
+response = model.chat("What is the capital of France?")
+print(response)
+```
+### Use GGUF/llama.cpp backend
+```python
+model = load_model("Llama-3.2-1B-Instruct-Q4-mobile", backend="gguf")
+print(model.chat("Write a haiku about the desert."))
+```
+### Find the best model for your phone
+```python
+from dispatchai import recommend
+rec = recommend(ram_mb=2048, task="chat")
+print(f"Best model: {rec['recommended']['name']}")
+print(f"Size: {rec['recommended']['size_mb']}MB")
+print(f"Speed: {rec['recommended']['speed_tps']} tokens/sec")
+```
+### List all models
+```python
+from dispatchai import list_models
+for m in list_models(task="chat"):
+    print(f"  {m['name']}: {m['size_mb']}MB, {m['speed_tps']} t/s")
+```
+### Estimate latency
+```python
+from dispatchai import estimate_latency
+lat = estimate_latency("1B", "Q4_K_M")
+print(f"{lat['tokens_per_sec']} tokens/sec on Snapdragon 865")
+```
+### Calculate cost savings
+```python
+from dispatchai import calculate_cost
+result = calculate_cost(daily_queries=10000, cloud_cost_per_1k=0.50)
+print(f"Annual savings: ${result['savings']}")
+```
+## Installation Options
+```bash
+pip install dispatchai                    # Core (model catalog, recommendations)
+pip install dispatchai[torch]             # + transformers/torch backend
+pip install dispatchai[gguf]              # + llama.cpp GGUF backend
+pip install dispatchai[full]              # + everything (torch, gguf, sentence-transformers)
+```
+## Available Models
+| Model | Params | Size | Speed | Task |
+|-------|--------|------|-------|------|
+| SmolLM2-135M-Instruct-mobile | 135M | 270MB | 25.5 t/s | Chat |
+| SmolLM2-360M-Instruct-mobile | 360M | 720MB | 21.0 t/s | Chat |
+| Qwen2.5-0.5B-Instruct-mobile-int4 | 500M | 350MB | 20.0 t/s | Chat |
+| Llama-3.2-1B-Instruct-Q4-mobile | 1B | 700MB | 18.2 t/s | Chat |
+| Llama-3.2-1B-FunctionCall-mobile | 1B | 2.5GB | 12.0 t/s | Function Call |
+| Qwen2.5-Coder-1.5B-mobile | 1.5B | 3.0GB | 10.5 t/s | Code |
+| Gemma-2B-Arabic-mobile | 2B | 5.0GB | 8.0 t/s | Arabic |
+| Llama-3.2-3B-Instruct-Q5-mobile | 3B | 2.1GB | 8.5 t/s | Chat |
+[Browse all 39 models →](https://huggingface.co/dispatchAI)
+## Hardware Targets
+All benchmarks measured on **Snapdragon 865 (Samsung S20 FE, 8GB RAM)** using llama.cpp.
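+The `estimate_latency()` function accepts the following `hardware` values (speed multiplier relative to the Snapdragon 865 baseline in parentheses):
+- `snapdragon_865` — Snapdragon 865 (baseline, 1.0x)
+- `snapdragon_8_gen_2` — Snapdragon 8 Gen 2 (1.8x)
+- `snapdragon_8_gen_3` — Snapdragon 8 Gen 3 (2.2x)
+- `apple_a17` — Apple A17 Pro (2.5x)
+- `apple_m2` — Apple M2 (3.0x)
+- `snapdragon_778g` — Snapdragon 778G mid-range (0.7x)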
+## The Thesis
+> *The best model is the one that runs.*
+We're building the AI layer for a billion phones that can't afford cloud inference. Every model is free, open-source, and tested on real hardware.
+## About
+Dispatch AI (FZE) — Sharjah Free Zone, UAE. License No. 10818.
+🌐 [dispatchai.ai](https://www.dispatchai.ai) | 🤗 [huggingface.co/dispatchAI](https://huggingface.co/dispatchAI) | 𝕏 [@DispatchAIdev](https://twitter.com/DispatchAIdev)
+*I think, therefore I ship.*

pyproject.toml ADDED Viewed

	@@ -0,0 +1,60 @@

+# dispatchAI Python SDK
+# pip install dispatchai
+#
+# Two-line inference with any dispatchAI mobile model:
+#   from dispatchai import load_model
+#   model = load_model("SmolLM2-135M-Instruct-mobile")
+[build-system]
+requires = ["setuptools>=64", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "dispatchai"
+version = "0.1.0"
+description = "dispatchAI — Mobile-optimized LLMs that run on your phone. Small. Mobile. Free. UAE-built."
+readme = "README.md"
+license = {text = "Apache-2.0"}
+requires-python = ">=3.8"
+authors = [
+    {name = "Dispatch AI (FZE)", email = "contact@dispatchai.ai"}
+]
+keywords = [
+    "mobile", "llm", "on-device", "edge", "quantized", "gguf",
+    "huggingface", "arabic", "small-models", "dispatchai"
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dependencies = [
+    "huggingface_hub>=0.20.0",
+    "requests>=2.28.0",
+]
+[project.optional-dependencies]
+torch = ["transformers>=4.40.0", "torch>=2.0.0", "accelerate>=0.20.0"]
+gguf = ["llama-cpp-python>=0.2.0"]
+full = ["transformers>=4.40.0", "torch>=2.0.0", "accelerate>=0.20.0", "llama-cpp-python>=0.2.0", "sentence-transformers>=2.5.0"]
+dev = ["pytest>=7.0", "pytest-cov", "ruff", "mypy"]
+[project.urls]
+Homepage = "https://huggingface.co/dispatchAI"
+Documentation = "https://huggingface.co/dispatchAI"
+Repository = "https://huggingface.co/dispatchAI/dispatchAI-SDK"
+"Bug Tracker" = "https://huggingface.co/dispatchAI/dispatchAI-SDK/discussions"
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.ruff]
+line-length = 100
+target-version = "py38"

src/dispatchai/__init__.py ADDED Viewed

	@@ -0,0 +1,42 @@

+"""
+dispatchAI SDK — Mobile-optimized LLMs that run on your phone.
+Small. Mobile. Free. UAE-built.
+Quick start:
+    pip install dispatchai
+    from dispatchai import load_model
+    model = load_model("SmolLM2-135M-Instruct-mobile")
+    print(model.chat("What is the capital of France?"))
+    # List available models
+    from dispatchai import list_models
+    for m in list_models():
+        print(m)
+    # Find the best model for your phone
+    from dispatchai import recommend
+    rec = recommend(ram_mb=2048, task="chat")
+    print(rec)
+"""
+from .core import (
+    load_model,
+    list_models,
+    recommend,
+    estimate_latency,
+    calculate_cost,
+    DispatchModel,
+)
+from .version import __version__
+__all__ = [
+    "load_model",
+    "list_models",
+    "recommend",
+    "estimate_latency",
+    "calculate_cost",
+    "DispatchModel",
+    "__version__",
+]

src/dispatchai/core.py ADDED Viewed

	@@ -0,0 +1,382 @@

+"""
+dispatchAI core module — model loading, inference, and utilities.
+"""
+import json
+import os
+from typing import Optional, List, Dict, Any
+# ─── Model catalog ───────────────────────────────────────────────────────────
# Hugging Face organisation that hosts every model in the catalog; used to
# build repo ids of the form "<_ORG>/<model name>".
_ORG = "dispatchAI"
# Static model catalog. Each entry is a flat dict with the keys:
#   name      - repository name without the organisation prefix
#   params    - parameter-count label such as "135M" or "1.5B" (a string)
#   size_mb   - model size in MB
#   ram_mb    - RAM in MB; recommend() keeps entries with ram_mb <= the caller's budget
#   task      - one of "chat", "code", "math", "arabic", "function_call",
#               "vision", "embedding"
#   quant     - quantization label ("FP16", "INT4", "Q4", "Q5", "Q6", "Q8")
#   speed_tps - speed in tokens per second
_MODELS = [
    {"name": "SmolLM2-135M-Instruct-mobile", "params": "135M", "size_mb": 270, "ram_mb": 400, "task": "chat", "quant": "FP16", "speed_tps": 25.5},
    {"name": "SmolLM2-360M-Instruct-mobile", "params": "360M", "size_mb": 720, "ram_mb": 700, "task": "chat", "quant": "FP16", "speed_tps": 21.0},
    {"name": "Qwen2.5-0.5B-Instruct-mobile-int4", "params": "500M", "size_mb": 350, "ram_mb": 550, "task": "chat", "quant": "INT4", "speed_tps": 20.0},
    {"name": "Qwen2.5-0.5B-Coder-mobile", "params": "500M", "size_mb": 1000, "ram_mb": 1500, "task": "code", "quant": "FP16", "speed_tps": 20.0},
    {"name": "Llama-3.2-1B-Instruct-mobile", "params": "1B", "size_mb": 2500, "ram_mb": 3000, "task": "chat", "quant": "FP16", "speed_tps": 12.0},
    {"name": "Llama-3.2-1B-Instruct-Q4-mobile", "params": "1B", "size_mb": 700, "ram_mb": 1100, "task": "chat", "quant": "Q4", "speed_tps": 18.2},
    {"name": "Llama-3.2-1B-Instruct-Q6-mobile", "params": "1B", "size_mb": 1100, "ram_mb": 1300, "task": "chat", "quant": "Q6", "speed_tps": 16.8},
    {"name": "Llama-3.2-1B-FunctionCall-mobile", "params": "1B", "size_mb": 2500, "ram_mb": 3000, "task": "function_call", "quant": "FP16", "speed_tps": 12.0},
    {"name": "TinyLlama-1.1B-Chat-Q5-mobile", "params": "1.1B", "size_mb": 800, "ram_mb": 1200, "task": "chat", "quant": "Q5", "speed_tps": 17.5},
    {"name": "MiniCPM5-1B-mobile", "params": "1B", "size_mb": 2500, "ram_mb": 3000, "task": "chat", "quant": "FP16", "speed_tps": 12.0},
    {"name": "Qwen2.5-Coder-1.5B-mobile", "params": "1.5B", "size_mb": 3000, "ram_mb": 4000, "task": "code", "quant": "FP16", "speed_tps": 10.5},
    {"name": "Qwen2.5-Math-1.5B-mobile", "params": "1.5B", "size_mb": 3000, "ram_mb": 4000, "task": "math", "quant": "FP16", "speed_tps": 10.5},
    {"name": "Qwen2.5-1.5B-Instruct-Q5-mobile", "params": "1.5B", "size_mb": 1100, "ram_mb": 1700, "task": "chat", "quant": "Q5", "speed_tps": 14.5},
    {"name": "Qwen2.5-1.5B-Instruct-Q8-mobile", "params": "1.5B", "size_mb": 1600, "ram_mb": 2200, "task": "chat", "quant": "Q8", "speed_tps": 13.0},
    {"name": "Gemma-2-2B-IT-Q5-mobile", "params": "2B", "size_mb": 1500, "ram_mb": 2200, "task": "chat", "quant": "Q5", "speed_tps": 12.0},
    {"name": "Gemma-2B-Arabic-mobile", "params": "2B", "size_mb": 5000, "ram_mb": 5500, "task": "arabic", "quant": "FP16", "speed_tps": 8.0},
    {"name": "Llama-3.2-3B-Instruct-Q5-mobile", "params": "3B", "size_mb": 2100, "ram_mb": 2700, "task": "chat", "quant": "Q5", "speed_tps": 8.5},
    {"name": "Llama-3.2-3B-FunctionCall-mobile", "params": "3B", "size_mb": 6000, "ram_mb": 7000, "task": "function_call", "quant": "FP16", "speed_tps": 5.5},
    {"name": "Phi-3.5-mini-instruct-Q5-mobile", "params": "3.8B", "size_mb": 2800, "ram_mb": 3200, "task": "chat", "quant": "Q5", "speed_tps": 7.5},
    {"name": "Moondream2-Vision-Q5-mobile", "params": "1.9B", "size_mb": 1400, "ram_mb": 2000, "task": "vision", "quant": "Q5", "speed_tps": 8.5},
    {"name": "EmbeddingGemma-300M-Q8-mobile", "params": "300M", "size_mb": 300, "ram_mb": 500, "task": "embedding", "quant": "Q8", "speed_tps": 22.0},
    {"name": "Qwen3-Embedding-0.6B-Q8-mobile", "params": "600M", "size_mb": 600, "ram_mb": 800, "task": "embedding", "quant": "Q8", "speed_tps": 18.0},
]
# Baseline tokens-per-second table: parameter-size label -> quantization
# label -> tokens/sec. estimate_latency() multiplies these values by a
# per-hardware factor, with "snapdragon_865" as the 1.0 baseline.
# Not every quantization exists for every size; estimate_latency() falls back
# to 10.0 for a missing quantization.
# NOTE(review): the short aliases ("Q4", "Q5", "Q6", "Q8", "INT4") appear to
# mirror the catalog's quant labels and speed_tps values, which is why e.g.
# "1.9B"/"Q5" differs from "1.9B"/"Q5_K_M" — confirm this is intended.
_LATENCY_DB = {
    "135M": {"FP16": 25.5, "Q8_0": 28.2, "Q5_K_M": 30.1, "Q4_K_M": 32.0, "Q2_K": 35.0},
    "300M": {"FP16": 22.0, "Q8_0": 24.5, "Q5_K_M": 26.0, "Q4_K_M": 27.5, "Q2_K": 29.5},
    "500M": {"FP16": 20.0, "Q8_0": 24.0, "Q5_K_M": 25.5, "Q4_K_M": 26.8, "INT4": 20.0},
    "600M": {"FP16": 18.0, "Q8_0": 21.0, "Q5_K_M": 22.5, "Q4_K_M": 23.8},
    "1B": {"FP16": 12.0, "Q8_0": 15.5, "Q5_K_M": 17.5, "Q4_K_M": 18.2, "Q5": 17.5, "Q4": 18.2, "Q6": 16.8},
    "1.1B": {"FP16": 11.5, "Q8_0": 14.8, "Q5_K_M": 17.0, "Q5": 17.5},
    "1.5B": {"FP16": 10.5, "Q8_0": 13.0, "Q5_K_M": 14.5, "Q5": 14.5, "Q8": 13.0},
    "1.9B": {"FP16": 8.5, "Q8_0": 11.0, "Q5_K_M": 12.5, "Q5": 8.5},
    "2B": {"FP16": 8.0, "Q8_0": 10.5, "Q5_K_M": 12.0, "Q5": 12.0},
    "3B": {"FP16": 5.5, "Q8_0": 7.0, "Q5_K_M": 8.5, "Q5": 8.5},
    "3.8B": {"FP16": 4.5, "Q8_0": 6.0, "Q5_K_M": 7.5, "Q5": 7.5},
    "7B": {"FP16": 2.5, "Q8_0": 3.5, "Q5_K_M": 4.5},
}
+# ─── Model loading & inference ───────────────────────────────────────────────
class DispatchModel:
    """A lazily-loaded dispatchAI model ready for inference.

    Construction is cheap: no backend library is imported and nothing is
    downloaded until the first call to :meth:`chat` or :meth:`generate`.

    Args:
        model_name: Model name without the organisation prefix.
        repo_id: Full Hugging Face repository id (``"org/name"``).
        backend: ``"transformers"`` (default) or ``"gguf"`` (llama.cpp).
        token: Optional Hugging Face access token, forwarded to every
            download so private/gated repositories can be loaded.

    Raises:
        ValueError: If ``backend`` is not one of the supported backends.

    Example:
        from dispatchai import load_model
        model = load_model("SmolLM2-135M-Instruct-mobile")
        print(model.chat("Hello!"))
    """

    _BACKENDS = ("transformers", "gguf")

    def __init__(
        self,
        model_name: str,
        repo_id: str,
        backend: str = "transformers",
        token: Optional[str] = None,
    ):
        # Fail at construction time rather than silently returning "" from
        # chat()/generate() later on.
        if backend not in self._BACKENDS:
            raise ValueError(
                f"Unknown backend {backend!r}. Valid: {list(self._BACKENDS)}"
            )
        self.model_name = model_name
        self.repo_id = repo_id
        self.backend = backend
        self._token = token
        self._model = None
        self._tokenizer = None
        self._loaded = False

    def _load(self) -> None:
        """Import the backend and load the weights on first use.

        Raises:
            ImportError: If the optional dependency for the backend is missing.
            ValueError: If ``self.backend`` was changed to an unsupported value.
        """
        if self._loaded:
            return
        if self.backend == "transformers":
            try:
                import torch
                from transformers import AutoModelForCausalLM, AutoTokenizer
            except ImportError as err:
                raise ImportError(
                    "transformers backend requires: pip install dispatchai[torch]\n"
                    "Or use GGUF backend: load_model(..., backend='gguf')"
                ) from err
            use_cuda = torch.cuda.is_available()
            self._tokenizer = AutoTokenizer.from_pretrained(
                self.repo_id, token=self._token
            )
            self._model = AutoModelForCausalLM.from_pretrained(
                self.repo_id,
                token=self._token,
                # Half precision only pays off on GPU; CPUs run float32.
                torch_dtype=torch.float16 if use_cuda else torch.float32,
                device_map="auto" if use_cuda else None,
            )
        elif self.backend == "gguf":
            try:
                from llama_cpp import Llama
            except ImportError as err:
                raise ImportError(
                    "GGUF backend requires: pip install dispatchai[gguf]"
                ) from err
            from huggingface_hub import hf_hub_download

            gguf_path = hf_hub_download(
                self.repo_id, "model.gguf", token=self._token
            )
            self._model = Llama(
                model_path=gguf_path, n_ctx=512, n_threads=4, verbose=False
            )
        else:
            raise ValueError(
                f"Unknown backend {self.backend!r}. Valid: {list(self._BACKENDS)}"
            )
        self._loaded = True

    def _generate_transformers(
        self,
        text: str,
        max_tokens: int,
        temperature: float,
        add_special_tokens: bool = True,
    ) -> str:
        """Run ``generate`` on ``text`` and return only the newly produced text."""
        import torch

        inputs = self._tokenizer(
            text, return_tensors="pt", add_special_tokens=add_special_tokens
        )
        # Follow the model's actual device instead of assuming "cuda".
        inputs = {k: v.to(self._model.device) for k, v in inputs.items()}
        gen_kwargs = {
            "max_new_tokens": max_tokens,
            "pad_token_id": self._tokenizer.eos_token_id,
        }
        if temperature > 0:
            gen_kwargs["do_sample"] = True
            gen_kwargs["temperature"] = temperature
        else:
            # Greedy decoding: temperature is meaningless and must not be passed.
            gen_kwargs["do_sample"] = False
        with torch.no_grad():
            outputs = self._model.generate(**inputs, **gen_kwargs)
        # Slice by token count: decoded text does not always round-trip the
        # prompt character-for-character, so a character slice is unreliable.
        prompt_len = inputs["input_ids"].shape[1]
        return self._tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()

    def chat(self, message: str, system: str = "", max_tokens: int = 256, temperature: float = 0.7) -> str:
        """Send a chat message and get a response.

        The model's chat template is applied on both backends, and the system
        prompt (when given) is honoured on both.

        Args:
            message: User message.
            system: Optional system prompt.
            max_tokens: Maximum tokens to generate.
            temperature: Sampling temperature; ``0`` selects greedy decoding.

        Returns:
            Model response text, stripped of surrounding whitespace.
        """
        self._load()
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": message})
        if self.backend == "transformers":
            prompt = self._tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            # The template already contains the special tokens; adding them
            # again when tokenizing would duplicate e.g. the BOS token.
            return self._generate_transformers(
                prompt, max_tokens, temperature, add_special_tokens=False
            )
        result = self._model.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        content = result["choices"][0]["message"]["content"]
        return (content or "").strip()

    def generate(self, prompt: str, max_tokens: int = 256, temperature: float = 0.7) -> str:
        """Generate text from a raw prompt (no chat template).

        Args:
            prompt: Raw text prompt.
            max_tokens: Maximum tokens to generate.
            temperature: Sampling temperature; ``0`` selects greedy decoding.

        Returns:
            The generated continuation only (the prompt is not echoed).
        """
        self._load()
        if self.backend == "transformers":
            return self._generate_transformers(prompt, max_tokens, temperature)
        response = self._model(
            prompt, max_tokens=max_tokens, temperature=temperature, echo=False
        )
        return response["choices"][0]["text"].strip()

    def __repr__(self):
        # The access token is deliberately left out of the representation.
        return f"DispatchModel(name={self.model_name!r}, repo={self.repo_id!r}, backend={self.backend!r}, loaded={self._loaded})"


def load_model(model_name: str, backend: str = "transformers", token: Optional[str] = None) -> DispatchModel:
    """Create a lazily-loaded dispatchAI model handle.

    Nothing is downloaded here; weights are fetched on the first call to
    ``.chat()`` or ``.generate()``.

    Args:
        model_name: Model name, with or without the organisation prefix
            (e.g. ``"SmolLM2-135M-Instruct-mobile"``).
        backend: ``"transformers"`` (default) or ``"gguf"`` for llama.cpp.
        token: Optional Hugging Face token for private/gated models.

    Returns:
        DispatchModel ready for ``.chat()`` or ``.generate()``.

    Raises:
        ValueError: If ``backend`` is not supported.

    Example:
        >>> from dispatchai import load_model
        >>> model = load_model("SmolLM2-135M-Instruct-mobile")
        >>> print(model.chat("What is 2+2?"))
    For GGUF/llama.cpp:
        >>> model = load_model("Llama-3.2-1B-Instruct-Q4-mobile", backend="gguf")
    """
    prefix = f"{_ORG}/"
    if model_name.startswith(prefix):
        # Full repo id given: strip only the leading prefix to get the name.
        repo_id = model_name
        model_name = model_name[len(prefix):]
    else:
        repo_id = prefix + model_name
    return DispatchModel(model_name, repo_id, backend=backend, token=token)
+# ─── Catalog & utilities ─────────────────────────────────────────────────────
def list_models(task: Optional[str] = None) -> List[Dict[str, Any]]:
    """List available dispatchAI mobile models, smallest first.

    Args:
        task: Optional filter — "chat", "code", "math", "arabic",
            "function_call", "vision", "embedding". Matching ignores case and
            treats "-" as "_" (so "Function-Call" works).

    Returns:
        A new list of model dicts (name, params, size_mb, ram_mb, task, quant,
        speed_tps) sorted by ``size_mb``. Each dict is a copy, so callers may
        modify the result without affecting the catalog.

    Example:
        >>> from dispatchai import list_models
        >>> for m in list_models("chat"):
        ...     print(f"{m['name']}: {m['size_mb']}MB, {m['speed_tps']} t/s")
    """
    wanted = task.lower().replace("-", "_") if task else None
    # Copy each entry: handing out the catalog's own dicts would let a caller
    # corrupt the shared module-level catalog by editing a result.
    entries = [
        dict(m) for m in _MODELS if wanted is None or m["task"] == wanted
    ]
    entries.sort(key=lambda m: m["size_mb"])
    return entries
def _param_count(params: str) -> float:
    """Convert a size label such as "135M" or "1.5B" into a parameter count."""
    scale = {"K": 1e3, "M": 1e6, "B": 1e9}
    label = params.strip().upper()
    if label and label[-1] in scale:
        return float(label[:-1]) * scale[label[-1]]
    return float(label)


def recommend(ram_mb: int = 2048, task: str = "chat", priority: str = "size") -> Dict[str, Any]:
    """Get a model recommendation for your device.

    Args:
        ram_mb: Available RAM in MB (e.g., 2048 for a 2GB phone).
        task: Primary task — "chat", "code", "math", "arabic", "function_call",
            "vision", "embedding", or "any" for no task filter. Matching
            ignores case and treats "-" as "_".
        priority: "size" (smallest download first), "speed" (highest
            tokens/sec first), or "quality" (most parameters first).

    Returns:
        On success, a dict with ``"recommended"`` (name, repo_id, url, params,
        size_mb, ram_mb, quant, speed_tps) and ``"alternatives"`` (up to three
        runner-up models). On failure — unknown task, unknown priority, or no
        model fitting the RAM budget — a dict with a single ``"error"`` key.

    Example:
        >>> from dispatchai import recommend
        >>> rec = recommend(ram_mb=2048, task="chat")
        >>> print(f"Best: {rec['recommended']['name']} ({rec['recommended']['size_mb']}MB)")
    """
    valid_tasks = (
        "chat", "code", "math", "arabic", "function_call", "vision", "embedding",
    )
    # Sort keys are negated for descending order so ties keep catalog order.
    sort_keys = {
        "size": lambda m: m["size_mb"],
        "speed": lambda m: -m["speed_tps"],
        # Compare numerically: as strings, "500M" would outrank "3B".
        "quality": lambda m: -_param_count(m["params"]),
    }

    task_key = task.lower().replace("-", "_")
    if task_key != "any" and task_key not in valid_tasks:
        # An unknown task must not silently fall back to "any".
        return {"error": f"Unknown task '{task}'. Valid: {list(valid_tasks) + ['any']}"}
    if priority not in sort_keys:
        return {"error": f"Unknown priority '{priority}'. Valid: {list(sort_keys)}"}

    candidates = [
        m for m in _MODELS
        if m["ram_mb"] <= ram_mb and (task_key == "any" or m["task"] == task_key)
    ]
    if not candidates:
        return {"error": f"No models fit in {ram_mb}MB RAM for task '{task}'"}

    ranked = sorted(candidates, key=sort_keys[priority])
    best = ranked[0]
    return {
        "recommended": {
            "name": best["name"],
            "repo_id": f"{_ORG}/{best['name']}",
            "url": f"https://huggingface.co/{_ORG}/{best['name']}",
            "params": best["params"],
            "size_mb": best["size_mb"],
            "ram_mb": best["ram_mb"],
            "quant": best["quant"],
            "speed_tps": best["speed_tps"],
        },
        "alternatives": [
            {"name": m["name"], "size_mb": m["size_mb"], "speed_tps": m["speed_tps"]}
            for m in ranked[1:4]
        ],
    }
def estimate_latency(params: str, quant: str = "Q4_K_M", hardware: str = "snapdragon_865") -> Dict[str, Any]:
    """Estimate on-device inference speed from the built-in benchmark table.

    Args:
        params: Parameter count — "135M", "500M", "1B", "1.5B", "3B", etc.
            (case-insensitive).
        quant: Quantization — "FP16", "Q4_K_M", "Q5_K_M", "Q8_0", "INT4", ...
            (case-insensitive). Not every quantization is benchmarked for
            every size; the default "Q4_K_M" is missing for some sizes.
        hardware: Target hardware — "snapdragon_865" (baseline),
            "snapdragon_8_gen_2", "snapdragon_8_gen_3", "apple_a17",
            "apple_m2", "snapdragon_778g". Case, spaces and hyphens are
            normalised.

    Returns:
        On success, a dict with ``params``, ``quant``, ``hardware`` (echoed as
        given), ``tokens_per_sec``, ``latency_ms_per_token``,
        ``suitable_for_realtime`` (> 10 t/s) and ``suitable_for_phone``
        (> 2 t/s). If the size, quantization or hardware is unknown, a dict
        with a single ``"error"`` key listing the valid values.

    Example:
        >>> from dispatchai import estimate_latency
        >>> lat = estimate_latency("1B", "Q4_K_M")
        >>> print(f"{lat['tokens_per_sec']} t/s, {lat['latency_ms_per_token']}ms/token")
    """
    # Speed relative to the Snapdragon 865 baseline measurements.
    hw_multipliers = {
        "snapdragon_865": 1.0,
        "snapdragon_8_gen_2": 1.8,
        "snapdragon_8_gen_3": 2.2,
        "apple_a17": 2.5,
        "apple_m2": 3.0,
        "snapdragon_778g": 0.7,
    }
    size_key = params.upper()
    quant_key = quant.upper()
    hw_key = hardware.lower().replace("-", "_").replace(" ", "_")

    if size_key not in _LATENCY_DB:
        return {"error": f"Unknown params: {params}. Valid: {list(_LATENCY_DB.keys())}"}
    by_quant = _LATENCY_DB[size_key]
    if quant_key not in by_quant:
        # Report the gap instead of inventing a number for an unbenchmarked
        # combination.
        return {"error": f"Unknown quant: {quant} for params {params}. Valid: {list(by_quant.keys())}"}
    if hw_key not in hw_multipliers:
        return {"error": f"Unknown hardware: {hardware}. Valid: {list(hw_multipliers.keys())}"}

    tokens_per_sec = by_quant[quant_key] * hw_multipliers[hw_key]
    return {
        "params": params,
        "quant": quant,
        "hardware": hardware,
        "tokens_per_sec": round(tokens_per_sec, 1),
        "latency_ms_per_token": round(1000 / tokens_per_sec, 0),
        "suitable_for_realtime": tokens_per_sec > 10,
        "suitable_for_phone": tokens_per_sec > 2,
    }
def calculate_cost(
    daily_queries: int,
    cloud_cost_per_1k: float = 0.50,
    days: int = 365,
    device_cost: float = 0.50,
) -> Dict[str, float]:
    """Compare cloud API vs on-device inference costs.

    Args:
        daily_queries: Number of AI queries per day.
        cloud_cost_per_1k: Cloud API cost per 1000 queries.
        days: Time period in days.
        device_cost: One-time cost of the on-device option (the model
            download), charged once regardless of ``days``.

    Returns:
        Dict with ``cloud_cost``, ``device_cost``, ``savings``,
        ``savings_pct`` (0 when the cloud cost is not positive) and
        ``daily_cloud_cost``; monetary values are rounded to 2 decimals.

    Example:
        >>> from dispatchai import calculate_cost
        >>> result = calculate_cost(daily_queries=10000, cloud_cost_per_1k=0.50)
        >>> print(f"Save ${result['savings']:.0f}/year with on-device")
    """
    daily_cloud = (daily_queries / 1000) * cloud_cost_per_1k
    cloud_total = daily_cloud * days
    if cloud_total > 0:
        savings_pct = round((1 - device_cost / cloud_total) * 100, 1)
    else:
        # A percentage of zero (or negative) spend is undefined.
        savings_pct = 0
    return {
        "cloud_cost": round(cloud_total, 2),
        "device_cost": round(device_cost, 2),
        "savings": round(cloud_total - device_cost, 2),
        "savings_pct": savings_pct,
        "daily_cloud_cost": round(daily_cloud, 2),
    }

src/dispatchai/version.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ __version__ = "0.1.0"