# Hugging Face Space app file (revision 3b7ea55, file size 5,462 bytes).
# The Space page reported "Runtime error" when this copy was captured.
import gradio as gr
import spaces
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Hub repository id handed to both `from_pretrained` calls in load_model().
MODEL_ID = "dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile"

# Process-wide cache for the loaded objects. Both stay None until
# load_model() fills them in on first use.
tokenizer, model = None, None
def load_model():
    """Return the shared ``(tokenizer, model)`` pair, loading it on first use.

    The loaded objects are cached in the module-level ``tokenizer`` and
    ``model`` globals, so only the first successful call pays the load cost.

    The globals are assigned only after *both* ``from_pretrained`` calls have
    succeeded. If either load raises, the cache stays empty and the next call
    retries, instead of handing back a tokenizer paired with ``model=None``.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        Any exception raised by ``AutoTokenizer.from_pretrained`` or
        ``AutoModelForCausalLM.from_pretrained`` propagates unchanged.
    """
    global tokenizer, model
    if tokenizer is None or model is None:
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        # NOTE(review): the About tab tells users to download `model.gguf`
        # from this same repo. If the repo ships only GGUF weights,
        # `from_pretrained` presumably needs `gguf_file=...` here - confirm
        # against the repo contents.
        loaded_model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        # Publish both halves together so the cache is never half-initialised.
        tokenizer, model = loaded_tokenizer, loaded_model
    return tokenizer, model
def _content_text(content) -> str:
    """Return the plain text carried by one history message's ``content``."""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # NOTE(review): assumes structured content is a list of parts whose
        # text lives under a "text" key - confirm against the installed
        # Gradio version. Parts without string text are skipped.
        return "".join(
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        )
    return ""


def _history_to_messages(history) -> list:
    """Convert chat history into ``{"role", "content"}`` chat messages.

    Accepts both history shapes: ``[user, assistant]`` pairs and
    role/content dicts. ``None`` or an empty history yields an empty list.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            text = _content_text(turn.get("content"))
            # Only user/assistant turns with text are replayed to the model.
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
            continue
        # Pair-style turn: the user text is always replayed, the assistant
        # reply only when it is non-empty.
        messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    return messages


@spaces.GPU
def agent_respond(task: str, history: list) -> str:
    """A mobile-optimized agent that can answer questions, write code, and solve tasks.

    Powered by dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile - a 1B parameter model
    quantized to Q4, designed to run on phones.

    Args:
        task: The new user message to answer.
        history: Earlier turns of the conversation, either as
            ``[user, assistant]`` pairs or as ``{"role", "content"}`` dicts.
            May be empty or ``None``.

    Returns:
        The text of the model's reply, without the prompt.
    """
    tokenizer, model = load_model()

    messages = [
        {
            "role": "system",
            "content": "You are a helpful mobile AI assistant. You are running on a 1B parameter model optimized for phones. Be concise and helpful.",
        }
    ]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": task})

    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The rendered template already carries its own special tokens, so the
    # tokenizer must not add another set (a second BOS) on top of them.
    inputs = tokenizer(
        input_text, return_tensors="pt", add_special_tokens=False
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + completion; drop the prompt tokens so only
    # the newly generated reply is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response
@spaces.GPU
def agent_code(instruction: str) -> str:
    """Generate Python code using a mobile-optimized model.

    The prompt ends with an opening ```python fence, so the model's
    completion is the body of that code block.

    Args:
        instruction: Plain-language description of the code to write.

    Returns:
        The generated code up to the first closing fence (or the whole
        completion if the model never closes the block), stripped of
        surrounding whitespace.
    """
    tokenizer, model = load_model()

    prompt = f"Write Python code for: {instruction}\n\n```python\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.3,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Decoding the prompt as well and
    # then searching for the first "```python" breaks when the instruction
    # itself contains a code fence.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # The prompt already opened the fence, so everything up to the next fence
    # marker is the code.
    code = completion.split("```")[0]
    return code.strip()
# Markdown for the landing section and the About tab.
#
# Two fixes relative to the previous text:
#   * mis-encoded characters (em dashes, emoji) are restored;
#   * block elements are separated by blank lines - without them paragraphs
#     merge and the text directly above `---` is read as a setext heading.
_INTRO_MD = """
# 🤖 dispatchAI Mobile Agent

**A real AI agent running on a 1B parameter model — small enough for your pocket.**

Model: [Llama-3.2-1B-Instruct-Q4-mobile](https://huggingface.co/dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile)

This agent runs on a model quantized to Q4 (700MB file size), designed to run on
Snapdragon 865 phones. It can answer questions, write code, and solve tasks —
all on a model 1/100th the size of GPT-4.

## Try It

- **Chat**: Ask the agent anything
- **Code**: Ask it to write Python code

## The Point

This isn't about matching GPT-4. It's about proving that a 1B model on a phone
can be genuinely useful. For the tasks people actually do on phones — quick answers,
code snippets, summaries, classifications — a 1B model is enough.
"""

# NOTE(review): the glyph that opens the footer line ("π") is the remnant of
# an emoji that cannot be recovered from this copy - restore it from the
# original file.
_ABOUT_MD = """
## How This Works

This Space runs a **1 billion parameter Llama-3.2 model** quantized to 4-bit.

| Metric | Value |
|--------|-------|
| Model | Llama-3.2-1B-Instruct |
| Params | 1B |
| Quantization | Q4 (4-bit) |
| File size | 700MB |
| RAM needed | ~1.1GB |
| Speed on Snapdragon 865 | ~18 tokens/sec |
| Speed on this Space (ZeroGPU) | Faster |

## Run This On Your Phone

```bash
# Download the GGUF
hf download dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile model.gguf
# Run with llama.cpp
llama-cli -m model.gguf -p "Hello!" -n 100 -t 4
```

## The Thesis

> A 1B model on a phone is not a compromise. It's a victory.

6.8 billion smartphones. Most can't run a cloud LLM. But they CAN run a 1B model
at 18 tokens/sec. That's fast enough for real-time chat, code completion,
summarization, and classification.

---

π [dispatchAI](https://huggingface.co/dispatchAI) — Small. Mobile. Free. UAE-built.
"""

# Three-tab UI: a chat tab backed by agent_respond, a code-generation tab
# backed by agent_code, and a static About page.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="blue"),
    title="dispatchAI Mobile Agent",
) as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Tab("💬 Chat"):
        chat = gr.ChatInterface(
            fn=agent_respond,
            title="Chat with a 1B Mobile Agent",
            description="Powered by Llama-3.2-1B-Instruct-Q4-mobile (700MB)",
        )

    with gr.Tab("👨‍💻 Code"):
        code_input = gr.Textbox(
            label="What should I code?",
            placeholder="A function that reverses a string",
        )
        code_btn = gr.Button("Generate Code", variant="primary")
        code_output = gr.Code(label="Generated Code", language="python")
        code_btn.click(fn=agent_code, inputs=code_input, outputs=code_output)

    with gr.Tab("ℹ️ About"):
        gr.Markdown(_ABOUT_MD)

# Start the web app with the `mcp_server` launch option enabled.
demo.launch(mcp_server=True)
|