File size: 5,462 Bytes
3b7ea55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import gradio as gr
import spaces
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Repository id handed to from_pretrained() for both the tokenizer and the
# model weights.
MODEL_ID = "dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile"

# Module-level cache filled on first use by load_model(); None means
# "not loaded yet".
tokenizer = model = None

def load_model():
    """Return the shared ``(tokenizer, model)`` pair, loading it on first use.

    Both objects are cached in the module-level ``tokenizer`` and ``model``
    globals, so later calls reuse them instead of loading again.

    Returns:
        tuple: ``(tokenizer, model)`` as produced by
        ``AutoTokenizer.from_pretrained`` and
        ``AutoModelForCausalLM.from_pretrained`` for ``MODEL_ID``.
    """
    global tokenizer, model
    # Check each half of the cache on its own: if an earlier call loaded the
    # tokenizer but the model load raised, testing only the tokenizer would
    # hand back ``model=None`` on every later call.
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
        )
    return tokenizer, model

def _message_text(content) -> str:
    """Reduce one chat message's content to plain text for the chat template."""
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        # NOTE(review): assumes structured content parts carry their text
        # under "text"; parts without it (files, images) contribute nothing.
        return _message_text(content.get("text"))
    if isinstance(content, (list, tuple)):
        return "".join(_message_text(part) for part in content)
    return str(content)


def _history_to_messages(history) -> list:
    """Convert chat history entries into ``{"role", "content"}`` dicts.

    An entry may be a ``[user, assistant]`` pair or a
    ``{"role": ..., "content": ...}`` dict; both are accepted.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            text = _message_text(entry.get("content"))
            # Only conversational turns are replayed; the system prompt is
            # supplied by the caller.
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
            continue
        user_turn, assistant_turn = entry[0], entry[1]
        if user_turn is not None:
            messages.append({"role": "user", "content": _message_text(user_turn)})
        # A pair whose reply is still empty/None has no assistant turn yet.
        if assistant_turn:
            messages.append(
                {"role": "assistant", "content": _message_text(assistant_turn)}
            )
    return messages


@spaces.GPU
def agent_respond(task: str, history: list) -> str:
    """A mobile-optimized agent that can answer questions, write code, and solve tasks.

    Powered by dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile — a 1B parameter model
    quantized to Q4, designed to run on phones. This proves real agents can run
    on pocket-sized models.

    Args:
        task: The new user message to answer.
        history: Earlier turns of the conversation, either as
            ``[user, assistant]`` pairs or as ``{"role", "content"}`` dicts.
            May be empty or ``None``.

    Returns:
        The generated reply, decoded without the prompt and without special
        tokens.
    """
    tokenizer, model = load_model()

    messages = [{"role": "system", "content": "You are a helpful mobile AI assistant. You are running on a 1B parameter model optimized for phones. Be concise and helpful."}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": task})

    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The rendered template already contains the special tokens, so the
    # tokenizer must not add its own (that would duplicate the BOS token).
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response

@spaces.GPU
def agent_code(instruction: str) -> str:
    """Generate Python code for an instruction using a mobile-optimized model.

    Args:
        instruction: Plain-language description of the code to write.

    Returns:
        The generated text up to the first closing Markdown fence (or all of
        it when no fence is produced), with surrounding whitespace stripped.
    """
    tokenizer, model = load_model()

    # The prompt ends just after an opening ```python fence, so the model's
    # continuation starts directly with code.
    prompt = f"Write Python code for: {instruction}\n\n```python\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.3,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the continuation. Re-parsing the echoed prompt would break
    # whenever the instruction itself contains a code fence.
    prompt_length = inputs["input_ids"].shape[1]
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Anything after the first closing fence is commentary, not code.
    code = generated.split("```", 1)[0]
    return code.strip()

# Build the UI: an intro section followed by Chat, Code and About tabs.
# User-facing text uses real Unicode characters (em dashes, emoji); the
# previous literals were mis-decoded UTF-8.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="dispatchAI Mobile Agent") as demo:
    gr.Markdown("""
    # 🤖 dispatchAI Mobile Agent

    **A real AI agent running on a 1B parameter model — small enough for your pocket.**

    Model: [Llama-3.2-1B-Instruct-Q4-mobile](https://huggingface.co/dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile)

    This agent runs on a model quantized to Q4 (700MB file size), designed to run on 
    Snapdragon 865 phones. It can answer questions, write code, and solve tasks — 
    all on a model 1/100th the size of GPT-4.

    ## Try It

    - **Chat**: Ask the agent anything
    - **Code**: Ask it to write Python code

    ## The Point

    This isn't about matching GPT-4. It's about proving that a 1B model on a phone 
    can be genuinely useful. For the tasks people actually do on phones — quick answers, 
    code snippets, summaries, classifications — a 1B model is enough.
    """)

    # Conversational tab: ChatInterface calls agent_respond(message, history).
    with gr.Tab("💬 Chat"):
        chat = gr.ChatInterface(
            fn=agent_respond,
            title="Chat with a 1B Mobile Agent",
            description="Powered by Llama-3.2-1B-Instruct-Q4-mobile (700MB)",
        )

    # One-shot code generation tab: textbox in, code viewer out.
    with gr.Tab("👨‍💻 Code"):
        code_input = gr.Textbox(label="What should I code?", placeholder="A function that reverses a string")
        code_btn = gr.Button("Generate Code", variant="primary")
        code_output = gr.Code(label="Generated Code", language="python")
        code_btn.click(fn=agent_code, inputs=code_input, outputs=code_output)

    # Static background information about the model.
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ## How This Works

        This Space runs a **1 billion parameter Llama-3.2 model** quantized to 4-bit.

        | Metric | Value |
        |--------|-------|
        | Model | Llama-3.2-1B-Instruct |
        | Params | 1B |
        | Quantization | Q4 (4-bit) |
        | File size | 700MB |
        | RAM needed | ~1.1GB |
        | Speed on Snapdragon 865 | ~18 tokens/sec |
        | Speed on this Space (ZeroGPU) | Faster |

        ## Run This On Your Phone

        ```bash
        # Download the GGUF
        hf download dispatchAI/Llama-3.2-1B-Instruct-Q4-mobile model.gguf

        # Run with llama.cpp
        llama-cli -m model.gguf -p "Hello!" -n 100 -t 4
        ```

        ## The Thesis

        > A 1B model on a phone is not a compromise. It's a victory.

        6.8 billion smartphones. Most can't run a cloud LLM. But they CAN run a 1B model 
        at 18 tokens/sec. That's fast enough for real-time chat, code completion, 
        summarization, and classification.

        ---
        🚀 [dispatchAI](https://huggingface.co/dispatchAI) — Small. Mobile. Free. UAE-built.
        """)

# Start the app; mcp_server=True asks Gradio to serve its MCP endpoint as well.
demo.launch(mcp_server=True)