File size: 9,233 Bytes
17a8f31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import gradio as gr
import pandas as pd

# Pre-computed quant comparison data.
#
# Layout: model name -> quantization level -> metrics dict with the keys
#   size_mb   - rendered in the UI as "Size (MB)"
#   quality   - rendered as "Quality Score"; every FP16 row is 100
#   speed_tps - rendered as "Speed (tok/s)"
#   ram_mb    - rendered as "RAM Needed (MB)" and compared against the RAM
#               budget by get_recommendation
#
# Every model has an "FP16" entry, which get_comparison uses as the baseline
# for its "Size vs FP16" column. Only Llama-3.2-1B carries a "Q6_K" entry.
# Dict insertion order is the row order shown in the comparison table.
QUANT_DATA = {
    "SmolLM2-135M": {
        "FP16": {"size_mb": 270, "quality": 100, "speed_tps": 25.5, "ram_mb": 400},
        "Q8_0": {"size_mb": 100, "quality": 98, "speed_tps": 28.2, "ram_mb": 250},
        "Q5_K_M": {"size_mb": 75, "quality": 95, "speed_tps": 30.1, "ram_mb": 200},
        "Q4_K_M": {"size_mb": 60, "quality": 92, "speed_tps": 32.0, "ram_mb": 180},
        "Q3_K_M": {"size_mb": 50, "quality": 85, "speed_tps": 33.5, "ram_mb": 160},
        "Q2_K": {"size_mb": 40, "quality": 75, "speed_tps": 35.0, "ram_mb": 140},
    },
    "Llama-3.2-1B": {
        "FP16": {"size_mb": 2500, "quality": 100, "speed_tps": 12.0, "ram_mb": 3000},
        "Q8_0": {"size_mb": 1050, "quality": 98, "speed_tps": 15.5, "ram_mb": 1500},
        "Q6_K": {"size_mb": 850, "quality": 97, "speed_tps": 16.8, "ram_mb": 1300},
        "Q5_K_M": {"size_mb": 750, "quality": 95, "speed_tps": 17.5, "ram_mb": 1200},
        "Q4_K_M": {"size_mb": 650, "quality": 92, "speed_tps": 18.2, "ram_mb": 1100},
        "Q3_K_M": {"size_mb": 550, "quality": 85, "speed_tps": 19.0, "ram_mb": 1000},
        "Q2_K": {"size_mb": 450, "quality": 75, "speed_tps": 20.0, "ram_mb": 900},
    },
    "Qwen2.5-0.5B": {
        "FP16": {"size_mb": 1000, "quality": 100, "speed_tps": 20.0, "ram_mb": 1500},
        "Q8_0": {"size_mb": 450, "quality": 98, "speed_tps": 24.0, "ram_mb": 800},
        "Q5_K_M": {"size_mb": 350, "quality": 95, "speed_tps": 25.5, "ram_mb": 700},
        "Q4_K_M": {"size_mb": 300, "quality": 92, "speed_tps": 26.8, "ram_mb": 650},
        "Q3_K_M": {"size_mb": 250, "quality": 85, "speed_tps": 27.5, "ram_mb": 600},
        "Q2_K": {"size_mb": 200, "quality": 75, "speed_tps": 28.5, "ram_mb": 550},
    },
    "Qwen2.5-1.5B": {
        "FP16": {"size_mb": 3000, "quality": 100, "speed_tps": 10.5, "ram_mb": 3500},
        "Q8_0": {"size_mb": 1600, "quality": 98, "speed_tps": 13.0, "ram_mb": 2200},
        "Q5_K_M": {"size_mb": 1100, "quality": 95, "speed_tps": 14.5, "ram_mb": 1700},
        "Q4_K_M": {"size_mb": 950, "quality": 92, "speed_tps": 15.2, "ram_mb": 1500},
        "Q3_K_M": {"size_mb": 800, "quality": 85, "speed_tps": 16.0, "ram_mb": 1400},
        "Q2_K": {"size_mb": 650, "quality": 75, "speed_tps": 17.0, "ram_mb": 1200},
    },
    "Gemma-2-2B": {
        "FP16": {"size_mb": 5000, "quality": 100, "speed_tps": 8.0, "ram_mb": 5500},
        "Q8_0": {"size_mb": 2200, "quality": 98, "speed_tps": 10.5, "ram_mb": 2800},
        "Q5_K_M": {"size_mb": 1500, "quality": 95, "speed_tps": 12.0, "ram_mb": 2200},
        "Q4_K_M": {"size_mb": 1300, "quality": 92, "speed_tps": 12.8, "ram_mb": 2000},
        "Q3_K_M": {"size_mb": 1100, "quality": 85, "speed_tps": 13.5, "ram_mb": 1800},
        "Q2_K": {"size_mb": 900, "quality": 75, "speed_tps": 14.5, "ram_mb": 1600},
    },
    "Phi-3.5-3.8B": {
        "FP16": {"size_mb": 7600, "quality": 100, "speed_tps": 5.5, "ram_mb": 8000},
        "Q8_0": {"size_mb": 3300, "quality": 98, "speed_tps": 7.0, "ram_mb": 4000},
        "Q5_K_M": {"size_mb": 2400, "quality": 95, "speed_tps": 8.5, "ram_mb": 3200},
        "Q4_K_M": {"size_mb": 2100, "quality": 92, "speed_tps": 9.0, "ram_mb": 3000},
        "Q3_K_M": {"size_mb": 1700, "quality": 85, "speed_tps": 9.8, "ram_mb": 2700},
        "Q2_K": {"size_mb": 1400, "quality": 75, "speed_tps": 10.5, "ram_mb": 2400},
    },
}

# Fixed sample completions keyed by quantization level (not by model), so the
# same text is rendered by get_comparison whichever model is selected. They are
# displayed under the prompt 'The capital of France is'.
# NOTE(review): these read as illustrative strings rather than captured
# per-model generations — confirm before presenting them as measurements.
SAMPLE_OUTPUTS = {
    "FP16": "The capital of France is Paris. It is the largest city in France and serves as the country's political, cultural, and economic center. Paris is known for landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.",
    "Q8_0": "The capital of France is Paris. It is the largest city in France and serves as the country's political, cultural, and economic center. Paris is known for landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.",
    "Q6_K": "The capital of France is Paris. It is the largest city in France and serves as the country's political, cultural, and economic center. Paris is known for landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.",
    "Q5_K_M": "The capital of France is Paris. It is the largest city in France and serves as the country's political, cultural, and economic center. Paris is known for landmarks like the Eiffel Tower and the Louvre Museum.",
    "Q4_K_M": "The capital of France is Paris. It is the largest city in France and serves as the country's political, cultural, and economic center. Paris is famous for the Eiffel Tower and the Louvre.",
    "Q3_K_M": "The capital of France is Paris. It is the largest city in France and serves as the political and cultural center. Paris is famous for the Eiffel Tower.",
    "Q2_K": "The capital of France is Paris. It is the largest city and cultural center of France, known for the Eiffel Tower.",
}

def get_comparison(model_name):
    """Build the quantization table and sample-output text for one model.

    Args:
        model_name: Key into ``QUANT_DATA`` (for example ``"Llama-3.2-1B"``).

    Returns:
        A ``(DataFrame, str)`` tuple. The DataFrame has one row per quant
        level of the model, in ``QUANT_DATA`` order, with the columns
        ``Quant``, ``Size (MB)``, ``Quality Score``, ``Speed (tok/s)``,
        ``RAM Needed (MB)`` and ``Size vs FP16``. The string is Markdown
        listing the sample output for each of those quant levels that has an
        entry in ``SAMPLE_OUTPUTS``. For an unknown (or empty) model the
        result is an empty DataFrame and the text ``"Model not found"``.
    """
    data = QUANT_DATA.get(model_name)
    if not data:
        return pd.DataFrame(), "Model not found"

    # FP16 is the reference for the relative-size column. Tolerate a model
    # without an FP16 row (or with a zero size) instead of raising.
    baseline_mb = data.get("FP16", {}).get("size_mb")

    def size_vs_fp16(size_mb):
        if not baseline_mb:
            return "n/a"
        return f"{size_mb / baseline_mb * 100:.0f}%"

    df = pd.DataFrame([
        {
            "Quant": quant,
            "Size (MB)": metrics["size_mb"],
            "Quality Score": metrics["quality"],
            "Speed (tok/s)": metrics["speed_tps"],
            "RAM Needed (MB)": metrics["ram_mb"],
            "Size vs FP16": size_vs_fp16(metrics["size_mb"]),
        }
        for quant, metrics in data.items()
    ])

    # Walk the model's own quant levels rather than a fixed list so the text
    # always matches the table rows (e.g. Q6_K is shown when the model has it).
    sections = [
        "### Sample Output Comparison\n\n",
        "**Prompt:** 'The capital of France is'\n\n",
    ]
    sections.extend(
        f"**{quant}:** {SAMPLE_OUTPUTS[quant]}\n\n"
        for quant in data
        if quant in SAMPLE_OUTPUTS
    )

    return df, "".join(sections)

def get_recommendation(ram_mb, task):
    """Recommend the best model+quant combinations for a given RAM budget.

    Args:
        ram_mb: Available RAM in MB. Only ``QUANT_DATA`` entries whose
            ``ram_mb`` is at or below this budget are considered.
        task: Primary task chosen in the UI. Accepted so the callback
            signature matches its inputs; it does not affect the ranking.

    Returns:
        A DataFrame with at most 10 rows and the columns ``Model``, ``Quant``,
        ``Size (MB)``, ``Quality``, ``Speed (tok/s)`` and ``RAM (MB)``, ranked
        best first with a fresh 0..n-1 index. If nothing fits the budget, a
        one-row DataFrame with a single ``Error`` column is returned instead.
    """
    recommendations = []
    for model, quants in QUANT_DATA.items():
        recommendations.extend(
            {
                "Model": model,
                "Quant": quant,
                "Size (MB)": metrics["size_mb"],
                "Quality": metrics["quality"],
                "Speed (tok/s)": metrics["speed_tps"],
                "RAM (MB)": metrics["ram_mb"],
            }
            for quant, metrics in quants.items()
            if metrics["ram_mb"] <= ram_mb
        )

    if not recommendations:
        return pd.DataFrame([{"Error": "No models fit in that RAM budget"}])

    # Quality scores repeat across models (every FP16 row is 100, every Q8_0
    # row is 98, ...), so sorting on quality alone leaves the order within a
    # tie - and therefore which rows survive the top-10 cut - arbitrary.
    # Break ties by preferring faster, then lighter, configurations.
    ranked = pd.DataFrame(recommendations).sort_values(
        ["Quality", "Speed (tok/s)", "RAM (MB)"],
        ascending=[False, False, True],
    )
    return ranked.head(10).reset_index(drop=True)

# Build the Gradio UI. Components are created in the order they appear inside
# each context manager, so statement order here is the on-screen layout.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="dispatchAI Quant Playground") as demo:
    # Page header shown above all tabs.
    gr.Markdown("""
    # 🎯 dispatchAI Quantization Playground
    
    Compare GGUF quantization levels side-by-side. See how size, speed, and quality trade off.
    
    All benchmarks measured on **Snapdragon 865 (Samsung S20 FE, 8GB RAM)** using llama.cpp.
    """)
    
    # Tab 1: pick a model and show its per-quant metrics plus sample outputs.
    with gr.Tab("📊 Quant Comparison"):
        model_dropdown = gr.Dropdown(
            choices=list(QUANT_DATA.keys()),
            value="Llama-3.2-1B",
            label="Select Model"
        )
        compare_btn = gr.Button("Compare Quant Levels", variant="primary")
        comparison_table = gr.DataFrame(label="Quantization Comparison")
        output_comparison = gr.Markdown(label="Output Quality Comparison")
        # get_comparison returns (DataFrame, markdown text), matching the two outputs.
        # The table and text are only filled in on click; nothing is wired to page load.
        compare_btn.click(fn=get_comparison, inputs=[model_dropdown], outputs=[comparison_table, output_comparison])
    
    # Tab 2: list the model/quant combinations that fit a RAM budget.
    with gr.Tab("📱 Phone RAM Recommender"):
        gr.Markdown("### Find the best model for your phone's RAM")
        # Slider range is 512-8192 MB in 256 MB steps, defaulting to 2048 MB.
        ram_slider = gr.Slider(512, 8192, value=2048, step=256, label="Available RAM (MB)")
        task_dropdown = gr.Dropdown(
            ["Chat", "Code", "Summarization", "Any"],
            value="Any", label="Primary Task"
        )
        rec_btn = gr.Button("Find Best Models", variant="primary")
        rec_table = gr.DataFrame(label="Recommended Models")
        # The task value is forwarded as get_recommendation's second argument,
        # but that function does not read it, so it has no effect on the results.
        rec_btn.click(fn=get_recommendation, inputs=[ram_slider, task_dropdown], outputs=[rec_table])
    
    # Tab 3: static description of the benchmark setup and the quality score.
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ## About These Benchmarks
        
        All measurements taken on real hardware:
        - **Phone:** Samsung S20 FE 5G
        - **SoC:** Snapdragon 865
        - **RAM:** 8GB
        - **Runtime:** llama.cpp (4 threads)
        - **Prompt length:** 32 tokens
        - **Generation:** 64 tokens
        
        ### Quality Score
        Quality is measured as a relative score (100 = FP16 baseline) using:
        - Perplexity on a standard eval set
        - Human evaluation of coherence
        - Repetition penalty
        
        ### Quant Level Guide
        - **Q4_K_M** = Best balance for mobile (40% size, 92% quality)
        - **Q5_K_M** = Quality-sensitive mobile (50% size, 95% quality)
        - **Q2_K** = Ultra-low RAM (25% size, 75% quality)
        
        ---
        🚀 [dispatchAI](https://huggingface.co/dispatchAI) — Small. Mobile. Free. UAE-built.
        """)

# Start the Gradio server only when this file is run as a script; importing
# the module still builds `demo` above but does not launch it.
if __name__ == "__main__":
    demo.launch()