"""Gradio app that converts a Hugging Face model to GGUF and uploads the files.

The UI collects a model ID, a set of quantization levels and a write token,
runs llama.cpp's ``convert_hf_to_gguf.py`` once per level, and uploads each
resulting ``.gguf`` file back to the same model repository.
"""

import os
import shutil
import subprocess
import sys
import tempfile

import gradio as gr
import spaces
from huggingface_hub import HfApi, snapshot_download, upload_file

# UI label -> value passed to the converter's ``--outtype`` flag.
# NOTE(review): upstream llama.cpp documents ``--outtype`` choices of
# f32/f16/bf16/q8_0/tq1_0/tq2_0/auto; K-quant codes such as ``q4_k_m`` are
# normally produced by a second pass with ``llama-quantize``. Confirm that the
# vendored script accepts every value in this table.
QUANT_LEVELS = {
    "Q2_K": "q2_k",
    "Q3_K_M": "q3_k_m",
    "Q4_K_M": "q4_k_m",
    "Q5_K_M": "q5_k_m",
    "Q6_K": "q6_k",
    "Q8_0": "q8_0",
    "F16": "f16",
}

# Path of the conversion script, relative to the app's working directory.
_CONVERT_SCRIPT = "llama.cpp/convert_hf_to_gguf.py"
# Upper bound, in seconds, for a single conversion subprocess.
_CONVERT_TIMEOUT_S = 300


def _resolve_quant_level(level):
    """Return ``(label, outtype)`` for one requested quantization level.

    ``level`` is either a label from ``QUANT_LEVELS`` (what the checkbox group
    sends) or an explicit ``(label, outtype)`` pair. ``outtype`` is ``None``
    when a label is not present in ``QUANT_LEVELS``.
    """
    if isinstance(level, str):
        return level, QUANT_LEVELS.get(level)
    label, outtype = level
    return label, outtype


def _convert_and_upload(model_path, model_id, level, work_dir, hf_token):
    """Convert one quantization level, upload it, and return the report lines.

    Never raises for conversion or upload problems: every failure is turned
    into a "❌" line so the remaining levels can still be processed.
    """
    q_name, q_code = _resolve_quant_level(level)
    if q_code is None:
        return [f"❌ {q_name}: unknown quantization level"]

    output_file = os.path.join(
        work_dir, f"{model_id.replace('/', '_')}_{q_name}.gguf"
    )
    # Argument list (no shell) so user-supplied text is never shell-interpreted.
    # sys.executable runs the script with the interpreter that runs this app.
    cmd = [
        sys.executable,
        _CONVERT_SCRIPT,
        model_path,
        "--outtype", q_code,
        "--outfile", output_file,
    ]

    lines = []
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=_CONVERT_TIMEOUT_S
        )
        if result.returncode != 0:
            return [f"❌ {q_name}: conversion failed - {result.stderr[:200]}"]

        file_size = os.path.getsize(output_file) / 1024 / 1024
        lines.append(f"✅ {q_name}: {file_size:.0f}MB")

        # Upload to the model repo
        filename = f"{q_name.lower()}.gguf"
        upload_file(
            path_or_fileobj=output_file,
            path_in_repo=filename,
            repo_id=model_id,
            token=hf_token,
        )
        lines.append(f" → Uploaded as {filename}")
    except subprocess.TimeoutExpired:
        lines.append(f"❌ {q_name}: conversion timed out")
    except Exception as e:  # UI boundary: report the failure, keep going.
        lines.append(f"❌ {q_name}: {e}")
    finally:
        # Free disk space before the next level, on success and on failure.
        # The file does not exist when the conversion itself failed.
        try:
            os.remove(output_file)
        except FileNotFoundError:
            pass
    return lines


# NOTE(review): each level may run for up to _CONVERT_TIMEOUT_S seconds, so
# several selected levels can add up to more than the ``duration`` requested
# here — confirm this is acceptable.
@spaces.GPU(duration=300)
def convert_model(model_id, quant_levels, hf_token, progress=gr.Progress()):
    """Convert a HF model to GGUF format with specified quant levels.

    Args:
        model_id: Hub repository ID, e.g. ``"org/model"``. The converted files
            are uploaded back to this same repository.
        quant_levels: Iterable of labels from ``QUANT_LEVELS`` (as sent by the
            checkbox group); explicit ``(label, outtype)`` pairs are accepted
            as well.
        hf_token: Hugging Face token used for both the download and the upload.
        progress: Gradio progress tracker.

    Returns:
        A 2-tuple holding the same human-readable report twice. The UI wires
        both return values to the single "Results" textbox, so both slots must
        carry the text; otherwise an empty second value could blank it.
    """
    model_id = (model_id or "").strip()
    hf_token = (hf_token or "").strip()

    if not model_id:
        message = "❌ Please enter a model ID"
        return message, message
    if not hf_token:
        message = "❌ Please enter your HF token"
        return message, message
    if not quant_levels:
        message = "❌ Please select at least one quantization level"
        return message, message

    results = []
    # Everything (model snapshot and GGUF outputs) lives in a per-call scratch
    # directory, so concurrent conversions cannot collide and all files are
    # removed even when a step fails.
    with tempfile.TemporaryDirectory(prefix="gguf_convert_") as work_dir:
        progress(0.1, desc="Downloading model...")
        try:
            model_path = snapshot_download(
                model_id,
                token=hf_token,
                ignore_patterns=["*.gguf", "*.pth", "*.bin"],
                # Download into the scratch directory instead of the shared
                # cache so cleanup removes the real files.
                local_dir=os.path.join(work_dir, "model"),
            )
        except Exception as e:  # UI boundary: surface the reason to the user.
            message = f"❌ Download failed: {e}"
            return message, message

        total = len(quant_levels)
        for i, level in enumerate(quant_levels):
            q_name, _ = _resolve_quant_level(level)
            progress(0.2 + 0.7 * (i / total), desc=f"Converting {q_name}...")
            results.extend(
                _convert_and_upload(model_path, model_id, level, work_dir, hf_token)
            )

    report = "\n".join(results)
    return report, report


with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="blue"),
    title="dispatchAI GGUF Converter",
) as demo:
    gr.Markdown("""
    # 🔄 dispatchAI GGUF Converter

    Convert any HuggingFace model to GGUF format with multiple quantization levels.
    Runs on ZeroGPU — free, fast, no local compute needed.

    ## How it works

    1. Enter the model ID (e.g., `dispatchAI/SmolLM2-135M-Instruct-mobile`)
    2. Select quant levels (Q4_K_M is the sweet spot for mobile)
    3. Enter your HF token (needs write access to the model repo)
    4. Click Convert — the GGUF files will be uploaded to the model repo

    ## Quant Level Guide

    | Level | Size vs FP16 | Quality | Use Case |
    |-------|-------------|---------|----------|
    | Q2_K | ~25% | Low | Ultra-low RAM (1GB devices) |
    | Q3_K_M | ~30% | Fair | Very constrained devices |
    | Q4_K_M | ~40% | Good | **Sweet spot for mobile** |
    | Q5_K_M | ~50% | Very Good | Quality-sensitive mobile |
    | Q6_K | ~60% | Excellent | Near-lossless mobile |
    | Q8_0 | ~70% | Excellent | High-quality, smaller than FP16 |
    | F16 | 100% | Lossless | Reference / debugging |
    """)

    with gr.Row():
        model_input = gr.Textbox(
            label="Model ID",
            placeholder="dispatchAI/SmolLM2-135M-Instruct-mobile",
            scale=3,
        )
        token_input = gr.Textbox(
            label="HF Token (write access)",
            type="password",
            scale=2,
        )

    quant_checkboxes = gr.CheckboxGroup(
        choices=list(QUANT_LEVELS.keys()),
        value=["Q4_K_M", "Q5_K_M", "Q8_0"],
        label="Quantization Levels",
    )

    convert_btn = gr.Button("🔄 Convert", variant="primary", size="lg")
    output = gr.Textbox(label="Results", lines=15)

    # convert_model returns its report twice; both values target the same box.
    convert_btn.click(
        fn=convert_model,
        inputs=[model_input, quant_checkboxes, token_input],
        outputs=[output, output],
    )

    gr.Markdown("""
    ---
    🚀 [dispatchAI](https://huggingface.co/dispatchAI) — Small. Mobile. Free. UAE-built.
    """)

if __name__ == "__main__":
    demo.launch()