# NOTE(review): the three lines below were HuggingFace Spaces page-status
# banner text ("Spaces: Sleeping") captured when this file was scraped from
# the web UI; they are not part of the application source.
| """ | |
| Automatic Model Quantization MVP | |
| Simple proof of concept for HuggingFace maintainers | |
| """ | |
| import gradio as gr | |
| from fastapi import FastAPI, Request, HTTPException | |
| from datetime import datetime | |
| import hmac | |
| import os | |
| import asyncio | |
| from typing import List, Dict | |
| from collections import deque | |
| import json | |
# In-memory job queue (max 100 jobs)
# Bounded deque: once 100 jobs are stored, appending evicts the oldest entry.
job_queue = deque(maxlen=100)
# Single-worker lock flag: True while a quantization job is being processed
# (read/written by the background queue processor).
processing = False
# Create FastAPI app
app = FastAPI(title="Auto-Quantization MVP")
# Shared secret compared against the X-Webhook-Secret header of incoming
# webhooks; override the insecure default via the WEBHOOK_SECRET env var.
WEBHOOK_SECRET = os.getenv("WEBHOOK_SECRET", "change-me-in-production")
# BUGFIX: the handler was never registered as a route, so the /webhook URL
# documented below returned 404 and uploads never triggered quantization.
@app.post("/webhook")
async def webhook(request: Request):
    """
    Receive HuggingFace webhook for model uploads
    To set up webhook:
    1. Go to https://huggingface.co/settings/webhooks
    2. Create webhook with URL: https://Sambhavnoobcoder-quantization-mvp.hf.space/webhook
    3. Set secret to match WEBHOOK_SECRET
    4. Select "Repository updates" event

    Returns:
        dict describing the outcome: "queued", "already_queued", or "ignored".

    Raises:
        HTTPException: 403 when the secret header does not match,
            400 when the body is not valid JSON.
    """
    # Verify webhook secret (compare_digest is constant-time, resisting timing attacks)
    signature = request.headers.get("X-Webhook-Secret", "")
    if not hmac.compare_digest(signature, WEBHOOK_SECRET):
        print("β οΈ Invalid webhook secret")
        raise HTTPException(status_code=403, detail="Invalid webhook secret")
    # Parse payload
    try:
        payload = await request.json()
    except Exception as e:
        print(f"β οΈ Error parsing payload: {e}")
        raise HTTPException(status_code=400, detail="Invalid payload")
    # Extract event details
    event = payload.get("event", {})
    repo = payload.get("repo", {})
    print(f"π₯ Received webhook: {event.get('action')} - {repo.get('name')}")
    # Check if it's a model upload (content update on a model repo)
    if (event.get("action") == "update" and
            event.get("scope", "").startswith("repo.content") and
            repo.get("type") == "model"):
        model_id = repo.get("name")
        # Deduplicate: skip if this model is already queued or being processed
        for job in job_queue:
            if job["model_id"] == model_id and job["status"] in ["queued", "processing"]:
                return {
                    "status": "already_queued",
                    "job_id": job["id"],
                    "message": "Model already in queue"
                }
        # BUGFIX: ids were previously len(job_queue) + 1, which repeats once the
        # bounded deque (maxlen=100) starts evicting old entries. The newest job
        # always carries the highest id and eviction drops the oldest first, so
        # deriving the next id from the current maximum stays unique.
        next_id = max((j["id"] for j in job_queue), default=0) + 1
        # Add to queue
        job = {
            "id": next_id,
            "model_id": model_id,
            "status": "queued",
            "method": "Quanto-int8",
            "timestamp": datetime.now().isoformat(),  # naive local time — consider UTC
            "owner": repo.get("owner", {}).get("name", "unknown"),
            "progress": 0
        }
        job_queue.append(job)
        print(f"β Job #{job['id']} queued: {model_id}")
        return {
            "status": "queued",
            "job_id": job["id"],
            "model": model_id,
            # Position counts all queued jobs, including the one just added
            "position": len([j for j in job_queue if j["status"] == "queued"])
        }
    print(f"βοΈ Ignored event: {event.get('action')} - {repo.get('type')}")
    return {"status": "ignored", "reason": "Not a model upload"}
async def get_jobs():
    """Return a snapshot of every tracked job (consumed by the dashboard)."""
    return [*job_queue]
async def health():
    """Health check endpoint"""
    # Tally the recognised statuses in a single pass over the queue.
    tally = {"queued": 0, "processing": 0, "completed": 0, "failed": 0}
    for job in job_queue:
        state = job["status"]
        if state in tally:
            tally[state] += 1
    return {
        "status": "healthy",
        "jobs_total": len(job_queue),
        "jobs_queued": tally["queued"],
        "jobs_processing": tally["processing"],
        "jobs_completed": tally["completed"],
        "jobs_failed": tally["failed"]
    }
# Background task to process queue
async def process_queue():
    """Process quantization jobs in background.

    Runs forever: every 5 seconds, if nothing is currently processing, it
    picks the oldest job whose status is "queued" and awaits quantize_model
    on it. The module-level `processing` flag acts as a single-worker lock
    (safe here because only one event-loop task runs this coroutine).
    """
    global processing
    while True:
        try:
            if not processing and job_queue:
                # Find next queued job
                queued_jobs = [j for j in job_queue if j["status"] == "queued"]
                if queued_jobs:
                    processing = True
                    job = queued_jobs[0]
                    print(f"π Processing job #{job['id']}: {job['model_id']}")
                    # Import here to avoid circular dependency
                    from quantizer import quantize_model
                    # Process job; presumably quantize_model mutates the job
                    # dict's status/progress in place — TODO confirm in quantizer.py
                    await quantize_model(job)
                    processing = False
        except Exception as e:
            # Keep the loop alive on any failure and release the lock so the
            # next queued job can still run.
            print(f"β Error in queue processor: {e}")
            processing = False
        await asyncio.sleep(5)  # Check every 5 seconds
| # Gradio UI | |
def get_job_list(jobs=None):
    """Render the job history as Markdown for the dashboard.

    Args:
        jobs: Optional sequence of job dicts; defaults to the module-level
            job_queue. Accepting it as a parameter keeps the renderer testable.

    Returns:
        Markdown string: a placeholder when there are no jobs, otherwise the
        20 most recent jobs, newest first.
    """
    if jobs is None:
        jobs = job_queue
    if not jobs:
        return """
## No jobs yet
Upload a model to HuggingFace Hub to trigger automatic quantization!
### Test with these steps:
1. Upload a small model (<1B params) to your HF account
2. Webhook will automatically trigger quantization
3. Quantized model will appear on Hub: `{model-name}-Quanto-int8`
"""
    # Sort by most recent first
    sorted_jobs = sorted(jobs, key=lambda x: x["id"], reverse=True)
    # Collect fragments and join once at the end instead of quadratic `+=`.
    parts = []
    for job in sorted_jobs[:20]:  # Show last 20 jobs
        status_emoji = {
            "queued": "β³",
            "processing": "π",
            "completed": "β ",
            "failed": "β"
        }.get(job["status"], "β")
        # Truncate model ID if too long
        model_display = job['model_id']
        if len(model_display) > 50:
            model_display = model_display[:47] + "..."
        parts.append(f"\n### {status_emoji} Job #{job['id']} - {job['status'].upper()}\n\n")
        parts.append(f"**Model:** `{model_display}` \n")
        parts.append(f"**Method:** {job['method']} \n")
        parts.append(f"**Time:** {job['timestamp']} \n")
        if job["status"] == "completed" and "output_repo" in job:
            parts.append(f"**β¨ Output:** [{job['output_repo']}](https://huggingface.co/{job['output_repo']}) \n")
        if job["status"] == "failed" and "error" in job:
            # Truncate long errors and make them more readable
            error_msg = job['error']
            if len(error_msg) > 150:
                error_msg = error_msg[:150] + "..."
            parts.append(f"**Error:** {error_msg} \n")
        parts.append("\n---\n")
    return "".join(parts)
def get_metrics(jobs=None):
    """Calculate dashboard impact metrics.

    Args:
        jobs: Optional sequence of job dicts; defaults to the module-level
            job_queue. Accepting it as a parameter keeps the function testable.

    Returns:
        dict with keys: total, completed, failed, success_rate (formatted
        string or "N/A"), time_saved (hours), storage_saved (GB).
    """
    if jobs is None:
        jobs = job_queue
    if not jobs:
        return {
            "total": 0,
            "completed": 0,
            "failed": 0,
            "success_rate": "N/A",
            "time_saved": 0,
            "storage_saved": 0
        }

    def _is_legitimate_failure(j):
        # Only count real failures, not validation skips like "already quantized".
        if j["status"] != "failed":
            return False
        error = j.get("error", "").lower()
        return "already quantized" not in error and "skipping" not in error

    total = len(jobs)
    completed = len([j for j in jobs if j["status"] == "completed"])
    failed = len([j for j in jobs if _is_legitimate_failure(j)])
    # Calculate success rate based only on legitimate attempts
    legitimate_attempts = completed + failed
    success_rate = f"{(completed/legitimate_attempts*100):.1f}%" if legitimate_attempts > 0 else "N/A"
    # Estimated time saved (30 min per model)
    time_saved = completed * 0.5
    # Estimated storage saved (assuming avg 7GB reduction)
    storage_saved = completed * 7
    return {
        "total": total,
        "completed": completed,
        "failed": failed,
        "success_rate": success_rate,
        "time_saved": time_saved,
        "storage_saved": storage_saved
    }
# Build Gradio interface
# Declarative UI: components are created inside the Blocks context and wired
# to refresh_display() via the button click and the initial page load.
with gr.Blocks(title="Auto-Quantization MVP", theme=gr.themes.Soft()) as demo:
    # Intro / how-it-works banner
    gr.Markdown("""
# π€ Automatic Model Quantization (MVP)
**Proof of Concept:** Automatically quantize models uploaded to HuggingFace.
## π― How It Works
1. **Upload** a model to HuggingFace Hub
2. **Webhook triggers** this service automatically
3. **Model is quantized** using Quanto int8 (2x smaller, 99% quality)
4. **Quantized model uploaded** to Hub: `{model-name}-Quanto-int8`
**Zero manual work required!** β¨
""")
    # Metrics
    with gr.Row():
        with gr.Column():
            # Empty placeholder; populated by refresh_display() below
            metrics_display = gr.Markdown()
    gr.Markdown("---")
    # Job List
    gr.Markdown("## π Job History")
    job_display = gr.Markdown(get_job_list())
    with gr.Row():
        refresh_btn = gr.Button("π Refresh", variant="primary")

    def refresh_display():
        # Re-render both outputs (metrics table, job list) from current state.
        metrics = get_metrics()
        metrics_md = f"""
## π Impact Metrics
| Metric | Value |
|--------|-------|
| **Models Quantized** | {metrics['completed']} / {metrics['total']} |
| **Success Rate** | {metrics['success_rate']} |
| **Time Saved** | {metrics['time_saved']:.1f} hours |
| **Storage Saved** | {metrics['storage_saved']:.0f} GB |
"""
        return metrics_md, get_job_list()

    # Manual refresh button
    refresh_btn.click(
        fn=refresh_display,
        outputs=[metrics_display, job_display]
    )
    # Initial load
    demo.load(
        fn=refresh_display,
        outputs=[metrics_display, job_display]
    )
    gr.Markdown("---")
    # Static footer: setup instructions, roadmap, and resources
    gr.Markdown("""
## βοΈ Setup Instructions
### 1. Configure Webhook
Create a webhook in your [HuggingFace settings](https://huggingface.co/settings/webhooks):
- **URL:** `https://Sambhavnoobcoder-quantization-mvp.hf.space/webhook`
- **Secret:** Set `WEBHOOK_SECRET` in Space settings (βοΈ Settings β Repository secrets)
- **Events:** Select "Repository updates"
### 2. Test with Small Model
Upload a small model (<1B parameters) to test:
- `TinyLlama/TinyLlama-1.1B-Chat-v1.0`
- `facebook/opt-125m`
- `EleutherAI/pythia-160m`
### 3. Monitor Progress
Watch this dashboard - your model will be quantized automatically!
---
## π Roadmap
Future quantization methods (based on community feedback):
- [ ] **GPTQ 4-bit** (fastest inference on NVIDIA GPUs)
- [ ] **GGUF** (CPU/mobile inference, Apple Silicon)
- [ ] **AWQ 4-bit** (highest quality)
- [ ] User preferences (choose which formats)
- [ ] Quality evaluation (automatic perplexity testing)
---
## π Resources
- **GitHub:** [View Source Code](https://github.com/Sambhavnoobcoder/auto-quantization-mvp)
- **Forum:** [Discussion Thread](https://discuss.huggingface.co/)
- **Contact:** indosambhav@gmail.com
---
*Built as a proof of concept to demonstrate automatic quantization for HuggingFace* β¨
""")
# Start background task processor
# BUGFIX: without registering this as a FastAPI startup hook the coroutine
# was never called, so process_queue() never ran and queued jobs were never
# processed. Registering must happen before mount_gradio_app reassigns `app`
# (it returns the same FastAPI instance, so the hook is preserved).
@app.on_event("startup")
async def startup_event():
    """Start background task on startup"""
    print("π Starting background queue processor...")
    # Fire-and-forget: the task loops forever alongside request handling.
    asyncio.create_task(process_queue())
# Mount Gradio app to FastAPI
# The Gradio UI is served at "/" while FastAPI routes remain available;
# mount_gradio_app returns the (same) FastAPI application object.
app = gr.mount_gradio_app(app, demo, path="/")
if __name__ == "__main__":
    import uvicorn
    # Bind all interfaces; 7860 is the conventional HuggingFace Spaces port.
    uvicorn.run(app, host="0.0.0.0", port=7860)