tensiondriven committed on
Commit
c97f0b2
·
verified ·
1 Parent(s): 36277af

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +0 -195
app.py CHANGED
@@ -1,195 +0,0 @@
"""
Ultravox Quantizer - Quantizes fixie-ai/ultravox-v0_4_1-mistral-nemo to INT4
and pushes to tensiondriven/ultravox-v0_4_1-mistral-nemo-int8

Uses INT4 (4-bit) quantization because:
- T4 has 16GB VRAM
- 12B model at INT8 needs ~12GB, too tight with overhead
- INT4 needs ~6GB, leaves room for processing
"""

import gc
import os

import gradio as gr
import torch
from huggingface_hub import HfApi, login

# Config: source checkpoint, destination repo, and the auth token.
# HF_TOKEN must be provided via the Space's settings/secrets; it is None otherwise.
SOURCE_MODEL = "fixie-ai/ultravox-v0_4_1-mistral-nemo"
TARGET_REPO = "tensiondriven/ultravox-v0_4_1-mistral-nemo-int8"
HF_TOKEN = os.environ.get("HF_TOKEN")
def clear_memory():
    """Aggressively release Python and (if present) CUDA memory.

    Runs a full garbage-collection pass; when a GPU is available it also
    returns cached CUDA blocks to the driver and waits for in-flight
    kernels to finish so freed memory is really back.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
def quantize_and_push(use_int4: bool = True) -> str:
    """
    Quantize SOURCE_MODEL with bitsandbytes and push the result to TARGET_REPO.

    Args:
        use_int4: If True, use INT4 (4-bit NF4). If False, try INT8 (8-bit),
            which may OOM on a 16GB T4 for a 12B model.

    Returns:
        A multi-line, human-readable status report; error paths return the
        progress so far plus an ERROR line instead of raising.
    """
    if not HF_TOKEN:
        return "ERROR: HF_TOKEN environment variable not set. Add it in Space settings."

    try:
        login(token=HF_TOKEN)
        api = HfApi(token=HF_TOKEN)
    except Exception as e:
        return f"ERROR: Failed to authenticate with HuggingFace: {e}"

    # Import here to catch import errors
    try:
        from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
    except ImportError as e:
        return f"ERROR: Missing dependency: {e}"

    quant_type = "INT4 (4-bit)" if use_int4 else "INT8 (8-bit)"
    output_lines = [f"Starting {quant_type} quantization of {SOURCE_MODEL}..."]

    # Check GPU — quantized loading via bitsandbytes requires CUDA.
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
        output_lines.append(f"GPU: {gpu_name} with {gpu_mem:.1f}GB VRAM")
    else:
        return "ERROR: No CUDA GPU available. This Space requires GPU hardware."

    clear_memory()

    # Configure quantization
    if use_int4:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
    else:
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
        )

    output_lines.append(f"Loading model with {quant_type} quantization...")

    # Pre-bind so the cleanup paths can unconditionally `del` both names.
    model = None
    processor = None
    try:
        # Load quantized model
        model = AutoModelForCausalLM.from_pretrained(
            SOURCE_MODEL,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        )
        output_lines.append("Model loaded successfully!")

        # Load processor
        processor = AutoProcessor.from_pretrained(
            SOURCE_MODEL,
            trust_remote_code=True,
        )
        output_lines.append("Processor loaded successfully!")

    except torch.cuda.OutOfMemoryError:
        # BUGFIX: drop the local references first — otherwise clear_memory()
        # cannot actually free the partially-loaded model's VRAM.
        del model, processor
        clear_memory()
        return "\n".join(output_lines) + "\n\nERROR: Out of GPU memory. Try INT4 quantization instead."
    except Exception as e:
        del model, processor
        clear_memory()
        return "\n".join(output_lines) + f"\n\nERROR loading model: {e}"

    # Create target repo if needed
    try:
        api.create_repo(repo_id=TARGET_REPO, exist_ok=True, private=False)
        output_lines.append(f"Target repo ready: {TARGET_REPO}")
    except Exception as e:
        output_lines.append(f"Warning: Could not create/verify repo: {e}")

    # Push to hub
    output_lines.append(f"Pushing quantized model to {TARGET_REPO}...")

    try:
        model.push_to_hub(
            TARGET_REPO,
            token=HF_TOKEN,
            safe_serialization=True,
        )
        output_lines.append("Model pushed successfully!")

        processor.push_to_hub(
            TARGET_REPO,
            token=HF_TOKEN,
        )
        output_lines.append("Processor pushed successfully!")

    except Exception as e:
        # Same as above: release references so the GPU is usable for a retry.
        del model, processor
        clear_memory()
        return "\n".join(output_lines) + f"\n\nERROR pushing to hub: {e}"

    # Release the model before reporting success so VRAM is free for reruns.
    del model, processor
    clear_memory()

    output_lines.append("")
    output_lines.append("SUCCESS! Quantized model available at:")
    output_lines.append(f"https://huggingface.co/{TARGET_REPO}")

    return "\n".join(output_lines)
def run_int4():
    """Kick off the INT4 (4-bit) quantization run — the recommended path on a T4."""
    return quantize_and_push(use_int4=True)
def run_int8():
    """Kick off the INT8 (8-bit) quantization run — may OOM on a 16GB T4."""
    return quantize_and_push(use_int4=False)
def get_status():
    """Report the configured repos, token presence, and GPU/VRAM availability.

    Returns:
        A multi-line status string suitable for the UI textbox.
    """
    lines = ["=== Ultravox Quantizer Status ==="]
    lines.append(f"Source: {SOURCE_MODEL}")
    lines.append(f"Target: {TARGET_REPO}")
    lines.append(f"HF_TOKEN set: {'Yes' if HF_TOKEN else 'NO - add in Space settings!'}")

    if torch.cuda.is_available():
        lines.append(f"GPU: {torch.cuda.get_device_name(0)}")
        # mem_get_info() returns (free, total) in bytes — call it once and
        # unpack, instead of querying the device twice.
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        free_mem = free_bytes / 1e9
        total_mem = total_bytes / 1e9
        lines.append(f"VRAM: {free_mem:.1f}GB free / {total_mem:.1f}GB total")
    else:
        lines.append("GPU: Not available (CUDA required)")

    return "\n".join(lines)
# Gradio UI
with gr.Blocks(title="Ultravox Quantizer") as demo:
    gr.Markdown("""
    # Ultravox Model Quantizer

    Quantizes `fixie-ai/ultravox-v0_4_1-mistral-nemo` and pushes to `tensiondriven/ultravox-v0_4_1-mistral-nemo-int8`

    **Recommended: INT4** - T4 has 16GB VRAM, INT4 uses ~6GB leaving room for processing.
    """)

    # One row of actions: a cheap status probe plus the two quantization runs.
    with gr.Row():
        btn_status = gr.Button("Check Status", variant="secondary")
        btn_int4 = gr.Button("Run INT4 Quantization (Recommended)", variant="primary")
        btn_int8 = gr.Button("Run INT8 Quantization (May OOM)", variant="secondary")

    # All three handlers write their report into the same textbox.
    log_box = gr.Textbox(label="Output", lines=20, max_lines=50)

    btn_status.click(fn=get_status, outputs=log_box)
    btn_int4.click(fn=run_int4, outputs=log_box)
    btn_int8.click(fn=run_int8, outputs=log_box)

if __name__ == "__main__":
    demo.launch()