File size: 5,787 Bytes
45a8447
 
 
34b694d
45a8447
34b694d
 
 
cb73a75
34b694d
 
45a8447
34b694d
 
 
7490d0a
34b694d
08aff6a
34b694d
 
 
 
 
 
45a8447
34b694d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45a8447
3ec5df4
34b694d
 
 
 
 
 
 
 
 
 
 
45a8447
34b694d
 
 
 
 
45a8447
34b694d
 
 
 
 
 
 
 
 
 
45a8447
34b694d
 
 
3ec5df4
45a8447
34b694d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import os
import subprocess
import streamlit as st
from huggingface_hub import snapshot_download, HfApi

# ============================================================
# SESSION STATE
# ============================================================

# Persist the list of generated .gguf file paths across Streamlit reruns so
# the download/upload section below keeps working after later button clicks.
if "quantized_models" not in st.session_state:
    st.session_state.quantized_models = []

# ============================================================
# CONFIG
# ============================================================

# Preset choices for the model selectbox; a custom repo id can also be typed
# into the text input in the UI section.
MODELS_LIST = ['rahul7star/Qwen3-4B-Thinking-2509-Genius-Coder-AI-Full']

# Quantization formats offered in the UI and passed verbatim to llama-quantize.
# FIX: the correct llama.cpp spelling is "Q3_K_L" (upper-case L); the previous
# "Q3_K_l" would be rejected by the quantize binary.
QUANT_TYPES = [
    "Q2_K", "Q3_K_L", "Q3_K_M", "Q3_K_S",
    "Q4_0", "Q4_1", "Q4_K_M", "Q4_K_S",
    "Q5_0", "Q5_1", "Q5_K_M", "Q5_K_S",
    "Q6_K", "Q8_0", "BF16", "F16", "F32"
]

# Location of the llama.cpp checkout and the two tools this app shells out to.
LLAMA_CPP_PATH = "/app/llama.cpp"
CONVERT_SCRIPT = f"{LLAMA_CPP_PATH}/convert_hf_to_gguf.py"
QUANTIZE_BIN = f"{LLAMA_CPP_PATH}/build/bin/llama-quantize"

# ============================================================
# UTILS
# ============================================================

def check_dependencies():
    """Halt the Streamlit app early if the llama.cpp tooling is missing."""
    required = [
        (CONVERT_SCRIPT, "❌ convert_hf_to_gguf.py not found"),
        (QUANTIZE_BIN, "❌ llama-quantize binary not found"),
    ]
    for path, message in required:
        if not os.path.exists(path):
            st.error(message)
            st.stop()  # raises internally, so later checks never run

def download_model(hf_model_name, output_dir="/tmp/models"):
    """Download a Hugging Face model snapshot into its own subdirectory.

    Args:
        hf_model_name: Repo id such as "org/model".
        output_dir: Parent directory for downloads (default "/tmp/models").

    Returns:
        Path to the local directory containing the downloaded snapshot.
    """
    st.write(f"📥 Downloading `{hf_model_name}` ...")
    # FIX: give each model its own folder — previously every snapshot landed
    # directly in output_dir, so downloading a second model mixed its files
    # with the first one's.
    local_dir = os.path.join(output_dir, hf_model_name.replace("/", "_"))
    # NOTE: the local_dir_use_symlinks kwarg was dropped — it is deprecated
    # and ignored by recent huggingface_hub releases (real files are always
    # written when local_dir is given).
    model_path = snapshot_download(
        repo_id=hf_model_name,
        local_dir=local_dir,
    )
    st.success("✅ Model downloaded")
    return model_path

def convert_to_gguf(model_path, output_file):
    """Run llama.cpp's convert_hf_to_gguf.py on *model_path*.

    Writes the result to *output_file*; raises RuntimeError (after showing
    the converter's stderr) when the subprocess exits non-zero.
    """
    st.write("🔄 Converting to GGUF...")
    command = ["python3", CONVERT_SCRIPT, model_path, "--outfile", output_file]
    proc = subprocess.run(command, capture_output=True, text=True)
    if proc.returncode != 0:
        st.error(proc.stderr)
        raise RuntimeError("Conversion failed")
    st.success("✅ GGUF created")

def quantize_model(gguf_file, quant_type):
    """Quantize *gguf_file* to *quant_type* with llama-quantize.

    Returns the path of the quantized file, or None if the tool failed
    (its stderr is surfaced in the UI instead of raising).
    """
    target = gguf_file.replace(".gguf", f"-{quant_type}.gguf")
    st.write(f"⚡ Quantizing → {quant_type}")

    proc = subprocess.run(
        [QUANTIZE_BIN, gguf_file, target, quant_type],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        st.error(proc.stderr)
        return None

    st.success(f"✅ {quant_type} done")
    return target

def upload_to_huggingface(file_path, repo_id):
    """Push a single file to a Hugging Face model repository.

    Creates the repo if it does not exist. Requires the HF_TOKEN
    environment variable; shows an error and returns early otherwise.
    """
    token = os.getenv("HF_TOKEN")
    if not token:
        st.error("❌ HF_TOKEN not found in environment variables")
        return

    client = HfApi(token=token)
    # exist_ok makes repo creation idempotent across repeated uploads.
    client.create_repo(repo_id, exist_ok=True, repo_type="model")
    client.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=os.path.basename(file_path),
        repo_id=repo_id,
    )
    st.success(f"🚀 Uploaded to https://huggingface.co/{repo_id}")

# ============================================================
# UI
# ============================================================

st.title("πŸ¦™ LLaMA.cpp Multi-Quantization Tool")

check_dependencies()

# Model selection
selected_model = st.selectbox(
    "Select Hugging Face Model",
    MODELS_LIST,
    index=None
)

hf_model_name = selected_model or st.text_input(
    "Or Enter Custom HF Model ID"
)

# Multi-checkbox quant selection
st.subheader("Select Quantization Types")

# One checkbox per quant type, laid out in a 4-column grid; the checked
# ones are re-collected into selected_quants on every Streamlit rerun.
selected_quants = []
cols = st.columns(4)

for i, quant in enumerate(QUANT_TYPES):
    with cols[i % 4]:
        if st.checkbox(quant):
            selected_quants.append(quant)

# Start button
if st.button("πŸš€ Start Quantization"):

    if not hf_model_name:
        st.warning("Please enter a model name")
        st.stop()

    if not selected_quants:
        st.warning("Select at least one quant type")
        st.stop()

    with st.spinner("Processing..."):
        try:
            base_dir = "/tmp/models"
            os.makedirs(base_dir, exist_ok=True)

            model_path = download_model(hf_model_name, base_dir)

            gguf_file = os.path.join(
                base_dir,
                hf_model_name.replace("/", "_") + ".gguf"
            )

            convert_to_gguf(model_path, gguf_file)

            st.session_state.quantized_models = []

            for quant in selected_quants:
                quant_file = quantize_model(gguf_file, quant)
                if quant_file:
                    st.session_state.quantized_models.append(quant_file)

            st.success("πŸŽ‰ All quantizations completed")

        except Exception as e:
            st.error(f"❌ Error: {str(e)}")

# ============================================================
# DOWNLOAD + UPLOAD SECTION
# ============================================================

if st.session_state.quantized_models:

    st.subheader("πŸ“¦ Generated Models")

    for file_path in st.session_state.quantized_models:

        with open(file_path, "rb") as f:
            st.download_button(
                label=f"⬇️ Download {os.path.basename(file_path)}",
                data=f,
                file_name=os.path.basename(file_path),
                key=file_path
            )

    st.divider()

    st.subheader("πŸš€ Upload to Hugging Face")

    repo_id = st.text_input(
        "Target Repository (e.g. username/model-quant)"
    )

    if st.button("πŸ“€ Upload All to HF"):
        if not repo_id:
            st.warning("Enter repository ID")
        else:
            with st.spinner("Uploading..."):
                for file_path in st.session_state.quantized_models:
                    upload_to_huggingface(file_path, repo_id)

                st.success("βœ… All files uploaded successfully")