Spaces:

Sculptor-AI
/

auto-gguf-quant

Running

App Files Files Community

Kaileh57 committed on Mar 9, 2025

Commit

8df3b1d

1 Parent(s): 0a23172

fix

Browse files

Files changed (5) hide show

README.md +38 -1
app.py +99 -345
monitor.py +259 -0
quantize.py +197 -0
requirements.txt +6 -5

README.md CHANGED Viewed

@@ -10,4 +10,41 @@ pinned: false
 short_description: Automatically quantizes Sculptor models
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 short_description: Automatically quantizes Sculptor models
 ---
+# Ursa Minor Quantization Monitor
+This Space automatically generates quantized versions of the [Sculptor-AI/Ursa_Minor](https://huggingface.co/Sculptor-AI/Ursa_Minor) model and uploads them to the [Sculptor-AI/Ursa_Minor_Quantized](https://huggingface.co/Sculptor-AI/Ursa_Minor_Quantized) repository.
+## Features
+- Monitors the source repository for updates
+- Automatically generates quantized versions when the source model is updated
+- Displays a progress bar during quantization
+- Shows an "up to date" indicator when all quantizations are complete
+- Handles out-of-memory errors gracefully
+## Quantization Types
+The following quantizations are generated in order from smallest to largest:
+| Type | Size (GB) | Notes |
+|------|-----------|-------|
+| GGUF Q2_K | 0.8 | |
+| GGUF Q3_K_S | 0.9 | |
+| GGUF Q3_K_M | 0.9 | lower quality |
+| GGUF Q3_K_L | 1.0 | |
+| GGUF IQ4_XS | 1.0 | |
+| GGUF Q4_K_S | 1.0 | fast, recommended |
+| GGUF Q4_K_M | 1.1 | fast, recommended |
+| GGUF Q5_K_S | 1.2 | |
+| GGUF Q5_K_M | 1.2 | |
+| GGUF Q6_K | 1.4 | very good quality |
+| GGUF Q8_0 | 1.7 | fast, best quality |
+| GGUF f16 | 3.2 | 16 bpw, overkill |
+## Setup
+To run this Space, set the `HF_TOKEN` environment variable (for example, as a Space secret) to a Hugging Face access token that has write access to the destination repository.
+## Note About Free Compute Tier
+The Hugging Face free compute tier has limited memory. This Space is designed to handle out-of-memory errors gracefully, but larger quantizations may fail due to memory constraints. If you need to generate larger quantizations, consider upgrading to a paid compute tier.

app.py CHANGED Viewed

@@ -1,360 +1,114 @@
-import os
-import sys
 import gradio as gr
-import subprocess
-import tempfile
-import shutil
-from huggingface_hub import HfApi, login, Repository
 import time
 import threading
-# Initialize Hugging Face API
-hf_token = os.environ.get("HF_TOKEN")
-api = HfApi(token=hf_token)
-if hf_token:
-    login(token=hf_token)
-else:
-    print("WARNING: HF_TOKEN not set. You'll be limited to public repositories.")
-# Define quantization options
-QUANT_TYPES = {
-    "Q4_K_M": "q4_k_m",  # 4-bit, good quality and size
-    "Q5_K_M": "q5_k_m",  # 5-bit, better quality
-    "Q8_0": "q8_0"      # 8-bit, high quality
-}
-def install_llama_cpp():
-    """Install llama.cpp if not already installed"""
-    if not os.path.exists("llama.cpp"):
-        print("Installing llama.cpp...")
-        # Clone llama.cpp
-        subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git", "--depth=1"], check=True)
-        # Build llama.cpp (minimal build for conversion only)
-        os.chdir("llama.cpp")
-        subprocess.run(["make", "clean"], check=True)
-        subprocess.run(["make", "convert"], check=True)
-        os.chdir("..")
-        print("llama.cpp installed successfully")
-    else:
-        print("llama.cpp already installed")
-def clone_repo_shallow(repo_id, target_dir):
-    """Clone only the necessary files from a repo to save space"""
-    print(f"Cloning {repo_id} to {target_dir}...")
-    # Create a sparse checkout to save space
-    cmd = [
-        "git", "clone",
-        "--depth=1",
-        "--filter=blob:none",
-        f"https://huggingface.co/{repo_id}",
-        target_dir
-    ]
-    subprocess.run(cmd, check=True)
-    print(f"Repository {repo_id} cloned successfully")
-def find_model_files(directory):
-    """Find model files in the repository"""
-    # Look for common model file patterns
-    model_files = []
-    # Safetensors is preferred (usually smaller)
-    for pattern in ["*.safetensors", "consolidated.*.pt", "pytorch_model.bin", "*.bin"]:
-        cmd = ["find", directory, "-name", pattern]
-        result = subprocess.run(cmd, capture_output=True, text=True)
-        if result.stdout:
-            model_files.extend(result.stdout.strip().split('\n'))
-    # Filter out empty strings and sort by size (prefer smaller files for HF format)
-    model_files = [f for f in model_files if f]
-    if not model_files:
-        return []
-    # Check for model configuration
-    config_file = None
-    cmd = ["find", directory, "-name", "config.json"]
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    if result.stdout:
-        config_file = result.stdout.strip().split('\n')[0]
-    return model_files, config_file
-def quantize_model(repo_id, quant_types, progress=gr.Progress()):
-    """Quantize a model with llama.cpp and push to Hugging Face"""
-    # Install llama.cpp if needed
-    install_llama_cpp()
-    # Create temporary directories for processing
-    with tempfile.TemporaryDirectory() as temp_dir:
-        progress(0.1, "Cloning repository...")
-        model_dir = os.path.join(temp_dir, "model")
-        output_dir = os.path.join(temp_dir, "output")
-        os.makedirs(model_dir, exist_ok=True)
-        os.makedirs(output_dir, exist_ok=True)
-        try:
-            # Clone the source repository
-            clone_repo_shallow(repo_id, model_dir)
-            # Find model files
-            progress(0.2, "Looking for model files...")
-            model_file_info = find_model_files(model_dir)
-            if not model_file_info:
-                return "No model files found in the repository."
-            model_files, config_file = model_file_info
-            model_file = model_files[0]  # Use the first model file found
-            progress(0.3, "Determining model type...")
-            # Try to determine model type
-            model_type = "llama"  # Default model type
-            if config_file:
-                with open(config_file, 'r') as f:
-                    import json
-                    config = json.load(f)
-                    if 'model_type' in config:
-                        config_model_type = config['model_type'].lower()
-                        # Map model type to llama.cpp supported types
-                        type_mapping = {
-                            'llama': 'llama',
-                            'mistral': 'llama',
-                            'mixtral': 'llama',
-                            'falcon': 'falcon',
-                            'mpt': 'mpt',
-                            'gpt_neox': 'gptneox',
-                            'gptj': 'gptj',
-                            'bloom': 'bloom'
-                        }
-                        model_type = type_mapping.get(config_model_type, 'llama')
-            # Create output repository name
-            repo_name = repo_id.split('/')[-1]
-            target_repo_id = f"{repo_id}-gguf"
-            # Create the output repository if it doesn't exist
-            progress(0.4, "Creating target repository...")
-            try:
-                api.create_repo(repo_id=target_repo_id, exist_ok=True)
-            except Exception as e:
-                return f"Error creating repository: {str(e)}"
-            success_count = 0
-            progress_step = 0.5 / len(quant_types)
-            progress_value = 0.4
-            # Process each quantization type
-            for quant_name, quant_type in quant_types.items():
-                progress_value += progress_step
-                progress(progress_value, f"Processing {quant_name} quantization...")
-                output_file = os.path.join(output_dir, f"{repo_name}-{quant_name}.gguf")
-                # Convert to GGUF format
-                print(f"Converting to {quant_name}...")
-                convert_cmd = [
-                    "python3",
-                    os.path.join("llama.cpp", "convert.py"),
-                    f"--model-type", model_type,
-                    f"--outtype", "f16",
-                    f"--outfile", output_file
-                ]
-                # Add model path
-                convert_cmd.append(model_file)
-                try:
-                    # First convert to GGUF format (without quantization)
-                    subprocess.run(convert_cmd, check=True)
-                    # Then quantize if needed
-                    if quant_type != "f16":
-                        quant_output = output_file.replace(".gguf", f"-{quant_type}.gguf")
-                        quantize_cmd = [
-                            os.path.join("llama.cpp", "quantize"),
-                            output_file,
-                            quant_output,
-                            quant_type
-                        ]
-                        subprocess.run(quantize_cmd, check=True)
-                        # Replace the output file with the quantized version
-                        os.remove(output_file)
-                        os.rename(quant_output, output_file)
-                    # Upload to HF
-                    progress(progress_value + (progress_step * 0.7), f"Uploading {quant_name}...")
-                    api.upload_file(
-                        path_or_fileobj=output_file,
-                        path_in_repo=f"{repo_name}-{quant_name}.gguf",
-                        repo_id=target_repo_id,
-                        commit_message=f"Add {quant_name} quantized version"
-                    )
-                    success_count += 1
-                except Exception as e:
-                    print(f"Error processing {quant_name}: {str(e)}")
-            progress(1.0, "Completed!")
-            if success_count > 0:
-                return f"Successfully created {success_count} quantized versions in {target_repo_id}"
             else:
-                return "Failed to create any quantized versions."
-        except Exception as e:
-            return f"Error: {str(e)}"
-# Webhook handler - this will be called when the repo is updated
-def setup_webhook(repo_id, target_repo=None, webhook_url=None):
-    """Set up a webhook for repository updates"""
-    if not hf_token:
-        return "HF_TOKEN not set. Cannot set up webhook."
-    if not target_repo:
-        target_repo = f"{repo_id}-gguf"
-    # Create the webhook URL for this space
-    if not webhook_url:
-        # Get the current space name from HF_SPACE_ID
-        space_id = os.environ.get("HF_SPACE_ID")
-        if not space_id:
-            return "Cannot determine current Space ID. Please specify webhook_url manually."
-        webhook_url = f"https://huggingface.co/spaces/{space_id}/webhook"
-    try:
-        # Add webhook to the source repository
-        api.add_webhook(
-            repo_id=repo_id,
-            webhook_url=webhook_url,
-            webhook_type="repo-update"
-        )
-        return f"Webhook set up for {repo_id} -> {webhook_url}"
-    except Exception as e:
-        return f"Error setting up webhook: {str(e)}"
-# Create Gradio interface
-with gr.Blocks() as interface:
-    gr.Markdown("# GGUF Quantizer (Free Tier)")
-    gr.Markdown("Automatically create GGUF quantized versions of Hugging Face models")
-    with gr.Tab("Quantize Model"):
-        with gr.Row():
-            repo_id = gr.Textbox(label="Model Repository ID (e.g., 'mistralai/Mistral-7B-v0.1')")
-        with gr.Row():
-            q4_k_m = gr.Checkbox(label="Q4_K_M (4-bit, balanced quality/size)", value=True)
-            q5_k_m = gr.Checkbox(label="Q5_K_M (5-bit, higher quality)", value=False)
-            q8_0 = gr.Checkbox(label="Q8_0 (8-bit, highest quality)", value=False)
-        quantize_btn = gr.Button("Quantize Model")
-        output = gr.Textbox(label="Status")
-        def process_quantize(repo_id, q4_k_m, q5_k_m, q8_0, progress=gr.Progress()):
-            selected_types = {}
-            if q4_k_m:
-                selected_types["Q4_K_M"] = "q4_k_m"
-            if q5_k_m:
-                selected_types["Q5_K_M"] = "q5_k_m"
-            if q8_0:
-                selected_types["Q8_0"] = "q8_0"
-            if not selected_types:
-                return "Please select at least one quantization type"
-            return quantize_model(repo_id, selected_types, progress)
-        quantize_btn.click(
-            process_quantize,
-            inputs=[repo_id, q4_k_m, q5_k_m, q8_0],
-            outputs=output
-        )
-    with gr.Tab("Setup Webhook"):
-        gr.Markdown("""
-        ## Set up automatic quantization
-        This will set up a webhook to trigger quantization whenever the source repository is updated.
-        Note: This requires HF_TOKEN to be set in Space secrets.
-        """)
-        webhook_repo_id = gr.Textbox(label="Source Repository ID")
-        webhook_btn = gr.Button("Set Up Webhook")
-        webhook_output = gr.Textbox(label="Webhook Status")
-        webhook_btn.click(
-            setup_webhook,
-            inputs=[webhook_repo_id],
-            outputs=webhook_output
-        )
-    with gr.Tab("Instructions"):
-        gr.Markdown("""
-        ## Instructions
-        ### How to use this Space:
-        1. **Manual Quantization**: Enter a model repository ID and select quantization types
-        2. **Automatic Quantization**: Set up a webhook to trigger quantization when the model is updated
-        ### Adding HF_TOKEN to Space Secrets:
-        1. Go to your Space Settings
-        2. Click on "Repository Secrets"
-        3. Add a new secret with key `HF_TOKEN` and your Hugging Face API token as value
-        ### Limitations (Free Tier):
-        - Limited memory: Very large models may fail to process
-        - Limited storage: Files are processed in streaming mode, but temp files still need space
-        - Limited compute: Quantization may take longer than on paid tiers
-        - Jobs might be interrupted if they run too long
-        """)
-# Start Flask server to handle webhooks
-from flask import Flask, request, jsonify
-import threading
-app = Flask(__name__)
-@app.route('/webhook', methods=['POST'])
-def handle_webhook():
-    try:
-        payload = request.json
-        # Check if this is a repo update event
-        event_type = payload.get('event')
-        if event_type == 'repo-update':
-            repo_id = payload.get('repo', {}).get('name')
-            if repo_id:
-                # Run quantization in background
-                threading.Thread(target=lambda: quantize_model(
-                    repo_id,
-                    {"Q4_K_M": "q4_k_m"}  # Default to just Q4_K_M to save resources
-                )).start()
-                return jsonify({"status": "quantization scheduled"})
-        return jsonify({"status": "event ignored"})
-    except Exception as e:
-        return jsonify({"status": "error", "message": str(e)})
-# Launch both the Gradio and Flask apps
-import nest_asyncio
-import uvicorn
-from threading import Thread
-nest_asyncio.apply()
-# Launch the Gradio interface
-def launch_gradio():
-    interface.launch(debug=False)
-# Launch the Flask webhook handler
-def launch_flask():
-    uvicorn.run(app, host="0.0.0.0", port=7860)
-# Use the main Gradio interface as primary
 if __name__ == "__main__":
-    Thread(target=launch_flask).start()
-    launch_gradio()

 import gradio as gr
+import json
+import os
 import time
+from monitor import setup_monitor, check_repo_updates, get_status
 import threading
+# Initialize status
+# Seed status.json with default values on first start; an existing file is left untouched.
+if not os.path.exists("status.json"):
+    status = {
+        "last_checked": None,
+        "is_up_to_date": False,
+        "current_quantization": None,
+        "completed_quantizations": [],
+        "failed_quantizations": [],
+        "progress": 0,
+        "status_message": "Initializing...",
+        "out_of_memory": False,
+        "last_successful_quant": None
+    }
+    with open("status.json", "w") as f:
+        json.dump(status, f)
+# Start the monitoring thread
+# setup_monitor never returns (it sleeps in a loop), so it runs on a daemon thread
+# that does not keep the process alive at shutdown.
+monitor_thread = threading.Thread(target=setup_monitor, daemon=True)
+monitor_thread.start()
+# Define the Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# Ursa Minor Quantization Monitor")
+    gr.Markdown("This Space automatically generates quantized versions of the [Sculptor-AI/Ursa_Minor](https://huggingface.co/Sculptor-AI/Ursa_Minor) model.")
+    with gr.Row():
+        with gr.Column():
+            status_indicator = gr.Markdown("Loading status...")
+            last_checked = gr.Markdown("Last checked: Never")
+        with gr.Column():
+            check_button = gr.Button("Check for updates now")
+    with gr.Row():
+        # gr.Progress is a per-event progress tracker, not a layout component, so it
+        # cannot be placed in a Row or listed in `outputs`. A read-only slider on a
+        # 0-100 scale displays the percentage stored in status.json instead.
+        progress_bar = gr.Slider(
+            minimum=0,
+            maximum=100,
+            value=0,
+            step=1,
+            label="Quantization Progress (%)",
+            interactive=False
+        )
+    with gr.Row():
+        completed_box = gr.Dataframe(
+            headers=["Quantization", "Size (GB)", "Status", "Notes"],
+            datatype=["str", "str", "str", "str"],
+            label="Quantization Status"
+        )
+    # Function to update the UI
+    def update_ui():
+        """Read status.json and build fresh values for every output component.
+        Returns:
+            tuple: (status markdown, last-checked markdown, progress percentage
+            on a 0-100 scale, rows for the quantization status table)
+        """
+        status = get_status()
+        # Update status indicator
+        if status["out_of_memory"]:
+            # The monitor appends the quantization that hit the memory error to
+            # failed_quantizations and then stops, so the last entry is the one to
+            # report (last_successful_quant is the last one that *succeeded*).
+            failed = status["failed_quantizations"]
+            oom_quant = failed[-1] if failed else "a quantization"
+            status_text = f"⚠️ **Out of Memory Error** - The Space ran out of memory while processing {oom_quant}. Try using a paid compute tier for larger models."
+        elif status["is_up_to_date"]:
+            status_text = "✅ **Up to date** - All quantizations are complete."
+        elif status["current_quantization"]:
+            status_text = f"🔄 **Processing** - Currently quantizing {status['current_quantization']}."
+        else:
+            status_text = "⏳ **Waiting** - Checking for updates..."
+        # Update last checked time
+        last_checked_text = f"Last checked: {status['last_checked'] if status['last_checked'] else 'Never'}"
+        # Progress is stored as a 0-100 percentage, which matches the slider's range
+        progress_value = status["progress"] or 0
+        # Update quantization status table
+        quantization_types = [
+            ["GGUF Q2_K", "0.8", "", ""],
+            ["GGUF Q3_K_S", "0.9", "", ""],
+            ["GGUF Q3_K_M", "0.9", "", "lower quality"],
+            ["GGUF Q3_K_L", "1.0", "", ""],
+            ["GGUF IQ4_XS", "1.0", "", ""],
+            ["GGUF Q4_K_S", "1.0", "", "fast, recommended"],
+            ["GGUF Q4_K_M", "1.1", "", "fast, recommended"],
+            ["GGUF Q5_K_S", "1.2", "", ""],
+            ["GGUF Q5_K_M", "1.2", "", ""],
+            ["GGUF Q6_K", "1.4", "", "very good quality"],
+            ["GGUF Q8_0", "1.7", "", "fast, best quality"],
+            ["GGUF f16", "3.2", "", "16 bpw, overkill"]
+        ]
+        # Update status for each quantization
+        for quant in quantization_types:
+            quant_name = quant[0]
+            if quant_name in status["completed_quantizations"]:
+                quant[2] = "✅ Complete"
+            elif quant_name in status["failed_quantizations"]:
+                quant[2] = "❌ Failed"
+            elif quant_name == status["current_quantization"]:
+                quant[2] = "🔄 In progress"
+            else:
+                quant[2] = "⏳ Waiting"
+        return status_text, last_checked_text, progress_value, quantization_types
+    # Function to handle manual update check
+    def check_updates():
+        """Trigger a forced update check in the background and refresh the UI.
+        Returns:
+            tuple: the same values as update_ui()
+        """
+        # check_repo_updates(force=True) runs the whole quantization queue before it
+        # returns, so it must not run inside the click handler. Start it on a daemon
+        # thread, and only when no quantization is active so that repeated clicks do
+        # not launch overlapping runs. The periodic refresh picks up its progress.
+        if not get_status()["current_quantization"]:
+            threading.Thread(
+                target=check_repo_updates,
+                kwargs={"force": True},
+                daemon=True
+            ).start()
+        return update_ui()
+    # Connect buttons and set up timed refresh
+    check_button.click(check_updates, outputs=[status_indicator, last_checked, progress_bar, completed_box])
+    # Auto-refresh every 10 seconds
+    demo.load(update_ui, outputs=[status_indicator, last_checked, progress_bar, completed_box], every=10)
+# Launch the app
 if __name__ == "__main__":
+    demo.launch()

monitor.py ADDED Viewed

	@@ -0,0 +1,259 @@

+import json
+import os
+import time
+import requests
+from datetime import datetime
+from apscheduler.schedulers.background import BackgroundScheduler
+from quantize import quantize_model
+# Quantization variants, listed from the smallest output file to the largest
+QUANTIZATION_TYPES = [
+    f"GGUF {variant}"
+    for variant in (
+        "Q2_K",
+        "Q3_K_S",
+        "Q3_K_M",
+        "Q3_K_L",
+        "IQ4_XS",
+        "Q4_K_S",
+        "Q4_K_M",
+        "Q5_K_S",
+        "Q5_K_M",
+        "Q6_K",
+        "Q8_0",
+        "f16",
+    )
+]
+# llama.cpp quantization parameter for each variant: the display name without its
+# "GGUF " prefix, lower-cased (e.g. "GGUF Q4_K_M" -> "q4_k_m")
+QUANT_PARAMS = {
+    display_name: display_name.split(" ", 1)[1].lower()
+    for display_name in QUANTIZATION_TYPES
+}
+# Source and destination repositories
+SOURCE_REPO = "Sculptor-AI/Ursa_Minor"
+DESTINATION_REPO = "Sculptor-AI/Ursa_Minor_Quantized"  # This should be created in advance
+def get_status():
+    """Load the status dictionary stored in status.json.
+    Returns:
+        dict: the parsed file contents, or a default status whose
+        ``status_message`` is "Error reading status" when the file cannot be
+        opened or parsed (the error is printed, not raised).
+    """
+    try:
+        with open("status.json", "r") as status_file:
+            loaded = json.load(status_file)
+    except Exception as err:
+        print(f"Error reading status: {err}")
+        # Fall back to a neutral status so callers can always index the usual keys
+        loaded = dict(
+            last_checked=None,
+            is_up_to_date=False,
+            current_quantization=None,
+            completed_quantizations=[],
+            failed_quantizations=[],
+            progress=0,
+            status_message="Error reading status",
+            out_of_memory=False,
+            last_successful_quant=None,
+        )
+    return loaded
+def update_status(updates):
+    """Merge ``updates`` into the stored status and save it back to status.json.
+    Args:
+        updates: dict of status keys to overwrite or add
+    Any error is printed rather than raised, so a failed write never interrupts
+    the caller.
+    """
+    try:
+        status = get_status()
+        status.update(updates)
+        # Write to a sibling file and swap it in with os.replace. Opening
+        # status.json with "w" truncates it first, so a reader running at that
+        # moment would fail to parse it; get_status() would then hand back its
+        # default status, and the next update would persist those defaults and
+        # lose the completed/failed lists.
+        temp_path = "status.json.tmp"
+        with open(temp_path, "w") as f:
+            json.dump(status, f)
+        os.replace(temp_path, "status.json")
+    except Exception as e:
+        print(f"Error updating status: {e}")
+def get_repo_last_modified(repo_id):
+    """Get the last modified date of the repository
+    Args:
+        repo_id: Hugging Face model repo ID, e.g. "owner/name"
+    Returns:
+        The ``lastModified`` field of the Hub API response, or None when the
+        request fails, times out, or the field is missing.
+    """
+    try:
+        url = f"https://huggingface.co/api/models/{repo_id}"
+        # requests applies no timeout by default; without one a stalled connection
+        # would block the monitor thread (and every later check) indefinitely.
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()
+        data = response.json()
+        return data.get("lastModified")
+    except Exception as e:
+        print(f"Error checking repository: {e}")
+        return None
+def check_repo_updates(force=False):
+    """Check if the source repository has been updated and start quantization if needed
+    Args:
+        force: when True, skip the "already processing" and "out of memory"
+            guards and reprocess every quantization even if the source
+            repository is unchanged.
+    The call blocks until the quantization run it starts has finished.
+    """
+    now = datetime.now().isoformat()
+    update_status({"last_checked": now})
+    print(f"Checking for updates to {SOURCE_REPO}...")
+    # Get current status
+    status = get_status()
+    # If we're already processing, don't check for updates
+    if status["current_quantization"] and not force:
+        print("Already processing, skipping update check")
+        return
+    # If we had an out of memory error and this isn't a forced check, skip
+    if status["out_of_memory"] and not force:
+        print("Previous run had an out of memory error, skipping automatic update check")
+        return
+    # Check if the source repo has been updated
+    last_modified = get_repo_last_modified(SOURCE_REPO)
+    if not last_modified:
+        print("Couldn't get repository information, skipping update")
+        return
+    # Determine if we need to process quantizations
+    if force:
+        print("Forced update check, processing quantizations")
+    elif status.get("source_last_modified") != last_modified:
+        print("Source repository has been updated, processing quantizations")
+    else:
+        print("Source repository hasn't changed, no processing needed")
+        # Check if all quantizations are complete
+        all_completed = all(quant in status["completed_quantizations"] for quant in QUANTIZATION_TYPES)
+        if all_completed:
+            update_status({"is_up_to_date": True})
+        return
+    # Reset status for a new processing run. The source timestamp is recorded for
+    # forced runs as well; otherwise the next scheduled check would still see the
+    # repository as "changed" and redo every quantization.
+    update_status({
+        "source_last_modified": last_modified,
+        "is_up_to_date": False,
+        "progress": 0,
+        "out_of_memory": False,
+        "status_message": "Starting quantization process...",
+        "completed_quantizations": [],
+        "failed_quantizations": [],
+        "current_quantization": None
+    })
+    # Start the first quantization
+    start_next_quantization()
+def start_next_quantization():
+    """Work through the quantization queue, one type at a time.
+    Each pass picks the first entry of QUANTIZATION_TYPES that is neither
+    completed nor failed, runs it, records the outcome in the status file and
+    moves on. Processing stops when nothing is left, when the status already
+    reports an out-of-memory condition, or when quantize_model raises
+    MemoryError.
+    """
+    while True:
+        status = get_status()
+        # A recorded out-of-memory condition halts the queue
+        if status["out_of_memory"]:
+            print("Previous run had an out of memory error, not starting next quantization")
+            return
+        # Pick the first type that has not been attempted yet
+        handled = set(status["completed_quantizations"]) | set(status["failed_quantizations"])
+        next_quant = next((name for name in QUANTIZATION_TYPES if name not in handled), None)
+        if not next_quant:
+            # Nothing left to do
+            update_status({
+                "is_up_to_date": True,
+                "current_quantization": None,
+                "progress": 100,
+                "status_message": "All quantizations complete"
+            })
+            print("All quantizations complete!")
+            return
+        # Mark the chosen type as in progress
+        update_status({
+            "current_quantization": next_quant,
+            "progress": 0,
+            "status_message": f"Starting {next_quant} quantization..."
+        })
+        print(f"Starting quantization: {next_quant}")
+        try:
+            succeeded = quantize_model(
+                SOURCE_REPO,
+                DESTINATION_REPO,
+                next_quant,
+                QUANT_PARAMS[next_quant]
+            )
+            if succeeded:
+                print(f"Quantization {next_quant} completed successfully")
+                done_so_far = get_status()["completed_quantizations"]
+                done_so_far.append(next_quant)
+                update_status({
+                    "completed_quantizations": done_so_far,
+                    "current_quantization": None,
+                    "last_successful_quant": next_quant,
+                    "progress": 100,
+                    "status_message": f"Completed {next_quant} quantization"
+                })
+            else:
+                print(f"Quantization {next_quant} failed")
+                failed_so_far = get_status()["failed_quantizations"]
+                failed_so_far.append(next_quant)
+                update_status({
+                    "failed_quantizations": failed_so_far,
+                    "current_quantization": None,
+                    "progress": 0,
+                    "status_message": f"Failed {next_quant} quantization"
+                })
+            # Either way, the next loop pass moves on to the following type
+        except MemoryError:
+            # Record the failure, flag the out-of-memory state and stop the queue
+            print(f"Out of memory error during {next_quant} quantization")
+            failed_so_far = get_status()["failed_quantizations"]
+            failed_so_far.append(next_quant)
+            update_status({
+                "failed_quantizations": failed_so_far,
+                "current_quantization": None,
+                "out_of_memory": True,
+                "progress": 0,
+                "status_message": f"Out of memory during {next_quant} quantization"
+            })
+            return
+def setup_monitor():
+    """Start hourly update checks, run one check right away, then block forever.
+    The function only returns when KeyboardInterrupt or SystemExit is raised in
+    the calling thread, in which case the scheduler is shut down first.
+    """
+    background_scheduler = BackgroundScheduler()
+    # Re-check the source repository once per hour
+    background_scheduler.add_job(check_repo_updates, trigger='interval', hours=1)
+    background_scheduler.start()
+    # Run the first check immediately instead of waiting for the first interval
+    check_repo_updates()
+    try:
+        # Sleep in a loop so the calling thread stays alive
+        while True:
+            time.sleep(60)
+    except (KeyboardInterrupt, SystemExit):
+        background_scheduler.shutdown()

quantize.py ADDED Viewed

	@@ -0,0 +1,197 @@

+import os
+import subprocess
+import tempfile
+import time
+import json
+import shutil
+from huggingface_hub import HfApi, Repository, snapshot_download
+from tqdm import tqdm
+def update_progress(progress):
+    """Update the progress in the status file
+    Args:
+        progress: value stored under the "progress" key (callers in this file
+            pass a percentage between 0 and 100)
+    Errors (missing or unparsable status.json, failed write) are printed and
+    swallowed so that progress reporting never aborts a quantization.
+    """
+    try:
+        with open("status.json", "r") as f:
+            status = json.load(f)
+        status["progress"] = progress
+        # Write a sibling file and swap it in with os.replace: opening status.json
+        # with "w" would truncate it first, and a reader running at that moment
+        # would see an empty or partial file.
+        temp_path = "status.json.tmp"
+        with open(temp_path, "w") as f:
+            json.dump(status, f)
+        os.replace(temp_path, "status.json")
+    except Exception as e:
+        print(f"Error updating progress: {e}")
+def _set_status_message(message):
+    """Store ``message`` under the "status_message" key of status.json.
+    Errors are not caught here; quantize_model's outer handler reports them and
+    turns them into a failed run.
+    """
+    with open("status.json", "r") as f:
+        status = json.load(f)
+    status["status_message"] = message
+    # Swap the new file in atomically so readers never see a truncated status.json
+    with open("status.json.tmp", "w") as f:
+        json.dump(status, f)
+    os.replace("status.json.tmp", "status.json")
+def _progress_from_output(line):
+    """Map a "Quantizing tensors: <current>/<total>" output line to a 30-90 progress value.
+    Returns None when the line carries no progress information or cannot be parsed.
+    """
+    if "Quantizing tensors" not in line or ":" not in line:
+        return None
+    try:
+        progress_str = line.split(":")[1].strip()
+        if "/" not in progress_str:
+            return None
+        current, total = map(int, progress_str.split("/"))
+        # Quantization covers the 30%..90% span of the overall progress
+        return 30 + int(60 * current / total)
+    except Exception as e:
+        print(f"Error parsing progress: {e}")
+        return None
+def quantize_model(source_repo, dest_repo, quant_name, quant_type):
+    """
+    Download the model, quantize it, and upload to the destination repo
+    Args:
+        source_repo: HF repo ID for the source model
+        dest_repo: HF repo ID for the destination repo
+        quant_name: Name of the quantization (for display)
+        quant_type: llama.cpp quantization parameter
+    Returns:
+        bool: True if successful, False otherwise
+    Raises:
+        MemoryError: if memory runs out in this process, or if the quantization
+            subprocess is terminated by SIGKILL
+    """
+    # Fail fast: the upload at the end cannot work without a token, so check it
+    # before spending time and memory on the download and the quantization.
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        print("HF_TOKEN environment variable not set")
+        return False
+    try:
+        update_progress(5)
+        # Create temporary directories
+        with tempfile.TemporaryDirectory() as temp_dir:
+            model_dir = os.path.join(temp_dir, "model")
+            output_dir = os.path.join(temp_dir, "output")
+            os.makedirs(output_dir, exist_ok=True)
+            _set_status_message(f"Downloading {source_repo}...")
+            # Download the model
+            print(f"Downloading {source_repo}...")
+            snapshot_download(
+                repo_id=source_repo,
+                local_dir=model_dir,
+                local_dir_use_symlinks=False
+            )
+            update_progress(30)
+            # Find the model file (assuming it's a .bin file)
+            # NOTE(review): only top-level *.bin files are considered; a repository that
+            # ships only other weight formats ends up in the "No model file found" branch.
+            # Sorting makes the choice deterministic when several files match.
+            model_files = sorted(f for f in os.listdir(model_dir) if f.endswith(".bin"))
+            if not model_files:
+                print("No model file found")
+                return False
+            model_file = os.path.join(model_dir, model_files[0])
+            output_file = os.path.join(output_dir, f"Ursa_Minor-{quant_type}.gguf")
+            _set_status_message(f"Quantizing to {quant_name}...")
+            # Run quantization
+            print(f"Quantizing to {quant_type}...")
+            # NOTE(review): confirm that the installed llama-cpp-python really provides a
+            # runnable `llama_cpp.quantize` module and accepts this argument layout.
+            command = [
+                "python", "-m", "llama_cpp.quantize",
+                model_file,
+                output_file,
+                f"--{quant_type}"
+            ]
+            try:
+                # The context manager closes the pipe and waits for the process on exit
+                with subprocess.Popen(
+                    command,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    universal_newlines=True
+                ) as process:
+                    # Echo the output and translate progress lines into status updates
+                    for line in process.stdout:
+                        print(line, end="")
+                        progress = _progress_from_output(line)
+                        if progress is not None:
+                            update_progress(progress)
+                # A child killed for using too much memory never raises MemoryError in
+                # this process; it shows up as termination by SIGKILL (return code -9).
+                # NOTE(review): presumably the OOM killer is the only source of SIGKILL
+                # in this environment - confirm.
+                if process.returncode == -9:
+                    raise MemoryError(f"quantization subprocess for {quant_name} was killed")
+                if process.returncode != 0:
+                    print(f"Quantization failed with return code {process.returncode}")
+                    return False
+            except MemoryError:
+                print("Out of memory during quantization")
+                raise
+            except Exception as e:
+                print(f"Error during quantization: {e}")
+                return False
+            update_progress(90)
+            # Upload to Hugging Face
+            print(f"Uploading {quant_name} to {dest_repo}...")
+            _set_status_message(f"Uploading {quant_name} to Hugging Face...")
+            api = HfApi(token=token)
+            # Create the repo if it doesn't exist
+            try:
+                api.create_repo(
+                    repo_id=dest_repo,
+                    exist_ok=True,
+                    private=False
+                )
+            except Exception as e:
+                print(f"Error creating repo: {e}")
+                return False
+            # Clone the repo
+            # NOTE(review): the clone may also fetch every previously uploaded model file
+            # into the temporary directory - verify disk usage on the target hardware.
+            repo_dir = os.path.join(temp_dir, "repo")
+            repo = Repository(
+                local_dir=repo_dir,
+                clone_from=dest_repo,
+                token=token
+            )
+            # Copy the quantized model to the repo
+            output_file_name = os.path.basename(output_file)
+            shutil.copy(output_file, os.path.join(repo_dir, output_file_name))
+            # Create or update README.md
+            readme_path = os.path.join(repo_dir, "README.md")
+            if os.path.exists(readme_path):
+                with open(readme_path, "r", encoding="utf-8") as f:
+                    readme_content = f.read()
+            else:
+                readme_content = f"# Ursa Minor Quantized Models\n\nThis repository contains quantized versions of the [Sculptor-AI/Ursa_Minor](https://huggingface.co/Sculptor-AI/Ursa_Minor) model.\n\n## Available Quantizations\n\n"
+            # Add the quantization entry to the README unless it is already listed
+            quant_entry = f"- **{quant_name}**: [{output_file_name}](/{dest_repo}/blob/main/{output_file_name})\n"
+            if quant_entry not in readme_content:
+                readme_content += quant_entry
+                with open(readme_path, "w", encoding="utf-8") as f:
+                    f.write(readme_content)
+            # Commit and push
+            repo.git_add()
+            repo.git_commit(f"Add {quant_name} quantization")
+            repo.git_push()
+            update_progress(100)
+            return True
+    except MemoryError:
+        # Let memory errors reach the caller, which records the out-of-memory state
+        raise
+    except Exception as e:
+        print(f"Error in quantization process: {e}")
+        return False

requirements.txt CHANGED Viewed

@@ -1,5 +1,6 @@
-gradio>=3.41.0
-huggingface_hub>=0.16.0
-flask>=2.0.0
-nest_asyncio>=1.5.6
-uvicorn>=0.22.0

+gradio>=3.40.1
+huggingface_hub>=0.16.4
+requests>=2.31.0
+apscheduler>=3.10.1
+tqdm>=4.66.1
+llama-cpp-python>=0.2.10