Spaces:
Running
Running
fix
Browse files- Dockerfile +24 -0
- app.py +474 -566
- groups_merged.txt +0 -0
- requirements.txt +3 -4
- setup.sh +0 -48
- start.sh +31 -0
Dockerfile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y \
|
| 7 |
+
build-essential \
|
| 8 |
+
cmake \
|
| 9 |
+
git \
|
| 10 |
+
wget \
|
| 11 |
+
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
| 12 |
+
|
| 13 |
+
# Copy requirements and install Python dependencies
|
| 14 |
+
COPY requirements.txt .
|
| 15 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
+
|
| 17 |
+
# Copy the rest of the code
|
| 18 |
+
COPY . .
|
| 19 |
+
|
| 20 |
+
# Make start script executable
|
| 21 |
+
RUN chmod +x start.sh
|
| 22 |
+
|
| 23 |
+
# Run the start script
|
| 24 |
+
CMD ["./start.sh"]
|
app.py
CHANGED
|
@@ -4,639 +4,547 @@ import signal
|
|
| 4 |
import time
|
| 5 |
import json
|
| 6 |
from datetime import datetime
|
|
|
|
| 7 |
import threading
|
| 8 |
-
import
|
|
|
|
|
|
|
| 9 |
import gradio as gr
|
| 10 |
-
from huggingface_hub import HfApi, login, whoami
|
| 11 |
-
from pathlib import Path
|
| 12 |
-
import shutil
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
# Restored full quantization set, sorted from smallest to largest
|
| 25 |
QUANT_CONFIGS = [
|
| 26 |
-
{"
|
| 27 |
-
{"
|
| 28 |
-
{"
|
| 29 |
-
{"
|
| 30 |
-
{"
|
| 31 |
-
{"
|
| 32 |
-
{"
|
| 33 |
-
{"
|
| 34 |
-
{"
|
| 35 |
-
{"
|
| 36 |
-
{"
|
| 37 |
-
{"
|
| 38 |
]
|
| 39 |
|
| 40 |
-
#
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
| 43 |
"last_commit_hash": None,
|
| 44 |
-
"is_up_to_date": True,
|
| 45 |
-
"is_processing": False,
|
| 46 |
"current_quant": None,
|
|
|
|
| 47 |
"progress": 0,
|
| 48 |
-
"
|
| 49 |
-
"
|
| 50 |
-
"failed_quants": [],
|
| 51 |
-
"out_of_memory": False,
|
| 52 |
-
"last_error": None,
|
| 53 |
-
"status_message": "Ready to check for updates"
|
| 54 |
}
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
-
# Helper functions
|
| 70 |
-
def save_state():
|
| 71 |
-
with open("state.json", "w") as f:
|
| 72 |
-
# Create a serializable copy of the state
|
| 73 |
-
serializable_state = state.copy()
|
| 74 |
-
serializable_state["last_checked"] = str(serializable_state["last_checked"]) if serializable_state["last_checked"] else None
|
| 75 |
-
json.dump(serializable_state, f)
|
| 76 |
-
|
| 77 |
-
def load_state():
|
| 78 |
-
global state
|
| 79 |
try:
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
def
|
|
|
|
| 91 |
try:
|
| 92 |
-
|
| 93 |
-
return
|
| 94 |
except Exception as e:
|
| 95 |
-
|
| 96 |
return None
|
| 97 |
|
| 98 |
def check_for_updates():
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
state["last_commit_hash"] = latest_commit
|
| 107 |
-
state["is_up_to_date"] = False
|
| 108 |
-
state["status_message"] = f"Updates detected in {SOURCE_REPO}. Ready to generate quantizations."
|
| 109 |
-
else:
|
| 110 |
-
state["is_up_to_date"] = True
|
| 111 |
-
state["status_message"] = f"No updates detected in {SOURCE_REPO}. Last checked: {state['last_checked'].strftime('%Y-%m-%d %H:%M:%S')}"
|
| 112 |
|
| 113 |
-
save_state()
|
| 114 |
-
return state["status_message"]
|
| 115 |
-
|
| 116 |
-
def download_model():
|
| 117 |
try:
|
| 118 |
-
#
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
shutil.rmtree(os.path.join(MODEL_CACHE_DIR, os.path.basename(SOURCE_REPO)))
|
| 124 |
-
|
| 125 |
-
# Get model repo information to find the smallest safetensors file
|
| 126 |
-
logger.info(f"Getting repository information for {SOURCE_REPO}")
|
| 127 |
-
files = hf_api.list_repo_files(repo_id=SOURCE_REPO)
|
| 128 |
-
|
| 129 |
-
# Filter for safetensors files (which are the model weights)
|
| 130 |
-
safetensors_files = [f for f in files if f.endswith(".safetensors")]
|
| 131 |
-
|
| 132 |
-
if not safetensors_files:
|
| 133 |
-
raise Exception(f"No safetensors files found in {SOURCE_REPO}")
|
| 134 |
-
|
| 135 |
-
# Download only required files instead of the entire repo to save space
|
| 136 |
-
# This includes model config and one weights file
|
| 137 |
-
required_files = [
|
| 138 |
-
"config.json",
|
| 139 |
-
"tokenizer.json",
|
| 140 |
-
"tokenizer_config.json",
|
| 141 |
-
safetensors_files[0] # Just take the first weights file
|
| 142 |
-
]
|
| 143 |
-
|
| 144 |
-
# Create the model directory
|
| 145 |
-
model_dir = os.path.join(MODEL_CACHE_DIR, os.path.basename(SOURCE_REPO))
|
| 146 |
-
os.makedirs(model_dir, exist_ok=True)
|
| 147 |
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
if file in files:
|
| 151 |
-
logger.info(f"Downloading {file}")
|
| 152 |
-
hf_api.hf_hub_download(
|
| 153 |
-
repo_id=SOURCE_REPO,
|
| 154 |
-
filename=file,
|
| 155 |
-
local_dir=model_dir,
|
| 156 |
-
token=HF_TOKEN
|
| 157 |
-
)
|
| 158 |
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
except Exception as e:
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
|
|
| 164 |
|
| 165 |
-
def
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
return
|
| 170 |
-
|
| 171 |
-
state["is_processing"] = True
|
| 172 |
-
state["progress"] = 0
|
| 173 |
-
state["completed_quants"] = []
|
| 174 |
-
state["failed_quants"] = []
|
| 175 |
-
state["out_of_memory"] = False
|
| 176 |
-
state["last_error"] = None
|
| 177 |
-
state["status_message"] = "Starting quantization process..."
|
| 178 |
-
|
| 179 |
-
# Start the processing in a separate thread
|
| 180 |
-
thread = threading.Thread(target=quantization_worker)
|
| 181 |
-
thread.daemon = True
|
| 182 |
-
thread.start()
|
| 183 |
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
return
|
| 196 |
-
|
| 197 |
-
# Create temporary output directory
|
| 198 |
-
os.makedirs(TEMP_DIR, exist_ok=True)
|
| 199 |
-
|
| 200 |
-
# Get model name from the source repo
|
| 201 |
-
model_name = os.path.basename(SOURCE_REPO).lower()
|
| 202 |
-
|
| 203 |
-
# Process each quantization configuration - we'll do one at a time to save memory
|
| 204 |
-
total_quants = len(QUANT_CONFIGS)
|
| 205 |
-
|
| 206 |
-
for i, quant_config in enumerate(QUANT_CONFIGS):
|
| 207 |
-
if state["out_of_memory"]:
|
| 208 |
-
# Skip further processing if we've hit memory limits
|
| 209 |
-
break
|
| 210 |
-
|
| 211 |
-
quant_name = quant_config["name"]
|
| 212 |
-
state["current_quant"] = quant_name
|
| 213 |
-
state["progress"] = (i / total_quants) * 100
|
| 214 |
-
state["status_message"] = f"Processing {quant_name} quantization ({i+1}/{total_quants})"
|
| 215 |
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
|
| 230 |
-
#
|
|
|
|
| 231 |
try:
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
logger.info(f"Available disk space: {free_space_gb:.2f} GB")
|
| 235 |
|
| 236 |
-
#
|
| 237 |
-
|
| 238 |
-
logger.warning(f"Not enough disk space for {quant_name} quantization. Need {quant_config['size_gb'] * 1.5:.2f} GB, have {free_space_gb:.2f} GB")
|
| 239 |
-
state["failed_quants"].append(f"{quant_name} (disk space)")
|
| 240 |
-
continue
|
| 241 |
-
except Exception as e:
|
| 242 |
-
logger.warning(f"Could not check disk space: {e}")
|
| 243 |
-
|
| 244 |
-
# Run the conversion+quantization in one step to save memory
|
| 245 |
-
# We'll use direct conversion to the target quantization format
|
| 246 |
-
logger.info(f"Converting and quantizing directly to {quant_name}")
|
| 247 |
-
|
| 248 |
-
# Command to convert and quantize in one step
|
| 249 |
-
quantize_cmd = [
|
| 250 |
-
"python",
|
| 251 |
-
"./llama.cpp/convert.py",
|
| 252 |
-
model_path,
|
| 253 |
-
"--outfile", quant_output_path,
|
| 254 |
-
"--outtype", quant_name.lower()
|
| 255 |
-
]
|
| 256 |
-
|
| 257 |
-
# Create a process for monitoring memory usage
|
| 258 |
-
quantize_process = subprocess.Popen(
|
| 259 |
-
quantize_cmd,
|
| 260 |
-
shell=False,
|
| 261 |
-
stdout=subprocess.PIPE,
|
| 262 |
-
stderr=subprocess.PIPE,
|
| 263 |
-
text=True
|
| 264 |
-
)
|
| 265 |
-
|
| 266 |
-
# Poll the process and monitor system resources
|
| 267 |
-
while quantize_process.poll() is None:
|
| 268 |
-
# Check if we're getting low on memory
|
| 269 |
try:
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
for line in meminfo.split('\n'):
|
| 276 |
-
if 'MemAvailable:' in line:
|
| 277 |
-
available_mem = int(line.split()[1]) / 1024 # Convert to MB
|
| 278 |
-
break
|
| 279 |
-
|
| 280 |
-
# If memory is critically low (less than 500MB), kill the process
|
| 281 |
-
if available_mem < 500:
|
| 282 |
-
logger.warning(f"Memory critically low ({available_mem:.2f} MB). Terminating quantization.")
|
| 283 |
-
quantize_process.terminate()
|
| 284 |
-
state["out_of_memory"] = True
|
| 285 |
-
state["failed_quants"].append(f"{quant_name} (OOM)")
|
| 286 |
-
break
|
| 287 |
except Exception as e:
|
| 288 |
-
|
|
|
|
|
|
|
| 289 |
|
| 290 |
-
#
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
quantize_process.wait(timeout=10)
|
| 299 |
-
except subprocess.TimeoutExpired:
|
| 300 |
-
quantize_process.kill()
|
| 301 |
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
else:
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
|
| 340 |
-
|
|
|
|
|
|
|
| 341 |
|
| 342 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
|
| 344 |
-
|
| 345 |
-
- **Approximate Size:** {quant_config['size_gb']} GB
|
| 346 |
-
- **Notes:** {quant_config['notes']}
|
| 347 |
-
- **Original Model:** [Sculptor-AI/Ursa_Minor](https://huggingface.co/{SOURCE_REPO})
|
| 348 |
-
- **Auto-generated by:** GGUF Quantizer Space
|
| 349 |
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
| 351 |
|
|
|
|
| 352 |
```bash
|
| 353 |
-
|
| 354 |
-
|
|
|
|
|
|
|
| 355 |
|
| 356 |
-
|
| 357 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
"""
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
|
|
|
|
|
|
|
|
|
| 390 |
except Exception as e:
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
upload_thread = threading.Thread(target=upload_file_with_timeout)
|
| 396 |
-
upload_thread.daemon = True
|
| 397 |
-
upload_thread.start()
|
| 398 |
|
| 399 |
-
#
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
time.sleep(10)
|
| 405 |
-
|
| 406 |
-
if upload_success[0]:
|
| 407 |
-
state["completed_quants"].append(quant_name)
|
| 408 |
-
logger.info(f"Successfully processed {quant_name} quantization")
|
| 409 |
-
else:
|
| 410 |
-
error_msg = str(upload_error[0]) if upload_error[0] else "Upload timed out"
|
| 411 |
-
logger.error(f"Failed to upload quantized model: {error_msg}")
|
| 412 |
-
state["failed_quants"].append(f"{quant_name} (upload failed)")
|
| 413 |
-
state["last_error"] = error_msg
|
| 414 |
-
except Exception as upload_error:
|
| 415 |
-
logger.error(f"Failed to upload quantized model: {upload_error}")
|
| 416 |
-
state["failed_quants"].append(f"{quant_name} (upload failed)")
|
| 417 |
-
state["last_error"] = str(upload_error)
|
| 418 |
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
except subprocess.TimeoutExpired as timeout_error:
|
| 426 |
-
logger.error(f"Timeout during {quant_name} quantization: {timeout_error}")
|
| 427 |
-
state["failed_quants"].append(f"{quant_name} (timeout)")
|
| 428 |
-
state["last_error"] = f"Quantization timed out after 30 minutes"
|
| 429 |
-
except Exception as e:
|
| 430 |
-
logger.error(f"Error processing {quant_name} quantization: {e}")
|
| 431 |
-
state["failed_quants"].append(quant_name)
|
| 432 |
-
state["last_error"] = str(e)
|
| 433 |
-
|
| 434 |
-
# Final cleanup
|
| 435 |
-
try:
|
| 436 |
-
shutil.rmtree(TEMP_DIR)
|
| 437 |
-
except Exception as e:
|
| 438 |
-
logger.warning(f"Error cleaning up temporary files: {e}")
|
| 439 |
-
|
| 440 |
-
# Clean up model cache to save space
|
| 441 |
-
try:
|
| 442 |
-
shutil.rmtree(MODEL_CACHE_DIR)
|
| 443 |
except Exception as e:
|
| 444 |
-
|
|
|
|
|
|
|
|
|
|
| 445 |
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
state["is_processing"] = False
|
| 449 |
-
|
| 450 |
-
if state["out_of_memory"]:
|
| 451 |
-
last_successful = state["completed_quants"][-1] if state["completed_quants"] else "None"
|
| 452 |
-
state["status_message"] = f"Quantization process stopped due to memory limitations after {last_successful}. Smaller quantizations completed successfully."
|
| 453 |
-
elif state["failed_quants"]:
|
| 454 |
-
state["status_message"] = f"Quantization process completed with some failures. {len(state['completed_quants'])}/{total_quants} quantizations were successful."
|
| 455 |
-
else:
|
| 456 |
-
state["status_message"] = f"Quantization process completed successfully. All {len(state['completed_quants'])}/{total_quants} quantizations were created."
|
| 457 |
-
|
| 458 |
-
except Exception as e:
|
| 459 |
-
logger.error(f"Error in quantization worker: {e}")
|
| 460 |
-
state["is_processing"] = False
|
| 461 |
-
state["last_error"] = str(e)
|
| 462 |
-
state["status_message"] = f"Error during quantization process: {str(e)}"
|
| 463 |
-
|
| 464 |
-
save_state()
|
| 465 |
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
|
| 472 |
with gr.Row():
|
| 473 |
with gr.Column(scale=2):
|
| 474 |
-
|
| 475 |
|
|
|
|
| 476 |
with gr.Row():
|
| 477 |
check_button = gr.Button("Check for Updates", variant="primary")
|
| 478 |
-
process_button = gr.Button("
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
value=state["progress"],
|
| 492 |
-
label="Progress",
|
| 493 |
-
interactive=False
|
| 494 |
-
)
|
| 495 |
-
|
| 496 |
-
current_task = gr.Markdown(value="")
|
| 497 |
-
|
| 498 |
-
with gr.Row():
|
| 499 |
-
completed_md = gr.Markdown(value="### Completed Quantizations")
|
| 500 |
-
completed_list = gr.Markdown(value="None")
|
| 501 |
-
|
| 502 |
-
with gr.Row():
|
| 503 |
-
failed_md = gr.Markdown(value="### Failed Quantizations")
|
| 504 |
-
failed_list = gr.Markdown(value="None")
|
| 505 |
-
|
| 506 |
-
with gr.Row():
|
| 507 |
-
error_md = gr.Markdown(value="### Last Error")
|
| 508 |
-
error_text = gr.Markdown(value="None")
|
| 509 |
-
|
| 510 |
-
with gr.Column(scale=1):
|
| 511 |
-
gr.Markdown("### Quantization Types")
|
| 512 |
-
quant_table = gr.DataFrame(
|
| 513 |
-
value=[[q["name"], f"{q['size_gb']} GB", q["notes"]] for q in QUANT_CONFIGS],
|
| 514 |
-
headers=["Type", "Size", "Notes"],
|
| 515 |
-
interactive=False
|
| 516 |
-
)
|
| 517 |
|
| 518 |
-
# Functions to update the UI
|
| 519 |
def update_status():
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
if state["completed_quants"]:
|
| 532 |
-
completed_items = []
|
| 533 |
-
for q in state["completed_quants"]:
|
| 534 |
-
model_name = os.path.basename(SOURCE_REPO).lower()
|
| 535 |
-
username = hf_api.whoami()["name"]
|
| 536 |
-
repo_id = f"{username}/{model_name}-{q.lower()}-gguf"
|
| 537 |
-
completed_items.append(f"- [{q}](https://huggingface.co/{repo_id})")
|
| 538 |
-
completed_text = "\n".join(completed_items)
|
| 539 |
-
|
| 540 |
-
failed_text = "None"
|
| 541 |
-
if state["failed_quants"]:
|
| 542 |
-
failed_items = []
|
| 543 |
-
for q in state["failed_quants"]:
|
| 544 |
-
if "(" in q: # Check if it has a reason in parentheses
|
| 545 |
-
name, reason = q.split(" (", 1)
|
| 546 |
-
reason = reason.rstrip(")")
|
| 547 |
-
failed_items.append(f"- {name} (Reason: {reason})")
|
| 548 |
-
else:
|
| 549 |
-
failed_items.append(f"- {q}")
|
| 550 |
-
failed_text = "\n".join(failed_items)
|
| 551 |
-
|
| 552 |
-
error_text = "None"
|
| 553 |
-
if state["last_error"]:
|
| 554 |
-
error_text = f"```\n{state['last_error']}\n```"
|
| 555 |
-
|
| 556 |
-
return [
|
| 557 |
-
status_text,
|
| 558 |
-
last_check_text,
|
| 559 |
-
up_to_date_text,
|
| 560 |
-
state["progress"],
|
| 561 |
-
current_task_text,
|
| 562 |
-
completed_text,
|
| 563 |
-
failed_text,
|
| 564 |
-
error_text
|
| 565 |
-
]
|
| 566 |
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
progress,
|
| 578 |
-
current_task,
|
| 579 |
-
completed_list,
|
| 580 |
-
failed_list,
|
| 581 |
-
error_text
|
| 582 |
-
]
|
| 583 |
-
)
|
| 584 |
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
outputs=[
|
| 591 |
-
status_md,
|
| 592 |
-
last_check,
|
| 593 |
-
up_to_date,
|
| 594 |
-
progress,
|
| 595 |
-
current_task,
|
| 596 |
-
completed_list,
|
| 597 |
-
failed_list,
|
| 598 |
-
error_text
|
| 599 |
-
]
|
| 600 |
-
)
|
| 601 |
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
status_md,
|
| 607 |
-
last_check,
|
| 608 |
-
up_to_date,
|
| 609 |
-
progress,
|
| 610 |
-
current_task,
|
| 611 |
-
completed_list,
|
| 612 |
-
failed_list,
|
| 613 |
-
error_text
|
| 614 |
-
]
|
| 615 |
-
)
|
| 616 |
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
while True:
|
| 620 |
-
try:
|
| 621 |
-
if not state["is_processing"]:
|
| 622 |
-
check_for_updates()
|
| 623 |
-
except Exception as e:
|
| 624 |
-
logger.error(f"Error in scheduled check: {e}")
|
| 625 |
-
# Check less frequently to avoid waking up the space too often
|
| 626 |
-
time.sleep(14400) # Check every 4 hours instead of hourly
|
| 627 |
|
| 628 |
-
#
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
scheduler_thread.start()
|
| 633 |
-
logger.info("Started background update checker")
|
| 634 |
|
| 635 |
-
|
|
|
|
|
|
|
|
|
|
| 636 |
|
| 637 |
-
# Initialize
|
| 638 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
demo
|
|
|
|
|
|
| 4 |
import time
|
| 5 |
import json
|
| 6 |
from datetime import datetime
|
| 7 |
+
from pathlib import Path
|
| 8 |
import threading
|
| 9 |
+
import traceback
|
| 10 |
+
|
| 11 |
+
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
|
| 12 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
from huggingface_hub import HfApi, commit_info, list_repo_files, hf_hub_download, login, whoami
|
| 15 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
|
|
|
| 16 |
|
| 17 |
+
# MODEL_REPO to monitor
|
| 18 |
+
SOURCE_MODEL_REPO = "Sculptor-AI/Ursa_Minor"
|
| 19 |
+
CONVERSION_SCRIPT = "./llama.cpp/convert_hf_to_gguf.py"
|
| 20 |
+
STATUS_FILE = "status.json"
|
| 21 |
+
|
| 22 |
+
# Quantization configurations in order of processing
|
|
|
|
| 23 |
QUANT_CONFIGS = [
|
| 24 |
+
{"type": "Q2_K", "size_gb": 0.8, "notes": ""},
|
| 25 |
+
{"type": "Q3_K_S", "size_gb": 0.9, "notes": ""},
|
| 26 |
+
{"type": "Q3_K_M", "size_gb": 0.9, "notes": "lower quality"},
|
| 27 |
+
{"type": "Q3_K_L", "size_gb": 1.0, "notes": ""},
|
| 28 |
+
{"type": "IQ4_XS", "size_gb": 1.0, "notes": ""},
|
| 29 |
+
{"type": "Q4_K_S", "size_gb": 1.0, "notes": "fast, recommended"},
|
| 30 |
+
{"type": "Q4_K_M", "size_gb": 1.1, "notes": "fast, recommended"},
|
| 31 |
+
{"type": "Q5_K_S", "size_gb": 1.2, "notes": ""},
|
| 32 |
+
{"type": "Q5_K_M", "size_gb": 1.2, "notes": ""},
|
| 33 |
+
{"type": "Q6_K", "size_gb": 1.4, "notes": "very good quality"},
|
| 34 |
+
{"type": "Q8_0", "size_gb": 1.7, "notes": "fast, best quality"},
|
| 35 |
+
{"type": "f16", "size_gb": 3.2, "notes": "16 bpw, overkill"}
|
| 36 |
]
|
| 37 |
|
| 38 |
+
# Global variables for process state
|
| 39 |
+
processing_lock = threading.Lock()
|
| 40 |
+
current_status = {
|
| 41 |
+
"status": "Not started",
|
| 42 |
+
"last_check": None,
|
| 43 |
+
"last_updated": None,
|
| 44 |
"last_commit_hash": None,
|
|
|
|
|
|
|
| 45 |
"current_quant": None,
|
| 46 |
+
"quant_status": {},
|
| 47 |
"progress": 0,
|
| 48 |
+
"error": None,
|
| 49 |
+
"log": []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
}
|
| 51 |
|
| 52 |
+
def escape(s: str) -> str:
|
| 53 |
+
"""Escape HTML for logging"""
|
| 54 |
+
s = s.replace("&", "&")
|
| 55 |
+
s = s.replace("<", "<")
|
| 56 |
+
s = s.replace(">", ">")
|
| 57 |
+
s = s.replace('"', """)
|
| 58 |
+
s = s.replace("\n", "<br/>")
|
| 59 |
+
return s
|
| 60 |
|
| 61 |
+
def log_message(message: str, error: bool = False):
|
| 62 |
+
"""Add message to log with timestamp"""
|
| 63 |
+
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
| 64 |
+
log_entry = f"[{timestamp}] {message}"
|
| 65 |
+
print(log_entry)
|
| 66 |
+
current_status["log"].append(log_entry)
|
| 67 |
+
if error:
|
| 68 |
+
current_status["error"] = message
|
| 69 |
+
|
| 70 |
+
# Keep log size manageable
|
| 71 |
+
if len(current_status["log"]) > 100:
|
| 72 |
+
current_status["log"] = current_status["log"][-100:]
|
| 73 |
+
|
| 74 |
+
# Save current status to file
|
| 75 |
+
save_status()
|
| 76 |
+
|
| 77 |
+
def save_status():
|
| 78 |
+
"""Save current status to file"""
|
| 79 |
+
with open(STATUS_FILE, 'w') as f:
|
| 80 |
+
json.dump(current_status, f)
|
| 81 |
+
|
| 82 |
+
def load_status():
|
| 83 |
+
"""Load status from file if it exists"""
|
| 84 |
+
global current_status
|
| 85 |
+
if os.path.exists(STATUS_FILE):
|
| 86 |
+
try:
|
| 87 |
+
with open(STATUS_FILE, 'r') as f:
|
| 88 |
+
current_status = json.load(f)
|
| 89 |
+
except Exception as e:
|
| 90 |
+
log_message(f"Error loading status file: {str(e)}", error=True)
|
| 91 |
+
|
| 92 |
+
def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
|
| 93 |
+
"""Generate importance matrix for a model"""
|
| 94 |
+
imatrix_command = [
|
| 95 |
+
"./llama.cpp/llama-imatrix",
|
| 96 |
+
"-m", model_path,
|
| 97 |
+
"-f", train_data_path,
|
| 98 |
+
"-ngl", "99",
|
| 99 |
+
"--output-frequency", "10",
|
| 100 |
+
"-o", output_path,
|
| 101 |
+
]
|
| 102 |
+
|
| 103 |
+
if not os.path.isfile(model_path):
|
| 104 |
+
raise Exception(f"Model file not found: {model_path}")
|
| 105 |
+
|
| 106 |
+
log_message(f"Running imatrix command for {model_path}...")
|
| 107 |
+
process = subprocess.Popen(imatrix_command, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
try:
|
| 110 |
+
# Monitor the process for output to provide updates
|
| 111 |
+
for line in process.stdout:
|
| 112 |
+
log_message(f"imatrix: {line.strip()}")
|
| 113 |
+
|
| 114 |
+
process.wait(timeout=3600) # 1 hour timeout
|
| 115 |
+
except subprocess.TimeoutExpired:
|
| 116 |
+
log_message("Imatrix computation timed out. Sending SIGINT to allow graceful termination...", error=True)
|
| 117 |
+
process.send_signal(signal.SIGINT)
|
| 118 |
+
try:
|
| 119 |
+
process.wait(timeout=60) # 1 minute grace period
|
| 120 |
+
except subprocess.TimeoutExpired:
|
| 121 |
+
log_message("Imatrix process still didn't terminate. Forcefully terminating process...", error=True)
|
| 122 |
+
process.kill()
|
| 123 |
+
|
| 124 |
+
stderr = process.stderr.read()
|
| 125 |
+
if stderr:
|
| 126 |
+
log_message(f"Imatrix stderr: {stderr}")
|
| 127 |
+
|
| 128 |
+
log_message("Importance matrix generation completed.")
|
| 129 |
|
| 130 |
+
def get_last_commit(repo_id: str):
|
| 131 |
+
"""Get the last commit hash of a repository"""
|
| 132 |
try:
|
| 133 |
+
info = commit_info(repo_id)
|
| 134 |
+
return info.commit_id
|
| 135 |
except Exception as e:
|
| 136 |
+
log_message(f"Error getting commit info: {str(e)}", error=True)
|
| 137 |
return None
|
| 138 |
|
| 139 |
def check_for_updates():
|
| 140 |
+
"""Check if the source model has been updated"""
|
| 141 |
+
if processing_lock.locked():
|
| 142 |
+
log_message("Already processing, skipping update check")
|
| 143 |
+
return False
|
| 144 |
|
| 145 |
+
current_status["status"] = "Checking for updates"
|
| 146 |
+
current_status["last_check"] = datetime.now().isoformat()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
try:
|
| 149 |
+
# Get the latest commit hash
|
| 150 |
+
latest_commit = get_last_commit(SOURCE_MODEL_REPO)
|
| 151 |
+
if latest_commit is None:
|
| 152 |
+
current_status["status"] = "Error checking for updates"
|
| 153 |
+
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
+
log_message(f"Latest commit hash: {latest_commit}")
|
| 156 |
+
log_message(f"Previous commit hash: {current_status.get('last_commit_hash')}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
+
if current_status.get("last_commit_hash") != latest_commit:
|
| 159 |
+
current_status["status"] = "Update detected"
|
| 160 |
+
current_status["last_commit_hash"] = latest_commit
|
| 161 |
+
save_status()
|
| 162 |
+
return True
|
| 163 |
+
else:
|
| 164 |
+
current_status["status"] = "Up to date"
|
| 165 |
+
save_status()
|
| 166 |
+
return False
|
| 167 |
except Exception as e:
|
| 168 |
+
log_message(f"Error checking for updates: {str(e)}", error=True)
|
| 169 |
+
current_status["status"] = "Error checking for updates"
|
| 170 |
+
save_status()
|
| 171 |
+
return False
|
| 172 |
|
| 173 |
+
def process_model():
    """Download the source model, convert it to GGUF, and upload every quantization.

    Runs under ``processing_lock`` so only one conversion is in flight at a time;
    a concurrent caller returns immediately.  Progress and per-quantization
    results are recorded in the global ``current_status`` and persisted via
    ``save_status()`` (always, through the ``finally`` clause).
    """
    if processing_lock.locked():
        log_message("Already processing, cannot start another process")
        return

    with processing_lock:
        try:
            # Validate authentication up front so we fail fast with a clear status.
            try:
                user_info = whoami()
                log_message(f"Processing as user: {user_info['name']}")
            except Exception as e:
                log_message(f"Authentication error: {str(e)}. Please make sure you're logged in.", error=True)
                current_status["status"] = "Authentication error"
                save_status()
                return

            api = HfApi()
            model_name = SOURCE_MODEL_REPO.split('/')[-1]
            current_status["status"] = "Processing"
            current_status["progress"] = 0
            save_status()

            # Prepare working directories (idempotent).
            os.makedirs("downloads", exist_ok=True)
            os.makedirs("outputs", exist_ok=True)

            log_message(f"Starting model processing for {SOURCE_MODEL_REPO}")

            # FIX: the original used ``with Path("outputs").resolve() as outdir:``.
            # pathlib.Path is not a context manager (the accessor-closing protocol
            # was deprecated in Python 3.9 and removed in 3.13); a plain
            # assignment is the intended behavior.
            outdir = Path("outputs").resolve()
            log_message(f"Output directory: {outdir}")

            # Download the model
            log_message(f"Downloading model from {SOURCE_MODEL_REPO}")
            try:
                local_dir = Path("downloads") / model_name
                log_message(f"Local directory: {local_dir}")

                # Decide which weight format to fetch: prefer safetensors when present.
                dl_pattern = ["*.md", "*.json", "*.model"]
                try:
                    files = list_repo_files(SOURCE_MODEL_REPO)
                    has_safetensors = any(file.endswith(".safetensors") for file in files)
                    pattern = "*.safetensors" if has_safetensors else "*.bin"
                    dl_pattern.append(pattern)
                    log_message(f"Using download pattern: {dl_pattern}")
                except Exception as e:
                    # Could not list the repo; download both formats to be safe.
                    log_message(f"Error checking repo files: {str(e)}", error=True)
                    dl_pattern.append("*.safetensors")
                    dl_pattern.append("*.bin")

                # Download the model snapshot.
                api.snapshot_download(
                    repo_id=SOURCE_MODEL_REPO,
                    local_dir=local_dir,
                    local_dir_use_symlinks=False,
                    allow_patterns=dl_pattern
                )
                log_message("Model downloaded successfully!")

                # A repo with only adapter_config.json is a LoRA adapter, which
                # the full-model conversion script cannot handle.
                config_dir = local_dir / "config.json"
                adapter_config_dir = local_dir / "adapter_config.json"
                if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
                    raise Exception('adapter_config.json is present. If you are converting a LoRA adapter to GGUF, please use a different tool.')

                # Convert to FP16 first; all quantizations are derived from this file.
                fp16_path = str(outdir / f"{model_name}.fp16.gguf")
                log_message(f"Converting model to FP16: {fp16_path}")

                result = subprocess.run([
                    "python", CONVERSION_SCRIPT, str(local_dir), "--outtype", "f16", "--outfile", fp16_path
                ], shell=False, capture_output=True, text=True)

                if result.returncode != 0:
                    raise Exception(f"Error converting to fp16: {result.stderr}")

                log_message("Model converted to fp16 successfully!")

                # Generate the importance matrix required by IQ quantizations.
                # ``imatrix_path`` is set to None when generation fails so later
                # steps can tell "missing" from "present".
                imatrix_path = str(outdir / "imatrix.dat")
                train_data_path = "llama.cpp/groups_merged.txt"  # Default calibration dataset

                if not os.path.isfile(train_data_path):
                    log_message(f"Warning: Training data file not found: {train_data_path}. Some quantizations may not work.", error=True)
                else:
                    try:
                        generate_importance_matrix(fp16_path, train_data_path, imatrix_path)
                    except Exception as e:
                        log_message(f"Error generating importance matrix: {str(e)}", error=True)
                        imatrix_path = None

                # Process each quantization type, smallest first.
                total_quants = len(QUANT_CONFIGS)
                for i, quant_config in enumerate(QUANT_CONFIGS):
                    quant_type = quant_config["type"]
                    current_status["current_quant"] = quant_type
                    current_status["progress"] = int((i / total_quants) * 100)
                    save_status()

                    log_message(f"Processing quantization {i+1}/{total_quants}: {quant_type}")

                    try:
                        # IQ quantizations require the importance matrix.
                        is_iq_quant = quant_type.startswith("IQ")

                        if is_iq_quant and (imatrix_path is None or not os.path.exists(imatrix_path)):
                            log_message(f"Skipping {quant_type} as importance matrix is not available", error=True)
                            current_status["quant_status"][quant_type] = "Skipped - No imatrix"
                            continue

                        # One output repo per quantization type.
                        username = user_info["name"]
                        repo_name = f"{model_name}-{quant_type}-GGUF"
                        repo_id = f"{username}/{repo_name}"

                        quant_file_name = f"{model_name.lower()}-{quant_type.lower()}.gguf"
                        if is_iq_quant and quant_type != "f16":
                            quant_file_name = f"{model_name.lower()}-{quant_type.lower()}-imat.gguf"

                        quant_file_path = str(outdir / quant_file_name)

                        # Run quantization (list argv, shell=False: no shell injection).
                        if is_iq_quant and quant_type != "f16":
                            quantize_cmd = [
                                "./llama.cpp/llama-quantize",
                                "--imatrix", imatrix_path, fp16_path, quant_file_path, quant_type
                            ]
                        else:
                            quantize_cmd = [
                                "./llama.cpp/llama-quantize",
                                fp16_path, quant_file_path, quant_type
                            ]

                        log_message(f"Running quantization command: {' '.join(quantize_cmd)}")
                        result = subprocess.run(quantize_cmd, shell=False, capture_output=True, text=True)

                        if result.returncode != 0:
                            if "out of memory" in result.stderr.lower():
                                log_message(f"Out of memory error quantizing {quant_type}. Skipping larger models.", error=True)
                                current_status["quant_status"][quant_type] = "Failed - Out of memory"
                                # Configs are ordered smallest-to-largest, so every
                                # remaining quantization would also OOM.
                                break
                            else:
                                raise Exception(f"Error quantizing {quant_type}: {result.stderr}")

                        log_message(f"Quantized successfully with {quant_type}!")

                        # Create the repo if it doesn't exist
                        log_message(f"Creating/updating repo {repo_id}")
                        try:
                            repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)
                            log_message(f"Repo URL: {repo_url}")
                        except Exception as e:
                            log_message(f"Error creating repo: {str(e)}", error=True)
                            current_status["quant_status"][quant_type] = "Failed - Repo creation error"
                            continue

                        # Create README with model info
                        log_message("Creating README")
                        readme_content = f"""# {repo_name}
This model was converted to GGUF format from [`{SOURCE_MODEL_REPO}`](https://huggingface.co/{SOURCE_MODEL_REPO}) using llama.cpp.

## Quantization: {quant_type}
Approximate size: {quant_config['size_gb']} GB
Notes: {quant_config['notes']}

## Use with llama.cpp
Install llama.cpp through brew (works on Mac and Linux)

```bash
brew install llama.cpp
```

Invoke the llama.cpp server or the CLI.

### CLI:
```bash
llama-cli --hf-repo {repo_id} --hf-file {quant_file_name} -p "The meaning to life and the universe is"
```

### Server:
```bash
llama-server --hf-repo {repo_id} --hf-file {quant_file_name} -c 2048
```

Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.

Step 1: Clone llama.cpp from GitHub.
```
git clone https://github.com/ggerganov/llama.cpp
```

Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
```
cd llama.cpp && LLAMA_CURL=1 make
```

Step 3: Run inference through the main binary.
```
./llama-cli --hf-repo {repo_id} --hf-file {quant_file_name} -p "The meaning to life and the universe is"
```
or
```
./llama-server --hf-repo {repo_id} --hf-file {quant_file_name} -c 2048
```

## Auto-generated
This model version was automatically generated when updates were detected in the source repository.
Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
"""
                        readme_path = outdir / "README.md"
                        with open(readme_path, 'w') as f:
                            f.write(readme_content)

                        # Upload the quantized model and README
                        log_message(f"Uploading quantized model: {quant_file_path}")
                        try:
                            api.upload_file(
                                path_or_fileobj=quant_file_path,
                                path_in_repo=quant_file_name,
                                repo_id=repo_id,
                            )

                            api.upload_file(
                                path_or_fileobj=str(readme_path),
                                path_in_repo="README.md",
                                repo_id=repo_id,
                            )

                            # FIX: the original tested ``os.path.isfile(imatrix_path)``
                            # first, which raises TypeError when imatrix generation
                            # failed and left ``imatrix_path`` as None.  Guard on the
                            # cheap checks before touching the filesystem.
                            if is_iq_quant and imatrix_path and os.path.isfile(imatrix_path):
                                log_message(f"Uploading imatrix.dat")
                                api.upload_file(
                                    path_or_fileobj=imatrix_path,
                                    path_in_repo="imatrix.dat",
                                    repo_id=repo_id,
                                )

                            log_message(f"Successfully uploaded {quant_type} quantization!")
                            current_status["quant_status"][quant_type] = "Success"
                        except Exception as e:
                            log_message(f"Error uploading files: {str(e)}", error=True)
                            current_status["quant_status"][quant_type] = f"Failed - Upload error: {str(e)}"

                    except Exception as e:
                        log_message(f"Error processing {quant_type}: {str(e)}", error=True)
                        current_status["quant_status"][quant_type] = f"Failed: {str(e)}"
                        # Continue with the next quantization

                # Update status after completion
                current_status["status"] = "Completed"
                current_status["progress"] = 100
                current_status["last_updated"] = datetime.now().isoformat()
                log_message("Model processing completed!")

            except Exception as e:
                log_message(f"Error during model processing: {str(e)}", error=True)
                current_status["status"] = "Error"
                current_status["error"] = str(e)
                traceback.print_exc()

        except Exception as e:
            log_message(f"Error: {str(e)}", error=True)
            current_status["status"] = "Error"
            current_status["error"] = str(e)
            traceback.print_exc()

        finally:
            # Persist the final state no matter how we exited.
            save_status()
| 449 |
+
def check_and_process():
    """Poll the source repo once and launch quantization in the background when it changed."""
    log_message("Running scheduled check for updates")
    if not check_for_updates():
        log_message("No updates detected")
        return
    log_message("Updates detected, starting processing")
    # Run the (long) conversion off the scheduler/UI thread.
    threading.Thread(target=process_model).start()
+
def create_ui():
    """Create the Gradio interface: status panel, manual-trigger buttons, quant table and log viewer."""
    with gr.Blocks(css="body { margin: 0; padding: 0; }") as demo:
        gr.Markdown("# 🦙 Automatic GGUF Quantization for Ursa_Minor")
        gr.Markdown(f"This space automatically creates quantized GGUF versions of the [Sculptor-AI/Ursa_Minor](https://huggingface.co/{SOURCE_MODEL_REPO}) model whenever it's updated.")

        with gr.Row():
            with gr.Column(scale=2):
                # Rendered HTML status card; refreshed by update_status().
                status_info = gr.HTML(label="Status", value="<p>Loading status...</p>")

            with gr.Column(scale=1):
                with gr.Row():
                    check_button = gr.Button("Check for Updates", variant="primary")
                    process_button = gr.Button("Force Processing", variant="secondary")

        # NOTE(review): gr.Progress is normally used as a function-call tracker,
        # not placed as a standalone component — confirm this renders as intended.
        progress_bar = gr.Progress(label="Progress")

        with gr.Tab("Quantization Status"):
            # Callable value so the table re-reads current_status on refresh.
            quant_status = gr.DataFrame(
                headers=["Type", "Size (GB)", "Notes", "Status"],
                value=lambda: [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS],
                label="Quantization Status"
            )

        with gr.Tab("Logs"):
            logs = gr.HTML(label="Logs", value="<p>Loading logs...</p>")

        def update_status():
            """Render current_status as an HTML status card (color-coded by state)."""
            status_html = f"""
            <div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px;">
                <h3>Current Status: <span style="color: {'green' if current_status['status'] == 'Up to date' else 'blue' if current_status['status'] == 'Processing' else 'red' if 'Error' in current_status['status'] else 'orange'}">{current_status['status']}</span></h3>
                <p><strong>Last Checked:</strong> {current_status.get('last_check', 'Never').replace('T', ' ').split('.')[0] if current_status.get('last_check') else 'Never'}</p>
                <p><strong>Last Updated:</strong> {current_status.get('last_updated', 'Never').replace('T', ' ').split('.')[0] if current_status.get('last_updated') else 'Never'}</p>
                <p><strong>Current Quantization:</strong> {current_status.get('current_quant', 'None')}</p>
                {f'<p style="color: red;"><strong>Error:</strong> {current_status["error"]}</p>' if current_status.get('error') else ''}
            </div>
            """
            return status_html

        def update_logs():
            """Render the in-memory log list as scrollable HTML; error lines in red."""
            logs_html = "<div style='height: 400px; overflow-y: auto; background-color: #f9f9f9; padding: 10px; font-family: monospace; white-space: pre-wrap;'>"
            for log in current_status["log"]:
                if "Error" in log or "error" in log:
                    logs_html += f"<div style='color: red;'>{log}</div>"
                else:
                    logs_html += f"<div>{log}</div>"
            logs_html += "</div>"
            return logs_html

        def on_check_button():
            """Check for updates; start processing in the background if one is found, then refresh all panels."""
            if check_for_updates():
                threading.Thread(target=process_model).start()
            return update_status(), [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS], update_logs()

        def on_process_button():
            """Unconditionally start processing in the background, then refresh all panels."""
            threading.Thread(target=process_model).start()
            return update_status(), [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS], update_logs()

        check_button.click(on_check_button, outputs=[status_info, quant_status, logs])
        process_button.click(on_process_button, outputs=[status_info, quant_status, logs])

        # Populate all three panels once on page load.
        demo.load(update_status, outputs=[status_info])
        demo.load(lambda: [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS], outputs=[quant_status])
        demo.load(update_logs, outputs=[logs])

        # Client-side auto-refresh: clicks any hidden refresh buttons every few seconds.
        # NOTE(review): relies on Gradio generating elements with id*=Refresh-Button —
        # fragile across Gradio versions; verify it still fires.
        refresh_interval = 5  # seconds
        gr.HTML("<script>setInterval(function(){ Array.from(document.querySelectorAll('button[id*=Refresh-Button]')).forEach(b => b.click()); }, " + str(refresh_interval * 1000) + ");</script>")

    return demo
+
# Initialize
|
| 534 |
+
def initialize():
    """Restore persisted state, arm the hourly update check, and run one check right away."""
    load_status()

    # Hourly background poll of the source repository.
    job_scheduler = BackgroundScheduler()
    job_scheduler.add_job(check_and_process, 'interval', minutes=60)
    job_scheduler.start()

    # Kick off an immediate check without blocking startup.
    threading.Thread(target=check_and_process).start()
| 547 |
+
if __name__ == "__main__":
    initialize()
    demo = create_ui()
    # FIX: ``queue(concurrency_count=1)`` is Gradio 3.x API; Gradio 4.x renamed
    # the kwarg to ``default_concurrency_limit`` and raises TypeError on the old
    # name.  requirements.txt only pins ``gradio>=3.50.2``, so support both.
    try:
        demo.queue(concurrency_count=1)
    except TypeError:
        demo.queue(default_concurrency_limit=1)
    demo.launch()
|
groups_merged.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
gradio>=
|
| 2 |
-
huggingface_hub>=0.
|
| 3 |
-
|
| 4 |
-
numpy>=1.24.0
|
|
|
|
| 1 |
+
gradio>=3.50.2
|
| 2 |
+
huggingface_hub>=0.17.1
|
| 3 |
+
apscheduler>=3.10.1
|
|
|
setup.sh
DELETED
|
@@ -1,48 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
|
| 2 |
-
set -e
|
| 3 |
-
|
| 4 |
-
echo "Setting up for real GGUF quantization..."
|
| 5 |
-
|
| 6 |
-
# Clone llama.cpp
|
| 7 |
-
if [ ! -d "llama.cpp" ]; then
|
| 8 |
-
echo "Cloning llama.cpp repository..."
|
| 9 |
-
git clone --depth=1 https://github.com/ggerganov/llama.cpp
|
| 10 |
-
fi
|
| 11 |
-
|
| 12 |
-
cd llama.cpp
|
| 13 |
-
|
| 14 |
-
# Get conversion script
|
| 15 |
-
echo "Setting up conversion script..."
|
| 16 |
-
if [ -f "convert.py" ]; then
|
| 17 |
-
echo "Found existing convert.py script"
|
| 18 |
-
elif [ -f "convert-hf-to-gguf.py" ]; then
|
| 19 |
-
echo "Found convert-hf-to-gguf.py"
|
| 20 |
-
cp convert-hf-to-gguf.py convert.py
|
| 21 |
-
elif [ -f "examples/convert-hf-to-gguf.py" ]; then
|
| 22 |
-
echo "Found examples/convert-hf-to-gguf.py"
|
| 23 |
-
cp examples/convert-hf-to-gguf.py convert.py
|
| 24 |
-
else
|
| 25 |
-
echo "Cannot find conversion script. Using Python alternative."
|
| 26 |
-
# Install required packages
|
| 27 |
-
pip install -q transformers torch
|
| 28 |
-
fi
|
| 29 |
-
|
| 30 |
-
# Install required packages for the conversion script
|
| 31 |
-
pip install -q transformers torch
|
| 32 |
-
|
| 33 |
-
# Initialize state file
|
| 34 |
-
cd ..
|
| 35 |
-
if [ ! -f "state.json" ]; then
|
| 36 |
-
echo "Initializing state file..."
|
| 37 |
-
echo '{"last_checked": null, "last_commit_hash": null, "is_up_to_date": true, "is_processing": false, "current_quant": null, "progress": 0, "total_quants": 12, "completed_quants": [], "failed_quants": [], "out_of_memory": false, "last_error": null, "status_message": "Ready to check for updates"}' > state.json
|
| 38 |
-
fi
|
| 39 |
-
|
| 40 |
-
# Create necessary directories
|
| 41 |
-
echo "Creating directories..."
|
| 42 |
-
mkdir -p model_cache
|
| 43 |
-
mkdir -p temp_outputs
|
| 44 |
-
|
| 45 |
-
echo "Setup completed successfully"
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
start.sh
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# Clone llama.cpp if not exists
|
| 4 |
+
if [ ! -d "llama.cpp" ]; then
|
| 5 |
+
echo "Cloning llama.cpp repository..."
|
| 6 |
+
git clone https://github.com/ggerganov/llama.cpp
|
| 7 |
+
fi
|
| 8 |
+
|
| 9 |
+
# Copy calibration data if not exists
|
| 10 |
+
if [ ! -f "llama.cpp/groups_merged.txt" ]; then
|
| 11 |
+
echo "Copying calibration data..."
|
| 12 |
+
cp groups_merged.txt llama.cpp/groups_merged.txt
|
| 13 |
+
fi
|
| 14 |
+
|
| 15 |
+
# Disable CUDA for HF spaces (not supported in free tier)
|
| 16 |
+
# We should still build with optimizations for CPU
|
| 17 |
+
export GGML_CUDA=OFF
|
| 18 |
+
export GGML_AVX=1
|
| 19 |
+
export GGML_AVX2=1
|
| 20 |
+
|
| 21 |
+
cd llama.cpp
|
| 22 |
+
echo "Building llama.cpp tools..."
|
| 23 |
+
cmake -B build -DBUILD_SHARED_LIBS=OFF
|
| 24 |
+
cmake --build build --config Release -j --target llama-quantize llama-gguf-split llama-imatrix
|
| 25 |
+
echo "Copying built binaries..."
|
| 26 |
+
cp ./build/bin/llama-* ./ 2>/dev/null || cp ./build/llama-* ./ 2>/dev/null
|
| 27 |
+
rm -rf build
|
| 28 |
+
|
| 29 |
+
cd ..
|
| 30 |
+
echo "Starting Gradio app..."
|
| 31 |
+
python app.py
|