Spaces:
Running
Running
fix
Browse files- Dockerfile +24 -0
- app.py +474 -566
- groups_merged.txt +0 -0
- requirements.txt +3 -4
- setup.sh +0 -48
- start.sh +31 -0
Dockerfile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y \
|
| 7 |
+
build-essential \
|
| 8 |
+
cmake \
|
| 9 |
+
git \
|
| 10 |
+
wget \
|
| 11 |
+
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
| 12 |
+
|
| 13 |
+
# Copy requirements and install Python dependencies
|
| 14 |
+
COPY requirements.txt .
|
| 15 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
+
|
| 17 |
+
# Copy the rest of the code
|
| 18 |
+
COPY . .
|
| 19 |
+
|
| 20 |
+
# Make start script executable
|
| 21 |
+
RUN chmod +x start.sh
|
| 22 |
+
|
| 23 |
+
# Run the start script
|
| 24 |
+
CMD ["./start.sh"]
|
app.py
CHANGED
|
@@ -4,639 +4,547 @@ import signal
|
|
| 4 |
import time
|
| 5 |
import json
|
| 6 |
from datetime import datetime
|
|
|
|
| 7 |
import threading
|
| 8 |
-
import
|
|
|
|
|
|
|
| 9 |
import gradio as gr
|
| 10 |
-
from huggingface_hub import HfApi, login, whoami
|
| 11 |
-
from pathlib import Path
|
| 12 |
-
import shutil
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
# Restored full quantization set, sorted from smallest to largest
|
| 25 |
QUANT_CONFIGS = [
|
| 26 |
-
{"
|
| 27 |
-
{"
|
| 28 |
-
{"
|
| 29 |
-
{"
|
| 30 |
-
{"
|
| 31 |
-
{"
|
| 32 |
-
{"
|
| 33 |
-
{"
|
| 34 |
-
{"
|
| 35 |
-
{"
|
| 36 |
-
{"
|
| 37 |
-
{"
|
| 38 |
]
|
| 39 |
|
| 40 |
-
#
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
| 43 |
"last_commit_hash": None,
|
| 44 |
-
"is_up_to_date": True,
|
| 45 |
-
"is_processing": False,
|
| 46 |
"current_quant": None,
|
|
|
|
| 47 |
"progress": 0,
|
| 48 |
-
"
|
| 49 |
-
"
|
| 50 |
-
"failed_quants": [],
|
| 51 |
-
"out_of_memory": False,
|
| 52 |
-
"last_error": None,
|
| 53 |
-
"status_message": "Ready to check for updates"
|
| 54 |
}
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
-
# Helper functions
|
| 70 |
-
def save_state():
|
| 71 |
-
with open("state.json", "w") as f:
|
| 72 |
-
# Create a serializable copy of the state
|
| 73 |
-
serializable_state = state.copy()
|
| 74 |
-
serializable_state["last_checked"] = str(serializable_state["last_checked"]) if serializable_state["last_checked"] else None
|
| 75 |
-
json.dump(serializable_state, f)
|
| 76 |
-
|
| 77 |
-
def load_state():
|
| 78 |
-
global state
|
| 79 |
try:
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
def
|
|
|
|
| 91 |
try:
|
| 92 |
-
|
| 93 |
-
return
|
| 94 |
except Exception as e:
|
| 95 |
-
|
| 96 |
return None
|
| 97 |
|
| 98 |
def check_for_updates():
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
state["last_commit_hash"] = latest_commit
|
| 107 |
-
state["is_up_to_date"] = False
|
| 108 |
-
state["status_message"] = f"Updates detected in {SOURCE_REPO}. Ready to generate quantizations."
|
| 109 |
-
else:
|
| 110 |
-
state["is_up_to_date"] = True
|
| 111 |
-
state["status_message"] = f"No updates detected in {SOURCE_REPO}. Last checked: {state['last_checked'].strftime('%Y-%m-%d %H:%M:%S')}"
|
| 112 |
|
| 113 |
-
save_state()
|
| 114 |
-
return state["status_message"]
|
| 115 |
-
|
| 116 |
-
def download_model():
|
| 117 |
try:
|
| 118 |
-
#
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
shutil.rmtree(os.path.join(MODEL_CACHE_DIR, os.path.basename(SOURCE_REPO)))
|
| 124 |
-
|
| 125 |
-
# Get model repo information to find the smallest safetensors file
|
| 126 |
-
logger.info(f"Getting repository information for {SOURCE_REPO}")
|
| 127 |
-
files = hf_api.list_repo_files(repo_id=SOURCE_REPO)
|
| 128 |
-
|
| 129 |
-
# Filter for safetensors files (which are the model weights)
|
| 130 |
-
safetensors_files = [f for f in files if f.endswith(".safetensors")]
|
| 131 |
-
|
| 132 |
-
if not safetensors_files:
|
| 133 |
-
raise Exception(f"No safetensors files found in {SOURCE_REPO}")
|
| 134 |
-
|
| 135 |
-
# Download only required files instead of the entire repo to save space
|
| 136 |
-
# This includes model config and one weights file
|
| 137 |
-
required_files = [
|
| 138 |
-
"config.json",
|
| 139 |
-
"tokenizer.json",
|
| 140 |
-
"tokenizer_config.json",
|
| 141 |
-
safetensors_files[0] # Just take the first weights file
|
| 142 |
-
]
|
| 143 |
-
|
| 144 |
-
# Create the model directory
|
| 145 |
-
model_dir = os.path.join(MODEL_CACHE_DIR, os.path.basename(SOURCE_REPO))
|
| 146 |
-
os.makedirs(model_dir, exist_ok=True)
|
| 147 |
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
if file in files:
|
| 151 |
-
logger.info(f"Downloading {file}")
|
| 152 |
-
hf_api.hf_hub_download(
|
| 153 |
-
repo_id=SOURCE_REPO,
|
| 154 |
-
filename=file,
|
| 155 |
-
local_dir=model_dir,
|
| 156 |
-
token=HF_TOKEN
|
| 157 |
-
)
|
| 158 |
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
except Exception as e:
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
|
|
| 164 |
|
| 165 |
-
def
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
return
|
| 170 |
-
|
| 171 |
-
state["is_processing"] = True
|
| 172 |
-
state["progress"] = 0
|
| 173 |
-
state["completed_quants"] = []
|
| 174 |
-
state["failed_quants"] = []
|
| 175 |
-
state["out_of_memory"] = False
|
| 176 |
-
state["last_error"] = None
|
| 177 |
-
state["status_message"] = "Starting quantization process..."
|
| 178 |
-
|
| 179 |
-
# Start the processing in a separate thread
|
| 180 |
-
thread = threading.Thread(target=quantization_worker)
|
| 181 |
-
thread.daemon = True
|
| 182 |
-
thread.start()
|
| 183 |
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
return
|
| 196 |
-
|
| 197 |
-
# Create temporary output directory
|
| 198 |
-
os.makedirs(TEMP_DIR, exist_ok=True)
|
| 199 |
-
|
| 200 |
-
# Get model name from the source repo
|
| 201 |
-
model_name = os.path.basename(SOURCE_REPO).lower()
|
| 202 |
-
|
| 203 |
-
# Process each quantization configuration - we'll do one at a time to save memory
|
| 204 |
-
total_quants = len(QUANT_CONFIGS)
|
| 205 |
-
|
| 206 |
-
for i, quant_config in enumerate(QUANT_CONFIGS):
|
| 207 |
-
if state["out_of_memory"]:
|
| 208 |
-
# Skip further processing if we've hit memory limits
|
| 209 |
-
break
|
| 210 |
-
|
| 211 |
-
quant_name = quant_config["name"]
|
| 212 |
-
state["current_quant"] = quant_name
|
| 213 |
-
state["progress"] = (i / total_quants) * 100
|
| 214 |
-
state["status_message"] = f"Processing {quant_name} quantization ({i+1}/{total_quants})"
|
| 215 |
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
|
| 230 |
-
#
|
|
|
|
| 231 |
try:
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
logger.info(f"Available disk space: {free_space_gb:.2f} GB")
|
| 235 |
|
| 236 |
-
#
|
| 237 |
-
|
| 238 |
-
logger.warning(f"Not enough disk space for {quant_name} quantization. Need {quant_config['size_gb'] * 1.5:.2f} GB, have {free_space_gb:.2f} GB")
|
| 239 |
-
state["failed_quants"].append(f"{quant_name} (disk space)")
|
| 240 |
-
continue
|
| 241 |
-
except Exception as e:
|
| 242 |
-
logger.warning(f"Could not check disk space: {e}")
|
| 243 |
-
|
| 244 |
-
# Run the conversion+quantization in one step to save memory
|
| 245 |
-
# We'll use direct conversion to the target quantization format
|
| 246 |
-
logger.info(f"Converting and quantizing directly to {quant_name}")
|
| 247 |
-
|
| 248 |
-
# Command to convert and quantize in one step
|
| 249 |
-
quantize_cmd = [
|
| 250 |
-
"python",
|
| 251 |
-
"./llama.cpp/convert.py",
|
| 252 |
-
model_path,
|
| 253 |
-
"--outfile", quant_output_path,
|
| 254 |
-
"--outtype", quant_name.lower()
|
| 255 |
-
]
|
| 256 |
-
|
| 257 |
-
# Create a process for monitoring memory usage
|
| 258 |
-
quantize_process = subprocess.Popen(
|
| 259 |
-
quantize_cmd,
|
| 260 |
-
shell=False,
|
| 261 |
-
stdout=subprocess.PIPE,
|
| 262 |
-
stderr=subprocess.PIPE,
|
| 263 |
-
text=True
|
| 264 |
-
)
|
| 265 |
-
|
| 266 |
-
# Poll the process and monitor system resources
|
| 267 |
-
while quantize_process.poll() is None:
|
| 268 |
-
# Check if we're getting low on memory
|
| 269 |
try:
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
for line in meminfo.split('\n'):
|
| 276 |
-
if 'MemAvailable:' in line:
|
| 277 |
-
available_mem = int(line.split()[1]) / 1024 # Convert to MB
|
| 278 |
-
break
|
| 279 |
-
|
| 280 |
-
# If memory is critically low (less than 500MB), kill the process
|
| 281 |
-
if available_mem < 500:
|
| 282 |
-
logger.warning(f"Memory critically low ({available_mem:.2f} MB). Terminating quantization.")
|
| 283 |
-
quantize_process.terminate()
|
| 284 |
-
state["out_of_memory"] = True
|
| 285 |
-
state["failed_quants"].append(f"{quant_name} (OOM)")
|
| 286 |
-
break
|
| 287 |
except Exception as e:
|
| 288 |
-
|
|
|
|
|
|
|
| 289 |
|
| 290 |
-
#
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
quantize_process.wait(timeout=10)
|
| 299 |
-
except subprocess.TimeoutExpired:
|
| 300 |
-
quantize_process.kill()
|
| 301 |
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
else:
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
|
| 340 |
-
|
|
|
|
|
|
|
| 341 |
|
| 342 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
|
| 344 |
-
|
| 345 |
-
- **Approximate Size:** {quant_config['size_gb']} GB
|
| 346 |
-
- **Notes:** {quant_config['notes']}
|
| 347 |
-
- **Original Model:** [Sculptor-AI/Ursa_Minor](https://huggingface.co/{SOURCE_REPO})
|
| 348 |
-
- **Auto-generated by:** GGUF Quantizer Space
|
| 349 |
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
| 351 |
|
|
|
|
| 352 |
```bash
|
| 353 |
-
|
| 354 |
-
|
|
|
|
|
|
|
| 355 |
|
| 356 |
-
|
| 357 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
"""
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
|
|
|
|
|
|
|
|
|
| 390 |
except Exception as e:
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
upload_thread = threading.Thread(target=upload_file_with_timeout)
|
| 396 |
-
upload_thread.daemon = True
|
| 397 |
-
upload_thread.start()
|
| 398 |
|
| 399 |
-
#
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
time.sleep(10)
|
| 405 |
-
|
| 406 |
-
if upload_success[0]:
|
| 407 |
-
state["completed_quants"].append(quant_name)
|
| 408 |
-
logger.info(f"Successfully processed {quant_name} quantization")
|
| 409 |
-
else:
|
| 410 |
-
error_msg = str(upload_error[0]) if upload_error[0] else "Upload timed out"
|
| 411 |
-
logger.error(f"Failed to upload quantized model: {error_msg}")
|
| 412 |
-
state["failed_quants"].append(f"{quant_name} (upload failed)")
|
| 413 |
-
state["last_error"] = error_msg
|
| 414 |
-
except Exception as upload_error:
|
| 415 |
-
logger.error(f"Failed to upload quantized model: {upload_error}")
|
| 416 |
-
state["failed_quants"].append(f"{quant_name} (upload failed)")
|
| 417 |
-
state["last_error"] = str(upload_error)
|
| 418 |
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
except subprocess.TimeoutExpired as timeout_error:
|
| 426 |
-
logger.error(f"Timeout during {quant_name} quantization: {timeout_error}")
|
| 427 |
-
state["failed_quants"].append(f"{quant_name} (timeout)")
|
| 428 |
-
state["last_error"] = f"Quantization timed out after 30 minutes"
|
| 429 |
-
except Exception as e:
|
| 430 |
-
logger.error(f"Error processing {quant_name} quantization: {e}")
|
| 431 |
-
state["failed_quants"].append(quant_name)
|
| 432 |
-
state["last_error"] = str(e)
|
| 433 |
-
|
| 434 |
-
# Final cleanup
|
| 435 |
-
try:
|
| 436 |
-
shutil.rmtree(TEMP_DIR)
|
| 437 |
-
except Exception as e:
|
| 438 |
-
logger.warning(f"Error cleaning up temporary files: {e}")
|
| 439 |
-
|
| 440 |
-
# Clean up model cache to save space
|
| 441 |
-
try:
|
| 442 |
-
shutil.rmtree(MODEL_CACHE_DIR)
|
| 443 |
except Exception as e:
|
| 444 |
-
|
|
|
|
|
|
|
|
|
|
| 445 |
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
state["is_processing"] = False
|
| 449 |
-
|
| 450 |
-
if state["out_of_memory"]:
|
| 451 |
-
last_successful = state["completed_quants"][-1] if state["completed_quants"] else "None"
|
| 452 |
-
state["status_message"] = f"Quantization process stopped due to memory limitations after {last_successful}. Smaller quantizations completed successfully."
|
| 453 |
-
elif state["failed_quants"]:
|
| 454 |
-
state["status_message"] = f"Quantization process completed with some failures. {len(state['completed_quants'])}/{total_quants} quantizations were successful."
|
| 455 |
-
else:
|
| 456 |
-
state["status_message"] = f"Quantization process completed successfully. All {len(state['completed_quants'])}/{total_quants} quantizations were created."
|
| 457 |
-
|
| 458 |
-
except Exception as e:
|
| 459 |
-
logger.error(f"Error in quantization worker: {e}")
|
| 460 |
-
state["is_processing"] = False
|
| 461 |
-
state["last_error"] = str(e)
|
| 462 |
-
state["status_message"] = f"Error during quantization process: {str(e)}"
|
| 463 |
-
|
| 464 |
-
save_state()
|
| 465 |
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
|
| 472 |
with gr.Row():
|
| 473 |
with gr.Column(scale=2):
|
| 474 |
-
|
| 475 |
|
|
|
|
| 476 |
with gr.Row():
|
| 477 |
check_button = gr.Button("Check for Updates", variant="primary")
|
| 478 |
-
process_button = gr.Button("
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
value=state["progress"],
|
| 492 |
-
label="Progress",
|
| 493 |
-
interactive=False
|
| 494 |
-
)
|
| 495 |
-
|
| 496 |
-
current_task = gr.Markdown(value="")
|
| 497 |
-
|
| 498 |
-
with gr.Row():
|
| 499 |
-
completed_md = gr.Markdown(value="### Completed Quantizations")
|
| 500 |
-
completed_list = gr.Markdown(value="None")
|
| 501 |
-
|
| 502 |
-
with gr.Row():
|
| 503 |
-
failed_md = gr.Markdown(value="### Failed Quantizations")
|
| 504 |
-
failed_list = gr.Markdown(value="None")
|
| 505 |
-
|
| 506 |
-
with gr.Row():
|
| 507 |
-
error_md = gr.Markdown(value="### Last Error")
|
| 508 |
-
error_text = gr.Markdown(value="None")
|
| 509 |
-
|
| 510 |
-
with gr.Column(scale=1):
|
| 511 |
-
gr.Markdown("### Quantization Types")
|
| 512 |
-
quant_table = gr.DataFrame(
|
| 513 |
-
value=[[q["name"], f"{q['size_gb']} GB", q["notes"]] for q in QUANT_CONFIGS],
|
| 514 |
-
headers=["Type", "Size", "Notes"],
|
| 515 |
-
interactive=False
|
| 516 |
-
)
|
| 517 |
|
| 518 |
-
# Functions to update the UI
|
| 519 |
def update_status():
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
if state["completed_quants"]:
|
| 532 |
-
completed_items = []
|
| 533 |
-
for q in state["completed_quants"]:
|
| 534 |
-
model_name = os.path.basename(SOURCE_REPO).lower()
|
| 535 |
-
username = hf_api.whoami()["name"]
|
| 536 |
-
repo_id = f"{username}/{model_name}-{q.lower()}-gguf"
|
| 537 |
-
completed_items.append(f"- [{q}](https://huggingface.co/{repo_id})")
|
| 538 |
-
completed_text = "\n".join(completed_items)
|
| 539 |
-
|
| 540 |
-
failed_text = "None"
|
| 541 |
-
if state["failed_quants"]:
|
| 542 |
-
failed_items = []
|
| 543 |
-
for q in state["failed_quants"]:
|
| 544 |
-
if "(" in q: # Check if it has a reason in parentheses
|
| 545 |
-
name, reason = q.split(" (", 1)
|
| 546 |
-
reason = reason.rstrip(")")
|
| 547 |
-
failed_items.append(f"- {name} (Reason: {reason})")
|
| 548 |
-
else:
|
| 549 |
-
failed_items.append(f"- {q}")
|
| 550 |
-
failed_text = "\n".join(failed_items)
|
| 551 |
-
|
| 552 |
-
error_text = "None"
|
| 553 |
-
if state["last_error"]:
|
| 554 |
-
error_text = f"```\n{state['last_error']}\n```"
|
| 555 |
-
|
| 556 |
-
return [
|
| 557 |
-
status_text,
|
| 558 |
-
last_check_text,
|
| 559 |
-
up_to_date_text,
|
| 560 |
-
state["progress"],
|
| 561 |
-
current_task_text,
|
| 562 |
-
completed_text,
|
| 563 |
-
failed_text,
|
| 564 |
-
error_text
|
| 565 |
-
]
|
| 566 |
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
progress,
|
| 578 |
-
current_task,
|
| 579 |
-
completed_list,
|
| 580 |
-
failed_list,
|
| 581 |
-
error_text
|
| 582 |
-
]
|
| 583 |
-
)
|
| 584 |
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
outputs=[
|
| 591 |
-
status_md,
|
| 592 |
-
last_check,
|
| 593 |
-
up_to_date,
|
| 594 |
-
progress,
|
| 595 |
-
current_task,
|
| 596 |
-
completed_list,
|
| 597 |
-
failed_list,
|
| 598 |
-
error_text
|
| 599 |
-
]
|
| 600 |
-
)
|
| 601 |
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
status_md,
|
| 607 |
-
last_check,
|
| 608 |
-
up_to_date,
|
| 609 |
-
progress,
|
| 610 |
-
current_task,
|
| 611 |
-
completed_list,
|
| 612 |
-
failed_list,
|
| 613 |
-
error_text
|
| 614 |
-
]
|
| 615 |
-
)
|
| 616 |
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
while True:
|
| 620 |
-
try:
|
| 621 |
-
if not state["is_processing"]:
|
| 622 |
-
check_for_updates()
|
| 623 |
-
except Exception as e:
|
| 624 |
-
logger.error(f"Error in scheduled check: {e}")
|
| 625 |
-
# Check less frequently to avoid waking up the space too often
|
| 626 |
-
time.sleep(14400) # Check every 4 hours instead of hourly
|
| 627 |
|
| 628 |
-
#
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
scheduler_thread.start()
|
| 633 |
-
logger.info("Started background update checker")
|
| 634 |
|
| 635 |
-
|
|
|
|
|
|
|
|
|
|
| 636 |
|
| 637 |
-
# Initialize
|
| 638 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
demo
|
|
|
|
|
|
| 4 |
import time
|
| 5 |
import json
|
| 6 |
from datetime import datetime
|
| 7 |
+
from pathlib import Path
|
| 8 |
import threading
|
| 9 |
+
import traceback
|
| 10 |
+
|
| 11 |
+
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
|
| 12 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
from huggingface_hub import HfApi, commit_info, list_repo_files, hf_hub_download, login, whoami
|
| 15 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
|
|
|
| 16 |
|
| 17 |
+
# MODEL_REPO to monitor
|
| 18 |
+
SOURCE_MODEL_REPO = "Sculptor-AI/Ursa_Minor"
|
| 19 |
+
CONVERSION_SCRIPT = "./llama.cpp/convert_hf_to_gguf.py"
|
| 20 |
+
STATUS_FILE = "status.json"
|
| 21 |
+
|
| 22 |
+
# Quantization configurations in order of processing
|
|
|
|
| 23 |
QUANT_CONFIGS = [
|
| 24 |
+
{"type": "Q2_K", "size_gb": 0.8, "notes": ""},
|
| 25 |
+
{"type": "Q3_K_S", "size_gb": 0.9, "notes": ""},
|
| 26 |
+
{"type": "Q3_K_M", "size_gb": 0.9, "notes": "lower quality"},
|
| 27 |
+
{"type": "Q3_K_L", "size_gb": 1.0, "notes": ""},
|
| 28 |
+
{"type": "IQ4_XS", "size_gb": 1.0, "notes": ""},
|
| 29 |
+
{"type": "Q4_K_S", "size_gb": 1.0, "notes": "fast, recommended"},
|
| 30 |
+
{"type": "Q4_K_M", "size_gb": 1.1, "notes": "fast, recommended"},
|
| 31 |
+
{"type": "Q5_K_S", "size_gb": 1.2, "notes": ""},
|
| 32 |
+
{"type": "Q5_K_M", "size_gb": 1.2, "notes": ""},
|
| 33 |
+
{"type": "Q6_K", "size_gb": 1.4, "notes": "very good quality"},
|
| 34 |
+
{"type": "Q8_0", "size_gb": 1.7, "notes": "fast, best quality"},
|
| 35 |
+
{"type": "f16", "size_gb": 3.2, "notes": "16 bpw, overkill"}
|
| 36 |
]
|
| 37 |
|
| 38 |
+
# Global variables for process state
|
| 39 |
+
processing_lock = threading.Lock()
|
| 40 |
+
current_status = {
|
| 41 |
+
"status": "Not started",
|
| 42 |
+
"last_check": None,
|
| 43 |
+
"last_updated": None,
|
| 44 |
"last_commit_hash": None,
|
|
|
|
|
|
|
| 45 |
"current_quant": None,
|
| 46 |
+
"quant_status": {},
|
| 47 |
"progress": 0,
|
| 48 |
+
"error": None,
|
| 49 |
+
"log": []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
}
|
| 51 |
|
| 52 |
+
def escape(s: str) -> str:
|
| 53 |
+
"""Escape HTML for logging"""
|
| 54 |
+
s = s.replace("&", "&")
|
| 55 |
+
s = s.replace("<", "<")
|
| 56 |
+
s = s.replace(">", ">")
|
| 57 |
+
s = s.replace('"', """)
|
| 58 |
+
s = s.replace("\n", "<br/>")
|
| 59 |
+
return s
|
| 60 |
|
| 61 |
+
def log_message(message: str, error: bool = False):
|
| 62 |
+
"""Add message to log with timestamp"""
|
| 63 |
+
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
| 64 |
+
log_entry = f"[{timestamp}] {message}"
|
| 65 |
+
print(log_entry)
|
| 66 |
+
current_status["log"].append(log_entry)
|
| 67 |
+
if error:
|
| 68 |
+
current_status["error"] = message
|
| 69 |
+
|
| 70 |
+
# Keep log size manageable
|
| 71 |
+
if len(current_status["log"]) > 100:
|
| 72 |
+
current_status["log"] = current_status["log"][-100:]
|
| 73 |
+
|
| 74 |
+
# Save current status to file
|
| 75 |
+
save_status()
|
| 76 |
+
|
| 77 |
+
def save_status():
|
| 78 |
+
"""Save current status to file"""
|
| 79 |
+
with open(STATUS_FILE, 'w') as f:
|
| 80 |
+
json.dump(current_status, f)
|
| 81 |
+
|
| 82 |
+
def load_status():
|
| 83 |
+
"""Load status from file if it exists"""
|
| 84 |
+
global current_status
|
| 85 |
+
if os.path.exists(STATUS_FILE):
|
| 86 |
+
try:
|
| 87 |
+
with open(STATUS_FILE, 'r') as f:
|
| 88 |
+
current_status = json.load(f)
|
| 89 |
+
except Exception as e:
|
| 90 |
+
log_message(f"Error loading status file: {str(e)}", error=True)
|
| 91 |
+
|
| 92 |
+
def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
|
| 93 |
+
"""Generate importance matrix for a model"""
|
| 94 |
+
imatrix_command = [
|
| 95 |
+
"./llama.cpp/llama-imatrix",
|
| 96 |
+
"-m", model_path,
|
| 97 |
+
"-f", train_data_path,
|
| 98 |
+
"-ngl", "99",
|
| 99 |
+
"--output-frequency", "10",
|
| 100 |
+
"-o", output_path,
|
| 101 |
+
]
|
| 102 |
+
|
| 103 |
+
if not os.path.isfile(model_path):
|
| 104 |
+
raise Exception(f"Model file not found: {model_path}")
|
| 105 |
+
|
| 106 |
+
log_message(f"Running imatrix command for {model_path}...")
|
| 107 |
+
process = subprocess.Popen(imatrix_command, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
try:
|
| 110 |
+
# Monitor the process for output to provide updates
|
| 111 |
+
for line in process.stdout:
|
| 112 |
+
log_message(f"imatrix: {line.strip()}")
|
| 113 |
+
|
| 114 |
+
process.wait(timeout=3600) # 1 hour timeout
|
| 115 |
+
except subprocess.TimeoutExpired:
|
| 116 |
+
log_message("Imatrix computation timed out. Sending SIGINT to allow graceful termination...", error=True)
|
| 117 |
+
process.send_signal(signal.SIGINT)
|
| 118 |
+
try:
|
| 119 |
+
process.wait(timeout=60) # 1 minute grace period
|
| 120 |
+
except subprocess.TimeoutExpired:
|
| 121 |
+
log_message("Imatrix process still didn't terminate. Forcefully terminating process...", error=True)
|
| 122 |
+
process.kill()
|
| 123 |
+
|
| 124 |
+
stderr = process.stderr.read()
|
| 125 |
+
if stderr:
|
| 126 |
+
log_message(f"Imatrix stderr: {stderr}")
|
| 127 |
+
|
| 128 |
+
log_message("Importance matrix generation completed.")
|
| 129 |
|
| 130 |
+
def get_last_commit(repo_id: str):
|
| 131 |
+
"""Get the last commit hash of a repository"""
|
| 132 |
try:
|
| 133 |
+
info = commit_info(repo_id)
|
| 134 |
+
return info.commit_id
|
| 135 |
except Exception as e:
|
| 136 |
+
log_message(f"Error getting commit info: {str(e)}", error=True)
|
| 137 |
return None
|
| 138 |
|
| 139 |
def check_for_updates():
|
| 140 |
+
"""Check if the source model has been updated"""
|
| 141 |
+
if processing_lock.locked():
|
| 142 |
+
log_message("Already processing, skipping update check")
|
| 143 |
+
return False
|
| 144 |
|
| 145 |
+
current_status["status"] = "Checking for updates"
|
| 146 |
+
current_status["last_check"] = datetime.now().isoformat()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
try:
|
| 149 |
+
# Get the latest commit hash
|
| 150 |
+
latest_commit = get_last_commit(SOURCE_MODEL_REPO)
|
| 151 |
+
if latest_commit is None:
|
| 152 |
+
current_status["status"] = "Error checking for updates"
|
| 153 |
+
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
+
log_message(f"Latest commit hash: {latest_commit}")
|
| 156 |
+
log_message(f"Previous commit hash: {current_status.get('last_commit_hash')}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
+
if current_status.get("last_commit_hash") != latest_commit:
|
| 159 |
+
current_status["status"] = "Update detected"
|
| 160 |
+
current_status["last_commit_hash"] = latest_commit
|
| 161 |
+
save_status()
|
| 162 |
+
return True
|
| 163 |
+
else:
|
| 164 |
+
current_status["status"] = "Up to date"
|
| 165 |
+
save_status()
|
| 166 |
+
return False
|
| 167 |
except Exception as e:
|
| 168 |
+
log_message(f"Error checking for updates: {str(e)}", error=True)
|
| 169 |
+
current_status["status"] = "Error checking for updates"
|
| 170 |
+
save_status()
|
| 171 |
+
return False
|
| 172 |
|
| 173 |
+
def process_model():
    """Download the source model, convert it to GGUF, and upload every quantization.

    Runs under ``processing_lock`` so only one conversion is in flight at a time;
    a concurrent caller returns immediately.  Progress and per-quantization
    results are recorded in the global ``current_status`` and persisted via
    ``save_status()`` (always, through the ``finally`` clause).
    """
    if processing_lock.locked():
        log_message("Already processing, cannot start another process")
        return

    with processing_lock:
        try:
            # Validate authentication up front so we fail fast with a clear status.
            try:
                user_info = whoami()
                log_message(f"Processing as user: {user_info['name']}")
            except Exception as e:
                log_message(f"Authentication error: {str(e)}. Please make sure you're logged in.", error=True)
                current_status["status"] = "Authentication error"
                save_status()
                return

            api = HfApi()
            model_name = SOURCE_MODEL_REPO.split('/')[-1]
            current_status["status"] = "Processing"
            current_status["progress"] = 0
            save_status()

            # Prepare working directories (idempotent).
            os.makedirs("downloads", exist_ok=True)
            os.makedirs("outputs", exist_ok=True)

            log_message(f"Starting model processing for {SOURCE_MODEL_REPO}")

            # FIX: the original used ``with Path("outputs").resolve() as outdir:``.
            # pathlib.Path is not a context manager (the accessor-closing protocol
            # was deprecated in Python 3.9 and removed in 3.13); a plain
            # assignment is the intended behavior.
            outdir = Path("outputs").resolve()
            log_message(f"Output directory: {outdir}")

            # Download the model
            log_message(f"Downloading model from {SOURCE_MODEL_REPO}")
            try:
                local_dir = Path("downloads") / model_name
                log_message(f"Local directory: {local_dir}")

                # Decide which weight format to fetch: prefer safetensors when present.
                dl_pattern = ["*.md", "*.json", "*.model"]
                try:
                    files = list_repo_files(SOURCE_MODEL_REPO)
                    has_safetensors = any(file.endswith(".safetensors") for file in files)
                    pattern = "*.safetensors" if has_safetensors else "*.bin"
                    dl_pattern.append(pattern)
                    log_message(f"Using download pattern: {dl_pattern}")
                except Exception as e:
                    # Could not list the repo; download both formats to be safe.
                    log_message(f"Error checking repo files: {str(e)}", error=True)
                    dl_pattern.append("*.safetensors")
                    dl_pattern.append("*.bin")

                # Download the model snapshot.
                api.snapshot_download(
                    repo_id=SOURCE_MODEL_REPO,
                    local_dir=local_dir,
                    local_dir_use_symlinks=False,
                    allow_patterns=dl_pattern
                )
                log_message("Model downloaded successfully!")

                # A repo with only adapter_config.json is a LoRA adapter, which
                # the full-model conversion script cannot handle.
                config_dir = local_dir / "config.json"
                adapter_config_dir = local_dir / "adapter_config.json"
                if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
                    raise Exception('adapter_config.json is present. If you are converting a LoRA adapter to GGUF, please use a different tool.')

                # Convert to FP16 first; all quantizations are derived from this file.
                fp16_path = str(outdir / f"{model_name}.fp16.gguf")
                log_message(f"Converting model to FP16: {fp16_path}")

                result = subprocess.run([
                    "python", CONVERSION_SCRIPT, str(local_dir), "--outtype", "f16", "--outfile", fp16_path
                ], shell=False, capture_output=True, text=True)

                if result.returncode != 0:
                    raise Exception(f"Error converting to fp16: {result.stderr}")

                log_message("Model converted to fp16 successfully!")

                # Generate the importance matrix required by IQ quantizations.
                # ``imatrix_path`` is set to None when generation fails so later
                # steps can tell "missing" from "present".
                imatrix_path = str(outdir / "imatrix.dat")
                train_data_path = "llama.cpp/groups_merged.txt"  # Default calibration dataset

                if not os.path.isfile(train_data_path):
                    log_message(f"Warning: Training data file not found: {train_data_path}. Some quantizations may not work.", error=True)
                else:
                    try:
                        generate_importance_matrix(fp16_path, train_data_path, imatrix_path)
                    except Exception as e:
                        log_message(f"Error generating importance matrix: {str(e)}", error=True)
                        imatrix_path = None

                # Process each quantization type, smallest first.
                total_quants = len(QUANT_CONFIGS)
                for i, quant_config in enumerate(QUANT_CONFIGS):
                    quant_type = quant_config["type"]
                    current_status["current_quant"] = quant_type
                    current_status["progress"] = int((i / total_quants) * 100)
                    save_status()

                    log_message(f"Processing quantization {i+1}/{total_quants}: {quant_type}")

                    try:
                        # IQ quantizations require the importance matrix.
                        is_iq_quant = quant_type.startswith("IQ")

                        if is_iq_quant and (imatrix_path is None or not os.path.exists(imatrix_path)):
                            log_message(f"Skipping {quant_type} as importance matrix is not available", error=True)
                            current_status["quant_status"][quant_type] = "Skipped - No imatrix"
                            continue

                        # One output repo per quantization type.
                        username = user_info["name"]
                        repo_name = f"{model_name}-{quant_type}-GGUF"
                        repo_id = f"{username}/{repo_name}"

                        quant_file_name = f"{model_name.lower()}-{quant_type.lower()}.gguf"
                        if is_iq_quant and quant_type != "f16":
                            quant_file_name = f"{model_name.lower()}-{quant_type.lower()}-imat.gguf"

                        quant_file_path = str(outdir / quant_file_name)

                        # Run quantization (list argv, shell=False: no shell injection).
                        if is_iq_quant and quant_type != "f16":
                            quantize_cmd = [
                                "./llama.cpp/llama-quantize",
                                "--imatrix", imatrix_path, fp16_path, quant_file_path, quant_type
                            ]
                        else:
                            quantize_cmd = [
                                "./llama.cpp/llama-quantize",
                                fp16_path, quant_file_path, quant_type
                            ]

                        log_message(f"Running quantization command: {' '.join(quantize_cmd)}")
                        result = subprocess.run(quantize_cmd, shell=False, capture_output=True, text=True)

                        if result.returncode != 0:
                            if "out of memory" in result.stderr.lower():
                                log_message(f"Out of memory error quantizing {quant_type}. Skipping larger models.", error=True)
                                current_status["quant_status"][quant_type] = "Failed - Out of memory"
                                # Configs are ordered smallest-to-largest, so every
                                # remaining quantization would also OOM.
                                break
                            else:
                                raise Exception(f"Error quantizing {quant_type}: {result.stderr}")

                        log_message(f"Quantized successfully with {quant_type}!")

                        # Create the repo if it doesn't exist
                        log_message(f"Creating/updating repo {repo_id}")
                        try:
                            repo_url = api.create_repo(repo_id=repo_id, exist_ok=True)
                            log_message(f"Repo URL: {repo_url}")
                        except Exception as e:
                            log_message(f"Error creating repo: {str(e)}", error=True)
                            current_status["quant_status"][quant_type] = "Failed - Repo creation error"
                            continue

                        # Create README with model info
                        log_message("Creating README")
                        readme_content = f"""# {repo_name}
This model was converted to GGUF format from [`{SOURCE_MODEL_REPO}`](https://huggingface.co/{SOURCE_MODEL_REPO}) using llama.cpp.

## Quantization: {quant_type}
Approximate size: {quant_config['size_gb']} GB
Notes: {quant_config['notes']}

## Use with llama.cpp
Install llama.cpp through brew (works on Mac and Linux)

```bash
brew install llama.cpp
```

Invoke the llama.cpp server or the CLI.

### CLI:
```bash
llama-cli --hf-repo {repo_id} --hf-file {quant_file_name} -p "The meaning to life and the universe is"
```

### Server:
```bash
llama-server --hf-repo {repo_id} --hf-file {quant_file_name} -c 2048
```

Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.

Step 1: Clone llama.cpp from GitHub.
```
git clone https://github.com/ggerganov/llama.cpp
```

Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
```
cd llama.cpp && LLAMA_CURL=1 make
```

Step 3: Run inference through the main binary.
```
./llama-cli --hf-repo {repo_id} --hf-file {quant_file_name} -p "The meaning to life and the universe is"
```
or
```
./llama-server --hf-repo {repo_id} --hf-file {quant_file_name} -c 2048
```

## Auto-generated
This model version was automatically generated when updates were detected in the source repository.
Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
"""
                        readme_path = outdir / "README.md"
                        with open(readme_path, 'w') as f:
                            f.write(readme_content)

                        # Upload the quantized model and README
                        log_message(f"Uploading quantized model: {quant_file_path}")
                        try:
                            api.upload_file(
                                path_or_fileobj=quant_file_path,
                                path_in_repo=quant_file_name,
                                repo_id=repo_id,
                            )

                            api.upload_file(
                                path_or_fileobj=str(readme_path),
                                path_in_repo="README.md",
                                repo_id=repo_id,
                            )

                            # FIX: the original tested ``os.path.isfile(imatrix_path)``
                            # first, which raises TypeError when imatrix generation
                            # failed and left ``imatrix_path`` as None.  Guard on the
                            # cheap checks before touching the filesystem.
                            if is_iq_quant and imatrix_path and os.path.isfile(imatrix_path):
                                log_message(f"Uploading imatrix.dat")
                                api.upload_file(
                                    path_or_fileobj=imatrix_path,
                                    path_in_repo="imatrix.dat",
                                    repo_id=repo_id,
                                )

                            log_message(f"Successfully uploaded {quant_type} quantization!")
                            current_status["quant_status"][quant_type] = "Success"
                        except Exception as e:
                            log_message(f"Error uploading files: {str(e)}", error=True)
                            current_status["quant_status"][quant_type] = f"Failed - Upload error: {str(e)}"

                    except Exception as e:
                        log_message(f"Error processing {quant_type}: {str(e)}", error=True)
                        current_status["quant_status"][quant_type] = f"Failed: {str(e)}"
                        # Continue with the next quantization

                # Update status after completion
                current_status["status"] = "Completed"
                current_status["progress"] = 100
                current_status["last_updated"] = datetime.now().isoformat()
                log_message("Model processing completed!")

            except Exception as e:
                log_message(f"Error during model processing: {str(e)}", error=True)
                current_status["status"] = "Error"
                current_status["error"] = str(e)
                traceback.print_exc()

        except Exception as e:
            log_message(f"Error: {str(e)}", error=True)
            current_status["status"] = "Error"
            current_status["error"] = str(e)
            traceback.print_exc()

        finally:
            # Persist the final state no matter how we exited.
            save_status()
| 449 |
+
def check_and_process():
    """Poll the source repo once and launch quantization in the background when it changed."""
    log_message("Running scheduled check for updates")
    if not check_for_updates():
        log_message("No updates detected")
        return
    log_message("Updates detected, starting processing")
    # Run the (long) conversion off the scheduler/UI thread.
    threading.Thread(target=process_model).start()
+
def create_ui():
    """Create the Gradio interface: status panel, manual-trigger buttons, quant table and log viewer."""
    with gr.Blocks(css="body { margin: 0; padding: 0; }") as demo:
        gr.Markdown("# 🦙 Automatic GGUF Quantization for Ursa_Minor")
        gr.Markdown(f"This space automatically creates quantized GGUF versions of the [Sculptor-AI/Ursa_Minor](https://huggingface.co/{SOURCE_MODEL_REPO}) model whenever it's updated.")

        with gr.Row():
            with gr.Column(scale=2):
                # Rendered HTML status card; refreshed by update_status().
                status_info = gr.HTML(label="Status", value="<p>Loading status...</p>")

            with gr.Column(scale=1):
                with gr.Row():
                    check_button = gr.Button("Check for Updates", variant="primary")
                    process_button = gr.Button("Force Processing", variant="secondary")

        # NOTE(review): gr.Progress is normally used as a function-call tracker,
        # not placed as a standalone component — confirm this renders as intended.
        progress_bar = gr.Progress(label="Progress")

        with gr.Tab("Quantization Status"):
            # Callable value so the table re-reads current_status on refresh.
            quant_status = gr.DataFrame(
                headers=["Type", "Size (GB)", "Notes", "Status"],
                value=lambda: [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS],
                label="Quantization Status"
            )

        with gr.Tab("Logs"):
            logs = gr.HTML(label="Logs", value="<p>Loading logs...</p>")

        def update_status():
            """Render current_status as an HTML status card (color-coded by state)."""
            status_html = f"""
            <div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px;">
                <h3>Current Status: <span style="color: {'green' if current_status['status'] == 'Up to date' else 'blue' if current_status['status'] == 'Processing' else 'red' if 'Error' in current_status['status'] else 'orange'}">{current_status['status']}</span></h3>
                <p><strong>Last Checked:</strong> {current_status.get('last_check', 'Never').replace('T', ' ').split('.')[0] if current_status.get('last_check') else 'Never'}</p>
                <p><strong>Last Updated:</strong> {current_status.get('last_updated', 'Never').replace('T', ' ').split('.')[0] if current_status.get('last_updated') else 'Never'}</p>
                <p><strong>Current Quantization:</strong> {current_status.get('current_quant', 'None')}</p>
                {f'<p style="color: red;"><strong>Error:</strong> {current_status["error"]}</p>' if current_status.get('error') else ''}
            </div>
            """
            return status_html

        def update_logs():
            """Render the in-memory log list as scrollable HTML; error lines in red."""
            logs_html = "<div style='height: 400px; overflow-y: auto; background-color: #f9f9f9; padding: 10px; font-family: monospace; white-space: pre-wrap;'>"
            for log in current_status["log"]:
                if "Error" in log or "error" in log:
                    logs_html += f"<div style='color: red;'>{log}</div>"
                else:
                    logs_html += f"<div>{log}</div>"
            logs_html += "</div>"
            return logs_html

        def on_check_button():
            """Check for updates; start processing in the background if one is found, then refresh all panels."""
            if check_for_updates():
                threading.Thread(target=process_model).start()
            return update_status(), [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS], update_logs()

        def on_process_button():
            """Unconditionally start processing in the background, then refresh all panels."""
            threading.Thread(target=process_model).start()
            return update_status(), [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS], update_logs()

        check_button.click(on_check_button, outputs=[status_info, quant_status, logs])
        process_button.click(on_process_button, outputs=[status_info, quant_status, logs])

        # Populate all three panels once on page load.
        demo.load(update_status, outputs=[status_info])
        demo.load(lambda: [[q["type"], q["size_gb"], q["notes"], current_status["quant_status"].get(q["type"], "Not processed")] for q in QUANT_CONFIGS], outputs=[quant_status])
        demo.load(update_logs, outputs=[logs])

        # Client-side auto-refresh: clicks any hidden refresh buttons every few seconds.
        # NOTE(review): relies on Gradio generating elements with id*=Refresh-Button —
        # fragile across Gradio versions; verify it still fires.
        refresh_interval = 5  # seconds
        gr.HTML("<script>setInterval(function(){ Array.from(document.querySelectorAll('button[id*=Refresh-Button]')).forEach(b => b.click()); }, " + str(refresh_interval * 1000) + ");</script>")

    return demo
+
# Initialize
|
| 534 |
+
def initialize():
    """Restore persisted state, arm the hourly update check, and run one check right away."""
    load_status()

    # Hourly background poll of the source repository.
    job_scheduler = BackgroundScheduler()
    job_scheduler.add_job(check_and_process, 'interval', minutes=60)
    job_scheduler.start()

    # Kick off an immediate check without blocking startup.
    threading.Thread(target=check_and_process).start()
| 547 |
+
if __name__ == "__main__":
    initialize()
    demo = create_ui()
    # FIX: ``queue(concurrency_count=1)`` is Gradio 3.x API; Gradio 4.x renamed
    # the kwarg to ``default_concurrency_limit`` and raises TypeError on the old
    # name.  requirements.txt only pins ``gradio>=3.50.2``, so support both.
    try:
        demo.queue(concurrency_count=1)
    except TypeError:
        demo.queue(default_concurrency_limit=1)
    demo.launch()
|
groups_merged.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
gradio>=
|
| 2 |
-
huggingface_hub>=0.
|
| 3 |
-
|
| 4 |
-
numpy>=1.24.0
|
|
|
|
| 1 |
+
gradio>=3.50.2
|
| 2 |
+
huggingface_hub>=0.17.1
|
| 3 |
+
apscheduler>=3.10.1
|
|
|
setup.sh
DELETED
|
@@ -1,48 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
|
| 2 |
-
set -e
|
| 3 |
-
|
| 4 |
-
echo "Setting up for real GGUF quantization..."
|
| 5 |
-
|
| 6 |
-
# Clone llama.cpp
|
| 7 |
-
if [ ! -d "llama.cpp" ]; then
|
| 8 |
-
echo "Cloning llama.cpp repository..."
|
| 9 |
-
git clone --depth=1 https://github.com/ggerganov/llama.cpp
|
| 10 |
-
fi
|
| 11 |
-
|
| 12 |
-
cd llama.cpp
|
| 13 |
-
|
| 14 |
-
# Get conversion script
|
| 15 |
-
echo "Setting up conversion script..."
|
| 16 |
-
if [ -f "convert.py" ]; then
|
| 17 |
-
echo "Found existing convert.py script"
|
| 18 |
-
elif [ -f "convert-hf-to-gguf.py" ]; then
|
| 19 |
-
echo "Found convert-hf-to-gguf.py"
|
| 20 |
-
cp convert-hf-to-gguf.py convert.py
|
| 21 |
-
elif [ -f "examples/convert-hf-to-gguf.py" ]; then
|
| 22 |
-
echo "Found examples/convert-hf-to-gguf.py"
|
| 23 |
-
cp examples/convert-hf-to-gguf.py convert.py
|
| 24 |
-
else
|
| 25 |
-
echo "Cannot find conversion script. Using Python alternative."
|
| 26 |
-
# Install required packages
|
| 27 |
-
pip install -q transformers torch
|
| 28 |
-
fi
|
| 29 |
-
|
| 30 |
-
# Install required packages for the conversion script
|
| 31 |
-
pip install -q transformers torch
|
| 32 |
-
|
| 33 |
-
# Initialize state file
|
| 34 |
-
cd ..
|
| 35 |
-
if [ ! -f "state.json" ]; then
|
| 36 |
-
echo "Initializing state file..."
|
| 37 |
-
echo '{"last_checked": null, "last_commit_hash": null, "is_up_to_date": true, "is_processing": false, "current_quant": null, "progress": 0, "total_quants": 12, "completed_quants": [], "failed_quants": [], "out_of_memory": false, "last_error": null, "status_message": "Ready to check for updates"}' > state.json
|
| 38 |
-
fi
|
| 39 |
-
|
| 40 |
-
# Create necessary directories
|
| 41 |
-
echo "Creating directories..."
|
| 42 |
-
mkdir -p model_cache
|
| 43 |
-
mkdir -p temp_outputs
|
| 44 |
-
|
| 45 |
-
echo "Setup completed successfully"
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
start.sh
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# Clone llama.cpp if not exists
|
| 4 |
+
if [ ! -d "llama.cpp" ]; then
|
| 5 |
+
echo "Cloning llama.cpp repository..."
|
| 6 |
+
git clone https://github.com/ggerganov/llama.cpp
|
| 7 |
+
fi
|
| 8 |
+
|
| 9 |
+
# Copy calibration data if not exists
|
| 10 |
+
if [ ! -f "llama.cpp/groups_merged.txt" ]; then
|
| 11 |
+
echo "Copying calibration data..."
|
| 12 |
+
cp groups_merged.txt llama.cpp/groups_merged.txt
|
| 13 |
+
fi
|
| 14 |
+
|
| 15 |
+
# Disable CUDA for HF spaces (not supported in free tier)
|
| 16 |
+
# We should still build with optimizations for CPU
|
| 17 |
+
export GGML_CUDA=OFF
|
| 18 |
+
export GGML_AVX=1
|
| 19 |
+
export GGML_AVX2=1
|
| 20 |
+
|
| 21 |
+
cd llama.cpp
|
| 22 |
+
echo "Building llama.cpp tools..."
|
| 23 |
+
cmake -B build -DBUILD_SHARED_LIBS=OFF
|
| 24 |
+
cmake --build build --config Release -j --target llama-quantize llama-gguf-split llama-imatrix
|
| 25 |
+
echo "Copying built binaries..."
|
| 26 |
+
cp ./build/bin/llama-* ./ 2>/dev/null || cp ./build/llama-* ./ 2>/dev/null
|
| 27 |
+
rm -rf build
|
| 28 |
+
|
| 29 |
+
cd ..
|
| 30 |
+
echo "Starting Gradio app..."
|
| 31 |
+
python app.py
|