# Source file of a Hugging Face Space (shown as "Paused"); file size: 16,878 bytes.
# Web-view residue (per-line commit hashes and the line-number gutter) removed.
import gradio as gr
import torch
import os
import logging
import time
import tempfile
import shutil
import subprocess
from datetime import datetime
from huggingface_hub import HfApi
from transformers import AutoConfig, AutoModel, AutoTokenizer
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
import torch.nn.utils.prune as prune
# Root logger: timestamped, level-tagged records at INFO and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Hub token read from the environment; the upload stage is skipped without it.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    logging.warning("HF_TOKEN environment variable not set. Packaging and uploading will fail.")

# Shared Hub client used by the pipeline stages below.
api = HfApi()

# Every run writes its exported/quantized artifacts under this directory.
OUTPUT_DIR = "optimized_models"
os.makedirs(OUTPUT_DIR, exist_ok=True)
def stage_1_analyze_model(model_id: str):
    """Inspect a Hub model's config and recommend an optimization pipeline.

    Args:
        model_id: Hub repository ID whose config is fetched.

    Returns:
        A 3-tuple ``(log_text, markdown_report, accordion_update)``. On
        success the accordion update opens the configuration panel; on any
        failure the log carries the error, the report is a generic message,
        and the accordion update keeps the panel closed.
    """
    stage_log = "[STAGE 1] Analyzing model...\n"
    # Substrings of `config.model_type` that mark a decoder-style LLM.
    llm_markers = ('llama', 'gpt', 'mistral', 'gemma')
    llm_advice = "**Recommendation:** This is a Large Language Model (LLM). For the best CPU performance and community support, the **GGUF Pipeline** is highly recommended. The ONNX pipeline is a viable alternative."
    encoder_advice = "**Recommendation:** This is likely an encoder model. The **ONNX Pipeline** is recommended. Pruning may offer size reduction, but its impact on performance can vary."

    try:
        arch = AutoConfig.from_pretrained(model_id, trust_remote_code=True, token=HF_TOKEN).model_type
        # Leading and trailing empty entries reproduce the blank first line
        # and the final newline of the report.
        report = "\n".join([
            "",
            "### Model Analysis Report",
            f"- **Model ID:** `{model_id}`",
            f"- **Architecture:** `{arch}`",
            "",
        ])
        is_llm = any(marker in arch for marker in llm_markers)
        advice = llm_advice if is_llm else encoder_advice
        stage_log += f"Analysis complete. Architecture: {arch}.\n"
        return stage_log, report + "\n" + advice, gr.Accordion(open=True)
    except Exception as e:
        failure = f"Failed to analyze model '{model_id}'. Error: {e}"
        logging.error(failure)
        return stage_log + failure, "Could not analyze model. Please check the model ID and try again.", gr.Accordion(open=False)
def stage_2_prune_model(model, prune_percentage: float):
    """Apply L1 unstructured pruning to the weight of every ``torch.nn.Linear``.

    Args:
        model: Module whose sub-modules are scanned for ``torch.nn.Linear``
            layers. It is modified in place.
        prune_percentage: Share of each Linear weight tensor to prune, in
            percent (divided by 100 before being passed to
            ``prune.l1_unstructured``). ``0`` skips pruning entirely.

    Returns:
        ``(model, log_text)`` where ``model`` is the same object that was
        passed in and ``log_text`` is a newline-terminated log fragment.
    """
    if prune_percentage == 0:
        # Newline-terminated like every other log fragment: the caller
        # concatenates fragments, so a missing newline glues the next entry
        # onto this line.
        return model, "Skipped pruning as percentage was 0.\n"

    log_stream = "[STAGE 2] Pruning model...\n"
    amount = prune_percentage / 100.0
    for module in model.modules():
        if isinstance(module, torch.nn.Linear):
            prune.l1_unstructured(module, name='weight', amount=amount)
            # Make the pruning permanent so `weight` is a plain parameter
            # again before the model is saved.
            prune.remove(module, 'weight')
    log_stream += f"Pruning complete with {prune_percentage}% target.\n"
    return model, log_stream
def stage_3_4_onnx_quantize(model_path: str, calibration_data_path: str):
    """Export a saved model to ONNX and quantize the exported model.

    The export runs ``optimum-cli export onnx`` in a subprocess and writes to
    a fresh, timestamped directory under ``OUTPUT_DIR``. The quantized model
    is written to a ``quantized-static`` or ``quantized-dynamic``
    sub-directory of that export directory.

    Args:
        model_path: Directory holding the model to export; its base name is
            used in the output directory name.
        calibration_data_path: Path to calibration data. A truthy value
            selects static quantization; a falsy value selects dynamic
            quantization.

    Returns:
        ``(quantized_path, log_text)``.

    Raises:
        RuntimeError: If the export subprocess exits non-zero or if any step
            of quantization fails. The original exception is chained as
            ``__cause__``.
    """
    log_stream = "[STAGE 3 & 4] Converting to ONNX and Quantizing...\n"
    run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
    model_name = os.path.basename(model_path)
    onnx_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-onnx")

    try:
        log_stream += "Executing `optimum-cli export onnx` via subprocess...\n"
        export_command = [
            "optimum-cli", "export", "onnx",
            "--model", model_path,
            "--trust-remote-code",
            onnx_path
        ]
        process = subprocess.run(export_command, check=True, capture_output=True, text=True)
        log_stream += process.stdout
        if process.stderr:
            log_stream += f"[STDERR]\n{process.stderr}\n"
        log_stream += f"Successfully exported base model to ONNX at: {onnx_path}\n"
    except subprocess.CalledProcessError as e:
        error_msg = f"Failed during `optimum-cli export onnx`. Error:\n{e.stderr}"
        logging.error(error_msg)
        # Chain the cause so the subprocess failure stays in the traceback.
        raise RuntimeError(error_msg) from e

    try:
        quantizer = ORTQuantizer.from_pretrained(onnx_path)

        if calibration_data_path:
            log_stream += "Performing STATIC quantization with user-provided calibration data.\n"
            dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=True, per_channel=False)
            # NOTE(review): confirm that the installed Optimum version accepts
            # `dataset_args` / `dataset_num_proc` here; a rejected keyword
            # surfaces as the RuntimeError raised below.
            calibration_dataset = quantizer.get_calibration_dataset(
                "text",
                dataset_args={"path": calibration_data_path, "split": "train"},
                num_samples=100,
                dataset_num_proc=1,
            )
            quantized_path = os.path.join(onnx_path, "quantized-static")
            quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig, calibration_dataset=calibration_dataset)
        else:
            log_stream += "Performing DYNAMIC quantization.\n"
            dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
            quantized_path = os.path.join(onnx_path, "quantized-dynamic")
            quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)

        log_stream += f"Successfully quantized model to: {quantized_path}\n"
        return quantized_path, log_stream
    except Exception as e:
        error_msg = f"Failed during ONNX quantization step. Error: {e}"
        logging.error(error_msg, exc_info=True)
        raise RuntimeError(error_msg) from e
def stage_3_4_gguf_quantize(model_id: str, quantization_strategy: str):
    """Export a Hub model to a quantized GGUF file via ``optimum-cli``.

    A fresh, timestamped directory is created under ``OUTPUT_DIR`` and the
    export command is asked to write ``model.gguf`` inside it.

    Args:
        model_id: Hub repository ID passed to ``--model``; ``/`` is replaced
            with ``_`` when building the output directory name.
        quantization_strategy: Value passed to ``--quantization_strategy``.

    Returns:
        ``(gguf_path, log_text)`` where ``gguf_path`` is the output directory
        (not the ``model.gguf`` file itself).

    Raises:
        RuntimeError: If the export subprocess exits non-zero. The
            ``CalledProcessError`` is chained as ``__cause__``.
    """
    log_stream = f"[STAGE 3 & 4] Converting to GGUF with '{quantization_strategy}' quantization...\n"
    run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
    model_name = model_id.replace('/', '_')
    gguf_path = os.path.join(OUTPUT_DIR, f"{model_name}-{run_id}-gguf")
    os.makedirs(gguf_path, exist_ok=True)
    output_file = os.path.join(gguf_path, "model.gguf")

    try:
        log_stream += "Executing `optimum-cli export gguf` via subprocess...\n"
        # NOTE(review): confirm that the installed optimum-cli provides an
        # `export gguf` sub-command with these flags.
        export_command = [
            "optimum-cli", "export", "gguf",
            "--model", model_id,
            "--quantization_strategy", quantization_strategy,
            "--trust-remote-code",
            output_file
        ]
        process = subprocess.run(export_command, check=True, capture_output=True, text=True)
        log_stream += process.stdout
        if process.stderr:
            log_stream += f"[STDERR]\n{process.stderr}\n"
        log_stream += f"Successfully exported and quantized model to GGUF at: {gguf_path}\n"
        return gguf_path, log_stream
    except subprocess.CalledProcessError as e:
        error_msg = f"Failed during `optimum-cli export gguf`. Error:\n{e.stderr}"
        logging.error(error_msg)
        # Chain the cause so the subprocess failure stays in the traceback.
        raise RuntimeError(error_msg) from e
def stage_5_package_and_upload(model_id: str, optimized_model_path: str, pipeline_log: str, options: dict):
    """Write a model card into the output folder and upload it to the Hub.

    Failures are reported through the return value, not raised: the first
    element starts with ``"Success!"`` only when the upload completed.

    Args:
        model_id: Source Hub repository ID; its last path segment names the
            target repository.
        optimized_model_path: Local folder that receives ``README.md`` (and
            the tokenizer files for ONNX) and is then uploaded.
        pipeline_log: Log text substituted into the model card template.
        options: Must contain ``'pipeline_type'`` (``"ONNX"`` or ``"GGUF"``).
            ``'prune'``, ``'prune_percent'`` and ``'quant_type'`` are optional
            and default to ``False``, ``0`` and ``'N/A'``.

    Returns:
        ``(final_message, log_text)``.
    """
    log_stream = "[STAGE 5] Packaging and Uploading...\n"
    if not HF_TOKEN:
        return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."

    try:
        repo_name = f"{model_id.split('/')[-1]}-amop-cpu-{options['pipeline_type'].lower()}"
        repo_url = api.create_repo(repo_id=repo_name, exist_ok=True, token=HF_TOKEN)

        # Template files are opened relative to the current working directory.
        if options['pipeline_type'] == "GGUF":
            template_file = "model_card_template_gguf.md"
        else:
            template_file = "model_card_template.md"
        with open(template_file, "r", encoding="utf-8") as f:
            template_content = f.read()

        model_card_content = template_content.format(
            repo_name=repo_name, model_id=model_id, optimization_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            pruning_status="Enabled" if options.get('prune', False) else "Disabled",
            pruning_percent=options.get('prune_percent', 0),
            quant_type=options.get('quant_type', 'N/A'),
            repo_id=repo_url.repo_id, pipeline_log=pipeline_log
        )
        readme_path = os.path.join(optimized_model_path, "README.md")
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(model_card_content)

        if options['pipeline_type'] == "ONNX":
            # Pass the token, as the stage-1 config load does, so gated or
            # private source models do not fail at the packaging step.
            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=HF_TOKEN)
            tokenizer.save_pretrained(optimized_model_path)

        api.upload_folder(folder_path=optimized_model_path, repo_id=repo_url.repo_id, repo_type="model", token=HF_TOKEN)
        final_message = f"Success! Your optimized model is available at: huggingface.co/{repo_url.repo_id}"
        log_stream += "Upload complete.\n"
        return final_message, log_stream
    except Exception as e:
        error_msg = f"Failed to upload to the Hub. Error: {e}"
        logging.error(error_msg, exc_info=True)
        return f"Error: {error_msg}", log_stream + error_msg
def run_amop_pipeline(model_id: str, pipeline_type: str, do_prune: bool, prune_percent: float, onnx_quant_type: str, calibration_file, gguf_quant_type: str):
    """Run the selected optimization pipeline, streaming UI updates.

    This is a generator used as a Gradio event handler. Each yielded dict
    maps output components (``run_button``, ``analyze_button``,
    ``final_output``, ``log_output``, ``success_box``) to their new state.

    Args:
        model_id: Hub repository ID of the model to optimize.
        pipeline_type: ``"ONNX"`` or ``"GGUF"``; anything else ends in the
            error state.
        do_prune: Whether to prune before ONNX export (ONNX pipeline only).
        prune_percent: Pruning percentage forwarded to ``stage_2_prune_model``.
        onnx_quant_type: ``"Static"`` uses ``calibration_file`` when one was
            uploaded; otherwise dynamic quantization is used.
        calibration_file: Uploaded file object whose ``.name`` is used as the
            calibration data path, or ``None``.
        gguf_quant_type: Quantization strategy for the GGUF pipeline.

    Yields:
        Dicts of component updates. The final one reports success, a skipped
        upload, or an error, and re-enables both buttons.
    """
    # Pasted IDs often carry stray whitespace; treat whitespace-only as empty.
    model_id = (model_id or "").strip()
    if not model_id:
        yield {log_output: "Please enter a Model ID.", final_output: gr.Label(value="Idle", label="Status")}
        return

    def _restored_buttons():
        # Every terminal state hands both buttons back to the user.
        return {
            run_button: gr.Button(interactive=True, value="Run Optimization Pipeline", variant="primary"),
            analyze_button: gr.Button(interactive=True, value="Analyze Model"),
        }

    initial_log = f"[START] AMOP {pipeline_type} Pipeline Initiated.\n"
    # NOTE(review): the leading glyph in "π Running..." looks like a
    # mis-decoded emoji; it is kept as found because the original character
    # cannot be determined from this file.
    yield {
        run_button: gr.Button(interactive=False, value="π Running..."),
        analyze_button: gr.Button(interactive=False),
        final_output: gr.Label(value={"label": f"RUNNING ({pipeline_type})"}, show_label=True),
        log_output: initial_log
    }

    full_log = initial_log
    temp_model_dir = None
    try:
        if pipeline_type not in ("ONNX", "GGUF"):
            raise ValueError("Invalid pipeline type selected.")

        # Resolve the target repo up front so a bad token fails before the
        # expensive work. Without a token, stage 5 skips the upload and no
        # link is needed.
        repo_id_for_link = None
        if HF_TOKEN:
            repo_name_suffix = f"-amop-cpu-{pipeline_type.lower()}"
            repo_id_for_link = f"{api.whoami(token=HF_TOKEN)['name']}/{model_id.split('/')[-1]}{repo_name_suffix}"

        if pipeline_type == "ONNX":
            full_log += "Loading base model for pruning...\n"
            yield {final_output: gr.Label(value="Loading model (1/5)"), log_output: full_log}
            # Token passed as in the stage-1 config load, so gated/private
            # models load here too.
            model = AutoModel.from_pretrained(model_id, trust_remote_code=True, token=HF_TOKEN)
            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=HF_TOKEN)
            full_log += f"Successfully loaded base model '{model_id}'.\n"

            yield {final_output: gr.Label(value="Pruning model (2/5)"), log_output: full_log}
            if do_prune:
                model, log = stage_2_prune_model(model, prune_percent)
                full_log += log
            else:
                full_log += "[STAGE 2] Pruning skipped by user.\n"

            # The ONNX exporter reads from disk, so persist the (possibly
            # pruned) model first; the directory is removed in `finally`.
            temp_model_dir = tempfile.mkdtemp()
            model.save_pretrained(temp_model_dir)
            tokenizer.save_pretrained(temp_model_dir)
            full_log += f"Saved intermediate model to temporary directory: {temp_model_dir}\n"

            yield {final_output: gr.Label(value="Converting to ONNX (3/5)"), log_output: full_log}
            calib_path = calibration_file.name if onnx_quant_type == "Static" and calibration_file else None
            optimized_path, log = stage_3_4_onnx_quantize(temp_model_dir, calib_path)
            full_log += log
            options = {'pipeline_type': 'ONNX', 'prune': do_prune, 'prune_percent': prune_percent, 'quant_type': onnx_quant_type}
        else:
            full_log += "[STAGE 1 & 2] Loading and Pruning are skipped for GGUF pipeline.\n"
            yield {final_output: gr.Label(value="Converting to GGUF (3/5)"), log_output: full_log}
            optimized_path, log = stage_3_4_gguf_quantize(model_id, gguf_quant_type)
            full_log += log
            options = {'pipeline_type': 'GGUF', 'quant_type': gguf_quant_type}

        yield {final_output: gr.Label(value="Packaging & Uploading (4/5)"), log_output: full_log}
        final_message, log = stage_5_package_and_upload(model_id, optimized_path, full_log, options)
        full_log += log

        # Stage 5 reports skip/failure through its return value, so inspect
        # it instead of assuming the upload worked.
        if final_message.startswith("Success!"):
            yield {
                final_output: gr.Label(value="SUCCESS", label="Status"),
                log_output: full_log,
                success_box: gr.Markdown(f"✅ **Success!** Your optimized model is available here: [{repo_id_for_link}](https://huggingface.co/{repo_id_for_link})", visible=True),
                **_restored_buttons(),
            }
        elif not HF_TOKEN:
            yield {
                final_output: gr.Label(value="DONE (upload skipped)", label="Status"),
                log_output: full_log,
                success_box: gr.Markdown(f"**Optimization finished, upload skipped.** `HF_TOKEN` is not set; the optimized model was saved locally at `{optimized_path}`.", visible=True),
                **_restored_buttons(),
            }
        else:
            # Route an upload failure into the common error state below.
            raise RuntimeError(final_message)
    except Exception as e:
        logging.error(f"AMOP Pipeline failed. Error: {e}", exc_info=True)
        full_log += f"\n[ERROR] Pipeline failed: {e}"
        yield {
            final_output: gr.Label(value="ERROR", label="Status"),
            log_output: full_log,
            success_box: gr.Markdown("❌ **An error occurred.** Check the logs for details.", visible=True),
            **_restored_buttons(),
        }
    finally:
        if temp_model_dir and os.path.exists(temp_model_dir):
            shutil.rmtree(temp_model_dir)
            logging.info(f"Cleaned up temporary directory: {temp_model_dir}")
# UI layout and event wiring for the AMOP Space.
# NOTE(review): several labels below start with glyphs that look like
# mis-decoded emoji ("π", "βοΈ"); they are kept exactly as found because the
# original characters cannot be determined from this file.
# NOTE(review): container nesting was reconstructed from the statement order;
# in particular, confirm whether `run_button` belongs inside the accordion or
# directly in the left column.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# π AMOP: Adaptive Model Optimization Pipeline")
    gr.Markdown("Turn any Hugging Face Hub model into a CPU-optimized version using ONNX or GGUF.")

    if not HF_TOKEN:
        # `gr.Warning` shows a toast only when called from inside an event
        # handler; at layout-build time nothing would appear in the page, so
        # render the notice as a visible banner instead.
        gr.Markdown("**Warning:** You have not set your HF_TOKEN in the Space secrets! The final 'upload' step will be skipped. Please add a secret with the key `HF_TOKEN` and your Hugging Face write token as the value.")

    with gr.Row():
        # Left column: model selection and optimization settings.
        with gr.Column(scale=1):
            gr.Markdown("### 1. Select a Model")
            model_id_input = gr.Textbox(
                label="Hugging Face Model ID",
                placeholder="e.g., gpt2, meta-llama/Llama-2-7b-chat-hf",
            )
            analyze_button = gr.Button("π Analyze Model", variant="secondary")

            # Starts closed; the analyze handler returns an update that opens it.
            with gr.Accordion("βοΈ 2. Configure Optimization", open=False) as optimization_accordion:
                analysis_report_output = gr.Markdown()
                pipeline_type_radio = gr.Radio(
                    ["ONNX", "GGUF"], label="Select Optimization Pipeline", info="GGUF is recommended for LLMs, ONNX for others."
                )

                # Only the group matching the selected pipeline is shown.
                with gr.Group(visible=False) as onnx_options:
                    gr.Markdown("#### ONNX Pipeline Options")
                    prune_checkbox = gr.Checkbox(label="Enable Pruning", value=False, info="Removes redundant weights. Applied before ONNX conversion.")
                    prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")
                    onnx_quant_radio = gr.Radio(["Dynamic", "Static"], label="ONNX Quantization Type", value="Dynamic", info="Static may offer better performance but requires calibration data.")
                    calibration_file_upload = gr.File(label="Upload Calibration Data (.txt)", visible=False, file_types=['.txt'])

                with gr.Group(visible=False) as gguf_options:
                    gr.Markdown("#### GGUF Pipeline Options")
                    gguf_quant_dropdown = gr.Dropdown(
                        ["q4_k_m", "q5_k_m", "q8_0", "f16"],
                        label="GGUF Quantization Strategy",
                        value="q4_k_m",
                        info="q4_k_m is a good balance of size and quality."
                    )

                run_button = gr.Button("π Run Optimization Pipeline", variant="primary")

        # Right column: status label, result banner and streamed logs.
        with gr.Column(scale=2):
            gr.Markdown("### Pipeline Status & Logs")
            final_output = gr.Label(value="Idle", label="Status", show_label=True)
            success_box = gr.Markdown(visible=False)
            log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False, max_lines=20)

    def update_ui_for_pipeline(pipeline_type):
        """Show the option group that matches the selected pipeline."""
        return {
            onnx_options: gr.Group(visible=pipeline_type == "ONNX"),
            gguf_options: gr.Group(visible=pipeline_type == "GGUF")
        }

    def update_ui_for_quant_type(quant_type):
        """Show the calibration upload only for static quantization."""
        return gr.File(visible=quant_type == "Static")

    pipeline_type_radio.change(fn=update_ui_for_pipeline, inputs=pipeline_type_radio, outputs=[onnx_options, gguf_options])
    onnx_quant_radio.change(fn=update_ui_for_quant_type, inputs=onnx_quant_radio, outputs=[calibration_file_upload])

    analyze_button.click(
        fn=stage_1_analyze_model,
        inputs=[model_id_input],
        outputs=[log_output, analysis_report_output, optimization_accordion]
    )

    run_button.click(
        fn=run_amop_pipeline,
        inputs=[model_id_input, pipeline_type_radio, prune_checkbox, prune_slider, onnx_quant_radio, calibration_file_upload, gguf_quant_dropdown],
        outputs=[run_button, analyze_button, final_output, log_output, success_box]
    )
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)