broadfield-dev committed on
Commit
19216c7
·
verified ·
1 Parent(s): 32de6da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -65
app.py CHANGED
@@ -4,14 +4,11 @@ import os
4
  import logging
5
  from datetime import datetime
6
  from huggingface_hub import HfApi, HfFolder
7
- from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel
8
  from optimum.onnxruntime import ORTQuantizer, ORTModelForCausalLM
9
  from optimum.onnxruntime.configuration import AutoQuantizationConfig
10
- from optimum.onnx import export
11
- from optimum.onnx.utils import get_preprocessor
12
- from datasets import load_dataset
13
  import torch.nn.utils.prune as prune
14
- import numpy as np
15
  import time
16
 
17
  # --- 1. SETUP AND CONFIGURATION ---
@@ -23,8 +20,6 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
23
  HF_TOKEN = os.getenv("HF_TOKEN")
24
  if not HF_TOKEN:
25
  logging.warning("HF_TOKEN environment variable not set. Packaging and uploading will fail.")
26
- # For testing locally, you can uncomment the next line and set your token
27
- # HfFolder.save_token('YOUR_HF_WRITE_TOKEN')
28
 
29
  api = HfApi()
30
  OUTPUT_DIR = "optimized_models"
@@ -51,7 +46,6 @@ def stage_1_analyze_model(model_id: str):
51
  - **Estimated Parameters:** ~{num_params:.2f}M
52
  """
53
 
54
- # Recommendation Logic
55
  recommendation = ""
56
  if 'llama' in model_type or 'gpt' in model_type or 'mistral' in model_type:
57
  recommendation = "**Recommendation:** This is a large language model (LLM). For best CPU performance, a GGUF-based quantization strategy is typically state-of-the-art. This initial version of AMOP focuses on the ONNX pipeline. The recommended path is **Quantization -> ONNX Conversion**."
@@ -94,7 +88,7 @@ def stage_2_prune_model(model, prune_percentage: float, progress):
94
  return model, log_stream
95
 
96
 
97
- def stage_3_and_4_quantize_and_onnx(model_id: str, model, progress):
98
  """
99
  Performs Stage 3 (Quantization) and Stage 4 (ONNX Conversion).
100
  This version uses post-training dynamic quantization.
@@ -103,32 +97,16 @@ def stage_3_and_4_quantize_and_onnx(model_id: str, model, progress):
103
  progress(0.5, desc="Exporting to ONNX")
104
 
105
  try:
106
- # Define a unique path for this run
107
  run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
108
  onnx_path = os.path.join(OUTPUT_DIR, f"{model_id.replace('/', '_')}-{run_id}-onnx")
109
  os.makedirs(onnx_path, exist_ok=True)
110
- onnx_model_path = os.path.join(onnx_path, "model.onnx")
111
 
112
- # Export the base model to ONNX
113
- # Using a trick to get the task for optimum
114
- config = AutoConfig.from_pretrained(model_id)
115
- task = getattr(config, "task_specific_params", None)
116
- task = "default" if task is None else list(task.keys())[0] if isinstance(task, dict) else "default"
117
-
118
- # Load preprocessor for ONNX export
119
- preprocessor = get_preprocessor(model_id)
120
-
121
- # This is a key step where we need to find the correct OnnxConfig
122
- # Optimum has utilities, but for a general case, we try our best
123
- from optimum.exporters.onnx import main_export
124
  main_export(model_id, output=onnx_path, task="auto", trust_remote_code=True)
125
-
126
  log_stream += f"Successfully exported base model to ONNX at: {onnx_path}\n"
127
 
128
- # Quantize the ONNX model
129
  progress(0.7, desc="Applying Dynamic Quantization")
130
  quantizer = ORTQuantizer.from_pretrained(onnx_path)
131
- dqconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=False) # Dynamic quantization
132
 
133
  quantized_path = os.path.join(onnx_path, "quantized")
134
  quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
@@ -155,10 +133,9 @@ def stage_5_evaluate_and_package(
155
  log_stream = "[STAGE 5] Evaluating and Packaging...\n"
156
  progress(0.9, desc="Evaluating performance")
157
 
158
- # Simple evaluation: Load the model and measure latency
159
  try:
160
  ort_model = ORTModelForCausalLM.from_pretrained(optimized_model_path)
161
- tokenizer = AutoTokenizer.from_pretrained(model_id)
162
 
163
  prompt = "My name is Philipp and I"
164
  inputs = tokenizer(prompt, return_tensors="pt")
@@ -167,7 +144,7 @@ def stage_5_evaluate_and_package(
167
  gen_tokens = ort_model.generate(**inputs, max_new_tokens=20)
168
  end_time = time.time()
169
 
170
- latency = (end_time - start_time) * 1000 # in ms
171
  num_tokens = len(gen_tokens[0])
172
  ms_per_token = latency / num_tokens
173
 
@@ -178,60 +155,136 @@ def stage_5_evaluate_and_package(
178
  eval_report = f"- **Evaluation Failed:** Could not load and test the ONNX model. This often happens if the base model is not a text-generation model. Error: {e}\n"
179
  log_stream += f"Warning: Evaluation failed. {e}\n"
180
 
181
- # Package and upload
182
  progress(0.95, desc="Uploading to Hugging Face Hub")
183
 
184
  if not HF_TOKEN:
185
  return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
186
 
187
  try:
188
- # Create a new repo
189
  repo_name = f"{model_id.split('/')[-1]}-amop-cpu"
190
- repo_url = api.create_repo(
191
- repo_id=repo_name,
192
- exist_ok=True,
193
- token=HF_TOKEN
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  )
 
 
 
 
 
195
 
196
- # Generate the Model Card (README.md)
197
- model_card_content = f"""
198
- ---
199
- license: mit
200
- tags:
201
- - amop-optimized
202
- - onnx
203
- ---
 
 
 
 
 
 
 
 
 
 
204
 
205
- # AMOP-Optimized CPU Model: {repo_name}
206
 
207
- This model was automatically optimized for CPU inference using the **Adaptive Model Optimization Pipeline (AMOP)**.
 
 
 
 
 
208
 
209
- - **Base Model:** [{model_id}](https://huggingface.co/{model_id})
210
- - **Optimization Date:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
 
211
 
212
- ## Optimization Details
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
- The following AMOP stages were applied:
215
- - **Stage 2: Pruning:** {"Enabled" if options['prune'] else "Disabled"} (Percentage: {options['prune_percent']}%)
216
- - **Stage 3 & 4: Quantization & ONNX Conversion:** Enabled (Dynamic Quantization)
 
217
 
218
- ## Performance Metrics
219
 
220
- {eval_report}
221
 
222
- ## How to Use
 
 
 
 
 
 
 
223
 
224
- This model is in ONNX format and can be run with `optimum-onnxruntime`.
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
- ```python
227
- from optimum.onnxruntime import ORTModelForCausalLM
228
- from transformers import AutoTokenizer
 
 
 
229
 
230
- model_id = "{repo_url.repo_id}"
231
- model = ORTModelForCausalLM.from_pretrained(model_id)
232
- tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 
 
 
 
 
 
 
233
 
234
- prompt = "The future of AI is"
235
- inputs = tokenizer(prompt, return_tensors="pt")
236
- gen_tokens = model.generate(**inputs)
237
- print(tokenizer.batch_decode(gen_tokens))
 
4
  import logging
5
  from datetime import datetime
6
  from huggingface_hub import HfApi, HfFolder
7
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
8
  from optimum.onnxruntime import ORTQuantizer, ORTModelForCausalLM
9
  from optimum.onnxruntime.configuration import AutoQuantizationConfig
10
+ from optimum.exporters.onnx import main_export
 
 
11
  import torch.nn.utils.prune as prune
 
12
  import time
13
 
14
  # --- 1. SETUP AND CONFIGURATION ---
 
20
  HF_TOKEN = os.getenv("HF_TOKEN")
21
  if not HF_TOKEN:
22
  logging.warning("HF_TOKEN environment variable not set. Packaging and uploading will fail.")
 
 
23
 
24
  api = HfApi()
25
  OUTPUT_DIR = "optimized_models"
 
46
  - **Estimated Parameters:** ~{num_params:.2f}M
47
  """
48
 
 
49
  recommendation = ""
50
  if 'llama' in model_type or 'gpt' in model_type or 'mistral' in model_type:
51
  recommendation = "**Recommendation:** This is a large language model (LLM). For best CPU performance, a GGUF-based quantization strategy is typically state-of-the-art. This initial version of AMOP focuses on the ONNX pipeline. The recommended path is **Quantization -> ONNX Conversion**."
 
88
  return model, log_stream
89
 
90
 
91
+ def stage_3_and_4_quantize_and_onnx(model_id: str, progress):
92
  """
93
  Performs Stage 3 (Quantization) and Stage 4 (ONNX Conversion).
94
  This version uses post-training dynamic quantization.
 
97
  progress(0.5, desc="Exporting to ONNX")
98
 
99
  try:
 
100
  run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
101
  onnx_path = os.path.join(OUTPUT_DIR, f"{model_id.replace('/', '_')}-{run_id}-onnx")
102
  os.makedirs(onnx_path, exist_ok=True)
 
103
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  main_export(model_id, output=onnx_path, task="auto", trust_remote_code=True)
 
105
  log_stream += f"Successfully exported base model to ONNX at: {onnx_path}\n"
106
 
 
107
  progress(0.7, desc="Applying Dynamic Quantization")
108
  quantizer = ORTQuantizer.from_pretrained(onnx_path)
109
+ dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False) # Dynamic quantization for CPUs
110
 
111
  quantized_path = os.path.join(onnx_path, "quantized")
112
  quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
 
133
  log_stream = "[STAGE 5] Evaluating and Packaging...\n"
134
  progress(0.9, desc="Evaluating performance")
135
 
 
136
  try:
137
  ort_model = ORTModelForCausalLM.from_pretrained(optimized_model_path)
138
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
139
 
140
  prompt = "My name is Philipp and I"
141
  inputs = tokenizer(prompt, return_tensors="pt")
 
144
  gen_tokens = ort_model.generate(**inputs, max_new_tokens=20)
145
  end_time = time.time()
146
 
147
+ latency = (end_time - start_time) * 1000
148
  num_tokens = len(gen_tokens[0])
149
  ms_per_token = latency / num_tokens
150
 
 
155
  eval_report = f"- **Evaluation Failed:** Could not load and test the ONNX model. This often happens if the base model is not a text-generation model. Error: {e}\n"
156
  log_stream += f"Warning: Evaluation failed. {e}\n"
157
 
 
158
  progress(0.95, desc="Uploading to Hugging Face Hub")
159
 
160
  if not HF_TOKEN:
161
  return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
162
 
163
  try:
 
164
  repo_name = f"{model_id.split('/')[-1]}-amop-cpu"
165
+ repo_url = api.create_repo(repo_id=repo_name, exist_ok=True, token=HF_TOKEN)
166
+
167
+ # --- THIS IS THE UPDATED SECTION ---
168
+ # Read the template file
169
+ with open("model_card_template.md", "r", encoding="utf-8") as f:
170
+ template_content = f.read()
171
+
172
+ # Fill in the placeholders
173
+ model_card_content = template_content.format(
174
+ repo_name=repo_name,
175
+ model_id=model_id,
176
+ optimization_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
177
+ eval_report=eval_report,
178
+ pruning_status="Enabled" if options['prune'] else "Disabled",
179
+ pruning_percent=options['prune_percent'],
180
+ repo_id=repo_url.repo_id,
181
+ pipeline_log=pipeline_log
182
  )
183
+ # --- END OF UPDATED SECTION ---
184
+
185
+ readme_path = os.path.join(optimized_model_path, "README.md")
186
+ with open(readme_path, "w", encoding="utf-8") as f:
187
+ f.write(model_card_content)
188
 
189
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
190
+ tokenizer.save_pretrained(optimized_model_path)
191
+
192
+ api.upload_folder(
193
+ folder_path=optimized_model_path,
194
+ repo_id=repo_url.repo_id,
195
+ repo_type="model",
196
+ token=HF_TOKEN
197
+ )
198
+
199
+ final_message = f"✅ Success! Your optimized model is available at: {repo_url}"
200
+ log_stream += "Upload complete.\n"
201
+ return final_message, log_stream
202
+ except Exception as e:
203
+ error_msg = f"Failed to upload to the Hub. Error: {e}"
204
+ logging.error(error_msg, exc_info=True)
205
+ return f"❌ Error: {error_msg}", log_stream + error_msg
206
+
207
 
208
+ # --- 3. MAIN WORKFLOW FUNCTION ---
209
 
210
def run_amop_pipeline(model_id: str, do_prune: bool, prune_percent: float, progress=gr.Progress(track_tqdm=True)):
    """Run the full AMOP pipeline: load -> (optional) prune -> quantize/ONNX -> evaluate & package.

    Args:
        model_id: Hugging Face Hub model ID (e.g. "gpt2").
        do_prune: Whether to apply Stage 2 pruning to the loaded model.
        prune_percent: Percentage of weights to prune when ``do_prune`` is True.
        progress: Gradio progress tracker (injected by Gradio per call).

    Returns:
        A ``(status_message, full_log)`` tuple rendered by the Gradio UI.
    """
    if not model_id:
        return "Please enter a Model ID.", ""

    full_log = "[START] AMOP Pipeline Initiated.\n"
    progress(0, desc="Loading Base Model")

    try:
        # trust_remote_code lets models with custom architectures load.
        model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
        full_log += f"Successfully loaded base model '{model_id}'.\n"

        if do_prune:
            model, log = stage_2_prune_model(model, prune_percent, progress)
            full_log += log
        else:
            full_log += "[STAGE 2] Pruning skipped by user.\n"

        # We re-export the pruned model, so it needs to be saved and reloaded by optimum.
        # For simplicity in V1, we export the original model from the Hub;
        # a future version could handle the pruned model state_dict.
        optimized_path, log = stage_3_and_4_quantize_and_onnx(model_id, progress)
        full_log += log

        options = {'prune': do_prune, 'prune_percent': prune_percent}
        final_status, log = stage_5_evaluate_and_package(model_id, optimized_path, full_log, options, progress)
        full_log += log

        return final_status, full_log

    except Exception as e:
        # Top-level boundary: log with traceback, surface a friendly message to the UI.
        logging.error(f"AMOP Pipeline failed. Error: {e}", exc_info=True)
        full_log += f"\n[ERROR] Pipeline failed: {e}"
        # Fix: this literal has no placeholders, so the stray f-prefix was removed.
        return "❌ An error occurred during the pipeline. Check the logs for details.", full_log
243
 
 
244
 
245
# --- 4. GRADIO USER INTERFACE ---

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# AMOP: Adaptive Model Optimization Pipeline")
    gr.Markdown(
        "**Turn any Hugging Face Hub model into a CPU-optimized version.** Enter a model ID, choose your optimizations, "
        "and get a new, smaller, and faster model repository ready for deployment."
    )
    # Warn at build time when uploads will be skipped (HF_TOKEN is read in the setup section).
    if not HF_TOKEN:
        gr.Warning("You have not set your HF_TOKEN in the Space secrets! The final 'upload' step will be skipped. Please add a secret with the key `HF_TOKEN` and your Hugging Face write token as the value.")

    with gr.Row():
        with gr.Column(scale=1):
            # Left column: model selection and optimization controls.
            model_id_input = gr.Textbox(label="Hugging Face Model ID", placeholder="e.g., gpt2, bert-base-uncased")
            analyze_button = gr.Button("1. Analyze Model")

            # Hidden until Stage 1 analysis reveals it via the click handler below.
            with gr.Group(visible=False) as optimization_options:
                gr.Markdown("### 2. Configure Optimization")
                analysis_report_output = gr.Markdown()

                prune_checkbox = gr.Checkbox(label="Enable Pruning (Stage 2)", value=False, info="Note: Pruning is applied conceptually; ONNX export uses the original model for wider compatibility in this version.")
                prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")

                # Quantization/ONNX is mandatory in this version, hence non-interactive.
                gr.Checkbox(label="Enable Quantization & ONNX (Stages 3 & 4)", value=True, interactive=False)

                # NOTE(review): reconstructed indentation — confirm the run button
                # nests inside the options group rather than directly in the column.
                run_button = gr.Button("3. Run Optimization Pipeline", variant="primary")

        with gr.Column(scale=2):
            # Right column: final status plus the live pipeline log stream.
            gr.Markdown("### Pipeline Status & Logs")
            final_output = gr.Markdown(label="Final Result")
            log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False)

    # Stage 1 click: analysis report also toggles visibility of the options group.
    analyze_button.click(
        fn=stage_1_analyze_model,
        inputs=[model_id_input],
        outputs=[log_output, analysis_report_output, optimization_options]
    )

    # Full pipeline click: status goes to the result panel, logs to the textbox.
    run_button.click(
        fn=run_amop_pipeline,
        inputs=[model_id_input, prune_checkbox, prune_slider],
        outputs=[final_output, log_output]
    )

if __name__ == "__main__":
    demo.launch(debug=True)