broadfield-dev committed on
Commit
19216c7
·
verified ·
1 Parent(s): 32de6da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -65
app.py CHANGED
@@ -4,14 +4,11 @@ import os
4
  import logging
5
  from datetime import datetime
6
  from huggingface_hub import HfApi, HfFolder
7
- from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel
8
  from optimum.onnxruntime import ORTQuantizer, ORTModelForCausalLM
9
  from optimum.onnxruntime.configuration import AutoQuantizationConfig
10
- from optimum.onnx import export
11
- from optimum.onnx.utils import get_preprocessor
12
- from datasets import load_dataset
13
  import torch.nn.utils.prune as prune
14
- import numpy as np
15
  import time
16
 
17
  # --- 1. SETUP AND CONFIGURATION ---
@@ -23,8 +20,6 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
23
  HF_TOKEN = os.getenv("HF_TOKEN")
24
  if not HF_TOKEN:
25
  logging.warning("HF_TOKEN environment variable not set. Packaging and uploading will fail.")
26
- # For testing locally, you can uncomment the next line and set your token
27
- # HfFolder.save_token('YOUR_HF_WRITE_TOKEN')
28
 
29
  api = HfApi()
30
  OUTPUT_DIR = "optimized_models"
@@ -51,7 +46,6 @@ def stage_1_analyze_model(model_id: str):
51
  - **Estimated Parameters:** ~{num_params:.2f}M
52
  """
53
 
54
- # Recommendation Logic
55
  recommendation = ""
56
  if 'llama' in model_type or 'gpt' in model_type or 'mistral' in model_type:
57
  recommendation = "**Recommendation:** This is a large language model (LLM). For best CPU performance, a GGUF-based quantization strategy is typically state-of-the-art. This initial version of AMOP focuses on the ONNX pipeline. The recommended path is **Quantization -> ONNX Conversion**."
@@ -94,7 +88,7 @@ def stage_2_prune_model(model, prune_percentage: float, progress):
94
  return model, log_stream
95
 
96
 
97
- def stage_3_and_4_quantize_and_onnx(model_id: str, model, progress):
98
  """
99
  Performs Stage 3 (Quantization) and Stage 4 (ONNX Conversion).
100
  This version uses post-training dynamic quantization.
@@ -103,32 +97,16 @@ def stage_3_and_4_quantize_and_onnx(model_id: str, model, progress):
103
  progress(0.5, desc="Exporting to ONNX")
104
 
105
  try:
106
- # Define a unique path for this run
107
  run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
108
  onnx_path = os.path.join(OUTPUT_DIR, f"{model_id.replace('/', '_')}-{run_id}-onnx")
109
  os.makedirs(onnx_path, exist_ok=True)
110
- onnx_model_path = os.path.join(onnx_path, "model.onnx")
111
 
112
- # Export the base model to ONNX
113
- # Using a trick to get the task for optimum
114
- config = AutoConfig.from_pretrained(model_id)
115
- task = getattr(config, "task_specific_params", None)
116
- task = "default" if task is None else list(task.keys())[0] if isinstance(task, dict) else "default"
117
-
118
- # Load preprocessor for ONNX export
119
- preprocessor = get_preprocessor(model_id)
120
-
121
- # This is a key step where we need to find the correct OnnxConfig
122
- # Optimum has utilities, but for a general case, we try our best
123
- from optimum.exporters.onnx import main_export
124
  main_export(model_id, output=onnx_path, task="auto", trust_remote_code=True)
125
-
126
  log_stream += f"Successfully exported base model to ONNX at: {onnx_path}\n"
127
 
128
- # Quantize the ONNX model
129
  progress(0.7, desc="Applying Dynamic Quantization")
130
  quantizer = ORTQuantizer.from_pretrained(onnx_path)
131
- dqconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=False) # Dynamic quantization
132
 
133
  quantized_path = os.path.join(onnx_path, "quantized")
134
  quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
@@ -155,10 +133,9 @@ def stage_5_evaluate_and_package(
155
  log_stream = "[STAGE 5] Evaluating and Packaging...\n"
156
  progress(0.9, desc="Evaluating performance")
157
 
158
- # Simple evaluation: Load the model and measure latency
159
  try:
160
  ort_model = ORTModelForCausalLM.from_pretrained(optimized_model_path)
161
- tokenizer = AutoTokenizer.from_pretrained(model_id)
162
 
163
  prompt = "My name is Philipp and I"
164
  inputs = tokenizer(prompt, return_tensors="pt")
@@ -167,7 +144,7 @@ def stage_5_evaluate_and_package(
167
  gen_tokens = ort_model.generate(**inputs, max_new_tokens=20)
168
  end_time = time.time()
169
 
170
- latency = (end_time - start_time) * 1000 # in ms
171
  num_tokens = len(gen_tokens[0])
172
  ms_per_token = latency / num_tokens
173
 
@@ -178,60 +155,136 @@ def stage_5_evaluate_and_package(
178
  eval_report = f"- **Evaluation Failed:** Could not load and test the ONNX model. This often happens if the base model is not a text-generation model. Error: {e}\n"
179
  log_stream += f"Warning: Evaluation failed. {e}\n"
180
 
181
- # Package and upload
182
  progress(0.95, desc="Uploading to Hugging Face Hub")
183
 
184
  if not HF_TOKEN:
185
  return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
186
 
187
  try:
188
- # Create a new repo
189
  repo_name = f"{model_id.split('/')[-1]}-amop-cpu"
190
- repo_url = api.create_repo(
191
- repo_id=repo_name,
192
- exist_ok=True,
193
- token=HF_TOKEN
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  )
 
 
 
 
 
195
 
196
- # Generate the Model Card (README.md)
197
- model_card_content = f"""
198
- ---
199
- license: mit
200
- tags:
201
- - amop-optimized
202
- - onnx
203
- ---
 
 
 
 
 
 
 
 
 
 
204
 
205
- # AMOP-Optimized CPU Model: {repo_name}
206
 
207
- This model was automatically optimized for CPU inference using the **Adaptive Model Optimization Pipeline (AMOP)**.
 
 
 
 
 
208
 
209
- - **Base Model:** [{model_id}](https://huggingface.co/{model_id})
210
- - **Optimization Date:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
 
211
 
212
- ## Optimization Details
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
- The following AMOP stages were applied:
215
- - **Stage 2: Pruning:** {"Enabled" if options['prune'] else "Disabled"} (Percentage: {options['prune_percent']}%)
216
- - **Stage 3 & 4: Quantization & ONNX Conversion:** Enabled (Dynamic Quantization)
 
217
 
218
- ## Performance Metrics
219
 
220
- {eval_report}
221
 
222
- ## How to Use
 
 
 
 
 
 
 
223
 
224
- This model is in ONNX format and can be run with `optimum-onnxruntime`.
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
- ```python
227
- from optimum.onnxruntime import ORTModelForCausalLM
228
- from transformers import AutoTokenizer
 
 
 
229
 
230
- model_id = "{repo_url.repo_id}"
231
- model = ORTModelForCausalLM.from_pretrained(model_id)
232
- tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 
 
 
 
 
 
 
233
 
234
- prompt = "The future of AI is"
235
- inputs = tokenizer(prompt, return_tensors="pt")
236
- gen_tokens = model.generate(**inputs)
237
- print(tokenizer.batch_decode(gen_tokens))
 
4
  import logging
5
  from datetime import datetime
6
  from huggingface_hub import HfApi, HfFolder
7
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
8
  from optimum.onnxruntime import ORTQuantizer, ORTModelForCausalLM
9
  from optimum.onnxruntime.configuration import AutoQuantizationConfig
10
+ from optimum.exporters.onnx import main_export
 
 
11
  import torch.nn.utils.prune as prune
 
12
  import time
13
 
14
  # --- 1. SETUP AND CONFIGURATION ---
 
20
  HF_TOKEN = os.getenv("HF_TOKEN")
21
  if not HF_TOKEN:
22
  logging.warning("HF_TOKEN environment variable not set. Packaging and uploading will fail.")
 
 
23
 
24
  api = HfApi()
25
  OUTPUT_DIR = "optimized_models"
 
46
  - **Estimated Parameters:** ~{num_params:.2f}M
47
  """
48
 
 
49
  recommendation = ""
50
  if 'llama' in model_type or 'gpt' in model_type or 'mistral' in model_type:
51
  recommendation = "**Recommendation:** This is a large language model (LLM). For best CPU performance, a GGUF-based quantization strategy is typically state-of-the-art. This initial version of AMOP focuses on the ONNX pipeline. The recommended path is **Quantization -> ONNX Conversion**."
 
88
  return model, log_stream
89
 
90
 
91
+ def stage_3_and_4_quantize_and_onnx(model_id: str, progress):
92
  """
93
  Performs Stage 3 (Quantization) and Stage 4 (ONNX Conversion).
94
  This version uses post-training dynamic quantization.
 
97
  progress(0.5, desc="Exporting to ONNX")
98
 
99
  try:
 
100
  run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
101
  onnx_path = os.path.join(OUTPUT_DIR, f"{model_id.replace('/', '_')}-{run_id}-onnx")
102
  os.makedirs(onnx_path, exist_ok=True)
 
103
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  main_export(model_id, output=onnx_path, task="auto", trust_remote_code=True)
 
105
  log_stream += f"Successfully exported base model to ONNX at: {onnx_path}\n"
106
 
 
107
  progress(0.7, desc="Applying Dynamic Quantization")
108
  quantizer = ORTQuantizer.from_pretrained(onnx_path)
109
+ dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False) # Dynamic quantization for CPUs
110
 
111
  quantized_path = os.path.join(onnx_path, "quantized")
112
  quantizer.quantize(save_dir=quantized_path, quantization_config=dqconfig)
 
133
  log_stream = "[STAGE 5] Evaluating and Packaging...\n"
134
  progress(0.9, desc="Evaluating performance")
135
 
 
136
  try:
137
  ort_model = ORTModelForCausalLM.from_pretrained(optimized_model_path)
138
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
139
 
140
  prompt = "My name is Philipp and I"
141
  inputs = tokenizer(prompt, return_tensors="pt")
 
144
  gen_tokens = ort_model.generate(**inputs, max_new_tokens=20)
145
  end_time = time.time()
146
 
147
+ latency = (end_time - start_time) * 1000
148
  num_tokens = len(gen_tokens[0])
149
  ms_per_token = latency / num_tokens
150
 
 
155
  eval_report = f"- **Evaluation Failed:** Could not load and test the ONNX model. This often happens if the base model is not a text-generation model. Error: {e}\n"
156
  log_stream += f"Warning: Evaluation failed. {e}\n"
157
 
 
158
  progress(0.95, desc="Uploading to Hugging Face Hub")
159
 
160
  if not HF_TOKEN:
161
  return "Skipping upload: HF_TOKEN not found.", log_stream + "Skipping upload: HF_TOKEN not found."
162
 
163
  try:
 
164
  repo_name = f"{model_id.split('/')[-1]}-amop-cpu"
165
+ repo_url = api.create_repo(repo_id=repo_name, exist_ok=True, token=HF_TOKEN)
166
+
167
+ # --- THIS IS THE UPDATED SECTION ---
168
+ # Read the template file
169
+ with open("model_card_template.md", "r", encoding="utf-8") as f:
170
+ template_content = f.read()
171
+
172
+ # Fill in the placeholders
173
+ model_card_content = template_content.format(
174
+ repo_name=repo_name,
175
+ model_id=model_id,
176
+ optimization_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
177
+ eval_report=eval_report,
178
+ pruning_status="Enabled" if options['prune'] else "Disabled",
179
+ pruning_percent=options['prune_percent'],
180
+ repo_id=repo_url.repo_id,
181
+ pipeline_log=pipeline_log
182
  )
183
+ # --- END OF UPDATED SECTION ---
184
+
185
+ readme_path = os.path.join(optimized_model_path, "README.md")
186
+ with open(readme_path, "w", encoding="utf-8") as f:
187
+ f.write(model_card_content)
188
 
189
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
190
+ tokenizer.save_pretrained(optimized_model_path)
191
+
192
+ api.upload_folder(
193
+ folder_path=optimized_model_path,
194
+ repo_id=repo_url.repo_id,
195
+ repo_type="model",
196
+ token=HF_TOKEN
197
+ )
198
+
199
+ final_message = f"✅ Success! Your optimized model is available at: {repo_url}"
200
+ log_stream += "Upload complete.\n"
201
+ return final_message, log_stream
202
+ except Exception as e:
203
+ error_msg = f"Failed to upload to the Hub. Error: {e}"
204
+ logging.error(error_msg, exc_info=True)
205
+ return f"❌ Error: {error_msg}", log_stream + error_msg
206
+
207
 
208
+ # --- 3. MAIN WORKFLOW FUNCTION ---
209
 
210
def run_amop_pipeline(model_id: str, do_prune: bool, prune_percent: float, progress=gr.Progress(track_tqdm=True)):
    """Run the full AMOP pipeline: load -> (optional) prune -> quantize/ONNX -> evaluate & package.

    Args:
        model_id: Hugging Face Hub model ID (e.g. "gpt2").
        do_prune: Whether to apply Stage 2 pruning to the loaded model.
        prune_percent: Percentage of weights to prune when ``do_prune`` is True.
        progress: Gradio progress tracker (injected by Gradio per call).

    Returns:
        A ``(status_message, full_log)`` tuple rendered by the Gradio UI.
    """
    if not model_id:
        return "Please enter a Model ID.", ""

    full_log = "[START] AMOP Pipeline Initiated.\n"
    progress(0, desc="Loading Base Model")

    try:
        # trust_remote_code lets models with custom architectures load.
        model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
        full_log += f"Successfully loaded base model '{model_id}'.\n"

        if do_prune:
            model, log = stage_2_prune_model(model, prune_percent, progress)
            full_log += log
        else:
            full_log += "[STAGE 2] Pruning skipped by user.\n"

        # We re-export the pruned model, so it needs to be saved and reloaded by optimum.
        # For simplicity in V1, we export the original model from the Hub;
        # a future version could handle the pruned model state_dict.
        optimized_path, log = stage_3_and_4_quantize_and_onnx(model_id, progress)
        full_log += log

        options = {'prune': do_prune, 'prune_percent': prune_percent}
        final_status, log = stage_5_evaluate_and_package(model_id, optimized_path, full_log, options, progress)
        full_log += log

        return final_status, full_log

    except Exception as e:
        # Top-level boundary: log with traceback, surface a friendly message to the UI.
        logging.error(f"AMOP Pipeline failed. Error: {e}", exc_info=True)
        full_log += f"\n[ERROR] Pipeline failed: {e}"
        # Fix: this literal has no placeholders, so the stray f-prefix was removed.
        return "❌ An error occurred during the pipeline. Check the logs for details.", full_log
243
 
 
244
 
245
# --- 4. GRADIO USER INTERFACE ---

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# AMOP: Adaptive Model Optimization Pipeline")
    gr.Markdown(
        "**Turn any Hugging Face Hub model into a CPU-optimized version.** Enter a model ID, choose your optimizations, "
        "and get a new, smaller, and faster model repository ready for deployment."
    )
    # Warn at build time when uploads will be skipped (HF_TOKEN is read in the setup section).
    if not HF_TOKEN:
        gr.Warning("You have not set your HF_TOKEN in the Space secrets! The final 'upload' step will be skipped. Please add a secret with the key `HF_TOKEN` and your Hugging Face write token as the value.")

    with gr.Row():
        with gr.Column(scale=1):
            # Left column: model selection and optimization controls.
            model_id_input = gr.Textbox(label="Hugging Face Model ID", placeholder="e.g., gpt2, bert-base-uncased")
            analyze_button = gr.Button("1. Analyze Model")

            # Hidden until Stage 1 analysis reveals it via the click handler below.
            with gr.Group(visible=False) as optimization_options:
                gr.Markdown("### 2. Configure Optimization")
                analysis_report_output = gr.Markdown()

                prune_checkbox = gr.Checkbox(label="Enable Pruning (Stage 2)", value=False, info="Note: Pruning is applied conceptually; ONNX export uses the original model for wider compatibility in this version.")
                prune_slider = gr.Slider(minimum=0, maximum=90, value=20, step=5, label="Pruning Percentage (%)")

                # Quantization/ONNX is mandatory in this version, hence non-interactive.
                gr.Checkbox(label="Enable Quantization & ONNX (Stages 3 & 4)", value=True, interactive=False)

                # NOTE(review): reconstructed indentation — confirm the run button
                # nests inside the options group rather than directly in the column.
                run_button = gr.Button("3. Run Optimization Pipeline", variant="primary")

        with gr.Column(scale=2):
            # Right column: final status plus the live pipeline log stream.
            gr.Markdown("### Pipeline Status & Logs")
            final_output = gr.Markdown(label="Final Result")
            log_output = gr.Textbox(label="Live Logs", lines=20, interactive=False)

    # Stage 1 click: analysis report also toggles visibility of the options group.
    analyze_button.click(
        fn=stage_1_analyze_model,
        inputs=[model_id_input],
        outputs=[log_output, analysis_report_output, optimization_options]
    )

    # Full pipeline click: status goes to the result panel, logs to the textbox.
    run_button.click(
        fn=run_amop_pipeline,
        inputs=[model_id_input, prune_checkbox, prune_slider],
        outputs=[final_output, log_output]
    )

if __name__ == "__main__":
    demo.launch(debug=True)