AIguysingstoo commited on
Commit
e9bb6c3
·
verified ·
1 Parent(s): a724749

Upload 9 files

Browse files
README.md CHANGED
@@ -1,12 +1,72 @@
1
  ---
2
  title: Optimization Engineer
3
- emoji: 🐠
4
- colorFrom: green
5
- colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.33.1
8
- app_file: app.py
9
  pinned: false
 
 
 
 
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Optimization Engineer
3
+ emoji:
4
+ colorFrom: gray
5
+ colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.33.1
8
+ app_file: main.py
9
  pinned: false
10
+ license: apache-2.0
11
+ short_description: A modular, simplified model-optimizing agent!
12
+ tags:
13
+ - mcp-server-track
14
+ - optimization
15
+ - mcp-server
16
+ - gradio
17
  ---
18
 
19
+ # Optimization Engineer 🚀
20
+
21
+ An intelligent optimization engineer that serves as both a Gradio web application and an MCP (Model Context Protocol) server for advanced optimization tasks.
22
+
23
+ ## 🎯 MCP Server Track Submission
24
+
25
+ This space is submitted for the MCP Server Track. It functions as:
26
+ 1. **Gradio App**: Interactive web interface for optimization tasks
27
+ 2. **MCP Server**: Can be connected to MCP clients like Claude Desktop, Cursor, etc.
28
+
29
+ ## 🎥 Demo Video
30
+
31
+ [Link to demo video showing MCP server in action - TO BE ADDED]
32
+
33
+ ## ✨ Features
34
+
35
+ - Interactive optimization interface
36
+ - MCP server capabilities for external tool integration
37
+ - Advanced optimization algorithms and techniques
38
+ - Real-time performance monitoring and benchmarking
39
+
40
+ ## 🚀 Usage
41
+
42
+ ### As a Gradio App
43
+ Simply use the interface above to interact with the optimization tools.
44
+
45
+ ### As an MCP Server
46
+ (Local only) Copy the generated MCP server URL into your MCP host's configuration to access the tools through clients such as Claude Desktop.
47
+
48
+ ## 🛠️ Development
49
+
50
+ ```bash
51
+ # Clone the repository
52
+ git clone https://huggingface.co/spaces/AIguysingstoo/optimization-engineer
53
+
54
+ # Install dependencies
55
+ pip install -r requirements.txt
56
+
57
+ # Run locally
58
+ python main.py gradio
59
+ ```
60
+
61
+ ## 📋 Requirements
62
+
63
+ - Python 3.10+
64
+ - Dependencies listed in requirements.txt
65
+
66
+ ## 🤝 Contributing
67
+
68
+ Feel free to submit issues and enhancement requests!
69
+
70
+ ## 📄 License
71
+
72
+ Apache 2.0 License
agent/benchmarker.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import time
3
+ import json
4
+ import os
5
+ import numpy as np
6
+ from typing import Dict, List, Any
7
+ from dataclasses import asdict
8
+
9
+ from models.quantization import ModelLoader, QuantizationType
10
+ from core.benchmark import BenchmarkConfig, BenchmarkResult, InferenceRunner, PerplexityCalculator
11
+ from core.data import DatasetLoader
12
+ from core.utils import get_device
13
+
14
class ModelBenchmarker:
    """Main benchmarking agent.

    Loads a (optionally quantized) model once, then runs timed inference over
    dataset prompts, collecting throughput, latency, memory and (optionally)
    perplexity metrics.
    """

    def __init__(self):
        self.model = None      # loaded HF model (set by load_model)
        self.tokenizer = None  # tokenizer matching self.model
        self.device = None     # resolved device string, e.g. "cuda"/"mps"/"cpu"

    def load_model(self, config: BenchmarkConfig):
        """Load model based on configuration.

        For quantized loads, tries the Transformers QuantoConfig integration
        first and falls back to the direct optimum-quanto API on failure.
        """
        self.device = get_device(config.device)

        quant_type = QuantizationType(config.quantization_type)

        if quant_type == QuantizationType.NONE:
            self.model, self.tokenizer = ModelLoader.load_standard(config.model_name, self.device)
        else:
            # Try Transformers integration first, fallback to direct API
            try:
                self.model, self.tokenizer = ModelLoader.load_quantized_transformers(config.model_name, quant_type)
                # device_map="auto" may have placed the model anywhere; record
                # the actual device so downstream tensors land correctly.
                self.device = str(next(self.model.parameters()).device)
            except Exception as e:
                print(f"Transformers integration failed, using direct API: {e}")
                self.model, self.tokenizer = ModelLoader.load_quantized_direct(config.model_name, quant_type, self.device)

        # Apply torch.compile if requested
        if config.use_torch_compile:
            print("Applying torch.compile...")
            self.model = torch.compile(self.model)

    def run_benchmark(self, config: BenchmarkConfig) -> Dict[str, Any]:
        """Run benchmark with given configuration.

        Returns:
            dict with "summary" (aggregate metrics) and "samples"
            (per-prompt BenchmarkResult rows as plain dicts).
        """
        if self.model is None:
            self.load_model(config)

        # Get sample prompts
        prompts, indices = DatasetLoader.get_sample_prompts(config.dataset_name, config.num_samples, config.seed)

        # Setup inference runner
        inference_runner = InferenceRunner(self.model, self.tokenizer, self.device)

        # Setup perplexity calculator if needed
        perplexity_calc = None
        if config.calculate_perplexity:
            perplexity_calc = PerplexityCalculator(self.model, self.tokenizer, self.device)

        results = []

        for i, prompt in enumerate(prompts):
            print(f"Processing prompt {i+1}/{len(prompts)}")

            # Run inference
            inference_result = inference_runner.run_single_inference(prompt, config.max_new_tokens)

            # Calculate perplexity if requested
            perplexity = None
            if perplexity_calc:
                perplexity = perplexity_calc.calculate(inference_result["generated_text"])

            # Create result
            result = BenchmarkResult(
                prompt_id=i,
                prompt=prompt,
                generated_text=inference_result["generated_text"],
                input_tokens=inference_result["input_tokens"],
                output_tokens=inference_result["output_tokens"],
                total_time_seconds=inference_result["total_time_seconds"],
                tokens_per_second=inference_result["tokens_per_second"],
                first_token_latency_seconds=inference_result["first_token_latency_seconds"],
                peak_memory_mb=inference_result["peak_memory_mb"],
                perplexity=perplexity
            )

            results.append(result)

        # Calculate summary
        summary = self._create_summary(config, results)

        return {
            "summary": summary,
            "samples": [asdict(result) for result in results]
        }

    def _create_summary(self, config: BenchmarkConfig, results: List[BenchmarkResult]) -> Dict[str, Any]:
        """Create benchmark summary aggregated over all sample results.

        Raises:
            ValueError: if results is empty — the averages below would
            otherwise fail with an opaque ZeroDivisionError and max()
            would raise ValueError on an empty sequence.
        """
        if not results:
            raise ValueError("Cannot summarize an empty benchmark run (no results)")

        avg_tokens_per_second = sum(r.tokens_per_second for r in results) / len(results)
        avg_first_token_latency = sum(r.first_token_latency_seconds for r in results) / len(results)
        max_memory_mb = max(r.peak_memory_mb for r in results)

        # Average perplexity over valid (finite, non-None) values only;
        # PerplexityCalculator may return None or inf for degenerate texts.
        avg_perplexity = None
        if config.calculate_perplexity:
            valid_perplexities = [r.perplexity for r in results if r.perplexity is not None and not np.isinf(r.perplexity)]
            if valid_perplexities:
                avg_perplexity = sum(valid_perplexities) / len(valid_perplexities)

        optimization_desc = config.quantization_type
        if config.use_torch_compile:
            optimization_desc += " + torch.compile"

        return {
            "model_name": f"{config.model_name} ({optimization_desc})",
            "device": self.device,
            "num_samples": len(results),
            "avg_tokens_per_second": avg_tokens_per_second,
            "avg_first_token_latency_seconds": avg_first_token_latency,
            "max_memory_mb": max_memory_mb,
            "avg_perplexity": avg_perplexity,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "optimization_type": optimization_desc
        }

    def save_results(self, results: Dict[str, Any], output_dir: str = "benchmark_results") -> str:
        """Save benchmark results as a timestamped JSON file.

        Returns:
            the path of the written file.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Derive a filename from the model id (drop the org prefix, no spaces).
        model_name = results["summary"]["model_name"].split('/')[-1].replace(' ', '_')
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        output_file = os.path.join(output_dir, f"{model_name}_{timestamp}.json")

        with open(output_file, 'w') as f:
            json.dump(results, f, indent=2)

        return output_file
core/benchmark.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import time
3
+ import gc
4
+ import psutil
5
+ from dataclasses import dataclass
6
+ from typing import Dict, List, Optional, Any
7
+ from torch.nn import CrossEntropyLoss
8
+
9
@dataclass
class BenchmarkConfig:
    """Configuration for benchmarking.

    Consumed by ModelBenchmarker.load_model / run_benchmark.
    """
    model_name: str                         # HF hub model id to benchmark
    dataset_name: str = "tatsu-lab/alpaca"  # HF dataset to draw prompts from
    num_samples: int = 20                   # number of prompts to run
    max_new_tokens: int = 100               # generation budget per prompt
    quantization_type: str = "none"         # must match a QuantizationType value
    use_torch_compile: bool = False         # wrap the model in torch.compile
    calculate_perplexity: bool = False      # also score generated text
    device: Optional[str] = None            # None -> auto-detect via get_device
    seed: int = 42                          # RNG seed for prompt sampling
21
+
22
@dataclass
class BenchmarkResult:
    """Single benchmark result: metrics for one prompt's inference."""
    prompt_id: int                      # index within the sampled prompt list
    prompt: str                         # input prompt text
    generated_text: str                 # decoded output (new tokens only)
    input_tokens: int                   # prompt length in tokens
    output_tokens: int                  # number of generated tokens
    total_time_seconds: float           # wall time of the full generation
    tokens_per_second: float            # output_tokens / total_time_seconds
    first_token_latency_seconds: float  # timed via a separate 1-token generation
    peak_memory_mb: float               # CUDA peak alloc, or RSS delta off-GPU
    perplexity: Optional[float] = None  # only set when requested in the config
35
+
36
class MemoryTracker:
    """Handles memory tracking across different devices (CUDA, MPS, CPU)."""

    def __init__(self, device: str):
        self.device = device  # device string, e.g. "cuda", "mps", "cpu"

    def reset_stats(self):
        """Reset memory tracking (no-op on non-CUDA devices)."""
        if self.device == "cuda" and torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()

    def get_peak_memory_mb(self) -> float:
        """Get peak memory usage in MB.

        On CUDA this is the allocator's peak; elsewhere it falls back to the
        process RSS, which is an absolute figure, not a per-run delta.
        """
        if self.device == "cuda" and torch.cuda.is_available():
            return torch.cuda.max_memory_allocated() / (1024 * 1024)
        else:
            return psutil.Process().memory_info().rss / (1024 * 1024)

    def synchronize(self):
        """Synchronize device operations so wall-clock timings are accurate."""
        if self.device == "cuda" and torch.cuda.is_available():
            torch.cuda.synchronize()
        elif self.device == "mps":
            # Bug fix: `hasattr(torch.mps, ...)` itself raises AttributeError
            # on PyTorch builds that lack the `torch.mps` module; guard the
            # module lookup, not just the function lookup.
            mps_module = getattr(torch, "mps", None)
            if mps_module is not None and hasattr(mps_module, "synchronize"):
                mps_module.synchronize()

    def clear_cache(self):
        """Clear memory cache (Python GC, plus CUDA cache when applicable)."""
        gc.collect()
        if self.device == "cuda" and torch.cuda.is_available():
            torch.cuda.empty_cache()
67
+
68
class PerplexityCalculator:
    """Handles perplexity calculation for generated text."""

    def __init__(self, model, tokenizer, device: str):
        self.model = model          # causal LM used for scoring
        self.tokenizer = tokenizer  # tokenizer matching the model
        self.device = device        # device string tensors are moved to

    def calculate(self, text: str) -> Optional[float]:
        """Calculate perplexity of text.

        Returns:
            exp(mean cross-entropy loss) of the text under the model;
            float('inf') for texts of <= 1 token (perplexity undefined);
            None if the calculation fails for any reason.
        """
        try:
            encodings = self.tokenizer(text, return_tensors="pt").to(self.device)
            input_ids = encodings.input_ids

            if input_ids.size(1) <= 1:
                return float('inf')

            with torch.no_grad():
                # Passing labels makes the model compute the LM loss itself.
                outputs = self.model(input_ids=input_ids, labels=input_ids.clone())

            if hasattr(outputs, 'loss') and outputs.loss is not None:
                return torch.exp(outputs.loss).item()

            # Fallback manual calculation: shift logits/labels by one position
            # so each token is predicted from its prefix.
            logits = outputs.logits[:, :-1, :].contiguous()
            labels = input_ids[:, 1:].contiguous()

            loss_fn = CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
            return torch.exp(loss).item()

        except Exception as e:
            # Best-effort: a single failed score must not abort the benchmark.
            print(f"Perplexity calculation failed: {e}")
            return None
102
+
103
class InferenceRunner:
    """Handles model inference with timing and memory tracking."""

    def __init__(self, model, tokenizer, device: str):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.memory_tracker = MemoryTracker(device)

    def run_single_inference(self, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
        """Run inference on a single prompt.

        Runs generate() twice: first a 1-token generation to measure
        first-token latency, then the full generation timed for throughput.
        Returns a dict with token counts, timings, peak memory and the
        decoded text.
        """
        # Tokenize input
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
        input_token_count = input_ids.shape[1]

        # Reset memory tracking; the initial reading is used below to report
        # a delta on non-CUDA devices, where the tracker returns absolute RSS.
        self.memory_tracker.reset_stats()
        initial_memory = self.memory_tracker.get_peak_memory_mb()

        # Generation parameters: greedy decoding; pad with EOS so models
        # without a dedicated pad token still generate cleanly.
        gen_params = {
            "max_new_tokens": max_new_tokens,
            "do_sample": False,
            "pad_token_id": self.tokenizer.eos_token_id
        }

        # Time first token — a separate 1-token generation; max_new_tokens is
        # filtered out of gen_params to avoid passing the kwarg twice.
        self.memory_tracker.synchronize()
        first_token_start = time.time()

        with torch.no_grad():
            first_output = self.model.generate(input_ids, max_new_tokens=1, **{k: v for k, v in gen_params.items() if k != 'max_new_tokens'})

        self.memory_tracker.synchronize()
        first_token_latency = time.time() - first_token_start

        # Full generation (timed independently of the first-token probe)
        start_time = time.time()

        with torch.no_grad():
            outputs = self.model.generate(input_ids, **gen_params)

        self.memory_tracker.synchronize()
        total_time = time.time() - start_time

        # Calculate metrics — only the newly generated tokens count
        output_ids = outputs[0][input_token_count:]
        generated_token_count = len(output_ids)
        tokens_per_second = generated_token_count / total_time if total_time > 0 else 0

        # Get memory usage; off-CUDA the tracker reports process RSS, so the
        # pre-run baseline is subtracted to approximate this run's footprint.
        peak_memory_mb = self.memory_tracker.get_peak_memory_mb()
        if self.device != "cuda":
            peak_memory_mb = peak_memory_mb - initial_memory

        # Decode output
        generated_text = self.tokenizer.decode(output_ids, skip_special_tokens=True)

        # Clear memory so consecutive prompts start from a clean cache
        self.memory_tracker.clear_cache()

        return {
            "input_tokens": input_token_count,
            "output_tokens": generated_token_count,
            "total_time_seconds": total_time,
            "tokens_per_second": tokens_per_second,
            "first_token_latency_seconds": first_token_latency,
            "peak_memory_mb": peak_memory_mb,
            "generated_text": generated_text
        }
core/data.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from datasets import load_dataset
3
+ from typing import List, Tuple
4
+
5
class DatasetLoader:
    """Handles dataset loading and sampling."""

    @staticmethod
    def get_sample_prompts(dataset_name: str, num_samples: int, seed: int = 42) -> Tuple[List[str], List[int]]:
        """Get sample prompts from dataset.

        Args:
            dataset_name: HF hub dataset identifier.
            num_samples: number of prompts to draw; clamped to the split size
                because random.sample raises ValueError when asked for more
                items than the population holds.
            seed: RNG seed so the same rows are drawn across runs.

        Returns:
            (prompts, indices): prompt strings and the row indices drawn.

        Raises:
            ValueError: if a sampled row has no string-valued field.
        """
        print(f"Loading dataset: {dataset_name}")

        dataset = load_dataset(dataset_name)
        split_name = 'train' if 'train' in dataset else list(dataset.keys())[0]

        split_size = len(dataset[split_name])
        random.seed(seed)
        # Clamp so small datasets don't crash the benchmark.
        indices = random.sample(range(split_size), min(num_samples, split_size))

        # Handle different dataset formats
        samples = []
        for idx in indices:
            item = dataset[split_name][idx]
            if 'instruction' in item:
                samples.append(item['instruction'])
            elif 'text' in item:
                samples.append(item['text'])
            elif 'prompt' in item:
                samples.append(item['prompt'])
            else:
                # Fallback - use first string-valued field; fail with a clear
                # message instead of a bare StopIteration when none exists.
                text_field = next((k for k, v in item.items() if isinstance(v, str)), None)
                if text_field is None:
                    raise ValueError(f"No text field found in dataset item with keys: {list(item.keys())}")
                samples.append(item[text_field])

        return samples, indices
core/utils.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import psutil
3
+ from typing import Optional
4
+
5
def get_device(device: Optional[str] = None) -> str:
    """Resolve the compute device string.

    A caller-supplied *device* is returned unchanged; otherwise the best
    available backend is auto-detected, preferring CUDA, then MPS, then CPU.
    """
    if device is not None:
        return device
    if torch.cuda.is_available():
        return "cuda"
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"
15
+
16
def get_system_info() -> str:
    """Get formatted system information.

    Builds a Markdown report covering CPU core counts, total RAM,
    CUDA/MPS availability and the installed PyTorch version.
    """
    lines = ["# System Information\n"]

    # CPU / RAM
    lines.append(f"**CPU**: {psutil.cpu_count(logical=False)} physical, {psutil.cpu_count()} logical cores")
    lines.append(f"**Memory**: {psutil.virtual_memory().total / (1024**3):.2f} GB")

    # NVIDIA GPU details, when present
    if torch.cuda.is_available():
        lines.append(f"**CUDA**: {torch.cuda.get_device_name(0)}")
        lines.append(f"**CUDA Memory**: {torch.cuda.get_device_properties(0).total_memory / (1024**3):.2f} GB")
        lines.append(f"**CUDA Version**: {torch.version.cuda}")

    # Apple Silicon
    if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
        lines.append("**Apple Silicon**: MPS Available")

    lines.append(f"**PyTorch**: {torch.__version__}")

    return "\n".join(lines)
interfaces/gradio_app.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ from typing import List, Tuple
6
+
7
+ from agent.benchmarker import ModelBenchmarker
8
+ from core.benchmark import BenchmarkConfig
9
+ from core.utils import get_system_info
10
+
11
class GradioApp:
    """Gradio web interface for model benchmarking.

    Wraps a ModelBenchmarker and keeps an in-memory history of completed
    runs for the lifetime of the process.
    """

    def __init__(self):
        # Shared benchmarker, reused across single-benchmark runs.
        self.benchmarker = ModelBenchmarker()
        # Completed run results, in order; lost when the process restarts.
        self.history = []

    def benchmark_single(
        self,
        model_name: str,
        dataset_name: str,
        num_samples: int,
        max_tokens: int,
        quantization: str,
        torch_compile: bool,
        perplexity: bool,
        device: str
    ) -> Tuple[str, str, str]:
        """Run single model benchmark.

        Returns:
            (markdown summary, HTML samples table, status string). On any
            failure the error text is returned in place of results so the
            UI stays responsive instead of raising.
        """
        try:
            config = BenchmarkConfig(
                model_name=model_name,
                dataset_name=dataset_name,
                num_samples=num_samples,
                max_new_tokens=max_tokens,
                quantization_type=quantization,
                use_torch_compile=torch_compile,
                calculate_perplexity=perplexity,
                device=device if device != "auto" else None  # None -> auto-detect
            )

            results = self.benchmarker.run_benchmark(config)
            self.history.append(results)

            # Format summary
            summary = results["summary"]
            summary_text = f"""## Benchmark Results

**Model**: {summary['model_name']}
**Device**: {summary['device']}
**Optimization**: {summary['optimization_type']}

### Performance Metrics
- **Throughput**: {summary['avg_tokens_per_second']:.2f} tokens/second
- **First Token Latency**: {summary['avg_first_token_latency_seconds']:.4f} seconds
- **Peak Memory**: {summary['max_memory_mb']:.2f} MB
- **Samples**: {summary['num_samples']}
{f"- **Perplexity**: {summary['avg_perplexity']:.4f}" if summary.get('avg_perplexity') else ""}
"""

            # Sample results table (first 10 rows only, keeps the page light)
            samples_df = pd.DataFrame(results['samples'])
            if not samples_df.empty:
                display_cols = ['prompt_id', 'input_tokens', 'output_tokens', 'tokens_per_second', 'first_token_latency_seconds']
                samples_table = samples_df[display_cols].head(10).to_html(index=False)
            else:
                samples_table = "No sample data available"

            return summary_text, samples_table, "✅ Benchmark completed!"

        except Exception as e:
            return f"❌ Error: {str(e)}", "", f"❌ Failed: {str(e)}"

    def compare_optimizations(
        self,
        model_name: str,
        dataset_name: str,
        num_samples: int,
        optimizations: List[str]
    ) -> Tuple[str, go.Figure, str]:
        """Compare different quantization.

        Benchmarks the model once per selected optimization and returns
        (markdown summary, dual-axis plotly figure, status string).
        """
        try:
            results = []

            for opt in optimizations:
                config = BenchmarkConfig(
                    model_name=model_name,
                    dataset_name=dataset_name,
                    num_samples=num_samples,
                    quantization_type=opt,
                    calculate_perplexity=True
                )

                benchmarker = ModelBenchmarker()  # Fresh instance
                result = benchmarker.run_benchmark(config)
                results.append(result["summary"])

            # Create comparison
            df = pd.DataFrame(results)

            # Create plot: bars = throughput (left axis), line = memory (right axis)
            fig = go.Figure()

            fig.add_trace(go.Bar(
                name='Throughput',
                x=df['optimization_type'],
                y=df['avg_tokens_per_second'],
                yaxis='y'
            ))

            fig.add_trace(go.Scatter(
                name='Memory (MB)',
                x=df['optimization_type'],
                y=df['max_memory_mb'],
                yaxis='y2',
                mode='lines+markers',
                line=dict(color='red')
            ))

            fig.update_layout(
                title=f'Optimization Comparison: {model_name}',
                xaxis_title='Optimization',
                yaxis=dict(title='Throughput (tok/s)', side='left'),
                yaxis2=dict(title='Memory (MB)', side='right', overlaying='y')
            )

            # Summary text
            best_throughput = max(results, key=lambda x: x['avg_tokens_per_second'])
            best_memory = min(results, key=lambda x: x['max_memory_mb'])

            summary = f"""## Comparison Results

### Best Configurations
- **Highest Throughput**: {best_throughput['optimization_type']} ({best_throughput['avg_tokens_per_second']:.2f} tok/s)
- **Lowest Memory**: {best_memory['optimization_type']} ({best_memory['max_memory_mb']:.2f} MB)

### Results Table
| Optimization | Throughput | Memory | Perplexity |
|--------------|-----------|---------|-----------|
{chr(10).join([f"| {r['optimization_type']} | {r['avg_tokens_per_second']:.2f} | {r['max_memory_mb']:.2f} | {r.get('avg_perplexity', 'N/A')} |" for r in results])}
"""

            return summary, fig, "✅ Comparison completed!"

        except Exception as e:
            return f"❌ Error: {str(e)}", go.Figure(), f"❌ Failed: {str(e)}"

    def get_history(self) -> str:
        """Get benchmark history as a Markdown document, one section per run."""
        if not self.history:
            return "No benchmarks run yet."

        history_text = "# Benchmark History\n\n"
        for i, result in enumerate(self.history):
            summary = result["summary"]
            history_text += f"""## Run {i+1}
- **Model**: {summary['model_name']}
- **Time**: {summary['timestamp']}
- **Throughput**: {summary['avg_tokens_per_second']:.2f} tok/s
- **Memory**: {summary['max_memory_mb']:.2f} MB

---
"""

        return history_text

    def create_interface(self):
        """Create Gradio interface.

        Builds four tabs: single benchmark, optimization comparison,
        run history, and system info. Returns the gr.Blocks app
        (not yet launched).
        """
        with gr.Blocks(title="Model Benchmark Agent", theme=gr.themes.Soft()) as app:
            gr.Markdown("# 🚀 Model Benchmark Agent")
            gr.Markdown("Benchmark Hugging Face models with optimum-quanto quantization")

            with gr.Tabs():
                # Single Benchmark Tab
                with gr.TabItem("Single Benchmark"):
                    with gr.Row():
                        with gr.Column():
                            model_input = gr.Textbox("facebook/opt-iml-max-1.3b", label="Model Name")
                            dataset_input = gr.Textbox("tatsu-lab/alpaca", label="Dataset")
                            num_samples = gr.Slider(1, 100, 20, step=1, label="Samples")
                            max_tokens = gr.Slider(10, 512, 100, label="Max Tokens")
                            quantization = gr.Dropdown(
                                ["none", "int8", "int4", "int2", "float8"],
                                value="none",
                                label="Quantization"
                            )
                            torch_compile = gr.Checkbox(label="Use torch.compile")
                            perplexity = gr.Checkbox(label="Calculate Perplexity")
                            device = gr.Dropdown(["auto", "cuda", "cpu", "mps"], value="auto", label="Device")

                            benchmark_btn = gr.Button("🚀 Run Benchmark", variant="primary")

                        with gr.Column():
                            results_md = gr.Markdown()
                            samples_html = gr.HTML()
                            status_text = gr.Textbox(label="Status", interactive=False)

                    # Inputs are passed positionally, matching benchmark_single's signature.
                    benchmark_btn.click(
                        self.benchmark_single,
                        inputs=[model_input, dataset_input, num_samples, max_tokens, quantization, torch_compile, perplexity, device],
                        outputs=[results_md, samples_html, status_text]
                    )

                # Comparison Tab
                with gr.TabItem("Compare Optimizations"):
                    with gr.Row():
                        with gr.Column():
                            comp_model = gr.Textbox("facebook/opt-iml-max-1.3b", label="Model")
                            comp_dataset = gr.Textbox("tatsu-lab/alpaca", label="Dataset")
                            comp_samples = gr.Slider(1, 50, 10, step=1, label="Samples")
                            comp_opts = gr.CheckboxGroup(
                                ["none", "int8", "int4", "int2"],
                                value=["none", "int8"],
                                label="Optimizations to Compare"
                            )

                            compare_btn = gr.Button("📊 Compare", variant="primary")

                        with gr.Column():
                            comp_results = gr.Markdown()
                            comp_plot = gr.Plot()
                            comp_status = gr.Textbox(label="Status", interactive=False)

                    compare_btn.click(
                        self.compare_optimizations,
                        inputs=[comp_model, comp_dataset, comp_samples, comp_opts],
                        outputs=[comp_results, comp_plot, comp_status]
                    )

                # History Tab
                with gr.TabItem("History"):
                    history_md = gr.Markdown()
                    refresh_btn = gr.Button("🔄 Refresh")
                    refresh_btn.click(self.get_history, outputs=[history_md])

                # System Info Tab
                with gr.TabItem("System Info"):
                    sys_info_md = gr.Markdown()
                    sys_info_btn = gr.Button("📋 Get System Info")
                    sys_info_btn.click(get_system_info, outputs=[sys_info_md])

        return app
243
+
244
def launch_app():
    """Build the Gradio interface and serve it (also as an MCP server)."""
    interface = GradioApp().create_interface()
    interface.launch(
        share=False,
        server_name="0.0.0.0",  # bind all interfaces (required on Spaces)
        server_port=7860,
        show_error=True,
        mcp_server=True,  # expose the app's tools over the Model Context Protocol
    )
main.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Main entry point for the Model Benchmark Agent."""
import os

# Disable tokenizer parallelism to avoid forking issues.
# Must be set before transformers/tokenizers is imported (below, via
# interfaces.gradio_app), so this stays ahead of that import.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import sys

from interfaces.gradio_app import launch_app

def main():
    """Dispatch to the requested run mode.

    Launches the Gradio UI when running on HuggingFace Spaces (detected via
    the SPACE_ID env var) or when "gradio" is passed on the command line;
    otherwise prints usage. The duplicate shebang and repeated `import os`
    from the original upload have been removed.
    """
    # Check if running on HuggingFace Spaces
    is_huggingface = os.getenv("SPACE_ID") is not None

    # If on HuggingFace or gradio argument passed, launch Gradio
    if is_huggingface or (len(sys.argv) > 1 and sys.argv[1] == "gradio"):
        launch_app()
    else:
        print("Usage: python main.py [gradio]")
        print("Available modes:")
        print("  gradio - Launch Gradio interface")

if __name__ == "__main__":
    main()
models/quantization.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
3
+ from optimum.quanto import quantize, freeze, qint8, qint4, qint2, qfloat8
4
+ from enum import Enum
5
+ from typing import Tuple, Any, Optional
6
+
7
class QuantizationType(Enum):
    """Supported quantization types.

    The string values are what BenchmarkConfig.quantization_type carries,
    so ``QuantizationType(config.quantization_type)`` round-trips.
    """

    NONE = "none"      # no quantization, load the model as-is
    INT8 = "int8"      # 8-bit integer weights
    INT4 = "int4"      # 4-bit integer weights
    INT2 = "int2"      # 2-bit integer weights
    FLOAT8 = "float8"  # 8-bit float weights
14
+
15
class ModelLoader:
    """Handles model loading with different quantization strategies."""

    @staticmethod
    def _load_tokenizer(model_name: str):
        """Load the tokenizer, ensuring a pad token exists (generate() needs one).

        Extracted because this exact sequence was duplicated verbatim in all
        three loader methods.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        return tokenizer

    @staticmethod
    def load_standard(model_name: str, device: str) -> Tuple[Any, Any]:
        """Load model without quantization.

        Uses fp16 on CUDA and fp32 elsewhere; device_map handles placement
        except on CPU, where the model is moved explicitly.
        """
        print(f"Loading {model_name} (standard)")

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map=device if device != "cpu" else None
        )

        if device == "cpu":
            model = model.to(device)

        return model, ModelLoader._load_tokenizer(model_name)

    @staticmethod
    def load_quantized_transformers(model_name: str, quant_type: QuantizationType) -> Tuple[Any, Any]:
        """Load model using Transformers QuantoConfig integration."""
        print(f"Loading {model_name} with {quant_type.value} quantization (Transformers)")

        quant_config = QuantoConfig(weights=quant_type.value)

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype="auto",
            device_map="auto",
            quantization_config=quant_config
        )

        return model, ModelLoader._load_tokenizer(model_name)

    @staticmethod
    def load_quantized_direct(model_name: str, quant_type: QuantizationType, device: str) -> Tuple[Any, Any]:
        """Load model using direct quanto quantization API.

        Note: quant_type must be one of the quantized members; passing
        QuantizationType.NONE would raise KeyError (callers guard this).
        """
        print(f"Loading {model_name} with {quant_type.value} quantization (Direct API)")

        # Load base model
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map=device if device != "cpu" else None
        )

        if device == "cpu":
            model = model.to(device)

        # Apply quantization in place, then freeze the quantized weights
        quant_map = {
            QuantizationType.INT8: qint8,
            QuantizationType.INT4: qint4,
            QuantizationType.INT2: qint2,
            QuantizationType.FLOAT8: qfloat8
        }

        quantize(model, weights=quant_map[quant_type])
        freeze(model)

        return model, ModelLoader._load_tokenizer(model_name)
requirements.txt ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv pip compile pyproject.toml --output-file requirements.txt
3
+ accelerate==1.7.0
4
+ # via model-benchmark-agent (pyproject.toml)
5
+ aiofiles==24.1.0
6
+ # via gradio
7
+ aiohappyeyeballs==2.6.1
8
+ # via aiohttp
9
+ aiohttp==3.12.12
10
+ # via fsspec
11
+ aiosignal==1.3.2
12
+ # via aiohttp
13
+ annotated-types==0.7.0
14
+ # via pydantic
15
+ anyio==4.9.0
16
+ # via
17
+ # gradio
18
+ # httpx
19
+ # mcp
20
+ # sse-starlette
21
+ # starlette
22
+ attrs==25.3.0
23
+ # via aiohttp
24
+ certifi==2025.4.26
25
+ # via
26
+ # httpcore
27
+ # httpx
28
+ # requests
29
+ charset-normalizer==3.4.2
30
+ # via requests
31
+ click==8.2.1
32
+ # via
33
+ # typer
34
+ # uvicorn
35
+ datasets==3.6.0
36
+ # via model-benchmark-agent (pyproject.toml)
37
+ dill==0.3.8
38
+ # via
39
+ # datasets
40
+ # multiprocess
41
+ fastapi==0.115.12
42
+ # via gradio
43
+ ffmpy==0.6.0
44
+ # via gradio
45
+ filelock==3.18.0
46
+ # via
47
+ # datasets
48
+ # huggingface-hub
49
+ # torch
50
+ # transformers
51
+ frozenlist==1.7.0
52
+ # via
53
+ # aiohttp
54
+ # aiosignal
55
+ fsspec==2025.3.0
56
+ # via
57
+ # datasets
58
+ # gradio-client
59
+ # huggingface-hub
60
+ # torch
61
+ gradio==5.33.1
62
+ # via model-benchmark-agent (pyproject.toml)
63
+ gradio-client==1.10.3
64
+ # via gradio
65
+ groovy==0.1.2
66
+ # via gradio
67
+ h11==0.16.0
68
+ # via
69
+ # httpcore
70
+ # uvicorn
71
+ hf-xet==1.1.3
72
+ # via huggingface-hub
73
+ httpcore==1.0.9
74
+ # via httpx
75
+ httpx==0.28.1
76
+ # via
77
+ # gradio
78
+ # gradio-client
79
+ # mcp
80
+ # safehttpx
81
+ httpx-sse==0.4.0
82
+ # via mcp
83
+ huggingface-hub==0.32.5
84
+ # via
85
+ # accelerate
86
+ # datasets
87
+ # gradio
88
+ # gradio-client
89
+ # optimum-quanto
90
+ # tokenizers
91
+ # transformers
92
+ idna==3.10
93
+ # via
94
+ # anyio
95
+ # httpx
96
+ # requests
97
+ # yarl
98
+ jinja2==3.1.6
99
+ # via
100
+ # gradio
101
+ # torch
102
+ markdown-it-py==3.0.0
103
+ # via rich
104
+ markupsafe==3.0.2
105
+ # via
106
+ # gradio
107
+ # jinja2
108
+ mcp==1.9.3
109
+ # via model-benchmark-agent (pyproject.toml)
110
+ mdurl==0.1.2
111
+ # via markdown-it-py
112
+ mpmath==1.3.0
113
+ # via sympy
114
+ multidict==6.4.4
115
+ # via
116
+ # aiohttp
117
+ # yarl
118
+ multiprocess==0.70.16
119
+ # via datasets
120
+ narwhals==1.42.0
121
+ # via plotly
122
+ networkx==3.5
123
+ # via torch
124
+ ninja==1.11.1.4
125
+ # via optimum-quanto
126
+ numpy==2.3.0
127
+ # via
128
+ # model-benchmark-agent (pyproject.toml)
129
+ # accelerate
130
+ # datasets
131
+ # gradio
132
+ # optimum-quanto
133
+ # pandas
134
+ # transformers
135
+ optimum-quanto==0.2.7
136
+ # via model-benchmark-agent (pyproject.toml)
137
+ orjson==3.10.18
138
+ # via gradio
139
+ packaging==25.0
140
+ # via
141
+ # accelerate
142
+ # datasets
143
+ # gradio
144
+ # gradio-client
145
+ # huggingface-hub
146
+ # plotly
147
+ # transformers
148
+ pandas==2.3.0
149
+ # via
150
+ # model-benchmark-agent (pyproject.toml)
151
+ # datasets
152
+ # gradio
153
+ pillow==11.2.1
154
+ # via gradio
155
+ plotly==6.1.2
156
+ # via model-benchmark-agent (pyproject.toml)
157
+ propcache==0.3.2
158
+ # via
159
+ # aiohttp
160
+ # yarl
161
+ psutil==7.0.0
162
+ # via
163
+ # model-benchmark-agent (pyproject.toml)
164
+ # accelerate
165
+ pyarrow==20.0.0
166
+ # via datasets
167
+ pydantic==2.11.5
168
+ # via
169
+ # model-benchmark-agent (pyproject.toml)
170
+ # fastapi
171
+ # gradio
172
+ # mcp
173
+ # pydantic-settings
174
+ pydantic-core==2.33.2
175
+ # via pydantic
176
+ pydantic-settings==2.9.1
177
+ # via mcp
178
+ pydub==0.25.1
179
+ # via gradio
180
+ pygments==2.19.1
181
+ # via rich
182
+ python-dateutil==2.9.0.post0
183
+ # via pandas
184
+ python-dotenv==1.1.0
185
+ # via pydantic-settings
186
+ python-multipart==0.0.20
187
+ # via
188
+ # gradio
189
+ # mcp
190
+ pytz==2025.2
191
+ # via pandas
192
+ pyyaml==6.0.2
193
+ # via
194
+ # accelerate
195
+ # datasets
196
+ # gradio
197
+ # huggingface-hub
198
+ # transformers
199
+ regex==2024.11.6
200
+ # via transformers
201
+ requests==2.32.4
202
+ # via
203
+ # datasets
204
+ # huggingface-hub
205
+ # transformers
206
+ rich==14.0.0
207
+ # via typer
208
+ ruff==0.11.13
209
+ # via gradio
210
+ safehttpx==0.1.6
211
+ # via gradio
212
+ safetensors==0.5.3
213
+ # via
214
+ # accelerate
215
+ # optimum-quanto
216
+ # transformers
217
+ semantic-version==2.10.0
218
+ # via gradio
219
+ setuptools==80.9.0
220
+ # via torch
221
+ shellingham==1.5.4
222
+ # via typer
223
+ six==1.17.0
224
+ # via python-dateutil
225
+ sniffio==1.3.1
226
+ # via anyio
227
+ sse-starlette==2.3.6
228
+ # via mcp
229
+ starlette==0.46.2
230
+ # via
231
+ # fastapi
232
+ # gradio
233
+ # mcp
234
+ sympy==1.14.0
235
+ # via torch
236
+ tokenizers==0.21.1
237
+ # via transformers
238
+ tomlkit==0.13.3
239
+ # via gradio
240
+ torch==2.7.1
241
+ # via
242
+ # model-benchmark-agent (pyproject.toml)
243
+ # accelerate
244
+ # optimum-quanto
245
+ tqdm==4.67.1
246
+ # via
247
+ # datasets
248
+ # huggingface-hub
249
+ # transformers
250
+ transformers==4.52.4
251
+ # via model-benchmark-agent (pyproject.toml)
252
+ typer==0.16.0
253
+ # via gradio
254
+ typing-extensions==4.14.0
255
+ # via
256
+ # anyio
257
+ # fastapi
258
+ # gradio
259
+ # gradio-client
260
+ # huggingface-hub
261
+ # pydantic
262
+ # pydantic-core
263
+ # torch
264
+ # typer
265
+ # typing-inspection
266
+ typing-inspection==0.4.1
267
+ # via
268
+ # pydantic
269
+ # pydantic-settings
270
+ tzdata==2025.2
271
+ # via pandas
272
+ urllib3==2.4.0
273
+ # via requests
274
+ uvicorn==0.34.3
275
+ # via
276
+ # gradio
277
+ # mcp
278
+ websockets==15.0.1
279
+ # via gradio-client
280
+ xxhash==3.5.0
281
+ # via datasets
282
+ yarl==1.20.1
283
+ # via aiohttp