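"""Gradio front-end for the LLM Benchmark Suite.

Select an Ollama model and a judge model, run the benchmark suite with a
rich progress bar, and view the scores as JSON plus a Plotly bar chart.
Results are also saved as timestamped JSON files under benchmark_results/.
"""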
import gradio as gr
import ollama
import pandas as pd
import plotly.express as px
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeRemainingColumn
from datetime import datetime
import json
import time
import os

from benchmarks.benchmark_suite import BenchmarkSuite

console = Console()
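
# NOTE: the payload of ollama.list() differs across client versions: each
# model entry may carry a 'name' key or a 'model' key, so both are handled below.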
def get_available_models():
    try:
        models = ollama.list()
        if 'models' in models and models['models']:
            # Extract model names, handling cases where the 'name' key might not exist
            model_names = []
            for model in models['models']:
                if 'name' in model:
                    model_names.append(model['name'])
                else:
                    # Fall back to the 'model' key if 'name' doesn't exist
                    model_names.append(model.get('model', 'unknown_model'))
            return model_names
        else:
            console.print("[yellow]No models found in Ollama[/yellow]")
            return ["codellama", "llama2", "mistral"]  # Fallback default models
    except Exception as e:
        console.print(f"[red]Error fetching models: {e}[/red]")
        return ["codellama", "llama2", "mistral"]  # Fallback default models
class BenchmarkApp:
    def __init__(self):
        self.available_models = get_available_models()

    def create_interface(self):
        with gr.Blocks(theme=gr.themes.Soft()) as interface:
            gr.Markdown("""
            # 🚀 LLM Benchmark Suite
            Compare different LLM models using various benchmarking metrics
            """)

            with gr.Row():
                with gr.Column():
                    model_name = gr.Dropdown(
                        choices=self.available_models,
                        label="Select Model to Benchmark",
                        value=self.available_models[0] if self.available_models else None,
                    )
                    judge_model = gr.Dropdown(
                        choices=self.available_models,
                        label="Select Judge Model",
                        value=self.available_models[0] if self.available_models else None,
                    )
                    num_iterations = gr.Slider(
                        minimum=1,
                        maximum=20,
                        value=5,
                        step=1,
                        label="Number of Test Iterations",
                    )
                    run_button = gr.Button("🎯 Run Benchmark", variant="primary")
                with gr.Column():
                    progress_output = gr.Textbox(
                        label="Benchmark Progress",
                        lines=10,
                        max_lines=10,
                    )

            with gr.Row():
                chat_output = gr.Chatbot(
                    label="Q&A Chat During Benchmark",
                    height=300,
                )

            with gr.Row():
                with gr.Column():
                    results_json = gr.JSON(label="Detailed Results")
                with gr.Column():
                    plot_output = gr.Plot(label="Performance Visualization")

            run_button.click(
                fn=self.run_benchmark,
                inputs=[model_name, judge_model, num_iterations],
                outputs=[progress_output, chat_output, results_json, plot_output],
            )

        return interface
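
    # Assumed interface of the local BenchmarkSuite (not shown in this file):
    # it is constructed with (model_name, judge_model), and run_all_tests(n)
    # yields (test_name, result_dict) pairs, one per test category, with each
    # result_dict carrying a 0-10 'score'.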
    def run_benchmark(self, model_name, judge_model, num_iterations):
        if not model_name or not judge_model:
            return "Please select both a model and a judge model.", None, None, None

        console.print(f"[bold blue]Starting benchmark for {model_name} with {num_iterations} iterations[/bold blue]")
        console.print(f"[bold blue]Judge model: {judge_model}[/bold blue]")

        try:
            benchmark_suite = BenchmarkSuite(model_name, judge_model)

            # Run benchmarks with a rich progress bar
            results = {}
            start_time = time.time()

            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TaskProgressColumn(),
                TimeRemainingColumn(),
                console=console,
            ) as progress:
                # Create a single task for overall progress (5 test categories)
                overall_task = progress.add_task("[cyan]Running benchmarks...", total=5)

                for test_name, result in benchmark_suite.run_all_tests(num_iterations):
                    results[test_name] = result
                    progress.update(overall_task, advance=1, description=f"[green]Completed {test_name}[/green]")
                    console.print(f"[green]✓ {test_name}: Score {result.get('score', 0):.2f}[/green]")

            total_time = time.time() - start_time

            # Calculate final scores
            final_results = {
                "model_name": model_name,
                "judge_model": judge_model,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "total_time": f"{total_time:.2f}s",
                "results": results,
                "overall_score": sum(r.get("score", 0) for r in results.values()) / len(results) if results else 0,
            }

            # Print summary
            console.print("\n[bold green]Benchmark Results Summary:[/bold green]")
            for test_name, result in results.items():
                console.print(f"  {test_name}: {result.get('score', 0):.2f}/10")
            console.print(f"[bold blue]Overall Score: {final_results['overall_score']:.2f}/10[/bold blue]")
            console.print(f"[bold blue]Total Time: {total_time:.2f} seconds[/bold blue]")

            # Create visualization
            df = pd.DataFrame([
                {"Metric": k, "Score": v.get("score", 0)}
                for k, v in results.items()
            ])
            fig = px.bar(
                df,
                x="Metric",
                y="Score",
                title=f"Benchmark Results: {model_name}",
                color="Score",
                color_continuous_scale="viridis",
            )

            # Save results to a timestamped JSON file
            os.makedirs("benchmark_results", exist_ok=True)
            result_file = f"benchmark_results/{model_name}_vs_{judge_model}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            with open(result_file, "w") as f:
                json.dump(final_results, f, indent=2)

            progress_text = f"✨ Benchmark completed! Results saved to {result_file}"
            console.print(f"[green]{progress_text}[/green]")
            # The chatbot slot is returned as None; this version never populates it.
            return progress_text, None, final_results, fig
        except Exception as e:
            error_msg = f"Error during benchmark: {str(e)}"
            console.print(f"[red]{error_msg}[/red]")
            console.print_exception()
            return error_msg, None, None, None
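
# Launch with share=True to request a temporary public Gradio link in
# addition to the local server.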
if __name__ == "__main__":
    app = BenchmarkApp()
    interface = app.create_interface()
    interface.launch(share=True)