import gradio as gr
import ollama
import pandas as pd
import plotly.express as px
from rich.console import Console
from rich.progress import (
    Progress,
    SpinnerColumn,
    TextColumn,
    BarColumn,
    TaskProgressColumn,
    TimeRemainingColumn,
)
from datetime import datetime
import json
import time
import os

from benchmarks.benchmark_suite import BenchmarkSuite

console = Console()


def get_available_models():
    """Return the names of locally installed Ollama models, with fallbacks."""
    try:
        models = ollama.list()
        if 'models' in models and models['models']:
            # Extract model names, handling cases where the 'name' key might not exist
            model_names = []
            for model in models['models']:
                if 'name' in model:
                    model_names.append(model['name'])
                else:
                    # Fall back to the 'model' key if 'name' doesn't exist
                    model_names.append(model.get('model', 'unknown_model'))
            return model_names
        else:
            console.print("[yellow]No models found in Ollama[/yellow]")
            return ["codellama", "llama2", "mistral"]  # Fallback default models
    except Exception as e:
        console.print(f"[red]Error fetching models: {e}[/red]")
        return ["codellama", "llama2", "mistral"]  # Fallback default models


class BenchmarkApp:
    def __init__(self):
        self.available_models = get_available_models()

    def create_interface(self):
        with gr.Blocks(theme=gr.themes.Soft()) as interface:
            gr.Markdown("""
            # 🚀 LLM Benchmark Suite
            Compare different LLM models using various benchmarking metrics
            """)

            with gr.Row():
                with gr.Column():
                    model_name = gr.Dropdown(
                        choices=self.available_models,
                        label="Select Model to Benchmark",
                        value=self.available_models[0] if self.available_models else None
                    )
                    judge_model = gr.Dropdown(
                        choices=self.available_models,
                        label="Select Judge Model",
                        value=self.available_models[0] if self.available_models else None
                    )
                    num_iterations = gr.Slider(
                        minimum=1,
                        maximum=20,
                        value=5,
                        step=1,
                        label="Number of Test Iterations"
                    )
                    run_button = gr.Button("🎯 Run Benchmark", variant="primary")

                with gr.Column():
                    progress_output = gr.Textbox(
                        label="Benchmark Progress",
                        lines=10,
                        max_lines=10
                    )

            with gr.Row():
                chat_output = gr.Chatbot(
                    label="Q&A Chat During Benchmark",
                    height=300
                )

            with gr.Row():
                with gr.Column():
                    results_json = gr.JSON(label="Detailed Results")
                with gr.Column():
                    plot_output = gr.Plot(label="Performance Visualization")

            run_button.click(
                fn=self.run_benchmark,
                inputs=[model_name, judge_model, num_iterations],
                outputs=[progress_output, chat_output, results_json, plot_output]
            )

        return interface

    def run_benchmark(self, model_name, judge_model, num_iterations):
        if not model_name or not judge_model:
            return "Please select both a model and a judge model.", None, None, None

        console.print(f"[bold blue]Starting benchmark for {model_name} with {num_iterations} iterations[/bold blue]")
        console.print(f"[bold blue]Judge model: {judge_model}[/bold blue]")

        try:
            benchmark_suite = BenchmarkSuite(model_name, judge_model)

            # Run benchmarks with a rich progress bar
            results = {}
            start_time = time.time()

            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TaskProgressColumn(),
                TimeRemainingColumn(),
                console=console
            ) as progress:
                # Create a single task for overall progress
                overall_task = progress.add_task("[cyan]Running benchmarks...", total=5)  # 5 test categories

                # Run benchmarks
                for test_name, result in benchmark_suite.run_all_tests(num_iterations):
                    results[test_name] = result
                    progress.update(overall_task, advance=1, description=f"[green]Completed {test_name}[/green]")
                    console.print(f"[green]✓ {test_name}: Score {result.get('score', 0):.2f}[/green]")

            total_time = time.time() - start_time

            # Calculate final scores
            final_results = {
                "model_name": model_name,
                "judge_model": judge_model,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "total_time": f"{total_time:.2f}s",
                "results": results,
                "overall_score": sum(r.get("score", 0) for r in results.values()) / len(results) if results else 0
            }

            # Print summary
            console.print("\n[bold green]Benchmark Results Summary:[/bold green]")
            for test_name, result in results.items():
                console.print(f"  {test_name}: {result.get('score', 0):.2f}/10")
            console.print(f"[bold blue]Overall Score: {final_results['overall_score']:.2f}/10[/bold blue]")
            console.print(f"[bold blue]Total Time: {total_time:.2f} seconds[/bold blue]")

            # Create visualization
            df = pd.DataFrame([
                {"Metric": k, "Score": v.get("score", 0)}
                for k, v in results.items()
            ])
            fig = px.bar(
                df,
                x="Metric",
                y="Score",
                title=f"Benchmark Results: {model_name}",
                color="Score",
                color_continuous_scale="viridis"
            )

            # Save results
            os.makedirs("benchmark_results", exist_ok=True)
            result_file = f"benchmark_results/{model_name}_vs_{judge_model}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            with open(result_file, "w") as f:
                json.dump(final_results, f, indent=2)

            progress_text = f"✨ Benchmark completed! Results saved to {result_file}"
            console.print(f"[green]{progress_text}[/green]")

            return progress_text, None, final_results, fig

        except Exception as e:
            error_msg = f"Error during benchmark: {str(e)}"
            console.print(f"[red]{error_msg}[/red]")
            console.print_exception()
            return error_msg, None, None, None


if __name__ == "__main__":
    app = BenchmarkApp()
    interface = app.create_interface()
    interface.launch(share=True)
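
# Note: the sketch below documents the BenchmarkSuite interface this app
# relies on, inferred from the calls above; it is not the module's actual
# source. The signatures are assumptions; check benchmarks/benchmark_suite.py
# for the real definitions.
#
#   class BenchmarkSuite:
#       def __init__(self, model_name: str, judge_model: str): ...
#
#       def run_all_tests(self, num_iterations: int):
#           """Yield (test_name, result) pairs, one per test category
#           (the progress bar above assumes five), where each result is
#           a dict with at least a numeric "score" key on a 0-10 scale."""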