import gradio as gr
import ollama
import pandas as pd
import plotly.express as px
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeRemainingColumn
from datetime import datetime
import json
import time
import os
from benchmarks.benchmark_suite import BenchmarkSuite

console = Console()

def get_available_models():
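    """Return the names of models currently available in Ollama.

    Falls back to a small default list if Ollama cannot be reached
    or reports no installed models.
    """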
    try:
        models = ollama.list()
        if 'models' in models and models['models']:
            # Extract model names, handling cases where 'name' key might not exist
            model_names = []
            for model in models['models']:
                if 'name' in model:
                    model_names.append(model['name'])
                else:
                    # Fallback to 'model' key if 'name' doesn't exist
                    model_names.append(model.get('model', 'unknown_model'))
            return model_names
        else:
            console.print("[yellow]No models found in Ollama[/yellow]")
            return ["codellama", "llama2", "mistral"]  # Fallback default models
    except Exception as e:
        console.print(f"[red]Error fetching models: {e}[/red]")
        return ["codellama", "llama2", "mistral"]  # Fallback default models

class BenchmarkApp:
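    """Gradio front end for running BenchmarkSuite tests against an Ollama model."""
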
    def __init__(self):
        self.available_models = get_available_models()
        
    def create_interface(self):
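        """Build the Gradio Blocks UI and wire the run button to run_benchmark."""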
        with gr.Blocks(theme=gr.themes.Soft()) as interface:
            gr.Markdown("""
            # 🚀 LLM Benchmark Suite
            Compare different LLM models using various benchmarking metrics
            """)
            
            with gr.Row():
                with gr.Column():
                    model_name = gr.Dropdown(
                        choices=self.available_models,
                        label="Select Model to Benchmark",
                        value=self.available_models[0] if self.available_models else None
                    )
                    judge_model = gr.Dropdown(
                        choices=self.available_models,
                        label="Select Judge Model",
                        value=self.available_models[0] if self.available_models else None
                    )
                    num_iterations = gr.Slider(
                        minimum=1,
                        maximum=20,
                        value=5,
                        step=1,
                        label="Number of Test Iterations"
                    )
                    run_button = gr.Button("🎯 Run Benchmark", variant="primary")
                
                with gr.Column():
                    progress_output = gr.Textbox(
                        label="Benchmark Progress",
                        lines=10,
                        max_lines=10
                    )
            
            with gr.Row():
                chat_output = gr.Chatbot(
                    label="Q&A Chat During Benchmark",
                    height=300
                )
            
            with gr.Row():
                with gr.Column():
                    results_json = gr.JSON(label="Detailed Results")
                with gr.Column():
                    plot_output = gr.Plot(label="Performance Visualization")
            
            run_button.click(
                fn=self.run_benchmark,
                inputs=[model_name, judge_model, num_iterations],
                outputs=[progress_output, chat_output, results_json, plot_output]
            )
            
        return interface
    
    def run_benchmark(self, model_name, judge_model, num_iterations):
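        """Run the benchmark suite for the selected model.

        Returns a (progress text, chat history, results dict, Plotly figure)
        tuple matching the interface outputs; the chat history slot is
        currently unused and returned as None.
        """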
        if not model_name or not judge_model:
            return "Please select both a model and a judge model.", None, None, None
        
        console.print(f"[bold blue]Starting benchmark for {model_name} with {num_iterations} iterations[/bold blue]")
        console.print(f"[bold blue]Judge model: {judge_model}[/bold blue]")
        
        try:
            benchmark_suite = BenchmarkSuite(model_name, judge_model)
            
            # Run benchmarks with rich progress bar
            results = {}
            start_time = time.time()
            
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TaskProgressColumn(),
                TimeRemainingColumn(),
                console=console
            ) as progress:
                # Create a single task for overall progress
                overall_task = progress.add_task("[cyan]Running benchmarks...", total=5)  # 5 test categories
                
                # Run benchmarks
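                # run_all_tests is assumed to yield (test_name, result) pairs,
                # where each result is a dict containing a numeric 'score'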
                for test_name, result in benchmark_suite.run_all_tests(num_iterations):
                    results[test_name] = result
                    progress.update(overall_task, advance=1, description=f"[green]Completed {test_name}[/green]")
                    console.print(f"[green]✓ {test_name}: Score {result.get('score', 0):.2f}[/green]")
            
            total_time = time.time() - start_time
            
            # Calculate final scores
            final_results = {
                "model_name": model_name,
                "judge_model": judge_model,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "total_time": f"{total_time:.2f}s",
                "results": results,
                "overall_score": sum(r.get("score", 0) for r in results.values()) / len(results) if results else 0
            }
            
            # Print summary
            console.print("\n[bold green]Benchmark Results Summary:[/bold green]")
            for test_name, result in results.items():
                console.print(f"  {test_name}: {result.get('score', 0):.2f}/10")
            console.print(f"[bold blue]Overall Score: {final_results['overall_score']:.2f}/10[/bold blue]")
            console.print(f"[bold blue]Total Time: {total_time:.2f} seconds[/bold blue]")
            
            # Create visualization
            df = pd.DataFrame([
                {"Metric": k, "Score": v.get("score", 0)}
                for k, v in results.items()
            ])
            
            fig = px.bar(
                df,
                x="Metric",
                y="Score",
                title=f"Benchmark Results: {model_name}",
                color="Score",
                color_continuous_scale="viridis"
            )
            
            # Save results
            os.makedirs("benchmark_results", exist_ok=True)
            result_file = f"benchmark_results/{model_name}_vs_{judge_model}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            with open(result_file, "w") as f:
                json.dump(final_results, f, indent=2)
            
            progress_text = f"✨ Benchmark completed! Results saved to {result_file}"
            console.print(f"[green]{progress_text}[/green]")
            
            return progress_text, None, final_results, fig
            
        except Exception as e:
            error_msg = f"Error during benchmark: {str(e)}"
            console.print(f"[red]{error_msg}[/red]")
            console.print_exception()
            return error_msg, None, None, None

if __name__ == "__main__":
    app = BenchmarkApp()
    interface = app.create_interface()
    interface.launch(share=True)  # share=True exposes a temporary public Gradio link