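"""Gradio app for benchmarking Ollama models.

The user selects a model to benchmark and a judge model; the app runs the
BenchmarkSuite, prints rich progress to the console, and returns detailed
JSON results plus a bar chart of per-test scores to the UI. Results are also
saved under the benchmark_results/ directory as JSON.
"""
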
import gradio as gr
import ollama
import pandas as pd
import plotly.express as px
from rich.console import Console
from rich.progress import track
from datetime import datetime
import json
import time
import os
from benchmarks.benchmark_suite import BenchmarkSuite
console = Console()
def get_available_models():
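    """Return the names of models reported by the local Ollama server.

    Falls back to a small default list if Ollama is unreachable or reports no models.
    """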
    try:
        models = ollama.list()
        if 'models' in models and models['models']:
            # Extract model names, handling cases where the 'name' key might not exist
            model_names = []
            for model in models['models']:
                if 'name' in model:
                    model_names.append(model['name'])
                else:
                    # Fall back to the 'model' key if 'name' doesn't exist
                    model_names.append(model.get('model', 'unknown_model'))
            return model_names
        else:
            console.print("[yellow]No models found in Ollama[/yellow]")
            return ["codellama", "llama2", "mistral"]  # Fallback default models
    except Exception as e:
        console.print(f"[red]Error fetching models: {e}[/red]")
        return ["codellama", "llama2", "mistral"]  # Fallback default models


class BenchmarkApp:
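    """Gradio front end that wires the UI controls to the BenchmarkSuite runner."""
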
    def __init__(self):
        self.available_models = get_available_models()

    def create_interface(self):
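        """Build and return the Gradio Blocks interface for the benchmark app."""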
        with gr.Blocks(theme=gr.themes.Soft()) as interface:
            gr.Markdown("""
            # 🚀 LLM Benchmark Suite
            Compare different LLM models using various benchmarking metrics
            """)

            with gr.Row():
                with gr.Column():
                    model_name = gr.Dropdown(
                        choices=self.available_models,
                        label="Select Model to Benchmark",
                        value=self.available_models[0] if self.available_models else None
                    )
                    judge_model = gr.Dropdown(
                        choices=self.available_models,
                        label="Select Judge Model",
                        value=self.available_models[0] if self.available_models else None
                    )
                    num_iterations = gr.Slider(
                        minimum=1,
                        maximum=20,
                        value=5,
                        step=1,
                        label="Number of Test Iterations"
                    )
                    run_button = gr.Button("🎯 Run Benchmark", variant="primary")

                with gr.Column():
                    progress_output = gr.Textbox(
                        label="Benchmark Progress",
                        lines=10,
                        max_lines=10
                    )

            with gr.Row():
                chat_output = gr.Chatbot(
                    label="Q&A Chat During Benchmark",
                    height=300
                )

            with gr.Row():
                with gr.Column():
                    results_json = gr.JSON(label="Detailed Results")
                with gr.Column():
                    plot_output = gr.Plot(label="Performance Visualization")

            run_button.click(
                fn=self.run_benchmark,
                inputs=[model_name, judge_model, num_iterations],
                outputs=[progress_output, chat_output, results_json, plot_output]
            )

        return interface

    def run_benchmark(self, model_name, judge_model, num_iterations):
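        """Run the benchmark suite for the selected model.

        Returns a tuple of (progress text, chat history, results dict, Plotly figure)
        matching the outputs wired up in create_interface.
        """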
        if not model_name or not judge_model:
            return "Please select both a model and a judge model.", None, None, None

        console.print(f"[bold blue]Starting benchmark for {model_name} with {num_iterations} iterations[/bold blue]")
        console.print(f"[bold blue]Judge model: {judge_model}[/bold blue]")

        try:
            benchmark_suite = BenchmarkSuite(model_name, judge_model)

            # Run benchmarks with rich progress bar
            results = {}
            start_time = time.time()

            from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeRemainingColumn
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TaskProgressColumn(),
                TimeRemainingColumn(),
                console=console
            ) as progress:
                # Create a single task for overall progress
                overall_task = progress.add_task("[cyan]Running benchmarks...", total=5)  # 5 test categories

                # Run benchmarks
                for test_name, result in benchmark_suite.run_all_tests(num_iterations):
                    results[test_name] = result
                    progress.update(overall_task, advance=1, description=f"[green]Completed {test_name}[/green]")
                    console.print(f"[green]✓ {test_name}: Score {result.get('score', 0):.2f}[/green]")

            total_time = time.time() - start_time

            # Calculate final scores
            final_results = {
                "model_name": model_name,
                "judge_model": judge_model,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "total_time": f"{total_time:.2f}s",
                "results": results,
                "overall_score": sum(r.get("score", 0) for r in results.values()) / len(results) if results else 0
            }

            # Print summary
            console.print("\n[bold green]Benchmark Results Summary:[/bold green]")
            for test_name, result in results.items():
                console.print(f"  {test_name}: {result.get('score', 0):.2f}/10")
            console.print(f"[bold blue]Overall Score: {final_results['overall_score']:.2f}/10[/bold blue]")
            console.print(f"[bold blue]Total Time: {total_time:.2f} seconds[/bold blue]")

            # Create visualization
            df = pd.DataFrame([
                {"Metric": k, "Score": v.get("score", 0)}
                for k, v in results.items()
            ])
            fig = px.bar(
                df,
                x="Metric",
                y="Score",
                title=f"Benchmark Results: {model_name}",
                color="Score",
                color_continuous_scale="viridis"
            )

            # Save results
            os.makedirs("benchmark_results", exist_ok=True)
            result_file = f"benchmark_results/{model_name}_vs_{judge_model}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            with open(result_file, "w") as f:
                json.dump(final_results, f, indent=2)

            progress_text = f"✨ Benchmark completed! Results saved to {result_file}"
            console.print(f"[green]{progress_text}[/green]")

            return progress_text, None, final_results, fig

        except Exception as e:
            error_msg = f"Error during benchmark: {str(e)}"
            console.print(f"[red]{error_msg}[/red]")
            console.print_exception()
            return error_msg, None, None, None

if __name__ == "__main__":
    app = BenchmarkApp()
    interface = app.create_interface()
    interface.launch(share=True)