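"""Gradio front-end for the LLM Benchmark Suite.

Select an Ollama model and a judge model, run the benchmark suite with a
rich progress bar, and view the scores as JSON plus a Plotly bar chart.
Results are also saved as timestamped JSON files under benchmark_results/.
"""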
import gradio as gr
import ollama
import pandas as pd
import plotly.express as px
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeRemainingColumn
from datetime import datetime
import json
import time
import os

from benchmarks.benchmark_suite import BenchmarkSuite

console = Console()
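
# NOTE: the payload of ollama.list() differs across client versions: each
# model entry may carry a 'name' key or a 'model' key, so both are handled below.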
def get_available_models():
    try:
        models = ollama.list()
        if 'models' in models and models['models']:
            # Extract model names, handling cases where the 'name' key might not exist
            model_names = []
            for model in models['models']:
                if 'name' in model:
                    model_names.append(model['name'])
                else:
                    # Fall back to the 'model' key if 'name' doesn't exist
                    model_names.append(model.get('model', 'unknown_model'))
            return model_names
        else:
            console.print("[yellow]No models found in Ollama[/yellow]")
            return ["codellama", "llama2", "mistral"]  # Fallback default models
    except Exception as e:
        console.print(f"[red]Error fetching models: {e}[/red]")
        return ["codellama", "llama2", "mistral"]  # Fallback default models
class BenchmarkApp:
    def __init__(self):
        self.available_models = get_available_models()

    def create_interface(self):
        with gr.Blocks(theme=gr.themes.Soft()) as interface:
            gr.Markdown("""
            # 🚀 LLM Benchmark Suite
            Compare different LLM models using various benchmarking metrics
            """)

            with gr.Row():
                with gr.Column():
                    model_name = gr.Dropdown(
                        choices=self.available_models,
                        label="Select Model to Benchmark",
                        value=self.available_models[0] if self.available_models else None,
                    )
                    judge_model = gr.Dropdown(
                        choices=self.available_models,
                        label="Select Judge Model",
                        value=self.available_models[0] if self.available_models else None,
                    )
                    num_iterations = gr.Slider(
                        minimum=1,
                        maximum=20,
                        value=5,
                        step=1,
                        label="Number of Test Iterations",
                    )
                    run_button = gr.Button("🎯 Run Benchmark", variant="primary")
                with gr.Column():
                    progress_output = gr.Textbox(
                        label="Benchmark Progress",
                        lines=10,
                        max_lines=10,
                    )

            with gr.Row():
                chat_output = gr.Chatbot(
                    label="Q&A Chat During Benchmark",
                    height=300,
                )

            with gr.Row():
                with gr.Column():
                    results_json = gr.JSON(label="Detailed Results")
                with gr.Column():
                    plot_output = gr.Plot(label="Performance Visualization")

            run_button.click(
                fn=self.run_benchmark,
                inputs=[model_name, judge_model, num_iterations],
                outputs=[progress_output, chat_output, results_json, plot_output],
            )

        return interface
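
    # Assumed interface of the local BenchmarkSuite (not shown in this file):
    # it is constructed with (model_name, judge_model), and run_all_tests(n)
    # yields (test_name, result_dict) pairs, one per test category, with each
    # result_dict carrying a 0-10 'score'.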
    def run_benchmark(self, model_name, judge_model, num_iterations):
        if not model_name or not judge_model:
            return "Please select both a model and a judge model.", None, None, None

        console.print(f"[bold blue]Starting benchmark for {model_name} with {num_iterations} iterations[/bold blue]")
        console.print(f"[bold blue]Judge model: {judge_model}[/bold blue]")

        try:
            benchmark_suite = BenchmarkSuite(model_name, judge_model)

            # Run benchmarks with a rich progress bar
            results = {}
            start_time = time.time()

            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TaskProgressColumn(),
                TimeRemainingColumn(),
                console=console,
            ) as progress:
                # Create a single task for overall progress (5 test categories)
                overall_task = progress.add_task("[cyan]Running benchmarks...", total=5)

                for test_name, result in benchmark_suite.run_all_tests(num_iterations):
                    results[test_name] = result
                    progress.update(overall_task, advance=1, description=f"[green]Completed {test_name}[/green]")
                    console.print(f"[green]✓ {test_name}: Score {result.get('score', 0):.2f}[/green]")

            total_time = time.time() - start_time

            # Calculate final scores
            final_results = {
                "model_name": model_name,
                "judge_model": judge_model,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "total_time": f"{total_time:.2f}s",
                "results": results,
                "overall_score": sum(r.get("score", 0) for r in results.values()) / len(results) if results else 0,
            }

            # Print summary
            console.print("\n[bold green]Benchmark Results Summary:[/bold green]")
            for test_name, result in results.items():
                console.print(f"  {test_name}: {result.get('score', 0):.2f}/10")
            console.print(f"[bold blue]Overall Score: {final_results['overall_score']:.2f}/10[/bold blue]")
            console.print(f"[bold blue]Total Time: {total_time:.2f} seconds[/bold blue]")

            # Create visualization
            df = pd.DataFrame([
                {"Metric": k, "Score": v.get("score", 0)}
                for k, v in results.items()
            ])
            fig = px.bar(
                df,
                x="Metric",
                y="Score",
                title=f"Benchmark Results: {model_name}",
                color="Score",
                color_continuous_scale="viridis",
            )

            # Save results to a timestamped JSON file
            os.makedirs("benchmark_results", exist_ok=True)
            result_file = f"benchmark_results/{model_name}_vs_{judge_model}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            with open(result_file, "w") as f:
                json.dump(final_results, f, indent=2)

            progress_text = f"✨ Benchmark completed! Results saved to {result_file}"
            console.print(f"[green]{progress_text}[/green]")
            # The chatbot slot is returned as None; this version never populates it.
            return progress_text, None, final_results, fig
        except Exception as e:
            error_msg = f"Error during benchmark: {str(e)}"
            console.print(f"[red]{error_msg}[/red]")
            console.print_exception()
            return error_msg, None, None, None
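
# Launch with share=True to request a temporary public Gradio link in
# addition to the local server.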
if __name__ == "__main__":
    app = BenchmarkApp()
    interface = app.create_interface()
    interface.launch(share=True)