# LLM-Benchmark-Model-vs-Judge / test_benchmark.py
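"""
Full benchmark test script: runs a candidate model against a judge model through
the BenchmarkSuite categories, printing per-category scores and live Q&A output
to the console with rich. Run as `python test_benchmark.py detailed` (or with no
argument); the default model tags are assumed to be locally available Ollama models.
"""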
import ollama
import time
import json
from rich.console import Console
from rich.panel import Panel
from benchmarks.benchmark_suite import BenchmarkSuite
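# Assumption, inferred from how the suite is used below: each BenchmarkSuite test
# method accepts (num_iterations, callback) and returns a dict containing at least
# a "score" key; see benchmarks/benchmark_suite.py for the actual interface.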
from typing import Dict, Any, List, Tuple, Generator

console = Console()


def run_full_benchmark(model_name: str, judge_model: str, num_iterations: int = 1):
    """
    Runs the full benchmark with real-time progress and Q&A output using rich.
    """
    console.print(Panel("[bold magenta]LLM Full Benchmark Test[/bold magenta]", expand=False))
    console.print(f"[bold blue]Running full benchmark: {model_name} vs {judge_model}[/bold blue]")

    benchmark_suite = BenchmarkSuite(model_name, judge_model)
    results = {}

    # Test categories as (display name, BenchmarkSuite method name) pairs
    test_categories = [
        ("Logical Reasoning", "test_logical_reasoning"),
        ("Code Generation", "test_code_generation"),
        ("Mathematical Problem Solving", "test_math_solving"),
        ("Context Understanding", "test_context_understanding"),
        ("Performance Metrics", "test_performance")
    ]

    # Store test cases data
    test_cases = []
    max_test_cases = 5  # Limit the number of test cases displayed

    # Track the last update time for each test case to throttle updates
    last_updates = {}
    update_interval = 0.5  # Minimum seconds between updates per test case
    # Callback passed to the benchmark suite to report Q&A information as it runs
    def update_qa_output_callback(prompt: str, model_response: str, judge_response: str, model_name: str, judge_model_name: str):
        # Create a key for this test case
        row_key = f"{model_name}_{hash(prompt) % 1000}"

        # Throttle updates: skip this one if not enough time has passed for this test case
        current_time = time.time()
        if row_key in last_updates:
            time_since_last = current_time - last_updates[row_key]
            if time_since_last < update_interval:
                return
        last_updates[row_key] = current_time

        # Check whether this test case has already been recorded
        existing_case = None
        for case in test_cases:
            if case.get('key') == row_key:
                existing_case = case
                break

        if existing_case is None:
            # Add a new test case; if we've hit the limit, drop the oldest one first
            if len(test_cases) >= max_test_cases:
                test_cases.pop(0)
            test_cases.append({
                'key': row_key,
                'model_name': model_name,
                'prompt': prompt,
                'model_response': model_response,
                'judge_response': judge_response
            })
        else:
            # Update the existing test case
            existing_case['model_response'] = model_response
            existing_case['judge_response'] = judge_response

        # Output the Q&A information with rich
        console.print(f"[bold blue]Model:[/bold blue] {model_name}")
        console.print(f"[bold cyan]Prompt:[/bold cyan] {prompt}")
        console.print(f"[bold green]Response:[/bold green] {model_response}")
        console.print(f"[bold yellow]Judge:[/bold yellow] {judge_response}")
        console.print("-" * 50)  # Separator line

    for category_name, method_name in test_categories:
        console.print(f"[bold green]Running {category_name} Benchmark...[/bold green]")

        # Show model loading/processing
        console.print(f"[magenta] Loading models for {category_name}...[/magenta]")
        time.sleep(0.5)  # Simulate loading time

        try:
            # Run the actual test, passing the Q&A callback
            test_func = getattr(benchmark_suite, method_name)
            result = test_func(num_iterations, update_qa_output_callback)
            results[category_name] = result
            console.print(f"[green]✓ {category_name} completed: {result.get('score', 0):.1f}/10[/green]")
        except Exception as e:
            console.print(f"[red]✗ {category_name} failed: {str(e)}[/red]")
            results[category_name] = {"score": 0, "error": str(e)}

    # Calculate the overall score (failed categories count as 0)
    overall_score = sum(r.get("score", 0) for r in results.values()) / len(results) if results else 0

    # Print summary
    console.print(Panel("[bold magenta]Benchmark Results Summary[/bold magenta]", expand=False))
    for test_name, result in results.items():
        score = result.get('score', 0)
        if 'error' in result:
            console.print(f" {test_name}: [red]Error - {result['error']}[/red]")
        else:
            console.print(f" {test_name}: {score:.1f}/10")
    console.print(f"[bold blue]Overall Score: {overall_score:.1f}/10[/bold blue]")

    return results
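
# Example (programmatic use; the model tags are the defaults from the __main__
# block below and are assumed to be locally available Ollama models):
#     results = run_full_benchmark("qwen3:8b", "deepscaler:latest", num_iterations=1)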

if __name__ == "__main__":
    import sys

    model_to_test = "qwen3:8b"
    judge_model = "deepscaler:latest"
    iterations = 1

    if len(sys.argv) > 1:
        if sys.argv[1] == "detailed":
            run_full_benchmark(model_to_test, judge_model, iterations)
        else:
            console.print("[red]Invalid argument. Use 'python test_benchmark.py detailed'[/red]")
    else:
        run_full_benchmark(model_to_test, judge_model, iterations)
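
# Optional sketch (an assumption, not something this script currently does): the
# returned results dict could be persisted with the json module imported above,
# under an illustrative filename:
#     results = run_full_benchmark(model_to_test, judge_model, iterations)
#     with open("benchmark_results.json", "w") as f:
#         json.dump(results, f, indent=2)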