# LLM-Benchmark-Model-vs-Judge / test_benchmark.py
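"""
Full benchmark test script: runs a candidate model against a judge model through
the BenchmarkSuite categories, printing per-category scores and live Q&A output
to the console with rich. Run as `python test_benchmark.py detailed` (or with no
argument); the default model tags are assumed to be locally available Ollama models.
"""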
import ollama
import time
import json
from rich.console import Console
from rich.panel import Panel
from benchmarks.benchmark_suite import BenchmarkSuite
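# Assumption, inferred from how the suite is used below: each BenchmarkSuite test
# method accepts (num_iterations, callback) and returns a dict containing at least
# a "score" key; see benchmarks/benchmark_suite.py for the actual interface.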
from typing import Dict, Any, List, Tuple, Generator

console = Console()


def run_full_benchmark(model_name: str, judge_model: str, num_iterations: int = 1):
    """
    Runs the full benchmark with real-time progress and Q&A output using rich.
    """
    console.print(Panel("[bold magenta]LLM Full Benchmark Test[/bold magenta]", expand=False))
    console.print(f"[bold blue]Running full benchmark: {model_name} vs {judge_model}[/bold blue]")

    benchmark_suite = BenchmarkSuite(model_name, judge_model)
    results = {}

    # Test categories as (display name, BenchmarkSuite method name) pairs
    test_categories = [
        ("Logical Reasoning", "test_logical_reasoning"),
        ("Code Generation", "test_code_generation"),
        ("Mathematical Problem Solving", "test_math_solving"),
        ("Context Understanding", "test_context_understanding"),
        ("Performance Metrics", "test_performance")
    ]

    # Store test cases data
    test_cases = []
    max_test_cases = 5  # Limit the number of test cases displayed

    # Track the last update time for each test case to throttle updates
    last_updates = {}
    update_interval = 0.5  # Minimum seconds between updates per test case
    # Callback passed to the benchmark suite to report Q&A information as it runs
    def update_qa_output_callback(prompt: str, model_response: str, judge_response: str, model_name: str, judge_model_name: str):
        # Create a key for this test case
        row_key = f"{model_name}_{hash(prompt) % 1000}"

        # Throttle updates: skip this one if not enough time has passed for this test case
        current_time = time.time()
        if row_key in last_updates:
            time_since_last = current_time - last_updates[row_key]
            if time_since_last < update_interval:
                return
        last_updates[row_key] = current_time

        # Check whether this test case has already been recorded
        existing_case = None
        for case in test_cases:
            if case.get('key') == row_key:
                existing_case = case
                break

        if existing_case is None:
            # Add a new test case; if we've hit the limit, drop the oldest one first
            if len(test_cases) >= max_test_cases:
                test_cases.pop(0)
            test_cases.append({
                'key': row_key,
                'model_name': model_name,
                'prompt': prompt,
                'model_response': model_response,
                'judge_response': judge_response
            })
        else:
            # Update the existing test case
            existing_case['model_response'] = model_response
            existing_case['judge_response'] = judge_response

        # Output the Q&A information with rich
        console.print(f"[bold blue]Model:[/bold blue] {model_name}")
        console.print(f"[bold cyan]Prompt:[/bold cyan] {prompt}")
        console.print(f"[bold green]Response:[/bold green] {model_response}")
        console.print(f"[bold yellow]Judge:[/bold yellow] {judge_response}")
        console.print("-" * 50)  # Separator line

    for category_name, method_name in test_categories:
        console.print(f"[bold green]Running {category_name} Benchmark...[/bold green]")

        # Show model loading/processing
        console.print(f"[magenta] Loading models for {category_name}...[/magenta]")
        time.sleep(0.5)  # Simulate loading time

        try:
            # Run the actual test, passing the Q&A callback
            test_func = getattr(benchmark_suite, method_name)
            result = test_func(num_iterations, update_qa_output_callback)
            results[category_name] = result
            console.print(f"[green]✓ {category_name} completed: {result.get('score', 0):.1f}/10[/green]")
        except Exception as e:
            console.print(f"[red]✗ {category_name} failed: {str(e)}[/red]")
            results[category_name] = {"score": 0, "error": str(e)}

    # Calculate the overall score (failed categories count as 0)
    overall_score = sum(r.get("score", 0) for r in results.values()) / len(results) if results else 0

    # Print summary
    console.print(Panel("[bold magenta]Benchmark Results Summary[/bold magenta]", expand=False))
    for test_name, result in results.items():
        score = result.get('score', 0)
        if 'error' in result:
            console.print(f" {test_name}: [red]Error - {result['error']}[/red]")
        else:
            console.print(f" {test_name}: {score:.1f}/10")
    console.print(f"[bold blue]Overall Score: {overall_score:.1f}/10[/bold blue]")

    return results
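
# Example (programmatic use; the model tags are the defaults from the __main__
# block below and are assumed to be locally available Ollama models):
#     results = run_full_benchmark("qwen3:8b", "deepscaler:latest", num_iterations=1)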

if __name__ == "__main__":
    import sys

    model_to_test = "qwen3:8b"
    judge_model = "deepscaler:latest"
    iterations = 1

    if len(sys.argv) > 1:
        if sys.argv[1] == "detailed":
            run_full_benchmark(model_to_test, judge_model, iterations)
        else:
            console.print("[red]Invalid argument. Use 'python test_benchmark.py detailed'[/red]")
    else:
        run_full_benchmark(model_to_test, judge_model, iterations)
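
# Optional sketch (an assumption, not something this script currently does): the
# returned results dict could be persisted with the json module imported above,
# under an illustrative filename:
#     results = run_full_benchmark(model_to_test, judge_model, iterations)
#     with open("benchmark_results.json", "w") as f:
#         json.dump(results, f, indent=2)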