import ollama
import time
import json
from rich.console import Console
from rich.panel import Panel
from benchmarks.benchmark_suite import BenchmarkSuite
from typing import Dict, Any, List, Tuple, Generator

console = Console()


def run_full_benchmark(model_name: str, judge_model: str, num_iterations: int = 1):
    """
    Runs the full benchmark with real-time progress and Q&A output using rich.
    """
    console.print(Panel("[bold magenta]LLM Full Benchmark Test[/bold magenta]", expand=False))
    console.print(f"[bold blue]Running full benchmark: {model_name} vs {judge_model}[/bold blue]")

    benchmark_suite = BenchmarkSuite(model_name, judge_model)
    results = {}

    # Test categories with their display names
    test_categories = [
        ("Logical Reasoning", "test_logical_reasoning"),
        ("Code Generation", "test_code_generation"),
        ("Mathematical Problem Solving", "test_math_solving"),
        ("Context Understanding", "test_context_understanding"),
        ("Performance Metrics", "test_performance"),
    ]

    # Store test cases data
    test_cases = []
    max_test_cases = 5  # Limit the number of test cases displayed

    # Track the last update time for each test case to throttle updates
    last_updates = {}
    update_interval = 0.5  # Minimum seconds between updates per test case

    # Callback function to output Q&A information
    def update_qa_output_callback(prompt: str, model_response: str, judge_response: str,
                                  model_name: str, judge_model_name: str):
        # Create a key for this test case
        row_key = f"{model_name}_{hash(prompt) % 1000}"

        # Check if we should update based on throttling
        current_time = time.time()
        if row_key in last_updates:
            time_since_last = current_time - last_updates[row_key]
            if time_since_last < update_interval:
                # Skip update if not enough time has passed
                return
        last_updates[row_key] = current_time

        # Check if this test case already exists
        existing_case = None
        for case in test_cases:
            if case.get('key') == row_key:
                existing_case = case
                break

        if existing_case is None:
            # Add a new test case; if we've reached the limit, drop the oldest first
            if len(test_cases) >= max_test_cases:
                test_cases.pop(0)  # Remove the oldest
            test_cases.append({
                'key': row_key,
                'model_name': model_name,
                'prompt': prompt,
                'model_response': model_response,
                'judge_response': judge_response
            })
        else:
            # Update the existing test case
            existing_case['model_response'] = model_response
            existing_case['judge_response'] = judge_response

        # Output the Q&A information in rich text
        console.print(f"[bold blue]Model:[/bold blue] {model_name}")
        console.print(f"[bold cyan]Prompt:[/bold cyan] {prompt}")
        console.print(f"[bold green]Response:[/bold green] {model_response}")
        console.print(f"[bold yellow]Judge:[/bold yellow] {judge_response}")
        console.print("-" * 50)  # Separator line

    for category_name, method_name in test_categories:
        console.print(f"[bold green]Running {category_name} Benchmark...[/bold green]")

        # Show model loading/processing
        console.print(f"[magenta] Loading models for {category_name}...[/magenta]")
        time.sleep(0.5)  # Simulate loading time

        try:
            # Run the actual test, passing the Q&A callback
            test_func = getattr(benchmark_suite, method_name)
            result = test_func(num_iterations, update_qa_output_callback)
            results[category_name] = result
            console.print(f"[green]✓ {category_name} completed: {result.get('score', 0):.1f}/10[/green]")
        except Exception as e:
            console.print(f"[red]✗ {category_name} failed: {str(e)}[/red]")
            results[category_name] = {"score": 0, "error": str(e)}

    # Calculate final scores
    overall_score = sum(r.get("score", 0) for r in results.values()) / len(results) if results else 0

    # Print summary
    console.print(Panel("[bold magenta]Benchmark Results Summary[/bold magenta]", expand=False))
    for test_name, result in results.items():
        score = result.get('score', 0)
        if 'error' in result:
            console.print(f"  {test_name}: [red]Error - {result['error']}[/red]")
        else:
            console.print(f"  {test_name}: {score:.1f}/10")

    console.print(f"[bold blue]Overall Score: {overall_score:.1f}/10[/bold blue]")

    return results


if __name__ == "__main__":
    import sys

    model_to_test = "qwen3:8b"
    judge_model = "deepscaler:latest"
    iterations = 1

    if len(sys.argv) > 1:
        if sys.argv[1] == "detailed":
            run_full_benchmark(model_to_test, judge_model, iterations)
        else:
            console.print("[red]Invalid argument. Use 'python test_benchmark.py detailed'[/red]")
    else:
        run_full_benchmark(model_to_test, judge_model, iterations)
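
# ---------------------------------------------------------------------------
# Assumed interface of benchmarks.benchmark_suite.BenchmarkSuite.
# This is a hypothetical sketch for reference only, not the actual
# implementation: the script above relies solely on the constructor and the
# five test_* methods, each of which is expected to return a dict containing
# at least a "score" in the 0-10 range and to invoke the Q&A callback with
# (prompt, model_response, judge_response, model_name, judge_model_name).
#
# class BenchmarkSuite:
#     def __init__(self, model_name: str, judge_model: str): ...
#
#     def test_logical_reasoning(self, num_iterations, qa_callback) -> dict: ...
#     def test_code_generation(self, num_iterations, qa_callback) -> dict: ...
#     def test_math_solving(self, num_iterations, qa_callback) -> dict: ...
#     def test_context_understanding(self, num_iterations, qa_callback) -> dict: ...
#     def test_performance(self, num_iterations, qa_callback) -> dict: ...
# ---------------------------------------------------------------------------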