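"""Full benchmark driver.

Runs a candidate model against a judge model through the test categories exposed by
benchmarks.benchmark_suite.BenchmarkSuite and prints per-category and overall scores
with rich. Assumes the benchmarks package is importable and that the configured
models (e.g. qwen3:8b and deepscaler:latest, served locally via Ollama) are available.
"""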
import time

from rich.console import Console
from rich.panel import Panel

from benchmarks.benchmark_suite import BenchmarkSuite

console = Console()
def run_full_benchmark(model_name: str, judge_model: str, num_iterations: int = 1):
    """
    Runs the full benchmark with real-time progress and Q&A output using rich.
    """
    console.print(Panel("[bold magenta]LLM Full Benchmark Test[/bold magenta]", expand=False))
    console.print(f"[bold blue]Running full benchmark: {model_name} vs {judge_model}[/bold blue]")

    benchmark_suite = BenchmarkSuite(model_name, judge_model)
    results = {}

    # Test categories with their display names
    test_categories = [
        ("Logical Reasoning", "test_logical_reasoning"),
        ("Code Generation", "test_code_generation"),
        ("Mathematical Problem Solving", "test_math_solving"),
        ("Context Understanding", "test_context_understanding"),
        ("Performance Metrics", "test_performance"),
    ]

    # Store test cases data
    test_cases = []
    max_test_cases = 5  # Limit the number of test cases displayed

    # Track the last update time for each test case to throttle updates
    last_updates = {}
    update_interval = 0.5  # Minimum seconds between updates per test case
    # Callback function to output Q&A information
    def update_qa_output_callback(prompt: str, model_response: str, judge_response: str,
                                  model_name: str, judge_model_name: str):
        # Create a key for this test case
        row_key = f"{model_name}_{hash(prompt) % 1000}"

        # Check if we should update based on throttling
        current_time = time.time()
        if row_key in last_updates:
            time_since_last = current_time - last_updates[row_key]
            if time_since_last < update_interval:
                # Skip update if not enough time has passed
                return
        last_updates[row_key] = current_time

        # Check if this test case already exists
        existing_case = None
        for case in test_cases:
            if case.get('key') == row_key:
                existing_case = case
                break

        if existing_case is None:
            # If we've reached the display limit, drop the oldest test case
            if len(test_cases) >= max_test_cases:
                test_cases.pop(0)
            test_cases.append({
                'key': row_key,
                'model_name': model_name,
                'prompt': prompt,
                'model_response': model_response,
                'judge_response': judge_response,
            })
        else:
            # Update the existing test case
            existing_case['model_response'] = model_response
            existing_case['judge_response'] = judge_response

        # Output the Q&A information in rich text
        console.print(f"[bold blue]Model:[/bold blue] {model_name}")
        console.print(f"[bold cyan]Prompt:[/bold cyan] {prompt}")
        console.print(f"[bold green]Response:[/bold green] {model_response}")
        console.print(f"[bold yellow]Judge:[/bold yellow] {judge_response}")
        console.print("-" * 50)  # Separator line
    for category_name, method_name in test_categories:
        console.print(f"[bold green]Running {category_name} Benchmark...[/bold green]")

        # Show model loading/processing
        console.print(f"[magenta] Loading models for {category_name}...[/magenta]")
        time.sleep(0.5)  # Simulate loading time

        try:
            # Run the actual test, passing the Q&A callback
            test_func = getattr(benchmark_suite, method_name)
            result = test_func(num_iterations, update_qa_output_callback)
            results[category_name] = result
            console.print(f"[green]✓ {category_name} completed: {result.get('score', 0):.1f}/10[/green]")
        except Exception as e:
            console.print(f"[red]✗ {category_name} failed: {str(e)}[/red]")
            results[category_name] = {"score": 0, "error": str(e)}
    # Calculate final scores
    overall_score = sum(r.get("score", 0) for r in results.values()) / len(results) if results else 0

    # Print summary
    console.print(Panel("[bold magenta]Benchmark Results Summary[/bold magenta]", expand=False))
    for test_name, result in results.items():
        score = result.get('score', 0)
        if 'error' in result:
            console.print(f" {test_name}: [red]Error - {result['error']}[/red]")
        else:
            console.print(f" {test_name}: {score:.1f}/10")

    console.print(f"[bold blue]Overall Score: {overall_score:.1f}/10[/bold blue]")
    return results
if __name__ == "__main__":
    import sys

    model_to_test = "qwen3:8b"
    judge_model = "deepscaler:latest"
    iterations = 1

    if len(sys.argv) > 1:
        if sys.argv[1] == "detailed":
            run_full_benchmark(model_to_test, judge_model, iterations)
        else:
            console.print("[red]Invalid argument. Use 'python test_benchmark.py detailed'[/red]")
    else:
        run_full_benchmark(model_to_test, judge_model, iterations)
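# ---------------------------------------------------------------------------
# The benchmarks.benchmark_suite module is not part of this file. The commented
# sketch below is only a hypothetical stand-in inferred from how the suite is
# used above (constructor arguments, per-category test methods, the Q&A callback
# signature, and result dicts containing a "score"); the real implementation is
# expected to query the candidate and judge models rather than return canned
# values.
#
#   class BenchmarkSuite:
#       def __init__(self, model_name: str, judge_model: str):
#           self.model_name = model_name
#           self.judge_model = judge_model
#
#       def _run(self, prompt, callback):
#           # Placeholder values; a real suite would call the models here.
#           model_response = "stub response"
#           judge_response = "stub judgement"
#           if callback:
#               callback(prompt, model_response, judge_response,
#                        self.model_name, self.judge_model)
#           return {"score": 0.0}
#
#       def test_logical_reasoning(self, num_iterations=1, callback=None):
#           return self._run("example logical reasoning prompt", callback)
#
#       # test_code_generation, test_math_solving, test_context_understanding,
#       # and test_performance would follow the same shape.
# ---------------------------------------------------------------------------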