""" Benchmark script: Compare all approaches on test audio files. Runs all three approaches on each input file, collects metrics, and produces a CSV comparison report. Usage: uv run python scripts/benchmark.py --data-dir data --output-dir benchmark_results """ import argparse import sys import csv import json import logging import time from pathlib import Path from datetime import datetime from approaches import list_approaches, get_approach logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S", ) log = logging.getLogger("benchmark") # Ensure project root is importable when script is run directly sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) class BenchmarkRunner: """Run benchmark across all approaches and audio files.""" def __init__(self, data_dir: str, output_dir: str): self.data_dir = Path(data_dir) self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) # Results storage self.results = [] def find_test_files(self) -> list: """Find all WAV files in data directory.""" wav_files = sorted(self.data_dir.glob("*.wav")) if not wav_files: log.warning(f"No WAV files found in {self.data_dir}") return wav_files def run_benchmark(self, whisper_model: str = "base"): """Run all approaches on all test files.""" test_files = self.find_test_files() if not test_files: log.error(f"No test files found in {self.data_dir}") return log.info("="*70) log.info(f"BENCHMARK: {len(list_approaches())} approaches × {len(test_files)} files") log.info("="*70) for input_file in test_files: log.info(f"\n{'='*70}") log.info(f"File: {input_file.name}") log.info(f"{'='*70}") for approach_name in list_approaches(): log.info(f"\n Testing approach: {approach_name}") log.info("-"*70) try: result = self._run_approach( approach_name, input_file, whisper_model, ) self.results.append(result) except Exception as e: log.error(f" FAILED: {e}") result = { "timestamp": datetime.now().isoformat(), "input_file": input_file.name, "approach": approach_name, "status": "FAILED", "error": str(e), } self.results.append(result) log.info(f"\n{'='*70}") log.info("BENCHMARK COMPLETE") log.info(f"{'='*70}") # Save results self._save_results() self._print_summary() def _run_approach(self, approach_name: str, input_file: Path, whisper_model: str): """Run single approach on single file.""" # Create output directory for this run output_subdir = self.output_dir / approach_name / input_file.stem output_subdir.mkdir(parents=True, exist_ok=True) # Initialize approach approach_class = get_approach(approach_name) approach = approach_class() # Run start_time = time.time() pipeline_output = approach.run( input_file=str(input_file), output_dir=str(output_subdir), whisper_model=whisper_model, ) execution_time = time.time() - start_time result = { "timestamp": datetime.now().isoformat(), "input_file": input_file.name, "input_size_mb": input_file.stat().st_size / (1024*1024), "approach": approach_name, "status": "SUCCESS", "duration_seconds": pipeline_output.duration_seconds, "execution_time_seconds": execution_time, "samples_per_second": (pipeline_output.duration_seconds / execution_time) if execution_time > 0 else 0, "n_speakers": pipeline_output.n_speakers, "talker_of_interest": pipeline_output.talker_of_interest, "separation_method": pipeline_output.separation_method, "doa_method": pipeline_output.doa_method, "gender_method": pipeline_output.gender_method, "asr_model": pipeline_output.asr_model, "output_dir": str(output_subdir), } # Log metrics log.info(" 
Status: SUCCESS") log.info(f" Execution time: {execution_time:.2f}s") log.info(f" Speakers: {pipeline_output.n_speakers}") log.info(f" ToI: Speaker {pipeline_output.talker_of_interest}") log.info(f" Output: {output_subdir}") return result def _save_results(self): """Save results to CSV and JSON.""" # Save CSV csv_path = self.output_dir / "benchmark_results.csv" if self.results: fieldnames = self.results[0].keys() with open(csv_path, 'w', newline='') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(self.results) log.info(f"\nSaved: {csv_path}") # Save JSON json_path = self.output_dir / "benchmark_results.json" with open(json_path, 'w') as f: json.dump(self.results, f, indent=2) log.info(f"Saved: {json_path}") def _print_summary(self): """Print summary statistics.""" if not self.results: return log.info("\n" + "="*70) log.info("SUMMARY") log.info("="*70) # Group by approach by_approach = {} for result in self.results: approach = result.get("approach") if approach not in by_approach: by_approach[approach] = [] by_approach[approach].append(result) # Print stats per approach for approach, runs in sorted(by_approach.items()): successful = [r for r in runs if r.get("status") == "SUCCESS"] failed = [r for r in runs if r.get("status") == "FAILED"] log.info(f"\nApproach: {approach}") log.info(f" Successful: {len(successful)}/{len(runs)}") if successful: avg_exec_time = sum(r["execution_time_seconds"] for r in successful) / len(successful) avg_speedup = sum(r.get("samples_per_second", 0) for r in successful) / len(successful) log.info(f" Avg execution time: {avg_exec_time:.2f}s") log.info(f" Avg speedup (samples/s): {avg_speedup:.1f}x") if failed: log.info(f" Failed runs: {len(failed)}") def main(): parser = argparse.ArgumentParser(description="Benchmark all approaches") parser.add_argument("--data-dir", default="data", help="Directory with test WAV files") parser.add_argument("--output-dir", default="benchmark_results", help="Output directory") parser.add_argument("-w", "--whisper-model", default="base", help="Whisper model") parser.add_argument("-v", "--verbose", action="store_true") args = parser.parse_args() if args.verbose: logging.getLogger().setLevel(logging.DEBUG) data_path = Path(args.data_dir) if not data_path.exists(): log.error(f"Data directory not found: {data_path}") return 1 runner = BenchmarkRunner(args.data_dir, args.output_dir) runner.run_benchmark(args.whisper_model) return 0 if __name__ == "__main__": sys.exit(main())
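
# ---------------------------------------------------------------------------
# Example (hypothetical, not executed): one way to compare approaches from the
# generated CSV report. Assumes pandas is available in the environment; it is
# not a dependency of this script.
#
#   import pandas as pd
#   df = pd.read_csv("benchmark_results/benchmark_results.csv")
#   ok = df[df["status"] == "SUCCESS"]
#   print(ok.groupby("approach")[["execution_time_seconds",
#                                 "realtime_factor"]].mean())
# ---------------------------------------------------------------------------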