""" Unified Multi-Talker Audio Source Separation Pipeline Supports multiple approaches for speaker extraction: - ICA: Simple Independent Component Analysis - Frankenstein: ICA + Whisper transcription - ICA+DeepLearning: Two-pass PCA+ICA + Pyannote + SepFormer Pipeline Stages: Step 1: Spatial pre-processing (Direction of Arrival estimation) Step 2: Feature extraction (STFT, mel-spectrogram, energy) Step 3: Source separation (approach-dependent) Step 4: Target speaker identification & counting Step 5: Gender classification & transcription Usage: uv run python main.py data/mixture.wav --approach ica --output output/ica_test uv run python main.py data/mixture.wav --approach frankenstein --output output/frank_test uv run python main.py data/mixture.wav --approach ica_deeplearning --output output/dl_test """ import argparse import logging import sys from pathlib import Path # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s - %(message)s", datefmt="%H:%M:%S", ) log = logging.getLogger("main") def main(): parser = argparse.ArgumentParser( description="Unified multi-talker audio source separation pipeline", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # ICA approach (default) python main.py data/mixture.wav # Frankenstein approach python main.py data/mixture.wav --approach frankenstein -o output/frank # Two-pass ICA+DeepLearning python main.py data/mixture.wav --approach ica_deeplearning -o output/dl # Custom Whisper model python main.py data/mixture.wav --approach ica -w small """, ) parser.add_argument( "input_file", help="Path to input 4-channel WAV file", ) parser.add_argument( "-a", "--approach", choices=["ica", "frankenstein", "ica_deeplearning"], default="ica", help="Approach to use (default: ica)", ) parser.add_argument( "-o", "--output", default="output", help="Output directory (default: output)", ) parser.add_argument( "-w", "--whisper-model", choices=["tiny", "base", "small", "medium", "large"], default="base", help="Whisper model size (default: base)", ) parser.add_argument( "--hf-token", help="HuggingFace token for Pyannote (required for ica_deeplearning)", ) parser.add_argument( "-v", "--verbose", action="store_true", help="Enable verbose logging", ) args = parser.parse_args() # Now import approaches after argparse is set up from approaches import get_approach # Set log level if args.verbose: logging.getLogger().setLevel(logging.DEBUG) # Validate input input_path = Path(args.input_file) if not input_path.exists(): log.error(f"Input file not found: {input_path}") sys.exit(1) if input_path.suffix.lower() != ".wav": log.error(f"Input must be WAV file, got: {input_path.suffix}") sys.exit(1) # Get approach try: approach_class = get_approach(args.approach) except ValueError as e: log.error(str(e)) sys.exit(1) # Initialize approach approach = approach_class() # Run pipeline (only pass hf_token for ica_deeplearning approach) try: run_kwargs = { "input_file": str(input_path), "output_dir": args.output, "whisper_model": args.whisper_model, } if args.approach == "ica_deeplearning" and args.hf_token: run_kwargs["hf_token"] = args.hf_token results = approach.run(**run_kwargs) log.info("\n" + "="*60) log.info("PIPELINE COMPLETE") log.info("="*60) log.info(f"Approach: {args.approach}") log.info(f"Output dir: {args.output}") log.info(f"Execution time: {results.execution_time_seconds:.1f}s") log.info(f"Speakers detected: {results.n_speakers}") log.info(f"Talker of interest: Speaker {results.talker_of_interest}") return 0 except Exception as e: log.exception(f"Pipeline failed: {e}") return 1 if __name__ == "__main__": sys.exit(main())