"""
Unified Multi-Talker Audio Source Separation Pipeline

Supports multiple approaches for speaker extraction:
- ICA: Simple Independent Component Analysis
- Frankenstein: ICA + Whisper transcription
- ICA+DeepLearning: Two-pass PCA+ICA + Pyannote + SepFormer

Pipeline Stages:
    Step 1: Spatial pre-processing (Direction of Arrival estimation)
    Step 2: Feature extraction (STFT, mel-spectrogram, energy)
    Step 3: Source separation (approach-dependent)
    Step 4: Target speaker identification & counting
    Step 5: Gender classification & transcription

Usage:
    uv run python main.py data/mixture.wav --approach ica --output output/ica_test
    uv run python main.py data/mixture.wav --approach frankenstein --output output/frank_test
    uv run python main.py data/mixture.wav --approach ica_deeplearning --output output/dl_test
"""
|
|
| import argparse |
| import logging |
| import sys |
| from pathlib import Path |
|
|
| |
# Root logging configuration for the CLI: INFO level by default, with a
# time-of-day stamp, the level, and the emitting logger's name on each line.
_LOG_SETTINGS = {
    "level": logging.INFO,
    "format": "%(asctime)s [%(levelname)s] %(name)s - %(message)s",
    "datefmt": "%H:%M:%S",
}
logging.basicConfig(**_LOG_SETTINGS)

# Logger used by this script; it is registered under the name "main".
log = logging.getLogger("main")
|
|
|
|
def _build_parser():
    """Build the command-line argument parser for the separation pipeline."""
    parser = argparse.ArgumentParser(
        description="Unified multi-talker audio source separation pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # ICA approach (default)
  python main.py data/mixture.wav

  # Frankenstein approach
  python main.py data/mixture.wav --approach frankenstein -o output/frank

  # Two-pass ICA+DeepLearning
  python main.py data/mixture.wav --approach ica_deeplearning -o output/dl

  # Custom Whisper model
  python main.py data/mixture.wav --approach ica -w small
""",
    )

    parser.add_argument(
        "input_file",
        help="Path to input 4-channel WAV file",
    )
    parser.add_argument(
        "-a", "--approach",
        choices=["ica", "frankenstein", "ica_deeplearning"],
        default="ica",
        help="Approach to use (default: ica)",
    )
    parser.add_argument(
        "-o", "--output",
        default="output",
        help="Output directory (default: output)",
    )
    parser.add_argument(
        "-w", "--whisper-model",
        choices=["tiny", "base", "small", "medium", "large"],
        default="base",
        help="Whisper model size (default: base)",
    )
    parser.add_argument(
        "--hf-token",
        help="HuggingFace token for Pyannote (required for ica_deeplearning)",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose logging",
    )
    return parser


def main(argv=None) -> int:
    """Run the separation pipeline from the command line.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the original
            behavior of reading the process arguments.

    Returns:
        ``0`` when the pipeline completes, ``1`` when the input is rejected,
        the approach cannot be resolved, or the pipeline raises.

    Raises:
        SystemExit: Raised by ``argparse`` itself for ``--help`` or for
            malformed command-line arguments.
    """
    args = _build_parser().parse_args(argv)

    # Apply verbosity first so every later message honors it.
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Validate the input before touching the approach implementations, so an
    # obviously bad path is reported immediately.
    input_path = Path(args.input_file)
    if not input_path.exists():
        log.error(f"Input file not found: {input_path}")
        return 1
    if not input_path.is_file():
        # e.g. a directory whose name happens to end in ".wav"
        log.error(f"Input path is not a file: {input_path}")
        return 1
    if input_path.suffix.lower() != ".wav":
        log.error(f"Input must be WAV file, got: {input_path.suffix}")
        return 1

    # Kept as a function-scope import, as in the original.
    # NOTE(review): presumably deferred so --help and argument errors do not
    # pay for loading the approach implementations - confirm.
    from approaches import get_approach

    try:
        approach_class = get_approach(args.approach)
    except ValueError as e:
        log.error(str(e))
        return 1

    run_kwargs = {
        "input_file": str(input_path),
        "output_dir": args.output,
        "whisper_model": args.whisper_model,
    }
    if args.approach == "ica_deeplearning":
        if args.hf_token:
            run_kwargs["hf_token"] = args.hf_token
        else:
            # The CLI help declares the token required for this approach;
            # surface the omission instead of passing it over silently.
            log.warning(
                "--hf-token not provided; ica_deeplearning requires a "
                "HuggingFace token for Pyannote"
            )

    try:
        # Instantiation sits inside the try so that a constructor failure is
        # reported the same way as a failure inside run().
        approach = approach_class()
        results = approach.run(**run_kwargs)

        log.info("\n" + "="*60)
        log.info("PIPELINE COMPLETE")
        log.info("="*60)
        log.info(f"Approach: {args.approach}")
        log.info(f"Output dir: {args.output}")
        log.info(f"Execution time: {results.execution_time_seconds:.1f}s")
        log.info(f"Speakers detected: {results.n_speakers}")
        log.info(f"Talker of interest: Speaker {results.talker_of_interest}")

        return 0

    except Exception as e:
        # Top-level boundary: log the traceback and convert to an exit code.
        log.exception(f"Pipeline failed: {e}")
        return 1
|
|
|
|
# Script entry point: run the CLI and hand its return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
|
|