hedrekao
HF deploy: clean snapshot without local artifacts
a361db3
"""
Unified Multi-Talker Audio Source Separation Pipeline
Supports multiple approaches for speaker extraction:
- ICA: Simple Independent Component Analysis
- Frankenstein: ICA + Whisper transcription
- ICA+DeepLearning: Two-pass PCA+ICA + Pyannote + SepFormer
Pipeline Stages:
Step 1: Spatial pre-processing (Direction of Arrival estimation)
Step 2: Feature extraction (STFT, mel-spectrogram, energy)
Step 3: Source separation (approach-dependent)
Step 4: Target speaker identification & counting
Step 5: Gender classification & transcription
Usage:
uv run python main.py data/mixture.wav --approach ica --output output/ica_test
uv run python main.py data/mixture.wav --approach frankenstein --output output/frank_test
uv run python main.py data/mixture.wav --approach ica_deeplearning --output output/dl_test
"""
import argparse
import logging
import sys
from pathlib import Path
# Configure logging: timestamped records at INFO level on the root logger.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("main")

# Usage examples appended verbatim to --help (RawDescriptionHelpFormatter
# keeps the line breaks as written).
_EPILOG = """
Examples:
# ICA approach (default)
python main.py data/mixture.wav
# Frankenstein approach
python main.py data/mixture.wav --approach frankenstein -o output/frank
# Two-pass ICA+DeepLearning
python main.py data/mixture.wav --approach ica_deeplearning -o output/dl
# Custom Whisper model
python main.py data/mixture.wav --approach ica -w small
"""


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the separation pipeline."""
    parser = argparse.ArgumentParser(
        description="Unified multi-talker audio source separation pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=_EPILOG,
    )
    parser.add_argument(
        "input_file",
        help="Path to input 4-channel WAV file",
    )
    parser.add_argument(
        "-a", "--approach",
        choices=["ica", "frankenstein", "ica_deeplearning"],
        default="ica",
        help="Approach to use (default: ica)",
    )
    parser.add_argument(
        "-o", "--output",
        default="output",
        help="Output directory (default: output)",
    )
    parser.add_argument(
        "-w", "--whisper-model",
        choices=["tiny", "base", "small", "medium", "large"],
        default="base",
        help="Whisper model size (default: base)",
    )
    parser.add_argument(
        "--hf-token",
        help="HuggingFace token for Pyannote (required for ica_deeplearning)",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Enable verbose logging",
    )
    return parser


def main(argv=None) -> int:
    """Parse the command line, validate the input and run the chosen approach.

    Args:
        argv: Argument list to parse, without the program name. ``None``
            (the default) lets argparse read ``sys.argv[1:]``, so existing
            ``main()`` callers behave as before.

    Returns:
        0 when the pipeline run and its summary logging complete, 1 when
        any exception is raised while running the pipeline.

    Raises:
        SystemExit: With status 1 when the input path does not exist, is a
            directory, does not end in ``.wav``, or when ``get_approach``
            raises ``ValueError``. argparse itself also exits on ``--help``
            or invalid arguments.
    """
    args = _build_parser().parse_args(argv)

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Validate the input first so a bad path is reported immediately,
    # without needing the approaches package to be importable.
    input_path = Path(args.input_file)
    if not input_path.exists():
        log.error("Input file not found: %s", input_path)
        sys.exit(1)
    if input_path.is_dir():
        # exists() alone would accept a directory that happens to be
        # named "something.wav".
        log.error("Input path is a directory, not a file: %s", input_path)
        sys.exit(1)
    if input_path.suffix.lower() != ".wav":
        log.error("Input must be WAV file, got: %s", input_path.suffix)
        sys.exit(1)

    # Deferred import: only needed once the arguments and input are known
    # to be usable (the original also imported it after argument parsing).
    from approaches import get_approach

    try:
        approach_class = get_approach(args.approach)
    except ValueError as exc:
        log.error("%s", exc)
        sys.exit(1)

    approach = approach_class()

    run_kwargs = {
        "input_file": str(input_path),
        "output_dir": args.output,
        "whisper_model": args.whisper_model,
    }
    # hf_token is only forwarded for the ica_deeplearning approach, and
    # only when one was actually supplied.
    if args.approach == "ica_deeplearning" and args.hf_token:
        run_kwargs["hf_token"] = args.hf_token

    try:
        results = approach.run(**run_kwargs)
        log.info("\n" + "=" * 60)
        log.info("PIPELINE COMPLETE")
        log.info("=" * 60)
        log.info("Approach: %s", args.approach)
        log.info("Output dir: %s", args.output)
        log.info("Execution time: %.1fs", results.execution_time_seconds)
        log.info("Speakers detected: %s", results.n_speakers)
        log.info("Talker of interest: Speaker %s", results.talker_of_interest)
        return 0
    except Exception as exc:
        # Top-level boundary: log the traceback and turn the failure into
        # a non-zero exit status.
        log.exception("Pipeline failed: %s", exc)
        return 1
# Script entry point: propagate main()'s return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())