Spaces:

Hedrekao
/

audio-explorers-visualization

Sleeping

hedrekao

HF deploy: clean snapshot without local artifacts

a361db3 about 2 months ago

4.29 kB

	"""
	Unified Multi-Talker Audio Source Separation Pipeline

	Supports multiple approaches for speaker extraction:
	- ICA: Simple Independent Component Analysis
	- Frankenstein: ICA + Whisper transcription
	- ICA+DeepLearning: Two-pass PCA+ICA + Pyannote + SepFormer

	Pipeline Stages:
	Step 1: Spatial pre-processing (Direction of Arrival estimation)
	Step 2: Feature extraction (STFT, mel-spectrogram, energy)
	Step 3: Source separation (approach-dependent)
	Step 4: Target speaker identification & counting
	Step 5: Gender classification & transcription

	Usage:
	uv run python main.py data/mixture.wav --approach ica --output output/ica_test
	uv run python main.py data/mixture.wav --approach frankenstein --output output/frank_test
	uv run python main.py data/mixture.wav --approach ica_deeplearning --output output/dl_test
	"""

	import argparse
	import logging
	import sys
	from pathlib import Path

	# Configure root logging once at import time: timestamped, level-tagged records.
	logging.basicConfig(
	    level=logging.INFO,
	    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
	    datefmt="%H:%M:%S",
	)
	log = logging.getLogger("main")

	# Help epilog, assembled line by line so its text does not depend on how
	# this source file happens to be indented.
	_EPILOG = "\n".join(
	    (
	        "",
	        "Examples:",
	        "# ICA approach (default)",
	        "python main.py data/mixture.wav",
	        "",
	        "# Frankenstein approach",
	        "python main.py data/mixture.wav --approach frankenstein -o output/frank",
	        "",
	        "# Two-pass ICA+DeepLearning",
	        "python main.py data/mixture.wav --approach ica_deeplearning -o output/dl",
	        "",
	        "# Custom Whisper model",
	        "python main.py data/mixture.wav --approach ica -w small",
	        "",
	    )
	)


	def _build_parser():
	    """Build the argparse parser describing the pipeline's command-line options."""
	    parser = argparse.ArgumentParser(
	        description="Unified multi-talker audio source separation pipeline",
	        # Raw formatter keeps the line breaks of the examples in the epilog.
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	        epilog=_EPILOG,
	    )
	    parser.add_argument(
	        "input_file",
	        help="Path to input 4-channel WAV file",
	    )
	    parser.add_argument(
	        "-a", "--approach",
	        choices=["ica", "frankenstein", "ica_deeplearning"],
	        default="ica",
	        help="Approach to use (default: ica)",
	    )
	    parser.add_argument(
	        "-o", "--output",
	        default="output",
	        help="Output directory (default: output)",
	    )
	    parser.add_argument(
	        "-w", "--whisper-model",
	        choices=["tiny", "base", "small", "medium", "large"],
	        default="base",
	        help="Whisper model size (default: base)",
	    )
	    parser.add_argument(
	        "--hf-token",
	        help="HuggingFace token for Pyannote (required for ica_deeplearning)",
	    )
	    parser.add_argument(
	        "-v", "--verbose",
	        action="store_true",
	        help="Enable verbose logging",
	    )
	    return parser


	def main(argv=None) -> int:
	    """Run the source-separation pipeline from the command line.

	    Args:
	        argv: Optional list of argument strings. ``None`` (the default)
	            makes argparse read ``sys.argv[1:]``, so a plain ``main()``
	            call behaves as it did before this parameter existed.

	    Returns:
	        0 when the pipeline completes; 1 when the input path is rejected,
	        the approach cannot be resolved, or the pipeline raises.

	    Raises:
	        SystemExit: raised by argparse itself for ``--help`` or for
	            malformed arguments.
	    """
	    args = _build_parser().parse_args(argv)

	    # Raise verbosity before anything else runs so later debug records
	    # (including any emitted while importing the approaches) are shown.
	    if args.verbose:
	        logging.getLogger().setLevel(logging.DEBUG)

	    # Validate the input first; is_file() also rejects a directory that
	    # merely happens to be named "*.wav".
	    input_path = Path(args.input_file)
	    if not input_path.is_file():
	        log.error("Input file not found: %s", input_path)
	        return 1
	    if input_path.suffix.lower() != ".wav":
	        log.error("Input must be WAV file, got: %s", input_path.suffix)
	        return 1

	    # Imported lazily, after argument parsing and input validation, so that
	    # --help and bad invocations do not pay for loading the approaches
	    # package (presumably heavy ML dependencies -- confirm).
	    from approaches import get_approach

	    try:
	        approach_class = get_approach(args.approach)
	    except ValueError as e:
	        log.error("%s", e)
	        return 1

	    approach = approach_class()

	    run_kwargs = {
	        "input_file": str(input_path),
	        "output_dir": args.output,
	        "whisper_model": args.whisper_model,
	    }
	    # Only the ica_deeplearning approach is handed the HuggingFace token.
	    if args.approach == "ica_deeplearning" and args.hf_token:
	        run_kwargs["hf_token"] = args.hf_token

	    # Top-level boundary: log any pipeline failure with its traceback and
	    # turn it into a non-zero exit status.
	    try:
	        results = approach.run(**run_kwargs)
	    except Exception as e:
	        log.exception("Pipeline failed: %s", e)
	        return 1

	    banner = "=" * 60
	    log.info("\n" + banner)
	    log.info("PIPELINE COMPLETE")
	    log.info(banner)
	    log.info("Approach: %s", args.approach)
	    log.info("Output dir: %s", args.output)
	    log.info("Execution time: %.1fs", results.execution_time_seconds)
	    log.info("Speakers detected: %s", results.n_speakers)
	    log.info("Talker of interest: Speaker %s", results.talker_of_interest)
	    return 0


	# Script entry point: propagate main()'s return value as the process
	# exit status when the file is executed directly.
	if __name__ == "__main__":
	    raise SystemExit(main())