Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from collections import defaultdict | |
| from logging import getLogger | |
| from pathlib import Path | |
| import librosa | |
| import soundfile as sf | |
| import torch | |
| from joblib import Parallel, delayed | |
| from pyannote.audio import Pipeline | |
| from tqdm import tqdm | |
| from tqdm_joblib import tqdm_joblib | |
| LOG = getLogger(__name__) | |
# Pretrained pipelines already loaded in this process, keyed by auth token.
# Loading is expensive, so each process reuses one instance per token instead
# of rebuilding the pipeline for every input file.
_PIPELINE_CACHE: dict = {}


def _get_pipeline(huggingface_token: str | None) -> Pipeline:
    """Return the pretrained diarization pipeline, loading it at most once.

    Args:
        huggingface_token: Token forwarded to ``Pipeline.from_pretrained`` as
            ``use_auth_token``; also used as the cache key.

    Raises:
        ValueError: If ``Pipeline.from_pretrained`` returns ``None``.
    """
    pipeline = _PIPELINE_CACHE.get(huggingface_token)
    if pipeline is None:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization", use_auth_token=huggingface_token
        )
        if pipeline is None:
            # Failures are never cached, so a later call retries the load.
            raise ValueError("Failed to load pipeline")
        _PIPELINE_CACHE[huggingface_token] = pipeline
    return pipeline


def _process_one(
    input_path: Path,
    output_dir: Path,
    sr: int,
    *,
    min_speakers: int = 1,
    max_speakers: int = 1,
    huggingface_token: str | None = None,
) -> None:
    """Diarize one audio file and write each speaker turn as its own WAV file.

    The file is loaded as mono audio with ``librosa`` at rate ``sr``, run
    through the ``pyannote/speaker-diarization`` pipeline, and every diarized
    segment lasting at least one second is written to
    ``output_dir / f"{speaker}_{index}.wav"``, where ``index`` counts the
    segments kept so far for that speaker (starting at 1).

    Args:
        input_path: Audio file to process.
        output_dir: Directory receiving the clips; created if missing.
        sr: Sampling rate passed to ``librosa.load``; the clips are written
            at the rate ``librosa.load`` returns.
        min_speakers: Lower bound on the speaker count given to the pipeline.
        max_speakers: Upper bound on the speaker count given to the pipeline.
        huggingface_token: Token used to load the pretrained pipeline.

    Raises:
        ValueError: If the pretrained pipeline could not be loaded.
    """
    # Read the audio first: files that cannot be decoded are skipped with a
    # warning before any time is spent on loading or running the pipeline.
    try:
        audio, sr = librosa.load(input_path, sr=sr, mono=True)
    except Exception as e:
        LOG.warning(f"Failed to read {input_path}: {e}")
        return

    pipeline = _get_pipeline(huggingface_token)

    LOG.info(f"Processing {input_path}. This may take a while...")
    diarization = pipeline(
        input_path, min_speakers=min_speakers, max_speakers=max_speakers
    )
    LOG.info(f"Found {len(diarization)} tracks, writing to {output_dir}")

    speaker_count = defaultdict(int)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Materialized as a list so tqdm knows the total and can show progress.
    tracks = list(diarization.itertracks(yield_label=True))
    for segment, _track, speaker in tqdm(tracks, desc=f"Writing {input_path}"):
        # Skip turns shorter than one second.
        if segment.end - segment.start < 1:
            continue
        speaker_count[speaker] += 1
        # Segment bounds are in seconds; convert them to sample indices.
        audio_cut = audio[int(segment.start * sr) : int(segment.end * sr)]
        sf.write(
            (output_dir / f"{speaker}_{speaker_count[speaker]}.wav"),
            audio_cut,
            sr,
        )
    LOG.info(f"Speaker count: {speaker_count}")
def preprocess_speaker_diarization(
    input_dir: Path | str,
    output_dir: Path | str,
    sr: int,
    *,
    min_speakers: int = 1,
    max_speakers: int = 1,
    huggingface_token: str | None = None,
    n_jobs: int = -1,
) -> None:
    """Run speaker diarization over every file found under ``input_dir``.

    Each input file is handed to ``_process_one`` in a joblib worker. The
    clips for ``<input_dir>/<sub>/<name>.<ext>`` are written to the directory
    ``<output_dir>/<sub>/<name>/``, mirroring the input tree.

    Args:
        input_dir: Root directory searched recursively for input files;
            created if it does not exist (in which case nothing is processed).
        output_dir: Root directory for the generated clips; created if missing.
        sr: Sampling rate forwarded to ``_process_one``.
        min_speakers: Lower bound on the speaker count, forwarded per file.
        max_speakers: Upper bound on the speaker count, forwarded per file.
        huggingface_token: Token forwarded per file to load the pipeline.
        n_jobs: Number of parallel jobs passed to ``joblib.Parallel``.
    """
    if huggingface_token is not None and not huggingface_token.startswith("hf_"):
        LOG.warning("Huggingface token probably should start with hf_")
    if not torch.cuda.is_available():
        LOG.warning("CUDA is not available. This will be extremely slow.")

    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    input_dir.mkdir(parents=True, exist_ok=True)
    output_dir.mkdir(parents=True, exist_ok=True)

    # "*.*" also matches directories whose names contain a dot; keep regular
    # files only so they are not scheduled as jobs that can only fail to load.
    input_paths = [path for path in input_dir.rglob("*.*") if path.is_file()]

    with tqdm_joblib(desc="Preprocessing speaker diarization", total=len(input_paths)):
        Parallel(n_jobs=n_jobs)(
            delayed(_process_one)(
                input_path,
                # Mirror the input's relative location; one folder per file.
                output_dir / input_path.relative_to(input_dir).parent / input_path.stem,
                sr,
                max_speakers=max_speakers,
                min_speakers=min_speakers,
                huggingface_token=huggingface_token,
            )
            for input_path in input_paths
        )