# session-scribe / diarization.py
# Author: Jedi09
# Commit bf646a8 (verified): "Update diarization.py"
"""
Speaker Diarization Module
Speaker diarization with pyannote-audio (who speaks when).
"""
import os
from typing import List, Tuple, Optional
# PyTorch 2.6+ compatibility: disable the weights_only restriction so that
# pyannote model checkpoints can be loaded. The variable is set here, before
# torch is imported on the next line.
# NOTE(review): presumably this opts out of the safer weights-only checkpoint
# loading for the whole process — confirm, and only load trusted model files.
os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
import torch
# Pick the compute device once, at import time: CUDA when torch reports it as
# available, otherwise CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Print the selected device when the module is imported.
print(f"🔧 Diarization device: {DEVICE}")
def get_diarization_pipeline(hf_token: Optional[str] = None):
    """
    Load the pyannote speaker diarization pipeline and move it to ``DEVICE``.

    The access token is taken from ``hf_token`` or, when that is empty, from
    the ``HF_TOKEN`` environment variable. A missing token only produces a
    warning; loading is still attempted.

    Args:
        hf_token: Hugging Face access token (falls back to ``HF_TOKEN``).

    Returns:
        The loaded pipeline, or None if loading failed for any reason. Errors
        are printed, never raised, so callers only need a None check.
    """
    model_id = "pyannote/speaker-diarization-3.1"
    try:
        # Imported inside the try so that an ImportError is handled like any
        # other load failure (printed, then None is returned).
        from pyannote.audio import Pipeline

        token = hf_token or os.environ.get("HF_TOKEN")
        if not token:
            print("⚠️ HF_TOKEN bulunamadı. pyannote modeli yüklenemeyebilir.")

        try:
            pipeline = Pipeline.from_pretrained(model_id, token=token)
        except TypeError:
            # Older pyannote.audio releases name this keyword `use_auth_token`
            # and reject `token=` with a TypeError. If the TypeError had a
            # different cause, the retry raises again and is reported below.
            pipeline = Pipeline.from_pretrained(model_id, use_auth_token=token)

        if pipeline is None:
            # Some pyannote.audio versions return None instead of raising when
            # the model cannot be fetched (e.g. gated model, bad token).
            # Report that directly rather than failing on `pipeline.to`.
            print(
                "❌ Diarization pipeline yüklenemedi: model indirilemedi "
                "(HF_TOKEN ve model erişimini kontrol edin)"
            )
            return None

        # Run inference on the device selected at module import.
        pipeline.to(DEVICE)
        print("✅ Diarization pipeline yüklendi!")
        return pipeline
    except Exception as e:
        # Best-effort loader: any failure degrades to "no diarization".
        print(f"❌ Diarization pipeline yüklenemedi: {e}")
        return None
def diarize_audio(audio_path: str, pipeline, num_speakers: Optional[int] = None) -> List[Tuple[float, float, str]]:
    """
    Perform speaker diarization on an audio file.

    Args:
        audio_path: Path to the audio file, passed to the pipeline unchanged.
        pipeline: Callable diarization pipeline (as returned by
            ``get_diarization_pipeline``); None disables diarization.
        num_speakers: Upper bound on the number of speakers. None, 0 or a
            negative value lets the pipeline auto-detect the speaker count.

    Returns:
        List of (start_time, end_time, speaker_label) tuples in the order the
        pipeline yields them. Empty if ``pipeline`` is None or if diarization
        fails (the error is printed, not raised).
    """
    if pipeline is None:
        return []

    try:
        print(f"🔍 Diarization parametreleri: num_speakers={num_speakers}")

        if num_speakers is not None and num_speakers >= 1:
            # Give the pipeline a range rather than an exact count. The lower
            # bound is clamped so that num_speakers=1 does not produce the
            # impossible range min_speakers=2 > max_speakers=1.
            result = pipeline(
                audio_path,
                min_speakers=min(2, num_speakers),
                max_speakers=num_speakers,
            )
        else:
            # Auto-detect the number of speakers.
            result = pipeline(audio_path)

        # The result is either an object carrying the annotation in its
        # `speaker_diarization` attribute, or the annotation itself.
        if hasattr(result, "speaker_diarization"):
            diarization = result.speaker_diarization
            print("🔍 Using speaker_diarization attribute")
        else:
            diarization = result

        segments = [
            (segment.start, segment.end, speaker)
            for segment, _track, speaker in diarization.itertracks(yield_label=True)
        ]
        unique_speakers = {speaker for _start, _end, speaker in segments}

        print(f"✅ Diarization tamamlandı: {len(segments)} segment, {len(unique_speakers)} konuşmacı")
        print(f"🔍 Bulunan konuşmacılar: {unique_speakers}")
        return segments
    except Exception as e:
        # Best-effort: a diarization failure yields no segments, not a crash.
        print(f"❌ Diarization hatası: {e}")
        return []
def format_speaker_label(speaker: str) -> str:
    """
    Convert a pyannote speaker label to a user-friendly one.

    ``SPEAKER_00`` becomes ``Kişi 1``, ``SPEAKER_01`` becomes ``Kişi 2`` and
    so on for any number of speakers. The displayed number is the label's
    zero-based index plus one.

    Args:
        speaker: Label such as ``SPEAKER_00``.

    Returns:
        The friendly label, or ``speaker`` unchanged when it does not have the
        ``SPEAKER_<digits>`` form.
    """
    prefix = "SPEAKER_"
    if isinstance(speaker, str) and speaker.startswith(prefix):
        index = speaker[len(prefix):]
        # isdecimal() only accepts characters that int() can parse.
        if index.isdecimal():
            return f"Kişi {int(index) + 1}"
    return speaker
def format_timestamp(seconds: float) -> str:
    """
    Convert a duration in seconds to ``HH:MM:SS`` or ``MM:SS`` text.

    Fractional seconds are truncated. The hour field is only included when the
    value is at least one hour. Negative input is clamped to zero, because
    floor division and modulo on negative numbers would otherwise produce a
    misleading result (for example -5 would format as "59:55").

    Args:
        seconds: Offset in seconds.

    Returns:
        Zero-padded timestamp string without surrounding brackets.
    """
    total = max(0, int(seconds))
    minutes, secs = divmod(total, 60)
    hours, minutes = divmod(minutes, 60)
    if hours > 0:
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"