import os
import json
import re
import torch
import torchaudio
import noisereduce as nr
import numpy as np
from pyannote.audio import Pipeline
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline as hf_pipeline
import tempfile
from pyannote.core import Annotation, Segment
from pyannote.metrics.diarization import DiarizationErrorRate
from jiwer import wer, Compose, ToLowerCase, RemovePunctuation, RemoveMultipleSpaces, Strip
class ASR_Diarization:
def __init__(self, HF_TOKEN,
diar_model="pyannote/speaker-diarization-3.1",
asr_model="Capstone04/TrainedWhisper_Medium",
model_path=None,
use_vad=True,
vad_threshold=0.3,
min_segment_duration=0.5,
snr_threshold=15.0,
min_whisper_duration=0.3):
self.HF_TOKEN = HF_TOKEN
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.use_vad = use_vad
self.vad_threshold = vad_threshold
self.min_segment_duration = min_segment_duration
self.snr_threshold = snr_threshold
self.min_whisper_duration = min_whisper_duration
# Load diarization model
self.diar_pipeline = Pipeline.from_pretrained(diar_model, use_auth_token=HF_TOKEN)
self.diar_pipeline = self.diar_pipeline.to(torch.device(self.device))
# Load WebRTC VAD for post-diarization filtering
if self.use_vad:
try:
import webrtcvad
self.vad = webrtcvad.Vad(2)
print("WebRTC VAD loaded for post-diarization filtering")
            except ImportError:
                print("webrtcvad not installed; disabling VAD filtering")
                self.use_vad = False
# Load ASR model
if model_path and os.path.exists(model_path):
print(f"Loading custom ASR model from: {model_path}")
actual_asr_model = model_path
else:
print(f"Loading default ASR model: {asr_model}")
actual_asr_model = asr_model
processor = WhisperProcessor.from_pretrained(actual_asr_model, token=HF_TOKEN)
model = WhisperForConditionalGeneration.from_pretrained(actual_asr_model, token=HF_TOKEN).to(self.device)
self.asr_pipeline = hf_pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
device=0 if self.device == "cuda" else -1,
            return_timestamps=True  # required for long-form (>30 s) inputs
)
def clean_transcription_text(self, text):
"""Clean ASR text for better TTS performance"""
if not text:
return ""
# Basic cleaning
text = text.strip()
# Fix punctuation spacing for TTS
text = re.sub(r'\s+([.,!?;:])', r'\1', text) # Remove space before punctuation
text = re.sub(r'([.,!?;:])(?=\w)', r'\1 ', text) # Add space after punctuation
# Normalize whitespace
text = re.sub(r'\s+', ' ', text)
return text.strip()
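    # Illustrative example of the cleaning above (hypothetical input):
    #   "hello , world.How are you ?"  ->  "hello, world. How are you?"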
def should_keep_segment(self, text, duration, rms_energy):
"""Generalized segment quality assessment"""
# Duration too short
if duration < self.min_whisper_duration:
return False
# Energy too low (likely noise)
if rms_energy < 0.001:
return False
# Text too short or just punctuation
clean_text = text.strip()
if len(clean_text) <= 1:
return False
return True
def calculate_snr(self, audio_path):
"""NEW: Calculate SNR using RMS energy"""
try:
import librosa
y, sr = librosa.load(audio_path, sr=16000, mono=True)
# RMS-based SNR
rms = librosa.feature.rms(y=y)[0]
if len(rms) == 0:
return float('inf')
            # Proxy SNR: frames with high RMS are treated as signal, low RMS as noise.
            # Note this is a ratio of mean RMS amplitudes rather than true signal
            # power; the snr_threshold defaults are calibrated to this proxy scale.
            high_rms = rms[rms > np.percentile(rms, 70)]
            low_rms = rms[rms <= np.percentile(rms, 30)]
            if len(high_rms) == 0 or len(low_rms) == 0:
                return float('inf')
            signal_level = np.mean(high_rms)
            noise_level = np.mean(low_rms)
            if noise_level == 0:
                return float('inf')
            return 10 * np.log10(signal_level / noise_level)
except Exception as e:
print(f"SNR calculation failed: {e}")
return float('inf')
    def calculate_rms_energy(self, audio_chunk):
        """Calculate RMS energy for an audio chunk"""
        # Guard against empty slices, which would make np.mean return NaN
        if audio_chunk.size == 0:
            return 0.0
        return np.sqrt(np.mean(audio_chunk ** 2))
def run_webrtc_vad_on_segment(self, audio_path, segment_start, segment_end):
"""Run WebRTC VAD on segment to get speech ratio"""
if not self.use_vad:
return 1.0
try:
import wave
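            # Assumption: the input WAV is 16-bit mono PCM at a rate WebRTC VAD
            # supports (8/16/32/48 kHz); stereo or other formats would need
            # conversion before the frame slicing below.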
# Load audio
with wave.open(audio_path, "rb") as wf:
sample_rate = wf.getframerate()
n_frames = wf.getnframes()
audio_data = wf.readframes(n_frames)
audio_array = np.frombuffer(audio_data, dtype=np.int16)
start_sample = int(segment_start * sample_rate)
end_sample = int(segment_end * sample_rate)
segment_audio = audio_array[start_sample:end_sample]
segment_bytes = segment_audio.tobytes()
# WebRTC VAD processing (30ms frames)
frame_duration = 30
bytes_per_sample = 2
frame_size = int(sample_rate * frame_duration / 1000) * bytes_per_sample
speech_frames = 0
total_frames = 0
for i in range(0, len(segment_bytes) - frame_size + 1, frame_size):
frame = segment_bytes[i:i + frame_size]
if len(frame) == frame_size:
is_speech = self.vad.is_speech(frame, sample_rate)
if is_speech:
speech_frames += 1
total_frames += 1
return speech_frames / total_frames if total_frames > 0 else 0.0
except Exception as e:
print(f"WebRTC VAD failed: {e}")
return 0.0
def run_diarization(self, audio_path):
"""Run diarization with VAD AFTER approach"""
# Step 1: Diarization sees FULL audio first
diarization = self.diar_pipeline(audio_path)
diar_segments = [
{"start": t.start, "end": t.end, "speaker": spk}
for t, _, spk in diarization.itertracks(yield_label=True)
]
        raw_speakers = {seg["speaker"] for seg in diar_segments}
print(f"Diarization detected {len(raw_speakers)} speakers: {sorted(raw_speakers)}")
# Step 2: Calculate SNR for adaptive processing
snr = self.calculate_snr(audio_path)
# Step 3: Apply VAD filtering ONLY if low SNR
if snr < self.snr_threshold and self.use_vad:
print(f"Low SNR ({snr:.1f} dB), applying VAD filtering")
filtered_segments = []
for seg in diar_segments:
                # Segments under 0.2 s are too short for reliable VAD; drop them outright
                if (seg["end"] - seg["start"]) < 0.2:
                    continue
speech_ratio = self.run_webrtc_vad_on_segment(
audio_path, seg["start"], seg["end"]
)
if speech_ratio >= self.vad_threshold:
filtered_segments.append(seg)
else:
print(f"Filtered low-speech segment: {seg['start']:.2f}-{seg['end']:.2f} (speech: {speech_ratio:.1%})")
diar_segments = filtered_segments
else:
print(f"Good SNR ({snr:.1f} dB), using all diarization segments")
# Step 4: Duration filtering for Whisper
filtered_segments = [
seg for seg in diar_segments
if (seg["end"] - seg["start"]) >= self.min_whisper_duration
]
print(f"Final: {len(filtered_segments)} segments for Whisper")
return filtered_segments
def merge_consecutive_speaker_segments(self, segments):
"""Merge only consecutive segments from the same speaker while preserving order"""
if not segments:
return []
# Sort by start time to ensure correct order
segments.sort(key=lambda x: x["start"])
merged_segments = []
for seg in segments:
if not merged_segments:
# First segment
merged_segments.append(seg)
else:
last_seg = merged_segments[-1]
# Check if same speaker AND consecutive (small gap < 2 seconds)
if (seg["speaker"] == last_seg["speaker"] and
(seg["start"] - last_seg["end"]) < 2.0):
# Merge with previous segment
last_seg["text"] += " " + seg["text"]
last_seg["end"] = seg["end"]
else:
# Different speaker or large gap - keep as separate segment
merged_segments.append(seg)
print(f"Reduced {len(segments)} segments to {len(merged_segments)} while preserving order")
return merged_segments
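    # Illustrative example of the merge above (hypothetical segments):
    #   [A 0.0-1.0 "hi", A 1.5-2.0 "there", B 2.1-3.0 "hello"]
    #   -> [A 0.0-2.0 "hi there", B 2.1-3.0 "hello"]  (0.5 s gap < 2.0 s, so A's turns merge)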
def run_transcription(self, audio_path, diar_json):
"""Segment-level transcription without word timestamps"""
# Load and standardize audio
audio, sr = torchaudio.load(audio_path)
# Resample to 16kHz for consistency
if sr != 16000:
resampler = torchaudio.transforms.Resample(sr, 16000)
audio = resampler(audio)
sr = 16000
merged_segments = []
speaker_segments = {}
# Calculate SNR for adaptive noise reduction
snr = self.calculate_snr(audio_path)
for seg in diar_json:
start, end, spk = seg["start"], seg["end"], seg["speaker"]
# Skip segments that are too short for Whisper
segment_duration = end - start
if segment_duration < self.min_whisper_duration:
print(f"Skipping short segment for Whisper: {start:.2f}-{end:.2f} ({segment_duration:.2f}s)")
continue
start_sample, end_sample = int(start * sr), int(end * sr)
# Handle both mono and stereo audio
if audio.shape[0] > 1: # Stereo
chunk = torch.mean(audio[:, start_sample:end_sample], dim=0).numpy()
else: # Mono
chunk = audio[0, start_sample:end_sample].numpy()
# Calculate RMS energy for this segment
rms_energy = self.calculate_rms_energy(chunk)
# Adaptive noise reduction based on SNR + RMS
if len(chunk) > int(0.1 * sr):
if snr < 10 or rms_energy < 0.01: # Very noisy or low energy
reduced = nr.reduce_noise(y=chunk, sr=sr, stationary=True, prop_decrease=0.8)
elif snr < 20: # Moderately noisy
reduced = nr.reduce_noise(y=chunk, sr=sr, stationary=True, prop_decrease=0.5)
else: # Clean audio
reduced = chunk
else:
reduced = chunk
            try:
                # Pass the sampling rate explicitly rather than relying on the
                # feature extractor's default
                result = self.asr_pipeline(
                    {"raw": reduced, "sampling_rate": sr},
                    generate_kwargs={
                        "task": "transcribe",
                        "language": "en",
                        "temperature": 0.0  # Greedy decoding for deterministic output
                    }
                )
except Exception as e:
print(f"Whisper failed on segment {start:.2f}-{end:.2f}: {e}")
continue
# Extract just the text (no timestamp processing)
text = result.get("text", "").strip()
# Clean the text for TTS and apply quality filtering
clean_text = self.clean_transcription_text(text)
if clean_text and self.should_keep_segment(clean_text, segment_duration, rms_energy):
seg_dict = {
"speaker": spk,
"start": start, # Keep segment boundaries
"end": end, # Keep segment boundaries
"text": clean_text, # Use cleaned text
"rms_energy": float(rms_energy)
}
merged_segments.append(seg_dict)
if spk not in speaker_segments:
speaker_segments[spk] = []
speaker_segments[spk].append(seg_dict)
return merged_segments, list(speaker_segments.keys())
def run_pipeline(self, audio_path, output_dir=None, base_name=None,
ref_rttm=None, ref_json=None, nse_events=None):
"""Add input validation and proper RTTM format"""
# Validate input audio file
if not os.path.exists(audio_path):
raise FileNotFoundError(f"Audio file not found: {audio_path}")
try:
# Quick validation that it's loadable audio
audio, sr = torchaudio.load(audio_path)
if audio.numel() == 0:
raise ValueError("Audio file is empty")
except Exception as e:
raise ValueError(f"Invalid audio file: {e}")
print(f"Processing with VAD: {'ON' if self.use_vad else 'OFF'}")
# Run diarization and transcription
diar_json = self.run_diarization(audio_path)
merged_segments, speakers = self.run_transcription(audio_path, diar_json)
# Merge consecutive segments by same speaker
merged_segments = self.merge_consecutive_speaker_segments(merged_segments)
# Combine ASR segments with NSE events if provided
if nse_events:
print(f"Combining {len(merged_segments)} ASR segments with {len(nse_events)} NSE events")
all_segments = merged_segments + nse_events
# Sort by start time for proper timeline
all_segments.sort(key=lambda x: x["start"])
else:
all_segments = merged_segments
if output_dir and base_name:
os.makedirs(output_dir, exist_ok=True)
# Save RTTM with standard format and precision
rttm_path = os.path.join(output_dir, f"{base_name}.rttm")
with open(rttm_path, "w") as f:
for seg in diar_json:
f.write(
f"SPEAKER {base_name} 1 {seg['start']:.3f} "
f"{seg['end']-seg['start']:.3f} <NA> <NA> "
f"{seg['speaker']} <NA> <NA>\n"
)
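            # Each RTTM line written above looks like (hypothetical values):
            # SPEAKER meeting01 1 12.340 3.210 <NA> <NA> SPEAKER_00 <NA> <NA>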
# Save transcription (with NSE events if available)
merged_path = os.path.join(output_dir, f"{base_name}_merged_transcription.json")
with open(merged_path, "w") as f:
json.dump(all_segments, f, indent=2)
# Evaluation if refs are provided
eval_results = None
if ref_rttm or ref_json:
eval_results = self.evaluate(output_dir, base_name,
ref_rttm=ref_rttm, ref_json=ref_json)
return {
"speakers": speakers,
"segments": all_segments, # Return combined segments
"evaluation": eval_results
}
def evaluate(self, output_dir, base_name, ref_rttm=None, ref_json=None):
# Add output_dir validation
if not output_dir or not base_name:
return None
results = {}
hyp_rttm = os.path.join(output_dir, f"{base_name}.rttm")
hyp_json = os.path.join(output_dir, f"{base_name}_merged_transcription.json")
if ref_rttm and os.path.exists(hyp_rttm):
            def load_rttm(path):
                ann = Annotation()
                with open(path) as f:
                    for line in f:
                        if line.startswith("SPEAKER"):
                            p = line.split()
                            start, dur, spk = float(p[3]), float(p[4]), p[7]
                            ann[Segment(start, start + dur)] = spk
                return ann
der_score = DiarizationErrorRate()(load_rttm(ref_rttm), load_rttm(hyp_rttm))
results["DER"] = round(der_score * 100, 2)
if ref_json and os.path.exists(hyp_json):
            def load_words_from_hypothesis(path):
                """Load text from this pipeline's output (segments carry a 'text' field)"""
                with open(path) as f:
                    data = json.load(f)
                # Exclude NSE events so WER is computed on speech only
                speech_segments = [seg for seg in data if seg.get("speaker") != "NSE"]
                return " ".join(seg["text"] for seg in speech_segments)
            def load_words_from_reference(path):
                """Load text from the reference file (segments carry a 'tokens' field)"""
                with open(path) as f:
                    data = json.load(f)
                # Exclude NSE events so WER is computed on speech only
                speech_segments = [seg for seg in data if seg.get("speaker") != "NSE"]
                return " ".join(tok["text"] for seg in speech_segments for tok in seg["tokens"])
# Use appropriate loader for each file
ref_text = load_words_from_reference(ref_json)
hyp_text = load_words_from_hypothesis(hyp_json)
transform = Compose([ToLowerCase(), RemovePunctuation(),
RemoveMultipleSpaces(), Strip()])
results["WER_raw"] = round(wer(ref_text, hyp_text), 4)
results["WER_normalized"] = round(wer(transform(ref_text), transform(hyp_text)), 4)
return results if results else None
def __call__(self, inputs, nse_events=None):
"""FIXED: Add proper temporary file cleanup"""
if isinstance(inputs, dict):
if "audio_bytes" in inputs:
audio_bytes = inputs["audio_bytes"]
elif "audio" in inputs:
audio_bytes = inputs["audio"]
else:
raise ValueError("No audio found in inputs")
else:
audio_bytes = inputs
tmp_path = None
try:
# Create temporary file for processing
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
tmp.write(audio_bytes)
tmp_path = tmp.name
# Run pipeline with NSE events
result = self.run_pipeline(tmp_path, nse_events=nse_events)
return result
finally:
# Always clean up temporary file
if tmp_path and os.path.exists(tmp_path):
os.unlink(tmp_path)
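

# Illustrative usage sketch (assumptions: a valid Hugging Face token and a local
# WAV file; the token, paths, and names below are placeholders, not part of the
# pipeline itself).
if __name__ == "__main__":
    asr = ASR_Diarization(HF_TOKEN="hf_...")  # placeholder token
    result = asr.run_pipeline(
        "example.wav",          # placeholder input path
        output_dir="outputs",   # writes example.rttm and example_merged_transcription.json
        base_name="example",
    )
    print("Speakers:", result["speakers"])
    for seg in result["segments"]:
        print(f"[{seg['start']:.2f}-{seg['end']:.2f}] {seg['speaker']}: {seg['text']}")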