ASR / src /eval /verify_data_quality.py
MihirRPatil's picture
deploy: CDAC ASR backend with pitch/stress fix and LLM feedback
88a679b
Raw
History Blame Contribute Delete
4.16 kB
import os
import argparse
import numpy as np
import librosa
from tqdm import tqdm
from src.utils.audio_utils import AudioPreprocessor
from evaluate_indian_accent import find_audio
def parse_args():
    """Parse the command-line options for the audio quality check.

    Options:
        --dataset_dir: directory holding the dataset splits
            (default: "indian-accent-dataset/audio").
        --limit: integer cap on how many audio files are analyzed
            (default: 100).

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    cli = argparse.ArgumentParser(
        description="Check dataset audio quality and VAD behavior",
    )
    cli.add_argument(
        "--dataset_dir",
        default="indian-accent-dataset/audio",
        help="Path to Kaggle dataset splits",
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=100,
        help="Number of audio files to analyze (default: 100)",
    )
    return cli.parse_args()
def main():
    """Scan a sample of dataset audio files and print a quality report.

    For each split ("train", "test", "dev") found under ``--dataset_dir``
    (either inside an ``audio`` subfolder or directly), every speaker
    sub-directory is passed to ``find_audio`` and any path it returns is
    queued. Up to ``--limit`` queued files are then loaded at their native
    sample rate, checked for NaN samples, resampled to 16 kHz when needed,
    and run through ``AudioPreprocessor.preprocess`` (the step this script
    reports as "VAD trim"). Any exception raised while processing a file is
    counted as a read error and the scan continues with the next file.

    The report (counts, sample-rate distribution, duration statistics) is
    printed to stdout.

    Returns:
        None. Returns early, after printing an error, when the dataset
        directory does not exist.
    """
    args = parse_args()
    if not os.path.exists(args.dataset_dir):
        print(f"❌ Error: Dataset directory '{args.dataset_dir}' not found.")
        return

    target_sr = 16000
    preprocessor = AudioPreprocessor(sr=target_sr)

    # Track statistics
    total_scanned = 0
    read_errors = 0
    resample_occurred = 0
    empty_after_vad = 0
    nan_occurred = 0
    original_sample_rates = []
    durations_orig = []
    durations_trimmed = []

    splits = ["train", "test", "dev"]
    audio_paths = []
    for split in splits:
        # Accept both <dataset_dir>/audio/<split> and <dataset_dir>/<split>.
        split_dir = os.path.join(args.dataset_dir, "audio", split)
        if not os.path.exists(split_dir):
            split_dir = os.path.join(args.dataset_dir, split)
        if not os.path.exists(split_dir):
            continue

        # os.listdir() returns entries in arbitrary order; sort them so the
        # "first N files" sample is the same on every run.
        for entry in sorted(os.listdir(split_dir)):
            speaker_dir = os.path.join(split_dir, entry)
            if not os.path.isdir(speaker_dir):
                continue
            audio_path = find_audio(speaker_dir)
            if audio_path:
                audio_paths.append(audio_path)

    print(f"📄 Found {len(audio_paths)} audio files. Analyzing first {args.limit}...")

    for audio_path in tqdm(audio_paths[:args.limit]):
        total_scanned += 1
        try:
            # 1. Try reading audio (sr=None keeps the file's native rate)
            audio_array, sr = librosa.load(audio_path, sr=None)
            original_sample_rates.append(sr)

            # Check for NaNs
            if np.isnan(audio_array).any():
                nan_occurred += 1

            durations_orig.append(len(audio_array) / sr)

            # 2. Resample check
            if sr != target_sr:
                resample_occurred += 1
                audio_array = librosa.resample(
                    audio_array, orig_sr=sr, target_sr=target_sr
                )

            # 3. VAD Trimming check
            clean_audio = preprocessor.preprocess(audio_array)
            durations_trimmed.append(len(clean_audio) / target_sr)
            if len(clean_audio) == 0:
                empty_after_vad += 1
        except Exception as e:
            # Best-effort scan: record the failure and move on to the next file.
            read_errors += 1
            print(f"❌ Error reading {audio_path}: {e}")

    def share(count):
        # Fraction of scanned files; 0.0 when nothing was scanned so the
        # report does not crash with ZeroDivisionError on an empty dataset.
        return count / total_scanned if total_scanned else 0.0

    # Print report
    print("\n" + "="*50)
    print(" AUDIO DATA QUALITY REPORT")
    print("="*50)
    print(f"Total Files Scanned: {total_scanned}")
    print(f"Read Errors (MP3 backend): {read_errors} ({share(read_errors):.2%})")
    print(f"Resampling Needed (!=16kHz): {resample_occurred} ({share(resample_occurred):.2%})")
    print(f"Empty after VAD Trim: {empty_after_vad} ({share(empty_after_vad):.2%})")
    print(f"NaN Values Detected: {nan_occurred}")

    if original_sample_rates:
        print("\n📈 Sample Rate Distribution:")
        rates, counts = np.unique(original_sample_rates, return_counts=True)
        for r, c in zip(rates, counts):
            print(f" {r} Hz: {c} files")

    if durations_orig:
        print("\n⏱️ Duration Statistics (Seconds):")
        print(f" Original Duration: Mean={np.mean(durations_orig):.2f}s, Max={np.max(durations_orig):.2f}s, Min={np.min(durations_orig):.2f}s")
        # A file can load fine yet fail during resampling/preprocessing, so
        # the trimmed list may be empty even when the original list is not;
        # np.max/np.min raise ValueError on an empty sequence.
        if durations_trimmed:
            print(f" VAD Trimmed Duration: Mean={np.mean(durations_trimmed):.2f}s, Max={np.max(durations_trimmed):.2f}s, Min={np.min(durations_trimmed):.2f}s")
    print("="*50)
# Script entry point: run the quality check only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()