# new-asr-vox / app.py
# kasimali's picture
# Upload folder using huggingface_hub
# 0f5e1cb verified
# NEW-ASR-VOX
# ==============================================================================
# Cell 1: Complete Setup - Based on Your Working VoxLingua Code
# ==============================================================================
import os, re, glob, csv
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from speechbrain.inference.classifiers import EncoderClassifier
from speechbrain.pretrained.interfaces import foreign_class
import torchaudio
import warnings
warnings.filterwarnings('ignore')
# Prefer the GPU when CUDA is usable; every model below is loaded onto this device.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")
# ==============================================================================
# Cell 2: Load Multiple Language Detection Models for Ensemble
# ==============================================================================
print("πŸ”„ Loading Multiple Language Detection Models...")

# Model 1: VoxLingua107 ECAPA-TDNN (baseline classifier).
# hybrid_language_detection() scales this model's confidence by 0.60.
voxlingua_model = None
try:
    print("Loading VoxLingua107 ECAPA-TDNN...")
    voxlingua_model = EncoderClassifier.from_hparams(
        source="speechbrain/lang-id-voxlingua107-ecapa",
        savedir="pretrained_models/langid_voxlingua107_ecapa",
        run_opts={"device": device},
    )
    print("βœ… VoxLingua107 loaded successfully")
except Exception as load_error:
    # Best effort: a model that cannot be loaded stays None and is skipped later.
    print(f"❌ VoxLingua107 failed: {load_error}")

# Model 2: XLS-R language ID, loaded through a custom SpeechBrain interface.
# hybrid_language_detection() scales this model's confidence by 0.40.
xlsr_lid_model = None
try:
    print("Loading TalTechNLP XLS-R Language ID...")
    xlsr_lid_model = foreign_class(
        source="TalTechNLP/voxlingua107-xls-r-300m-wav2vec",
        pymodule_file="encoder_wav2vec_classifier.py",
        classname="EncoderWav2vecClassifier",
        hparams_file="inference_wav2vec.yaml",
        savedir="pretrained_models/xlsr_voxlingua",
        run_opts={"device": device},
    )
    print("βœ… XLS-R Language ID loaded successfully")
except Exception as load_error:
    print(f"❌ XLS-R failed: {load_error}")

# Number of models that are actually usable (0, 1 or 2).
models_loaded = len(
    [model for model in (voxlingua_model, xlsr_lid_model) if model is not None]
)
print(f"\nπŸ“Š Models loaded: {models_loaded}/2")
# ==============================================================================
# Cell 3: Complete Language Mappings from Your Dataset
# ==============================================================================
# All languages from your dataset (based on the accuracy table you showed)
# Every language code that occurs as ground truth in the dataset.
DATASET_LANGUAGES = {
    'ur', 'pa', 'hi', 'bn', 'ne', 'as', 'ks', 'mr', 'gu', 'or',  # Indo-Aryan
    'ta', 'te', 'kn', 'ml',                                      # Dravidian
    'sd', 'kok', 'br', 'doi', 'sat', 'mni',                      # low-resource
    'sa',                                                        # Sanskrit
}

# Family buckets used for the per-family accuracy report.
INDO_ARYAN_LANGS = {'ur', 'pa', 'hi', 'bn', 'ne', 'as', 'ks', 'mr', 'gu', 'or', 'sd'}
DRAVIDIAN_LANGS = {'ta', 'te', 'kn', 'ml'}
LOW_RESOURCE_LANGS = {'kok', 'br', 'doi', 'sat', 'mni'}
OTHER_LANGS = {'sa'}  # Sanskrit
ALL_SUPPORTED_LANGS = set().union(
    INDO_ARYAN_LANGS, DRAVIDIAN_LANGS, LOW_RESOURCE_LANGS, OTHER_LANGS
)

# Low-resource language -> higher-resource language used as a transfer target.
TRANSFER_MAPPINGS = dict(
    br='hi',    # Bodo -> Hindi
    sat='hi',   # Santali -> Hindi
    doi='pa',   # Dogri -> Punjabi
    mni='bn',   # Manipuri -> Bengali
    kok='mr',   # Konkani -> Marathi
    sd='hi',    # Sindhi -> Hindi
)

# (dataset code, labels a model may emit for it), in lookup-table order.
_ALIASES_BY_CODE = (
    ('ur', ('urd', 'urdu')),
    ('pa', ('pan', 'punjabi', 'pnb')),
    ('hi', ('hin', 'hindi')),
    ('bn', ('ben', 'bengali')),
    ('ne', ('nep', 'nepali')),
    ('as', ('asm', 'assamese')),
    ('ks', ('kas', 'kashmiri')),
    ('mr', ('mar', 'marathi')),
    ('gu', ('guj', 'gujarati')),
    ('or', ('ori', 'odia', 'ory')),
    ('ta', ('tam', 'tamil')),
    ('te', ('tel', 'telugu')),
    ('kn', ('kan', 'kannada')),
    ('ml', ('mal', 'malayalam')),
    ('sd', ('sin', 'sindhi', 'snd')),
    ('kok', ('kok', 'konkani')),
    ('sa', ('san', 'sanskrit')),
    # Closely related varieties that are folded into Hindi.
    ('hi', ('bho', 'mai', 'mag')),
)

# Flat alias -> dataset code table consulted by map_to_dataset_language().
VOXLINGUA_TO_DATASET = {
    alias: code for code, aliases in _ALIASES_BY_CODE for alias in aliases
}

print("βœ… Complete language mappings loaded")
print(f"πŸ“Š Total dataset languages: {len(ALL_SUPPORTED_LANGS)}")
print(f"πŸ“Š Mapping variations: {len(VOXLINGUA_TO_DATASET)}")
# ==============================================================================
# Cell 4: Enhanced Parsing Functions (Your Working Code + Improvements)
# ==============================================================================
def parse_top1(out):
    """Extract the top-1 prediction from a VoxLingua107 classifier result.

    Args:
        out: 4-tuple unpacked as ``(logits, log_conf, pred_idx, labels)``.
            Only ``log_conf`` (a single-element tensor) and ``labels`` are
            read. ``labels`` should be a non-empty list/tuple; anything else
            yields the label ``"unknown"``.

    Returns:
        ``(iso, label_str, conf)``: the text before the first ``":"`` of the
        label (or the whole label when there is no colon), the full label
        string, and ``exp(log_conf)`` as a Python float.
    """
    _, log_conf, _, labels = out

    has_label = isinstance(labels, (list, tuple)) and len(labels) > 0
    label_str = labels[0] if has_label else "unknown"
    if not isinstance(label_str, str):
        label_str = str(label_str)

    # partition() hands back the whole string as the head when no colon exists.
    iso = label_str.partition(":")[0].strip()
    conf = float(log_conf.exp().item())
    return iso, label_str, conf
def parse_xlsr_output(out):
    """Extract ``(language_code, confidence)`` from an XLS-R classifier result.

    Args:
        out: 4-tuple unpacked as ``(out_prob, score, index, text_lab)``.

    Returns:
        The first entry of ``text_lab`` stripped and lower-cased, together
        with ``max(exp(out_prob))`` as a float. Any failure while unpacking or
        reading the values is reported on stdout and yields
        ``("unknown", 0.0)``.
    """
    try:
        scores, _score, _index, text_lab = out
        code = str(text_lab[0]).strip().lower()
        peak = float(scores.exp().max().item())
    except Exception as parse_error:
        print(f" XLS-R parsing error: {parse_error}")
        return "unknown", 0.0
    return code, peak
def map_to_dataset_language(detected_lang):
    """Map a model label (VoxLingua / XLS-R) onto the dataset's language codes.

    The label is stripped and lower-cased before every lookup, so variants
    such as ``"HI"`` or ``" hi "`` resolve the same way as ``"hi"``. Previously
    only the alias lookup was case-insensitive, and unmapped labels were
    returned un-normalised.

    Args:
        detected_lang: Language label string produced by a model.

    Returns:
        The dataset code when the label is a supported code or a known alias;
        otherwise the normalised label itself, so callers can still report it.
    """
    code = detected_lang.strip().lower()

    # Direct match first.
    if code in ALL_SUPPORTED_LANGS:
        return code

    # Known aliases (ISO 639-3 codes, English names, related varieties).
    mapped = VOXLINGUA_TO_DATASET.get(code, code)

    # Transfer fallback for codes outside the supported set. With the tables
    # defined in this file every TRANSFER_MAPPINGS key is itself a supported
    # code, so this branch only fires if those tables are changed.
    if mapped not in ALL_SUPPORTED_LANGS and mapped in TRANSFER_MAPPINGS:
        transfer_target = TRANSFER_MAPPINGS[mapped]
        print(f" Transfer mapping: {mapped} β†’ {transfer_target}")
        return transfer_target

    return mapped


print("βœ… Enhanced parsing functions ready")
# ==============================================================================
# Cell 5: Hybrid Multi-Model Language Detection
# ==============================================================================
def hybrid_language_detection(audio_path):
    """Detect the language of one audio file with the two-model ensemble.

    Each loaded model classifies the file; its label is mapped to a dataset
    code. If both models agree, that language is returned with the
    weight-averaged confidence. Otherwise the model with the highest
    weight * confidence wins and its own (unweighted) confidence is returned.

    The consensus confidence used to be the plain mean of the already
    weighted scores, i.e. half of the weighted average, so it could never
    exceed 0.5. It is now normalised by the sum of the weights.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        ``(language_code, confidence)``; ``("unknown", 0.0)`` when no model
        produced a prediction. Per-model failures are printed and skipped.
    """
    # Ensemble weights: VoxLingua107 is the primary model, XLS-R the secondary.
    weights = {'voxlingua': 0.60, 'xlsr': 0.40}

    print(f" 🎧 Analyzing: {os.path.basename(audio_path)}")
    predictions = {}
    confidences = {}  # raw, unweighted confidence per model

    # Model 1: VoxLingua107
    if voxlingua_model is not None:
        try:
            out = voxlingua_model.classify_file(audio_path)
            pred_iso, _pred_label, conf = parse_top1(out)
            mapped_lang = map_to_dataset_language(pred_iso)
            predictions['voxlingua'] = mapped_lang
            confidences['voxlingua'] = conf
            print(f" VoxLingua107: {pred_iso} β†’ {mapped_lang} ({conf:.3f})")
        except Exception as e:
            print(f" VoxLingua107 error: {e}")

    # Model 2: XLS-R
    if xlsr_lid_model is not None:
        try:
            out = xlsr_lid_model.classify_file(audio_path)
            lang_code, conf = parse_xlsr_output(out)
            mapped_lang = map_to_dataset_language(lang_code)
            predictions['xlsr'] = mapped_lang
            confidences['xlsr'] = conf
            print(f" XLS-R: {lang_code} β†’ {mapped_lang} ({conf:.3f})")
        except Exception as e:
            print(f" XLS-R error: {e}")

    if not predictions:
        return "unknown", 0.0

    # Strategy 1: both models agree -> weighted average of their confidences.
    if len(predictions) >= 2 and predictions['voxlingua'] == predictions['xlsr']:
        consensus_lang = predictions['voxlingua']
        total_weight = sum(weights[name] for name in predictions)
        avg_confidence = (
            sum(weights[name] * confidences[name] for name in predictions)
            / total_weight
        )
        print(f" 🎯 Consensus: {consensus_lang} (confidence: {avg_confidence:.3f})")
        return consensus_lang, avg_confidence

    # Strategy 2: disagreement (or a single model) -> highest weighted score.
    best_model = max(predictions, key=lambda name: weights[name] * confidences[name])
    best_lang = predictions[best_model]
    best_conf = confidences[best_model]
    print(f" 🎯 Best model ({best_model}): {best_lang} (confidence: {best_conf:.3f})")
    return best_lang, best_conf


print("βœ… Hybrid ensemble language detection ready")
# ==============================================================================
# Cell 6: Complete Ground Truth Extraction for Your Dataset
# ==============================================================================
# Separators between tokens of a file name ("hi_001.wav", "sample-ta.flac").
_GT_TOKEN_SPLIT = re.compile(r'[._-]+')

# File-name token -> dataset code, for tokens that are not already a code.
_GT_FILENAME_ALIASES = {
    "guf": "gu", "mrt": "mr", "ml": "ml",
    "urd": "ur", "urdu": "ur",
    "pan": "pa", "punjabi": "pa", "pnb": "pa",
    "hin": "hi", "hindi": "hi",
    "ben": "bn", "bengali": "bn", "bng": "bn",
    "nep": "ne", "nepali": "ne",
    "asm": "as", "assamese": "as",
    "kas": "ks", "kashmiri": "ks",
    "mar": "mr", "marathi": "mr",
    "guj": "gu", "gujarati": "gu",
    "ori": "or", "odia": "or", "ory": "or",
    "tam": "ta", "tamil": "ta",
    "tel": "te", "telugu": "te",
    "kan": "kn", "kannada": "kn",
    "mal": "ml", "malayalam": "ml",
    "sin": "sd", "sindhi": "sd", "snd": "sd",
    "kok": "kok", "konkani": "kok",
    "bod": "br", "bodo": "br",
    "dog": "doi", "dogri": "doi",
    "sat": "sat", "santali": "sat",
    "mni": "mni", "manipuri": "mni",
    "san": "sa", "sanskrit": "sa",
}

# Language names that may appear inside a directory name.
_GT_FOLDER_NAMES = (
    ('gujarati', 'gu'), ('marathi', 'mr'), ('hindi', 'hi'),
    ('bengali', 'bn'), ('tamil', 'ta'), ('telugu', 'te'),
    ('kannada', 'kn'), ('malayalam', 'ml'), ('punjabi', 'pa'),
    ('urdu', 'ur'), ('assamese', 'as'), ('odia', 'or'),
    ('nepali', 'ne'), ('kashmiri', 'ks'), ('sindhi', 'sd'),
    ('konkani', 'kok'), ('bodo', 'br'), ('dogri', 'doi'),
    ('santali', 'sat'), ('manipuri', 'mni'), ('sanskrit', 'sa'),
)


def gt_from_filename(path):
    """Derive the ground-truth language code of an audio file from its path.

    Two strategies are tried in order:

    1. The file name is split on ``_``, ``-`` and ``.``; the first token that
       is a supported code or a known alias wins. Every token is inspected.
       Previously only the first 2-4 letter token was, so names such as
       ``"001_hi.wav"`` or ``"test_hi_001.wav"`` and the long aliases
       (``"hindi"``, ``"punjabi"``, ...) could never match.
    2. Each ``/``-separated path component is checked for an exact code or a
       contained language name.

    Args:
        path: File path (only the text is inspected; the file is not opened).

    Returns:
        A code from ``ALL_SUPPORTED_LANGS``, or ``None`` if nothing matched.
    """
    name = os.path.basename(path).lower()

    # Pattern 1: language token inside the file name.
    for token in _GT_TOKEN_SPLIT.split(name):
        code = _GT_FILENAME_ALIASES.get(token, token)
        if code in ALL_SUPPORTED_LANGS:
            return code

    # Pattern 2: language code or language name in the folder structure.
    for part in path.split('/'):
        part_lower = part.lower()
        if part_lower in ALL_SUPPORTED_LANGS:
            return part_lower
        for full_name, code in _GT_FOLDER_NAMES:
            if full_name in part_lower:
                return code

    return None


print("βœ… Complete ground truth extraction ready")
# ==============================================================================
# Cell 7: Google Drive Processing with Error Handling
# ==============================================================================
def download_and_process_drive_dataset():
    """Interactively download a shared Google Drive folder and pick test files.

    Prompts for a sharing link, wipes and recreates ``/content/drive_dataset``,
    downloads the folder with ``gdown`` and walks it for audio files. Files
    whose path looks English are dropped, at most five files are kept per
    detected language, and files without a detectable language are kept while
    fewer than 50 files have been selected.

    Returns:
        List of selected audio file paths; an empty list when no link was
        given, no ID could be extracted, or the download failed.
    """
    print("πŸ“ Processing Google Drive dataset...")

    share_link = input("πŸ”— Enter Google Drive sharing link: ").strip()
    if not share_link:
        print("❌ No link provided")
        return []

    # The first pattern that matches provides the Drive ID.
    id_patterns = (
        r'/folders/([a-zA-Z0-9-_]+)',
        r'id=([a-zA-Z0-9-_]+)',
        r'/file/d/([a-zA-Z0-9-_]+)',
    )
    id_matches = (re.search(pattern, share_link) for pattern in id_patterns)
    file_id = next((hit.group(1) for hit in id_matches if hit), None)
    if not file_id:
        print("❌ Could not extract file ID from sharing link")
        return []

    # Start from an empty download directory on every run.
    download_dir = "/content/drive_dataset"
    if os.path.exists(download_dir):
        import shutil
        shutil.rmtree(download_dir)
    os.makedirs(download_dir, exist_ok=True)

    try:
        import gdown
        print(f"πŸ“₯ Downloading from Google Drive (ID: {file_id})...")
        gdown.download_folder(
            f"https://drive.google.com/drive/folders/{file_id}",
            output=download_dir,
            quiet=False,
            use_cookies=False,
        )
        print("βœ… Download completed successfully")
    except Exception as download_error:
        print(f"❌ Download failed: {download_error}")
        print("πŸ’‘ Make sure the folder is shared with 'Anyone with the link can view'")
        return []

    VALID_EXTS = {".wav", ".mp3", ".flac", ".m4a", ".ogg"}

    print("πŸ” Scanning for audio files...")
    all_files = [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(download_dir)
        for entry in entries
        if os.path.splitext(entry)[1].lower() in VALID_EXTS
    ]
    print(f"πŸ“Š Found {len(all_files)} total audio files")

    english_markers = ('english', '_en_', '/en/', 'eng_', '_eng')
    filtered_files = []
    lang_counts = {}
    english_skipped = 0
    for file_path in all_files:
        lowered = file_path.lower()
        if any(marker in lowered for marker in english_markers):
            english_skipped += 1
            continue

        gt_lang = gt_from_filename(file_path)
        if not gt_lang:
            # No language marker: keep while the overall selection is small.
            if len(filtered_files) < 50:
                filtered_files.append(file_path)
            continue

        kept_so_far = lang_counts.setdefault(gt_lang, 0)
        if kept_so_far < 5:  # at most five files per language
            filtered_files.append(file_path)
            lang_counts[gt_lang] = kept_so_far + 1

    print(f"πŸ“Š Filtered results:")
    print(f" English files skipped: {english_skipped}")
    print(f" Selected for processing: {len(filtered_files)}")
    for lang, count in sorted(lang_counts.items()):
        print(f" {lang}: {count} files")
    return filtered_files
# Run the interactive download/filter step; `test_files` is the input of the analysis cells below.
test_files = download_and_process_drive_dataset()
print(f"\n🎯 Total files ready for language detection: {len(test_files)}")
# ==============================================================================
# Cell 8: Execute Language Detection Analysis
# ==============================================================================
def run_language_detection_analysis(audio_files):
    """Run the hybrid detector over ``audio_files`` and collect per-file results.

    Args:
        audio_files: Iterable of audio file paths (may be empty or ``None``).

    Returns:
        A list with one dict per file containing ``file``, ``full_path``,
        ``gt_iso`` (``""`` when unknown), ``pred_iso``, ``confidence`` and
        ``correct`` (``None`` when there is no ground truth). An empty list
        is returned for empty input; it used to be ``None``, which broke
        callers that take ``len()`` of the result.
    """
    if not audio_files:
        print("❌ No audio files to process")
        return []

    print(f"πŸš€ Starting language detection on {len(audio_files)} files...")
    print("=" * 60)

    results = []
    for i, audio_path in enumerate(audio_files, 1):
        print(f"\n[{i}/{len(audio_files)}] Processing: {os.path.basename(audio_path)}")
        # Resolved before the try so that a failed detection still carries its
        # ground truth and is counted as a miss instead of being dropped.
        gt_iso = None
        try:
            gt_iso = gt_from_filename(audio_path)
            pred_iso, confidence = hybrid_language_detection(audio_path)
            is_correct = (gt_iso == pred_iso) if gt_iso else None

            results.append({
                "file": os.path.basename(audio_path),
                "full_path": audio_path,
                "gt_iso": gt_iso if gt_iso else "",
                "pred_iso": pred_iso,
                "confidence": confidence,
                "correct": is_correct,
            })

            status = "βœ…" if is_correct else "❌" if is_correct is False else "❓"
            print(f" {status} GT: {gt_iso or 'Unknown'} | Pred: {pred_iso} | Conf: {confidence:.3f}")
        except Exception as e:
            print(f" πŸ’₯ Error processing file: {e}")
            results.append({
                "file": os.path.basename(audio_path),
                "full_path": audio_path,
                "gt_iso": gt_iso if gt_iso else "",
                "pred_iso": "error",
                "confidence": 0.0,
                "correct": False,
            })
    return results
# Run the analysis. `or []` keeps the summary line below working when the
# runner hands back nothing for an empty file list.
analysis_results = run_language_detection_analysis(test_files) or []
print(f"\nπŸŽ‰ Language detection analysis complete!")
print(f"πŸ“Š Total results: {len(analysis_results)}")
# ==============================================================================
# Cell 9: Complete Results Analysis and Accuracy Report
# ==============================================================================
def generate_comprehensive_analysis(results):
    """Print the accuracy report for ensemble results and return the data.

    Only rows whose ``gt_iso`` is a supported dataset language are evaluated.
    The report covers overall accuracy, a per-language table, per-family
    accuracy, model status, the most frequent confusions and summary counts.

    Args:
        results: List of per-file dicts with at least ``gt_iso``,
            ``pred_iso``, ``confidence`` and ``correct``.

    Returns:
        ``(valid_df, lang_stats)``: the evaluated rows as a DataFrame and a
        list of per-language dicts (``code``, ``name``, ``files``,
        ``accuracy``, ``confidence``). When nothing can be evaluated an empty
        frame and an empty list are returned. It used to return ``None``,
        which the caller cannot tuple-unpack.
    """
    df = pd.DataFrame(results)
    # An empty result list produces a frame without columns.
    if df.empty or "gt_iso" not in df.columns:
        print("❌ No valid ground truth files found")
        return df, []

    valid_df = df[(df["gt_iso"] != "") & (df["gt_iso"].isin(ALL_SUPPORTED_LANGS))].copy()
    if len(valid_df) == 0:
        print("❌ No valid ground truth files found")
        return valid_df, []

    print("πŸ“Š COMPREHENSIVE LANGUAGE DETECTION ANALYSIS")
    print("=" * 60)

    overall_acc = accuracy_score(valid_df["gt_iso"], valid_df["pred_iso"])
    print(f"🎯 OVERALL ACCURACY: {overall_acc:.4f} ({overall_acc*100:.1f}%)")

    print(f"\nπŸ“Š LANGUAGE-WISE ACCURACY:")
    print("-" * 60)
    print("Code | Language Name | Files | Top-1 | Top-5 | Conf")
    print("-" * 60)

    LANG_NAMES = {
        'ur': 'Urdu', 'pa': 'Punjabi', 'ta': 'Tamil', 'sd': 'Sindhi',
        'or': 'Odia', 'ml': 'Malayalam', 'ne': 'Nepali', 'as': 'Assamese',
        'hi': 'Hindi', 'bn': 'Bengali', 'kok': 'Konkani', 'kn': 'Kannada',
        'ks': 'Kashmiri', 'mr': 'Marathi', 'te': 'Telugu', 'br': 'Bodo',
        'doi': 'Dogri', 'sat': 'Santali', 'gu': 'Gujarati', 'mai': 'Maithili',
        'mni': 'Manipuri', 'sa': 'Sanskrit'
    }

    # Per-language statistics.
    lang_stats = []
    for lang_code in sorted(valid_df["gt_iso"].unique()):
        lang_data = valid_df[valid_df["gt_iso"] == lang_code]
        total_files = len(lang_data)
        correct_pred = (lang_data["gt_iso"] == lang_data["pred_iso"]).sum()
        accuracy = correct_pred / total_files
        avg_conf = lang_data["confidence"].mean()
        lang_name = LANG_NAMES.get(lang_code, lang_code.title())
        # The ensemble yields a single prediction per file, so the Top-5
        # column repeats the Top-1 accuracy.
        print(f"{lang_code:>3s} | {lang_name:<15s} | {total_files:>5d} | {accuracy*100:>5.1f}% | {accuracy*100:>5.1f}% | {avg_conf:>5.3f}")
        lang_stats.append({
            'code': lang_code,
            'name': lang_name,
            'files': total_files,
            'accuracy': accuracy,
            'confidence': avg_conf
        })
    print("-" * 60)

    # Per-family accuracy, listed in order of first appearance.
    print(f"\nπŸ“Š LANGUAGE FAMILY PERFORMANCE:")
    print("-" * 40)
    family_stats = {}
    for lang, correct in zip(valid_df['gt_iso'], valid_df['correct']):
        if lang in INDO_ARYAN_LANGS:
            family = 'Indo-Aryan'
        elif lang in DRAVIDIAN_LANGS:
            family = 'Dravidian'
        elif lang in LOW_RESOURCE_LANGS:
            family = 'Low-Resource'
        else:
            family = 'Other'
        bucket = family_stats.setdefault(family, {'correct': 0, 'total': 0})
        bucket['total'] += 1
        if correct:
            bucket['correct'] += 1
    for family, stats in family_stats.items():
        acc_pct = (stats['correct'] / stats['total']) * 100
        print(f"{family:<15s}: {acc_pct:>5.1f}% ({stats['correct']:>2d}/{stats['total']:>2d})")

    # Which models contributed.
    print(f"\nπŸ“Š MODEL PERFORMANCE:")
    print("-" * 30)
    print(f"Models loaded: {models_loaded}/2")
    print(f"VoxLingua107: {'βœ… Active' if voxlingua_model else '❌ Failed'}")
    print(f"XLS-R: {'βœ… Active' if xlsr_lid_model else '❌ Failed'}")

    # Most frequent wrong predictions per true language.
    errors = valid_df[valid_df["gt_iso"] != valid_df["pred_iso"]]
    if len(errors) > 0:
        print(f"\n❌ MISCLASSIFICATION ANALYSIS ({len(errors)} errors):")
        print("-" * 50)
        for actual_lang in sorted(errors["gt_iso"].unique()):
            lang_errors = errors[errors["gt_iso"] == actual_lang]
            predicted_langs = lang_errors["pred_iso"].value_counts()
            print(f"{actual_lang} ({LANG_NAMES.get(actual_lang, actual_lang)}):")
            for pred_lang, count in predicted_langs.head(3).items():
                print(f" β†’ {pred_lang} ({count} files)")

    print(f"\nπŸ“ˆ SUMMARY STATISTICS:")
    print("-" * 25)
    print(f"Total files processed: {len(df)}")
    print(f"Files with valid GT: {len(valid_df)}")
    print(f"Languages detected: {len(valid_df['pred_iso'].unique())}")
    print(f"Languages in dataset: {len(valid_df['gt_iso'].unique())}")
    print(f"Perfect accuracy: {len([l for l in lang_stats if l['accuracy'] == 1.0])}")
    print(f"Above 90% accuracy: {len([l for l in lang_stats if l['accuracy'] >= 0.9])}")
    print(f"Below 50% accuracy: {len([l for l in lang_stats if l['accuracy'] < 0.5])}")
    return valid_df, lang_stats
# Run the comprehensive analysis, save the evaluated rows as CSV and, when
# running inside Colab, offer the file for download.
if 'analysis_results' in globals() and analysis_results:
    analysis_output = generate_comprehensive_analysis(analysis_results)
    # Nothing to unpack when the report could not evaluate any file.
    if analysis_output is None:
        final_df, language_statistics = None, []
    else:
        final_df, language_statistics = analysis_output

    if final_df is not None and len(final_df) > 0:
        timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
        csv_filename = f"language_detection_results_{timestamp}.csv"
        final_df.to_csv(csv_filename, index=False)
        print(f"\nπŸ’Ύ Results saved to: {csv_filename}")
        try:
            from google.colab import files
            # Importing alone does not download anything; trigger it explicitly.
            files.download(csv_filename)
            print("πŸ“₯ File downloaded successfully")
        except Exception:
            # Outside Colab (or if the browser download fails) the CSV simply
            # stays on disk.
            print("πŸ“ File saved locally (download failed)")
else:
    print("❌ No analysis results available. Please run the previous cells first.")
print(f"\nβœ… COMPLETE LANGUAGE DETECTION ANALYSIS FINISHED!")
# ==============================================================================
# Independent Model Analysis with Top-5 and Real Confidence Scores
# ==============================================================================
def _decode_class_labels(model, class_indices, top1_labels):
    """Return one text label per class index in the 1-D tensor *class_indices*."""
    # NOTE(review): assumes the SpeechBrain interface exposes its label
    # encoder as `model.hparams.label_encoder` - confirm for custom classes.
    encoder = getattr(getattr(model, "hparams", None), "label_encoder", None)
    if encoder is not None:
        return list(encoder.decode_torch(class_indices))
    # Without an encoder only the top-1 label (returned by the model itself)
    # is known; the remaining ranks are reported by class index.
    labels = [f"idx_{i}" for i in class_indices.tolist()]
    if labels and len(top1_labels) > 0:
        labels[0] = top1_labels[0]
    return labels


def _voxlingua_label_to_code(label):
    """Take the text before the first colon of a VoxLingua107 label."""
    if isinstance(label, str):
        return label.partition(":")[0].strip()
    return str(label)


def _xlsr_label_to_code(label):
    """Normalise an XLS-R label to a lower-case code."""
    return str(label).strip().lower()


def _top5_for_model(model, audio_path, gt_iso, label_to_code):
    """Classify one file with one model; return (detail dict, summary row)."""
    scores, _score, _index, top1_labels = model.classify_file(audio_path)
    flat = scores.squeeze()
    probs = torch.softmax(flat, dim=0)
    top_indices = torch.topk(flat, min(5, flat.numel())).indices
    labels = _decode_class_labels(model, top_indices, top1_labels)

    top5 = []
    for rank, (class_idx, label) in enumerate(zip(top_indices.tolist(), labels), 1):
        lang_code = label_to_code(label)
        mapped_lang = map_to_dataset_language(lang_code)
        prob = probs[class_idx].item()
        in_dataset = mapped_lang in ALL_SUPPORTED_LANGS
        top5.append({
            'rank': rank,
            'original_code': lang_code,
            'mapped_code': mapped_lang,
            'confidence': prob,
            'in_dataset': in_dataset
        })
        print(f" Rank {rank}: {lang_code} β†’ {mapped_lang} ({prob:.4f}) {'βœ…' if in_dataset else '❌'}")

    best = top5[0]
    hit_top1 = (gt_iso == best['mapped_code']) if gt_iso else None
    hit_top5 = any(pred['mapped_code'] == gt_iso for pred in top5) if gt_iso else None
    detail = {
        'top5': top5,
        'top1_original': best['original_code'],
        'top1_mapped': best['mapped_code'],
        'top1_confidence': best['confidence'],
        'correct_in_top1': hit_top1,
        'correct_in_top5': hit_top5
    }
    row = {
        'file': os.path.basename(audio_path),
        'gt_iso': gt_iso or '',
        'pred_iso': best['mapped_code'],
        'confidence': best['confidence'],
        'correct': hit_top1,
        'top5_predictions': [pred['mapped_code'] for pred in top5]
    }
    return detail, row


def analyze_models_independently(audio_files):
    """Evaluate each loaded model on its own, recording top-5 predictions.

    For every file, each available model is run separately. The five highest
    scoring classes are converted to softmax probabilities, decoded to labels
    and mapped to dataset codes. Labels for all five ranks come from the
    model's label encoder. Previously the class index was used to index the
    model's top-1 label list, which only holds the single predicted label.

    Args:
        audio_files: Iterable of audio file paths.

    Returns:
        Dict with ``'voxlingua'`` and ``'xlsr'`` (one summary row per file and
        model) and ``'combined_analysis'`` (one detail dict per file). A
        model failure is printed and stored as ``{'error': ...}`` for that
        file.
    """
    print("πŸ” INDEPENDENT MODEL ANALYSIS")
    print("=" * 60)
    results = {
        'voxlingua': [],
        'xlsr': [],
        'combined_analysis': []
    }

    for i, audio_path in enumerate(audio_files, 1):
        print(f"\n[{i}/{len(audio_files)}] Analyzing: {os.path.basename(audio_path)}")
        gt_iso = gt_from_filename(audio_path)
        print(f" Ground Truth: {gt_iso or 'Unknown'}")
        file_result = {
            'file': os.path.basename(audio_path),
            'gt_iso': gt_iso or '',
            'voxlingua_results': {},
            'xlsr_results': {}
        }

        # (result key, display name, model, label parser)
        model_specs = (
            ('voxlingua', 'VoxLingua107', voxlingua_model, _voxlingua_label_to_code),
            ('xlsr', 'XLS-R', xlsr_lid_model, _xlsr_label_to_code),
        )
        for key, title, model, label_to_code in model_specs:
            if model is None:
                continue
            try:
                print(f" πŸ”¬ {title} Analysis:")
                detail, row = _top5_for_model(model, audio_path, gt_iso, label_to_code)
                file_result[f'{key}_results'] = detail
                results[key].append(row)
            except Exception as e:
                print(f" ❌ {title} error: {e}")
                file_result[f'{key}_results'] = {'error': str(e)}

        results['combined_analysis'].append(file_result)
        print(f" βœ… Analysis complete for {os.path.basename(audio_path)}")
    return results
def _report_single_model(title, rows):
    """Print one model's report; return (accuracy, mean confidence) or None."""
    if not rows:
        return None
    frame = pd.DataFrame(rows)
    valid = frame[frame['gt_iso'] != ''].copy()
    if len(valid) == 0:
        return None

    acc = accuracy_score(valid['gt_iso'], valid['pred_iso'])
    conf_avg = valid['confidence'].mean()
    conf_std = valid['confidence'].std()
    print(f"\nπŸ”¬ {title} INDEPENDENT ANALYSIS:")
    print(f" Files analyzed: {len(valid)}")
    print(f" Top-1 Accuracy: {acc:.4f} ({acc*100:.1f}%)")
    print(f" Avg Confidence: {conf_avg:.4f} Β± {conf_std:.4f}")

    print(f" Per-language performance:")
    # Recompute correctness as a float column: the stored 'correct' values can
    # be mixed with None (object dtype), which groupby-mean does not handle
    # reliably.
    valid['correct'] = (valid['gt_iso'] == valid['pred_iso']).astype(float)
    per_lang = valid.groupby('gt_iso').agg({
        'correct': 'mean',
        'confidence': ['mean', 'count']
    }).round(4)
    per_lang.columns = ['accuracy', 'avg_conf', 'count']
    for lang, row in per_lang.iterrows():
        print(f" {lang}: {row['accuracy']:.3f} ({row['accuracy']*100:.1f}%) - {row['avg_conf']:.3f} conf - {int(row['count'])} files")
    return acc, conf_avg


def generate_independent_model_report(results):
    """Print per-model accuracy reports and a head-to-head comparison.

    Args:
        results: Dict with the row lists ``'voxlingua'`` and ``'xlsr'``; each
            row needs ``gt_iso``, ``pred_iso`` and ``confidence``.

    Returns:
        ``results`` unchanged.

    The comparison and weight suggestion are only printed when both models
    have at least one file with ground truth. Previously that section read
    accuracy variables that were unbound when a model had rows but none with
    ground truth.
    """
    print(f"\nπŸ“Š INDEPENDENT MODEL PERFORMANCE ANALYSIS")
    print("=" * 70)

    vox_metrics = _report_single_model("VoxLingua107", results.get('voxlingua'))
    xlsr_metrics = _report_single_model("XLS-R", results.get('xlsr'))

    if vox_metrics is not None and xlsr_metrics is not None:
        vox_acc, vox_conf_avg = vox_metrics
        xlsr_acc, xlsr_conf_avg = xlsr_metrics
        print(f"\nβš–οΈ MODEL COMPARISON:")
        print(f" VoxLingua107 vs XLS-R:")
        print(f" Accuracy: {vox_acc:.4f} vs {xlsr_acc:.4f} ({'VoxLingua wins' if vox_acc > xlsr_acc else 'XLS-R wins' if xlsr_acc > vox_acc else 'Tie'})")
        print(f" Avg Confidence: {vox_conf_avg:.4f} vs {xlsr_conf_avg:.4f}")

        # Suggest ensemble weights proportional to each model's accuracy.
        total_perf = vox_acc + xlsr_acc
        vox_weight = vox_acc / total_perf if total_perf > 0 else 0.5
        xlsr_weight = xlsr_acc / total_perf if total_perf > 0 else 0.5
        print(f"\nπŸ’‘ SUGGESTED OPTIMAL WEIGHTS:")
        print(f" VoxLingua107: {vox_weight:.2f} ({vox_weight*100:.0f}%)")
        print(f" XLS-R: {xlsr_weight:.2f} ({xlsr_weight*100:.0f}%)")
    return results
# Independent per-model analysis on a small sample of the selected files.
if not globals().get('test_files'):
    print("❌ No test files available. Run the previous cells first.")
else:
    # Only the first ten files are analysed to keep this check quick.
    independent_results = analyze_models_independently(test_files[:10])
    final_report = generate_independent_model_report(independent_results)
# ==============================================================================
# Analyze Already Downloaded Files in /content/drive_dataset/
# ==============================================================================
def scan_downloaded_files(download_dir="/content/drive_dataset"):
    """Collect the audio files below ``download_dir`` and print a summary.

    Args:
        download_dir: Directory to scan. Defaults to the location used by the
            download step, so existing zero-argument calls behave as before.

    Returns:
        List of paths with an audio extension (``.wav``, ``.mp3``, ``.flac``,
        ``.m4a``, ``.ogg``, case-insensitive); an empty list when the
        directory does not exist.

    The printed per-language counts use the first 2-3 letter alphabetic
    folder name *below* ``download_dir``. Components of the base path itself
    (for example ``tmp`` or ``mnt``) are deliberately ignored.
    """
    if not os.path.exists(download_dir):
        print("❌ Download directory not found")
        return []
    print(f"πŸ” Scanning {download_dir} for audio files...")

    valid_exts = {".wav", ".mp3", ".flac", ".m4a", ".ogg"}
    audio_files = []
    lang_counts = {}
    for root, _dirs, files in os.walk(download_dir):
        rel_root = os.path.relpath(root, download_dir)
        rel_parts = [] if rel_root == os.curdir else rel_root.split(os.sep)
        lang_dir = next(
            (part for part in rel_parts if len(part) in (2, 3) and part.isalpha()),
            None,
        )
        for file in files:
            if os.path.splitext(file)[1].lower() not in valid_exts:
                continue
            audio_files.append(os.path.join(root, file))
            if lang_dir is not None:
                lang_counts[lang_dir] = lang_counts.get(lang_dir, 0) + 1

    print(f"πŸ“Š Found {len(audio_files)} audio files:")
    for lang, count in sorted(lang_counts.items()):
        print(f" {lang}: {count} files")

    print(f"\nπŸ“ Sample files:")
    for file_path in audio_files[:5]:
        print(f" {file_path}")
    return audio_files
# Scan for downloaded files; fall back to a recursive glob when the scan
# returns nothing.
downloaded_files = scan_downloaded_files()
if not downloaded_files:
    print("❌ No audio files found. Let me help you collect them manually.")
    print("\nπŸ” Manual file search...")
    import glob

    # The /content/** patterns also cover /content/drive_dataset; duplicates
    # are removed below.
    search_patterns = [
        "/content/drive_dataset/**/*.flac",
        "/content/drive_dataset/**/*.wav",
        "/content/drive_dataset/**/*.mp3",
        "/content/**/*.flac",
        "/content/**/*.wav",
        "/content/**/*.mp3"
    ]
    manual_files = []
    for pattern in search_patterns:
        manual_files.extend(glob.glob(pattern, recursive=True))

    # Deduplicate and sort: a bare set has no stable order, which made the
    # preview and the downstream processing order differ between runs.
    manual_files = sorted(set(manual_files))
    print(f"πŸ“Š Manual search found: {len(manual_files)} files")
    for file_path in manual_files[:10]:  # show first 10
        print(f" {file_path}")
    downloaded_files = manual_files
print(f"\n🎯 Total files ready for analysis: {len(downloaded_files)}")
# ==============================================================================
# Run Independent Analysis on Downloaded Files
# ==============================================================================
def _indep_voxlingua_code(label):
    """Return the language code of a VoxLingua107 label (the text before any ':')."""
    if isinstance(label, str):
        return label.partition(":")[0].strip()
    return str(label)


def _indep_xlsr_code(label):
    """Return an XLS-R label as a stripped, lower-case code."""
    return str(label).strip().lower()


def _indep_rank_predictions(model, audio_path, code_from_label):
    """Classify one file and return up to five predictions, best first.

    Args:
        model: Classifier exposing ``classify_file(path)`` that returns a
            4-tuple whose first item holds the per-class scores and whose last
            item holds text labels.
        audio_path: Path of the audio file to classify.
        code_from_label: Callable turning a raw label into a language code.

    Returns:
        A list of dicts with the keys ``rank``, ``original``, ``mapped``,
        ``confidence`` and ``in_dataset``.
    """
    out_prob, _score, _index, best_labels = model.classify_file(audio_path)
    # Get real confidence scores (not weighted)
    probs = torch.softmax(out_prob.squeeze(), dim=0)
    top_indices = torch.topk(probs, min(5, len(probs))).indices.tolist()

    # Per the SpeechBrain classify_batch documentation, the returned text
    # labels describe only the best class of each batch item, so they cannot be
    # indexed by class id. The label encoder owns the full index -> label table.
    label_encoder = getattr(getattr(model, "hparams", None), "label_encoder", None)
    index_to_label = getattr(label_encoder, "ind2lab", None) or {}
    labels_cover_all_classes = len(best_labels) == probs.numel()

    predictions = []
    for rank, class_idx in enumerate(top_indices, 1):
        if class_idx in index_to_label:
            label = index_to_label[class_idx]
        elif labels_cover_all_classes:
            label = best_labels[class_idx]
        elif rank == 1 and len(best_labels) > 0:
            # Without an encoder only the winning class has a known label.
            label = best_labels[0]
        else:
            label = f"unknown_{class_idx}"

        lang_code = code_from_label(label)
        confidence = probs[class_idx].item()
        # Map to dataset language
        mapped_lang = map_to_dataset_language(lang_code)
        in_dataset = mapped_lang in ALL_SUPPORTED_LANGS
        predictions.append({
            'rank': rank,
            'original': lang_code,
            'mapped': mapped_lang,
            'confidence': confidence,
            'in_dataset': in_dataset
        })
        status = "βœ…" if in_dataset else "❌"
        print(f" #{rank}: {lang_code} β†’ {mapped_lang} ({confidence:.4f}) {status}")
    return predictions


def analyze_downloaded_files_independently(audio_files):
    """Run each loaded model on every file and collect per-file results.

    Args:
        audio_files: Iterable of audio file paths.

    Returns:
        ``None`` when ``audio_files`` is empty; otherwise a dict with the keys
        ``voxlingua_detailed`` and ``xlsr_detailed`` (one row per successfully
        analysed file) and ``comparison_data`` (one entry per file, including
        failures, which carry ``available: False`` and an ``error`` message).
    """
    if not audio_files:
        print("❌ No audio files to analyze")
        return None

    total = len(audio_files)
    print(f"πŸš€ Starting independent analysis on {total} files...")
    print("=" * 70)
    results = {
        'voxlingua_detailed': [],
        'xlsr_detailed': [],
        'comparison_data': []
    }
    # (result key, display name, model, label parser)
    model_specs = (
        ('voxlingua', 'VoxLingua107', voxlingua_model, _indep_voxlingua_code),
        ('xlsr', 'XLS-R', xlsr_lid_model, _indep_xlsr_code),
    )

    for i, audio_path in enumerate(audio_files, 1):
        file_name = os.path.basename(audio_path)
        print(f"\n[{i}/{total}] 🎡 {file_name}")
        # Extract ground truth from path/filename
        gt_iso = gt_from_filename(audio_path)
        print(f" πŸ“ Ground Truth: {gt_iso or 'Unknown'}")
        file_analysis = {
            'file': file_name,
            'full_path': audio_path,
            'gt_iso': gt_iso or '',
            'voxlingua': {'available': False},
            'xlsr': {'available': False}
        }

        for key, display_name, model, code_from_label in model_specs:
            if model is None:
                continue
            try:
                print(f" πŸ”¬ {display_name} Analysis:")
                predictions = _indep_rank_predictions(model, audio_path, code_from_label)
                top1 = predictions[0]
                # Correctness is undefined (None) without a ground-truth label.
                correct_top1 = gt_iso == top1['mapped'] if gt_iso else None
                file_analysis[key] = {
                    'available': True,
                    'top5_predictions': predictions,
                    'top1_prediction': top1['mapped'],
                    'top1_confidence': top1['confidence'],
                    'correct_top1': correct_top1,
                    'correct_in_top5': any(p['mapped'] == gt_iso for p in predictions) if gt_iso else None
                }
                results[f'{key}_detailed'].append({
                    'file': file_name,
                    'gt_iso': gt_iso or '',
                    'pred_iso': top1['mapped'],
                    'confidence': top1['confidence'],
                    'correct': correct_top1
                })
            except Exception as e:
                # Best effort: one failing model or file must not stop the run.
                print(f" ❌ {display_name} error: {e}")
                file_analysis[key] = {'available': False, 'error': str(e)}

        results['comparison_data'].append(file_analysis)
        print(f" βœ… Analysis complete\n")
    return results
# Kick off the independent analysis when the scan produced any files;
# analysis_results stays None otherwise.
analysis_results = None
if not downloaded_files:
    print("❌ No files found for analysis")
else:
    print("πŸ”¬ Running independent model analysis...")
    analysis_results = analyze_downloaded_files_independently(downloaded_files)
# ==============================================================================
# FIXED: Robust VoxLingua107 Analysis with Better Error Handling
# ==============================================================================
def parse_voxlingua_output_robust(out):
    """Normalise a classifier result into a 5-tuple.

    Three input shapes are recognised, tried in this order:

    1. a tuple/list with at least four items whose first item has ``squeeze``
       and whose fourth item is indexable ("standard");
    2. a dict providing scores under ``predictions``/``logits`` and labels
       under ``labels``/``text_lab`` ("dict");
    3. any object with ``squeeze``, treated as raw scores; placeholder labels
       ``lang_<i>`` are generated for it ("tensor").

    Returns:
        ``(logits, log_conf, pred_idx, labels, method)``, or
        ``(None, None, None, None, "failed")`` when nothing matches or an
        exception is raised while parsing (the exception is printed).
    """
    try:
        is_sequence = isinstance(out, (tuple, list))
        if is_sequence and len(out) >= 4:
            scores, confidence, best, names = out[:4]
            usable = hasattr(scores, 'squeeze') and hasattr(names, '__getitem__')
            if usable:
                return scores, confidence, best, names, "standard"

        if isinstance(out, dict):
            def pick(primary, alternate):
                # Same lookup as out.get(primary, out.get(alternate)).
                return out[primary] if primary in out else out.get(alternate)

            scores = pick('predictions', 'logits')
            names = pick('labels', 'text_lab')
            confidence = pick('log_probabilities', 'log_conf')
            best = pick('predicted_ids', 'pred_idx')
            if scores is not None and names is not None:
                return scores, confidence, best, names, "dict"

        if hasattr(out, 'squeeze'):
            # No labels are available here, so name classes by position.
            names = [f"lang_{i}" for i in range(out.shape[-1])]
            confidence = torch.log_softmax(out, dim=-1).max()
            best = torch.argmax(out, dim=-1)
            return out, confidence, best, names, "tensor"
    except Exception as e:
        print(f" Parse error: {e}")
    return None, None, None, None, "failed"
def analyze_voxlingua_robust(audio_path):
    """Return up to five ranked VoxLingua107 predictions for ``audio_path``.

    The raw model output is normalised with ``parse_voxlingua_output_robust``.
    Class indices are turned into labels through the model's label encoder
    when it is available: per the SpeechBrain ``classify_batch`` documentation
    the label list returned next to the scores only describes the best class
    of each batch item, so indexing it by class id is wrong.

    Returns:
        A list of dicts (``rank``, ``original``, ``mapped``, ``confidence``,
        ``in_dataset``), or ``None`` when the model is not loaded, the output
        cannot be parsed, or any error occurs (errors are printed).
    """
    if voxlingua_model is None:
        return None
    try:
        # Get raw output from model
        raw_out = voxlingua_model.classify_file(audio_path)
        # Parse with robust method
        logits, _log_conf, _pred_idx, labels, parse_method = parse_voxlingua_output_robust(raw_out)
        if logits is None:
            print(f" ❌ Could not parse VoxLingua output format")
            return None
        print(f" πŸ“Š Parse method: {parse_method}")

        if not hasattr(logits, 'squeeze'):
            print(f" ❌ Logits not in expected tensor format")
            return None
        scores = logits.squeeze()
        probs = torch.softmax(scores, dim=-1 if len(scores.shape) > 0 else 0)
        # Handle different tensor shapes
        if len(probs.shape) == 0:  # Scalar
            top_indices = torch.tensor([0])
            top_probs = probs.unsqueeze(0)
        else:  # Vector
            top_probs, top_indices = torch.topk(probs, min(5, len(probs)))

        label_encoder = getattr(getattr(voxlingua_model, 'hparams', None), 'label_encoder', None)
        index_to_label = getattr(label_encoder, 'ind2lab', None) or {}
        # A label list as long as the score vector (e.g. the placeholder labels
        # of the "tensor" parse) can safely be indexed by class id.
        labels_cover_all_classes = len(labels) == probs.numel()

        predictions = []
        ranked = zip(top_indices.tolist(), top_probs.tolist())
        for rank, (idx_val, prob_val) in enumerate(ranked, 1):
            # Get language label safely
            if idx_val in index_to_label:
                lang_label = index_to_label[idx_val]
            elif labels_cover_all_classes:
                lang_label = labels[idx_val]
            elif rank == 1 and len(labels) > 0:
                # Only the winning class is labelled by the model output.
                lang_label = labels[0]
            else:
                lang_label = f"unknown_{idx_val}"

            # Parse language code
            if isinstance(lang_label, str):
                lang_code = lang_label.partition(":")[0].strip()
            else:
                lang_code = str(lang_label)

            # Map to dataset language
            mapped_lang = map_to_dataset_language(lang_code)
            in_dataset = mapped_lang in ALL_SUPPORTED_LANGS
            predictions.append({
                'rank': rank,
                'original': lang_code,
                'mapped': mapped_lang,
                'confidence': prob_val,
                'in_dataset': in_dataset
            })
            status = "βœ…" if in_dataset else "❌"
            print(f" #{rank}: {lang_code} β†’ {mapped_lang} ({prob_val:.4f}) {status}")
        return predictions
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f" ❌ VoxLingua analysis error: {e}")
        print(f" ❌ Error type: {type(e).__name__}")
        return None
def analyze_xlsr_robust(audio_path):
    """Return up to five ranked XLS-R predictions for ``audio_path``.

    Class indices are resolved through the model's label encoder when it is
    available. The label list returned with the scores is only indexed by
    class id when it is as long as the score vector; otherwise it is taken to
    describe the best class only (as documented for SpeechBrain's
    ``classify_batch``) and is used for rank 1 alone.

    Returns:
        A list of dicts (``rank``, ``original``, ``mapped``, ``confidence``,
        ``in_dataset``), or ``None`` when the model is not loaded, the output
        has an unexpected shape, or any error occurs (errors are printed).
    """
    if xlsr_lid_model is None:
        return None
    try:
        raw_out = xlsr_lid_model.classify_file(audio_path)
        # Handle different XLS-R output formats
        if not (isinstance(raw_out, (tuple, list)) and len(raw_out) >= 4):
            print(f" ❌ XLS-R output format not recognized")
            return None
        out_prob, _score, _index, text_lab = raw_out[:4]

        # Get top predictions
        if not hasattr(out_prob, 'squeeze'):
            print(f" ❌ XLS-R probabilities not in expected format")
            return None
        scores = out_prob.squeeze()
        probs = torch.softmax(scores, dim=-1 if len(scores.shape) > 0 else 0)
        if len(probs.shape) == 0:  # Scalar
            top_indices = torch.tensor([0])
            top_probs = probs.unsqueeze(0)
        else:  # Vector
            top_probs, top_indices = torch.topk(probs, min(5, len(probs)))

        label_encoder = getattr(getattr(xlsr_lid_model, 'hparams', None), 'label_encoder', None)
        index_to_label = getattr(label_encoder, 'ind2lab', None) or {}
        labels_cover_all_classes = len(text_lab) == probs.numel()

        predictions = []
        ranked = zip(top_indices.tolist(), top_probs.tolist())
        for rank, (idx_val, prob_val) in enumerate(ranked, 1):
            # Get language label
            if idx_val in index_to_label:
                lang_label = index_to_label[idx_val]
            elif labels_cover_all_classes:
                lang_label = text_lab[idx_val]
            elif rank == 1 and len(text_lab) > 0:
                # Only the winning class is labelled by the model output.
                lang_label = text_lab[0]
            else:
                lang_label = f"unknown_{idx_val}"

            lang_code = str(lang_label).strip().lower()
            mapped_lang = map_to_dataset_language(lang_code)
            in_dataset = mapped_lang in ALL_SUPPORTED_LANGS
            predictions.append({
                'rank': rank,
                'original': lang_code,
                'mapped': mapped_lang,
                'confidence': prob_val,
                'in_dataset': in_dataset
            })
            status = "βœ…" if in_dataset else "❌"
            print(f" #{rank}: {lang_code} β†’ {mapped_lang} ({prob_val:.4f}) {status}")
        return predictions
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f" ❌ XLS-R analysis error: {e}")
        return None
# ==============================================================================
# UPDATED: Robust Analysis Function
# ==============================================================================
def analyze_downloaded_files_robust(audio_files):
    """Analyse every file with both robust analysers and collect the results.

    Args:
        audio_files: Sequence of audio file paths.

    Returns:
        ``None`` for an empty input; otherwise a dict with per-model rows under
        ``voxlingua_detailed`` / ``xlsr_detailed`` and one entry per file under
        ``comparison_data``. A model that yields no predictions for a file is
        recorded there as unavailable with the error 'Analysis failed'.
    """
    if not audio_files:
        print("❌ No audio files to analyze")
        return None

    total = len(audio_files)
    print(f"πŸš€ Starting ROBUST analysis on {total} files...")
    print("=" * 70)
    results = {
        name: []
        for name in ('voxlingua_detailed', 'xlsr_detailed', 'comparison_data')
    }
    # (result key, banner line, analyser) in execution order
    stages = (
        ('voxlingua', " πŸ”¬ VoxLingua107 Analysis:", analyze_voxlingua_robust),
        ('xlsr', " πŸ”¬ XLS-R Analysis:", analyze_xlsr_robust),
    )

    for position, audio_path in enumerate(audio_files, 1):
        base_name = os.path.basename(audio_path)
        print(f"\n[{position}/{total}] 🎡 {base_name}")
        # Ground truth comes from the file path
        gt_iso = gt_from_filename(audio_path)
        print(f" πŸ“ Ground Truth: {gt_iso or 'Unknown'}")
        truth = gt_iso or ''
        file_analysis = {
            'file': base_name,
            'full_path': audio_path,
            'gt_iso': truth,
            'voxlingua': {'available': False},
            'xlsr': {'available': False}
        }

        for key, banner, analyze in stages:
            print(banner)
            ranked = analyze(audio_path)
            if not ranked:
                file_analysis[key] = {'available': False, 'error': 'Analysis failed'}
                continue
            best = ranked[0]
            # Correctness flags stay None when there is no ground truth.
            if gt_iso:
                hit = gt_iso == best['mapped']
                within_top5 = any(p['mapped'] == gt_iso for p in ranked)
            else:
                hit = None
                within_top5 = None
            file_analysis[key] = {
                'available': True,
                'top5_predictions': ranked,
                'top1_prediction': best['mapped'],
                'top1_confidence': best['confidence'],
                'correct_top1': hit,
                'correct_in_top5': within_top5
            }
            results[key + '_detailed'].append({
                'file': base_name,
                'gt_iso': truth,
                'pred_iso': best['mapped'],
                'confidence': best['confidence'],
                'correct': hit
            })

        results['comparison_data'].append(file_analysis)
        print(f" βœ… Analysis complete")
    return results
# Run the robust analysis
if 'downloaded_files' in globals() and downloaded_files:
    print("πŸ”¬ Running ROBUST independent model analysis...")
    robust_analysis_results = analyze_downloaded_files_robust(downloaded_files)
    # Generate report
    if robust_analysis_results:
        # generate_detailed_performance_report is defined further down in this
        # file, so on a top-to-bottom run it does not exist yet at this point.
        # Check for it first instead of failing with NameError after the whole
        # analysis has already been computed.
        if callable(globals().get('generate_detailed_performance_report')):
            generate_detailed_performance_report(robust_analysis_results)
        else:
            print("Report skipped: generate_detailed_performance_report is not defined yet")
        print(f"\nβœ… ROBUST ANALYSIS COMPLETE!")
    else:
        print("❌ Robust analysis failed")
else:
    print("❌ No downloaded files found. Please run the file scanning code first.")
# ==============================================================================
# COMPLETE FIX: VoxLingua Label Mapping + Missing Function
# ==============================================================================
# Static fallback table from class index to language code.
# NOTE(review): this table has 108 entries (indices 0-107) and is not derived
# from the loaded model, so it may not match the model's real class order.
# Verify it against the model's label encoder before relying on it.
VOXLINGUA_LANGUAGE_MAP = {
    0: 'ab', 1: 'af', 2: 'ak', 3: 'am', 4: 'ar', 5: 'as', 6: 'az', 7: 'be', 8: 'bg', 9: 'bn',
    10: 'bo', 11: 'br', 12: 'bs', 13: 'ca', 14: 'ce', 15: 'co', 16: 'cs', 17: 'cv', 18: 'cy', 19: 'da',
    20: 'de', 21: 'dv', 22: 'dz', 23: 'ee', 24: 'el', 25: 'en', 26: 'eo', 27: 'es', 28: 'et', 29: 'eu',
    30: 'fa', 31: 'ff', 32: 'fi', 33: 'fo', 34: 'fr', 35: 'fy', 36: 'ga', 37: 'gd', 38: 'gl', 39: 'gn',
    40: 'gu', 41: 'gv', 42: 'ha', 43: 'haw', 44: 'he', 45: 'hi', 46: 'hr', 47: 'ht', 48: 'hu', 49: 'hy',
    50: 'ia', 51: 'id', 52: 'ie', 53: 'ig', 54: 'ii', 55: 'ik', 56: 'io', 57: 'is', 58: 'it', 59: 'iu',
    60: 'ja', 61: 'jv', 62: 'ka', 63: 'kk', 64: 'kl', 65: 'km', 66: 'kn', 67: 'ko', 68: 'ks', 69: 'ku',
    70: 'kw', 71: 'ky', 72: 'la', 73: 'lb', 74: 'lg', 75: 'li', 76: 'ln', 77: 'lo', 78: 'lt', 79: 'lv',
    80: 'mg', 81: 'mi', 82: 'mk', 83: 'ml', 84: 'mn', 85: 'mr', 86: 'ms', 87: 'mt', 88: 'my', 89: 'na',
    90: 'nb', 91: 'nd', 92: 'ne', 93: 'ng', 94: 'nl', 95: 'nn', 96: 'no', 97: 'nv', 98: 'ny', 99: 'oc',
    100: 'of', 101: 'om', 102: 'or', 103: 'os', 104: 'pa', 105: 'pi', 106: 'pl', 107: 'ps'
}


def get_voxlingua_language_by_index(idx):
    """Map a VoxLingua class index to a language code.

    The label encoder of the loaded ``voxlingua_model`` is consulted first,
    because it is the table the model itself decodes with. The static
    ``VOXLINGUA_LANGUAGE_MAP`` is only used when no model or encoder is
    available or the index is missing from the encoder.

    Args:
        idx: Integer class index.

    Returns:
        The language code, or ``'unknown_<idx>'`` when the index is unknown.
    """
    # Looked up via globals() so this also works before the model is loaded.
    model = globals().get('voxlingua_model')
    label_encoder = getattr(getattr(model, 'hparams', None), 'label_encoder', None)
    index_to_label = getattr(label_encoder, 'ind2lab', None) or {}
    if idx in index_to_label:
        # Labels may have the form "code: name" (other code in this file
        # parses them that way); keep only the code.
        return str(index_to_label[idx]).partition(':')[0].strip()
    return VOXLINGUA_LANGUAGE_MAP.get(idx, f'unknown_{idx}')
def analyze_voxlingua_fixed(audio_path):
    """Return up to five ranked VoxLingua107 predictions for ``audio_path``.

    Labels are resolved in this order:

    1. the model's label encoder (``hparams.label_encoder.ind2lab``);
    2. the label list returned with the scores, indexed by class id only when
       it is as long as the score vector; otherwise it is taken to describe
       the best class only (as documented for SpeechBrain's
       ``classify_batch``) and is used for rank 1 alone;
    3. ``get_voxlingua_language_by_index`` as the last resort.

    Returns:
        A list of dicts (``rank``, ``original``, ``mapped``, ``confidence``,
        ``in_dataset``, ``index``), or ``None`` when the model is not loaded,
        the output has an unexpected shape, or an error occurs.
    """
    if voxlingua_model is None:
        return None
    try:
        raw_out = voxlingua_model.classify_file(audio_path)
        if not isinstance(raw_out, (tuple, list)) or len(raw_out) < 4:
            print(f" ❌ Unexpected VoxLingua output format")
            return None
        logits, _log_conf, _pred_idx, labels = raw_out[:4]

        # Get probabilities and top 5
        probs = torch.softmax(logits.squeeze(), dim=-1)
        top_probs, top_indices = torch.topk(probs, min(5, len(probs)))

        label_encoder = getattr(getattr(voxlingua_model, 'hparams', None), 'label_encoder', None)
        index_to_label = getattr(label_encoder, 'ind2lab', None) or {}
        labels_cover_all_classes = len(labels) == len(probs)

        predictions = []
        ranked = zip(top_indices.tolist(), top_probs.tolist())
        for rank, (idx_val, prob_val) in enumerate(ranked, 1):
            if idx_val in index_to_label:
                lang_label = index_to_label[idx_val]
            elif labels_cover_all_classes:
                lang_label = labels[idx_val]
            elif rank == 1 and len(labels) > 0:
                # Only the winning class is labelled by the model output.
                lang_label = labels[0]
            else:
                lang_label = None

            if lang_label is not None and not str(lang_label).startswith('unknown'):
                # Method 1: a real label is available; keep the part before ':'
                if isinstance(lang_label, str):
                    lang_code = lang_label.partition(":")[0].strip()
                else:
                    lang_code = str(lang_label)
            else:
                # Method 2: Use our language mapping
                lang_code = get_voxlingua_language_by_index(idx_val)

            # Map to dataset language
            mapped_lang = map_to_dataset_language(lang_code)
            in_dataset = mapped_lang in ALL_SUPPORTED_LANGS
            predictions.append({
                'rank': rank,
                'original': lang_code,
                'mapped': mapped_lang,
                'confidence': prob_val,
                'in_dataset': in_dataset,
                'index': idx_val
            })
            status = "βœ…" if in_dataset else "❌"
            print(f" #{rank}: {lang_code} β†’ {mapped_lang} ({prob_val:.4f}) {status} [idx:{idx_val}]")
        return predictions
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f" ❌ VoxLingua analysis error: {e}")
        return None
def analyze_xlsr_fixed(audio_path):
    """Return up to five ranked XLS-R predictions for ``audio_path``.

    Class indices are resolved through the model's label encoder when it is
    available. The label list returned with the scores is only indexed by
    class id when it is as long as the score vector; otherwise it is taken to
    describe the best class only (as documented for SpeechBrain's
    ``classify_batch``) and is used for rank 1 alone. Unresolvable indices
    become ``xlsr_unknown_<idx>``.

    Returns:
        A list of dicts (``rank``, ``original``, ``mapped``, ``confidence``,
        ``in_dataset``), or ``None`` when the model is not loaded, the output
        has an unexpected shape, or an error occurs (errors are printed).
    """
    if xlsr_lid_model is None:
        print(f" ❌ XLS-R model not loaded")
        return None
    try:
        raw_out = xlsr_lid_model.classify_file(audio_path)
        if not isinstance(raw_out, (tuple, list)) or len(raw_out) < 4:
            print(f" ❌ Unexpected XLS-R output format")
            return None
        out_prob, _score, _index, text_lab = raw_out[:4]

        # Get probabilities and top 5
        probs = torch.softmax(out_prob.squeeze(), dim=-1)
        top_probs, top_indices = torch.topk(probs, min(5, len(probs)))

        label_encoder = getattr(getattr(xlsr_lid_model, 'hparams', None), 'label_encoder', None)
        index_to_label = getattr(label_encoder, 'ind2lab', None) or {}
        labels_cover_all_classes = len(text_lab) == len(probs)

        predictions = []
        ranked = zip(top_indices.tolist(), top_probs.tolist())
        for rank, (idx_val, prob_val) in enumerate(ranked, 1):
            # Get language label
            if idx_val in index_to_label:
                lang_code = str(index_to_label[idx_val]).strip().lower()
            elif labels_cover_all_classes:
                lang_code = str(text_lab[idx_val]).strip().lower()
            elif rank == 1 and len(text_lab) > 0:
                # Only the winning class is labelled by the model output.
                lang_code = str(text_lab[0]).strip().lower()
            else:
                lang_code = f"xlsr_unknown_{idx_val}"

            mapped_lang = map_to_dataset_language(lang_code)
            in_dataset = mapped_lang in ALL_SUPPORTED_LANGS
            predictions.append({
                'rank': rank,
                'original': lang_code,
                'mapped': mapped_lang,
                'confidence': prob_val,
                'in_dataset': in_dataset
            })
            status = "βœ…" if in_dataset else "❌"
            print(f" #{rank}: {lang_code} β†’ {mapped_lang} ({prob_val:.4f}) {status}")
        return predictions
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f" ❌ XLS-R analysis error: {e}")
        return None
def _report_single_model(heading, rows, empty_message):
    """Print the accuracy/confidence summary of one model.

    Args:
        heading: Section heading printed before the summary.
        rows: Per-file dicts with the keys ``gt_iso``, ``pred_iso``,
            ``confidence`` and ``correct``.
        empty_message: Text printed when no row has a ground-truth label.

    Returns:
        ``(frame, accuracy)``: a DataFrame of all rows, and the top-1 accuracy
        over the rows with ground truth (``None`` when there are none).
    """
    frame = pd.DataFrame(rows)
    # Only rows with a ground-truth label can be scored.
    scored = frame[frame['gt_iso'] != ''].copy()
    print(heading)
    print("-" * 40)
    if len(scored) == 0:
        print(empty_message)
        return frame, None

    accuracy = scored['correct'].eq(True).mean()
    conf_mean = scored['confidence'].mean()
    conf_std = scored['confidence'].std()
    print(f"Files Analyzed: {len(scored)}")
    print(f"Top-1 Accuracy: {accuracy:.4f} ({accuracy*100:.1f}%)")
    print(f"Confidence: {conf_mean:.4f} Β± {conf_std:.4f}")
    # Per-language breakdown (groupby yields the languages in sorted order)
    print(f"\nPer-Language Performance:")
    for lang, lang_data in scored.groupby('gt_iso'):
        acc = lang_data['correct'].eq(True).mean()
        lang_conf = lang_data['confidence'].mean()
        count = len(lang_data)
        print(f" {lang:>3}: {acc:.3f} ({acc*100:5.1f}%) | Conf: {lang_conf:.3f} | n={count}")
    return frame, accuracy


def generate_detailed_performance_report(results):
    """Print a per-model performance report and save the rows as CSV files.

    Args:
        results: Dict with the lists ``voxlingua_detailed`` and
            ``xlsr_detailed`` (rows with ``gt_iso``, ``pred_iso``,
            ``confidence``, ``correct``). A falsy value prints a notice and
            returns immediately.

    Returns:
        None. Side effects: prints the report and writes one timestamped CSV
        per model with rows into the current working directory.
    """
    if not results:
        print("❌ No results to analyze")
        return

    print("\nπŸ“Š DETAILED INDEPENDENT MODEL PERFORMANCE REPORT")
    print("=" * 70)
    vox_rows = results.get('voxlingua_detailed')
    xlsr_rows = results.get('xlsr_detailed')
    # Accuracies stay None when a model has rows but none with ground truth.
    # The comparison below must cope with that instead of reading an unbound
    # local.
    vox_df = xlsr_df = None
    vox_acc = xlsr_acc = None

    # VoxLingua107 Performance Analysis
    if vox_rows:
        vox_df, vox_acc = _report_single_model(
            f"\nπŸ”¬ VOXLINGUA107 PERFORMANCE:", vox_rows, "No valid VoxLingua results"
        )
    # XLS-R Performance Analysis
    if xlsr_rows:
        xlsr_df, xlsr_acc = _report_single_model(
            f"\nπŸ”¬ XLS-R PERFORMANCE:", xlsr_rows, "No valid XLS-R results"
        )

    # Model Comparison
    if vox_rows and xlsr_rows:
        print(f"\nβš–οΈ MODEL COMPARISON:")
        print("-" * 30)
        if vox_acc is not None and xlsr_acc is not None:
            print(f"VoxLingua107: {vox_acc:.4f} accuracy")
            print(f"XLS-R: {xlsr_acc:.4f} accuracy")
            # Weights proportional to each model's accuracy
            total_acc = vox_acc + xlsr_acc
            if total_acc > 0:
                vox_weight = vox_acc / total_acc
                xlsr_weight = xlsr_acc / total_acc
                print(f"\nπŸ’‘ RECOMMENDED WEIGHTS:")
                print(f"VoxLingua107: {vox_weight:.3f} ({vox_weight*100:.1f}%)")
                print(f"XLS-R: {xlsr_weight:.3f} ({xlsr_weight*100:.1f}%)")
        else:
            print("Accuracy comparison skipped: a model has no files with ground truth")

        # These are sets of distinct predicted labels, so the counts describe
        # label overlap between the models, not per-file agreement.
        vox_preds = set(vox_df['pred_iso'].tolist())
        xlsr_preds = set(xlsr_df['pred_iso'].tolist())
        common_preds = vox_preds & xlsr_preds
        print(f"\nModel Agreement Analysis:")
        print(f"Common predictions: {len(common_preds)}")
        print(f"VoxLingua unique: {len(vox_preds - xlsr_preds)}")
        print(f"XLS-R unique: {len(xlsr_preds - vox_preds)}")

    # Save results
    timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
    if vox_rows:
        vox_csv = f"voxlingua_fixed_results_{timestamp}.csv"
        pd.DataFrame(vox_rows).to_csv(vox_csv, index=False)
        print(f"\nπŸ’Ύ VoxLingua results: {vox_csv}")
    if xlsr_rows:
        xlsr_csv = f"xlsr_fixed_results_{timestamp}.csv"
        pd.DataFrame(xlsr_rows).to_csv(xlsr_csv, index=False)
        print(f"πŸ’Ύ XLS-R results: {xlsr_csv}")
def run_complete_fixed_analysis(audio_files):
    """Analyse every file with the two "fixed" analysers and collect results.

    Args:
        audio_files: Sequence of audio file paths.

    Returns:
        ``None`` for an empty input; otherwise a dict with per-model rows under
        ``voxlingua_detailed`` / ``xlsr_detailed`` and one entry per file under
        ``comparison_data``. A model that yields no predictions for a file
        keeps its default ``{'available': False}`` entry.
    """
    if not audio_files:
        print("❌ No audio files to analyze")
        return None

    file_count = len(audio_files)
    print(f"πŸš€ Starting COMPLETE FIXED analysis on {file_count} files...")
    print("=" * 70)
    results = dict(voxlingua_detailed=[], xlsr_detailed=[], comparison_data=[])
    # (result key, banner line, analyser) in execution order
    passes = (
        ('voxlingua', " πŸ”¬ VoxLingua107 Analysis:", analyze_voxlingua_fixed),
        ('xlsr', " πŸ”¬ XLS-R Analysis:", analyze_xlsr_fixed),
    )

    for number, audio_path in enumerate(audio_files, 1):
        short_name = os.path.basename(audio_path)
        print(f"\n[{number}/{file_count}] 🎡 {short_name}")
        # Ground truth comes from the file path
        gt_iso = gt_from_filename(audio_path)
        print(f" πŸ“ Ground Truth: {gt_iso or 'Unknown'}")
        truth = gt_iso or ''
        file_analysis = {
            'file': short_name,
            'full_path': audio_path,
            'gt_iso': truth,
            'voxlingua': {'available': False},
            'xlsr': {'available': False}
        }

        for key, banner, analyzer in passes:
            print(banner)
            ranked = analyzer(audio_path)
            if not ranked:
                continue
            winner = ranked[0]
            # Correctness stays None when there is no ground truth.
            is_correct = (gt_iso == winner['mapped']) if gt_iso else None
            file_analysis[key] = {
                'available': True,
                'top5_predictions': ranked,
                'top1_prediction': winner['mapped'],
                'top1_confidence': winner['confidence'],
                'correct_top1': is_correct,
            }
            results[key + '_detailed'].append({
                'file': short_name,
                'gt_iso': truth,
                'pred_iso': winner['mapped'],
                'confidence': winner['confidence'],
                'correct': is_correct
            })

        results['comparison_data'].append(file_analysis)
        print(f" βœ… Analysis complete")
    return results
# Drive the complete fixed analysis over the scanned files, then print the
# report when the analysis produced results.
if not ('downloaded_files' in globals() and downloaded_files):
    print("❌ No downloaded files found")
else:
    print("πŸ”¬ Running COMPLETE FIXED analysis...")
    final_analysis_results = run_complete_fixed_analysis(downloaded_files)
    if not final_analysis_results:
        print("❌ Analysis failed")
    else:
        generate_detailed_performance_report(final_analysis_results)
        print(f"\nβœ… COMPLETE FIXED ANALYSIS DONE!")
# ==============================================================================
# COMPREHENSIVE EXCEL ANALYSIS WITH ALL DETAILS
# ==============================================================================
import pandas as pd
import numpy as np
from datetime import datetime
import os
def create_comprehensive_excel_analysis(results, output_filename=None):
"""Create comprehensive Excel analysis with multiple sheets and detailed metrics"""
if not results:
print("❌ No results to analyze")
return None
# Generate filename if not provided
if not output_filename:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"Language_Detection_Comprehensive_Analysis_{timestamp}.xlsx"
print(f"πŸ“Š Creating comprehensive Excel analysis: {output_filename}")
# Create Excel writer
with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
# ========================================
# SHEET 1: EXECUTIVE SUMMARY
# ========================================
print(" πŸ“‹ Creating Executive Summary...")
summary_data = []
# Overall statistics
total_files = len(results['comparison_data'])
vox_available = sum(1 for item in results['comparison_data'] if item['voxlingua']['available'])
xlsr_available = sum(1 for item in results['comparison_data'] if item['xlsr']['available'])
summary_data.extend([
['EXECUTIVE SUMMARY', ''],
['Analysis Date', datetime.now().strftime("%Y-%m-%d %H:%M:%S")],
['Total Files Analyzed', total_files],
['VoxLingua107 Available', f"{vox_available} ({vox_available/total_files*100:.1f}%)"],
['XLS-R Available', f"{xlsr_available} ({xlsr_available/total_files*100:.1f}%)"],
['', ''],
])
# Model performance summary
if results['voxlingua_detailed']:
vox_df = pd.DataFrame(results['voxlingua_detailed'])
valid_vox = vox_df[vox_df['gt_iso'] != ''].copy()
if len(valid_vox) > 0:
vox_acc = (valid_vox['correct'] == True).mean()
vox_conf = valid_vox['confidence'].mean()
summary_data.extend([
['VOXLINGUA107 PERFORMANCE', ''],
['Accuracy', f"{vox_acc:.4f} ({vox_acc*100:.1f}%)"],
['Average Confidence', f"{vox_conf:.4f}"],
['Files with Valid GT', len(valid_vox)],
['', ''],
])
if results['xlsr_detailed']:
xlsr_df = pd.DataFrame(results['xlsr_detailed'])
valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy()
if len(valid_xlsr) > 0:
xlsr_acc = (valid_xlsr['correct'] == True).mean()
xlsr_conf = valid_xlsr['confidence'].mean()
summary_data.extend([
['XLS-R PERFORMANCE', ''],
['Accuracy', f"{xlsr_acc:.4f} ({xlsr_acc*100:.1f}%)"],
['Average Confidence', f"{xlsr_conf:.4f}"],
['Files with Valid GT', len(valid_xlsr)],
['', ''],
])
# Optimal weights calculation
if results['voxlingua_detailed']:
total_acc = vox_acc + xlsr_acc
if total_acc > 0:
vox_weight = vox_acc / total_acc
xlsr_weight = xlsr_acc / total_acc
summary_data.extend([
['RECOMMENDED ENSEMBLE WEIGHTS', ''],
['VoxLingua107 Weight', f"{vox_weight:.3f} ({vox_weight*100:.1f}%)"],
['XLS-R Weight', f"{xlsr_weight:.3f} ({xlsr_weight*100:.1f}%)"],
])
# Create summary dataframe
summary_df = pd.DataFrame(summary_data, columns=['Metric', 'Value'])
summary_df.to_excel(writer, sheet_name='Executive_Summary', index=False)
# ========================================
# SHEET 2: VOXLINGUA107 DETAILED RESULTS
# ========================================
if results['voxlingua_detailed']:
print(" πŸ“‹ Creating VoxLingua107 detailed results...")
vox_detailed_df = pd.DataFrame(results['voxlingua_detailed'])
# Add additional analysis columns
vox_detailed_df['accuracy_score'] = vox_detailed_df['correct'].astype(int)
vox_detailed_df['confidence_category'] = pd.cut(
vox_detailed_df['confidence'],
bins=[0, 0.3, 0.6, 0.8, 1.0],
labels=['Low', 'Medium', 'High', 'Very High']
)
# Add language family information
def get_language_family(lang):
    """Return the language-family label for the language code `lang`.

    Membership is tested against INDO_ARYAN_LANGS, DRAVIDIAN_LANGS and
    LOW_RESOURCE_LANGS in that order and the first match wins; a code
    found in none of them is labelled 'Other'.
    """
    if lang in INDO_ARYAN_LANGS:
        return 'Indo-Aryan'
    if lang in DRAVIDIAN_LANGS:
        return 'Dravidian'
    if lang in LOW_RESOURCE_LANGS:
        return 'Low-Resource'
    return 'Other'
vox_detailed_df['gt_language_family'] = vox_detailed_df['gt_iso'].apply(get_language_family)
vox_detailed_df['pred_language_family'] = vox_detailed_df['pred_iso'].apply(get_language_family)
vox_detailed_df.to_excel(writer, sheet_name='VoxLingua107_Results', index=False)
# ========================================
# SHEET 3: XLS-R DETAILED RESULTS
# ========================================
if results['xlsr_detailed']:
print(" πŸ“‹ Creating XLS-R detailed results...")
xlsr_detailed_df = pd.DataFrame(results['xlsr_detailed'])
# Add analysis columns
xlsr_detailed_df['accuracy_score'] = xlsr_detailed_df['correct'].astype(int)
xlsr_detailed_df['confidence_category'] = pd.cut(
xlsr_detailed_df['confidence'],
bins=[0, 0.3, 0.6, 0.8, 1.0],
labels=['Low', 'Medium', 'High', 'Very High']
)
xlsr_detailed_df['gt_language_family'] = xlsr_detailed_df['gt_iso'].apply(get_language_family)
xlsr_detailed_df['pred_language_family'] = xlsr_detailed_df['pred_iso'].apply(get_language_family)
xlsr_detailed_df.to_excel(writer, sheet_name='XLSR_Results', index=False)
# ========================================
# SHEET 4: PER-LANGUAGE ACCURACY ANALYSIS
# ========================================
print(" πŸ“‹ Creating per-language accuracy analysis...")
lang_analysis_data = []
# Get all unique languages from ground truth
all_gt_langs = set()
if results['voxlingua_detailed']:
all_gt_langs.update([r['gt_iso'] for r in results['voxlingua_detailed'] if r['gt_iso']])
if results['xlsr_detailed']:
all_gt_langs.update([r['gt_iso'] for r in results['xlsr_detailed'] if r['gt_iso']])
# Language name mapping
LANG_NAMES = {
'ur': 'Urdu', 'pa': 'Punjabi', 'ta': 'Tamil', 'sd': 'Sindhi', 'or': 'Odia',
'ml': 'Malayalam', 'ne': 'Nepali', 'as': 'Assamese', 'hi': 'Hindi', 'bn': 'Bengali',
'kok': 'Konkani', 'kn': 'Kannada', 'ks': 'Kashmiri', 'mr': 'Marathi', 'te': 'Telugu',
'br': 'Bodo', 'doi': 'Dogri', 'sat': 'Santali', 'gu': 'Gujarati', 'mni': 'Manipuri',
'sa': 'Sanskrit'
}
for lang in sorted(all_gt_langs):
lang_name = LANG_NAMES.get(lang, lang.title())
lang_family = get_language_family(lang)
# VoxLingua stats for this language
vox_stats = {'files': 0, 'correct': 0, 'accuracy': 0, 'avg_confidence': 0}
if results['voxlingua_detailed']:
vox_lang_data = [r for r in results['voxlingua_detailed'] if r['gt_iso'] == lang]
if vox_lang_data:
vox_stats['files'] = len(vox_lang_data)
vox_stats['correct'] = sum(1 for r in vox_lang_data if r['correct'])
vox_stats['accuracy'] = vox_stats['correct'] / vox_stats['files']
vox_stats['avg_confidence'] = np.mean([r['confidence'] for r in vox_lang_data])
# XLS-R stats for this language
xlsr_stats = {'files': 0, 'correct': 0, 'accuracy': 0, 'avg_confidence': 0}
if results['xlsr_detailed']:
xlsr_lang_data = [r for r in results['xlsr_detailed'] if r['gt_iso'] == lang]
if xlsr_lang_data:
xlsr_stats['files'] = len(xlsr_lang_data)
xlsr_stats['correct'] = sum(1 for r in xlsr_lang_data if r['correct'])
xlsr_stats['accuracy'] = xlsr_stats['correct'] / xlsr_stats['files']
xlsr_stats['avg_confidence'] = np.mean([r['confidence'] for r in xlsr_lang_data])
lang_analysis_data.append({
'Language_Code': lang,
'Language_Name': lang_name,
'Language_Family': lang_family,
'VoxLingua_Files': vox_stats['files'],
'VoxLingua_Correct': vox_stats['correct'],
'VoxLingua_Accuracy': f"{vox_stats['accuracy']:.4f}",
'VoxLingua_Accuracy_Pct': f"{vox_stats['accuracy']*100:.1f}%",
'VoxLingua_Avg_Confidence': f"{vox_stats['avg_confidence']:.4f}",
'XLSR_Files': xlsr_stats['files'],
'XLSR_Correct': xlsr_stats['correct'],
'XLSR_Accuracy': f"{xlsr_stats['accuracy']:.4f}",
'XLSR_Accuracy_Pct': f"{xlsr_stats['accuracy']*100:.1f}%",
'XLSR_Avg_Confidence': f"{xlsr_stats['avg_confidence']:.4f}",
'Better_Model': 'VoxLingua' if vox_stats['accuracy'] > xlsr_stats['accuracy'] else 'XLS-R' if xlsr_stats['accuracy'] > vox_stats['accuracy'] else 'Tie'
})
lang_analysis_df = pd.DataFrame(lang_analysis_data)
lang_analysis_df.to_excel(writer, sheet_name='Per_Language_Analysis', index=False)
# ========================================
# SHEET 5: CONFUSION MATRIX - VOXLINGUA
# ========================================
if results['voxlingua_detailed']:
print(" πŸ“‹ Creating VoxLingua confusion matrix...")
vox_df = pd.DataFrame(results['voxlingua_detailed'])
valid_vox = vox_df[vox_df['gt_iso'] != ''].copy()
if len(valid_vox) > 0:
# Create confusion matrix
confusion_data = []
for gt_lang in sorted(valid_vox['gt_iso'].unique()):
gt_data = valid_vox[valid_vox['gt_iso'] == gt_lang]
row_data = {'Ground_Truth': gt_lang}
for pred_lang in sorted(valid_vox['pred_iso'].unique()):
count = len(gt_data[gt_data['pred_iso'] == pred_lang])
row_data[f'Predicted_{pred_lang}'] = count
confusion_data.append(row_data)
confusion_df = pd.DataFrame(confusion_data).fillna(0)
confusion_df.to_excel(writer, sheet_name='VoxLingua_Confusion_Matrix', index=False)
# ========================================
# SHEET 6: CONFUSION MATRIX - XLS-R
# ========================================
if results['xlsr_detailed']:
print(" πŸ“‹ Creating XLS-R confusion matrix...")
xlsr_df = pd.DataFrame(results['xlsr_detailed'])
valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy()
if len(valid_xlsr) > 0:
confusion_data = []
for gt_lang in sorted(valid_xlsr['gt_iso'].unique()):
gt_data = valid_xlsr[valid_xlsr['gt_iso'] == gt_lang]
row_data = {'Ground_Truth': gt_lang}
for pred_lang in sorted(valid_xlsr['pred_iso'].unique()):
count = len(gt_data[gt_data['pred_iso'] == pred_lang])
row_data[f'Predicted_{pred_lang}'] = count
confusion_data.append(row_data)
confusion_df = pd.DataFrame(confusion_data).fillna(0)
confusion_df.to_excel(writer, sheet_name='XLSR_Confusion_Matrix', index=False)
# ========================================
# SHEET 7: CONFIDENCE ANALYSIS
# ========================================
print(" πŸ“‹ Creating confidence analysis...")
confidence_analysis = []
# VoxLingua confidence analysis
if results['voxlingua_detailed']:
vox_df = pd.DataFrame(results['voxlingua_detailed'])
valid_vox = vox_df[vox_df['gt_iso'] != ''].copy()
if len(valid_vox) > 0:
for conf_range in [(0, 0.3), (0.3, 0.6), (0.6, 0.8), (0.8, 1.0)]:
range_data = valid_vox[
(valid_vox['confidence'] >= conf_range[0]) &
(valid_vox['confidence'] < conf_range[1])
]
if len(range_data) > 0:
accuracy = (range_data['correct'] == True).mean()
confidence_analysis.append({
'Model': 'VoxLingua107',
'Confidence_Range': f"{conf_range[0]:.1f}-{conf_range[1]:.1f}",
'Files': len(range_data),
'Accuracy': f"{accuracy:.4f}",
'Accuracy_Pct': f"{accuracy*100:.1f}%",
'Avg_Confidence': f"{range_data['confidence'].mean():.4f}"
})
# XLS-R confidence analysis
if results['xlsr_detailed']:
xlsr_df = pd.DataFrame(results['xlsr_detailed'])
valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy()
if len(valid_xlsr) > 0:
for conf_range in [(0, 0.3), (0.3, 0.6), (0.6, 0.8), (0.8, 1.0)]:
range_data = valid_xlsr[
(valid_xlsr['confidence'] >= conf_range[0]) &
(valid_xlsr['confidence'] < conf_range[1])
]
if len(range_data) > 0:
accuracy = (range_data['correct'] == True).mean()
confidence_analysis.append({
'Model': 'XLS-R',
'Confidence_Range': f"{conf_range[0]:.1f}-{conf_range[1]:.1f}",
'Files': len(range_data),
'Accuracy': f"{accuracy:.4f}",
'Accuracy_Pct': f"{accuracy*100:.1f}%",
'Avg_Confidence': f"{range_data['confidence'].mean():.4f}"
})
confidence_df = pd.DataFrame(confidence_analysis)
confidence_df.to_excel(writer, sheet_name='Confidence_Analysis', index=False)
# ========================================
# SHEET 8: ERROR ANALYSIS
# ========================================
print(" πŸ“‹ Creating error analysis...")
error_analysis = []
# VoxLingua errors
if results['voxlingua_detailed']:
vox_df = pd.DataFrame(results['voxlingua_detailed'])
vox_errors = vox_df[vox_df['correct'] == False].copy()
for _, error in vox_errors.iterrows():
error_analysis.append({
'Model': 'VoxLingua107',
'File': error['file'],
'Ground_Truth': error['gt_iso'],
'Predicted': error['pred_iso'],
'Confidence': f"{error['confidence']:.4f}",
'GT_Language_Family': get_language_family(error['gt_iso']),
'Pred_Language_Family': get_language_family(error['pred_iso']),
'Cross_Family_Error': get_language_family(error['gt_iso']) != get_language_family(error['pred_iso'])
})
# XLS-R errors
if results['xlsr_detailed']:
xlsr_df = pd.DataFrame(results['xlsr_detailed'])
xlsr_errors = xlsr_df[xlsr_df['correct'] == False].copy()
for _, error in xlsr_errors.iterrows():
error_analysis.append({
'Model': 'XLS-R',
'File': error['file'],
'Ground_Truth': error['gt_iso'],
'Predicted': error['pred_iso'],
'Confidence': f"{error['confidence']:.4f}",
'GT_Language_Family': get_language_family(error['gt_iso']),
'Pred_Language_Family': get_language_family(error['pred_iso']),
'Cross_Family_Error': get_language_family(error['gt_iso']) != get_language_family(error['pred_iso'])
})
error_df = pd.DataFrame(error_analysis)
error_df.to_excel(writer, sheet_name='Error_Analysis', index=False)
# ========================================
# SHEET 9: LANGUAGE FAMILY PERFORMANCE
# ========================================
print(" πŸ“‹ Creating language family performance...")
family_performance = []
families = ['Indo-Aryan', 'Dravidian', 'Low-Resource', 'Other']
for family in families:
# VoxLingua performance for this family
if results['voxlingua_detailed']:
vox_df = pd.DataFrame(results['voxlingua_detailed'])
family_data = vox_df[vox_df['gt_iso'].apply(lambda x: get_language_family(x) == family)]
if len(family_data) > 0:
vox_acc = (family_data['correct'] == True).mean()
vox_conf = family_data['confidence'].mean()
vox_files = len(family_data)
else:
vox_acc = vox_conf = vox_files = 0
else:
vox_acc = vox_conf = vox_files = 0
# XLS-R performance for this family
if results['xlsr_detailed']:
xlsr_df = pd.DataFrame(results['xlsr_detailed'])
family_data = xlsr_df[xlsr_df['gt_iso'].apply(lambda x: get_language_family(x) == family)]
if len(family_data) > 0:
xlsr_acc = (family_data['correct'] == True).mean()
xlsr_conf = family_data['confidence'].mean()
xlsr_files = len(family_data)
else:
xlsr_acc = xlsr_conf = xlsr_files = 0
else:
xlsr_acc = xlsr_conf = xlsr_files = 0
family_performance.append({
'Language_Family': family,
'VoxLingua_Files': vox_files,
'VoxLingua_Accuracy': f"{vox_acc:.4f}",
'VoxLingua_Accuracy_Pct': f"{vox_acc*100:.1f}%",
'VoxLingua_Avg_Confidence': f"{vox_conf:.4f}",
'XLSR_Files': xlsr_files,
'XLSR_Accuracy': f"{xlsr_acc:.4f}",
'XLSR_Accuracy_Pct': f"{xlsr_acc*100:.1f}%",
'XLSR_Avg_Confidence': f"{xlsr_conf:.4f}",
'Better_Model': 'VoxLingua' if vox_acc > xlsr_acc else 'XLS-R' if xlsr_acc > vox_acc else 'Tie'
})
family_df = pd.DataFrame(family_performance)
family_df.to_excel(writer, sheet_name='Language_Family_Performance', index=False)
# ========================================
# SHEET 10: TOP-5 PREDICTIONS (SAMPLE)
# ========================================
print(" πŸ“‹ Creating Top-5 predictions sample...")
top5_sample = []
# Sample top-5 predictions from comparison data
sample_files = results['comparison_data'][:20] # First 20 files as sample
for file_data in sample_files:
file_name = file_data['file']
gt_lang = file_data['gt_iso']
# VoxLingua Top-5
if file_data['voxlingua']['available'] and 'top5_predictions' in file_data['voxlingua']:
for pred in file_data['voxlingua']['top5_predictions']:
top5_sample.append({
'Model': 'VoxLingua107',
'File': file_name,
'Ground_Truth': gt_lang,
'Rank': pred['rank'],
'Predicted_Language': pred['mapped'],
'Original_Output': pred['original'],
'Confidence': f"{pred['confidence']:.4f}",
'In_Dataset': pred['in_dataset'],
'Correct': gt_lang == pred['mapped']
})
# XLS-R Top-5
if file_data['xlsr']['available'] and 'top5_predictions' in file_data['xlsr']:
for pred in file_data['xlsr']['top5_predictions']:
top5_sample.append({
'Model': 'XLS-R',
'File': file_name,
'Ground_Truth': gt_lang,
'Rank': pred['rank'],
'Predicted_Language': pred['mapped'],
'Original_Output': pred['original'],
'Confidence': f"{pred['confidence']:.4f}",
'In_Dataset': pred['in_dataset'],
'Correct': gt_lang == pred['mapped']
})
top5_df = pd.DataFrame(top5_sample)
top5_df.to_excel(writer, sheet_name='Top5_Predictions_Sample', index=False)
print(f"βœ… Comprehensive Excel analysis created: {output_filename}")
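# Detect Google Colab by importing its files helper. NOTE: files.download(output_filename) is never called here, so no download is actually triggered.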
try:
from google.colab import files
print(f"πŸ“₯ File downloaded successfully!")
except:
print(f"πŸ“ File saved locally: {output_filename}")
return output_filename
# Driver for this cell: build the Excel workbook only when an earlier cell
# has left a non-empty `final_analysis_results` in the global namespace.
if 'final_analysis_results' not in globals() or not final_analysis_results:
    # Nothing to report on; handle the failure case first and stop.
    print("❌ No analysis results found. Please run the analysis first.")
else:
    excel_filename = create_comprehensive_excel_analysis(
        final_analysis_results,
        "Language_Detection_Comprehensive_Analysis.xlsx",
    )
    print("\nπŸŽ‰ COMPREHENSIVE EXCEL ANALYSIS COMPLETE!")
    print(f"πŸ“Š File: {excel_filename}")
    # Index of the sheets written to the workbook, emitted as one
    # newline-joined print so the output matches line-by-line printing.
    print("\n".join((
        "\nπŸ“‹ Excel Contains 10 Sheets:",
        " 1. Executive_Summary - Key metrics and recommendations",
        " 2. VoxLingua107_Results - Detailed VoxLingua results",
        " 3. XLSR_Results - Detailed XLS-R results",
        " 4. Per_Language_Analysis - Accuracy by language",
        " 5. VoxLingua_Confusion_Matrix - VoxLingua confusion matrix",
        " 6. XLSR_Confusion_Matrix - XLS-R confusion matrix",
        " 7. Confidence_Analysis - Performance by confidence ranges",
        " 8. Error_Analysis - Detailed error breakdown",
        " 9. Language_Family_Performance - Performance by language family",
        " 10. Top5_Predictions_Sample - Sample of top-5 predictions",
    )))