import 'dotenv/config';
import fs from 'fs';
import path from 'path';
import { execSync } from 'child_process';
import levenshtein from 'fast-levenshtein';
import { aiService } from '../services/ai';
import { normalizeWolof } from './normalizeWolof';
// Data directory, resolved two levels above this file's directory.
// runCalibration() creates it when it does not exist yet.
const DATA_DIR = path.join(__dirname, '../../data');
// Output file: runCalibration() writes the aggregated calibration statistics here as JSON.
const STATS_PATH = path.join(DATA_DIR, 'calibration_stats.json');
// Samples file read by runCalibration(); when it is missing, the Python script below is run
// and the file's existence is checked again afterwards.
const HF_SAMPLES_PATH = path.join(DATA_DIR, 'hf_samples.json');
// Python helper located next to this file; invoked with `--output <DATA_DIR>`.
const PY_SCRIPT = path.join(__dirname, 'fetch_hf_audio.py');
/**
 * Computes the Word Error Rate (WER) between a reference and a hypothesis.
 * WER = (Substitutions + Deletions + Insertions) / Total Reference Words
 *
 * Both inputs are lowercased, stripped of punctuation and split on whitespace
 * before the word-level edit distance is computed.
 *
 * @param reference - Ground-truth text.
 * @param hypothesis - Text to score against the reference.
 * @returns The word-level edit distance divided by the number of reference
 *   words. The value can exceed 1 when the hypothesis contains many extra
 *   words. When the reference has no words, returns 0 if the hypothesis is
 *   empty too and 1 otherwise, so that text produced for an empty reference
 *   is never scored as a perfect match.
 */
function calculateWER(reference: string, hypothesis: string): number {
  // Punctuation removed before tokenizing. Besides the ASCII symbols this also
  // covers `?`, double quotes, square brackets, guillemets and the ellipsis,
  // so that "def?" and "def" compare as the same word.
  const punctuation = /[.,\/#!?$%^&*;:{}=\-_`~()[\]"“”«»…¿¡]/g;
  const tokenize = (text: string): string[] =>
    text
      .toLowerCase()
      .replace(punctuation, '')
      .split(/\s+/)
      .filter((word) => word.length > 0);

  const refWords = tokenize(reference);
  const hypWords = tokenize(hypothesis);

  // The ratio is undefined for an empty reference; see @returns for the convention used.
  if (refWords.length === 0) {
    return hypWords.length === 0 ? 0 : 1;
  }

  // fast-levenshtein works on strings, so every distinct word is mapped to one
  // character (starting in the Private Use Area) and the character-level
  // distance of the encoded strings equals the word-level distance.
  const wordToChar = new Map<string, string>();
  let nextCode = 0xe000;
  const encode = (words: string[]): string =>
    words
      .map((word) => {
        let symbol = wordToChar.get(word);
        if (symbol === undefined) {
          symbol = String.fromCharCode(nextCode++);
          wordToChar.set(word, symbol);
        }
        return symbol;
      })
      .join('');

  const distance = levenshtein.get(encode(refWords), encode(hypWords));
  return distance / refWords.length;
}
/** Traffic-light band assigned to a transcription based on its confidence score. */
type ConfidenceBand = 'RED' | 'ORANGE' | 'GREEN';

/** One entry of the samples file, limited to the fields this script reads. */
interface HfSample {
  source: string;
  audio_base64: string;
  original_text: string;
}

/** Per-sample record stored under `samples` in the stats file. */
interface CalibrationSampleResult {
  source: string;
  index: number;
  hfOriginalText: string;
  transcribedText: string;
  normalizedText: string;
  confidenceScore: number;
  rawWER: number;
  normalizedWER: number;
  status: ConfidenceBand;
}

/** Maps a confidence score to its band: up to 50 is RED, above 50 up to 80 is ORANGE, above 80 is GREEN. */
function confidenceBand(confidence: number): ConfidenceBand {
  if (confidence <= 50) return 'RED';
  if (confidence <= 80) return 'ORANGE';
  return 'GREEN';
}

/**
 * Runs the calibration: transcribes every sample, scores it, and writes the
 * aggregated statistics to STATS_PATH.
 *
 * Steps:
 *  1. Create DATA_DIR if needed and, when the samples file is missing, run the
 *     Python fetch script to produce it.
 *  2. Transcribe each sample through `aiService.transcribeAudio`, normalize the
 *     text with `normalizeWolof`, and compute the WER of the raw and the
 *     normalized transcription against the sample's original text.
 *  3. Aggregate averages and the RED/ORANGE/GREEN distribution, write them as
 *     JSON and print a summary.
 *
 * A sample that fails is logged and left out of the statistics. If the samples
 * file is still missing after the fetch attempt, an error is logged and the
 * function returns without writing anything.
 *
 * @throws If the samples file cannot be read, is not valid JSON, or does not
 *   contain an array, or if the stats file cannot be written.
 */
export async function runCalibration(): Promise<void> {
  console.log("🚀 Starting Whisper Confidence Calibration Stress-Test...");

  // Ensure data dir exists
  if (!fs.existsSync(DATA_DIR)) {
    fs.mkdirSync(DATA_DIR, { recursive: true });
  }

  // Run Python fetching script if samples not already present
  if (!fs.existsSync(HF_SAMPLES_PATH)) {
    console.log("📥 Calling Python datasets library to download Hugging Face audio...");
    try {
      // Both paths are quoted so the command still works when the project
      // lives in a directory whose name contains spaces.
      execSync(`python3 "${PY_SCRIPT}" --output "${DATA_DIR}"`, { stdio: 'inherit' });
    } catch {
      // The script's own output is already shown (stdio is inherited); the
      // existence check below decides whether the run can continue.
      console.error("❌ Python script failed to fetch samples. Please check HF_TOKEN or network.");
    }
  } else {
    console.log("♻️ Using cached Hugging Face samples...");
  }

  if (!fs.existsSync(HF_SAMPLES_PATH)) {
    console.error("❌ No samples mapped. Exiting calibration.");
    return;
  }

  // Fail loudly on a malformed samples file instead of overwriting the
  // previous stats with an empty run.
  const parsed: unknown = JSON.parse(fs.readFileSync(HF_SAMPLES_PATH, 'utf-8'));
  if (!Array.isArray(parsed)) {
    throw new Error(`Expected ${HF_SAMPLES_PATH} to contain a JSON array of samples.`);
  }
  // Individual fields are not validated here; a sample with bad fields fails
  // inside the per-sample try block below and is skipped.
  const samples: HfSample[] = parsed;

  console.log(`\n🎧 Processing ${samples.length} samples through Whisper STT...`);

  const results: CalibrationSampleResult[] = [];
  const bandCounts: Record<ConfidenceBand, number> = { RED: 0, ORANGE: 0, GREEN: 0 };
  let totalConfidence = 0;
  let totalRawWER = 0;
  let totalNormWER = 0;
  let totalProcessed = 0;

  // Samples are transcribed one at a time, in file order.
  for (const [i, sample] of samples.entries()) {
    console.log(`[${sample.source} ${i + 1}/${samples.length}] Transcribing...`);
    try {
      const audioBuffer = Buffer.from(sample.audio_base64, 'base64');

      // Pass to Whisper
      const { text, confidence } = await aiService.transcribeAudio(audioBuffer, `sample_${i}.wav`, 'WOLOF');

      // Normalize
      const normResult = normalizeWolof(text);

      // Compute WER
      const rawWER = calculateWER(sample.original_text, text);
      const normWER = calculateWER(sample.original_text, normResult.normalizedText);

      const status = confidenceBand(confidence);
      bandCounts[status]++;
      totalConfidence += confidence;
      totalRawWER += rawWER;
      totalNormWER += normWER;
      totalProcessed++;

      results.push({
        source: sample.source,
        index: i,
        hfOriginalText: sample.original_text,
        transcribedText: text,
        normalizedText: normResult.normalizedText,
        confidenceScore: confidence,
        rawWER,
        normalizedWER: normWER,
        status
      });
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`Error processing sample ${i} from ${sample.source}: ${reason}`);
    }
  }

  const averageConfidence = totalProcessed > 0 ? Math.round(totalConfidence / totalProcessed) : 0;
  const averageRawWER = totalProcessed > 0 ? totalRawWER / totalProcessed : 0;
  const averageNormalizedWER = totalProcessed > 0 ? totalNormWER / totalProcessed : 0;

  // Calculate Dictionary Efficiency (Improvement in WER relative to Raw WER).
  // If WER goes down after normalization, efficiency is positive.
  const dictionaryEfficiency =
    averageRawWER > 0 ? ((averageRawWER - averageNormalizedWER) / averageRawWER) * 100 : 0;

  // Share of processed samples in a band, as a rounded percentage.
  const percentageOf = (count: number): number =>
    totalProcessed ? Math.round((count / totalProcessed) * 100) : 0;

  const stats = {
    totalProcessed,
    averageConfidence,
    averageRawWER,
    averageNormalizedWER,
    dictionaryEfficiency,
    distribution: {
      red: { count: bandCounts.RED, percentage: percentageOf(bandCounts.RED) },
      orange: { count: bandCounts.ORANGE, percentage: percentageOf(bandCounts.ORANGE) },
      green: { count: bandCounts.GREEN, percentage: percentageOf(bandCounts.GREEN) }
    },
    samples: results,
    updatedAt: new Date().toISOString()
  };

  fs.writeFileSync(STATS_PATH, JSON.stringify(stats, null, 2));

  console.log(`\n✅ Calibration finished! Stats saved to ${STATS_PATH}`);
  console.log(`Average Confidence: ${averageConfidence}%`);
  console.log(`Raw WER: ${(averageRawWER * 100).toFixed(2)}% | Normalized WER: ${(averageNormalizedWER * 100).toFixed(2)}%`);
  console.log(`Dictionary Efficiency Gain: ${dictionaryEfficiency.toFixed(2)}%`);
  console.log(`Red (<=50%): ${bandCounts.RED} | Orange (51-80%): ${bandCounts.ORANGE} | Green (>80%): ${bandCounts.GREEN}`);
}
// Allow running directly from command line.
// The guard keeps the run from starting (and from calling process.exit) when
// this module is merely imported for its `runCalibration` export.
if (require.main === module) {
  runCalibration()
    .then(() => process.exit(0))
    .catch((err: unknown) => {
      console.error(err);
      process.exit(1);
    });
}