edtech / apps /api /src /scripts /calibrate-whisper.ts
CognxSafeTrack
chore: execute Sprint 38 technical debt resolution (Type Safety, Zod validation, Vitest, Mock LLM extracted)
d9879cf
import 'dotenv/config';
import fs from 'fs';
import path from 'path';
import { execFileSync, execSync } from 'child_process';
import levenshtein from 'fast-levenshtein';
import { aiService } from '../services/ai';
import { normalizeWolof } from './normalizeWolof';
// Data directory, resolved two levels above the directory containing this script.
const DATA_DIR = path.join(__dirname, '../../data');
// Output file: runCalibration() writes aggregate statistics and per-sample results here.
const STATS_PATH = path.join(DATA_DIR, 'calibration_stats.json');
// Input file: JSON list of audio samples. When it is missing, runCalibration() runs PY_SCRIPT,
// which is presumably responsible for creating it (the Python side is not visible here).
const HF_SAMPLES_PATH = path.join(DATA_DIR, 'hf_samples.json');
// Python helper located next to this script; invoked with `--output <DATA_DIR>`.
const PY_SCRIPT = path.join(__dirname, 'fetch_hf_audio.py');
/**
 * Computes the Word Error Rate (WER) between a reference and a hypothesis.
 *
 * WER = (Substitutions + Deletions + Insertions) / Total Reference Words
 *
 * Both texts are lower-cased, stripped of every Unicode punctuation and symbol
 * character (so "def?" and "«def»" both compare equal to "def"), and split on
 * whitespace. The edit distance is then computed directly over the word lists,
 * so there is no limit on vocabulary size.
 *
 * @param reference  Ground-truth transcript.
 * @param hypothesis Transcript to score against the reference.
 * @returns The WER as a ratio (0 = identical). It can exceed 1 when the
 *          hypothesis contains more words than the reference. Returns 0 when
 *          the reference has no words, whatever the hypothesis contains.
 */
function calculateWER(reference: string, hypothesis: string): number {
    const tokenize = (text: string): string[] =>
        text
            .toLowerCase()
            // \p{P} = punctuation, \p{S} = symbols ($, ^, =, `, ~, ...).
            .replace(/[\p{P}\p{S}]/gu, '')
            .split(/\s+/)
            .filter(word => word.length > 0);

    const refWords = tokenize(reference);
    const hypWords = tokenize(hypothesis);

    // An empty reference would make the ratio undefined; report 0 by convention.
    if (refWords.length === 0) return 0;

    // Two-row Levenshtein over words: `previous[j]` is the distance between the
    // first (i - 1) reference words and the first j hypothesis words.
    let previous: number[] = Array.from({ length: hypWords.length + 1 }, (_, j) => j);
    for (let i = 1; i <= refWords.length; i++) {
        const current: number[] = [i];
        for (let j = 1; j <= hypWords.length; j++) {
            const substitutionCost = refWords[i - 1] === hypWords[j - 1] ? 0 : 1;
            current.push(
                Math.min(
                    previous[j] + 1, // deletion of a reference word
                    current[j - 1] + 1, // insertion of a hypothesis word
                    previous[j - 1] + substitutionCost // match or substitution
                )
            );
        }
        previous = current;
    }

    return previous[hypWords.length] / refWords.length;
}
/** Confidence bucket assigned to each transcribed sample. */
type ConfidenceStatus = 'RED' | 'ORANGE' | 'GREEN';

/**
 * Fields of a samples-file entry that the calibration run relies on.
 * `source` is only used for labelling, so its type is not enforced.
 */
interface HfSample {
    source: unknown;
    audio_base64: string;
    original_text: string;
}

/** Per-sample record stored under `samples` in the stats file. */
interface CalibrationResult {
    source: unknown;
    index: number;
    hfOriginalText: string;
    transcribedText: string;
    normalizedText: string;
    confidenceScore: number;
    rawWER: number;
    normalizedWER: number;
    status: ConfidenceStatus;
}

/** Buckets a confidence score: <= 50 is RED, <= 80 is ORANGE, anything higher is GREEN. */
function classifyConfidence(confidence: number): ConfidenceStatus {
    if (confidence <= 50) return 'RED';
    if (confidence <= 80) return 'ORANGE';
    return 'GREEN';
}

/** Checks that a parsed JSON value carries the string fields the run needs. */
function isHfSample(value: unknown): value is HfSample {
    if (typeof value !== 'object' || value === null) return false;
    const candidate = value as Record<string, unknown>;
    return typeof candidate.audio_base64 === 'string' && typeof candidate.original_text === 'string';
}

/** Extracts a printable message from a caught value. */
function describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
}

/** Rounded percentage of `count` out of `total`; 0 when `total` is 0. */
function toPercentage(count: number, total: number): number {
    return total ? Math.round((count / total) * 100) : 0;
}

/**
 * Runs every sample in the samples file through `aiService.transcribeAudio`,
 * compares the raw and `normalizeWolof`-normalised transcripts against the
 * sample's reference text using WER, and writes the aggregate statistics plus
 * per-sample results to STATS_PATH.
 *
 * If the samples file does not exist, the Python fetch script is run first. If
 * the file is still missing afterwards, or does not contain a JSON array, an
 * error is logged and the function returns without writing statistics.
 *
 * Samples are transcribed one at a time. A sample that is malformed or whose
 * transcription throws is logged and excluded from the averages.
 *
 * @returns A promise that resolves once the run has finished or been abandoned.
 * @throws If the samples file cannot be read or is not valid JSON, or if the
 *         stats file cannot be written.
 */
export async function runCalibration(): Promise<void> {
    console.log("🚀 Starting Whisper Confidence Calibration Stress-Test...");

    // Recursive mkdir is a no-op when the directory already exists.
    fs.mkdirSync(DATA_DIR, { recursive: true });

    // Run Python fetching script if samples not already present
    if (!fs.existsSync(HF_SAMPLES_PATH)) {
        console.log("📥 Calling Python datasets library to download Hugging Face audio...");
        try {
            // Argument array instead of a shell string: paths containing spaces or
            // shell metacharacters are passed through verbatim.
            execFileSync('python3', [PY_SCRIPT, '--output', DATA_DIR], { stdio: 'inherit' });
        } catch (err: unknown) {
            console.error("❌ Python script failed to fetch samples. Please check HF_TOKEN or network.");
            console.error(describeError(err));
        }
    } else {
        console.log("♻️ Using cached Hugging Face samples...");
    }

    if (!fs.existsSync(HF_SAMPLES_PATH)) {
        console.error("❌ No samples mapped. Exiting calibration.");
        return;
    }

    const parsed: unknown = JSON.parse(fs.readFileSync(HF_SAMPLES_PATH, 'utf-8'));
    if (!Array.isArray(parsed)) {
        // Bail out rather than overwrite existing stats with an all-zero report.
        console.error(`❌ ${HF_SAMPLES_PATH} does not contain a JSON array. Exiting calibration.`);
        return;
    }
    const samples: unknown[] = parsed;

    console.log(`\n🎧 Processing ${samples.length} samples through Whisper STT...`);

    const results: CalibrationResult[] = [];
    const statusCounts: Record<ConfidenceStatus, number> = { RED: 0, ORANGE: 0, GREEN: 0 };
    let totalConfidence = 0;
    let totalRawWER = 0;
    let totalNormWER = 0;
    let totalProcessed = 0;
    let failedCount = 0;

    for (let i = 0; i < samples.length; i++) {
        const sample = samples[i];
        if (!isHfSample(sample)) {
            failedCount++;
            console.error(`Error processing sample ${i}: entry is missing string field "audio_base64" or "original_text"`);
            continue;
        }

        console.log(`[${sample.source} ${i + 1}/${samples.length}] Transcribing...`);
        try {
            const audioBuffer = Buffer.from(sample.audio_base64, 'base64');

            // Pass to Whisper
            const { text, confidence } = await aiService.transcribeAudio(audioBuffer, `sample_${i}.wav`, 'WOLOF');

            // Normalize
            const normResult = normalizeWolof(text);

            // Compute WER for both the raw and the normalised transcript
            const rawWER = calculateWER(sample.original_text, text);
            const normWER = calculateWER(sample.original_text, normResult.normalizedText);

            const status = classifyConfidence(confidence);
            statusCounts[status]++;
            totalConfidence += confidence;
            totalRawWER += rawWER;
            totalNormWER += normWER;
            totalProcessed++;

            results.push({
                source: sample.source,
                index: i,
                hfOriginalText: sample.original_text,
                transcribedText: text,
                normalizedText: normResult.normalizedText,
                confidenceScore: confidence,
                rawWER,
                normalizedWER: normWER,
                status
            });
        } catch (err: unknown) {
            failedCount++;
            console.error(`Error processing sample ${i} from ${sample.source}: ${describeError(err)}`);
        }
    }

    const averageConfidence = totalProcessed > 0 ? Math.round(totalConfidence / totalProcessed) : 0;
    const averageRawWER = totalProcessed > 0 ? totalRawWER / totalProcessed : 0;
    const averageNormalizedWER = totalProcessed > 0 ? totalNormWER / totalProcessed : 0;

    // Dictionary efficiency: relative WER improvement from normalisation, in percent.
    // Positive when normalisation lowers WER; left at 0 when the raw WER is 0.
    const dictionaryEfficiency =
        averageRawWER > 0 ? ((averageRawWER - averageNormalizedWER) / averageRawWER) * 100 : 0;

    const redCount = statusCounts.RED;
    const orangeCount = statusCounts.ORANGE;
    const greenCount = statusCounts.GREEN;

    const stats = {
        totalProcessed,
        averageConfidence,
        averageRawWER,
        averageNormalizedWER,
        dictionaryEfficiency,
        distribution: {
            red: { count: redCount, percentage: toPercentage(redCount, totalProcessed) },
            orange: { count: orangeCount, percentage: toPercentage(orangeCount, totalProcessed) },
            green: { count: greenCount, percentage: toPercentage(greenCount, totalProcessed) }
        },
        samples: results,
        updatedAt: new Date().toISOString()
    };

    fs.writeFileSync(STATS_PATH, JSON.stringify(stats, null, 2));

    console.log(`\n✅ Calibration finished! Stats saved to ${STATS_PATH}`);
    if (failedCount > 0) {
        console.warn(`Skipped ${failedCount} of ${samples.length} samples due to errors.`);
    }
    console.log(`Average Confidence: ${averageConfidence}%`);
    console.log(`Raw WER: ${(averageRawWER * 100).toFixed(2)}% | Normalized WER: ${(averageNormalizedWER * 100).toFixed(2)}%`);
    console.log(`Dictionary Efficiency Gain: ${dictionaryEfficiency.toFixed(2)}%`);
    console.log(`Red (<=50%): ${redCount} | Orange (51-80%): ${orangeCount} | Green (>80%): ${greenCount}`);
}
// Allow running directly from command line.
// The guard restricts this to the case where this file is the process entry
// point: without it, importing `runCalibration` from another module would start
// a calibration run and then terminate the importing process via process.exit().
if (require.main === module) {
    runCalibration()
        .then(() => process.exit(0))
        .catch((err: unknown) => {
            console.error(err);
            process.exit(1);
        });
}