Spaces:

safetrack
/

edtech

Sleeping

edtech / apps /api /src /scripts /calibrate-whisper.ts

CognxSafeTrack

chore: execute Sprint 38 technical debt resolution (Type Safety, Zod validation, Vitest, Mock LLM extracted)

d9879cf 8 days ago

6.4 kB

	import 'dotenv/config';
	import fs from 'fs';
	import path from 'path';
	import { execSync } from 'child_process';
	import levenshtein from 'fast-levenshtein';
	import { aiService } from '../services/ai';
	import { normalizeWolof } from './normalizeWolof';

	// Filesystem locations used by the calibration run, all anchored at this script's directory.
	// DATA_DIR is the working directory for inputs and outputs, two levels above this file.
	const DATA_DIR = path.join(__dirname, '..', '..', 'data');
	// Python helper that is invoked to download the audio samples.
	const PY_SCRIPT = path.join(__dirname, 'fetch_hf_audio.py');
	// Input: the cached samples file that the calibration reads.
	const HF_SAMPLES_PATH = path.join(DATA_DIR, 'hf_samples.json');
	// Output: the aggregated calibration statistics.
	const STATS_PATH = path.join(DATA_DIR, 'calibration_stats.json');

	/**
	 * Characters removed before tokenizing text for WER scoring.
	 * Besides the basic ASCII marks this includes `?`, quotes, brackets, dashes and the
	 * common typographic variants, so that "def?" and "def" compare as the same word.
	 */
	const WER_PUNCTUATION = /[.,/#!?$%^&*;:{}=\-_`~()[\]"'«»“”‘’…–—¡¿]/g;

	/**
	 * Normalizes text for WER scoring: lower-cases it, strips punctuation and splits on
	 * whitespace, dropping empty tokens.
	 */
	function tokenizeForWER(text: string): string[] {
	    return text
	        .toLowerCase()
	        .replace(WER_PUNCTUATION, '')
	        .split(/\s+/)
	        .filter((word) => word.length > 0);
	}

	/**
	 * Computes Word Error Rate (WER) using Levenshtein distance on words.
	 * WER = (Substitutions + Deletions + Insertions) / Total Reference Words
	 *
	 * Both inputs are compared case-insensitively with punctuation removed.
	 *
	 * @param reference  The ground-truth transcript.
	 * @param hypothesis The transcript being scored.
	 * @returns The word-level edit distance divided by the number of reference words.
	 *          The value can exceed 1 when the hypothesis contains many extra words.
	 *          Returns 0 when the reference has no words, whatever the hypothesis contains.
	 */
	function calculateWER(reference: string, hypothesis: string): number {
	    const refWords = tokenizeForWER(reference);
	    const hypWords = tokenizeForWER(hypothesis);

	    // Nothing to divide by: an empty reference is scored as 0.
	    if (refWords.length === 0) return 0;

	    // fast-levenshtein compares strings, so every distinct word is mapped to a single
	    // character. The string edit distance then equals the word-level edit distance.
	    const wordToChar = new Map<string, string>();
	    let nextCharCode = 0xE000; // Start of the Unicode Private Use Area.

	    const encodeWord = (word: string): string => {
	        let encoded = wordToChar.get(word);
	        if (encoded === undefined) {
	            encoded = String.fromCharCode(nextCharCode++);
	            wordToChar.set(word, encoded);
	        }
	        return encoded;
	    };

	    const refChars = refWords.map((word) => encodeWord(word)).join('');
	    const hypChars = hypWords.map((word) => encodeWord(word)).join('');

	    return levenshtein.get(refChars, hypChars) / refWords.length;
	}

	/**
	 * One entry of the samples file, limited to the fields this script reads.
	 * NOTE(review): field types are inferred from how they are used below (base64 decoding,
	 * WER reference text, log label) — confirm against the Python fetch script.
	 */
	interface HfSample {
	    source: string;
	    audio_base64: string;
	    original_text: string;
	}

	type ConfidenceStatus = 'RED' | 'ORANGE' | 'GREEN';

	/** Buckets a confidence score: <= 50 is RED, <= 80 is ORANGE, anything else is GREEN. */
	function confidenceStatus(confidence: number): ConfidenceStatus {
	    if (confidence <= 50) return 'RED';
	    if (confidence <= 80) return 'ORANGE';
	    return 'GREEN';
	}

	/** Rounded integer percentage of `count` within `total`; 0 when `total` is 0. */
	function percentageOf(count: number, total: number): number {
	    return total ? Math.round((count / total) * 100) : 0;
	}

	/**
	 * Runs the Whisper confidence calibration.
	 *
	 * Steps:
	 *  1. Ensures the data directory exists and, if the samples file is missing, invokes the
	 *     Python helper to fetch it.
	 *  2. Transcribes every sample sequentially, normalizes the transcript and computes WER
	 *     for both the raw and the normalized text against the sample's original text.
	 *  3. Aggregates averages and the RED/ORANGE/GREEN confidence distribution and writes
	 *     them, together with the per-sample results, to the stats file.
	 *
	 * A sample that fails is logged and left out of the statistics; the run continues.
	 * If no samples file is available after the fetch attempt, the function logs an error
	 * and returns without writing stats.
	 *
	 * @throws If the samples file cannot be read or parsed, or does not contain a JSON array.
	 */
	export async function runCalibration(): Promise<void> {
	    console.log("🚀 Starting Whisper Confidence Calibration Stress-Test...");

	    // Ensure data dir exists
	    if (!fs.existsSync(DATA_DIR)) {
	        fs.mkdirSync(DATA_DIR, { recursive: true });
	    }

	    // Run Python fetching script if samples not already present
	    if (!fs.existsSync(HF_SAMPLES_PATH)) {
	        console.log("📥 Calling Python datasets library to download Hugging Face audio...");
	        try {
	            // Quote both paths so a checkout location containing spaces is not split
	            // into several shell arguments.
	            execSync(`python3 "${PY_SCRIPT}" --output "${DATA_DIR}"`, { stdio: 'inherit' });
	        } catch (e: unknown) {
	            console.error("❌ Python script failed to fetch samples. Please check HF_TOKEN or network.");
	            console.error(e instanceof Error ? e.message : String(e));
	        }
	    } else {
	        console.log("♻️ Using cached Hugging Face samples...");
	    }

	    if (!fs.existsSync(HF_SAMPLES_PATH)) {
	        console.error("❌ No samples mapped. Exiting calibration.");
	        return;
	    }

	    // Fail loudly on a malformed samples file instead of silently writing all-zero stats.
	    const parsed: unknown = JSON.parse(fs.readFileSync(HF_SAMPLES_PATH, 'utf-8'));
	    if (!Array.isArray(parsed)) {
	        throw new Error(`Expected ${HF_SAMPLES_PATH} to contain a JSON array of samples.`);
	    }
	    const samples = parsed as HfSample[];
	    console.log(`\n🎧 Processing ${samples.length} samples through Whisper STT...`);

	    const results = [];
	    const statusCounts: Record<ConfidenceStatus, number> = { RED: 0, ORANGE: 0, GREEN: 0 };
	    let totalConfidence = 0;
	    let totalRawWER = 0;
	    let totalNormWER = 0;
	    let totalProcessed = 0;

	    // Samples are transcribed one at a time, in file order.
	    for (const [i, sample] of samples.entries()) {
	        console.log(`[${sample.source} ${i + 1}/${samples.length}] Transcribing...`);

	        try {
	            const audioBuffer = Buffer.from(sample.audio_base64, 'base64');
	            // Pass to Whisper
	            const { text, confidence } = await aiService.transcribeAudio(audioBuffer, `sample_${i}.wav`, 'WOLOF');

	            // Normalize
	            const normResult = normalizeWolof(text);

	            // Compute WER
	            const rawWER = calculateWER(sample.original_text, text);
	            const normWER = calculateWER(sample.original_text, normResult.normalizedText);

	            // Classify once so the counters and the per-sample status cannot disagree.
	            const status = confidenceStatus(confidence);

	            totalConfidence += confidence;
	            totalRawWER += rawWER;
	            totalNormWER += normWER;
	            totalProcessed++;
	            statusCounts[status]++;

	            results.push({
	                source: sample.source,
	                index: i,
	                hfOriginalText: sample.original_text,
	                transcribedText: text,
	                normalizedText: normResult.normalizedText,
	                confidenceScore: confidence,
	                rawWER,
	                normalizedWER: normWER,
	                status
	            });
	        } catch (err: unknown) {
	            const reason = err instanceof Error ? err.message : String(err);
	            console.error(`Error processing sample ${i} from ${sample.source}: ${reason}`);
	        }
	    }

	    const averageConfidence = totalProcessed > 0 ? Math.round(totalConfidence / totalProcessed) : 0;
	    const averageRawWER = totalProcessed > 0 ? totalRawWER / totalProcessed : 0;
	    const averageNormalizedWER = totalProcessed > 0 ? totalNormWER / totalProcessed : 0;

	    // Calculate Dictionary Efficiency (Improvement in WER relative to Raw WER).
	    // Positive when normalization lowers the WER; left at 0 when the raw WER is 0.
	    let dictionaryEfficiency = 0;
	    if (averageRawWER > 0) {
	        dictionaryEfficiency = ((averageRawWER - averageNormalizedWER) / averageRawWER) * 100;
	    }

	    const redCount = statusCounts.RED;
	    const orangeCount = statusCounts.ORANGE;
	    const greenCount = statusCounts.GREEN;

	    const stats = {
	        totalProcessed,
	        averageConfidence,
	        averageRawWER,
	        averageNormalizedWER,
	        dictionaryEfficiency,
	        distribution: {
	            red: { count: redCount, percentage: percentageOf(redCount, totalProcessed) },
	            orange: { count: orangeCount, percentage: percentageOf(orangeCount, totalProcessed) },
	            green: { count: greenCount, percentage: percentageOf(greenCount, totalProcessed) }
	        },
	        samples: results,
	        updatedAt: new Date().toISOString()
	    };

	    fs.writeFileSync(STATS_PATH, JSON.stringify(stats, null, 2));
	    console.log(`\n✅ Calibration finished! Stats saved to ${STATS_PATH}`);
	    console.log(`Average Confidence: ${averageConfidence}%`);
	    console.log(`Raw WER: ${(averageRawWER * 100).toFixed(2)}% | Normalized WER: ${(averageNormalizedWER * 100).toFixed(2)}%`);
	    console.log(`Dictionary Efficiency Gain: ${dictionaryEfficiency.toFixed(2)}%`);
	    console.log(`Red (<=50%): ${redCount} | Orange (51-80%): ${orangeCount} | Green (>80%): ${greenCount}`);
	}

	// Script entry point: start the calibration when this module is evaluated and turn the
	// outcome into a process exit code (0 on success, 1 after logging any failure).
	// NOTE(review): this also runs when the module is only imported for `runCalibration` —
	// confirm whether a "run only when executed directly" guard is wanted.
	void (async () => {
	    try {
	        await runCalibration();
	        process.exit(0);
	    } catch (err) {
	        console.error(err);
	        process.exit(1);
	    }
	})();