File size: 6,396 Bytes
181ff6e
ef0913c
 
181ff6e
b6de1ea
ef0913c
 
 
181ff6e
 
 
 
ef0913c
b6de1ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef0913c
 
 
181ff6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef0913c
 
 
 
 
b6de1ea
 
ef0913c
 
 
181ff6e
 
 
 
ef0913c
181ff6e
 
 
 
 
 
 
b6de1ea
 
 
 
181ff6e
b6de1ea
 
181ff6e
 
 
 
 
 
 
 
 
 
 
 
 
b6de1ea
 
181ff6e
 
 
d9879cf
 
ef0913c
 
 
 
b6de1ea
 
 
 
 
 
 
 
 
ef0913c
 
 
 
b6de1ea
 
 
ef0913c
 
 
 
 
 
 
 
 
 
 
 
b6de1ea
 
ef0913c
 
 
 
181ff6e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import 'dotenv/config';
import fs from 'fs';
import path from 'path';
import { execFileSync, execSync } from 'child_process';
import levenshtein from 'fast-levenshtein';
import { aiService } from '../services/ai';
import { normalizeWolof } from './normalizeWolof';

// Directory holding the calibration inputs and outputs, two levels above this module.
const DATA_DIR = path.join(__dirname, '..', '..', 'data');

// JSON report written at the end of a calibration run.
const STATS_PATH = path.join(DATA_DIR, 'calibration_stats.json');

// Cached samples file; its presence decides whether the Python fetch script is run.
const HF_SAMPLES_PATH = path.join(DATA_DIR, 'hf_samples.json');

// Python helper script located next to this module.
const PY_SCRIPT = path.join(__dirname, 'fetch_hf_audio.py');

/**
 * Computes the Word Error Rate (WER) between a reference and a hypothesis transcript.
 *
 * WER = (substitutions + deletions + insertions) / number of reference words, where the
 * edit count is the Levenshtein distance taken over word tokens. Before comparison both
 * inputs are lower-cased, stripped of punctuation and split on whitespace.
 *
 * @param reference - Ground-truth transcript.
 * @param hypothesis - Transcript being scored.
 * @returns The WER as a fraction: 0 means identical word sequences, and the value can
 *          exceed 1 when the hypothesis contains many extra words. When the reference has
 *          no words, returns 0 for an empty hypothesis and 1 otherwise, so that spurious
 *          output is not reported as a perfect match.
 */
function calculateWER(reference: string, hypothesis: string): number {
    // Punctuation removed before tokenising. Includes '?', quotes, apostrophes and brackets
    // so that e.g. "def?" and "def" (or "d'accord" and "d’accord") compare as equal.
    const punctuation = /[.,\/#!?$%^&*;:{}=\-_`~()[\]"'«»“”‘’…¿¡]/g;

    const tokenize = (text: string): string[] =>
        text
            .toLowerCase()
            .replace(punctuation, '')
            .split(/\s+/)
            .filter(word => word.length > 0);

    const refWords = tokenize(reference);
    const hypWords = tokenize(hypothesis);

    // Division by the reference length is undefined here; treat any hypothesised word as a full error.
    if (refWords.length === 0) {
        return hypWords.length === 0 ? 0 : 1;
    }

    // fast-levenshtein works on strings, so each distinct word is mapped to its own
    // single character (starting in the Private Use Area) and the two word sequences
    // are compared as strings of those characters.
    const symbols = new Map<string, string>();
    let nextCode = 0xE000;

    const encode = (words: string[]): string =>
        words
            .map(word => {
                let symbol = symbols.get(word);
                if (symbol === undefined) {
                    symbol = String.fromCharCode(nextCode++);
                    symbols.set(word, symbol);
                }
                return symbol;
            })
            .join('');

    const distance = levenshtein.get(encode(refWords), encode(hypWords));
    return distance / refWords.length;
}

/**
 * One entry of the cached samples file.
 * NOTE(review): field types are assumed from how the fields are used in runCalibration
 * (base64 decoding, WER input, log labels); entries are not validated field-by-field.
 */
interface CalibrationSample {
    source: string;
    audio_base64: string;
    original_text: string;
}

type ConfidenceStatus = 'RED' | 'ORANGE' | 'GREEN';

/** Buckets a confidence score: <= 50 is RED, <= 80 is ORANGE, anything higher is GREEN. */
function classifyConfidence(confidence: number): ConfidenceStatus {
    if (confidence <= 50) return 'RED';
    if (confidence <= 80) return 'ORANGE';
    return 'GREEN';
}

/**
 * Runs the calibration pass over the cached Hugging Face samples.
 *
 * Steps:
 *  1. Ensures the data directory exists and, if the samples file is missing, runs the
 *     Python fetch script to create it.
 *  2. Transcribes every sample through `aiService.transcribeAudio`, normalizes the text
 *     with `normalizeWolof`, and scores raw and normalized transcripts with `calculateWER`.
 *  3. Writes aggregate statistics plus per-sample results to the stats file and logs a summary.
 *
 * A sample that fails to process is logged and skipped; it does not abort the run.
 * If the samples file is still missing after the fetch attempt, or does not contain a
 * JSON array, the function logs an error and returns without writing stats.
 *
 * @returns A promise that resolves once the stats file has been written (or the run was abandoned).
 * @throws If the samples file is not valid JSON or the stats file cannot be written.
 */
export async function runCalibration(): Promise<void> {
    console.log("🚀 Starting Whisper Confidence Calibration Stress-Test...");

    // Ensure data dir exists
    if (!fs.existsSync(DATA_DIR)) {
        fs.mkdirSync(DATA_DIR, { recursive: true });
    }

    // Run Python fetching script if samples not already present
    if (!fs.existsSync(HF_SAMPLES_PATH)) {
        console.log("📥 Calling Python datasets library to download Hugging Face audio...");
        try {
            // Arguments are passed as an array (no shell), so paths containing spaces or
            // shell metacharacters reach the script intact.
            execFileSync('python3', [PY_SCRIPT, '--output', DATA_DIR], { stdio: 'inherit' });
        } catch (e: unknown) {
            // Best effort: the existence check below decides whether the run can continue.
            console.error("❌ Python script failed to fetch samples. Please check HF_TOKEN or network.");
            console.error(e instanceof Error ? e.message : String(e));
        }
    } else {
        console.log("♻️ Using cached Hugging Face samples...");
    }

    if (!fs.existsSync(HF_SAMPLES_PATH)) {
        console.error("❌ No samples mapped. Exiting calibration.");
        return;
    }

    const parsed: unknown = JSON.parse(fs.readFileSync(HF_SAMPLES_PATH, 'utf-8'));
    if (!Array.isArray(parsed)) {
        // Refuse to overwrite existing stats with an empty report built from malformed input.
        console.error("❌ Samples file does not contain a JSON array. Exiting calibration.");
        return;
    }
    const samples = parsed as CalibrationSample[];
    console.log(`\n🎧 Processing ${samples.length} samples through Whisper STT...`);

    const results: Array<{
        source: string;
        index: number;
        hfOriginalText: string;
        transcribedText: string;
        normalizedText: string;
        confidenceScore: number;
        rawWER: number;
        normalizedWER: number;
        status: ConfidenceStatus;
    }> = [];
    const statusCounts: Record<ConfidenceStatus, number> = { RED: 0, ORANGE: 0, GREEN: 0 };
    let totalConfidence = 0;
    let totalRawWER = 0;
    let totalNormWER = 0;
    let totalProcessed = 0;

    for (let i = 0; i < samples.length; i++) {
        const sample = samples[i];
        console.log(`[${sample.source} ${i + 1}/${samples.length}] Transcribing...`);

        try {
            const audioBuffer = Buffer.from(sample.audio_base64, 'base64');
            // Pass to Whisper
            const { text, confidence } = await aiService.transcribeAudio(audioBuffer, `sample_${i}.wav`, 'WOLOF');

            // Normalize
            const normResult = normalizeWolof(text);

            // Compute WER against the dataset's reference text, before and after normalization
            const rawWER = calculateWER(sample.original_text, text);
            const normWER = calculateWER(sample.original_text, normResult.normalizedText);

            const status = classifyConfidence(confidence);

            totalConfidence += confidence;
            totalRawWER += rawWER;
            totalNormWER += normWER;
            totalProcessed++;
            statusCounts[status]++;

            results.push({
                source: sample.source,
                index: i,
                hfOriginalText: sample.original_text,
                transcribedText: text,
                normalizedText: normResult.normalizedText,
                confidenceScore: confidence,
                rawWER,
                normalizedWER: normWER,
                status
            });

        } catch (err: unknown) {
            // A failing sample is reported and excluded from the totals; the run continues.
            const reason = err instanceof Error ? err.message : String(err);
            console.error(`Error processing sample ${i} from ${sample.source}: ${reason}`);
        }
    }

    const averageConfidence = totalProcessed > 0 ? Math.round(totalConfidence / totalProcessed) : 0;
    const averageRawWER = totalProcessed > 0 ? totalRawWER / totalProcessed : 0;
    const averageNormalizedWER = totalProcessed > 0 ? totalNormWER / totalProcessed : 0;

    // Calculate Dictionary Efficiency (Improvement in WER relative to Raw WER).
    // Positive when normalization lowers the WER; left at 0 when the raw WER is 0.
    let dictionaryEfficiency = 0;
    if (averageRawWER > 0) {
        dictionaryEfficiency = ((averageRawWER - averageNormalizedWER) / averageRawWER) * 100;
    }

    // Share of processed samples in a bucket, as a whole percentage (0 when nothing was processed).
    const percentageOf = (count: number): number =>
        totalProcessed ? Math.round((count / totalProcessed) * 100) : 0;

    const redCount = statusCounts.RED;
    const orangeCount = statusCounts.ORANGE;
    const greenCount = statusCounts.GREEN;

    const stats = {
        totalProcessed,
        averageConfidence,
        averageRawWER,
        averageNormalizedWER,
        dictionaryEfficiency,
        distribution: {
            red: { count: redCount, percentage: percentageOf(redCount) },
            orange: { count: orangeCount, percentage: percentageOf(orangeCount) },
            green: { count: greenCount, percentage: percentageOf(greenCount) }
        },
        samples: results,
        updatedAt: new Date().toISOString()
    };

    fs.writeFileSync(STATS_PATH, JSON.stringify(stats, null, 2));
    console.log(`\n✅ Calibration finished! Stats saved to ${STATS_PATH}`);
    console.log(`Average Confidence: ${averageConfidence}%`);
    console.log(`Raw WER: ${(averageRawWER * 100).toFixed(2)}% | Normalized WER: ${(averageNormalizedWER * 100).toFixed(2)}%`);
    console.log(`Dictionary Efficiency Gain: ${dictionaryEfficiency.toFixed(2)}%`);
    console.log(`Red (<=50%): ${redCount} | Orange (51-80%): ${orangeCount} | Green (>80%): ${greenCount}`);
}

// Entry point: starts the calibration when this module is evaluated, then exits the
// process with status 0 on success, or logs the error and exits with status 1 on failure.
// NOTE(review): this executes on import as well as from the command line — confirm that is intended.
void (async () => {
    try {
        await runCalibration();
    } catch (failure) {
        console.error(failure);
        process.exit(1);
    }
    process.exit(0);
})();