CognxSafeTrack committed on
Commit
ef0913c
·
1 Parent(s): 5006493

feat(api): add whisper confidence calibration script and endpoint

Browse files
apps/api/src/routes/admin.ts CHANGED
@@ -234,6 +234,24 @@ export async function adminRoutes(fastify: FastifyInstance) {
234
  }
235
  });
236
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  // ══════════════════════════════════════════════════════════════════════════
238
  // TRACK DAYS CRUD
239
  // ══════════════════════════════════════════════════════════════════════════
 
234
  }
235
  });
236
 
237
// ── STT Quality Calibration Endpoint ───────────────────────────────────────
// GET /stats/confidence-distribution
// Serves the JSON report stored in data/calibration_stats.json.
//   200 → parsed contents of the report file
//   404 → the report file does not exist yet
//   500 → the file exists but could not be read or parsed
fastify.get('/stats/confidence-distribution', async (_req, reply) => {
  const fs = require('fs') as typeof import('fs');
  const path = require('path') as typeof import('path');
  const statsPath = path.join(__dirname, '../../data/calibration_stats.json');

  try {
    // A single asynchronous read keeps the event loop free and removes the
    // exists-then-read race: a missing file surfaces as ENOENT from the read.
    const raw = await fs.promises.readFile(statsPath, 'utf8');
    return JSON.parse(raw);
  } catch (err: unknown) {
    if ((err as NodeJS.ErrnoException | null)?.code === 'ENOENT') {
      return reply.code(404).send({ error: "Calibration not run yet", message: "Le fichier calibration_stats.json est manquant. Lancez runCalibration()." });
    }
    // Anything else (permission error, truncated/invalid JSON, ...) is a server fault.
    const message = err instanceof Error ? err.message : String(err);
    return reply.code(500).send({ error: message });
  }
});
254
+
255
  // ══════════════════════════════════════════════════════════════════════════
256
  // TRACK DAYS CRUD
257
  // ══════════════════════════════════════════════════════════════════════════
apps/api/src/scripts/calibrate-whisper.ts ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import { aiService } from '../services/ai';
4
+ import { normalizeWolof } from './normalizeWolof';
5
+
6
/** Absolute path of the JSON report, resolved relative to this module's directory. */
const STATS_PATH = path.resolve(__dirname, '../../data/calibration_stats.json');
7
+
8
/**
 * Downloads an audio file and returns its bytes.
 *
 * @param url - Location of the audio file.
 * @param timeoutMs - The request is aborted after this many milliseconds so
 *   that one stalled download cannot hang the caller indefinitely.
 * @returns The complete response body as a Node.js Buffer.
 * @throws Error when the server answers with a non-2xx status, or when the
 *   request fails or is aborted by the timeout.
 */
async function downloadAudioBuffer(url: string, timeoutMs = 30000): Promise<Buffer> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const res = await fetch(url, { signal: controller.signal });
    if (!res.ok) {
      throw new Error(`Failed to fetch audio from ${url} (HTTP ${res.status} ${res.statusText})`);
    }
    return Buffer.from(await res.arrayBuffer());
  } finally {
    // Always release the timer, otherwise it keeps the process alive.
    clearTimeout(timer);
  }
}
14
+
15
/**
 * Fetches the first `limit` rows of a dataset split from the Hugging Face
 * datasets-server `/rows` endpoint.
 *
 * @param dataset - Dataset identifier, e.g. "google/fleurs".
 * @param config - Dataset configuration / subset name.
 * @param split - Split name, e.g. "test".
 * @param limit - Number of rows to request, starting at offset 0.
 * @returns The `rows` array of the JSON response. Elements are left untyped
 *   because their schema differs between datasets.
 * @throws Error when the server answers with a non-2xx status or when the
 *   response body does not contain a `rows` array.
 */
async function fetchHuggingFaceRows(dataset: string, config: string, split: string, limit: number): Promise<any[]> {
  const url = `https://datasets-server.huggingface.co/rows?dataset=${encodeURIComponent(dataset)}&config=${encodeURIComponent(config)}&split=${encodeURIComponent(split)}&offset=0&length=${limit}`;
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`Failed to fetch dataset rows from ${url} (HTTP ${res.status} ${res.statusText})`);
  }

  const data: unknown = await res.json();
  const rows = (data as { rows?: unknown } | null)?.rows;
  // Fail here with a clear message instead of handing `undefined` to the caller.
  if (!Array.isArray(rows)) {
    throw new Error(`Unexpected response from ${url}: "rows" is not an array`);
  }
  return rows;
}
22
+
23
/** Traffic-light bucket derived from a transcription's confidence score. */
type ConfidenceStatus = 'RED' | 'ORANGE' | 'GREEN';

/** One transcribed sample as stored in the `samples` array of the report. */
interface CalibrationSample {
  source: string;
  index: number;
  originalText: string;
  normalizedText: string;
  confidenceScore: number;
  status: ConfidenceStatus;
}

/** Number of rows requested from each dataset's `test` split. */
const SAMPLES_PER_SOURCE = 25;

/** Maps a confidence score to its bucket: <=50 RED, <=80 ORANGE, otherwise GREEN. */
function classifyConfidence(confidence: number): ConfidenceStatus {
  if (confidence <= 50) return 'RED';
  if (confidence <= 80) return 'ORANGE';
  return 'GREEN';
}

/** Pulls the audio URL out of a dataset row; returns '' when none can be found. */
function extractAudioUrl(row: any): string {
  if (!row || typeof row !== 'object') return '';
  // The audio field is usually `audio`; it is either an array of `{ src }`
  // entries or a single `{ src }` object. Fall back to scanning every field.
  const audioData = row.audio || row.audio_data || Object.values(row).find((v: any) => v && v[0]?.src);
  if (Array.isArray(audioData) && audioData[0]?.src) return audioData[0].src;
  if (audioData?.src) return audioData.src;
  return '';
}

/**
 * Runs the Whisper confidence calibration.
 *
 * For every configured dataset it fetches SAMPLES_PER_SOURCE rows, downloads
 * each audio clip, transcribes it through `aiService.transcribeAudio`,
 * normalizes the text and buckets the confidence score. Aggregate statistics
 * and the per-sample results are written as JSON to STATS_PATH.
 *
 * Failures are best-effort: a sample that cannot be processed (download error,
 * transcription error, non-numeric confidence) is logged and skipped, and a
 * dataset that cannot be fetched is logged and skipped. The report is written
 * even when nothing could be processed.
 */
export async function runCalibration() {
  console.log("🚀 Starting Whisper Confidence Calibration Stress-Test...");

  const results: CalibrationSample[] = [];
  const counts: Record<ConfidenceStatus, number> = { RED: 0, ORANGE: 0, GREEN: 0 };
  let totalConfidence = 0;
  let totalProcessed = 0;

  const sources = [
    { dataset: 'mozilla-foundation/common_voice_17_0', config: 'wo', name: 'CommonVoice' },
    { dataset: 'google/fleurs', config: 'wo_sn', name: 'FLEURS' }
  ];

  for (const source of sources) {
    console.log(`\nFetching ${SAMPLES_PER_SOURCE} samples from ${source.name}...`);
    try {
      const rows = await fetchHuggingFaceRows(source.dataset, source.config, 'test', SAMPLES_PER_SOURCE);

      for (let i = 0; i < rows.length; i++) {
        const audioUrl = extractAudioUrl(rows[i]?.row);
        if (!audioUrl) {
          console.warn(`[SKIP] No audio URL found for row ${i} in ${source.name}`);
          continue;
        }

        console.log(`[${source.name} ${i + 1}/${rows.length}] Transcribing...`);

        try {
          const audioBuffer = await downloadAudioBuffer(audioUrl);
          // NOTE(review): the file name always ends in .mp3 regardless of the
          // real audio format — confirm transcribeAudio does not rely on it.
          const { text, confidence } = await aiService.transcribeAudio(audioBuffer, `sample_${i}.mp3`, 'WOLOF');

          // A NaN/undefined score would turn the running average into NaN and
          // be counted as GREEN, so such a sample is rejected instead.
          if (typeof confidence !== 'number' || !Number.isFinite(confidence)) {
            throw new Error(`transcription returned a non-numeric confidence (${String(confidence)})`);
          }

          const normResult = normalizeWolof(text);
          const status = classifyConfidence(confidence);

          totalConfidence += confidence;
          totalProcessed++;
          counts[status]++;

          results.push({
            source: source.name,
            index: i,
            originalText: text,
            normalizedText: normResult.normalizedText,
            confidenceScore: confidence,
            status
          });
        } catch (err: unknown) {
          const message = err instanceof Error ? err.message : String(err);
          console.error(`Error processing sample ${i} from ${source.name}: ${message}`);
        }
      }
    } catch (err: unknown) {
      const message = err instanceof Error ? err.message : String(err);
      console.error(`Failed to fetch or process ${source.name}: ${message}`);
    }
  }

  const averageConfidence = totalProcessed > 0 ? Math.round(totalConfidence / totalProcessed) : 0;
  const percentageOf = (count: number): number =>
    totalProcessed ? Math.round((count / totalProcessed) * 100) : 0;

  const stats = {
    totalProcessed,
    averageConfidence,
    distribution: {
      red: { count: counts.RED, percentage: percentageOf(counts.RED) },
      orange: { count: counts.ORANGE, percentage: percentageOf(counts.ORANGE) },
      green: { count: counts.GREEN, percentage: percentageOf(counts.GREEN) }
    },
    samples: results,
    updatedAt: new Date().toISOString()
  };

  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(path.dirname(STATS_PATH), { recursive: true });

  fs.writeFileSync(STATS_PATH, JSON.stringify(stats, null, 2));
  console.log(`\n✅ Calibration finished! Stats saved to ${STATS_PATH}`);
  console.log(`Average Confidence: ${averageConfidence}%`);
  console.log(`Red (<=50%): ${counts.RED} | Orange (51-80%): ${counts.ORANGE} | Green (>80%): ${counts.GREEN}`);
}
121
+
122
// Script entry point: when this module is executed directly (rather than
// imported), run the calibration and report the outcome through the exit code.
if (require.main === module) {
  void (async () => {
    try {
      await runCalibration();
      process.exit(0);
    } catch (err) {
      console.error(err);
      process.exit(1);
    }
  })();
}
apps/api/src/scripts/normalizeWolof.ts ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Wolof STT Normalizer Utility v2.0
3
+ * Includes change tracking and WhatsApp message shortening.
4
+ */
5
+
6
/**
 * Lookup table used by normalizeWolof(): maps a lower-cased, punctuation-free
 * token to the spelling that should replace it.
 *
 * Entries that map a word to itself (e.g. "sabu") record no change but still
 * route the token through the table.
 */
const NORMALIZATION_RULES: Record<string, string> = {
  "damae": "damay",
  "dama": "damay",
  "dma": "damay",
  "jai": "jaay",
  "jaai": "jaay",
  "jaye": "jaay",
  "jendi": "jënd",
  "fei": "fey",
  "fay": "fey",
  "yere": "yére",
  "yare": "yére",
  "sandwiche": "sandwich",
  "pan": "mburu",
  "cafe": "café",
  "sabu": "sabu",
  "omo": "omo",
  "patat": "patas",
  "ognon": "sooble",
  "riz": "ceeb",
  "yof": "Yoff",
  "dakar": "Dakar",
  "pikine": "Pikine",
  "guediawaye": "Guédiawaye",
  "keur": "kër",
  "ker": "kër",
  "sikarche": "ci kër",
  "sikarshe": "ci kër",
  "sikarce": "ci kër",
  "sikaarché": "ci kër",
  "quartier": "quartier",
  "banlieu": "banlieue",
  "si": "ci",
  "fane": "fan",
  "fana": "fan",
  "lana": "lan",
  "lanna": "lan",
  "nakka": "naka",
  "nakha": "naka",
  "niak": "ñàkk",
  "niakk": "ñàkk",
  "dencal": "denc", // requested dencal -> denc
  "limal": "lim",
  "ganee": "gañ",
  "gane": "gañ",
  "borom": "boroom",
  "xaalisou": "xaalis",
  "xaliss": "xaalis",
};
55
+
56
/** Place names whose canonical capitalization is restored by normalizeWolof(). */
const CAPITALIZED_PLACES = ["Yoff", "Dakar", "Pikine", "Guédiawaye"];
57
+
58
/**
 * Outcome of a normalizeWolof() call.
 */
export interface NormalizationResult {
  /** The text after all substitutions have been applied. */
  normalizedText: string;
  /** Substitutions that were made, formatted like ["damae -> damay", ...]. */
  changes: string[];
}
62
+
63
/** Splits a token into leading punctuation, core word and trailing punctuation. */
const WORD_EDGES = /^([^\p{L}\p{M}\p{N}]*)(.*?)([^\p{L}\p{M}\p{N}]*)$/u;

/** Characters removed from a token before it is looked up in the rule table. */
const LOOKUP_NOISE = /[.,/#!$%^&*;:{}=\-_`~()]/g;

/**
 * Normalizes a raw speech-to-text transcript.
 *
 * The text is trimmed and split on whitespace (any run of spaces, tabs or
 * newlines counts as one separator). Each token is lower-cased, stripped of
 * punctuation and looked up in NORMALIZATION_RULES, then in
 * CAPITALIZED_PLACES. A matching token is replaced; punctuation that
 * surrounded it (for example a trailing comma or question mark) is kept.
 * Finally the first character of the result is upper-cased.
 *
 * @param rawText - Transcript to normalize; empty input yields an empty result.
 * @returns The normalized text plus a de-duplicated list of the substitutions
 *   made, each formatted as "from -> to".
 */
export function normalizeWolof(rawText: string): NormalizationResult {
  if (!rawText) return { normalizedText: '', changes: [] };

  const changes: string[] = [];

  const processedWords = rawText.trim().split(/\s+/).map(word => {
    const [, lead = '', core = '', trail = ''] = WORD_EDGES.exec(word) ?? [];
    const key = core.toLowerCase().replace(LOOKUP_NOISE, "");
    if (!key) return word;

    // Own-property check: a plain index lookup would also find inherited
    // members such as "constructor" and crash on `.toLowerCase()`.
    if (Object.prototype.hasOwnProperty.call(NORMALIZATION_RULES, key)) {
      const replacement = NORMALIZATION_RULES[key];
      if (key !== replacement.toLowerCase()) {
        changes.push(`${key} -> ${replacement}`);
      }
      return lead + replacement + trail;
    }

    const matchingPlace = CAPITALIZED_PLACES.find(p => p.toLowerCase() === key);
    if (matchingPlace) {
      if (matchingPlace !== core) {
        changes.push(`${core} -> ${matchingPlace}`);
      }
      return lead + matchingPlace + trail;
    }

    return word;
  });

  let normalizedText = processedWords.join(" ");

  if (normalizedText.length > 0) {
    normalizedText = normalizedText.charAt(0).toUpperCase() + normalizedText.slice(1);
  }

  return { normalizedText, changes: Array.from(new Set(changes)) };
}