Spaces:

safetrack
/

edtech

Running

App Files Files Community

CognxSafeTrack committed on Mar 7

Commit

ef0913c

1 Parent(s): 5006493

feat(api): add whisper confidence calibration script and endpoint

Browse files

Files changed (3) hide show

apps/api/src/routes/admin.ts +18 -0
apps/api/src/scripts/calibrate-whisper.ts +128 -0
apps/api/src/scripts/normalizeWolof.ts +98 -0

apps/api/src/routes/admin.ts CHANGED Viewed

@@ -234,6 +234,24 @@ export async function adminRoutes(fastify: FastifyInstance) {
         }
     });
     // ══════════════════════════════════════════════════════════════════════════
     // TRACK DAYS CRUD
     // ══════════════════════════════════════════════════════════════════════════

         }
     });
+    // ── STT Quality Calibration Endpoint ───────────────────────────────────────
+    fastify.get('/stats/confidence-distribution', async (_req, reply) => {
+        const fs = require('fs');
+        const path = require('path');
+        const statsPath = path.join(__dirname, '../../data/calibration_stats.json');
+        try {
+            if (fs.existsSync(statsPath)) {
+                const data = JSON.parse(fs.readFileSync(statsPath, 'utf8'));
+                return data;
+            } else {
+                return reply.code(404).send({ error: "Calibration not run yet", message: "Le fichier calibration_stats.json est manquant. Lancez runCalibration()." });
+            }
+        } catch (err: any) {
+            return reply.code(500).send({ error: err.message });
+        }
+    });
     // ══════════════════════════════════════════════════════════════════════════
     // TRACK DAYS CRUD
     // ══════════════════════════════════════════════════════════════════════════

apps/api/src/scripts/calibrate-whisper.ts ADDED Viewed

	@@ -0,0 +1,128 @@

+import fs from 'fs';
+import path from 'path';
+import { aiService } from '../services/ai';
+import { normalizeWolof } from './normalizeWolof';
/** Location of the calibration report: `data/calibration_stats.json`, two directories above this script. */
const STATS_PATH = path.join(__dirname, '..', '..', 'data', 'calibration_stats.json');
/**
 * Fetches `url` and returns the whole response body as a Node `Buffer`.
 *
 * @param url - Address of the audio resource to download.
 * @returns The response body bytes.
 * @throws Error when the response is not OK (`res.ok` is false).
 */
async function downloadAudioBuffer(url: string): Promise<Buffer> {
    const response = await fetch(url);
    if (response.ok) {
        const bytes = await response.arrayBuffer();
        return Buffer.from(bytes);
    }
    throw new Error(`Failed to fetch audio from ${url}`);
}
/**
 * Requests the first `limit` rows (offset 0) of a dataset split from the
 * Hugging Face datasets-server `/rows` endpoint.
 *
 * @param dataset - Dataset identifier, e.g. `google/fleurs`.
 * @param config - Dataset configuration name.
 * @param split - Split name, e.g. `test`.
 * @param limit - Number of rows to request (sent as the `length` query parameter).
 * @returns The `rows` array of the JSON response. Elements are left untyped
 *          because this function does not inspect them.
 * @throws Error when the HTTP response is not OK, or when the JSON body has no
 *         `rows` array (previously this case returned `undefined` and failed
 *         later in the caller with an unrelated TypeError).
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- row schema varies per dataset
async function fetchHuggingFaceRows(dataset: string, config: string, split: string, limit: number): Promise<any[]> {
    const url = `https://datasets-server.huggingface.co/rows?dataset=${encodeURIComponent(dataset)}&config=${encodeURIComponent(config)}&split=${encodeURIComponent(split)}&offset=0&length=${limit}`;
    const res = await fetch(url);
    if (!res.ok) throw new Error(`Failed to fetch dataset rows from ${url}`);

    // Treat the payload as unknown until its shape has been checked.
    const payload: unknown = await res.json();
    const rows = typeof payload === 'object' && payload !== null
        ? (payload as { rows?: unknown }).rows
        : undefined;
    if (!Array.isArray(rows)) {
        throw new Error(`Unexpected response from ${url}: no "rows" array in payload`);
    }
    return rows;
}
/** Bucket a transcription falls into, based on its confidence score. */
type ConfidenceStatus = 'RED' | 'ORANGE' | 'GREEN';

/** Number of rows requested from each dataset. */
const SAMPLES_PER_SOURCE = 25;

/** Maps a confidence score to its bucket: <=50 is RED, <=80 is ORANGE, anything else GREEN. */
function classifyConfidence(confidence: number): ConfidenceStatus {
    if (confidence <= 50) return 'RED';
    if (confidence <= 80) return 'ORANGE';
    return 'GREEN';
}

/**
 * Finds the audio URL inside a dataset row, or returns '' when there is none.
 * Looks at `row.audio`, then `row.audio_data`, then any field whose first
 * element has a `src`; the value may be an array of `{ src }` or a single `{ src }`.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- row schema varies per dataset
function extractAudioUrl(row: any): string {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const audioData = row.audio || row.audio_data || Object.values(row).find((v: any) => v && v[0]?.src);
    if (Array.isArray(audioData) && audioData[0]?.src) {
        return audioData[0].src;
    }
    if (audioData?.src) {
        return audioData.src; // sometimes a direct object
    }
    return '';
}

/**
 * Runs the calibration pass: for each configured dataset, fetches
 * `SAMPLES_PER_SOURCE` rows of the `test` split, downloads each row's audio,
 * transcribes it with `aiService.transcribeAudio` (language `'WOLOF'`),
 * normalizes the text with `normalizeWolof`, and buckets the returned
 * confidence. The aggregated report is written as JSON to `STATS_PATH`.
 *
 * Failures are best-effort: an error on one sample is logged and that sample is
 * skipped; an error fetching or iterating a dataset is logged and the remaining
 * rows of that dataset are skipped. The report is written even when nothing
 * was processed.
 *
 * NOTE(review): confidence is treated as a 0-100 percentage (thresholds 50/80,
 * logged with "%") — confirm against `aiService.transcribeAudio`.
 */
export async function runCalibration(): Promise<void> {
    console.log("🚀 Starting Whisper Confidence Calibration Stress-Test...");
    const results = [];
    const counts: Record<ConfidenceStatus, number> = { RED: 0, ORANGE: 0, GREEN: 0 };
    let totalConfidence = 0;
    let totalProcessed = 0;
    const sources = [
        { dataset: 'mozilla-foundation/common_voice_17_0', config: 'wo', name: 'CommonVoice' },
        { dataset: 'google/fleurs', config: 'wo_sn', name: 'FLEURS' }
    ];

    for (const source of sources) {
        console.log(`\nFetching ${SAMPLES_PER_SOURCE} samples from ${source.name}...`);
        try {
            const rows = await fetchHuggingFaceRows(source.dataset, source.config, 'test', SAMPLES_PER_SOURCE);
            for (let i = 0; i < rows.length; i++) {
                const audioUrl = extractAudioUrl(rows[i].row);
                if (!audioUrl) {
                    console.warn(`[SKIP] No audio URL found for row ${i} in ${source.name}`);
                    continue;
                }
                console.log(`[${source.name} ${i + 1}/${rows.length}] Transcribing...`);
                try {
                    const audioBuffer = await downloadAudioBuffer(audioUrl);
                    const { text, confidence } = await aiService.transcribeAudio(audioBuffer, `sample_${i}.mp3`, 'WOLOF');
                    const normResult = normalizeWolof(text);

                    // Totals are only touched once every step above has succeeded,
                    // so a failed sample never skews the statistics.
                    const status = classifyConfidence(confidence);
                    totalConfidence += confidence;
                    totalProcessed++;
                    counts[status]++;
                    results.push({
                        source: source.name,
                        index: i,
                        originalText: text,
                        normalizedText: normResult.normalizedText,
                        confidenceScore: confidence,
                        status
                    });
                } catch (err: any) {
                    console.error(`Error processing sample ${i} from ${source.name}: ${err.message}`);
                }
            }
        } catch (err: any) {
            console.error(`Failed to fetch or process ${source.name}: ${err.message}`);
        }
    }

    // Share of processed samples, as a whole percentage; 0 when nothing was processed.
    const percentageOf = (count: number): number =>
        totalProcessed ? Math.round((count / totalProcessed) * 100) : 0;
    const averageConfidence = totalProcessed > 0 ? Math.round(totalConfidence / totalProcessed) : 0;
    const stats = {
        totalProcessed,
        averageConfidence,
        distribution: {
            red: { count: counts.RED, percentage: percentageOf(counts.RED) },
            orange: { count: counts.ORANGE, percentage: percentageOf(counts.ORANGE) },
            green: { count: counts.GREEN, percentage: percentageOf(counts.GREEN) }
        },
        samples: results,
        updatedAt: new Date().toISOString()
    };

    // `recursive: true` makes mkdirSync a no-op when the directory already exists.
    fs.mkdirSync(path.dirname(STATS_PATH), { recursive: true });
    fs.writeFileSync(STATS_PATH, JSON.stringify(stats, null, 2));
    console.log(`\n✅ Calibration finished! Stats saved to ${STATS_PATH}`);
    console.log(`Average Confidence: ${averageConfidence}%`);
    console.log(`Red (<=50%): ${counts.RED} | Orange (51-80%): ${counts.ORANGE} | Green (>80%): ${counts.GREEN}`);
}
// CLI entry point: when this file is executed directly (not imported), run the
// calibration and exit with status 0 on success, or log the error and exit 1.
if (require.main === module) {
    void (async () => {
        try {
            await runCalibration();
            process.exit(0);
        } catch (err) {
            console.error(err);
            process.exit(1);
        }
    })();
}

apps/api/src/scripts/normalizeWolof.ts ADDED Viewed

	@@ -0,0 +1,98 @@

/**
 * Wolof STT Normalizer Utility v2.0
 *
 * Rewrites known misspellings in transcribed text to a canonical form,
 * capitalizes known place names, and reports which substitutions were applied.
 */

/** Lower-case misspelling → canonical spelling. Looked up per word, ignoring case and punctuation. */
const NORMALIZATION_RULES: Record<string, string> = {
    "damae": "damay",
    "dama": "damay",
    "dma": "damay",
    "jai": "jaay",
    "jaai": "jaay",
    "jaye": "jaay",
    "jendi": "jënd",
    "fei": "fey",
    "fay": "fey",
    "yere": "yére",
    "yare": "yére",
    "sandwiche": "sandwich",
    "pan": "mburu",
    "cafe": "café",
    "sabu": "sabu",
    "omo": "omo",
    "patat": "patas",
    "ognon": "sooble",
    "riz": "ceeb",
    "yof": "Yoff",
    "dakar": "Dakar",
    "pikine": "Pikine",
    "guediawaye": "Guédiawaye",
    "keur": "kër",
    "ker": "kër",
    "sikarche": "ci kër",
    "sikarshe": "ci kër",
    "sikarce": "ci kër",
    "sikaarché": "ci kër",
    "quartier": "quartier",
    "banlieu": "banlieue",
    "si": "ci",
    "fane": "fan",
    "fana": "fan",
    "lana": "lan",
    "lanna": "lan",
    "nakka": "naka",
    "nakha": "naka",
    "niak": "ñàkk",
    "niakk": "ñàkk",
    "dencal": "denc", // requested dencal -> denc
    "limal": "lim",
    "ganee": "gañ",
    "gane": "gañ",
    "borom": "boroom",
    "xaalisou": "xaalis",
    "xaliss": "xaalis",
};

/** Place names that are always written with this exact capitalization. */
const CAPITALIZED_PLACES = ["Yoff", "Dakar", "Pikine", "Guédiawaye"];

// Punctuation ignored when matching a word against the tables above.
const PUNCTUATION_CLASS = '[.,/#!?$%^&*;:{}=\\-_`~()]';
const PUNCTUATION = new RegExp(PUNCTUATION_CLASS, 'g');
// Splits a word into (leading punctuation)(core)(trailing punctuation).
const WORD_PARTS = new RegExp(`^(${PUNCTUATION_CLASS}*)(.*?)(${PUNCTUATION_CLASS}*)$`);

export interface NormalizationResult {
    normalizedText: string;
    changes: string[]; // Format: ["damae -> damay", ...]
}

/**
 * Normalizes a transcription word by word.
 *
 * Each whitespace-separated word is matched (case-insensitively, punctuation
 * ignored) against `NORMALIZATION_RULES`, then against `CAPITALIZED_PLACES`.
 * Punctuation around a replaced word is kept ("dakar," becomes "Dakar,").
 * The first character of the result is upper-cased.
 *
 * @param rawText - Text to normalize; an empty/falsy value yields an empty result.
 * @returns The normalized text and the de-duplicated list of substitutions.
 *          A rule substitution is listed only when it differs from the word by
 *          more than letter case; a place-name fix is listed whenever the word
 *          was not already spelled exactly like the place.
 */
export function normalizeWolof(rawText: string): NormalizationResult {
    if (!rawText) return { normalizedText: '', changes: [] };

    const changes: string[] = [];
    // Split on any whitespace run so tabs and newlines separate words too.
    const processedWords = rawText.trim().split(/\s+/).map(word => {
        const parts = WORD_PARTS.exec(word);
        const leading = parts?.[1] ?? '';
        const core = parts?.[2] ?? word;
        const trailing = parts?.[3] ?? '';
        const key = core.toLowerCase().replace(PUNCTUATION, '');

        // Own-property check: a plain `RULES[key]` lookup would also find
        // Object.prototype members such as "constructor" and then crash.
        const replacement = Object.prototype.hasOwnProperty.call(NORMALIZATION_RULES, key)
            ? NORMALIZATION_RULES[key]
            : undefined;
        if (replacement !== undefined) {
            if (key !== replacement.toLowerCase()) {
                changes.push(`${key} -> ${replacement}`);
            }
            return leading + replacement + trailing;
        }

        const matchingPlace = CAPITALIZED_PLACES.find(p => p.toLowerCase() === key);
        if (matchingPlace) {
            if (matchingPlace !== core) {
                changes.push(`${core} -> ${matchingPlace}`);
            }
            return leading + matchingPlace + trailing;
        }
        return word;
    });

    let normalizedText = processedWords.join(" ");
    if (normalizedText.length > 0) {
        normalizedText = normalizedText.charAt(0).toUpperCase() + normalizedText.slice(1);
    }
    return { normalizedText, changes: Array.from(new Set(changes)) };
}