Spaces:

safetrack
/

edtech

Running

App Files Files Community

CognxSafeTrack committed on Mar 6

Commit

672b517

1 Parent(s): 3b473c3

feat(ai): integrate Whisper STT confidence score for auto-validation

Browse files

Files changed (6) hide show

apps/api/src/routes/ai.ts +2 -2
apps/api/src/services/ai/index.ts +2 -2
apps/api/src/services/ai/mock-provider.ts +2 -2
apps/api/src/services/ai/openai-provider.ts +13 -3
apps/api/src/services/ai/types.ts +6 -1
apps/whatsapp-worker/src/index.ts +36 -14

apps/api/src/routes/ai.ts CHANGED Viewed

@@ -113,13 +113,13 @@ export async function aiRoutes(fastify: FastifyInstance) {
         try {
             const { buffer: audioToTranscribe, format } = await convertToMp3IfNeeded(buffer, filename);
             console.log(`[AI] Calling transcribeAudio for format: ${format} (Lang: ${language || 'none'})`);
-            const text = await aiService.transcribeAudio(audioToTranscribe, `message.${format}`, language);
             // 🌟 STT Hardening: Basic quality check 🌟
             // Include common punctuation: , . ! ? ' -
             const isSuspect = text.length < 3 || /[^a-zA-Z0-9\sàâäéèêëîïôöùûüçÀÂÄÉÈÊËÎÏÔÖÙÛÜÇ,.!?'\-]/.test(text.slice(0, 10));
-            return { success: true, text, isSuspect };
         } catch (err: any) {
             console.error(`[AI] ❌ Transcription error:`, err);
             if (err?.name === 'QuotaExceededError') {

         try {
             const { buffer: audioToTranscribe, format } = await convertToMp3IfNeeded(buffer, filename);
             console.log(`[AI] Calling transcribeAudio for format: ${format} (Lang: ${language || 'none'})`);
+            const { text, confidence } = await aiService.transcribeAudio(audioToTranscribe, `message.${format}`, language);
             // 🌟 STT Hardening: Basic quality check 🌟
             // Include common punctuation: , . ! ? ' -
             const isSuspect = text.length < 3 || /[^a-zA-Z0-9\sàâäéèêëîïôöùûüçÀÂÄÉÈÊËÎÏÔÖÙÛÜÇ,.!?'\-]/.test(text.slice(0, 10));
+            return { success: true, text, confidence, isSuspect };
         } catch (err: any) {
             console.error(`[AI] ❌ Transcription error:`, err);
             if (err?.name === 'QuotaExceededError') {

apps/api/src/services/ai/index.ts CHANGED Viewed

@@ -217,9 +217,9 @@ Wolof v4.0 si WOLOF : ñ (Waññi, Ñàkk), ë (Jënd), é (Liggéey). FCFA pour
     }
     /**
-     * Transcribes an audio buffer to text (useful for Wolof/FR voice messages).
      */
-    async transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<string> {
         return this.provider.transcribeAudio(audioBuffer, filename, language);
     }

     }
     /**
+     * Transcribes an audio buffer to text (useful for Wolof/FR voice messages) and returns the text together with a confidence score.
      */
+    async transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<{ text: string, confidence: number }> {
         return this.provider.transcribeAudio(audioBuffer, filename, language);
     }

apps/api/src/services/ai/mock-provider.ts CHANGED Viewed

@@ -50,9 +50,9 @@ export class MockLLMProvider implements LLMProvider {
         throw new Error("MockLLMProvider does not support this schema.");
     }
-    async transcribeAudio(_audioBuffer: Buffer, filename: string): Promise<string> {
         console.log(`[MOCK LLM] Transcribing audio from ${filename}...`);
-        return "INSCRIPTION";
     }
     async generateSpeech(text: string): Promise<Buffer> {

         throw new Error("MockLLMProvider does not support this schema.");
     }
+    async transcribeAudio(_audioBuffer: Buffer, filename: string): Promise<{ text: string, confidence: number }> {
         console.log(`[MOCK LLM] Transcribing audio from ${filename}...`);
+        return { text: "INSCRIPTION", confidence: 100 };
     }
     async generateSpeech(text: string): Promise<Buffer> {

apps/api/src/services/ai/openai-provider.ts CHANGED Viewed

@@ -61,7 +61,7 @@ export class OpenAIProvider implements LLMProvider {
         }
     }
-    async transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<string> {
         console.log(`[OPENAI] Transcribing audio file ${filename} (hint: ${language || 'none'})...`);
         try {
@@ -71,8 +71,18 @@ export class OpenAIProvider implements LLMProvider {
                 file: file,
                 model: 'whisper-1',
                 language: language === 'WOLOF' ? 'fr' : (language?.toLowerCase() || undefined), // Hint 'fr' for Wolof as it often helps Whisper with the mixed context
-            });
-            return response.text;
         } catch (err: any) {
             console.error('[OPENAI] ❌ Connection or API Error:', {
                 name: err?.name,

         }
     }
+    async transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<{ text: string; confidence: number }> {
         console.log(`[OPENAI] Transcribing audio file ${filename} (hint: ${language || 'none'})...`);
         try {
                 file: file,
                 model: 'whisper-1',
                 language: language === 'WOLOF' ? 'fr' : (language?.toLowerCase() || undefined), // Hint 'fr' for Wolof as it often helps Whisper with the mixed context
+                response_format: 'verbose_json',
+            }) as any;
+            // Calculate confidence from avg_logprob if available, otherwise default to 100
+            let confidence = 100;
+            if (response.segments && response.segments.length > 0) {
+                const totalLogprob = response.segments.reduce((acc: number, seg: any) => acc + (seg.avg_logprob || 0), 0);
+                const avgLogprob = totalLogprob / response.segments.length;
+                confidence = Math.max(0, Math.min(100, Math.round(Math.exp(avgLogprob) * 100)));
+            }
+            return { text: response.text, confidence };
         } catch (err: any) {
             console.error('[OPENAI] ❌ Connection or API Error:', {
                 name: err?.name,

apps/api/src/services/ai/types.ts CHANGED Viewed

@@ -1,9 +1,14 @@
 import { z } from 'zod';
 // Base interface for all LLM Providers
 export interface LLMProvider {
     generateStructuredData<T>(prompt: string, schema: z.ZodSchema<T>): Promise<T>;
-    transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<string>;
     generateSpeech(text: string): Promise<Buffer>;
 }

 import { z } from 'zod';
+export interface TranscriptionResult {
+    text: string;
+    confidence: number;
+}
 // Base interface for all LLM Providers
 export interface LLMProvider {
     generateStructuredData<T>(prompt: string, schema: z.ZodSchema<T>): Promise<T>;
+    transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<TranscriptionResult>;
     generateSpeech(text: string): Promise<Buffer>;
 }

apps/whatsapp-worker/src/index.ts CHANGED Viewed

@@ -61,18 +61,8 @@ const worker = new Worker('whatsapp-queue', async (job: Job) => {
             console.log(`[WORKER] Generating expert feedback for User ${userId}`);
-            // 🚨 HUMAN-IN-THE-LOOP INTERCEPTION
-            // If the user's language is WOLOF, we pause AI interpretation and wait for an Admin Review
-            if (language === 'WOLOF') {
-                console.log(`[WORKER] Intercepting WOLOF audio for User ${userId}. Shifting to PENDING_REVIEW.`);
-                await prisma.userProgress.upsert({
-                    where: { userId_trackId: { userId, trackId } },
-                    update: { exerciseStatus: 'PENDING_REVIEW' as any },
-                    create: { userId, trackId, exerciseStatus: 'PENDING_REVIEW' as any }
-                });
-                await sendTextMessage(user.phone, "🎙️ Nyangi jaxas sa kàddu. Xamle dina la tontu ci kanam ! (En cours d'analyse)");
-                return; // Stop job execution, wait for Admin
-            }
             const AI_API_BASE_URL = getApiUrl();
             const apiKey = getAdminApiKey();
@@ -461,6 +451,7 @@ const worker = new Worker('whatsapp-queue', async (job: Job) => {
                 if (transcribeRes.ok) {
                     const data = await transcribeRes.json() as any;
                     const isSuspect = data.isSuspect || false;
                     transcribedText = data.text || '';
                     const user = await prisma.user.findFirst({ where: { phone } });
@@ -475,14 +466,45 @@ const worker = new Worker('whatsapp-queue', async (job: Job) => {
                         console.log(`[STT] Normalized: "${originalText}" -> "${transcribedText}"`);
                         // Soft Feedback UI
-                        await sendTextMessage(phone, `Ma dégg na: "${transcribedText}" ✅`);
                         if (normResult.changes.length > 0) {
                             const limitedChanges = normResult.changes.slice(0, 2).join(", ");
                             await sendTextMessage(phone, `Nataal bu gën: ${limitedChanges}`);
                         }
                     }
-                    console.log(`[STT] transcribe result="${transcribedText.substring(0, 80)}" (suspect=${isSuspect})`);
                     // 🌟 STT Hardening: Handle suspect transcription 🌟
                     if (isSuspect && user) {

             console.log(`[WORKER] Generating expert feedback for User ${userId}`);
+            // 🚨 HUMAN-IN-THE-LOOP (Moved to download-media based on Whisper Confidence)
+            // If the user's language is WOLOF and execution reaches here, either the Whisper confidence was > 80% or an Admin manually overrode it.
             const AI_API_BASE_URL = getApiUrl();
             const apiKey = getAdminApiKey();
                 if (transcribeRes.ok) {
                     const data = await transcribeRes.json() as any;
                     const isSuspect = data.isSuspect || false;
+                    const confidence = data.confidence || 0;
                     transcribedText = data.text || '';
                     const user = await prisma.user.findFirst({ where: { phone } });
                         console.log(`[STT] Normalized: "${originalText}" -> "${transcribedText}"`);
                         // Soft Feedback UI
+                        await sendTextMessage(phone, `Ma dégg na: "${transcribedText}" ✅\n(Confiance STT: ${confidence}%)`);
                         if (normResult.changes.length > 0) {
                             const limitedChanges = normResult.changes.slice(0, 2).join(", ");
                             await sendTextMessage(phone, `Nataal bu gën: ${limitedChanges}`);
                         }
+                        // 🚨 Wolof Auto-Validation Logic (Target: > 80%) 🚨
+                        if (confidence <= 80) {
+                            console.log(`[STT] Whisper Confidence (${confidence}%) <= 80. Intercepting WOLOF audio for User ${user.id}. Shifting to PENDING_REVIEW.`);
+                            // First, make sure there is an active enrollment to find the trackId
+                            const activeEnrollment = await prisma.enrollment.findFirst({
+                                where: { userId: user.id, status: 'ACTIVE' },
+                                include: { track: true }
+                            });
+                            if (activeEnrollment) {
+                                await prisma.userProgress.upsert({
+                                    where: { userId_trackId: { userId: user.id, trackId: activeEnrollment.trackId } },
+                                    update: { exerciseStatus: 'PENDING_REVIEW' as any, adminTranscription: transcribedText },
+                                    create: { userId: user.id, trackId: activeEnrollment.trackId, exerciseStatus: 'PENDING_REVIEW' as any, adminTranscription: transcribedText }
+                                });
+                                await sendTextMessage(phone, "🎙️ Nyangi jaxas sa kàddu. Xamle dina la tontu ci kanam ! (En cours d'analyse par l'équipe)");
+                            } else {
+                                // Edge case: not enrolled but sent audio...
+                                await sendTextMessage(phone, "Dama jaxaso ci li nga wax... Mën nga ko waxaat ndànk ?");
+                            }
+                            // Still save the transcription on the inbound message (matched by its audio URL) so the admin can read it!
+                            await prisma.message.updateMany({
+                                where: { userId: user.id, direction: 'INBOUND', mediaUrl: audioUrl },
+                                data: { content: transcribedText }
+                            }).catch(() => { });
+                            return; // Stop here, WAIT FOR ADMIN OVERRIDE
+                        }
                     }
+                    console.log(`[STT] transcribe result="${transcribedText.substring(0, 80)}" (suspect=${isSuspect}, confidence=${confidence}%)`);
                     // 🌟 STT Hardening: Handle suspect transcription 🌟
                     if (isSuspect && user) {