CognxSafeTrack committed on
Commit
672b517
·
1 Parent(s): 3b473c3

feat(ai): integrate Whisper STT confidence score for auto-validation

Browse files
apps/api/src/routes/ai.ts CHANGED
@@ -113,13 +113,13 @@ export async function aiRoutes(fastify: FastifyInstance) {
113
  try {
114
  const { buffer: audioToTranscribe, format } = await convertToMp3IfNeeded(buffer, filename);
115
  console.log(`[AI] Calling transcribeAudio for format: ${format} (Lang: ${language || 'none'})`);
116
- const text = await aiService.transcribeAudio(audioToTranscribe, `message.${format}`, language);
117
 
118
  // 🌟 STT Hardening: Basic quality check 🌟
119
  // Include common punctuation: , . ! ? ' -
120
  const isSuspect = text.length < 3 || /[^a-zA-Z0-9\sàâäéèêëîïôöùûüçÀÂÄÉÈÊËÎÏÔÖÙÛÜÇ,.!?'\-]/.test(text.slice(0, 10));
121
 
122
- return { success: true, text, isSuspect };
123
  } catch (err: any) {
124
  console.error(`[AI] ❌ Transcription error:`, err);
125
  if (err?.name === 'QuotaExceededError') {
 
113
  try {
114
  const { buffer: audioToTranscribe, format } = await convertToMp3IfNeeded(buffer, filename);
115
  console.log(`[AI] Calling transcribeAudio for format: ${format} (Lang: ${language || 'none'})`);
116
+ const { text, confidence } = await aiService.transcribeAudio(audioToTranscribe, `message.${format}`, language);
117
 
118
  // 🌟 STT Hardening: Basic quality check 🌟
119
  // Include common punctuation: , . ! ? ' -
120
  const isSuspect = text.length < 3 || /[^a-zA-Z0-9\sàâäéèêëîïôöùûüçÀÂÄÉÈÊËÎÏÔÖÙÛÜÇ,.!?'\-]/.test(text.slice(0, 10));
121
 
122
+ return { success: true, text, confidence, isSuspect };
123
  } catch (err: any) {
124
  console.error(`[AI] ❌ Transcription error:`, err);
125
  if (err?.name === 'QuotaExceededError') {
apps/api/src/services/ai/index.ts CHANGED
@@ -217,9 +217,9 @@ Wolof v4.0 si WOLOF : ñ (Waññi, Ñàkk), ë (Jënd), é (Liggéey). FCFA pour
217
  }
218
 
219
  /**
220
- * Transcribes an audio buffer to text (useful for Wolof/FR voice messages).
221
  */
222
- async transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<string> {
223
  return this.provider.transcribeAudio(audioBuffer, filename, language);
224
  }
225
 
 
217
  }
218
 
219
  /**
220
+ * Transcribes an audio buffer to text (useful for Wolof/FR voice messages) and returns the text together with a confidence score.
221
  */
222
+ async transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<{ text: string, confidence: number }> {
223
  return this.provider.transcribeAudio(audioBuffer, filename, language);
224
  }
225
 
apps/api/src/services/ai/mock-provider.ts CHANGED
@@ -50,9 +50,9 @@ export class MockLLMProvider implements LLMProvider {
50
  throw new Error("MockLLMProvider does not support this schema.");
51
  }
52
 
53
- async transcribeAudio(_audioBuffer: Buffer, filename: string): Promise<string> {
54
  console.log(`[MOCK LLM] Transcribing audio from ${filename}...`);
55
- return "INSCRIPTION";
56
  }
57
 
58
  async generateSpeech(text: string): Promise<Buffer> {
 
50
  throw new Error("MockLLMProvider does not support this schema.");
51
  }
52
 
53
+ async transcribeAudio(_audioBuffer: Buffer, filename: string): Promise<{ text: string, confidence: number }> {
54
  console.log(`[MOCK LLM] Transcribing audio from ${filename}...`);
55
+ return { text: "INSCRIPTION", confidence: 100 };
56
  }
57
 
58
  async generateSpeech(text: string): Promise<Buffer> {
apps/api/src/services/ai/openai-provider.ts CHANGED
@@ -61,7 +61,7 @@ export class OpenAIProvider implements LLMProvider {
61
  }
62
  }
63
 
64
- async transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<string> {
65
  console.log(`[OPENAI] Transcribing audio file ${filename} (hint: ${language || 'none'})...`);
66
 
67
  try {
@@ -71,8 +71,18 @@ export class OpenAIProvider implements LLMProvider {
71
  file: file,
72
  model: 'whisper-1',
73
  language: language === 'WOLOF' ? 'fr' : (language?.toLowerCase() || undefined), // Hint 'fr' for Wolof as it often helps Whisper with the mixed context
74
- });
75
- return response.text;
 
 
 
 
 
 
 
 
 
 
76
  } catch (err: any) {
77
  console.error('[OPENAI] ❌ Connection or API Error:', {
78
  name: err?.name,
 
61
  }
62
  }
63
 
64
+ async transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<{ text: string; confidence: number }> {
65
  console.log(`[OPENAI] Transcribing audio file ${filename} (hint: ${language || 'none'})...`);
66
 
67
  try {
 
71
  file: file,
72
  model: 'whisper-1',
73
  language: language === 'WOLOF' ? 'fr' : (language?.toLowerCase() || undefined), // Hint 'fr' for Wolof as it often helps Whisper with the mixed context
74
+ response_format: 'verbose_json',
75
+ }) as any;
76
+
77
+ // Derive a 0-100 confidence from the mean of the segments' avg_logprob (exp(mean) * 100, clamped); default to 100 when no segments are returned
78
+ let confidence = 100;
79
+ if (response.segments && response.segments.length > 0) {
80
+ const totalLogprob = response.segments.reduce((acc: number, seg: any) => acc + (seg.avg_logprob || 0), 0);
81
+ const avgLogprob = totalLogprob / response.segments.length;
82
+ confidence = Math.max(0, Math.min(100, Math.round(Math.exp(avgLogprob) * 100)));
83
+ }
84
+
85
+ return { text: response.text, confidence };
86
  } catch (err: any) {
87
  console.error('[OPENAI] ❌ Connection or API Error:', {
88
  name: err?.name,
apps/api/src/services/ai/types.ts CHANGED
@@ -1,9 +1,14 @@
1
  import { z } from 'zod';
2
 
 
 
 
 
 
3
  // Base interface for all LLM Providers
4
  export interface LLMProvider {
5
  generateStructuredData<T>(prompt: string, schema: z.ZodSchema<T>): Promise<T>;
6
- transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<string>;
7
  generateSpeech(text: string): Promise<Buffer>;
8
  }
9
 
 
1
  import { z } from 'zod';
2
 
3
+ export interface TranscriptionResult {
4
+ text: string;
5
+ confidence: number;
6
+ }
7
+
8
  // Base interface for all LLM Providers
9
  export interface LLMProvider {
10
  generateStructuredData<T>(prompt: string, schema: z.ZodSchema<T>): Promise<T>;
11
+ transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<TranscriptionResult>;
12
  generateSpeech(text: string): Promise<Buffer>;
13
  }
14
 
apps/whatsapp-worker/src/index.ts CHANGED
@@ -61,18 +61,8 @@ const worker = new Worker('whatsapp-queue', async (job: Job) => {
61
 
62
  console.log(`[WORKER] Generating expert feedback for User ${userId}`);
63
 
64
- // 🚨 HUMAN-IN-THE-LOOP INTERCEPTION
65
- // If the user's language is WOLOF, we pause AI interpretation and wait for an Admin Review
66
- if (language === 'WOLOF') {
67
- console.log(`[WORKER] Intercepting WOLOF audio for User ${userId}. Shifting to PENDING_REVIEW.`);
68
- await prisma.userProgress.upsert({
69
- where: { userId_trackId: { userId, trackId } },
70
- update: { exerciseStatus: 'PENDING_REVIEW' as any },
71
- create: { userId, trackId, exerciseStatus: 'PENDING_REVIEW' as any }
72
- });
73
- await sendTextMessage(user.phone, "🎙️ Nyangi jaxas sa kàddu. Xamle dina la tontu ci kanam ! (En cours d'analyse)");
74
- return; // Stop job execution, wait for Admin
75
- }
76
  const AI_API_BASE_URL = getApiUrl();
77
  const apiKey = getAdminApiKey();
78
 
@@ -461,6 +451,7 @@ const worker = new Worker('whatsapp-queue', async (job: Job) => {
461
  if (transcribeRes.ok) {
462
  const data = await transcribeRes.json() as any;
463
  const isSuspect = data.isSuspect || false;
 
464
  transcribedText = data.text || '';
465
 
466
  const user = await prisma.user.findFirst({ where: { phone } });
@@ -475,14 +466,45 @@ const worker = new Worker('whatsapp-queue', async (job: Job) => {
475
  console.log(`[STT] Normalized: "${originalText}" -> "${transcribedText}"`);
476
 
477
  // Soft Feedback UI
478
- await sendTextMessage(phone, `Ma dégg na: "${transcribedText}" ✅`);
479
  if (normResult.changes.length > 0) {
480
  const limitedChanges = normResult.changes.slice(0, 2).join(", ");
481
  await sendTextMessage(phone, `Nataal bu gën: ${limitedChanges}`);
482
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  }
484
 
485
- console.log(`[STT] transcribe result="${transcribedText.substring(0, 80)}" (suspect=${isSuspect})`);
486
 
487
  // 🌟 STT Hardening: Handle suspect transcription 🌟
488
  if (isSuspect && user) {
 
61
 
62
  console.log(`[WORKER] Generating expert feedback for User ${userId}`);
63
 
64
+ // 🚨 HUMAN-IN-THE-LOOP (Moved to download-media based on Whisper Confidence)
65
+ // If the user's language is WOLOF and execution reaches here, either the Whisper confidence was > 80% or an Admin manually overrode it.
 
 
 
 
 
 
 
 
 
 
66
  const AI_API_BASE_URL = getApiUrl();
67
  const apiKey = getAdminApiKey();
68
 
 
451
  if (transcribeRes.ok) {
452
  const data = await transcribeRes.json() as any;
453
  const isSuspect = data.isSuspect || false;
454
+ const confidence = data.confidence || 0;
455
  transcribedText = data.text || '';
456
 
457
  const user = await prisma.user.findFirst({ where: { phone } });
 
466
  console.log(`[STT] Normalized: "${originalText}" -> "${transcribedText}"`);
467
 
468
  // Soft Feedback UI
469
+ await sendTextMessage(phone, `Ma dégg na: "${transcribedText}" ✅\n(Confiance STT: ${confidence}%)`);
470
  if (normResult.changes.length > 0) {
471
  const limitedChanges = normResult.changes.slice(0, 2).join(", ");
472
  await sendTextMessage(phone, `Nataal bu gën: ${limitedChanges}`);
473
  }
474
+
475
+ // 🚨 Wolof Auto-Validation Logic (Target: > 80%) 🚨
476
+ if (confidence <= 80) {
477
+ console.log(`[STT] Whisper Confidence (${confidence}%) <= 80. Intercepting WOLOF audio for User ${user.id}. Shifting to PENDING_REVIEW.`);
478
+
479
+ // First, make sure there is an active enrollment to find the trackId
480
+ const activeEnrollment = await prisma.enrollment.findFirst({
481
+ where: { userId: user.id, status: 'ACTIVE' },
482
+ include: { track: true }
483
+ });
484
+
485
+ if (activeEnrollment) {
486
+ await prisma.userProgress.upsert({
487
+ where: { userId_trackId: { userId: user.id, trackId: activeEnrollment.trackId } },
488
+ update: { exerciseStatus: 'PENDING_REVIEW' as any, adminTranscription: transcribedText },
489
+ create: { userId: user.id, trackId: activeEnrollment.trackId, exerciseStatus: 'PENDING_REVIEW' as any, adminTranscription: transcribedText }
490
+ });
491
+ await sendTextMessage(phone, "🎙️ Nyangi jaxas sa kàddu. Xamle dina la tontu ci kanam ! (En cours d'analyse par l'équipe)");
492
+ } else {
493
+ // Edge case: not enrolled but sent audio...
494
+ await sendTextMessage(phone, "Dama jaxaso ci li nga wax... Mën nga ko waxaat ndànk ?");
495
+ }
496
+
497
+ // Still save the transcription on the inbound message (matched by its audio URL) so the admin can read it!
498
+ await prisma.message.updateMany({
499
+ where: { userId: user.id, direction: 'INBOUND', mediaUrl: audioUrl },
500
+ data: { content: transcribedText }
501
+ }).catch(() => { });
502
+
503
+ return; // Stop here, WAIT FOR ADMIN OVERRIDE
504
+ }
505
  }
506
 
507
+ console.log(`[STT] transcribe result="${transcribedText.substring(0, 80)}" (suspect=${isSuspect}, confidence=${confidence}%)`);
508
 
509
  // 🌟 STT Hardening: Handle suspect transcription 🌟
510
  if (isSuspect && user) {