CognxSafeTrack committed on
Commit ·
672b517
1
Parent(s): 3b473c3
feat(ai): integrate Whisper STT confidence score for auto-validation
Browse files
apps/api/src/routes/ai.ts
CHANGED
|
@@ -113,13 +113,13 @@ export async function aiRoutes(fastify: FastifyInstance) {
|
|
| 113 |
try {
|
| 114 |
const { buffer: audioToTranscribe, format } = await convertToMp3IfNeeded(buffer, filename);
|
| 115 |
console.log(`[AI] Calling transcribeAudio for format: ${format} (Lang: ${language || 'none'})`);
|
| 116 |
-
const text = await aiService.transcribeAudio(audioToTranscribe, `message.${format}`, language);
|
| 117 |
|
| 118 |
// 🌟 STT Hardening: Basic quality check 🌟
|
| 119 |
// Include common punctuation: , . ! ? ' -
|
| 120 |
const isSuspect = text.length < 3 || /[^a-zA-Z0-9\sàâäéèêëîïôöùûüçÀÂÄÉÈÊËÎÏÔÖÙÛÜÇ,.!?'\-]/.test(text.slice(0, 10));
|
| 121 |
|
| 122 |
-
return { success: true, text, isSuspect };
|
| 123 |
} catch (err: any) {
|
| 124 |
console.error(`[AI] ❌ Transcription error:`, err);
|
| 125 |
if (err?.name === 'QuotaExceededError') {
|
|
|
|
| 113 |
try {
|
| 114 |
const { buffer: audioToTranscribe, format } = await convertToMp3IfNeeded(buffer, filename);
|
| 115 |
console.log(`[AI] Calling transcribeAudio for format: ${format} (Lang: ${language || 'none'})`);
|
| 116 |
+
const { text, confidence } = await aiService.transcribeAudio(audioToTranscribe, `message.${format}`, language);
|
| 117 |
|
| 118 |
// 🌟 STT Hardening: Basic quality check 🌟
|
| 119 |
// Include common punctuation: , . ! ? ' -
|
| 120 |
const isSuspect = text.length < 3 || /[^a-zA-Z0-9\sàâäéèêëîïôöùûüçÀÂÄÉÈÊËÎÏÔÖÙÛÜÇ,.!?'\-]/.test(text.slice(0, 10));
|
| 121 |
|
| 122 |
+
return { success: true, text, confidence, isSuspect };
|
| 123 |
} catch (err: any) {
|
| 124 |
console.error(`[AI] ❌ Transcription error:`, err);
|
| 125 |
if (err?.name === 'QuotaExceededError') {
|
apps/api/src/services/ai/index.ts
CHANGED
|
@@ -217,9 +217,9 @@ Wolof v4.0 si WOLOF : ñ (Waññi, Ñàkk), ë (Jënd), é (Liggéey). FCFA pour
|
|
| 217 |
}
|
| 218 |
|
| 219 |
/**
|
| 220 |
-
* Transcribes an audio buffer to text (useful for Wolof/FR voice messages).
|
| 221 |
*/
|
| 222 |
-
async transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<string> {
|
| 223 |
return this.provider.transcribeAudio(audioBuffer, filename, language);
|
| 224 |
}
|
| 225 |
|
|
|
|
| 217 |
}
|
| 218 |
|
| 219 |
/**
|
| 220 |
+
* Transcribes an audio buffer to text (useful for Wolof/FR voice messages) and returns a confidence score.
|
| 221 |
*/
|
| 222 |
+
async transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<{ text: string, confidence: number }> {
|
| 223 |
return this.provider.transcribeAudio(audioBuffer, filename, language);
|
| 224 |
}
|
| 225 |
|
apps/api/src/services/ai/mock-provider.ts
CHANGED
|
@@ -50,9 +50,9 @@ export class MockLLMProvider implements LLMProvider {
|
|
| 50 |
throw new Error("MockLLMProvider does not support this schema.");
|
| 51 |
}
|
| 52 |
|
| 53 |
-
async transcribeAudio(_audioBuffer: Buffer, filename: string): Promise<string> {
|
| 54 |
console.log(`[MOCK LLM] Transcribing audio from ${filename}...`);
|
| 55 |
-
return "INSCRIPTION";
|
| 56 |
}
|
| 57 |
|
| 58 |
async generateSpeech(text: string): Promise<Buffer> {
|
|
|
|
| 50 |
throw new Error("MockLLMProvider does not support this schema.");
|
| 51 |
}
|
| 52 |
|
| 53 |
+
async transcribeAudio(_audioBuffer: Buffer, filename: string): Promise<{ text: string, confidence: number }> {
|
| 54 |
console.log(`[MOCK LLM] Transcribing audio from ${filename}...`);
|
| 55 |
+
return { text: "INSCRIPTION", confidence: 100 };
|
| 56 |
}
|
| 57 |
|
| 58 |
async generateSpeech(text: string): Promise<Buffer> {
|
apps/api/src/services/ai/openai-provider.ts
CHANGED
|
@@ -61,7 +61,7 @@ export class OpenAIProvider implements LLMProvider {
|
|
| 61 |
}
|
| 62 |
}
|
| 63 |
|
| 64 |
-
async transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<string> {
|
| 65 |
console.log(`[OPENAI] Transcribing audio file ${filename} (hint: ${language || 'none'})...`);
|
| 66 |
|
| 67 |
try {
|
|
@@ -71,8 +71,18 @@ export class OpenAIProvider implements LLMProvider {
|
|
| 71 |
file: file,
|
| 72 |
model: 'whisper-1',
|
| 73 |
language: language === 'WOLOF' ? 'fr' : (language?.toLowerCase() || undefined), // Hint 'fr' for Wolof as it often helps Whisper with the mixed context
|
| 74 |
-
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
} catch (err: any) {
|
| 77 |
console.error('[OPENAI] ❌ Connection or API Error:', {
|
| 78 |
name: err?.name,
|
|
|
|
| 61 |
}
|
| 62 |
}
|
| 63 |
|
| 64 |
+
async transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<{ text: string; confidence: number }> {
|
| 65 |
console.log(`[OPENAI] Transcribing audio file ${filename} (hint: ${language || 'none'})...`);
|
| 66 |
|
| 67 |
try {
|
|
|
|
| 71 |
file: file,
|
| 72 |
model: 'whisper-1',
|
| 73 |
language: language === 'WOLOF' ? 'fr' : (language?.toLowerCase() || undefined), // Hint 'fr' for Wolof as it often helps Whisper with the mixed context
|
| 74 |
+
response_format: 'verbose_json',
|
| 75 |
+
}) as any;
|
| 76 |
+
|
| 77 |
+
// Derive a 0-100 confidence from the mean of the segments' avg_logprob (exp of the mean, as a percentage); default to 100 when no segments are returned
|
| 78 |
+
let confidence = 100;
|
| 79 |
+
if (response.segments && response.segments.length > 0) {
|
| 80 |
+
const totalLogprob = response.segments.reduce((acc: number, seg: any) => acc + (seg.avg_logprob || 0), 0);
|
| 81 |
+
const avgLogprob = totalLogprob / response.segments.length;
|
| 82 |
+
confidence = Math.max(0, Math.min(100, Math.round(Math.exp(avgLogprob) * 100)));
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
return { text: response.text, confidence };
|
| 86 |
} catch (err: any) {
|
| 87 |
console.error('[OPENAI] ❌ Connection or API Error:', {
|
| 88 |
name: err?.name,
|
apps/api/src/services/ai/types.ts
CHANGED
|
@@ -1,9 +1,14 @@
|
|
| 1 |
import { z } from 'zod';
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
// Base interface for all LLM Providers
|
| 4 |
export interface LLMProvider {
|
| 5 |
generateStructuredData<T>(prompt: string, schema: z.ZodSchema<T>): Promise<T>;
|
| 6 |
-
transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<
|
| 7 |
generateSpeech(text: string): Promise<Buffer>;
|
| 8 |
}
|
| 9 |
|
|
|
|
| 1 |
import { z } from 'zod';
|
| 2 |
|
| 3 |
+
export interface TranscriptionResult {
|
| 4 |
+
text: string;
|
| 5 |
+
confidence: number;
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
// Base interface for all LLM Providers
|
| 9 |
export interface LLMProvider {
|
| 10 |
generateStructuredData<T>(prompt: string, schema: z.ZodSchema<T>): Promise<T>;
|
| 11 |
+
transcribeAudio(audioBuffer: Buffer, filename: string, language?: string): Promise<TranscriptionResult>;
|
| 12 |
generateSpeech(text: string): Promise<Buffer>;
|
| 13 |
}
|
| 14 |
|
apps/whatsapp-worker/src/index.ts
CHANGED
|
@@ -61,18 +61,8 @@ const worker = new Worker('whatsapp-queue', async (job: Job) => {
|
|
| 61 |
|
| 62 |
console.log(`[WORKER] Generating expert feedback for User ${userId}`);
|
| 63 |
|
| 64 |
-
// 🚨 HUMAN-IN-THE-LOOP
|
| 65 |
-
// If the user's language is WOLOF
|
| 66 |
-
if (language === 'WOLOF') {
|
| 67 |
-
console.log(`[WORKER] Intercepting WOLOF audio for User ${userId}. Shifting to PENDING_REVIEW.`);
|
| 68 |
-
await prisma.userProgress.upsert({
|
| 69 |
-
where: { userId_trackId: { userId, trackId } },
|
| 70 |
-
update: { exerciseStatus: 'PENDING_REVIEW' as any },
|
| 71 |
-
create: { userId, trackId, exerciseStatus: 'PENDING_REVIEW' as any }
|
| 72 |
-
});
|
| 73 |
-
await sendTextMessage(user.phone, "🎙️ Nyangi jaxas sa kàddu. Xamle dina la tontu ci kanam ! (En cours d'analyse)");
|
| 74 |
-
return; // Stop job execution, wait for Admin
|
| 75 |
-
}
|
| 76 |
const AI_API_BASE_URL = getApiUrl();
|
| 77 |
const apiKey = getAdminApiKey();
|
| 78 |
|
|
@@ -461,6 +451,7 @@ const worker = new Worker('whatsapp-queue', async (job: Job) => {
|
|
| 461 |
if (transcribeRes.ok) {
|
| 462 |
const data = await transcribeRes.json() as any;
|
| 463 |
const isSuspect = data.isSuspect || false;
|
|
|
|
| 464 |
transcribedText = data.text || '';
|
| 465 |
|
| 466 |
const user = await prisma.user.findFirst({ where: { phone } });
|
|
@@ -475,14 +466,45 @@ const worker = new Worker('whatsapp-queue', async (job: Job) => {
|
|
| 475 |
console.log(`[STT] Normalized: "${originalText}" -> "${transcribedText}"`);
|
| 476 |
|
| 477 |
// Soft Feedback UI
|
| 478 |
-
await sendTextMessage(phone, `Ma dégg na: "${transcribedText}" ✅`);
|
| 479 |
if (normResult.changes.length > 0) {
|
| 480 |
const limitedChanges = normResult.changes.slice(0, 2).join(", ");
|
| 481 |
await sendTextMessage(phone, `Nataal bu gën: ${limitedChanges}`);
|
| 482 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
}
|
| 484 |
|
| 485 |
-
console.log(`[STT] transcribe result="${transcribedText.substring(0, 80)}" (suspect=${isSuspect})`);
|
| 486 |
|
| 487 |
// 🌟 STT Hardening: Handle suspect transcription 🌟
|
| 488 |
if (isSuspect && user) {
|
|
|
|
| 61 |
|
| 62 |
console.log(`[WORKER] Generating expert feedback for User ${userId}`);
|
| 63 |
|
| 64 |
+
// 🚨 HUMAN-IN-THE-LOOP (Moved to download-media based on Whisper Confidence)
|
| 65 |
+
// If the user's language is WOLOF and it reaches here, either the Whisper confidence was > 80% or an admin manually overrode it.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
const AI_API_BASE_URL = getApiUrl();
|
| 67 |
const apiKey = getAdminApiKey();
|
| 68 |
|
|
|
|
| 451 |
if (transcribeRes.ok) {
|
| 452 |
const data = await transcribeRes.json() as any;
|
| 453 |
const isSuspect = data.isSuspect || false;
|
| 454 |
+
const confidence = data.confidence || 0;
|
| 455 |
transcribedText = data.text || '';
|
| 456 |
|
| 457 |
const user = await prisma.user.findFirst({ where: { phone } });
|
|
|
|
| 466 |
console.log(`[STT] Normalized: "${originalText}" -> "${transcribedText}"`);
|
| 467 |
|
| 468 |
// Soft Feedback UI
|
| 469 |
+
await sendTextMessage(phone, `Ma dégg na: "${transcribedText}" ✅\n(Confiance STT: ${confidence}%)`);
|
| 470 |
if (normResult.changes.length > 0) {
|
| 471 |
const limitedChanges = normResult.changes.slice(0, 2).join(", ");
|
| 472 |
await sendTextMessage(phone, `Nataal bu gën: ${limitedChanges}`);
|
| 473 |
}
|
| 474 |
+
|
| 475 |
+
// 🚨 Wolof Auto-Validation Logic (Target: > 80%) 🚨
|
| 476 |
+
if (confidence <= 80) {
|
| 477 |
+
console.log(`[STT] Whisper Confidence (${confidence}%) <= 80. Intercepting WOLOF audio for User ${user.id}. Shifting to PENDING_REVIEW.`);
|
| 478 |
+
|
| 479 |
+
// First, make sure there is an active enrollment to find the trackId
|
| 480 |
+
const activeEnrollment = await prisma.enrollment.findFirst({
|
| 481 |
+
where: { userId: user.id, status: 'ACTIVE' },
|
| 482 |
+
include: { track: true }
|
| 483 |
+
});
|
| 484 |
+
|
| 485 |
+
if (activeEnrollment) {
|
| 486 |
+
await prisma.userProgress.upsert({
|
| 487 |
+
where: { userId_trackId: { userId: user.id, trackId: activeEnrollment.trackId } },
|
| 488 |
+
update: { exerciseStatus: 'PENDING_REVIEW' as any, adminTranscription: transcribedText },
|
| 489 |
+
create: { userId: user.id, trackId: activeEnrollment.trackId, exerciseStatus: 'PENDING_REVIEW' as any, adminTranscription: transcribedText }
|
| 490 |
+
});
|
| 491 |
+
await sendTextMessage(phone, "🎙️ Nyangi jaxas sa kàddu. Xamle dina la tontu ci kanam ! (En cours d'analyse par l'équipe)");
|
| 492 |
+
} else {
|
| 493 |
+
// Edge case: not enrolled but sent audio...
|
| 494 |
+
await sendTextMessage(phone, "Dama jaxaso ci li nga wax... Mën nga ko waxaat ndànk ?");
|
| 495 |
+
}
|
| 496 |
+
|
| 497 |
+
// Still store the transcription on the inbound message (matched by its audio URL) so the admin can read it.
|
| 498 |
+
await prisma.message.updateMany({
|
| 499 |
+
where: { userId: user.id, direction: 'INBOUND', mediaUrl: audioUrl },
|
| 500 |
+
data: { content: transcribedText }
|
| 501 |
+
}).catch(() => { });
|
| 502 |
+
|
| 503 |
+
return; // Stop here, WAIT FOR ADMIN OVERRIDE
|
| 504 |
+
}
|
| 505 |
}
|
| 506 |
|
| 507 |
+
console.log(`[STT] transcribe result="${transcribedText.substring(0, 80)}" (suspect=${isSuspect}, confidence=${confidence}%)`);
|
| 508 |
|
| 509 |
// 🌟 STT Hardening: Handle suspect transcription 🌟
|
| 510 |
if (isSuspect && user) {
|