Adi362 committed on
Commit
9332524
·
verified ·
1 Parent(s): cb272d3

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +49 -517
main.py CHANGED
@@ -1,22 +1,39 @@
1
  import asyncio
 
2
  import json
 
3
  import os
 
4
  import re
 
5
  import statistics
 
6
  import time
 
7
  import uuid
 
8
  from dataclasses import dataclass
 
9
  from typing import Any, Optional
 
10
  import httpx
 
11
  from dotenv import load_dotenv
 
12
  from fastapi import FastAPI, HTTPException
 
13
  from fastapi.middleware.cors import CORSMiddleware
 
14
  from fastapi.responses import StreamingResponse
15
- from pydantic import BaseModel
 
16
 
17
 
18
 
19
  load_dotenv()
 
 
 
20
  app = FastAPI(title="CortexFlow Backend", version="1.0.0")
21
 
22
  app.add_middleware(
@@ -36,8 +53,11 @@ app.add_middleware(
36
 
37
 
38
  GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
 
39
  GROQ_API_BASE = os.getenv("GROQ_API_BASE", "https://api.groq.com/openai/v1").rstrip("/")
 
40
  GROQ_TIMEOUT_SECONDS = float(os.getenv("GROQ_TIMEOUT_SECONDS", "40"))
 
41
  MODEL_DISCOVERY_TTL_SECONDS = int(os.getenv("MODEL_DISCOVERY_TTL_SECONDS", "900"))
42
 
43
 
@@ -77,8 +97,10 @@ PREFERRED_SAFETY_MODELS = [
77
 
78
 
79
  OVERRIDE_REASONING_MODEL = os.getenv("GROQ_REASONING_MODEL", "").strip()
 
80
  OVERRIDE_SAFETY_MODEL = os.getenv("GROQ_SAFETY_MODEL", "").strip()
81
- GROQ_TRANSCRIBE_MODEL = os.getenv("GROQ_TRANSCRIBE_MODEL", "whisper-large-v3-turbo").strip() or "whisper-large-v3-turbo"
 
82
 
83
  MIN_WORDS_REQUIRED = int(os.getenv("MIN_WORDS_REQUIRED", "25"))
84
 
@@ -131,14 +153,6 @@ STOPWORDS = {
131
  "not", "no", "yes", "so", "because", "about", "into", "out", "up", "down", "can", "could", "would",
132
 
133
  "should", "will", "just", "very", "really", "also",
134
- # Common Romanized Hindi stopwords for code-mixed speech.
135
- "hai", "hain", "tha", "thi", "the", "ho", "hoga", "hogi", "honge", "main", "mein", "mera", "meri", "mere",
136
- "hum", "tum", "aap", "ye", "yeh", "wo", "woh", "is", "iss", "us", "uss", "ko", "se", "ka", "ki", "ke",
137
- "par", "aur", "lekin", "magar", "kyunki", "kyonki", "agar", "jab", "tab", "tak", "ya", "nahi", "nahin", "haan",
138
- # Common Devanagari stopwords for native Hindi transcripts.
139
- "है", "हैं", "था", "थी", "थे", "हो", "होगा", "होगी", "होंगे", "मैं", "में", "मेरा", "मेरी", "मेरे", "हम",
140
- "तुम", "आप", "ये", "यह", "वो", "वह", "इस", "उस", "को", "से", "का", "की", "के", "पर", "और", "लेकिन",
141
- "मगर", "क्योंकि", "अगर", "जब", "तब", "तक", "या", "नहीं", "हाँ",
142
 
143
  }
144
 
@@ -146,8 +160,7 @@ STOPWORDS = {
146
 
147
  FILLERS = {
148
 
149
- "um", "uh", "erm", "hmm", "like", "actually", "basically", "literally",
150
- "matlab", "achha", "accha", "toh", "na", "yaar", "dekho", "samjho", "मतलब", "अच्छा", "तो", "ना",
151
 
152
  }
153
 
@@ -156,61 +169,31 @@ FILLERS = {
156
  POSITIVE_WORDS = {
157
 
158
  "good", "better", "great", "calm", "confident", "clear", "focused", "stable", "happy", "optimistic", "safe", "steady",
159
- "accha", "badhiya", "shaant", "khush", "सुरक्षित", "शांत", "खुश", "अच्छा",
160
 
161
  }
162
 
163
  NEGATIVE_WORDS = {
164
 
165
  "bad", "worse", "anxious", "scared", "panic", "panicked", "confused", "sad", "depressed", "angry", "overwhelmed", "stressed",
166
- "bura", "ghabrahat", "darr", "pareshan", "dukhi", "चिंतित", "डरा", "उलझन", "दुखी", "तनाव",
167
 
168
  }
169
 
170
  AROUSAL_WORDS = {
171
 
172
  "urgent", "immediately", "intense", "extreme", "critical", "afraid", "panic", "terrified", "racing", "shaking", "worried",
173
- "jaldi", "turant", "tez", "bahut", "घबराहट", "तुरंत", "जल्दी", "तेज", "चिंता",
174
 
175
  }
176
 
177
  HEDGE_WORDS = {
178
 
179
  "maybe", "perhaps", "possibly", "probably", "sort", "kind", "might", "could", "guess", "unsure", "not sure",
180
- "shayad", "lagta", "shayad", "pata", "कदाचित", "शायद", "लगता", "पता",
181
 
182
  }
183
 
184
- FILLER_PHRASES = {
185
- "you know",
186
- "i mean",
187
- "sort of",
188
- "kind of",
189
- "pata hai",
190
- "you know what",
191
- }
192
-
193
- HEDGE_PHRASES = {
194
- "not sure",
195
- "i guess",
196
- "sort of",
197
- "kind of",
198
- "pata nahi",
199
- "mujhe lagta",
200
- }
201
-
202
  SUBORDINATORS = {
203
 
204
  "because", "although", "though", "while", "unless", "until", "since", "whereas", "however", "therefore", "moreover", "which", "that",
205
- "kyunki", "kyonki", "agar", "jab", "jabki", "lekin", "magar", "isliye", "jo", "कि", "क्योंकि", "अगर", "जब", "जबकि", "लेकिन", "मगर", "इसलिए", "जो",
206
-
207
- }
208
 
209
- ROMAN_HINDI_MARKERS = {
210
- "hai", "hain", "tha", "thi", "the", "main", "mein", "mera", "meri", "mere", "hum", "tum", "aap", "ye", "yeh",
211
- "wo", "woh", "ko", "se", "ka", "ki", "ke", "par", "aur", "lekin", "magar", "kyunki", "kyonki", "agar", "jab",
212
- "tab", "tak", "ya", "nahi", "nahin", "haan", "accha", "achha", "matlab", "yaar", "jaldi", "turant", "shayad",
213
- "pata", "samjho", "dekho", "bahut", "thoda", "zyada", "abhi", "kal", "kar", "karna", "kiya", "karo", "raha", "rahi",
214
  }
215
 
216
 
@@ -227,10 +210,6 @@ class AnalyzeRequest(BaseModel):
227
 
228
  audio_duration: Optional[float] = None
229
 
230
- detected_language: Optional[str] = None
231
-
232
- language_profile: Optional[dict[str, Any]] = None
233
-
234
  session_id: Optional[str] = None
235
 
236
 
@@ -259,8 +238,6 @@ class AnalysisState:
259
 
260
  quality_notes: list[str]
261
 
262
- language_profile: dict[str, Any]
263
-
264
  metrics: dict[str, Any]
265
 
266
 
@@ -271,10 +248,6 @@ _MODEL_CACHE: dict[str, Any] = {"updated": 0.0, "models": []}
271
 
272
  _MODEL_CACHE_LOCK = asyncio.Lock()
273
 
274
- LATIN_TOKEN_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?")
275
- DEVANAGARI_TOKEN_RE = re.compile(r"[\u0900-\u097F]+")
276
- WORD_TOKEN_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?|[\u0900-\u097F]+")
277
-
278
 
279
 
280
  def clamp01(v: float) -> float:
@@ -295,7 +268,7 @@ def mean(values: list[float], default: float = 0.0) -> float:
295
 
296
  def tokenize_words(text: str) -> list[str]:
297
 
298
- return [tok.lower() for tok in WORD_TOKEN_RE.findall(text)]
299
 
300
 
301
 
@@ -303,7 +276,7 @@ def tokenize_words(text: str) -> list[str]:
303
 
304
  def split_sentences(text: str) -> list[str]:
305
 
306
- parts = [p.strip() for p in re.split(r"(?<=[.!?])\s+", text) if p.strip()]
307
 
308
  return parts if parts else ([text.strip()] if text.strip() else [])
309
 
@@ -316,154 +289,6 @@ def content_words(tokens: list[str]) -> list[str]:
316
  return [t for t in tokens if len(t) > 2 and t not in STOPWORDS]
317
 
318
 
319
- def read_profile_ratio(profile: Optional[dict[str, Any]], snake_key: str, camel_key: str) -> Optional[float]:
320
-
321
- if not profile or not isinstance(profile, dict):
322
-
323
- return None
324
-
325
- raw = profile.get(snake_key)
326
-
327
- if raw is None:
328
-
329
- raw = profile.get(camel_key)
330
-
331
- if raw is None:
332
-
333
- return None
334
-
335
- try:
336
-
337
- return clamp01(float(raw))
338
-
339
- except (TypeError, ValueError):
340
-
341
- return None
342
-
343
-
344
- def detect_language_profile(
345
-
346
- text: str,
347
-
348
- hinted_language: Optional[str] = None,
349
-
350
- hinted_profile: Optional[dict[str, Any]] = None,
351
-
352
- ) -> dict[str, Any]:
353
-
354
- latin_tokens = [tok.lower() for tok in LATIN_TOKEN_RE.findall(text)]
355
-
356
- devanagari_tokens = DEVANAGARI_TOKEN_RE.findall(text)
357
-
358
- roman_hindi_hits = sum(1 for tok in latin_tokens if tok in ROMAN_HINDI_MARKERS)
359
-
360
- hindi_tokens = len(devanagari_tokens) + roman_hindi_hits
361
-
362
- english_tokens = max(len(latin_tokens) - roman_hindi_hits, 0)
363
-
364
- total = max(hindi_tokens + english_tokens, 1)
365
-
366
-
367
-
368
- hindi_ratio = hindi_tokens / total
369
-
370
- english_ratio = english_tokens / total
371
-
372
- devanagari_ratio = len(devanagari_tokens) / total
373
-
374
-
375
-
376
- hinted_english_ratio = read_profile_ratio(hinted_profile, "english_ratio", "englishRatio")
377
-
378
- hinted_hindi_ratio = read_profile_ratio(hinted_profile, "hindi_ratio", "hindiRatio")
379
-
380
- if hinted_english_ratio is not None and hinted_hindi_ratio is not None and (hinted_english_ratio + hinted_hindi_ratio) > 0:
381
-
382
- hinted_total = hinted_english_ratio + hinted_hindi_ratio
383
-
384
- hinted_english_ratio /= hinted_total
385
-
386
- hinted_hindi_ratio /= hinted_total
387
-
388
- english_ratio = (0.75 * english_ratio) + (0.25 * hinted_english_ratio)
389
-
390
- hindi_ratio = (0.75 * hindi_ratio) + (0.25 * hinted_hindi_ratio)
391
-
392
- ratio_total = max(english_ratio + hindi_ratio, 1e-6)
393
-
394
- english_ratio = english_ratio / ratio_total
395
-
396
- hindi_ratio = hindi_ratio / ratio_total
397
-
398
-
399
-
400
- label = "multilingual"
401
-
402
- if hindi_ratio >= 0.2 and english_ratio >= 0.2:
403
-
404
- label = "hinglish"
405
-
406
- elif hindi_ratio >= 0.68:
407
-
408
- label = "hindi"
409
-
410
- elif english_ratio >= 0.68:
411
-
412
- label = "english"
413
-
414
-
415
-
416
- hint = (hinted_language or "").strip().lower()
417
-
418
- if hint in {"hi", "hindi"}:
419
-
420
- if english_ratio >= 0.2:
421
-
422
- label = "hinglish"
423
-
424
- elif label == "multilingual":
425
-
426
- label = "hindi"
427
-
428
- elif hint in {"en", "english"}:
429
-
430
- if hindi_ratio >= 0.2:
431
-
432
- label = "hinglish"
433
-
434
- elif label == "multilingual":
435
-
436
- label = "english"
437
-
438
-
439
-
440
- if hinted_profile and isinstance(hinted_profile, dict):
441
-
442
- hinted_label = str(hinted_profile.get("label", "")).strip().lower()
443
-
444
- if hinted_label in {"hinglish", "hindi", "english", "multilingual"} and (
445
-
446
- label == "multilingual" or abs(hindi_ratio - english_ratio) < 0.12
447
-
448
- ):
449
-
450
- label = hinted_label
451
-
452
-
453
-
454
- return {
455
-
456
- "label": label,
457
-
458
- "english_ratio": round(english_ratio, 4),
459
-
460
- "hindi_ratio": round(hindi_ratio, 4),
461
-
462
- "devanagari_ratio": round(devanagari_ratio, 4),
463
-
464
- }
465
-
466
-
467
 
468
 
469
 
@@ -504,148 +329,6 @@ def scale_inverse(value: float, good: float, poor: float) -> float:
504
  return clamp01((good - value) / (good - poor))
505
 
506
 
507
- def pick_language_target(
508
-
509
- language_profile: dict[str, Any],
510
-
511
- english: float,
512
-
513
- hinglish: float,
514
-
515
- hindi: float,
516
-
517
- multilingual: Optional[float] = None,
518
-
519
- ) -> float:
520
-
521
- try:
522
-
523
- english_ratio = clamp01(float(language_profile.get("english_ratio", 0.0)))
524
-
525
- except (TypeError, ValueError):
526
-
527
- english_ratio = 0.0
528
-
529
- try:
530
-
531
- hindi_ratio = clamp01(float(language_profile.get("hindi_ratio", 0.0)))
532
-
533
- except (TypeError, ValueError):
534
-
535
- hindi_ratio = 0.0
536
-
537
- ratio_total = english_ratio + hindi_ratio
538
-
539
-
540
-
541
- if ratio_total > 1e-6:
542
-
543
- english_weight = english_ratio / ratio_total
544
-
545
- hindi_weight = hindi_ratio / ratio_total
546
-
547
- base_target = (english_weight * english) + (hindi_weight * hindi)
548
-
549
- code_mix_strength = clamp01(2.0 * min(english_weight, hindi_weight))
550
-
551
- blended_target = ((1.0 - code_mix_strength) * base_target) + (code_mix_strength * hinglish)
552
-
553
- if multilingual is not None:
554
-
555
- blended_target = (0.9 * blended_target) + (0.1 * multilingual)
556
-
557
- return blended_target
558
-
559
-
560
-
561
- label = str(language_profile.get("label", "english")).lower()
562
-
563
- if label == "hinglish":
564
-
565
- return hinglish
566
-
567
- if label == "hindi":
568
-
569
- return hindi
570
-
571
- if label == "multilingual":
572
-
573
- return multilingual if multilingual is not None else (english + hindi) / 2.0
574
-
575
- return english
576
-
577
-
578
- def transcription_model_capabilities(model_name: str) -> dict[str, Any]:
579
-
580
- normalized = model_name.strip().lower()
581
-
582
- if not normalized:
583
-
584
- return {
585
-
586
- "model": "unknown",
587
-
588
- "multilingual": False,
589
-
590
- "hindi_supported": False,
591
-
592
- "hinglish_supported": False,
593
-
594
- "notes": "No transcription model configured.",
595
-
596
- }
597
-
598
-
599
-
600
- english_only = normalized.endswith("-en") or normalized in {
601
-
602
- "distil-whisper-large-v3-en",
603
-
604
- "whisper-large-v3-en",
605
-
606
- }
607
-
608
- multilingual = (
609
-
610
- ("whisper" in normalized and not english_only)
611
-
612
- or ("gpt-4o-mini-transcribe" in normalized)
613
-
614
- or ("gpt-4o-transcribe" in normalized)
615
-
616
- )
617
-
618
-
619
-
620
- if english_only:
621
-
622
- notes = "Configured model appears English-only. Use a multilingual Whisper model for Hindi/Hinglish."
623
-
624
- elif multilingual:
625
-
626
- notes = "Configured model supports multilingual transcription, including Hindi and code-mixed Hinglish."
627
-
628
- else:
629
-
630
- notes = "Model capability is unknown; verify multilingual Hindi support in provider documentation."
631
-
632
-
633
-
634
- return {
635
-
636
- "model": model_name,
637
-
638
- "multilingual": multilingual,
639
-
640
- "hindi_supported": multilingual,
641
-
642
- "hinglish_supported": multilingual,
643
-
644
- "notes": notes,
645
-
646
- }
647
-
648
-
649
 
650
  def safe_step_event(name: str, status: str, detail: Optional[str] = None) -> bytes:
651
 
@@ -658,31 +341,6 @@ def safe_step_event(name: str, status: str, detail: Optional[str] = None) -> byt
658
  return (json.dumps(payload) + "\n").encode()
659
 
660
 
661
- def count_phrase_hits(text: str, phrases: set[str]) -> int:
662
-
663
- lowered = text.lower()
664
-
665
- return sum(lowered.count(phrase) for phrase in phrases if phrase)
666
-
667
-
668
- def estimate_filler_hits(tokens: list[str], text: str) -> int:
669
-
670
- token_hits = sum(1 for t in tokens if t in FILLERS)
671
-
672
- phrase_hits = count_phrase_hits(text, FILLER_PHRASES)
673
-
674
- return token_hits + phrase_hits
675
-
676
-
677
- def estimate_hedge_hits(tokens: list[str], text: str) -> int:
678
-
679
- token_hits = sum(1 for t in tokens if t in HEDGE_WORDS)
680
-
681
- phrase_hits = count_phrase_hits(text, HEDGE_PHRASES)
682
-
683
- return token_hits + phrase_hits
684
-
685
-
686
 
687
 
688
 
@@ -712,23 +370,13 @@ def ensure_nonempty_text(req: AnalyzeRequest) -> str:
712
 
713
 
714
 
715
- def lexical_domain(
716
-
717
- tokens: list[str],
718
-
719
- text: str,
720
-
721
- content: list[str],
722
-
723
- language_profile: dict[str, Any],
724
-
725
- ) -> tuple[DomainScore, dict[str, float]]:
726
 
727
  total = max(len(tokens), 1)
728
 
729
  unique = len(set(tokens))
730
 
731
- filler_hits = estimate_filler_hits(tokens, text)
732
 
733
 
734
 
@@ -740,21 +388,11 @@ def lexical_domain(
740
 
741
 
742
 
743
- ttr_target = pick_language_target(language_profile, english=0.52, hinglish=0.57, hindi=0.56, multilingual=0.55)
744
-
745
- density_target = pick_language_target(language_profile, english=0.58, hinglish=0.63, hindi=0.61, multilingual=0.60)
746
-
747
- filler_low = pick_language_target(language_profile, english=2.0, hinglish=3.5, hindi=3.5, multilingual=3.0)
748
-
749
- filler_high = pick_language_target(language_profile, english=12.0, hinglish=20.0, hindi=17.0, multilingual=17.0)
750
-
751
-
752
 
753
- s_ttr = clamp01(abs(ttr - ttr_target) / 0.30)
754
 
755
- s_density = clamp01(abs(density - density_target) / 0.25)
756
-
757
- s_filler = scale_linear(filler_rate, filler_low, filler_high)
758
 
759
 
760
 
@@ -852,15 +490,7 @@ def semantic_domain(sentences: list[str]) -> tuple[DomainScore, dict[str, float]
852
 
853
  def prosody_domain(
854
 
855
- tokens: list[str],
856
-
857
- text: str,
858
-
859
- pause_map: Optional[list[float]],
860
-
861
- audio_duration: Optional[float],
862
-
863
- language_profile: dict[str, Any],
864
 
865
  ) -> tuple[DomainScore, dict[str, float], bool]:
866
 
@@ -894,11 +524,7 @@ def prosody_domain(
894
 
895
  pause_freq = len(pauses) / duration_minutes
896
 
897
- pause_hesitation = sum(1 for p in pauses if p >= 0.8) / len(pauses)
898
-
899
- lexical_hesitation = clamp01(estimate_filler_hits(tokens, text) / max(word_count, 1))
900
-
901
- hesitation_ratio = clamp01((0.7 * pause_hesitation) + (0.3 * lexical_hesitation))
902
 
903
  else:
904
 
@@ -906,13 +532,11 @@ def prosody_domain(
906
 
907
  pause_freq = (punctuation_pauses / max(word_count, 1)) * 100
908
 
909
- hesitation_ratio = clamp01(estimate_filler_hits(tokens, text) / max(word_count, 1))
910
-
911
 
912
 
913
- speech_rate_target = pick_language_target(language_profile, english=140.0, hinglish=132.0, hindi=126.0, multilingual=133.0)
914
 
915
- s_rate = clamp01(abs(speech_rate - speech_rate_target) / 95.0)
916
 
917
  s_pause = scale_linear(pause_freq, low=8.0, high=30.0)
918
 
@@ -950,17 +574,7 @@ def prosody_domain(
950
 
951
 
952
 
953
- def syntax_domain(
954
-
955
- tokens: list[str],
956
-
957
- sentences: list[str],
958
-
959
- text: str,
960
-
961
- language_profile: dict[str, Any],
962
-
963
- ) -> tuple[DomainScore, dict[str, float]]:
964
 
965
  sentence_count = max(len(sentences), 1)
966
 
@@ -990,31 +604,15 @@ def syntax_domain(
990
 
991
 
992
 
993
- mlu_target = pick_language_target(language_profile, english=17.0, hinglish=15.0, hindi=14.5, multilingual=15.5)
994
-
995
- depth_low = pick_language_target(language_profile, english=2.0, hinglish=1.5, hindi=1.4, multilingual=1.6)
996
 
997
- depth_high = pick_language_target(language_profile, english=6.5, hinglish=5.7, hindi=5.3, multilingual=5.8)
998
-
999
-
1000
-
1001
- s_mlu = clamp01(abs(mlu - mlu_target) / 12.0)
1002
-
1003
- s_depth = scale_linear(clause_depth, low=depth_low, high=depth_high)
1004
 
1005
  s_passive = scale_linear(passive_ratio, low=0.15, high=1.2)
1006
 
1007
 
1008
 
1009
- passive_weight = pick_language_target(language_profile, english=0.20, hinglish=0.12, hindi=0.05, multilingual=0.10)
1010
-
1011
- mlu_weight = 0.45 + ((0.20 - passive_weight) * 0.55)
1012
-
1013
- depth_weight = 1.0 - mlu_weight - passive_weight
1014
-
1015
-
1016
-
1017
- overall = clamp01((mlu_weight * s_mlu) + (depth_weight * s_depth) + (passive_weight * s_passive))
1018
 
1019
 
1020
 
@@ -1044,7 +642,7 @@ def syntax_domain(
1044
 
1045
 
1046
 
1047
- def affective_domain(tokens: list[str], text: str) -> tuple[DomainScore, dict[str, float]]:
1048
 
1049
  total = max(len(tokens), 1)
1050
 
@@ -1054,7 +652,7 @@ def affective_domain(tokens: list[str], text: str) -> tuple[DomainScore, dict[st
1054
 
1055
  arousal = sum(1 for t in tokens if t in AROUSAL_WORDS)
1056
 
1057
- hedge = estimate_hedge_hits(tokens, text)
1058
 
1059
 
1060
 
@@ -1154,10 +752,6 @@ def compute_analysis_state(
1154
 
1155
  audio_duration: Optional[float],
1156
 
1157
- detected_language: Optional[str] = None,
1158
-
1159
- hinted_profile: Optional[dict[str, Any]] = None,
1160
-
1161
  ) -> AnalysisState:
1162
 
1163
  tokens = tokenize_words(text)
@@ -1166,31 +760,21 @@ def compute_analysis_state(
1166
 
1167
  cwords = content_words(tokens)
1168
 
1169
- language_profile = detect_language_profile(
1170
-
1171
- text,
1172
-
1173
- hinted_language=detected_language,
1174
-
1175
- hinted_profile=hinted_profile,
1176
-
1177
- )
1178
-
1179
 
1180
 
1181
  repeat_ratio = 1.0 - (len(set(tokens)) / max(len(tokens), 1))
1182
 
1183
 
1184
 
1185
- lexical, lexical_raw = lexical_domain(tokens, text, cwords, language_profile)
1186
 
1187
  semantic, semantic_raw = semantic_domain(sentences)
1188
 
1189
- prosody, prosody_raw, has_audio = prosody_domain(tokens, text, pause_map, audio_duration, language_profile)
1190
 
1191
- syntax, syntax_raw = syntax_domain(tokens, sentences, text, language_profile)
1192
 
1193
- affective, affective_raw = affective_domain(tokens, text)
1194
 
1195
  confidence, quality_notes = compute_confidence(
1196
 
@@ -1204,18 +788,6 @@ def compute_analysis_state(
1204
 
1205
  )
1206
 
1207
- quality_notes.append(
1208
-
1209
- "Detected language mode: "
1210
-
1211
- + str(language_profile.get("label", "multilingual")).title()
1212
-
1213
- + f" (Hindi {round(float(language_profile.get('hindi_ratio', 0.0)) * 100)}%, "
1214
-
1215
- + f"English {round(float(language_profile.get('english_ratio', 0.0)) * 100)}%)."
1216
-
1217
- )
1218
-
1219
 
1220
 
1221
  scores = {
@@ -1264,8 +836,6 @@ def compute_analysis_state(
1264
 
1265
  "repeat_ratio": round(repeat_ratio, 4),
1266
 
1267
- "language_profile": language_profile,
1268
-
1269
  "lexical": lexical_raw,
1270
 
1271
  "semantic": semantic_raw,
@@ -1290,8 +860,6 @@ def compute_analysis_state(
1290
 
1291
  quality_notes=quality_notes,
1292
 
1293
- language_profile=language_profile,
1294
-
1295
  metrics=metrics,
1296
 
1297
  )
@@ -1348,11 +916,9 @@ def summary_fallback(state: AnalysisState, risk_level: str) -> str:
1348
 
1349
  confidence_pct = round(state.confidence * 100)
1350
 
1351
- language_mode = str(state.language_profile.get("label", "multilingual"))
1352
-
1353
  return (
1354
 
1355
- f"This {language_mode} speech analysis found a {risk_level} overall cognitive load signal based on linguistic and timing features. "
1356
 
1357
  f"The strongest deviation appeared in {top_domain} markers (score {top_value:.2f}). "
1358
 
@@ -1638,8 +1204,6 @@ async def compose_safe_summary(state: AnalysisState, risk_level: str) -> tuple[s
1638
 
1639
  "confidence": state.confidence,
1640
 
1641
- "language_profile": state.language_profile,
1642
-
1643
  "scores": {k: v.overall for k, v in state.scores.items()},
1644
 
1645
  "quality_notes": state.quality_notes,
@@ -1652,8 +1216,6 @@ async def compose_safe_summary(state: AnalysisState, risk_level: str) -> tuple[s
1652
 
1653
  "You summarize computational language-screening outputs. "
1654
 
1655
- "English, Hindi, and code-mixed Hinglish samples are all valid and should be interpreted fairly. "
1656
-
1657
  "Never diagnose disease, never use alarming wording, and always state uncertainty when confidence is limited. "
1658
 
1659
  "Output exactly 2-3 sentences in plain text."
@@ -1716,8 +1278,6 @@ async def health() -> dict[str, Any]:
1716
 
1717
  available = await fetch_available_models()
1718
 
1719
- transcribe_caps = transcription_model_capabilities(GROQ_TRANSCRIBE_MODEL)
1720
-
1721
  return {
1722
 
1723
  "ok": True,
@@ -1728,10 +1288,6 @@ async def health() -> dict[str, Any]:
1728
 
1729
  "model_count": len(available),
1730
 
1731
- "transcription_model": GROQ_TRANSCRIBE_MODEL,
1732
-
1733
- "transcription_capabilities": transcribe_caps,
1734
-
1735
  }
1736
 
1737
 
@@ -1744,8 +1300,6 @@ async def models_recommended() -> dict[str, Any]:
1744
 
1745
  available = await fetch_available_models()
1746
 
1747
- transcribe_caps = transcription_model_capabilities(GROQ_TRANSCRIBE_MODEL)
1748
-
1749
  return {
1750
 
1751
  "available_models": available,
@@ -1756,12 +1310,10 @@ async def models_recommended() -> dict[str, Any]:
1756
 
1757
  "safety": pick_model(available, OVERRIDE_SAFETY_MODEL, PREFERRED_SAFETY_MODELS),
1758
 
1759
- "transcription": GROQ_TRANSCRIBE_MODEL,
1760
 
1761
  },
1762
 
1763
- "transcription_capabilities": transcribe_caps,
1764
-
1765
  "notes": {
1766
 
1767
  "production_primary": "openai/gpt-oss-120b",
@@ -1770,12 +1322,6 @@ async def models_recommended() -> dict[str, Any]:
1770
 
1771
  "fast_fallback": "openai/gpt-oss-20b",
1772
 
1773
- "transcription_accuracy_primary": "whisper-large-v3",
1774
-
1775
- "transcription_speed_price_primary": "whisper-large-v3-turbo",
1776
-
1777
- "transcription_language_note": "Both Whisper models are multilingual and suitable for Hindi/Hinglish speech.",
1778
-
1779
  },
1780
 
1781
  }
@@ -1802,19 +1348,7 @@ async def analyze(req: AnalyzeRequest):
1802
 
1803
  try:
1804
 
1805
- state = compute_analysis_state(
1806
-
1807
- text,
1808
-
1809
- req.pause_map,
1810
-
1811
- req.audio_duration,
1812
-
1813
- detected_language=req.detected_language,
1814
-
1815
- hinted_profile=req.language_profile,
1816
-
1817
- )
1818
 
1819
  yield safe_step_event("STT preprocessor", "done", "Input normalized and validated")
1820
 
@@ -1906,8 +1440,6 @@ async def analyze(req: AnalyzeRequest):
1906
 
1907
  },
1908
 
1909
- "language_profile": state.language_profile,
1910
-
1911
  "model_info": model_meta,
1912
 
1913
  }
 
1
  import asyncio
2
+
3
  import json
4
+
5
  import os
6
+
7
  import re
8
+
9
  import statistics
10
+
11
  import time
12
+
13
  import uuid
14
+
15
  from dataclasses import dataclass
16
+
17
  from typing import Any, Optional
18
+
19
  import httpx
20
+
21
  from dotenv import load_dotenv
22
+
23
  from fastapi import FastAPI, HTTPException
24
+
25
  from fastapi.middleware.cors import CORSMiddleware
26
+
27
  from fastapi.responses import StreamingResponse
28
+
29
+ from pydantic import BaseModel, Field
30
 
31
 
32
 
33
  load_dotenv()
34
+
35
+
36
+
37
  app = FastAPI(title="CortexFlow Backend", version="1.0.0")
38
 
39
  app.add_middleware(
 
53
 
54
 
55
  GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
56
+
57
  GROQ_API_BASE = os.getenv("GROQ_API_BASE", "https://api.groq.com/openai/v1").rstrip("/")
58
+
59
  GROQ_TIMEOUT_SECONDS = float(os.getenv("GROQ_TIMEOUT_SECONDS", "40"))
60
+
61
  MODEL_DISCOVERY_TTL_SECONDS = int(os.getenv("MODEL_DISCOVERY_TTL_SECONDS", "900"))
62
 
63
 
 
97
 
98
 
99
  OVERRIDE_REASONING_MODEL = os.getenv("GROQ_REASONING_MODEL", "").strip()
100
+
101
  OVERRIDE_SAFETY_MODEL = os.getenv("GROQ_SAFETY_MODEL", "").strip()
102
+
103
+
104
 
105
  MIN_WORDS_REQUIRED = int(os.getenv("MIN_WORDS_REQUIRED", "25"))
106
 
 
153
  "not", "no", "yes", "so", "because", "about", "into", "out", "up", "down", "can", "could", "would",
154
 
155
  "should", "will", "just", "very", "really", "also",
 
 
 
 
 
 
 
 
156
 
157
  }
158
 
 
160
 
161
  FILLERS = {
162
 
163
+ "um", "uh", "erm", "hmm", "like", "you", "know", "actually", "basically", "literally", "sort", "kind", "maybe",
 
164
 
165
  }
166
 
 
169
  POSITIVE_WORDS = {
170
 
171
  "good", "better", "great", "calm", "confident", "clear", "focused", "stable", "happy", "optimistic", "safe", "steady",
 
172
 
173
  }
174
 
175
  NEGATIVE_WORDS = {
176
 
177
  "bad", "worse", "anxious", "scared", "panic", "panicked", "confused", "sad", "depressed", "angry", "overwhelmed", "stressed",
 
178
 
179
  }
180
 
181
  AROUSAL_WORDS = {
182
 
183
  "urgent", "immediately", "intense", "extreme", "critical", "afraid", "panic", "terrified", "racing", "shaking", "worried",
 
184
 
185
  }
186
 
187
  HEDGE_WORDS = {
188
 
189
  "maybe", "perhaps", "possibly", "probably", "sort", "kind", "might", "could", "guess", "unsure", "not sure",
 
190
 
191
  }
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  SUBORDINATORS = {
194
 
195
  "because", "although", "though", "while", "unless", "until", "since", "whereas", "however", "therefore", "moreover", "which", "that",
 
 
 
196
 
 
 
 
 
 
197
  }
198
 
199
 
 
210
 
211
  audio_duration: Optional[float] = None
212
 
 
 
 
 
213
  session_id: Optional[str] = None
214
 
215
 
 
238
 
239
  quality_notes: list[str]
240
 
 
 
241
  metrics: dict[str, Any]
242
 
243
 
 
248
 
249
  _MODEL_CACHE_LOCK = asyncio.Lock()
250
 
 
 
 
 
251
 
252
 
253
  def clamp01(v: float) -> float:
 
268
 
269
  def tokenize_words(text: str) -> list[str]:
270
 
271
+ return re.findall(r"[A-Za-z']+", text.lower())
272
 
273
 
274
 
 
276
 
277
  def split_sentences(text: str) -> list[str]:
278
 
279
+ parts = [p.strip() for p in re.split(r"(?<=[.!?])\s+", text) if p.strip()]
280
 
281
  return parts if parts else ([text.strip()] if text.strip() else [])
282
 
 
289
  return [t for t in tokens if len(t) > 2 and t not in STOPWORDS]
290
 
291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
 
294
 
 
329
  return clamp01((good - value) / (good - poor))
330
 
331
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
 
333
  def safe_step_event(name: str, status: str, detail: Optional[str] = None) -> bytes:
334
 
 
341
  return (json.dumps(payload) + "\n").encode()
342
 
343
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
 
346
 
 
370
 
371
 
372
 
373
+ def lexical_domain(tokens: list[str], content: list[str]) -> tuple[DomainScore, dict[str, float]]:
 
 
 
 
 
 
 
 
 
 
374
 
375
  total = max(len(tokens), 1)
376
 
377
  unique = len(set(tokens))
378
 
379
+ filler_hits = sum(1 for t in tokens if t in FILLERS)
380
 
381
 
382
 
 
388
 
389
 
390
 
391
+ s_ttr = clamp01(abs(ttr - 0.52) / 0.30)
 
 
 
 
 
 
 
 
392
 
393
+ s_density = clamp01(abs(density - 0.58) / 0.25)
394
 
395
+ s_filler = scale_linear(filler_rate, 2.0, 14.0)
 
 
396
 
397
 
398
 
 
490
 
491
  def prosody_domain(
492
 
493
+ tokens: list[str], text: str, pause_map: Optional[list[float]], audio_duration: Optional[float]
 
 
 
 
 
 
 
 
494
 
495
  ) -> tuple[DomainScore, dict[str, float], bool]:
496
 
 
524
 
525
  pause_freq = len(pauses) / duration_minutes
526
 
527
+ hesitation_ratio = sum(1 for p in pauses if p >= 0.8) / len(pauses)
 
 
 
 
528
 
529
  else:
530
 
 
532
 
533
  pause_freq = (punctuation_pauses / max(word_count, 1)) * 100
534
 
535
+ hesitation_ratio = sum(1 for t in tokens if t in FILLERS) / max(word_count, 1)
 
536
 
537
 
 
538
 
539
+ s_rate = clamp01(abs(speech_rate - 140.0) / 95.0)
540
 
541
  s_pause = scale_linear(pause_freq, low=8.0, high=30.0)
542
 
 
574
 
575
 
576
 
577
+ def syntax_domain(tokens: list[str], sentences: list[str], text: str) -> tuple[DomainScore, dict[str, float]]:
 
 
 
 
 
 
 
 
 
 
578
 
579
  sentence_count = max(len(sentences), 1)
580
 
 
604
 
605
 
606
 
607
+ s_mlu = clamp01(abs(mlu - 17.0) / 12.0)
 
 
608
 
609
+ s_depth = scale_linear(clause_depth, low=2.0, high=6.5)
 
 
 
 
 
 
610
 
611
  s_passive = scale_linear(passive_ratio, low=0.15, high=1.2)
612
 
613
 
614
 
615
+ overall = clamp01((0.45 * s_mlu) + (0.35 * s_depth) + (0.20 * s_passive))
 
 
 
 
 
 
 
 
616
 
617
 
618
 
 
642
 
643
 
644
 
645
+ def affective_domain(tokens: list[str]) -> tuple[DomainScore, dict[str, float]]:
646
 
647
  total = max(len(tokens), 1)
648
 
 
652
 
653
  arousal = sum(1 for t in tokens if t in AROUSAL_WORDS)
654
 
655
+ hedge = sum(1 for t in tokens if t in HEDGE_WORDS)
656
 
657
 
658
 
 
752
 
753
  audio_duration: Optional[float],
754
 
 
 
 
 
755
  ) -> AnalysisState:
756
 
757
  tokens = tokenize_words(text)
 
760
 
761
  cwords = content_words(tokens)
762
 
 
 
 
 
 
 
 
 
 
 
763
 
764
 
765
  repeat_ratio = 1.0 - (len(set(tokens)) / max(len(tokens), 1))
766
 
767
 
768
 
769
+ lexical, lexical_raw = lexical_domain(tokens, cwords)
770
 
771
  semantic, semantic_raw = semantic_domain(sentences)
772
 
773
+ prosody, prosody_raw, has_audio = prosody_domain(tokens, text, pause_map, audio_duration)
774
 
775
+ syntax, syntax_raw = syntax_domain(tokens, sentences, text)
776
 
777
+ affective, affective_raw = affective_domain(tokens)
778
 
779
  confidence, quality_notes = compute_confidence(
780
 
 
788
 
789
  )
790
 
 
 
 
 
 
 
 
 
 
 
 
 
791
 
792
 
793
  scores = {
 
836
 
837
  "repeat_ratio": round(repeat_ratio, 4),
838
 
 
 
839
  "lexical": lexical_raw,
840
 
841
  "semantic": semantic_raw,
 
860
 
861
  quality_notes=quality_notes,
862
 
 
 
863
  metrics=metrics,
864
 
865
  )
 
916
 
917
  confidence_pct = round(state.confidence * 100)
918
 
 
 
919
  return (
920
 
921
+ f"This analysis found a {risk_level} overall cognitive load signal based on linguistic and timing features. "
922
 
923
  f"The strongest deviation appeared in {top_domain} markers (score {top_value:.2f}). "
924
 
 
1204
 
1205
  "confidence": state.confidence,
1206
 
 
 
1207
  "scores": {k: v.overall for k, v in state.scores.items()},
1208
 
1209
  "quality_notes": state.quality_notes,
 
1216
 
1217
  "You summarize computational language-screening outputs. "
1218
 
 
 
1219
  "Never diagnose disease, never use alarming wording, and always state uncertainty when confidence is limited. "
1220
 
1221
  "Output exactly 2-3 sentences in plain text."
 
1278
 
1279
  available = await fetch_available_models()
1280
 
 
 
1281
  return {
1282
 
1283
  "ok": True,
 
1288
 
1289
  "model_count": len(available),
1290
 
 
 
 
 
1291
  }
1292
 
1293
 
 
1300
 
1301
  available = await fetch_available_models()
1302
 
 
 
1303
  return {
1304
 
1305
  "available_models": available,
 
1310
 
1311
  "safety": pick_model(available, OVERRIDE_SAFETY_MODEL, PREFERRED_SAFETY_MODELS),
1312
 
1313
+ "transcription": "whisper-large-v3-turbo",
1314
 
1315
  },
1316
 
 
 
1317
  "notes": {
1318
 
1319
  "production_primary": "openai/gpt-oss-120b",
 
1322
 
1323
  "fast_fallback": "openai/gpt-oss-20b",
1324
 
 
 
 
 
 
 
1325
  },
1326
 
1327
  }
 
1348
 
1349
  try:
1350
 
1351
+ state = compute_analysis_state(text, req.pause_map, req.audio_duration)
 
 
 
 
 
 
 
 
 
 
 
 
1352
 
1353
  yield safe_step_event("STT preprocessor", "done", "Input normalized and validated")
1354
 
 
1440
 
1441
  },
1442
 
 
 
1443
  "model_info": model_meta,
1444
 
1445
  }