Adi362 committed on
Commit cb272d3 · verified · 1 Parent(s): 7b5b134

Update main.py

Files changed (1)
  1. main.py +1956 -744
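The updated `AnalyzeRequest` in this file accepts optional `detected_language` and `language_profile` hints alongside the transcript fields. As a minimal client sketch (assuming a local server at http://localhost:8000 and an illustrative code-mixed transcript; the endpoint streams newline-delimited JSON events):

```python
# Minimal sketch of calling POST /analyze (assumes the service runs locally on
# http://localhost:8000; field names mirror AnalyzeRequest in main.py below).
import httpx

payload = {
    "transcript": (
        "Main thoda anxious feel kar raha hoon because the project deadline is "
        "very close, lekin overall main theek hoon, sleeping well, and still "
        "focused on finishing the plan step by step."
    ),
    "pause_map": [0.4, 1.1, 0.6],       # optional pause durations in seconds
    "audio_duration": 18.5,             # optional recording length in seconds
    "detected_language": "hi",          # optional STT language hint
    "language_profile": {"english_ratio": 0.55, "hindi_ratio": 0.45},
}

# The endpoint requires at least MIN_WORDS_REQUIRED (default 25) words and
# responds with one JSON object per line: step events, then a final "end" event.
with httpx.Client(timeout=60.0) as client:
    with client.stream("POST", "http://localhost:8000/analyze", json=payload) as res:
        for line in res.iter_lines():
            print(line)
```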
main.py CHANGED
@@ -1,744 +1,1956 @@
1
- import asyncio
2
- import json
3
- import os
4
- import re
5
- import statistics
6
- import time
7
- import uuid
8
- from dataclasses import dataclass
9
- from typing import Any, Optional
10
- import httpx
11
- from dotenv import load_dotenv
12
- from fastapi import FastAPI, HTTPException
13
- from fastapi.middleware.cors import CORSMiddleware
14
- from fastapi.responses import StreamingResponse
15
- from pydantic import BaseModel, Field
16
-
17
- load_dotenv()
18
-
19
- app = FastAPI(title="CortexFlow Backend", version="1.0.0")
20
- app.add_middleware(
21
- CORSMiddleware,
22
- allow_origins=["*"],
23
- allow_methods=["GET", "POST"],
24
- allow_headers=["*"],
25
- )
26
-
27
-
28
- GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
29
- GROQ_API_BASE = os.getenv("GROQ_API_BASE", "https://api.groq.com/openai/v1").rstrip("/")
30
- GROQ_TIMEOUT_SECONDS = float(os.getenv("GROQ_TIMEOUT_SECONDS", "40"))
31
- MODEL_DISCOVERY_TTL_SECONDS = int(os.getenv("MODEL_DISCOVERY_TTL_SECONDS", "900"))
32
-
33
- PREFERRED_REASONING_MODELS = [
34
- m.strip()
35
- for m in os.getenv(
36
- "GROQ_REASONING_CANDIDATES",
37
- "openai/gpt-oss-120b,llama-3.3-70b-versatile,openai/gpt-oss-20b,llama-3.1-8b-instant",
38
- ).split(",")
39
- if m.strip()
40
- ]
41
- PREFERRED_SAFETY_MODELS = [
42
- m.strip()
43
- for m in os.getenv(
44
- "GROQ_SAFETY_CANDIDATES",
45
- "openai/gpt-oss-safeguard-20b,openai/gpt-oss-20b,llama-3.1-8b-instant",
46
- ).split(",")
47
- if m.strip()
48
- ]
49
-
50
- OVERRIDE_REASONING_MODEL = os.getenv("GROQ_REASONING_MODEL", "").strip()
51
- OVERRIDE_SAFETY_MODEL = os.getenv("GROQ_SAFETY_MODEL", "").strip()
52
-
53
- MIN_WORDS_REQUIRED = int(os.getenv("MIN_WORDS_REQUIRED", "25"))
54
-
55
- STEP_NAMES = [
56
- "STT preprocessor",
57
- "Lexical agent",
58
- "Semantic agent",
59
- "Prosody agent",
60
- "Syntax agent",
61
- "Biomarker mapper",
62
- "Report composer",
63
- ]
64
- DOMAIN_REGION = {
65
- "lexical": "Broca's area",
66
- "semantic": "Wernicke's area",
67
- "prosody": "SMA",
68
- "syntax": "DLPFC",
69
- "affective": "Amygdala",
70
- }
71
-
72
- STOPWORDS = {
73
- "the", "a", "an", "and", "or", "but", "if", "then", "than", "of", "to", "in", "on", "at", "for",
74
- "with", "without", "by", "from", "as", "is", "am", "are", "was", "were", "be", "been", "being",
75
- "it", "its", "this", "that", "these", "those", "i", "you", "he", "she", "we", "they", "them",
76
- "my", "your", "our", "their", "me", "him", "her", "us", "do", "does", "did", "have", "has", "had",
77
- "not", "no", "yes", "so", "because", "about", "into", "out", "up", "down", "can", "could", "would",
78
- "should", "will", "just", "very", "really", "also",
79
- }
80
-
81
- FILLERS = {
82
- "um", "uh", "erm", "hmm", "like", "you", "know", "actually", "basically", "literally", "sort", "kind", "maybe",
83
- }
84
-
85
- POSITIVE_WORDS = {
86
- "good", "better", "great", "calm", "confident", "clear", "focused", "stable", "happy", "optimistic", "safe", "steady",
87
- }
88
- NEGATIVE_WORDS = {
89
- "bad", "worse", "anxious", "scared", "panic", "panicked", "confused", "sad", "depressed", "angry", "overwhelmed", "stressed",
90
- }
91
- AROUSAL_WORDS = {
92
- "urgent", "immediately", "intense", "extreme", "critical", "afraid", "panic", "terrified", "racing", "shaking", "worried",
93
- }
94
- HEDGE_WORDS = {
95
- "maybe", "perhaps", "possibly", "probably", "sort", "kind", "might", "could", "guess", "unsure", "not sure",
96
- }
97
- SUBORDINATORS = {
98
- "because", "although", "though", "while", "unless", "until", "since", "whereas", "however", "therefore", "moreover", "which", "that",
99
- }
100
-
101
-
102
- class AnalyzeRequest(BaseModel):
103
- input_value: Optional[str] = None
104
- transcript: Optional[str] = None
105
- pause_map: Optional[list[float]] = None
106
- audio_duration: Optional[float] = None
107
- session_id: Optional[str] = None
108
-
109
-
110
- @dataclass
111
- class DomainScore:
112
- overall: float
113
- details: dict[str, float]
114
-
115
- @dataclass
116
- class AnalysisState:
117
- scores: dict[str, DomainScore]
118
- overall_load: float
119
- confidence: float
120
- quality_notes: list[str]
121
- metrics: dict[str, Any]
122
-
123
-
124
- _MODEL_CACHE: dict[str, Any] = {"updated": 0.0, "models": []}
125
- _MODEL_CACHE_LOCK = asyncio.Lock()
126
-
127
- def clamp01(v: float) -> float:
128
- return max(0.0, min(1.0, v))
129
-
130
-
131
- def mean(values: list[float], default: float = 0.0) -> float:
132
- return float(statistics.mean(values)) if values else default
133
-
134
-
135
- def tokenize_words(text: str) -> list[str]:
136
- return re.findall(r"[A-Za-z']+", text.lower())
137
-
138
-
139
- def split_sentences(text: str) -> list[str]:
140
- parts = [p.strip() for p in re.split(r"(?<=[.!?])\s+", text) if p.strip()]
141
- return parts if parts else ([text.strip()] if text.strip() else [])
142
-
143
-
144
- def content_words(tokens: list[str]) -> list[str]:
145
- return [t for t in tokens if len(t) > 2 and t not in STOPWORDS]
146
-
147
-
148
- def jaccard(a: set[str], b: set[str]) -> float:
149
- if not a or not b:
150
- return 0.0
151
- inter = len(a.intersection(b))
152
- union = len(a.union(b))
153
- return inter / union if union else 0.0
154
-
155
-
156
- def scale_linear(value: float, low: float, high: float) -> float:
157
- if high <= low:
158
- return 0.0
159
- return clamp01((value - low) / (high - low))
160
-
161
-
162
- def scale_inverse(value: float, good: float, poor: float) -> float:
163
- if poor >= good:
164
- return 0.0
165
- return clamp01((good - value) / (good - poor))
166
-
167
- def safe_step_event(name: str, status: str, detail: Optional[str] = None) -> bytes:
168
- payload: dict[str, Any] = {"type": "step", "step": {"name": name, "status": status}}
169
- if detail:
170
- payload["step"]["detail"] = detail
171
- return (json.dumps(payload) + "\n").encode()
172
-
173
-
174
- def ensure_nonempty_text(req: AnalyzeRequest) -> str:
175
- text = (req.input_value or req.transcript or "").strip()
176
- words = tokenize_words(text)
177
- if not text:
178
- raise HTTPException(status_code=400, detail="No input text provided")
179
- if len(words) < MIN_WORDS_REQUIRED:
180
- raise HTTPException(
181
- status_code=422,
182
- detail=f"Need at least {MIN_WORDS_REQUIRED} words for reliable analysis. Received {len(words)} words.",
183
- )
184
- return text
185
-
186
-
187
- def lexical_domain(tokens: list[str], content: list[str]) -> tuple[DomainScore, dict[str, float]]:
188
- total = max(len(tokens), 1)
189
- unique = len(set(tokens))
190
- filler_hits = sum(1 for t in tokens if t in FILLERS)
191
-
192
- ttr = unique / total
193
- density = len(content) / total
194
- filler_rate = (filler_hits / total) * 100.0
195
-
196
- s_ttr = clamp01(abs(ttr - 0.52) / 0.30)
197
- s_density = clamp01(abs(density - 0.58) / 0.25)
198
- s_filler = scale_linear(filler_rate, 2.0, 14.0)
199
-
200
- overall = clamp01((0.4 * s_ttr) + (0.35 * s_density) + (0.25 * s_filler))
201
-
202
- details = {
203
- "ttr": round(s_ttr, 4),
204
- "density": round(s_density, 4),
205
- "filler_rate": round(s_filler, 4),
206
- }
207
- raw = {
208
- "ttr": round(ttr, 4),
209
- "lexical_density": round(density, 4),
210
- "filler_rate_per_100w": round(filler_rate, 2),
211
- }
212
- return DomainScore(round(overall, 4), details), raw
213
-
214
-
215
- def semantic_domain(sentences: list[str]) -> tuple[DomainScore, dict[str, float]]:
216
- if len(sentences) < 2:
217
- coherence = 0.16
218
- idea_density = 0.45
219
- tangentiality = 0.55
220
- else:
221
- sentence_content = [set(content_words(tokenize_words(s))) for s in sentences]
222
- pairwise = [jaccard(sentence_content[i], sentence_content[i + 1]) for i in range(len(sentence_content) - 1)]
223
- coherence = mean(pairwise, default=0.12)
224
- avg_content_len = mean([len(x) for x in sentence_content], default=0.0)
225
- idea_density = clamp01(avg_content_len / 14.0)
226
- tangentiality = clamp01(1.0 - coherence)
227
- s_coherence = scale_inverse(coherence, good=0.22, poor=0.05)
228
- s_idea_density = scale_inverse(idea_density, good=0.65, poor=0.25)
229
- s_tangentiality = scale_linear(tangentiality, low=0.35, high=0.85)
230
-
231
- overall = clamp01((0.45 * s_coherence) + (0.30 * s_idea_density) + (0.25 * s_tangentiality))
232
-
233
- details = {
234
- "coherence": round(s_coherence, 4),
235
- "idea_density": round(s_idea_density, 4),
236
- "tangentiality": round(s_tangentiality, 4),
237
- }
238
- raw = {
239
- "coherence_index": round(coherence, 4),
240
- "idea_density_index": round(idea_density, 4),
241
- "tangentiality_index": round(tangentiality, 4),
242
- }
243
- return DomainScore(round(overall, 4), details), raw
244
-
245
-
246
- def prosody_domain(
247
- tokens: list[str], text: str, pause_map: Optional[list[float]], audio_duration: Optional[float]
248
- ) -> tuple[DomainScore, dict[str, float], bool]:
249
- word_count = max(len(tokens), 1)
250
- pauses = [float(p) for p in (pause_map or []) if p >= 0]
251
- has_audio_prosody = bool(pauses)
252
-
253
- if audio_duration and audio_duration > 5.0:
254
- duration_seconds = audio_duration
255
- else:
256
- estimated_speech_seconds = word_count / 2.5
257
- duration_seconds = estimated_speech_seconds + sum(pauses)
258
-
259
- duration_minutes = max(duration_seconds / 60.0, 0.1)
260
- speech_rate = word_count / duration_minutes
261
-
262
- if pauses:
263
- pause_freq = len(pauses) / duration_minutes
264
- hesitation_ratio = sum(1 for p in pauses if p >= 0.8) / len(pauses)
265
- else:
266
- punctuation_pauses = len(re.findall(r"[,;:\-]", text))
267
- pause_freq = (punctuation_pauses / max(word_count, 1)) * 100
268
- hesitation_ratio = sum(1 for t in tokens if t in FILLERS) / max(word_count, 1)
269
-
270
- s_rate = clamp01(abs(speech_rate - 140.0) / 95.0)
271
- s_pause = scale_linear(pause_freq, low=8.0, high=30.0)
272
- s_hes = scale_linear(hesitation_ratio, low=0.08, high=0.35)
273
-
274
- overall = clamp01((0.4 * s_rate) + (0.35 * s_pause) + (0.25 * s_hes))
275
-
276
- details = {
277
- "speech_rate": round(s_rate, 4),
278
- "pause_freq": round(s_pause, 4),
279
- "hesitation": round(s_hes, 4),
280
- }
281
- raw = {
282
- "speech_rate_wpm": round(speech_rate, 1),
283
- "pause_frequency_per_min": round(pause_freq, 2),
284
- "hesitation_ratio": round(hesitation_ratio, 4),
285
- "duration_seconds": round(duration_seconds, 2),
286
- }
287
- return DomainScore(round(overall, 4), details), raw, has_audio_prosody
288
-
289
- def syntax_domain(tokens: list[str], sentences: list[str], text: str) -> tuple[DomainScore, dict[str, float]]:
290
- sentence_count = max(len(sentences), 1)
291
- mlu = len(tokens) / sentence_count
292
-
293
- per_sentence_depth = []
294
- for s in sentences:
295
- stoks = tokenize_words(s)
296
- sub_count = sum(1 for t in stoks if t in SUBORDINATORS)
297
- comma_count = s.count(",")
298
- per_sentence_depth.append(sub_count + (comma_count * 0.5))
299
- clause_depth = mean(per_sentence_depth, default=0.0)
300
-
301
- passive_matches = re.findall(r"\b(?:is|are|was|were|be|been|being)\s+\w+(?:ed|en)\b", text.lower())
302
- passive_ratio = len(passive_matches) / max(sentence_count, 1)
303
-
304
- s_mlu = clamp01(abs(mlu - 17.0) / 12.0)
305
- s_depth = scale_linear(clause_depth, low=2.0, high=6.5)
306
- s_passive = scale_linear(passive_ratio, low=0.15, high=1.2)
307
-
308
- overall = clamp01((0.45 * s_mlu) + (0.35 * s_depth) + (0.20 * s_passive))
309
-
310
- details = {
311
- "mlu": round(s_mlu, 4),
312
- "clause_depth": round(s_depth, 4),
313
- "passive_ratio": round(s_passive, 4),
314
- }
315
- raw = {
316
- "mean_length_utterance": round(mlu, 2),
317
- "clause_depth_index": round(clause_depth, 2),
318
- "passive_ratio": round(passive_ratio, 3),
319
- }
320
- return DomainScore(round(overall, 4), details), raw
321
-
322
-
323
- def affective_domain(tokens: list[str]) -> tuple[DomainScore, dict[str, float]]:
324
- total = max(len(tokens), 1)
325
- pos = sum(1 for t in tokens if t in POSITIVE_WORDS)
326
- neg = sum(1 for t in tokens if t in NEGATIVE_WORDS)
327
- arousal = sum(1 for t in tokens if t in AROUSAL_WORDS)
328
- hedge = sum(1 for t in tokens if t in HEDGE_WORDS)
329
-
330
- valence = (pos - neg) / (pos + neg + 1)
331
- valence_01 = (valence + 1.0) / 2.0
332
- arousal_rate = (arousal / total) * 100.0
333
- certainty = 1.0 - clamp01(hedge / max(total * 0.15, 1.0))
334
-
335
- s_valence = scale_inverse(valence_01, good=0.62, poor=0.20)
336
- s_arousal = scale_linear(arousal_rate, low=3.0, high=14.0)
337
- s_certainty = scale_inverse(certainty, good=0.72, poor=0.32)
338
- overall = clamp01((0.4 * s_valence) + (0.35 * s_arousal) + (0.25 * s_certainty))
339
-
340
- details = {
341
- "valence": round(s_valence, 4),
342
- "arousal": round(s_arousal, 4),
343
- "certainty": round(s_certainty, 4),
344
- }
345
- raw = {
346
- "valence_score": round(valence_01, 4),
347
- "arousal_rate_per_100w": round(arousal_rate, 2),
348
- "certainty_index": round(certainty, 4),
349
- }
350
- return DomainScore(round(overall, 4), details), raw
351
-
352
-
353
- def compute_confidence(
354
- word_count: int, sentence_count: int, has_audio_prosody: bool, repeat_ratio: float
355
- ) -> tuple[float, list[str]]:
356
- notes: list[str] = []
357
- c_words = clamp01(word_count / 180.0)
358
- c_sents = clamp01(sentence_count / 8.0)
359
- c_repeat = clamp01(1.0 - (repeat_ratio * 1.4))
360
- c_audio = 1.0 if has_audio_prosody else 0.55
361
-
362
- confidence = clamp01((0.45 * c_words) + (0.2 * c_sents) + (0.2 * c_repeat) + (0.15 * c_audio))
363
-
364
- if word_count < 60:
365
- notes.append("Low sample length. Interpret results cautiously.")
366
- if not has_audio_prosody:
367
- notes.append("Prosody is inferred from text patterns because pause-map audio features were not provided.")
368
- if repeat_ratio > 0.45:
369
- notes.append("High repetition detected, which can reduce semantic reliability.")
370
-
371
- return round(confidence, 4), notes
372
-
373
-
374
- def compute_analysis_state(
375
- text: str,
376
- pause_map: Optional[list[float]],
377
- audio_duration: Optional[float],
378
- ) -> AnalysisState:
379
- tokens = tokenize_words(text)
380
- sentences = split_sentences(text)
381
- cwords = content_words(tokens)
382
-
383
- repeat_ratio = 1.0 - (len(set(tokens)) / max(len(tokens), 1))
384
-
385
- lexical, lexical_raw = lexical_domain(tokens, cwords)
386
- semantic, semantic_raw = semantic_domain(sentences)
387
- prosody, prosody_raw, has_audio = prosody_domain(tokens, text, pause_map, audio_duration)
388
- syntax, syntax_raw = syntax_domain(tokens, sentences, text)
389
- affective, affective_raw = affective_domain(tokens)
390
- confidence, quality_notes = compute_confidence(
391
- word_count=len(tokens),
392
- sentence_count=len(sentences),
393
- has_audio_prosody=has_audio,
394
- repeat_ratio=repeat_ratio,
395
- )
396
-
397
- scores = {
398
- "lexical": lexical,
399
- "semantic": semantic,
400
- "prosody": prosody,
401
- "syntax": syntax,
402
- "affective": affective,
403
- }
404
-
405
- weighted = (
406
- (0.22 * lexical.overall)
407
- + (0.23 * semantic.overall)
408
- + (0.18 * prosody.overall)
409
- + (0.22 * syntax.overall)
410
- + (0.15 * affective.overall)
411
- )
412
-
413
- confidence_factor = 0.75 + (0.25 * confidence)
414
- overall_load = clamp01(weighted * confidence_factor)
415
-
416
- metrics = {
417
- "word_count": len(tokens),
418
- "sentence_count": len(sentences),
419
- "repeat_ratio": round(repeat_ratio, 4),
420
- "lexical": lexical_raw,
421
- "semantic": semantic_raw,
422
- "prosody": prosody_raw,
423
- "syntax": syntax_raw,
424
- "affective": affective_raw,
425
- }
426
-
427
- return AnalysisState(
428
- scores=scores,
429
- overall_load=round(overall_load, 4),
430
- confidence=confidence,
431
- quality_notes=quality_notes,
432
- metrics=metrics,
433
- )
434
-
435
-
436
- def severity_from_score(value: float) -> str:
437
- if value >= 0.72:
438
- return "high"
439
- if value >= 0.42:
440
- return "moderate"
441
- return "low"
442
-
443
- def level_from_overall(overall_load: float, confidence: float) -> str:
444
- if overall_load >= 0.68:
445
- base = "high"
446
- elif overall_load >= 0.44:
447
- base = "moderate"
448
- else:
449
- base = "low"
450
-
451
- if confidence < 0.45 and base == "high":
452
- return "moderate"
453
- return base
454
-
455
-
456
- def summary_fallback(state: AnalysisState, risk_level: str) -> str:
457
- top_domain = max(state.scores.items(), key=lambda kv: kv[1].overall)[0]
458
- top_value = state.scores[top_domain].overall
459
- confidence_pct = round(state.confidence * 100)
460
- return (
461
- f"This analysis found a {risk_level} overall cognitive load signal based on linguistic and timing features. "
462
- f"The strongest deviation appeared in {top_domain} markers (score {top_value:.2f}). "
463
- f"Confidence is {confidence_pct}% and this output is screening support only, not a diagnosis."
464
- )
465
-
466
-
467
- def make_highlights(state: AnalysisState) -> list[dict[str, Any]]:
468
- sorted_domains = sorted(state.scores.items(), key=lambda kv: kv[1].overall, reverse=True)
469
- highlights: list[dict[str, Any]] = []
470
- for domain, score in sorted_domains[:3]:
471
- if score.overall >= 0.66:
472
- finding = "Elevated deviation from expected baseline in this domain."
473
- elif score.overall >= 0.42:
474
- finding = "Mild-to-moderate deviation with mixed stability."
475
- else:
476
- finding = "Signals remain within expected variation for this domain."
477
-
478
- highlights.append(
479
- {
480
- "region": DOMAIN_REGION[domain],
481
- "activation": round(score.overall, 4),
482
- "finding": finding,
483
- "clinical_context": "Screening signal only. Interpret alongside clinical judgement and repeated assessments.",
484
- }
485
- )
486
- return highlights
487
-
488
-
489
- def make_indicators(state: AnalysisState) -> list[dict[str, Any]]:
490
- indicators: list[dict[str, Any]] = []
491
- for domain, dscore in state.scores.items():
492
- for k, v in dscore.details.items():
493
- if v < 0.42:
494
- continue
495
- indicators.append(
496
- {
497
- "indicator": f"{domain.title()} · {k.replace('_', ' ').title()}",
498
- "severity": severity_from_score(v),
499
- "explanation": f"Computed score {v:.2f} from measured input features; higher means greater deviation from baseline patterns.",
500
- }
501
- )
502
- indicators.sort(key=lambda x: {"high": 2, "moderate": 1, "low": 0}[x["severity"]], reverse=True)
503
- return indicators[:6]
504
-
505
-
506
- def recommendation_for_level(level: str, confidence: float) -> str:
507
- if level == "high":
508
- return (
509
- "Repeat this assessment with a longer sample, then discuss the combined results with a qualified clinician. "
510
- "Do not treat this result as a diagnosis."
511
- )
512
- if level == "moderate":
513
- return (
514
- "Collect 1-2 additional samples across different times of day to confirm trend stability before drawing conclusions."
515
- )
516
- if confidence < 0.5:
517
- return "Provide a longer speech sample for stronger reliability before interpreting the result."
518
- return "Current signals are relatively stable. Continue periodic monitoring rather than one-off interpretation."
519
-
520
-
521
- async def fetch_available_models() -> list[str]:
522
- if not GROQ_API_KEY:
523
- return []
524
-
525
- async with _MODEL_CACHE_LOCK:
526
- now = time.time()
527
- if now - float(_MODEL_CACHE["updated"]) < MODEL_DISCOVERY_TTL_SECONDS:
528
- return list(_MODEL_CACHE["models"])
529
-
530
- headers = {"Authorization": f"Bearer {GROQ_API_KEY}"}
531
- try:
532
- async with httpx.AsyncClient(timeout=GROQ_TIMEOUT_SECONDS) as client:
533
- res = await client.get(f"{GROQ_API_BASE}/models", headers=headers)
534
- res.raise_for_status()
535
- data = res.json().get("data", [])
536
- models = sorted({item.get("id", "") for item in data if item.get("id")})
537
- _MODEL_CACHE["updated"] = now
538
- _MODEL_CACHE["models"] = models
539
- return models
540
- except Exception:
541
- return list(_MODEL_CACHE["models"])
542
-
543
-
544
- def pick_model(available: list[str], override: str, candidates: list[str]) -> Optional[str]:
545
- if override and override in available:
546
- return override
547
-
548
- for m in candidates:
549
- if m in available:
550
- return m
551
- for m in available:
552
- lowered = m.lower()
553
- if "instruct" in lowered or "versatile" in lowered or "gpt-oss" in lowered:
554
- return m
555
-
556
- return available[0] if available else None
557
-
558
-
559
- async def groq_chat(model: str, system: str, user: str, temperature: float = 0.2) -> Optional[str]:
560
- if not GROQ_API_KEY or not model:
561
- return None
562
-
563
- headers = {
564
- "Authorization": f"Bearer {GROQ_API_KEY}",
565
- "Content-Type": "application/json",
566
- }
567
- payload = {
568
- "model": model,
569
- "temperature": temperature,
570
- "messages": [
571
- {"role": "system", "content": system},
572
- {"role": "user", "content": user},
573
- ],
574
- }
575
-
576
- try:
577
- async with httpx.AsyncClient(timeout=GROQ_TIMEOUT_SECONDS) as client:
578
- res = await client.post(f"{GROQ_API_BASE}/chat/completions", headers=headers, json=payload)
579
- res.raise_for_status()
580
- data = res.json()
581
- return data["choices"][0]["message"]["content"].strip()
582
- except Exception:
583
- return None
584
-
585
-
586
- async def compose_safe_summary(state: AnalysisState, risk_level: str) -> tuple[str, dict[str, Optional[str]]]:
587
- available = await fetch_available_models()
588
- reasoning_model = pick_model(available, OVERRIDE_REASONING_MODEL, PREFERRED_REASONING_MODELS)
589
- safety_model = pick_model(available, OVERRIDE_SAFETY_MODEL, PREFERRED_SAFETY_MODELS)
590
-
591
- model_meta = {
592
- "reasoning_model": reasoning_model,
593
- "safety_model": safety_model,
594
- }
595
-
596
- baseline_summary = summary_fallback(state, risk_level)
597
- if not reasoning_model:
598
- return baseline_summary, model_meta
599
-
600
- features_for_prompt = {
601
- "risk_level": risk_level,
602
- "overall_cognitive_load": state.overall_load,
603
- "confidence": state.confidence,
604
- "scores": {k: v.overall for k, v in state.scores.items()},
605
- "quality_notes": state.quality_notes,
606
- "metrics": state.metrics,
607
- }
608
- system = (
609
- "You summarize computational language-screening outputs. "
610
- "Never diagnose disease, never use alarming wording, and always state uncertainty when confidence is limited. "
611
- "Output exactly 2-3 sentences in plain text."
612
- )
613
- user = "Write a careful summary for this analysis:\n" + json.dumps(features_for_prompt)
614
-
615
- summary = await groq_chat(reasoning_model, system, user, temperature=0.15)
616
- if not summary:
617
- return baseline_summary, model_meta
618
-
619
- if safety_model:
620
- safety_system = (
621
- "You are a safety editor for health-adjacent UX. "
622
- "Rewrite text to avoid panic, avoid diagnosis claims, and keep uncertainty explicit. "
623
- "Keep 2-3 sentences."
624
- )
625
- safety_user = (
626
- "Rewrite this summary to be non-alarmist and clinically careful while keeping factual content:\n"
627
- + summary
628
- + "\n\nConfidence: "
629
- + str(state.confidence)
630
- )
631
- safe = await groq_chat(safety_model, safety_system, safety_user, temperature=0.1)
632
- if safe:
633
- summary = safe
634
-
635
- return summary, model_meta
636
-
637
-
638
- @app.get("/health")
639
- async def health() -> dict[str, Any]:
640
- available = await fetch_available_models()
641
- return {
642
- "ok": True,
643
- "service": "cortexflow-backend",
644
- "groq_configured": bool(GROQ_API_KEY),
645
- "model_count": len(available),
646
- }
647
-
648
-
649
- @app.get("/models/recommended")
650
- async def models_recommended() -> dict[str, Any]:
651
- available = await fetch_available_models()
652
- return {
653
- "available_models": available,
654
- "recommended": {
655
- "reasoning": pick_model(available, OVERRIDE_REASONING_MODEL, PREFERRED_REASONING_MODELS),
656
- "safety": pick_model(available, OVERRIDE_SAFETY_MODEL, PREFERRED_SAFETY_MODELS),
657
- "transcription": "whisper-large-v3-turbo",
658
- },
659
- "notes": {
660
- "production_primary": "openai/gpt-oss-120b",
661
- "production_fallback": "llama-3.3-70b-versatile",
662
- "fast_fallback": "openai/gpt-oss-20b",
663
- },
664
- }
665
-
666
- @app.post("/analyze")
667
- async def analyze(req: AnalyzeRequest):
668
- text = ensure_nonempty_text(req)
669
- session_id = req.session_id or str(uuid.uuid4())
670
-
671
- async def generate():
672
- for idx, step_name in enumerate(STEP_NAMES):
673
- yield safe_step_event(step_name, "running" if idx == 0 else "pending")
674
-
675
- try:
676
- state = compute_analysis_state(text, req.pause_map, req.audio_duration)
677
- yield safe_step_event("STT preprocessor", "done", "Input normalized and validated")
678
- yield safe_step_event("Lexical agent", "running")
679
-
680
- await asyncio.sleep(0)
681
- yield safe_step_event("Lexical agent", "done")
682
- yield safe_step_event("Semantic agent", "running")
683
-
684
- await asyncio.sleep(0)
685
- yield safe_step_event("Semantic agent", "done")
686
- yield safe_step_event("Prosody agent", "running")
687
-
688
- await asyncio.sleep(0)
689
- yield safe_step_event("Prosody agent", "done")
690
- yield safe_step_event("Syntax agent", "running")
691
-
692
- await asyncio.sleep(0)
693
- yield safe_step_event("Syntax agent", "done")
694
- yield safe_step_event("Biomarker mapper", "running")
695
-
696
- scores_payload = {
697
- domain: {**score.details, "overall": score.overall}
698
- for domain, score in state.scores.items()
699
- }
700
-
701
- yield safe_step_event("Biomarker mapper", "done")
702
- yield safe_step_event("Report composer", "running")
703
-
704
- risk_level = level_from_overall(state.overall_load, state.confidence)
705
- summary, model_meta = await compose_safe_summary(state, risk_level)
706
-
707
- report = {
708
- "summary": summary,
709
- "risk_level": risk_level,
710
- "overall_cognitive_load": state.overall_load,
711
- "highlights": make_highlights(state),
712
- "risk_indicators": make_indicators(state),
713
- "recommendation": recommendation_for_level(risk_level, state.confidence),
714
- "disclaimer": (
715
- "This tool is a non-diagnostic screening aid. It can be wrong and must not be used as a standalone "
716
- "medical decision system. If you are concerned, consult a qualified clinician."
717
- ),
718
- "quality": {
719
- "confidence": state.confidence,
720
- "notes": state.quality_notes,
721
- },
722
- "model_info": model_meta,
723
- }
724
- yield safe_step_event("Report composer", "done")
725
-
726
- payload = {
727
- "type": "end",
728
- "message": summary,
729
- "scores": scores_payload,
730
- "report": report,
731
- "session_id": session_id,
732
- }
733
- yield (json.dumps(payload) + "\n").encode()
734
-
735
- except HTTPException as exc:
736
- yield (json.dumps({"type": "error", "message": exc.detail}) + "\n").encode()
737
- except Exception as exc:
738
- yield (json.dumps({"type": "error", "message": f"Analysis failed: {str(exc)}"}) + "\n").encode()
739
-
740
- return StreamingResponse(
741
- generate(),
742
- media_type="text/plain",
743
- headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
744
- )
1
+ import asyncio
2
+ import json
3
+ import os
4
+ import re
5
+ import statistics
6
+ import time
7
+ import uuid
8
+ from dataclasses import dataclass
9
+ from typing import Any, Optional
10
+ import httpx
11
+ from dotenv import load_dotenv
12
+ from fastapi import FastAPI, HTTPException
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+ from fastapi.responses import StreamingResponse
15
+ from pydantic import BaseModel
16
+
17
+
18
+
19
+ load_dotenv()
20
+ app = FastAPI(title="CortexFlow Backend", version="1.0.0")
21
+
22
+ app.add_middleware(
23
+
24
+ CORSMiddleware,
25
+
26
+ allow_origins=["*"],
27
+
28
+ allow_methods=["GET", "POST"],
29
+
30
+ allow_headers=["*"],
31
+
32
+ )
33
+
34
+
35
+
36
+
37
+
38
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()
39
+ GROQ_API_BASE = os.getenv("GROQ_API_BASE", "https://api.groq.com/openai/v1").rstrip("/")
40
+ GROQ_TIMEOUT_SECONDS = float(os.getenv("GROQ_TIMEOUT_SECONDS", "40"))
41
+ MODEL_DISCOVERY_TTL_SECONDS = int(os.getenv("MODEL_DISCOVERY_TTL_SECONDS", "900"))
42
+
43
+
44
+
45
+ PREFERRED_REASONING_MODELS = [
46
+
47
+ m.strip()
48
+
49
+ for m in os.getenv(
50
+
51
+ "GROQ_REASONING_CANDIDATES",
52
+
53
+ "openai/gpt-oss-120b,llama-3.3-70b-versatile,openai/gpt-oss-20b,llama-3.1-8b-instant",
54
+
55
+ ).split(",")
56
+
57
+ if m.strip()
58
+
59
+ ]
60
+
61
+ PREFERRED_SAFETY_MODELS = [
62
+
63
+ m.strip()
64
+
65
+ for m in os.getenv(
66
+
67
+ "GROQ_SAFETY_CANDIDATES",
68
+
69
+ "openai/gpt-oss-safeguard-20b,openai/gpt-oss-20b,llama-3.1-8b-instant",
70
+
71
+ ).split(",")
72
+
73
+ if m.strip()
74
+
75
+ ]
76
+
77
+
78
+
79
+ OVERRIDE_REASONING_MODEL = os.getenv("GROQ_REASONING_MODEL", "").strip()
80
+ OVERRIDE_SAFETY_MODEL = os.getenv("GROQ_SAFETY_MODEL", "").strip()
81
+ GROQ_TRANSCRIBE_MODEL = os.getenv("GROQ_TRANSCRIBE_MODEL", "whisper-large-v3-turbo").strip() or "whisper-large-v3-turbo"
82
+
83
+ MIN_WORDS_REQUIRED = int(os.getenv("MIN_WORDS_REQUIRED", "25"))
84
+
85
+
86
+
87
+ STEP_NAMES = [
88
+
89
+ "STT preprocessor",
90
+
91
+ "Lexical agent",
92
+
93
+ "Semantic agent",
94
+
95
+ "Prosody agent",
96
+
97
+ "Syntax agent",
98
+
99
+ "Biomarker mapper",
100
+
101
+ "Report composer",
102
+
103
+ ]
104
+
105
+ DOMAIN_REGION = {
106
+
107
+ "lexical": "Broca's area",
108
+
109
+ "semantic": "Wernicke's area",
110
+
111
+ "prosody": "SMA",
112
+
113
+ "syntax": "DLPFC",
114
+
115
+ "affective": "Amygdala",
116
+
117
+ }
118
+
119
+
120
+
121
+ STOPWORDS = {
122
+
123
+ "the", "a", "an", "and", "or", "but", "if", "then", "than", "of", "to", "in", "on", "at", "for",
124
+
125
+ "with", "without", "by", "from", "as", "is", "am", "are", "was", "were", "be", "been", "being",
126
+
127
+ "it", "its", "this", "that", "these", "those", "i", "you", "he", "she", "we", "they", "them",
128
+
129
+ "my", "your", "our", "their", "me", "him", "her", "us", "do", "does", "did", "have", "has", "had",
130
+
131
+ "not", "no", "yes", "so", "because", "about", "into", "out", "up", "down", "can", "could", "would",
132
+
133
+ "should", "will", "just", "very", "really", "also",
134
+ # Common Romanized Hindi stopwords for code-mixed speech.
135
+ "hai", "hain", "tha", "thi", "the", "ho", "hoga", "hogi", "honge", "main", "mein", "mera", "meri", "mere",
136
+ "hum", "tum", "aap", "ye", "yeh", "wo", "woh", "is", "iss", "us", "uss", "ko", "se", "ka", "ki", "ke",
137
+ "par", "aur", "lekin", "magar", "kyunki", "kyonki", "agar", "jab", "tab", "tak", "ya", "nahi", "nahin", "haan",
138
+ # Common Devanagari stopwords for native Hindi transcripts.
139
+ "है", "हैं", "था", "थी", "थे", "हो", "होगा", "होगी", "होंगे", "मैं", "में", "मेरा", "मेरी", "मेरे", "हम",
140
+ "तुम", "आप", "ये", "यह", "वो", "वह", "इस", "उस", "को", "से", "का", "की", "के", "पर", "और", "लेकिन",
141
+ "मगर", "क्योंकि", "अगर", "जब", "तब", "तक", "या", "नहीं", "हाँ",
142
+
143
+ }
144
+
145
+
146
+
147
+ FILLERS = {
148
+
149
+ "um", "uh", "erm", "hmm", "like", "actually", "basically", "literally",
150
+ "matlab", "achha", "accha", "toh", "na", "yaar", "dekho", "samjho", "मतलब", "अच्छा", "तो", "ना",
151
+
152
+ }
153
+
154
+
155
+
156
+ POSITIVE_WORDS = {
157
+
158
+ "good", "better", "great", "calm", "confident", "clear", "focused", "stable", "happy", "optimistic", "safe", "steady",
159
+ "accha", "badhiya", "shaant", "khush", "सुरक्षित", "शांत", "खुश", "अच्छा",
160
+
161
+ }
162
+
163
+ NEGATIVE_WORDS = {
164
+
165
+ "bad", "worse", "anxious", "scared", "panic", "panicked", "confused", "sad", "depressed", "angry", "overwhelmed", "stressed",
166
+ "bura", "ghabrahat", "darr", "pareshan", "dukhi", "चिंतित", "डरा", "उलझन", "दुखी", "तनाव",
167
+
168
+ }
169
+
170
+ AROUSAL_WORDS = {
171
+
172
+ "urgent", "immediately", "intense", "extreme", "critical", "afraid", "panic", "terrified", "racing", "shaking", "worried",
173
+ "jaldi", "turant", "tez", "bahut", "घबराहट", "तुरंत", "जल्दी", "तेज", "चिंता",
174
+
175
+ }
176
+
177
+ HEDGE_WORDS = {
178
+
179
+ "maybe", "perhaps", "possibly", "probably", "sort", "kind", "might", "could", "guess", "unsure", "not sure",
180
+ "shayad", "lagta", "shayad", "pata", "कदाचित", "शायद", "लगता", "पता",
181
+
182
+ }
183
+
184
+ FILLER_PHRASES = {
185
+ "you know",
186
+ "i mean",
187
+ "sort of",
188
+ "kind of",
189
+ "pata hai",
190
+ "you know what",
191
+ }
192
+
193
+ HEDGE_PHRASES = {
194
+ "not sure",
195
+ "i guess",
196
+ "sort of",
197
+ "kind of",
198
+ "pata nahi",
199
+ "mujhe lagta",
200
+ }
201
+
202
+ SUBORDINATORS = {
203
+
204
+ "because", "although", "though", "while", "unless", "until", "since", "whereas", "however", "therefore", "moreover", "which", "that",
205
+ "kyunki", "kyonki", "agar", "jab", "jabki", "lekin", "magar", "isliye", "jo", "कि", "क्योंकि", "अगर", "जब", "जबकि", "लेकिन", "मगर", "इसलिए", "जो",
206
+
207
+ }
208
+
209
+ ROMAN_HINDI_MARKERS = {
210
+ "hai", "hain", "tha", "thi", "the", "main", "mein", "mera", "meri", "mere", "hum", "tum", "aap", "ye", "yeh",
211
+ "wo", "woh", "ko", "se", "ka", "ki", "ke", "par", "aur", "lekin", "magar", "kyunki", "kyonki", "agar", "jab",
212
+ "tab", "tak", "ya", "nahi", "nahin", "haan", "accha", "achha", "matlab", "yaar", "jaldi", "turant", "shayad",
213
+ "pata", "samjho", "dekho", "bahut", "thoda", "zyada", "abhi", "kal", "kar", "karna", "kiya", "karo", "raha", "rahi",
214
+ }
215
+
216
+
217
+
218
+
219
+
220
+ class AnalyzeRequest(BaseModel):
221
+
222
+ input_value: Optional[str] = None
223
+
224
+ transcript: Optional[str] = None
225
+
226
+ pause_map: Optional[list[float]] = None
227
+
228
+ audio_duration: Optional[float] = None
229
+
230
+ detected_language: Optional[str] = None
231
+
232
+ language_profile: Optional[dict[str, Any]] = None
233
+
234
+ session_id: Optional[str] = None
235
+
236
+
237
+
238
+
239
+
240
+ @dataclass
241
+
242
+ class DomainScore:
243
+
244
+ overall: float
245
+
246
+ details: dict[str, float]
247
+
248
+
249
+
250
+ @dataclass
251
+
252
+ class AnalysisState:
253
+
254
+ scores: dict[str, DomainScore]
255
+
256
+ overall_load: float
257
+
258
+ confidence: float
259
+
260
+ quality_notes: list[str]
261
+
262
+ language_profile: dict[str, Any]
263
+
264
+ metrics: dict[str, Any]
265
+
266
+
267
+
268
+
269
+
270
+ _MODEL_CACHE: dict[str, Any] = {"updated": 0.0, "models": []}
271
+
272
+ _MODEL_CACHE_LOCK = asyncio.Lock()
273
+
274
+ LATIN_TOKEN_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?")
275
+ DEVANAGARI_TOKEN_RE = re.compile(r"[\u0900-\u097F]+")
276
+ WORD_TOKEN_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?|[\u0900-\u097F]+")
277
+
278
+
279
+
280
+ def clamp01(v: float) -> float:
281
+
282
+ return max(0.0, min(1.0, v))
283
+
284
+
285
+
286
+
287
+
288
+ def mean(values: list[float], default: float = 0.0) -> float:
289
+
290
+ return float(statistics.mean(values)) if values else default
291
+
292
+
293
+
294
+
295
+
296
+ def tokenize_words(text: str) -> list[str]:
297
+
298
+ return [tok.lower() for tok in WORD_TOKEN_RE.findall(text)]
299
+
300
+
301
+
302
+
303
+
304
+ def split_sentences(text: str) -> list[str]:
305
+
306
+ parts = [p.strip() for p in re.split(r"(?<=[.!?।])\s+", text) if p.strip()]
307
+
308
+ return parts if parts else ([text.strip()] if text.strip() else [])
309
+
310
+
311
+
312
+
313
+
314
+ def content_words(tokens: list[str]) -> list[str]:
315
+
316
+ return [t for t in tokens if len(t) > 2 and t not in STOPWORDS]
317
+
318
+
319
+ def read_profile_ratio(profile: Optional[dict[str, Any]], snake_key: str, camel_key: str) -> Optional[float]:
320
+
321
+ if not profile or not isinstance(profile, dict):
322
+
323
+ return None
324
+
325
+ raw = profile.get(snake_key)
326
+
327
+ if raw is None:
328
+
329
+ raw = profile.get(camel_key)
330
+
331
+ if raw is None:
332
+
333
+ return None
334
+
335
+ try:
336
+
337
+ return clamp01(float(raw))
338
+
339
+ except (TypeError, ValueError):
340
+
341
+ return None
342
+
343
+
344
+ def detect_language_profile(
345
+
346
+ text: str,
347
+
348
+ hinted_language: Optional[str] = None,
349
+
350
+ hinted_profile: Optional[dict[str, Any]] = None,
351
+
352
+ ) -> dict[str, Any]:
353
+
354
+ latin_tokens = [tok.lower() for tok in LATIN_TOKEN_RE.findall(text)]
355
+
356
+ devanagari_tokens = DEVANAGARI_TOKEN_RE.findall(text)
357
+
358
+ roman_hindi_hits = sum(1 for tok in latin_tokens if tok in ROMAN_HINDI_MARKERS)
359
+
360
+ hindi_tokens = len(devanagari_tokens) + roman_hindi_hits
361
+
362
+ english_tokens = max(len(latin_tokens) - roman_hindi_hits, 0)
363
+
364
+ total = max(hindi_tokens + english_tokens, 1)
365
+
366
+
367
+
368
+ hindi_ratio = hindi_tokens / total
369
+
370
+ english_ratio = english_tokens / total
371
+
372
+ devanagari_ratio = len(devanagari_tokens) / total
373
+
374
+
375
+
376
+ hinted_english_ratio = read_profile_ratio(hinted_profile, "english_ratio", "englishRatio")
377
+
378
+ hinted_hindi_ratio = read_profile_ratio(hinted_profile, "hindi_ratio", "hindiRatio")
379
+
380
+ if hinted_english_ratio is not None and hinted_hindi_ratio is not None and (hinted_english_ratio + hinted_hindi_ratio) > 0:
381
+
382
+ hinted_total = hinted_english_ratio + hinted_hindi_ratio
383
+
384
+ hinted_english_ratio /= hinted_total
385
+
386
+ hinted_hindi_ratio /= hinted_total
387
+
388
+ english_ratio = (0.75 * english_ratio) + (0.25 * hinted_english_ratio)
389
+
390
+ hindi_ratio = (0.75 * hindi_ratio) + (0.25 * hinted_hindi_ratio)
391
+
392
+ ratio_total = max(english_ratio + hindi_ratio, 1e-6)
393
+
394
+ english_ratio = english_ratio / ratio_total
395
+
396
+ hindi_ratio = hindi_ratio / ratio_total
397
+
398
+
399
+
400
+ label = "multilingual"
401
+
402
+ if hindi_ratio >= 0.2 and english_ratio >= 0.2:
403
+
404
+ label = "hinglish"
405
+
406
+ elif hindi_ratio >= 0.68:
407
+
408
+ label = "hindi"
409
+
410
+ elif english_ratio >= 0.68:
411
+
412
+ label = "english"
413
+
414
+
415
+
416
+ hint = (hinted_language or "").strip().lower()
417
+
418
+ if hint in {"hi", "hindi"}:
419
+
420
+ if english_ratio >= 0.2:
421
+
422
+ label = "hinglish"
423
+
424
+ elif label == "multilingual":
425
+
426
+ label = "hindi"
427
+
428
+ elif hint in {"en", "english"}:
429
+
430
+ if hindi_ratio >= 0.2:
431
+
432
+ label = "hinglish"
433
+
434
+ elif label == "multilingual":
435
+
436
+ label = "english"
437
+
438
+
439
+
440
+ if hinted_profile and isinstance(hinted_profile, dict):
441
+
442
+ hinted_label = str(hinted_profile.get("label", "")).strip().lower()
443
+
444
+ if hinted_label in {"hinglish", "hindi", "english", "multilingual"} and (
445
+
446
+ label == "multilingual" or abs(hindi_ratio - english_ratio) < 0.12
447
+
448
+ ):
449
+
450
+ label = hinted_label
451
+
452
+
453
+
454
+ return {
455
+
456
+ "label": label,
457
+
458
+ "english_ratio": round(english_ratio, 4),
459
+
460
+ "hindi_ratio": round(hindi_ratio, 4),
461
+
462
+ "devanagari_ratio": round(devanagari_ratio, 4),
463
+
464
+ }
465
+
466
+
467
+
468
+
469
+
470
+ def jaccard(a: set[str], b: set[str]) -> float:
471
+
472
+ if not a or not b:
473
+
474
+ return 0.0
475
+
476
+ inter = len(a.intersection(b))
477
+
478
+ union = len(a.union(b))
479
+
480
+ return inter / union if union else 0.0
481
+
482
+
483
+
484
+
485
+
486
+ def scale_linear(value: float, low: float, high: float) -> float:
487
+
488
+ if high <= low:
489
+
490
+ return 0.0
491
+
492
+ return clamp01((value - low) / (high - low))
493
+
494
+
495
+
496
+
497
+
498
+ def scale_inverse(value: float, good: float, poor: float) -> float:
499
+
500
+ if poor >= good:
501
+
502
+ return 0.0
503
+
504
+ return clamp01((good - value) / (good - poor))
505
+
506
+
507
+ def pick_language_target(
508
+
509
+ language_profile: dict[str, Any],
510
+
511
+ english: float,
512
+
513
+ hinglish: float,
514
+
515
+ hindi: float,
516
+
517
+ multilingual: Optional[float] = None,
518
+
519
+ ) -> float:
520
+
521
+ try:
522
+
523
+ english_ratio = clamp01(float(language_profile.get("english_ratio", 0.0)))
524
+
525
+ except (TypeError, ValueError):
526
+
527
+ english_ratio = 0.0
528
+
529
+ try:
530
+
531
+ hindi_ratio = clamp01(float(language_profile.get("hindi_ratio", 0.0)))
532
+
533
+ except (TypeError, ValueError):
534
+
535
+ hindi_ratio = 0.0
536
+
537
+ ratio_total = english_ratio + hindi_ratio
538
+
539
+
540
+
541
+ if ratio_total > 1e-6:
542
+
543
+ english_weight = english_ratio / ratio_total
544
+
545
+ hindi_weight = hindi_ratio / ratio_total
546
+
547
+ base_target = (english_weight * english) + (hindi_weight * hindi)
548
+
549
+ code_mix_strength = clamp01(2.0 * min(english_weight, hindi_weight))
550
+
551
+ blended_target = ((1.0 - code_mix_strength) * base_target) + (code_mix_strength * hinglish)
552
+
553
+ if multilingual is not None:
554
+
555
+ blended_target = (0.9 * blended_target) + (0.1 * multilingual)
556
+
557
+ return blended_target
558
+
559
+
560
+
561
+ label = str(language_profile.get("label", "english")).lower()
562
+
563
+ if label == "hinglish":
564
+
565
+ return hinglish
566
+
567
+ if label == "hindi":
568
+
569
+ return hindi
570
+
571
+ if label == "multilingual":
572
+
573
+ return multilingual if multilingual is not None else (english + hindi) / 2.0
574
+
575
+ return english
576
+
577
+
578
+ def transcription_model_capabilities(model_name: str) -> dict[str, Any]:
579
+
580
+ normalized = model_name.strip().lower()
581
+
582
+ if not normalized:
583
+
584
+ return {
585
+
586
+ "model": "unknown",
587
+
588
+ "multilingual": False,
589
+
590
+ "hindi_supported": False,
591
+
592
+ "hinglish_supported": False,
593
+
594
+ "notes": "No transcription model configured.",
595
+
596
+ }
597
+
598
+
599
+
600
+ english_only = normalized.endswith("-en") or normalized in {
601
+
602
+ "distil-whisper-large-v3-en",
603
+
604
+ "whisper-large-v3-en",
605
+
606
+ }
607
+
608
+ multilingual = (
609
+
610
+ ("whisper" in normalized and not english_only)
611
+
612
+ or ("gpt-4o-mini-transcribe" in normalized)
613
+
614
+ or ("gpt-4o-transcribe" in normalized)
615
+
616
+ )
617
+
618
+
619
+
620
+ if english_only:
621
+
622
+ notes = "Configured model appears English-only. Use a multilingual Whisper model for Hindi/Hinglish."
623
+
624
+ elif multilingual:
625
+
626
+ notes = "Configured model supports multilingual transcription, including Hindi and code-mixed Hinglish."
627
+
628
+ else:
629
+
630
+ notes = "Model capability is unknown; verify multilingual Hindi support in provider documentation."
631
+
632
+
633
+
634
+ return {
635
+
636
+ "model": model_name,
637
+
638
+ "multilingual": multilingual,
639
+
640
+ "hindi_supported": multilingual,
641
+
642
+ "hinglish_supported": multilingual,
643
+
644
+ "notes": notes,
645
+
646
+ }
647
+
648
+
649
+
650
+ def safe_step_event(name: str, status: str, detail: Optional[str] = None) -> bytes:
651
+
652
+ payload: dict[str, Any] = {"type": "step", "step": {"name": name, "status": status}}
653
+
654
+ if detail:
655
+
656
+ payload["step"]["detail"] = detail
657
+
658
+ return (json.dumps(payload) + "\n").encode()
659
+
660
+
661
+ def count_phrase_hits(text: str, phrases: set[str]) -> int:
662
+
663
+ lowered = text.lower()
664
+
665
+ return sum(lowered.count(phrase) for phrase in phrases if phrase)
666
+
667
+
668
+ def estimate_filler_hits(tokens: list[str], text: str) -> int:
669
+
670
+ token_hits = sum(1 for t in tokens if t in FILLERS)
671
+
672
+ phrase_hits = count_phrase_hits(text, FILLER_PHRASES)
673
+
674
+ return token_hits + phrase_hits
675
+
676
+
677
+ def estimate_hedge_hits(tokens: list[str], text: str) -> int:
678
+
679
+ token_hits = sum(1 for t in tokens if t in HEDGE_WORDS)
680
+
681
+ phrase_hits = count_phrase_hits(text, HEDGE_PHRASES)
682
+
683
+ return token_hits + phrase_hits
684
+
685
+
686
+
687
+
688
+
689
+ def ensure_nonempty_text(req: AnalyzeRequest) -> str:
690
+
691
+ text = (req.input_value or req.transcript or "").strip()
692
+
693
+ words = tokenize_words(text)
694
+
695
+ if not text:
696
+
697
+ raise HTTPException(status_code=400, detail="No input text provided")
698
+
699
+ if len(words) < MIN_WORDS_REQUIRED:
700
+
701
+ raise HTTPException(
702
+
703
+ status_code=422,
704
+
705
+ detail=f"Need at least {MIN_WORDS_REQUIRED} words for reliable analysis. Received {len(words)} words.",
706
+
707
+ )
708
+
709
+ return text
710
+
711
+
712
+
713
+
714
+
715
+ def lexical_domain(
716
+
717
+ tokens: list[str],
718
+
719
+ text: str,
720
+
721
+ content: list[str],
722
+
723
+ language_profile: dict[str, Any],
724
+
725
+ ) -> tuple[DomainScore, dict[str, float]]:
726
+
727
+ total = max(len(tokens), 1)
728
+
729
+ unique = len(set(tokens))
730
+
731
+ filler_hits = estimate_filler_hits(tokens, text)
732
+
733
+
734
+
735
+ ttr = unique / total
736
+
737
+ density = len(content) / total
738
+
739
+ filler_rate = (filler_hits / total) * 100.0
740
+
741
+
742
+
743
+ ttr_target = pick_language_target(language_profile, english=0.52, hinglish=0.57, hindi=0.56, multilingual=0.55)
744
+
745
+ density_target = pick_language_target(language_profile, english=0.58, hinglish=0.63, hindi=0.61, multilingual=0.60)
746
+
747
+ filler_low = pick_language_target(language_profile, english=2.0, hinglish=3.5, hindi=3.5, multilingual=3.0)
748
+
749
+ filler_high = pick_language_target(language_profile, english=12.0, hinglish=20.0, hindi=17.0, multilingual=17.0)
750
+
751
+
752
+
753
+ s_ttr = clamp01(abs(ttr - ttr_target) / 0.30)
754
+
755
+ s_density = clamp01(abs(density - density_target) / 0.25)
756
+
757
+ s_filler = scale_linear(filler_rate, filler_low, filler_high)
758
+
759
+
760
+
761
+ overall = clamp01((0.4 * s_ttr) + (0.35 * s_density) + (0.25 * s_filler))
762
+
763
+
764
+
765
+ details = {
766
+
767
+ "ttr": round(s_ttr, 4),
768
+
769
+ "density": round(s_density, 4),
770
+
771
+ "filler_rate": round(s_filler, 4),
772
+
773
+ }
774
+
775
+ raw = {
776
+
777
+ "ttr": round(ttr, 4),
778
+
779
+ "lexical_density": round(density, 4),
780
+
781
+ "filler_rate_per_100w": round(filler_rate, 2),
782
+
783
+ }
784
+
785
+ return DomainScore(round(overall, 4), details), raw
786
+
787
+
788
+
789
+
790
+
791
+ def semantic_domain(sentences: list[str]) -> tuple[DomainScore, dict[str, float]]:
792
+
793
+ if len(sentences) < 2:
794
+
795
+ coherence = 0.16
796
+
797
+ idea_density = 0.45
798
+
799
+ tangentiality = 0.55
800
+
801
+ else:
802
+
803
+ sentence_content = [set(content_words(tokenize_words(s))) for s in sentences]
804
+
805
+ pairwise = [jaccard(sentence_content[i], sentence_content[i + 1]) for i in range(len(sentence_content) - 1)]
806
+
807
+ coherence = mean(pairwise, default=0.12)
808
+
809
+ avg_content_len = mean([len(x) for x in sentence_content], default=0.0)
810
+
811
+ idea_density = clamp01(avg_content_len / 14.0)
812
+
813
+ tangentiality = clamp01(1.0 - coherence)
814
+
815
+ s_coherence = scale_inverse(coherence, good=0.22, poor=0.05)
816
+
817
+ s_idea_density = scale_inverse(idea_density, good=0.65, poor=0.25)
818
+
819
+ s_tangentiality = scale_linear(tangentiality, low=0.35, high=0.85)
820
+
821
+
822
+
823
+ overall = clamp01((0.45 * s_coherence) + (0.30 * s_idea_density) + (0.25 * s_tangentiality))
824
+
825
+
826
+
827
+ details = {
828
+
829
+ "coherence": round(s_coherence, 4),
830
+
831
+ "idea_density": round(s_idea_density, 4),
832
+
833
+ "tangentiality": round(s_tangentiality, 4),
834
+
835
+ }
836
+
837
+ raw = {
838
+
839
+ "coherence_index": round(coherence, 4),
840
+
841
+ "idea_density_index": round(idea_density, 4),
842
+
843
+ "tangentiality_index": round(tangentiality, 4),
844
+
845
+ }
846
+
847
+ return DomainScore(round(overall, 4), details), raw
848
+
849
+
850
+
851
+
852
+
853
+ def prosody_domain(
854
+
855
+ tokens: list[str],
856
+
857
+ text: str,
858
+
859
+ pause_map: Optional[list[float]],
860
+
861
+ audio_duration: Optional[float],
862
+
863
+ language_profile: dict[str, Any],
864
+
865
+ ) -> tuple[DomainScore, dict[str, float], bool]:
866
+
867
+ word_count = max(len(tokens), 1)
868
+
869
+ pauses = [float(p) for p in (pause_map or []) if p >= 0]
870
+
871
+ has_audio_prosody = bool(pauses)
872
+
873
+
874
+
875
+ if audio_duration and audio_duration > 5.0:
876
+
877
+ duration_seconds = audio_duration
878
+
879
+ else:
880
+
881
+ estimated_speech_seconds = word_count / 2.5
882
+
883
+ duration_seconds = estimated_speech_seconds + sum(pauses)
884
+
885
+
886
+
887
+ duration_minutes = max(duration_seconds / 60.0, 0.1)
888
+
889
+ speech_rate = word_count / duration_minutes
890
+
891
+
892
+
893
+ if pauses:
894
+
895
+ pause_freq = len(pauses) / duration_minutes
896
+
897
+ pause_hesitation = sum(1 for p in pauses if p >= 0.8) / len(pauses)
898
+
899
+ lexical_hesitation = clamp01(estimate_filler_hits(tokens, text) / max(word_count, 1))
900
+
901
+ hesitation_ratio = clamp01((0.7 * pause_hesitation) + (0.3 * lexical_hesitation))
902
+
903
+ else:
904
+
905
+ punctuation_pauses = len(re.findall(r"[,;:\-]", text))
906
+
907
+ pause_freq = (punctuation_pauses / max(word_count, 1)) * 100
908
+
909
+ hesitation_ratio = clamp01(estimate_filler_hits(tokens, text) / max(word_count, 1))
910
+
911
+
912
+
913
+ speech_rate_target = pick_language_target(language_profile, english=140.0, hinglish=132.0, hindi=126.0, multilingual=133.0)
914
+
915
+ s_rate = clamp01(abs(speech_rate - speech_rate_target) / 95.0)
916
+
917
+ s_pause = scale_linear(pause_freq, low=8.0, high=30.0)
918
+
919
+ s_hes = scale_linear(hesitation_ratio, low=0.08, high=0.35)
920
+
921
+
922
+
923
+ overall = clamp01((0.4 * s_rate) + (0.35 * s_pause) + (0.25 * s_hes))
924
+
925
+
926
+
927
+ details = {
928
+
929
+ "speech_rate": round(s_rate, 4),
930
+
931
+ "pause_freq": round(s_pause, 4),
932
+
933
+ "hesitation": round(s_hes, 4),
934
+
935
+ }
936
+
937
+ raw = {
938
+
939
+ "speech_rate_wpm": round(speech_rate, 1),
940
+
941
+ "pause_frequency_per_min": round(pause_freq, 2),
942
+
943
+ "hesitation_ratio": round(hesitation_ratio, 4),
944
+
945
+ "duration_seconds": round(duration_seconds, 2),
946
+
947
+ }
948
+
949
+ return DomainScore(round(overall, 4), details), raw, has_audio_prosody
950
+
951
+
952
+
953
+ def syntax_domain(
954
+
955
+ tokens: list[str],
956
+
957
+ sentences: list[str],
958
+
959
+ text: str,
960
+
961
+ language_profile: dict[str, Any],
962
+
963
+ ) -> tuple[DomainScore, dict[str, float]]:
964
+
965
+ sentence_count = max(len(sentences), 1)
966
+
967
+ mlu = len(tokens) / sentence_count
968
+
969
+
970
+
971
+ per_sentence_depth = []
972
+
973
+ for s in sentences:
974
+
975
+ stoks = tokenize_words(s)
976
+
977
+ sub_count = sum(1 for t in stoks if t in SUBORDINATORS)
978
+
979
+ comma_count = s.count(",")
980
+
981
+ per_sentence_depth.append(sub_count + (comma_count * 0.5))
982
+
983
+ clause_depth = mean(per_sentence_depth, default=0.0)
984
+
985
+
986
+
987
+ passive_matches = re.findall(r"\b(?:is|are|was|were|be|been|being)\s+\w+(?:ed|en)\b", text.lower())
988
+
989
+ passive_ratio = len(passive_matches) / max(sentence_count, 1)
990
+
991
+
992
+
993
+ mlu_target = pick_language_target(language_profile, english=17.0, hinglish=15.0, hindi=14.5, multilingual=15.5)
994
+
995
+ depth_low = pick_language_target(language_profile, english=2.0, hinglish=1.5, hindi=1.4, multilingual=1.6)
996
+
997
+ depth_high = pick_language_target(language_profile, english=6.5, hinglish=5.7, hindi=5.3, multilingual=5.8)
998
+
999
+
1000
+
1001
+ s_mlu = clamp01(abs(mlu - mlu_target) / 12.0)
1002
+
1003
+ s_depth = scale_linear(clause_depth, low=depth_low, high=depth_high)
1004
+
1005
+ s_passive = scale_linear(passive_ratio, low=0.15, high=1.2)
1006
+
1007
+
1008
+
1009
+ passive_weight = pick_language_target(language_profile, english=0.20, hinglish=0.12, hindi=0.05, multilingual=0.10)
1010
+
1011
+ mlu_weight = 0.45 + ((0.20 - passive_weight) * 0.55)
1012
+
1013
+ depth_weight = 1.0 - mlu_weight - passive_weight
1014
+
1015
+
1016
+
1017
+ overall = clamp01((mlu_weight * s_mlu) + (depth_weight * s_depth) + (passive_weight * s_passive))
1018
+
1019
+
1020
+
1021
+ details = {
1022
+
1023
+ "mlu": round(s_mlu, 4),
1024
+
1025
+ "clause_depth": round(s_depth, 4),
1026
+
1027
+ "passive_ratio": round(s_passive, 4),
1028
+
1029
+ }
1030
+
1031
+ raw = {
1032
+
1033
+ "mean_length_utterance": round(mlu, 2),
1034
+
1035
+ "clause_depth_index": round(clause_depth, 2),
1036
+
1037
+ "passive_ratio": round(passive_ratio, 3),
1038
+
1039
+ }
1040
+
1041
+ return DomainScore(round(overall, 4), details), raw
1042
+
1043
+
1044
+
1045
+
1046
+
1047
+ def affective_domain(tokens: list[str], text: str) -> tuple[DomainScore, dict[str, float]]:
1048
+
1049
+ total = max(len(tokens), 1)
1050
+
1051
+ pos = sum(1 for t in tokens if t in POSITIVE_WORDS)
1052
+
1053
+ neg = sum(1 for t in tokens if t in NEGATIVE_WORDS)
1054
+
1055
+ arousal = sum(1 for t in tokens if t in AROUSAL_WORDS)
1056
+
1057
+ hedge = estimate_hedge_hits(tokens, text)
1058
+
1059
+
1060
+
1061
+ valence = (pos - neg) / (pos + neg + 1)
1062
+
1063
+ valence_01 = (valence + 1.0) / 2.0
1064
+
1065
+ arousal_rate = (arousal / total) * 100.0
1066
+
1067
+ certainty = 1.0 - clamp01(hedge / max(total * 0.15, 1.0))
1068
+
1069
+
1070
+
1071
+ s_valence = scale_inverse(valence_01, good=0.62, poor=0.20)
1072
+
1073
+ s_arousal = scale_linear(arousal_rate, low=3.0, high=14.0)
1074
+
1075
+ s_certainty = scale_inverse(certainty, good=0.72, poor=0.32)
1076
+
1077
+ overall = clamp01((0.4 * s_valence) + (0.35 * s_arousal) + (0.25 * s_certainty))
1078
+
1079
+
1080
+
1081
+ details = {
1082
+
1083
+ "valence": round(s_valence, 4),
1084
+
1085
+ "arousal": round(s_arousal, 4),
1086
+
1087
+ "certainty": round(s_certainty, 4),
1088
+
1089
+ }
1090
+
1091
+ raw = {
1092
+
1093
+ "valence_score": round(valence_01, 4),
1094
+
1095
+ "arousal_rate_per_100w": round(arousal_rate, 2),
1096
+
1097
+ "certainty_index": round(certainty, 4),
1098
+
1099
+ }
1100
+
1101
+ return DomainScore(round(overall, 4), details), raw
1102
+
1103
+
1104
+
1105
+
1106
+
1107
+ def compute_confidence(
+     word_count: int, sentence_count: int, has_audio_prosody: bool, repeat_ratio: float
+ ) -> tuple[float, list[str]]:
+     notes: list[str] = []
+     c_words = clamp01(word_count / 180.0)
+     c_sents = clamp01(sentence_count / 8.0)
+     c_repeat = clamp01(1.0 - (repeat_ratio * 1.4))
+     c_audio = 1.0 if has_audio_prosody else 0.55
+
+     confidence = clamp01((0.45 * c_words) + (0.2 * c_sents) + (0.2 * c_repeat) + (0.15 * c_audio))
+
+     if word_count < 60:
+         notes.append("Low sample length. Interpret results cautiously.")
+     if not has_audio_prosody:
+         notes.append("Prosody is inferred from text patterns because pause-map audio features were not provided.")
+     if repeat_ratio > 0.45:
+         notes.append("High repetition detected, which can reduce semantic reliability.")
+
+     return round(confidence, 4), notes
+
+
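+ # Orchestrates the full text analysis: runs the five domain scorers, combines them with fixed
+ # weights (lexical 0.22, semantic 0.23, prosody 0.18, syntax 0.22, affective 0.15, summing to 1.0),
+ # then scales the weighted load down by up to 25% when confidence is low.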
+ def compute_analysis_state(
+     text: str,
+     pause_map: Optional[list[float]],
+     audio_duration: Optional[float],
+     detected_language: Optional[str] = None,
+     hinted_profile: Optional[dict[str, Any]] = None,
+ ) -> AnalysisState:
+     tokens = tokenize_words(text)
+     sentences = split_sentences(text)
+     cwords = content_words(tokens)
+     language_profile = detect_language_profile(
+         text,
+         hinted_language=detected_language,
+         hinted_profile=hinted_profile,
+     )
+
+     repeat_ratio = 1.0 - (len(set(tokens)) / max(len(tokens), 1))
+
+     lexical, lexical_raw = lexical_domain(tokens, text, cwords, language_profile)
+     semantic, semantic_raw = semantic_domain(sentences)
+     prosody, prosody_raw, has_audio = prosody_domain(tokens, text, pause_map, audio_duration, language_profile)
+     syntax, syntax_raw = syntax_domain(tokens, sentences, text, language_profile)
+     affective, affective_raw = affective_domain(tokens, text)
+     confidence, quality_notes = compute_confidence(
+         word_count=len(tokens),
+         sentence_count=len(sentences),
+         has_audio_prosody=has_audio,
+         repeat_ratio=repeat_ratio,
+     )
+     quality_notes.append(
+         "Detected language mode: "
+         + str(language_profile.get("label", "multilingual")).title()
+         + f" (Hindi {round(float(language_profile.get('hindi_ratio', 0.0)) * 100)}%, "
+         + f"English {round(float(language_profile.get('english_ratio', 0.0)) * 100)}%)."
+     )
+
+     scores = {
+         "lexical": lexical,
+         "semantic": semantic,
+         "prosody": prosody,
+         "syntax": syntax,
+         "affective": affective,
+     }
+
+     weighted = (
+         (0.22 * lexical.overall)
+         + (0.23 * semantic.overall)
+         + (0.18 * prosody.overall)
+         + (0.22 * syntax.overall)
+         + (0.15 * affective.overall)
+     )
+
+     confidence_factor = 0.75 + (0.25 * confidence)
+     overall_load = clamp01(weighted * confidence_factor)
+
+     metrics = {
+         "word_count": len(tokens),
+         "sentence_count": len(sentences),
+         "repeat_ratio": round(repeat_ratio, 4),
+         "language_profile": language_profile,
+         "lexical": lexical_raw,
+         "semantic": semantic_raw,
+         "prosody": prosody_raw,
+         "syntax": syntax_raw,
+         "affective": affective_raw,
+     }
+
+     return AnalysisState(
+         scores=scores,
+         overall_load=round(overall_load, 4),
+         confidence=confidence,
+         quality_notes=quality_notes,
+         language_profile=language_profile,
+         metrics=metrics,
+     )
+
+
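+ # Threshold helpers: per-indicator severity uses 0.42 / 0.72 cut points, while the overall risk
+ # level uses 0.44 / 0.68 and is capped at "moderate" when confidence is below 0.45.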
+ def severity_from_score(value: float) -> str:
+     if value >= 0.72:
+         return "high"
+     if value >= 0.42:
+         return "moderate"
+     return "low"
+
+
+ def level_from_overall(overall_load: float, confidence: float) -> str:
+     if overall_load >= 0.68:
+         base = "high"
+     elif overall_load >= 0.44:
+         base = "moderate"
+     else:
+         base = "low"
+
+     if confidence < 0.45 and base == "high":
+         return "moderate"
+     return base
+
+
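+ # Deterministic summary used whenever the LLM path is unavailable or returns nothing.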
+ def summary_fallback(state: AnalysisState, risk_level: str) -> str:
+     top_domain = max(state.scores.items(), key=lambda kv: kv[1].overall)[0]
+     top_value = state.scores[top_domain].overall
+     confidence_pct = round(state.confidence * 100)
+     language_mode = str(state.language_profile.get("label", "multilingual"))
+     return (
+         f"This {language_mode} speech analysis found a {risk_level} overall cognitive load signal based on linguistic and timing features. "
+         f"The strongest deviation appeared in {top_domain} markers (score {top_value:.2f}). "
+         f"Confidence is {confidence_pct}% and this output is screening support only, not a diagnosis."
+     )
+
+
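+ # Maps the three highest-scoring domains onto their associated brain regions (DOMAIN_REGION)
+ # with a plain-language finding for the report view.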
+ def make_highlights(state: AnalysisState) -> list[dict[str, Any]]:
+     sorted_domains = sorted(state.scores.items(), key=lambda kv: kv[1].overall, reverse=True)
+     highlights: list[dict[str, Any]] = []
+     for domain, score in sorted_domains[:3]:
+         if score.overall >= 0.66:
+             finding = "Elevated deviation from expected baseline in this domain."
+         elif score.overall >= 0.42:
+             finding = "Mild-to-moderate deviation with mixed stability."
+         else:
+             finding = "Signals remain within expected variation for this domain."
+
+         highlights.append(
+             {
+                 "region": DOMAIN_REGION[domain],
+                 "activation": round(score.overall, 4),
+                 "finding": finding,
+                 "clinical_context": "Screening signal only. Interpret alongside clinical judgement and repeated assessments.",
+             }
+         )
+     return highlights
+
+
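+ # Flattens per-domain sub-scores into at most six risk indicators, keeping only values >= 0.42
+ # and sorting them by severity.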
+ def make_indicators(state: AnalysisState) -> list[dict[str, Any]]:
+     indicators: list[dict[str, Any]] = []
+     for domain, dscore in state.scores.items():
+         for k, v in dscore.details.items():
+             if v < 0.42:
+                 continue
+             indicators.append(
+                 {
+                     "indicator": f"{domain.title()} · {k.replace('_', ' ').title()}",
+                     "severity": severity_from_score(v),
+                     "explanation": f"Computed score {v:.2f} from measured input features; higher means greater deviation from baseline patterns.",
+                 }
+             )
+     indicators.sort(key=lambda x: {"high": 2, "moderate": 1, "low": 0}[x["severity"]], reverse=True)
+     return indicators[:6]
+
+
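+ # Non-diagnostic next-step guidance keyed to the risk level and, for low risk, to confidence.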
+ def recommendation_for_level(level: str, confidence: float) -> str:
+     if level == "high":
+         return (
+             "Repeat this assessment with a longer sample, then discuss the combined results with a qualified clinician. "
+             "Do not treat this result as a diagnosis."
+         )
+     if level == "moderate":
+         return (
+             "Collect 1-2 additional samples across different times of day to confirm trend stability before drawing conclusions."
+         )
+     if confidence < 0.5:
+         return "Provide a longer speech sample for stronger reliability before interpreting the result."
+     return "Current signals are relatively stable. Continue periodic monitoring rather than one-off interpretation."
+
+
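+ # Model discovery against the Groq /models endpoint, cached in _MODEL_CACHE for
+ # MODEL_DISCOVERY_TTL_SECONDS; on any error the last cached list is returned.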
+ async def fetch_available_models() -> list[str]:
+     if not GROQ_API_KEY:
+         return []
+
+     async with _MODEL_CACHE_LOCK:
+         now = time.time()
+         if now - float(_MODEL_CACHE["updated"]) < MODEL_DISCOVERY_TTL_SECONDS:
+             return list(_MODEL_CACHE["models"])
+
+         headers = {"Authorization": f"Bearer {GROQ_API_KEY}"}
+         try:
+             async with httpx.AsyncClient(timeout=GROQ_TIMEOUT_SECONDS) as client:
+                 res = await client.get(f"{GROQ_API_BASE}/models", headers=headers)
+                 res.raise_for_status()
+                 data = res.json().get("data", [])
+                 models = sorted({item.get("id", "") for item in data if item.get("id")})
+                 _MODEL_CACHE["updated"] = now
+                 _MODEL_CACHE["models"] = models
+                 return models
+         except Exception:
+             return list(_MODEL_CACHE["models"])
+
+
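+ # Model selection order: explicit override, then the preferred candidate list, then any
+ # instruct/versatile/gpt-oss model, then whatever appears first in the discovered list.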
+ def pick_model(available: list[str], override: str, candidates: list[str]) -> Optional[str]:
+     if override and override in available:
+         return override
+
+     for m in candidates:
+         if m in available:
+             return m
+     for m in available:
+         lowered = m.lower()
+         if "instruct" in lowered or "versatile" in lowered or "gpt-oss" in lowered:
+             return m
+
+     return available[0] if available else None
+
+
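+ # Thin wrapper around the OpenAI-compatible /chat/completions endpoint; returns None on any
+ # failure so callers can fall back to deterministic output.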
+ async def groq_chat(model: str, system: str, user: str, temperature: float = 0.2) -> Optional[str]:
+     if not GROQ_API_KEY or not model:
+         return None
+
+     headers = {
+         "Authorization": f"Bearer {GROQ_API_KEY}",
+         "Content-Type": "application/json",
+     }
+     payload = {
+         "model": model,
+         "temperature": temperature,
+         "messages": [
+             {"role": "system", "content": system},
+             {"role": "user", "content": user},
+         ],
+     }
+
+     try:
+         async with httpx.AsyncClient(timeout=GROQ_TIMEOUT_SECONDS) as client:
+             res = await client.post(f"{GROQ_API_BASE}/chat/completions", headers=headers, json=payload)
+             res.raise_for_status()
+             data = res.json()
+             return data["choices"][0]["message"]["content"].strip()
+     except Exception:
+         return None
+
+
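+ # Two-stage summary: a reasoning model drafts 2-3 sentences from the computed features, then a
+ # safety model rewrites the draft to stay non-alarmist; summary_fallback() is returned if no
+ # reasoning model is available or the draft comes back empty.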
+ async def compose_safe_summary(state: AnalysisState, risk_level: str) -> tuple[str, dict[str, Optional[str]]]:
+     available = await fetch_available_models()
+     reasoning_model = pick_model(available, OVERRIDE_REASONING_MODEL, PREFERRED_REASONING_MODELS)
+     safety_model = pick_model(available, OVERRIDE_SAFETY_MODEL, PREFERRED_SAFETY_MODELS)
+
+     model_meta = {
+         "reasoning_model": reasoning_model,
+         "safety_model": safety_model,
+     }
+
+     baseline_summary = summary_fallback(state, risk_level)
+     if not reasoning_model:
+         return baseline_summary, model_meta
+
+     features_for_prompt = {
+         "risk_level": risk_level,
+         "overall_cognitive_load": state.overall_load,
+         "confidence": state.confidence,
+         "language_profile": state.language_profile,
+         "scores": {k: v.overall for k, v in state.scores.items()},
+         "quality_notes": state.quality_notes,
+         "metrics": state.metrics,
+     }
+     system = (
+         "You summarize computational language-screening outputs. "
+         "English, Hindi, and code-mixed Hinglish samples are all valid and should be interpreted fairly. "
+         "Never diagnose disease, never use alarming wording, and always state uncertainty when confidence is limited. "
+         "Output exactly 2-3 sentences in plain text."
+     )
+     user = "Write a careful summary for this analysis:\n" + json.dumps(features_for_prompt)
+
+     summary = await groq_chat(reasoning_model, system, user, temperature=0.15)
+     if not summary:
+         return baseline_summary, model_meta
+
+     if safety_model:
+         safety_system = (
+             "You are a safety editor for health-adjacent UX. "
+             "Rewrite text to avoid panic, avoid diagnosis claims, and keep uncertainty explicit. "
+             "Keep 2-3 sentences."
+         )
+         safety_user = (
+             "Rewrite this summary to be non-alarmist and clinically careful while keeping factual content:\n"
+             + summary
+             + "\n\nConfidence: "
+             + str(state.confidence)
+         )
+         safe = await groq_chat(safety_model, safety_system, safety_user, temperature=0.1)
+         if safe:
+             summary = safe
+
+     return summary, model_meta
+
+
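+ # Health probe: reports whether a Groq key is configured and how many models were discovered,
+ # without exposing the key itself.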
+ @app.get("/health")
1714
+
1715
+ async def health() -> dict[str, Any]:
1716
+
1717
+ available = await fetch_available_models()
1718
+
1719
+ transcribe_caps = transcription_model_capabilities(GROQ_TRANSCRIBE_MODEL)
1720
+
1721
+ return {
1722
+
1723
+ "ok": True,
1724
+
1725
+ "service": "cortexflow-backend",
1726
+
1727
+ "groq_configured": bool(GROQ_API_KEY),
1728
+
1729
+ "model_count": len(available),
1730
+
1731
+ "transcription_model": GROQ_TRANSCRIBE_MODEL,
1732
+
1733
+ "transcription_capabilities": transcribe_caps,
1734
+
1735
+ }
1736
+
1737
+
1738
+
1739
+
1740
+
1741
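+ # Exposes the discovered model list plus the reasoning/safety/transcription picks the backend
+ # would currently use, along with static notes on intended primary and fallback models.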
+ @app.get("/models/recommended")
1742
+
1743
+ async def models_recommended() -> dict[str, Any]:
1744
+
1745
+ available = await fetch_available_models()
1746
+
1747
+ transcribe_caps = transcription_model_capabilities(GROQ_TRANSCRIBE_MODEL)
1748
+
1749
+ return {
1750
+
1751
+ "available_models": available,
1752
+
1753
+ "recommended": {
1754
+
1755
+ "reasoning": pick_model(available, OVERRIDE_REASONING_MODEL, PREFERRED_REASONING_MODELS),
1756
+
1757
+ "safety": pick_model(available, OVERRIDE_SAFETY_MODEL, PREFERRED_SAFETY_MODELS),
1758
+
1759
+ "transcription": GROQ_TRANSCRIBE_MODEL,
1760
+
1761
+ },
1762
+
1763
+ "transcription_capabilities": transcribe_caps,
1764
+
1765
+ "notes": {
1766
+
1767
+ "production_primary": "openai/gpt-oss-120b",
1768
+
1769
+ "production_fallback": "llama-3.3-70b-versatile",
1770
+
1771
+ "fast_fallback": "openai/gpt-oss-20b",
1772
+
1773
+ "transcription_accuracy_primary": "whisper-large-v3",
1774
+
1775
+ "transcription_speed_price_primary": "whisper-large-v3-turbo",
1776
+
1777
+ "transcription_language_note": "Both Whisper models are multilingual and suitable for Hindi/Hinglish speech.",
1778
+
1779
+ },
1780
+
1781
+ }
1782
+
1783
+
1784
+
1785
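+ # Streaming analysis endpoint: emits per-stage progress events (via safe_step_event) for each
+ # STEP_NAMES entry, then a final newline-delimited JSON payload with the scores, report, and
+ # session_id; errors raised during analysis are streamed as {"type": "error"} events.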
+ @app.post("/analyze")
1786
+
1787
+ async def analyze(req: AnalyzeRequest):
1788
+
1789
+ text = ensure_nonempty_text(req)
1790
+
1791
+ session_id = req.session_id or str(uuid.uuid4())
1792
+
1793
+
1794
+
1795
+ async def generate():
1796
+
1797
+ for idx, step_name in enumerate(STEP_NAMES):
1798
+
1799
+ yield safe_step_event(step_name, "running" if idx == 0 else "pending")
1800
+
1801
+
1802
+
1803
+ try:
1804
+
1805
+ state = compute_analysis_state(
1806
+
1807
+ text,
1808
+
1809
+ req.pause_map,
1810
+
1811
+ req.audio_duration,
1812
+
1813
+ detected_language=req.detected_language,
1814
+
1815
+ hinted_profile=req.language_profile,
1816
+
1817
+ )
1818
+
1819
+ yield safe_step_event("STT preprocessor", "done", "Input normalized and validated")
1820
+
1821
+ yield safe_step_event("Lexical agent", "running")
1822
+
1823
+
1824
+
1825
+ await asyncio.sleep(0)
1826
+
1827
+ yield safe_step_event("Lexical agent", "done")
1828
+
1829
+ yield safe_step_event("Semantic agent", "running")
1830
+
1831
+
1832
+
1833
+ await asyncio.sleep(0)
1834
+
1835
+ yield safe_step_event("Semantic agent", "done")
1836
+
1837
+ yield safe_step_event("Prosody agent", "running")
1838
+
1839
+
1840
+
1841
+ await asyncio.sleep(0)
1842
+
1843
+ yield safe_step_event("Prosody agent", "done")
1844
+
1845
+ yield safe_step_event("Syntax agent", "running")
1846
+
1847
+
1848
+
1849
+ await asyncio.sleep(0)
1850
+
1851
+ yield safe_step_event("Syntax agent", "done")
1852
+
1853
+ yield safe_step_event("Biomarker mapper", "running")
1854
+
1855
+
1856
+
1857
+ scores_payload = {
1858
+
1859
+ domain: {**score.details, "overall": score.overall}
1860
+
1861
+ for domain, score in state.scores.items()
1862
+
1863
+ }
1864
+
1865
+
1866
+
1867
+ yield safe_step_event("Biomarker mapper", "done")
1868
+
1869
+ yield safe_step_event("Report composer", "running")
1870
+
1871
+
1872
+
1873
+ risk_level = level_from_overall(state.overall_load, state.confidence)
1874
+
1875
+ summary, model_meta = await compose_safe_summary(state, risk_level)
1876
+
1877
+
1878
+
1879
+ report = {
1880
+
1881
+ "summary": summary,
1882
+
1883
+ "risk_level": risk_level,
1884
+
1885
+ "overall_cognitive_load": state.overall_load,
1886
+
1887
+ "highlights": make_highlights(state),
1888
+
1889
+ "risk_indicators": make_indicators(state),
1890
+
1891
+ "recommendation": recommendation_for_level(risk_level, state.confidence),
1892
+
1893
+ "disclaimer": (
1894
+
1895
+ "This tool is a non-diagnostic screening aid. It can be wrong and must not be used as a standalone "
1896
+
1897
+ "medical decision system. If you are concerned, consult a qualified clinician."
1898
+
1899
+ ),
1900
+
1901
+ "quality": {
1902
+
1903
+ "confidence": state.confidence,
1904
+
1905
+ "notes": state.quality_notes,
1906
+
1907
+ },
1908
+
1909
+ "language_profile": state.language_profile,
1910
+
1911
+ "model_info": model_meta,
1912
+
1913
+ }
1914
+
1915
+ yield safe_step_event("Report composer", "done")
1916
+
1917
+
1918
+
1919
+ payload = {
1920
+
1921
+ "type": "end",
1922
+
1923
+ "message": summary,
1924
+
1925
+ "scores": scores_payload,
1926
+
1927
+ "report": report,
1928
+
1929
+ "session_id": session_id,
1930
+
1931
+ }
1932
+
1933
+ yield (json.dumps(payload) + "\n").encode()
1934
+
1935
+
1936
+
1937
+ except HTTPException as exc:
1938
+
1939
+ yield (json.dumps({"type": "error", "message": exc.detail}) + "\n").encode()
1940
+
1941
+ except Exception as exc:
1942
+
1943
+ yield (json.dumps({"type": "error", "message": f"Analysis failed: {str(exc)}"}) + "\n").encode()
1944
+
1945
+
1946
+
1947
+ return StreamingResponse(
1948
+
1949
+ generate(),
1950
+
1951
+ media_type="text/plain",
1952
+
1953
+ headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
1954
+
1955
+ )
1956
+