Spaces:
Sleeping
Sleeping
Upload main.py
Browse files
main.py
CHANGED
|
@@ -14,6 +14,7 @@ import tempfile
|
|
| 14 |
import os
|
| 15 |
import uuid
|
| 16 |
from contextlib import asynccontextmanager
|
|
|
|
| 17 |
|
| 18 |
from faster_whisper import WhisperModel
|
| 19 |
from zeroconf import ServiceInfo
|
|
@@ -27,6 +28,11 @@ SERVICE_PORT = 8000
|
|
| 27 |
# Cloud deployment detection (Hugging Face Spaces, Railway, etc.)
|
| 28 |
IS_CLOUD = os.environ.get("SPACE_ID") is not None or os.environ.get("RAILWAY_ENVIRONMENT") is not None
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
# ──────────────────────────────────────────────────────────────
|
| 32 |
# Filipino / Taglish vocabulary hint for Whisper initial_prompt.
|
|
@@ -42,7 +48,9 @@ FILIPINO_VOCAB_PROMPT = (
|
|
| 42 |
"maganda, mabuti, masaya, malaki, maliit, "
|
| 43 |
"kumain, uminom, pumunta, naglaro, natulog, "
|
| 44 |
"paaralan, bahay, trabaho, kaibigan, pamilya, "
|
| 45 |
-
"salamat, magandang, umaga, hapon, gabi"
|
|
|
|
|
|
|
| 46 |
)
|
| 47 |
|
| 48 |
# Known Whisper misrecognitions for Filipino — extend as needed.
|
|
@@ -53,6 +61,10 @@ WHISPER_CORRECTIONS: dict[str, str] = {
|
|
| 53 |
"cami": "kami",
|
| 54 |
"cum": "kum",
|
| 55 |
"naman naman": "naman",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
}
|
| 57 |
|
| 58 |
|
|
@@ -128,43 +140,49 @@ async def lifespan(app: FastAPI):
|
|
| 128 |
global async_zeroconf, service_info, model, roberta_model, roberta_tokenizer
|
| 129 |
|
| 130 |
# 1. Load Whisper
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
# 2. Load RoBERTa (Tagalog)
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
if torch.cuda.is_available():
|
| 160 |
-
roberta_model.to("cuda")
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
|
| 170 |
# Startup: Register mDNS service (skip on cloud deployments)
|
|
@@ -413,7 +431,7 @@ def analyze_prosody(segments: list, duration_seconds: float) -> ProsodyInfo:
|
|
| 413 |
|
| 414 |
|
| 415 |
|
| 416 |
-
def
|
| 417 |
"""
|
| 418 |
Calculate a fluency score (1-10) using RoBERTa perplexity (PPL).
|
| 419 |
Lower PPL = More natural/fluent.
|
|
@@ -451,6 +469,26 @@ def calculate_fluency(text: str) -> float:
|
|
| 451 |
print(f"β οΈ RoBERTa analysis failed: {e}")
|
| 452 |
return check_coherence_heuristic(text)
|
| 453 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
|
| 455 |
def check_coherence_heuristic(text: str) -> float:
|
| 456 |
"""Heuristic check for coherence (Fallback)."""
|
|
@@ -508,6 +546,20 @@ def generate_feedback(pace: PaceInfo, fillers: FillerInfo, prosody: ProsodyInfo,
|
|
| 508 |
|
| 509 |
from fastapi import Form, UploadFile, File
|
| 510 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
@app.post("/sessions/{session_id}/transcribe", response_model=QuickTranscriptResponse)
|
| 512 |
async def quick_transcribe(
|
| 513 |
session_id: str,
|
|
@@ -676,7 +728,7 @@ async def upload_audio_chunk(session_id: str, file: UploadFile = File(...)):
|
|
| 676 |
pace = calculate_pace(transcript, safe_duration)
|
| 677 |
prosody = analyze_prosody(segments, safe_duration)
|
| 678 |
# Use RoBERTa for advanced fluency scoring (or fallback to heuristic)
|
| 679 |
-
coherence =
|
| 680 |
|
| 681 |
feedback = generate_feedback(pace, fillers, prosody, coherence)
|
| 682 |
|
|
@@ -691,3 +743,10 @@ async def upload_audio_chunk(session_id: str, file: UploadFile = File(...)):
|
|
| 691 |
feedback=feedback,
|
| 692 |
message=message,
|
| 693 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
import os
|
| 15 |
import uuid
|
| 16 |
from contextlib import asynccontextmanager
|
| 17 |
+
import httpx
|
| 18 |
|
| 19 |
from faster_whisper import WhisperModel
|
| 20 |
from zeroconf import ServiceInfo
|
|
|
|
| 28 |
# Cloud deployment detection (Hugging Face Spaces, Railway, etc.)
|
| 29 |
IS_CLOUD = os.environ.get("SPACE_ID") is not None or os.environ.get("RAILWAY_ENVIRONMENT") is not None
|
| 30 |
|
| 31 |
+
# Service Mode Configuration (Split Architecture)
|
| 32 |
+
SERVICE_MODE = os.environ.get("SERVICE_MODE", "audio").lower() # 'audio' or 'nlp'
|
| 33 |
+
NLP_API_URL = os.environ.get("NLP_API_URL", "").rstrip("/")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
|
| 37 |
# ──────────────────────────────────────────────────────────────
|
| 38 |
# Filipino / Taglish vocabulary hint for Whisper initial_prompt.
|
|
|
|
| 48 |
"maganda, mabuti, masaya, malaki, maliit, "
|
| 49 |
"kumain, uminom, pumunta, naglaro, natulog, "
|
| 50 |
"paaralan, bahay, trabaho, kaibigan, pamilya, "
|
| 51 |
+
"salamat, magandang, umaga, hapon, gabi, "
|
| 52 |
+
# Common English loanwords/test phrases
|
| 53 |
+
"hello, hi, mic, test, testing, okay, yes, no"
|
| 54 |
)
|
| 55 |
|
| 56 |
# Known Whisper misrecognitions for Filipino — extend as needed.
|
|
|
|
| 61 |
"cami": "kami",
|
| 62 |
"cum": "kum",
|
| 63 |
"naman naman": "naman",
|
| 64 |
+
# English loanword corrections
|
| 65 |
+
"helo": "hello",
|
| 66 |
+
"mike": "mic",
|
| 67 |
+
"test": "test", # to ensure it's not accidentally stripped
|
| 68 |
}
|
| 69 |
|
| 70 |
|
|
|
|
| 140 |
global async_zeroconf, service_info, model, roberta_model, roberta_tokenizer
|
| 141 |
|
| 142 |
# 1. Load Whisper
|
| 143 |
+
if SERVICE_MODE == "audio":
|
| 144 |
+
print("β³ Loading Whisper model...")
|
| 145 |
+
try:
|
| 146 |
+
print(f"π§ CUDA Available: {torch.cuda.is_available()}")
|
| 147 |
+
if torch.cuda.is_available():
|
| 148 |
+
print(f"π§ GPU Device: {torch.cuda.get_device_name(0)}")
|
| 149 |
+
model = WhisperModel(
|
| 150 |
+
"small", # 3x more accurate than 'base'
|
| 151 |
+
device="cuda",
|
| 152 |
+
compute_type="float16"
|
| 153 |
+
)
|
| 154 |
+
else:
|
| 155 |
+
# CPU / free HF Space — medium+int8 fits in ~1.5 GB RAM
|
| 156 |
+
print("π§ Using CPU mode (medium + int8)")
|
| 157 |
+
model = WhisperModel("medium", device="cpu", compute_type="int8")
|
| 158 |
+
print("β
Whisper 'medium' model loaded successfully")
|
| 159 |
+
except Exception as e:
|
| 160 |
+
print(f"β Failed to load Whisper model: {e}")
|
| 161 |
+
print("β οΈ Falling back to base/int8...")
|
| 162 |
+
model = WhisperModel("base", device="cpu", compute_type="int8")
|
| 163 |
+
else:
|
| 164 |
+
print("βοΈ Audio Service Mode not active, skipping Whisper.")
|
| 165 |
|
| 166 |
# 2. Load RoBERTa (Tagalog)
|
| 167 |
+
if SERVICE_MODE == "nlp":
|
| 168 |
+
print("β³ Loading RoBERTa (Tagalog) model...")
|
| 169 |
+
try:
|
| 170 |
+
# Use jcblaise/roberta-tagalog-base for fluency/coherence
|
| 171 |
+
model_name = "jcblaise/roberta-tagalog-base"
|
| 172 |
+
roberta_tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 173 |
+
roberta_model = AutoModelForMaskedLM.from_pretrained(model_name)
|
|
|
|
|
|
|
| 174 |
|
| 175 |
+
if torch.cuda.is_available():
|
| 176 |
+
roberta_model.to("cuda")
|
| 177 |
+
|
| 178 |
+
roberta_model.eval() # Set to evaluation mode
|
| 179 |
+
print("β
RoBERTa model loaded successfully")
|
| 180 |
+
except Exception as e:
|
| 181 |
+
print(f"β Failed to load RoBERTa model: {e}")
|
| 182 |
+
roberta_model = None
|
| 183 |
+
roberta_tokenizer = None
|
| 184 |
+
else:
|
| 185 |
+
print("βοΈ NLP Service Mode not active, skipping RoBERTa.")
|
| 186 |
|
| 187 |
|
| 188 |
# Startup: Register mDNS service (skip on cloud deployments)
|
|
|
|
| 431 |
|
| 432 |
|
| 433 |
|
| 434 |
+
def calculate_fluency_local(text: str) -> float:
|
| 435 |
"""
|
| 436 |
Calculate a fluency score (1-10) using RoBERTa perplexity (PPL).
|
| 437 |
Lower PPL = More natural/fluent.
|
|
|
|
| 469 |
print(f"β οΈ RoBERTa analysis failed: {e}")
|
| 470 |
return check_coherence_heuristic(text)
|
| 471 |
|
| 472 |
+
async def get_fluency_score(text: str) -> float:
    """Return a fluency/coherence score for ``text``.

    Resolution order:

    1. ``SERVICE_MODE == "nlp"``: score in-process with
       ``calculate_fluency_local``.
    2. Otherwise, when ``NLP_API_URL`` is non-empty: POST ``{"text": text}``
       to ``{NLP_API_URL}/fluency`` and use the ``coherence_score`` field of
       a 200 response (5.0 when the field is absent).
    3. In every other case (no URL configured, non-200 status, request
       failure, malformed or non-numeric payload): fall back to
       ``check_coherence_heuristic``.

    Args:
        text: Transcript to score.

    Returns:
        The score as a ``float``.
    """
    if SERVICE_MODE == "nlp":
        return calculate_fluency_local(text)

    if NLP_API_URL:
        # Call the NLP microservice. This is deliberately best-effort: any
        # failure is logged and the heuristic below is used instead.
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                res = await client.post(f"{NLP_API_URL}/fluency", json={"text": text})
                if res.status_code == 200:
                    # Coerce inside the try so a null / non-numeric
                    # "coherence_score" (TypeError / ValueError) falls back
                    # to the heuristic instead of leaking a non-float value
                    # to the caller.
                    return float(res.json().get("coherence_score", 5.0))
                print(f"β οΈ External NLP API returned {res.status_code}, falling back to heuristic.")
        except Exception as e:
            print(f"β οΈ Failed to connect to NLP API at {NLP_API_URL}: {e}")

    # Fallback heuristic if no external API is configured or it was unusable.
    return check_coherence_heuristic(text)
|
| 491 |
+
|
| 492 |
|
| 493 |
def check_coherence_heuristic(text: str) -> float:
|
| 494 |
"""Heuristic check for coherence (Fallback)."""
|
|
|
|
| 546 |
|
| 547 |
from fastapi import Form, UploadFile, File
|
| 548 |
|
| 549 |
+
class FluencyRequest(BaseModel):
    # Request body for POST /fluency: the text to be scored.
    text: str
|
| 551 |
+
|
| 552 |
+
class FluencyResponse(BaseModel):
    # Response body for POST /fluency: the computed score.
    coherence_score: float
|
| 554 |
+
|
| 555 |
+
@app.post("/fluency", response_model=FluencyResponse)
async def analyze_fluency(req: FluencyRequest):
    """Score ``req.text`` and return it as a ``FluencyResponse``.

    External endpoint the Audio service calls to request fluency scoring.
    Intended for NLP mode only; this handler does not check ``SERVICE_MODE``
    itself and simply delegates to ``calculate_fluency_local``.
    """
    return FluencyResponse(coherence_score=calculate_fluency_local(req.text))
|
| 560 |
+
|
| 561 |
+
|
| 562 |
+
|
| 563 |
@app.post("/sessions/{session_id}/transcribe", response_model=QuickTranscriptResponse)
|
| 564 |
async def quick_transcribe(
|
| 565 |
session_id: str,
|
|
|
|
| 728 |
pace = calculate_pace(transcript, safe_duration)
|
| 729 |
prosody = analyze_prosody(segments, safe_duration)
|
| 730 |
# Use RoBERTa for advanced fluency scoring (or fallback to heuristic)
|
| 731 |
+
coherence = await get_fluency_score(transcript)
|
| 732 |
|
| 733 |
feedback = generate_feedback(pace, fillers, prosody, coherence)
|
| 734 |
|
|
|
|
| 743 |
feedback=feedback,
|
| 744 |
message=message,
|
| 745 |
)
|
| 746 |
+
|
| 747 |
+
if __name__ == "__main__":
    import uvicorn

    # Run the FastAPI app via uvicorn directly from Python.
    # The ``app`` object is passed instead of the string "main:app" because
    # dynamic string imports often fail inside PyInstaller EXEs.
    # The port comes from the module-level SERVICE_PORT constant (8000)
    # rather than a repeated literal, so the two cannot drift apart.
    uvicorn.run(app, host="0.0.0.0", port=SERVICE_PORT)
|