Jay162005 commited on
Commit
eb0c8ae
·
verified ·
1 Parent(s): 03e3fc5

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +32 -0
  2. README.md +24 -5
  3. main.py +607 -0
  4. requirements.txt +11 -0
Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Hugging Face Spaces Dockerfile for SalitaKo Backend
FROM python:3.11-slim

# Install system dependencies for audio processing
# (ffmpeg decodes the browser's WEBM/Opus uploads for Whisper)
RUN apt-get update && apt-get install -y \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user (required by HF Spaces)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Copy requirements first for caching
# NOTE(review): this copies requirements-hf.txt from the build context, but
# this commit only adds requirements.txt — confirm requirements-hf.txt exists
# in the Space repo, otherwise the build fails at this step.
COPY --chown=user requirements-hf.txt requirements.txt

# Install Python dependencies (CPU-only torch for free tier)
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY --chown=user . .

# Expose port 7860 (Hugging Face Spaces default)
EXPOSE 7860

# Run the FastAPI app
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,31 @@
1
  ---
2
- title: Salitako2.0
3
- emoji: 🦀
4
  colorFrom: blue
5
- colorTo: red
6
  sdk: docker
 
7
  pinned: false
8
  license: mit
9
- short_description: a ai transcription
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: SalitaKo Speech Coach API
3
+ emoji: 🎤
4
  colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
  license: mit
 
10
  ---
11
 
12
+ # SalitaKo Speech Coach API
13
+
14
+ Filipino/Tagalog speech coaching backend powered by:
15
+ - **Whisper** (faster-whisper) - Speech-to-text
16
+ - **RoBERTa** (jcblaise/roberta-tagalog-base) - Fluency scoring
17
+
18
+ ## API Endpoints
19
+
20
+ - `GET /` - Welcome message
21
+ - `GET /health` - Health check
22
+ - `GET /docs` - Swagger UI documentation
23
+ - `POST /sessions` - Create a new session
24
+ - `POST /sessions/{id}/transcribe` - Quick transcription
25
+ - `POST /sessions/{id}/audio-chunk` - Full analysis with feedback
26
+
27
+ ## Usage
28
+
29
+ ```bash
30
+ curl https://YOUR-SPACE.hf.space/health
31
+ ```
main.py ADDED
@@ -0,0 +1,607 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import socket
3
+ import sqlite3
4
+ import datetime
5
+ import numpy as np
6
+ from fastapi import FastAPI, UploadFile, File
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from pydantic import BaseModel
9
+ import asyncio
10
+ import tempfile
11
+ import os
12
+ import uuid
13
+ from contextlib import asynccontextmanager
14
+
15
+ from faster_whisper import WhisperModel
16
+ from zeroconf import ServiceInfo
17
+ from zeroconf.asyncio import AsyncZeroconf
18
+
19
# mDNS Service Configuration (LAN discovery of this backend)
SERVICE_TYPE = "_salitako._tcp.local."
SERVICE_NAME = "SalitaKo Server._salitako._tcp.local."
SERVICE_PORT = 8000  # port advertised over mDNS for local (non-cloud) runs

# Cloud deployment detection (Hugging Face Spaces, Railway, etc.)
# SPACE_ID is set by HF Spaces; RAILWAY_ENVIRONMENT by Railway. When either
# is present, lifespan() skips mDNS registration.
IS_CLOUD = os.environ.get("SPACE_ID") is not None or os.environ.get("RAILWAY_ENVIRONMENT") is not None
26
+
27
+
28
def get_local_ip():
    """Return the LAN IPv4 address of this machine.

    "Connects" a UDP socket to a public address (no packet is actually
    sent for a UDP connect) and reads back the source address the OS
    chose for that route.

    Returns:
        str: Dotted-quad IPv4 address, or "127.0.0.1" when the lookup
        fails (e.g. no network interface is up).
    """
    try:
        # Context manager guarantees the socket is closed even when
        # connect()/getsockname() raises (the original leaked it on error).
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect(("8.8.8.8", 80))
            return s.getsockname()[0]
    except Exception:
        return "127.0.0.1"
38
+
39
+
40
# Global async zeroconf instance
async_zeroconf = None  # AsyncZeroconf handle; stays None when mDNS is skipped/failed
service_info = None  # Registered ServiceInfo; set during startup in lifespan()


from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Global model instances (populated once by lifespan() at startup)
model = None  # Whisper (faster-whisper WhisperModel)
roberta_model = None  # Tagalog RoBERTa masked-LM for fluency scoring; may stay None
roberta_tokenizer = None  # tokenizer paired with roberta_model; may stay None
52
+
53
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage mDNS service registration and Model loading on startup/shutdown.

    Startup order: (1) load Whisper (GPU float16 when CUDA is available,
    otherwise CPU int8), (2) load the Tagalog RoBERTa scorer, (3) register
    the mDNS service unless running on a detected cloud platform.
    Shutdown unregisters the mDNS service. All failures are logged and
    degraded gracefully rather than raised.
    """
    global async_zeroconf, service_info, model, roberta_model, roberta_tokenizer

    # 1. Load Whisper
    print("⏳ Loading Whisper model...")
    try:
        print(f"🔧 CUDA Available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"🔧 GPU Device: {torch.cuda.get_device_name(0)}")
            model = WhisperModel(
                "base",  # Fast loading
                device="cuda",  # Use NVIDIA GPU
                compute_type="float16"
            )
        else:
            # CPU fallback (for cloud free tiers)
            print("🔧 Using CPU mode")
            model = WhisperModel("base", device="cpu", compute_type="int8")
        print("✅ Whisper model loaded successfully")
    except Exception as e:
        print(f"❌ Failed to load Whisper model: {e}")
        print("⚠️ Falling back to CPU/int8...")
        # NOTE(review): the exception fallback loads "small" while the normal
        # paths load "base" — confirm whether that size difference is intended.
        model = WhisperModel("small", device="cpu", compute_type="int8")

    # 2. Load RoBERTa (Tagalog)
    print("⏳ Loading RoBERTa (Tagalog) model...")
    try:
        # Use jcblaise/roberta-tagalog-base for fluency/coherence
        model_name = "jcblaise/roberta-tagalog-base"
        roberta_tokenizer = AutoTokenizer.from_pretrained(model_name)
        roberta_model = AutoModelForMaskedLM.from_pretrained(model_name)

        if torch.cuda.is_available():
            roberta_model.to("cuda")

        roberta_model.eval()  # Set to evaluation mode (disables dropout)
        print("✅ RoBERTa model loaded successfully")
    except Exception as e:
        print(f"❌ Failed to load RoBERTa model: {e}")
        # Leave both None so calculate_fluency() uses the heuristic fallback.
        roberta_model = None
        roberta_tokenizer = None

    # Startup: Register mDNS service (skip on cloud deployments)
    if IS_CLOUD:
        print("☁️ Cloud deployment detected - skipping mDNS registration")
    else:
        local_ip = get_local_ip()
        print(f"🌐 Local IP: {local_ip}")

        try:
            async_zeroconf = AsyncZeroconf()
            service_info = ServiceInfo(
                SERVICE_TYPE,
                SERVICE_NAME,
                addresses=[socket.inet_aton(local_ip)],
                port=SERVICE_PORT,
                properties={
                    "version": "0.2.0",
                    "api": "/docs",
                    "name": "SalitaKo Speech Coach"
                },
                # f-string has no placeholder; a plain string would do.
                server=f"salitako.local.",
            )

            await async_zeroconf.async_register_service(service_info)
            print(f"📡 mDNS service registered: {SERVICE_NAME} at {local_ip}:{SERVICE_PORT}")
        except Exception as e:
            # Non-fatal: the API still serves over plain HTTP without discovery.
            print(f"⚠️ mDNS registration failed (non-fatal): {e}")
            async_zeroconf = None

    yield

    # Shutdown: Unregister mDNS service
    if async_zeroconf and service_info:
        print("📡 Unregistering mDNS service...")
        try:
            await async_zeroconf.async_unregister_service(service_info)
            await async_zeroconf.async_close()
        except Exception as e:
            print(f"⚠️ mDNS unregister failed: {e}")
136
+
137
+
138
# FastAPI application; lifespan() loads the models and handles mDNS.
app = FastAPI(title="SalitaKo API", version="0.2.0", lifespan=lifespan)
139
+
140
+
141
@app.get("/")
async def read_root():
    """Landing endpoint pointing clients at the docs and health URLs.

    NOTE(review): the URLs hardcode port 8000, which matches local runs
    but not the cloud port 7860 — confirm which consumers rely on these.
    """
    ip = get_local_ip()
    base = f"http://{ip}:8000"
    return {
        "message": "Welcome to SalitaKo API",
        "docs_url": f"{base}/docs",
        "health_check": f"{base}/health",
        "local_ip": ip,
    }
150
+
151
+
152
# CORS: the "*" entry makes the two explicit origins redundant — every
# origin is allowed. NOTE(review): combined with allow_credentials=True,
# wildcard origins are typically not honored for credentialed requests by
# browsers/Starlette — restrict this list before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "http://localhost:3000",
        "https://*.hf.space",  # Hugging Face Spaces
        "*"  # Allow all for development (restrict in production)
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
163
+
164
+
165
+
166
class SessionResult(BaseModel):
    """Payload for POST /log-session: one finished session's metrics."""
    student_name: str
    wpm: float  # words per minute
    fluency_score: float
    filler_count: int
    duration_seconds: int
172
+
173
+
174
@app.post("/log-session")
async def log_session_result(data: SessionResult):
    """Log session results to a local SQLite database for research analysis.

    Creates the ``results`` table on first use and appends one row per
    session. Failures are reported in the response body instead of being
    raised, so a logging problem never breaks the client flow.

    Returns:
        dict: ``{"status": "logged"}`` on success, or
        ``{"status": "error", "message": ...}`` on failure.
    """
    conn = None
    try:
        # Connect to a simple file-based DB
        conn = sqlite3.connect('thesis_data.db')
        cursor = conn.cursor()

        # Create table if it doesn't exist
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS results (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                student_name TEXT,
                wpm REAL,
                fluency_score REAL,
                filler_count INTEGER,
                duration INTEGER,
                timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        # Insert the data (parameterized — safe against SQL injection)
        cursor.execute('''
            INSERT INTO results (student_name, wpm, fluency_score, filler_count, duration)
            VALUES (?, ?, ?, ?, ?)
        ''', (data.student_name, data.wpm, data.fluency_score, data.filler_count, data.duration_seconds))

        conn.commit()
        print(f"📝 Logged session for {data.student_name}")
        return {"status": "logged"}
    except Exception as e:
        print(f"❌ Failed to log session: {e}")
        return {"status": "error", "message": str(e)}
    finally:
        # The original leaked the connection when an exception occurred
        # between connect() and close(); always release it here.
        if conn is not None:
            conn.close()
208
+
209
+
210
class AppConfig(BaseModel):
    """Static frontend configuration served by GET /config."""
    update_interval_seconds: int
    supported_languages: list[str]
    semantic_score_min: int
    semantic_score_max: int


class SessionCreateResponse(BaseModel):
    """Response for POST /sessions."""
    session_id: str


class FillerInfo(BaseModel):
    """Filler-word statistics produced by detect_fillers()."""
    count: int
    fillers_detected: list[str]


class PaceInfo(BaseModel):
    """Speaking-rate metrics produced by calculate_pace()."""
    wpm: float
    status: str  # Slow, Normal, Fast


class ProsodyInfo(BaseModel):
    """Timing-based prosody metrics produced by analyze_prosody()."""
    volume_db: float | None  # volume is not currently measured from audio
    silence_ratio: float | None


class Feedback(BaseModel):
    """Filipino coaching messages produced by generate_feedback()."""
    general: str
    pacing: str
    fillers: str
    coherence: str


class ChunkAnalysisResponse(BaseModel):
    """Full analysis returned by POST /sessions/{id}/audio-chunk."""
    transcript: str
    wpm: float | None
    filler_count: int | None

    # Detailed analysis
    fillers: FillerInfo | None
    pacing: PaceInfo | None
    prosody: ProsodyInfo | None
    coherence_score: float | None
    feedback: Feedback | None

    message: str


# Lightweight response for real-time transcription (no analysis)
class QuickTranscriptResponse(BaseModel):
    """Fast-path response for POST /sessions/{id}/transcribe."""
    transcript: str
    has_speech: bool  # For auto-stop detection
    message: str
263
+
264
+
265
@app.get("/health")
async def health_check():
    """Liveness probe for deploy platforms and the frontend."""
    return dict(status="ok")
268
+
269
+
270
@app.get("/config", response_model=AppConfig)
async def get_config():
    """Return static configuration for the frontend UI."""
    config = AppConfig(
        update_interval_seconds=3,
        supported_languages=["en", "fil"],
        semantic_score_min=0,
        semantic_score_max=100,
    )
    return config
280
+
281
+
282
@app.post("/sessions", response_model=SessionCreateResponse)
async def create_session():
    """Create a new speaking session and return its ID.

    The session is not persisted yet; this is a placeholder to be
    backed by a database later.
    """
    new_id = uuid.uuid4()
    return SessionCreateResponse(session_id=str(new_id))
292
+
293
+
294
def detect_fillers(text: str) -> FillerInfo:
    """Detect and count common Filipino filler words.

    Tokenizes the lowercased text on word boundaries and collects every
    token (duplicates included, in order) that appears in the filler set.
    """
    # Closed set of fillers we scan for (Tagalog + common Taglish).
    filler_set = {
        "ano", "ah", "uh", "uhm", "parang", "kasi", "ganun",
        "e", "eh", "diba", "yung", "bale", "so", "like",
    }
    tokens = re.findall(r"\b\w+\b", text.lower())
    hits = [tok for tok in tokens if tok in filler_set]
    return FillerInfo(count=len(hits), fillers_detected=hits)
308
+
309
+
310
def calculate_pace(transcript: str, duration_seconds: float) -> PaceInfo:
    """Calculate words-per-minute and classify it as Slow/Normal/Fast."""
    # Guard against zero/negative duration before dividing.
    if duration_seconds <= 0:
        return PaceInfo(wpm=0.0, status="Normal")

    word_count = len(transcript.split())
    # Keep the exact original expression order for float-identical results.
    wpm = (word_count / duration_seconds) * 60.0

    # 100–160 wpm is treated as the normal conversational band.
    if wpm < 100:
        label = "Slow"
    elif wpm > 160:
        label = "Fast"
    else:
        label = "Normal"

    return PaceInfo(wpm=float(f"{wpm:.2f}"), status=label)
326
+
327
+
328
def analyze_prosody(segments: list, duration_seconds: float) -> ProsodyInfo:
    """Analyze prosody based on segment timings (silence detection).

    Sums the spoken time across Whisper segments and reports the fraction
    of the clip that was silent. Volume is never measured from the audio,
    so ``volume_db`` is always ``None`` (the original inconsistently
    returned 0.0 only for the empty-segments case).

    Args:
        segments: Whisper segment objects with ``start``/``end`` seconds.
        duration_seconds: Total clip duration used as the denominator.

    Returns:
        ProsodyInfo with ``silence_ratio`` rounded to 2 decimals.
    """
    if not segments:
        # No speech at all: the whole clip counts as silence.
        return ProsodyInfo(volume_db=None, silence_ratio=1.0)

    # Total time covered by speech segments.
    speech_duration = sum(seg.end - seg.start for seg in segments)

    silence_duration = max(0.0, duration_seconds - speech_duration)
    silence_ratio = silence_duration / duration_seconds if duration_seconds > 0 else 0.0

    return ProsodyInfo(volume_db=None, silence_ratio=float(f"{silence_ratio:.2f}"))
341
+
342
+
343
+
344
def calculate_fluency(text: str) -> float:
    """
    Calculate a fluency score (1-10) using RoBERTa perplexity (PPL).
    Lower PPL = More natural/fluent.

    Falls back to check_coherence_heuristic() when the model is not
    loaded or inference raises; returns 1.0 for near-empty input.

    NOTE(review): the loss is computed with labels equal to the *unmasked*
    input ids on a masked-LM head — a pseudo-perplexity over fully visible
    tokens rather than true masked PPL. Confirm the resulting score scale
    against real samples.
    """
    global roberta_model, roberta_tokenizer

    if not roberta_model or not roberta_tokenizer:
        # Fallback to simple heuristic if model not loaded
        return check_coherence_heuristic(text)

    if not text.strip() or len(text.split()) < 2:
        return 1.0  # Too short

    try:
        inputs = roberta_tokenizer(text, return_tensors="pt")
        if torch.cuda.is_available():
            # Move tensors to the same device as the model.
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = roberta_model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            ppl = torch.exp(loss).item()

        # Normalize PPL to Score (1-10)
        # Typical coherent text has PPL 5-50.
        # >100 is likely incoherent.
        # Score = 10 - (log(PPL) * factor)

        # PPL 10 -> Score ~8
        # PPL 100 -> Score ~3
        import math
        score = max(1.0, min(10.0, 11.0 - math.log(ppl)))
        return float(f"{score:.2f}")

    except Exception as e:
        # Any tokenizer/model failure degrades to the heuristic score.
        print(f"⚠️ RoBERTa analysis failed: {e}")
        return check_coherence_heuristic(text)
382
+
383
+
384
def check_coherence_heuristic(text: str) -> float:
    """Fallback coherence estimate used when the RoBERTa model is unavailable.

    Starts from a neutral 5.0 and subtracts a fixed penalty for very short
    fragments and another for heavy word repetition; clamped to >= 1.0.
    """
    rating = 5.0

    # Very short fragments read as incoherent.
    if len(text.split()) < 3:
        rating -= 2.0

    lowered = text.lower().split()
    if len(lowered) > 4:
        # Penalize when fewer than half of the words are unique.
        uniqueness = len(set(lowered)) / len(lowered)
        if uniqueness < 0.5:
            rating -= 2.0

    return max(1.0, rating)
400
+
401
+
402
def generate_feedback(pace: PaceInfo, fillers: FillerInfo, prosody: ProsodyInfo, coherence_score: float) -> Feedback:
    """Generate Filipino feedback based on metrics.

    Note: `prosody` is accepted for interface symmetry but is not used
    in any message yet.
    """
    # Pacing: map the status label straight to its coaching message.
    pacing_by_status = {
        "Fast": "Medyo mabilis ang iyong pagsasalita. Subukang bagalan ng kaunti para mas maintindihan.",
        "Slow": "Medyo mabagal. Subukang bilisan nang kaunti para mas tuloy-tuloy ang daloy.",
    }
    pacing_msg = pacing_by_status.get(pace.status, "Ayos ang iyong bilis! Panatilihin ito.")

    # Fillers: call out the first detected filler once there are several.
    if fillers.count > 2:
        filler_msg = (
            f"Napansin ko ang paggamit ng '{fillers.fillers_detected[0]}'. "
            "Subukang mag-pause sandali sa halip na gumamit ng filler words."
        )
    else:
        filler_msg = "Mahusay! Malinis ang iyong pagsasalita mula sa mga filler words."

    # Coherence and the general message move together.
    if coherence_score < 3.0:
        coherence_msg = "Medyo putol-putol ang ideya. Subukang buuin ang pangungusap."
        general_msg = "Kaya mo yan! Practice pa tayo."
    else:
        coherence_msg = "Malinaw ang daloy ng iyong ideya."
        general_msg = "Maganda ang iyong performance!"

    return Feedback(
        general=general_msg,
        pacing=pacing_msg,
        fillers=filler_msg,
        coherence=coherence_msg,
    )
433
+
434
+
435
+
436
+
437
+
438
+ from fastapi import Form, UploadFile, File
439
+
440
@app.post("/sessions/{session_id}/transcribe", response_model=QuickTranscriptResponse)
async def quick_transcribe(
    session_id: str,
    file: UploadFile = File(...),
    prompt: str = Form("")  # Optional previous context
):
    """Fast transcription endpoint with context prompt.

    Reads the uploaded audio fully into memory, writes it to a temporary
    .webm file, and runs faster-whisper in a worker thread so the event
    loop is not blocked. Never raises: failures return an empty
    transcript with an error message.
    """

    audio_bytes = await file.read()

    def _transcribe() -> tuple[str, bool]:
        # delete=False because Whisper re-opens the file by name after we
        # close it; we remove it ourselves in the finally block.
        tmp_file = tempfile.NamedTemporaryFile(suffix=".webm", delete=False)
        try:
            tmp_file.write(audio_bytes)
            tmp_file.flush()
            tmp_file.close()

            # Use the previous transcript as a prompt to guide Whisper
            # This fixes "amo" -> "ano" by giving context
            initial_prompt_text = prompt if prompt else None

            segments, info = model.transcribe(
                tmp_file.name,
                language="tl",  # Force Tagalog/Taglish to prevent Spanish detection
                task="transcribe",
                beam_size=5,
                vad_filter=True,  # Re-enable VAD to help with silence (looping)
                vad_parameters=dict(min_silence_duration_ms=500),
                initial_prompt=initial_prompt_text,
                condition_on_previous_text=False,
                # Filters to reduce hallucinations/looping:
                temperature=0.0,
                compression_ratio_threshold=2.4,  # Filter loops
                log_prob_threshold=-1.0,  # Filter uncertain nonsense (fixed param name)
                no_speech_threshold=0.6,  # Filter silence
            )

            # Iterating `segments` drives the actual decoding, so it must
            # happen before the temp file is removed in finally.
            texts = [seg.text.strip() for seg in segments if seg.text]
            transcript = " ".join(texts).strip()
            # Consider any non-trivial transcript as speech
            has_speech = len(transcript) > 2

            return transcript, has_speech
        finally:
            try:
                os.remove(tmp_file.name)
            except OSError:
                pass

    try:
        # Run the blocking Whisper call off the event loop.
        transcript, has_speech = await asyncio.to_thread(_transcribe)
        return QuickTranscriptResponse(
            transcript=transcript,
            has_speech=has_speech,
            message="OK" if has_speech else "No speech detected"
        )
    except Exception as exc:
        # Log server-side only; clients get a generic failure message.
        print(f"[transcribe-error] {exc}")
        return QuickTranscriptResponse(
            transcript="",
            has_speech=False,
            message="Transcription failed"
        )
503
+
504
+
505
@app.post("/sessions/{session_id}/audio-chunk", response_model=ChunkAnalysisResponse)
async def upload_audio_chunk(session_id: str, file: UploadFile = File(...)):
    """Full analysis endpoint - use when recording stops.

    Uses a local Whisper model (via faster-whisper) so there is
    no dependency on paid cloud APIs. The audio comes from the
    browser as WEBM/Opus; we write it to a temporary file and let
    Whisper handle decoding via ffmpeg.

    Pipeline: transcribe -> detect_fillers / calculate_pace /
    analyze_prosody / calculate_fluency -> generate_feedback.
    Transcription failures degrade to an empty transcript so the
    analysis still returns a well-formed response.
    """

    audio_bytes = await file.read()

    async def recognize_with_whisper(audio_content: bytes) -> tuple[str, float | None, list]:
        """Run Whisper transcription in a worker thread.

        Returns a pair of (transcript, duration_seconds, segments).
        """

        def _call() -> tuple[str, float | None, list]:
            # Use global model instance

            # delete=False: Whisper re-opens the file by name; removed in finally.
            tmp_file = tempfile.NamedTemporaryFile(suffix=".webm", delete=False)
            try:
                tmp_file.write(audio_content)
                tmp_file.flush()
                tmp_file.close()

                segments, info = model.transcribe(
                    tmp_file.name,
                    language="tl",  # Force Tagalog to prevent translation to English
                    task="transcribe",  # Transcribe, don't translate to English
                    beam_size=5,  # Better accuracy
                    vad_filter=False,  # Disabled to avoid cutting off speech
                    condition_on_previous_text=False,  # Faster, no context dependency
                )

                # Materialize the lazy segment generator now, while the
                # temp file still exists on disk.
                segment_list = list(segments)

                texts: list[str] = []
                for segment in segment_list:
                    if segment.text:
                        texts.append(segment.text.strip())

                transcript_text = " ".join(texts).strip()

                duration_seconds: float | None = None
                # Prefer model-reported duration when available.
                if getattr(info, "duration", None):
                    duration_seconds = float(info.duration)  # type: ignore[arg-type]
                elif segment_list:
                    # Fall back to the time span covered by the segments.
                    start = float(segment_list[0].start or 0.0)
                    end = float(segment_list[-1].end or 0.0)
                    if end > start:
                        duration_seconds = end - start

                return transcript_text, duration_seconds, segment_list
            finally:
                try:
                    os.remove(tmp_file.name)
                except OSError:
                    pass

        return await asyncio.to_thread(_call)

    transcript = ""
    duration_seconds: float | None = None
    segments: list = []

    try:
        transcript, duration_seconds, segments = await recognize_with_whisper(audio_bytes)
        if transcript:
            message = "Transcription successful."
        else:
            message = "No clear speech detected in this chunk."
    except Exception as exc:  # pragma: no cover - defensive for runtime issues
        # Log detailed error on the server side only.
        print(f"[whisper-error] Failed to transcribe chunk for session {session_id}: {exc}")
        message = "Transcription skipped for this chunk (audio too short or invalid)."
        transcript = ""

    # Run analysis modules
    # Use fallback duration of 3.0s if undefined, to avoid division by zero
    safe_duration = duration_seconds if duration_seconds and duration_seconds > 0 else 3.0

    fillers = detect_fillers(transcript)
    pace = calculate_pace(transcript, safe_duration)
    prosody = analyze_prosody(segments, safe_duration)
    # Use RoBERTa for advanced fluency scoring (or fallback to heuristic)
    coherence = calculate_fluency(transcript)

    feedback = generate_feedback(pace, fillers, prosody, coherence)

    return ChunkAnalysisResponse(
        transcript=transcript,
        wpm=pace.wpm,
        filler_count=fillers.count,
        fillers=fillers,
        pacing=pace,
        prosody=prosody,
        coherence_score=coherence,
        feedback=feedback,
        message=message,
    )
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face Spaces specific requirements (CPU-only for free tier)
2
+ fastapi
3
+ uvicorn[standard]
4
+ python-multipart
5
+ faster-whisper
6
+ numpy
7
+ scipy
8
+ zeroconf
9
+ transformers
10
+ --extra-index-url https://download.pytorch.org/whl/cpu
11
+ torch