Spaces:

Jay162005
/

salitako2.0

Sleeping

App Files Files Community

Jay162005 committed on Feb 15

Commit

eb0c8ae

verified ·

1 Parent(s): 03e3fc5

Upload 4 files

Browse files

Files changed (4) hide show

Dockerfile +32 -0
README.md +24 -5
main.py +607 -0
requirements.txt +11 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,32 @@

+# Hugging Face Spaces Dockerfile for SalitaKo Backend
+FROM python:3.11-slim
+# Install system dependencies for audio processing
+RUN apt-get update && apt-get install -y \
+    ffmpeg \
+    libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*
+# Create non-root user (required by HF Spaces)
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Copy requirements first for caching.
+# The dependency list uploaded with this Space is requirements.txt; copying a
+# differently named source file would fail the build when it is not in the repo.
+COPY --chown=user requirements.txt requirements.txt
+# Install Python dependencies (CPU-only torch for free tier)
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+# Copy application code
+COPY --chown=user . .
+# Expose port 7860 (Hugging Face Spaces default)
+EXPOSE 7860
+# Run the FastAPI app
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,12 +1,31 @@
 ---
-title: Salitako2.0
-emoji: 🦀
 colorFrom: blue
-colorTo: red
 sdk: docker
 pinned: false
 license: mit
-short_description: a ai transcription
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: SalitaKo Speech Coach API
+emoji: 🎤
 colorFrom: blue
+colorTo: purple
 sdk: docker
+app_port: 7860
 pinned: false
 license: mit
 ---
+# SalitaKo Speech Coach API
+Filipino/Tagalog speech coaching backend powered by:
+- **Whisper** (faster-whisper) - Speech-to-text
+- **RoBERTa** (jcblaise/roberta-tagalog-base) - Fluency scoring
+## API Endpoints
+- `GET /` - Welcome message
+- `GET /health` - Health check
+- `GET /docs` - Swagger UI documentation
+- `POST /sessions` - Create a new session
+- `POST /sessions/{id}/transcribe` - Quick transcription
+- `POST /sessions/{id}/audio-chunk` - Full analysis with feedback
+## Usage
+```bash
+curl https://YOUR-SPACE.hf.space/health
+```

main.py ADDED Viewed

	@@ -0,0 +1,607 @@

+import re
+import socket
+import sqlite3
+import datetime
+import numpy as np
+from fastapi import FastAPI, UploadFile, File
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+import asyncio
+import tempfile
+import os
+import uuid
+from contextlib import asynccontextmanager
+from faster_whisper import WhisperModel
+from zeroconf import ServiceInfo
+from zeroconf.asyncio import AsyncZeroconf
+# mDNS Service Configuration
+# Service type and instance name used when registering with zeroconf at startup.
+SERVICE_TYPE = "_salitako._tcp.local."
+SERVICE_NAME = "SalitaKo Server._salitako._tcp.local."
+# Port published in the mDNS record.
+# NOTE(review): the Dockerfile launches uvicorn on 7860, not 8000 — confirm which
+# port local (non-cloud) runs are expected to listen on.
+SERVICE_PORT = 8000
+# Cloud deployment detection (Hugging Face Spaces, Railway, etc.)
+# True when either SPACE_ID or RAILWAY_ENVIRONMENT is present in the environment.
+IS_CLOUD = os.environ.get("SPACE_ID") is not None or os.environ.get("RAILWAY_ENVIRONMENT") is not None
+def get_local_ip():
+    """Get the local IP address of this machine."""
+    try:
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        s.connect(("8.8.8.8", 80))
+        ip = s.getsockname()[0]
+        s.close()
+        return ip
+    except Exception:
+        return "127.0.0.1"
+# Global async zeroconf instance
+async_zeroconf = None
+service_info = None
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+import torch
+# Global model instances
+model = None  # Whisper
+roberta_model = None
+roberta_tokenizer = None
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Manage mDNS service registration and Model loading on startup/shutdown."""
+    global async_zeroconf, service_info, model, roberta_model, roberta_tokenizer
+    # 1. Load Whisper
+    print("⏳ Loading Whisper model...")
+    try:
+        print(f"🔧 CUDA Available: {torch.cuda.is_available()}")
+        if torch.cuda.is_available():
+            print(f"🔧 GPU Device: {torch.cuda.get_device_name(0)}")
+            model = WhisperModel(
+                "base",           # Fast loading
+                device="cuda",     # Use NVIDIA GPU
+                compute_type="float16"
+            )
+        else:
+            # CPU fallback (for cloud free tiers)
+            print("🔧 Using CPU mode")
+            model = WhisperModel("base", device="cpu", compute_type="int8")
+        print("✅ Whisper model loaded successfully")
+    except Exception as e:
+        print(f"❌ Failed to load Whisper model: {e}")
+        print("⚠️ Falling back to CPU/int8...")
+        model = WhisperModel("small", device="cpu", compute_type="int8")
+    # 2. Load RoBERTa (Tagalog)
+    print("⏳ Loading RoBERTa (Tagalog) model...")
+    try:
+        # Use jcblaise/roberta-tagalog-base for fluency/coherence
+        model_name = "jcblaise/roberta-tagalog-base"
+        roberta_tokenizer = AutoTokenizer.from_pretrained(model_name)
+        roberta_model = AutoModelForMaskedLM.from_pretrained(model_name)
+        if torch.cuda.is_available():
+            roberta_model.to("cuda")
+        roberta_model.eval() # Set to evaluation mode
+        print("✅ RoBERTa model loaded successfully")
+    except Exception as e:
+        print(f"❌ Failed to load RoBERTa model: {e}")
+        roberta_model = None
+        roberta_tokenizer = None
+    # Startup: Register mDNS service (skip on cloud deployments)
+    if IS_CLOUD:
+        print("☁️ Cloud deployment detected - skipping mDNS registration")
+    else:
+        local_ip = get_local_ip()
+        print(f"🌐 Local IP: {local_ip}")
+        try:
+            async_zeroconf = AsyncZeroconf()
+            service_info = ServiceInfo(
+                SERVICE_TYPE,
+                SERVICE_NAME,
+                addresses=[socket.inet_aton(local_ip)],
+                port=SERVICE_PORT,
+                properties={
+                    "version": "0.2.0",
+                    "api": "/docs",
+                    "name": "SalitaKo Speech Coach"
+                },
+                server=f"salitako.local.",
+            )
+            await async_zeroconf.async_register_service(service_info)
+            print(f"📡 mDNS service registered: {SERVICE_NAME} at {local_ip}:{SERVICE_PORT}")
+        except Exception as e:
+            print(f"⚠️ mDNS registration failed (non-fatal): {e}")
+            async_zeroconf = None
+    yield
+    # Shutdown: Unregister mDNS service
+    if async_zeroconf and service_info:
+        print("📡 Unregistering mDNS service...")
+        try:
+            await async_zeroconf.async_unregister_service(service_info)
+            await async_zeroconf.async_close()
+        except Exception as e:
+            print(f"⚠️ mDNS unregister failed: {e}")
+app = FastAPI(title="SalitaKo API", version="0.2.0", lifespan=lifespan)
+@app.get("/")
+async def read_root():
+    local_ip = get_local_ip()
+    return {
+        "message": "Welcome to SalitaKo API",
+        "docs_url": f"http://{local_ip}:8000/docs",
+        "health_check": f"http://{local_ip}:8000/health",
+        "local_ip": local_ip
+    }
+# CORS policy for browser clients.
+# NOTE(review): the "*" entry already admits every origin, which makes the two
+# explicit entries redundant. Glob-style hosts such as "https://*.hf.space" are
+# presumably not expanded inside `allow_origins` (Starlette offers
+# `allow_origin_regex` for patterns) — confirm before removing "*".
+# NOTE(review): combining "*" with allow_credentials=True should be re-checked
+# against the browser CORS rules for credentialed requests.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=[
+        "http://localhost:3000",
+        "https://*.hf.space",  # Hugging Face Spaces
+        "*"  # Allow all for development (restrict in production)
+    ],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Request body for POST /log-session: one finished session's summary metrics.
+class SessionResult(BaseModel):
+    student_name: str
+    wpm: float
+    fluency_score: float
+    filler_count: int
+    # Stored in the `duration` column of the `results` table.
+    duration_seconds: int
+@app.post("/log-session")
+async def log_session_result(data: SessionResult):
+    """Log session results to a local SQLite database for research analysis."""
+    try:
+        # Connect to a simple file-based DB
+        conn = sqlite3.connect('thesis_data.db')
+        cursor = conn.cursor()
+        # Create table if it doesn't exist
+        cursor.execute('''
+            CREATE TABLE IF NOT EXISTS results (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                student_name TEXT,
+                wpm REAL,
+                fluency_score REAL,
+                filler_count INTEGER,
+                duration INTEGER,
+                timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
+            )
+        ''')
+        # Insert the data
+        cursor.execute('''
+            INSERT INTO results (student_name, wpm, fluency_score, filler_count, duration)
+            VALUES (?, ?, ?, ?, ?)
+        ''', (data.student_name, data.wpm, data.fluency_score, data.filler_count, data.duration_seconds))
+        conn.commit()
+        conn.close()
+        print(f"📝 Logged session for {data.student_name}")
+        return {"status": "logged"}
+    except Exception as e:
+        print(f"❌ Failed to log session: {e}")
+        return {"status": "error", "message": str(e)}
+# Static UI configuration returned by GET /config.
+class AppConfig(BaseModel):
+    update_interval_seconds: int
+    supported_languages: list[str]
+    semantic_score_min: int
+    semantic_score_max: int
+# Response of POST /sessions; carries the newly generated UUID string.
+class SessionCreateResponse(BaseModel):
+    session_id: str
+# Result of detect_fillers(): how many filler words were found and which ones.
+class FillerInfo(BaseModel):
+    count: int
+    # Matched filler words in order of appearance, duplicates included.
+    fillers_detected: list[str]
+# Result of calculate_pace(): words per minute and its speed classification.
+class PaceInfo(BaseModel):
+    wpm: float
+    status: str  # Slow, Normal, Fast
+# Result of analyze_prosody().
+class ProsodyInfo(BaseModel):
+    # analyze_prosody() sets this to 0.0 when there are no segments and to
+    # None otherwise; no loudness is computed in this file.
+    volume_db: float | None
+    # Share of the clip duration not covered by transcribed segments.
+    silence_ratio: float | None
+# Filipino-language coaching messages produced by generate_feedback().
+class Feedback(BaseModel):
+    general: str
+    pacing: str
+    fillers: str
+    coherence: str
+# Response of POST /sessions/{session_id}/audio-chunk: transcript plus analysis.
+class ChunkAnalysisResponse(BaseModel):
+    transcript: str
+    # Top-level copies of pacing.wpm and fillers.count.
+    wpm: float | None
+    filler_count: int | None
+    # Detailed analysis
+    fillers: FillerInfo | None
+    pacing: PaceInfo | None
+    prosody: ProsodyInfo | None
+    coherence_score: float | None
+    feedback: Feedback | None
+    # Human-readable transcription status.
+    message: str
+# Lightweight response for real-time transcription (no analysis)
+class QuickTranscriptResponse(BaseModel):
+    transcript: str
+    # Set by quick_transcribe() to True when the transcript is longer than 2 characters.
+    has_speech: bool  # For auto-stop detection
+    message: str
+@app.get("/health")
+async def health_check():
+    """Return a constant OK payload; does not inspect model or database state."""
+    return {"status": "ok"}
+@app.get("/config", response_model=AppConfig)
+async def get_config():
+    """Return static configuration for the frontend UI."""
+    return AppConfig(
+        update_interval_seconds=3,
+        supported_languages=["en", "fil"],
+        semantic_score_min=0,
+        semantic_score_max=100,
+    )
+@app.post("/sessions", response_model=SessionCreateResponse)
+async def create_session():
+    """Create a new speaking session and return its ID.
+    For now, the session is not persisted; this is a placeholder
+    to be backed by a database later.
+    """
+    session_id = str(uuid.uuid4())
+    return SessionCreateResponse(session_id=session_id)
+def detect_fillers(text: str) -> FillerInfo:
+    """Detect and count common Filipino filler words."""
+    keywords = [
+        "ano", "ah", "uh", "uhm", "parang", "kasi", "ganun",
+        "e", "eh", "diba", "yung", "bale", "so", "like"
+    ]
+    detected = []
+    count = 0
+    words = re.findall(r"\b\w+\b", text.lower())
+    for word in words:
+        if word in keywords:
+            detected.append(word)
+            count += 1
+    return FillerInfo(count=count, fillers_detected=detected)
+def calculate_pace(transcript: str, duration_seconds: float) -> PaceInfo:
+    """Calculate WPM and classify speed."""
+    words = len(transcript.split())
+    if duration_seconds <= 0:
+        return PaceInfo(wpm=0.0, status="Normal")
+    wpm = (words / duration_seconds) * 60.0
+    if wpm < 100:
+        status = "Slow"
+    elif wpm > 160:
+        status = "Fast"
+    else:
+        status = "Normal"
+    return PaceInfo(wpm=float(f"{wpm:.2f}"), status=status)
+def analyze_prosody(segments: list, duration_seconds: float) -> ProsodyInfo:
+    """Analyze prosody based on segment timings (silence detection)."""
+    if not segments:
+        return ProsodyInfo(volume_db=0.0, silence_ratio=1.0)
+    speech_duration = 0.0
+    for seg in segments:
+        speech_duration += (seg.end - seg.start)
+    silence_duration = max(0.0, duration_seconds - speech_duration)
+    silence_ratio = silence_duration / duration_seconds if duration_seconds > 0 else 0.0
+    return ProsodyInfo(volume_db=None, silence_ratio=float(f"{silence_ratio:.2f}"))
+def calculate_fluency(text: str) -> float:
+    """
+    Calculate a fluency score (1-10) using RoBERTa perplexity (PPL).
+    Lower PPL = More natural/fluent.
+    """
+    global roberta_model, roberta_tokenizer
+    if not roberta_model or not roberta_tokenizer:
+        # Fallback to simple heuristic if model not loaded
+        return check_coherence_heuristic(text)
+    if not text.strip() or len(text.split()) < 2:
+        return 1.0 # Too short
+    try:
+        inputs = roberta_tokenizer(text, return_tensors="pt")
+        if torch.cuda.is_available():
+            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = roberta_model(**inputs, labels=inputs["input_ids"])
+            loss = outputs.loss
+            ppl = torch.exp(loss).item()
+        # Normalize PPL to Score (1-10)
+        # Typical coherent text has PPL 5-50.
+        # >100 is likely incoherent.
+        # Score = 10 - (log(PPL) * factor)
+        # PPL 10 -> Score ~8
+        # PPL 100 -> Score ~3
+        import math
+        score = max(1.0, min(10.0, 11.0 - math.log(ppl)))
+        return float(f"{score:.2f}")
+    except Exception as e:
+        print(f"⚠️ RoBERTa analysis failed: {e}")
+        return check_coherence_heuristic(text)
+def check_coherence_heuristic(text: str) -> float:
+    """Heuristic check for coherence (Fallback)."""
+    score = 5.0
+    # Penalize very short fragments
+    if len(text.split()) < 3:
+        score -= 2.0
+    # Penalize excessive repetition
+    words = text.lower().split()
+    if len(words) > 4:
+        unique_words = set(words)
+        ratio = len(unique_words) / len(words)
+        if ratio < 0.5:
+            score -= 2.0
+    return max(1.0, score)
+def generate_feedback(pace: PaceInfo, fillers: FillerInfo, prosody: ProsodyInfo, coherence_score: float) -> Feedback:
+    """Generate Filipino feedback based on metrics."""
+    # Pacing Feedback
+    if pace.status == "Fast":
+        pacing_msg = "Medyo mabilis ang iyong pagsasalita. Subukang bagalan ng kaunti para mas maintindihan."
+    elif pace.status == "Slow":
+        pacing_msg = "Medyo mabagal. Subukang bilisan nang kaunti para mas tuloy-tuloy ang daloy."
+    else:
+        pacing_msg = "Ayos ang iyong bilis! Panatilihin ito."
+    # Filler Feedback
+    if fillers.count > 2:
+        filler_msg = f"Napansin ko ang paggamit ng '{fillers.fillers_detected[0]}'. Subukang mag-pause sandali sa halip na gumamit ng filler words."
+    else:
+        filler_msg = "Mahusay! Malinis ang iyong pagsasalita mula sa mga filler words."
+    # General/Coherence
+    if coherence_score < 3.0:
+        coherence_msg = "Medyo putol-putol ang ideya. Subukang buuin ang pangungusap."
+        general_msg = "Kaya mo yan! Practice pa tayo."
+    else:
+        coherence_msg = "Malinaw ang daloy ng iyong ideya."
+        general_msg = "Maganda ang iyong performance!"
+    return Feedback(
+        general=general_msg,
+        pacing=pacing_msg,
+        fillers=filler_msg,
+        coherence=coherence_msg
+    )
+from fastapi import Form, UploadFile, File
+@app.post("/sessions/{session_id}/transcribe", response_model=QuickTranscriptResponse)
+async def quick_transcribe(
+    session_id: str,
+    file: UploadFile = File(...),
+    prompt: str = Form("")  # Optional previous context
+):
+    """Fast transcription endpoint with context prompt.
+
+    Reads the uploaded audio, transcribes it with the global Whisper model in
+    a worker thread, and returns the text together with a `has_speech` flag.
+    Any exception raised while transcribing is logged and reported as an empty
+    transcript with message "Transcription failed". `session_id` comes from
+    the path but is not used inside this handler.
+    """
+    audio_bytes = await file.read()
+    def _transcribe() -> tuple[str, bool]:
+        # The model is given a file path, so the bytes are spilled to a
+        # temporary .webm file that is removed in the `finally` below.
+        tmp_file = tempfile.NamedTemporaryFile(suffix=".webm", delete=False)
+        try:
+            tmp_file.write(audio_bytes)
+            tmp_file.flush()
+            tmp_file.close()
+            # Use the previous transcript as a prompt to guide Whisper
+            # This fixes "amo" -> "ano" by giving context
+            initial_prompt_text = prompt if prompt else None
+            segments, info = model.transcribe(
+                tmp_file.name,
+                language="tl",     # Force Tagalog/Taglish to prevent Spanish detection
+                task="transcribe",
+                beam_size=5,
+                vad_filter=True,   # Re-enable VAD to help with silence (looping)
+                vad_parameters=dict(min_silence_duration_ms=500),
+                initial_prompt=initial_prompt_text,
+                condition_on_previous_text=False,
+                # Filters to reduce hallucinations/looping:
+                temperature=0.0,
+                compression_ratio_threshold=2.4, # Filter loops
+                log_prob_threshold=-1.0,         # Filter uncertain nonsense (fixed param name)
+                no_speech_threshold=0.6,         # Filter silence
+            )
+            # Segments are consumed here, while the temp file still exists.
+            texts = [seg.text.strip() for seg in segments if seg.text]
+            transcript = " ".join(texts).strip()
+            # Consider any non-trivial transcript as speech
+            has_speech = len(transcript) > 2
+            return transcript, has_speech
+        finally:
+            # Best-effort cleanup; a failed delete must not mask the result.
+            try:
+                os.remove(tmp_file.name)
+            except OSError:
+                pass
+    try:
+        # Run the blocking transcription off the event loop.
+        transcript, has_speech = await asyncio.to_thread(_transcribe)
+        return QuickTranscriptResponse(
+            transcript=transcript,
+            has_speech=has_speech,
+            message="OK" if has_speech else "No speech detected"
+        )
+    except Exception as exc:
+        print(f"[transcribe-error] {exc}")
+        return QuickTranscriptResponse(
+            transcript="",
+            has_speech=False,
+            message="Transcription failed"
+        )
+@app.post("/sessions/{session_id}/audio-chunk", response_model=ChunkAnalysisResponse)
+async def upload_audio_chunk(session_id: str, file: UploadFile = File(...)):
+    """Full analysis endpoint - use when recording stops.
+    Uses a local Whisper model (via faster-whisper) so there is
+    no dependency on paid cloud APIs. The audio comes from the
+    browser as WEBM/Opus; we write it to a temporary file and let
+    Whisper handle decoding via ffmpeg.
+
+    After transcription the filler, pace, prosody and fluency analyses run on
+    the transcript and are returned together with generated feedback. When
+    transcription raises, the analyses still run on an empty transcript and
+    the response message reports the chunk as skipped.
+    """
+    audio_bytes = await file.read()
+    async def recognize_with_whisper(audio_content: bytes) -> tuple[str, float | None, list]:
+        """Run Whisper transcription in a worker thread.
+        Returns a triple of (transcript, duration_seconds, segments).
+        """
+        def _call() -> tuple[str, float | None, list]:
+            # Use global model instance
+            tmp_file = tempfile.NamedTemporaryFile(suffix=".webm", delete=False)
+            try:
+                tmp_file.write(audio_content)
+                tmp_file.flush()
+                tmp_file.close()
+                segments, info = model.transcribe(
+                    tmp_file.name,
+                    language="tl",  # Force Tagalog to prevent translation to English
+                    task="transcribe",  # Transcribe, don't translate to English
+                    beam_size=5,    # Better accuracy
+                    vad_filter=False,  # Disabled to avoid cutting off speech
+                    condition_on_previous_text=False,  # Faster, no context dependency
+                )
+                # Materialize the segments while the temp file still exists;
+                # the list is reused below for text, duration and prosody.
+                segment_list = list(segments)
+                texts: list[str] = []
+                for segment in segment_list:
+                    if segment.text:
+                        texts.append(segment.text.strip())
+                transcript_text = " ".join(texts).strip()
+                duration_seconds: float | None = None
+                # Prefer model-reported duration when available.
+                if getattr(info, "duration", None):
+                    duration_seconds = float(info.duration)  # type: ignore[arg-type]
+                elif segment_list:
+                    # Otherwise approximate it by the span from the first
+                    # segment's start to the last segment's end.
+                    start = float(segment_list[0].start or 0.0)
+                    end = float(segment_list[-1].end or 0.0)
+                    if end > start:
+                        duration_seconds = end - start
+                return transcript_text, duration_seconds, segment_list
+            finally:
+                # Best-effort removal of the temporary audio file.
+                try:
+                    os.remove(tmp_file.name)
+                except OSError:
+                    pass
+        return await asyncio.to_thread(_call)
+    transcript = ""
+    duration_seconds: float | None = None
+    segments: list = []
+    try:
+        transcript, duration_seconds, segments = await recognize_with_whisper(audio_bytes)
+        if transcript:
+            message = "Transcription successful."
+        else:
+            message = "No clear speech detected in this chunk."
+    except Exception as exc:  # pragma: no cover - defensive for runtime issues
+        # Log detailed error on the server side only.
+        print(f"[whisper-error] Failed to transcribe chunk for session {session_id}: {exc}")
+        message = "Transcription skipped for this chunk (audio too short or invalid)."
+        transcript = ""
+    # Run analysis modules
+    # Use fallback duration of 3.0s if undefined, to avoid division by zero
+    safe_duration = duration_seconds if duration_seconds and duration_seconds > 0 else 3.0
+    fillers = detect_fillers(transcript)
+    pace = calculate_pace(transcript, safe_duration)
+    prosody = analyze_prosody(segments, safe_duration)
+    # Use RoBERTa for advanced fluency scoring (or fallback to heuristic)
+    coherence = calculate_fluency(transcript)
+    feedback = generate_feedback(pace, fillers, prosody, coherence)
+    return ChunkAnalysisResponse(
+        transcript=transcript,
+        wpm=pace.wpm,
+        filler_count=fillers.count,
+        fillers=fillers,
+        pacing=pace,
+        prosody=prosody,
+        coherence_score=coherence,
+        feedback=feedback,
+        message=message,
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+# Hugging Face Spaces specific requirements (CPU-only for free tier)
+fastapi
+uvicorn[standard]
+python-multipart
+faster-whisper
+numpy
+scipy
+zeroconf
+transformers
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch