Upload 2 files
- main.py +103 -17
- requirements.txt +5 -3
main.py CHANGED
@@ -1,8 +1,11 @@
 import re
+import math
 import socket
 import sqlite3
 import datetime
 import numpy as np
+from scipy.signal import butter, sosfilt
+from scipy.io import wavfile
 from fastapi import FastAPI, UploadFile, File
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
@@ -25,6 +28,75 @@ SERVICE_PORT = 8000
 IS_CLOUD = os.environ.get("SPACE_ID") is not None or os.environ.get("RAILWAY_ENVIRONMENT") is not None
 
 
+# ──────────────────────────────────────────────────────────────
+# Filipino / Taglish vocabulary hint for Whisper initial_prompt.
+# Priming the decoder with real Filipino words dramatically
+# reduces mis-hearings like "amo" → "ano".
+# ──────────────────────────────────────────────────────────────
+FILIPINO_VOCAB_PROMPT = (
+    "Ang, ang, mga, na, sa, ng, ko, mo, niya, namin, nila, "
+    "ano, ito, iyon, siya, kami, tayo, sila, "
+    "hindi, oo, wala, meron, paano, bakit, "
+    "kasi, diba, yung, naman, pala, talaga, "
+    "po, ho, kuya, ate, "
+    "maganda, mabuti, masaya, malaki, maliit, "
+    "kumain, uminom, pumunta, naglaro, natulog, "
+    "paaralan, bahay, trabaho, kaibigan, pamilya, "
+    "salamat, magandang, umaga, hapon, gabi"
+)
+
+# Known Whisper misrecognitions for Filipino — extend as needed.
+WHISPER_CORRECTIONS: dict[str, str] = {
+    "amo": "ano",
+    "cayo": "kayo",
+    "yong": "yung",
+    "cami": "kami",
+    "cum": "kum",
+    "naman naman": "naman",
+}
+
+
+def post_process_transcript(text: str) -> str:
+    """Fix known Whisper misrecognitions for Filipino."""
+    # Multi-word replacements first
+    for wrong, right in WHISPER_CORRECTIONS.items():
+        if " " in wrong:
+            text = re.sub(re.escape(wrong), right, text, flags=re.IGNORECASE)
+
+    words = text.split()
+    corrected = []
+    for word in words:
+        lower = word.lower()
+        if lower in WHISPER_CORRECTIONS:
+            corrected.append(WHISPER_CORRECTIONS[lower])
+        else:
+            corrected.append(word)
+    return " ".join(corrected)
+
+
+def preprocess_audio(file_path: str) -> str:
+    """Apply high-pass filter + normalization to reduce background noise."""
+    try:
+        sr, audio = wavfile.read(file_path)
+        audio = audio.astype(np.float32) / 32768.0
+
+        # High-pass at 80 Hz — removes low rumble / AC hum
+        sos = butter(5, 80, btype="highpass", fs=sr, output="sos")
+        audio = sosfilt(sos, audio)
+
+        # Peak-normalize to 0.95
+        peak = np.max(np.abs(audio))
+        if peak > 0:
+            audio = audio / peak * 0.95
+
+        processed_path = file_path.replace(".wav", "_clean.wav")
+        wavfile.write(processed_path, sr, (audio * 32767).astype(np.int16))
+        return processed_path
+    except Exception as e:
+        print(f"⚠️ Audio preprocessing failed (using raw): {e}")
+        return file_path
+
+
 def get_local_ip():
     """Get the local IP address of this machine."""
     try:
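Note: the behavior of the new post-processing, on a made-up input (a sketch, not part of the commit):

    # Hypothetical check of post_process_transcript.
    sample = "Amo yong sinabi niya naman naman po"
    print(post_process_transcript(sample))
    # -> "ano yung sinabi niya naman po"
    # The multi-word pass runs first ("naman naman" -> "naman", case-insensitive),
    # then each remaining token is looked up in WHISPER_CORRECTIONS. Tokens with
    # attached punctuation (e.g. "amo,") will not match the bare dict keys.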
@@ -62,19 +134,19 @@ async def lifespan(app: FastAPI):
         if torch.cuda.is_available():
             print(f"🔧 GPU Device: {torch.cuda.get_device_name(0)}")
             model = WhisperModel(
-                "base",
-                device="cuda",
+                "small",  # 3x more accurate than 'base'
+                device="cuda",
                 compute_type="float16"
             )
         else:
-            # CPU
-            print("🔧 Using CPU mode")
-            model = WhisperModel("…
-            print("✅ Whisper model loaded successfully")
+            # CPU / free HF Space — small+int8 fits in ~2 GB RAM
+            print("🔧 Using CPU mode (small + int8)")
+            model = WhisperModel("small", device="cpu", compute_type="int8")
+            print("✅ Whisper 'small' model loaded successfully")
     except Exception as e:
         print(f"❌ Failed to load Whisper model: {e}")
-        print("⚠️ Falling back to …
-        model = WhisperModel("…
+        print("⚠️ Falling back to base/int8...")
+        model = WhisperModel("base", device="cpu", compute_type="int8")
 
     # 2. Load RoBERTa (Tagalog)
     print("⏳ Loading RoBERTa (Tagalog) model...")
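Note: the CPU configuration chosen here can be exercised in isolation; a minimal sketch (the sample file name is an assumption):

    # Standalone check of the small + int8 CPU path.
    from faster_whisper import WhisperModel

    model = WhisperModel("small", device="cpu", compute_type="int8")
    segments, info = model.transcribe("sample.wav", language="tl", beam_size=5)
    print(info.language, round(info.duration, 1))  # forced language, length in seconds
    print(" ".join(seg.text.strip() for seg in segments))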
@@ -372,7 +444,6 @@ def calculate_fluency(text: str) -> float:
 
     # PPL 10 -> Score ~8
     # PPL 100 -> Score ~3
-    import math
     score = max(1.0, min(10.0, 11.0 - math.log(ppl)))
     return float(f"{score:.2f}")
 
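Note: the mapping uses the natural log, so the inline "~8"/"~3" comments are rough; checking it numerically:

    # Numeric check of the clamped log mapping in calculate_fluency.
    import math
    for ppl in (10, 100, 1000):
        print(ppl, round(max(1.0, min(10.0, 11.0 - math.log(ppl))), 2))
    # 10 -> 8.7, 100 -> 6.39, 1000 -> 4.09; the clamp bottoms out at 1.0
    # once ppl exceeds e**10 (about 22026).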
@@ -448,21 +519,27 @@ async def quick_transcribe(
     audio_bytes = await file.read()
 
     def _transcribe() -> tuple[str, bool]:
-        tmp_file = tempfile.NamedTemporaryFile(suffix=".…
+        tmp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
         try:
             tmp_file.write(audio_bytes)
             tmp_file.flush()
             tmp_file.close()
 
-            # …
-            …
-            …
+            # Preprocess: high-pass filter + normalize
+            audio_path = preprocess_audio(tmp_file.name)
+
+            # Combine vocab hint + previous context for better accuracy
+            if prompt:
+                initial_prompt_text = f"{FILIPINO_VOCAB_PROMPT}. {prompt}"
+            else:
+                initial_prompt_text = FILIPINO_VOCAB_PROMPT
 
             segments, info = model.transcribe(
-                tmp_file.name,
+                audio_path,
                 language="tl",  # Force Tagalog/Taglish to prevent Spanish detection
                 task="transcribe",
                 beam_size=5,
+                word_timestamps=True,  # Better alignment, fewer hallucinations
                 vad_filter=True,  # Re-enable VAD to help with silence (looping)
                 vad_parameters=dict(min_silence_duration_ms=500),
                 initial_prompt=initial_prompt_text,
@@ -470,12 +547,14 @@ async def quick_transcribe(
                 # Filters to reduce hallucinations/looping:
                 temperature=0.0,
                 compression_ratio_threshold=2.4,  # Filter loops
-                log_prob_threshold=-1.0,  # Filter uncertain nonsense
+                log_prob_threshold=-1.0,  # Filter uncertain nonsense
                 no_speech_threshold=0.6,  # Filter silence
             )
 
             texts = [seg.text.strip() for seg in segments if seg.text]
             transcript = " ".join(texts).strip()
+            # Post-process: fix known misrecognitions
+            transcript = post_process_transcript(transcript)
             # Consider any non-trivial transcript as speech
             has_speech = len(transcript) > 2
 
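Note: the preprocess_audio call introduced above feeds a high-pass-filtered file into model.transcribe; its filter stage can be sanity-checked on synthetic audio (a sketch, not part of the commit):

    # Verify the 80 Hz high-pass attenuates hum but keeps speech-band energy.
    import numpy as np
    from scipy.signal import butter, sosfilt

    sr = 16000
    t = np.arange(sr) / sr                       # 1 second of samples
    sos = butter(5, 80, btype="highpass", fs=sr, output="sos")

    rms = lambda x: float(np.sqrt(np.mean(x ** 2)))
    hum = np.sin(2 * np.pi * 50 * t)             # 50 Hz mains hum, below cutoff
    tone = np.sin(2 * np.pi * 440 * t)           # speech-band tone
    print(rms(sosfilt(sos, hum)))                # ~0.07: strongly attenuated
    print(rms(sosfilt(sos, tone)))               # ~0.707: passes essentially unchanged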
@@ -523,17 +602,22 @@ async def upload_audio_chunk(session_id: str, file: UploadFile = File(...)):
     def _call() -> tuple[str, float | None, list]:
         # Use global model instance
 
-        tmp_file = tempfile.NamedTemporaryFile(suffix=".…
+        tmp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
         try:
             tmp_file.write(audio_content)
             tmp_file.flush()
             tmp_file.close()
 
+            # Preprocess: high-pass filter + normalize
+            audio_path = preprocess_audio(tmp_file.name)
+
             segments, info = model.transcribe(
-                tmp_file.name,
+                audio_path,
                 language="tl",  # Force Tagalog to prevent translation to English
                 task="transcribe",  # Transcribe, don't translate to English
                 beam_size=5,  # Better accuracy
+                word_timestamps=True,  # Better alignment
+                initial_prompt=FILIPINO_VOCAB_PROMPT,  # Filipino vocab hint
                 vad_filter=False,  # Disabled to avoid cutting off speech
                 condition_on_previous_text=False,  # Faster, no context dependency
             )
@@ -546,6 +630,8 @@ async def upload_audio_chunk(session_id: str, file: UploadFile = File(...)):
                 texts.append(segment.text.strip())
 
             transcript_text = " ".join(texts).strip()
+            # Post-process: fix known misrecognitions
+            transcript_text = post_process_transcript(transcript_text)
 
             duration_seconds: float | None = None
             # Prefer model-reported duration when available.
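Note: on the "model-reported duration" comment, faster-whisper exposes the decoded audio length on the info object, so a pattern like the following works (a sketch; the byte-based estimate is an assumed fallback, not from the commit):

    # info.duration is faster-whisper's decoded audio length in seconds.
    segments, info = model.transcribe(audio_path)
    duration_seconds = info.duration if info and info.duration else None
    if duration_seconds is None:
        # Assumed fallback: estimate from raw PCM size (16-bit mono at 16 kHz).
        duration_seconds = len(audio_content) / (16000 * 2)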
requirements.txt CHANGED
@@ -1,11 +1,13 @@
-# Hugging Face Spaces specific requirements (CPU-only for free tier)
 fastapi
 uvicorn[standard]
 python-multipart
 faster-whisper
 numpy
 scipy
+pytest
+httpx
 zeroconf
+torch --index-url https://download.pytorch.org/whl/cpu
 transformers
-
-
+pyinstaller
+
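Note: pytest and httpx enable API-level smoke tests; a minimal sketch (route path, port, sample file, and response field are assumptions, not taken from the commit):

    # Hypothetical smoke test for the transcription endpoint using httpx.
    import httpx

    def test_transcribe_smoke():
        with open("sample.wav", "rb") as f:
            resp = httpx.post(
                "http://localhost:8000/quick-transcribe",  # assumed route
                files={"file": ("sample.wav", f, "audio/wav")},
                timeout=120.0,
            )
        assert resp.status_code == 200
        assert "transcript" in resp.json()  # assumed response field

Run it with pytest -q against a locally running server.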