Jay162005 committed on
Commit
006d672
·
verified ·
1 Parent(s): abe87ac

Upload 2 files

Browse files
Files changed (2) hide show
  1. main.py +103 -17
  2. requirements.txt +5 -3
main.py CHANGED
@@ -1,8 +1,11 @@
1
  import re
 
2
  import socket
3
  import sqlite3
4
  import datetime
5
  import numpy as np
 
 
6
  from fastapi import FastAPI, UploadFile, File
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from pydantic import BaseModel
@@ -25,6 +28,75 @@ SERVICE_PORT = 8000
25
  IS_CLOUD = os.environ.get("SPACE_ID") is not None or os.environ.get("RAILWAY_ENVIRONMENT") is not None
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def get_local_ip():
29
  """Get the local IP address of this machine."""
30
  try:
@@ -62,19 +134,19 @@ async def lifespan(app: FastAPI):
62
  if torch.cuda.is_available():
63
  print(f"🔧 GPU Device: {torch.cuda.get_device_name(0)}")
64
  model = WhisperModel(
65
- "base", # Fast loading
66
- device="cuda", # Use NVIDIA GPU
67
  compute_type="float16"
68
  )
69
  else:
70
- # CPU fallback (for cloud free tiers)
71
- print("🔧 Using CPU mode")
72
- model = WhisperModel("base", device="cpu", compute_type="int8")
73
- print("✅ Whisper model loaded successfully")
74
  except Exception as e:
75
  print(f"❌ Failed to load Whisper model: {e}")
76
- print("⚠️ Falling back to CPU/int8...")
77
- model = WhisperModel("small", device="cpu", compute_type="int8")
78
 
79
  # 2. Load RoBERTa (Tagalog)
80
  print("⏳ Loading RoBERTa (Tagalog) model...")
@@ -372,7 +444,6 @@ def calculate_fluency(text: str) -> float:
372
 
373
  # PPL 10 -> Score ~8
374
  # PPL 100 -> Score ~3
375
- import math
376
  score = max(1.0, min(10.0, 11.0 - math.log(ppl)))
377
  return float(f"{score:.2f}")
378
 
@@ -448,21 +519,27 @@ async def quick_transcribe(
448
  audio_bytes = await file.read()
449
 
450
  def _transcribe() -> tuple[str, bool]:
451
- tmp_file = tempfile.NamedTemporaryFile(suffix=".webm", delete=False)
452
  try:
453
  tmp_file.write(audio_bytes)
454
  tmp_file.flush()
455
  tmp_file.close()
456
 
457
- # Use the previous transcript as a prompt to guide Whisper
458
- # This fixes "amo" -> "ano" by giving context
459
- initial_prompt_text = prompt if prompt else None
 
 
 
 
 
460
 
461
  segments, info = model.transcribe(
462
- tmp_file.name,
463
  language="tl", # Force Tagalog/Taglish to prevent Spanish detection
464
  task="transcribe",
465
  beam_size=5,
 
466
  vad_filter=True, # Re-enable VAD to help with silence (looping)
467
  vad_parameters=dict(min_silence_duration_ms=500),
468
  initial_prompt=initial_prompt_text,
@@ -470,12 +547,14 @@ async def quick_transcribe(
470
  # Filters to reduce hallucinations/looping:
471
  temperature=0.0,
472
  compression_ratio_threshold=2.4, # Filter loops
473
- log_prob_threshold=-1.0, # Filter uncertain nonsense (fixed param name)
474
  no_speech_threshold=0.6, # Filter silence
475
  )
476
 
477
  texts = [seg.text.strip() for seg in segments if seg.text]
478
  transcript = " ".join(texts).strip()
 
 
479
  # Consider any non-trivial transcript as speech
480
  has_speech = len(transcript) > 2
481
 
@@ -523,17 +602,22 @@ async def upload_audio_chunk(session_id: str, file: UploadFile = File(...)):
523
  def _call() -> tuple[str, float | None, list]:
524
  # Use global model instance
525
 
526
- tmp_file = tempfile.NamedTemporaryFile(suffix=".webm", delete=False)
527
  try:
528
  tmp_file.write(audio_content)
529
  tmp_file.flush()
530
  tmp_file.close()
531
 
 
 
 
532
  segments, info = model.transcribe(
533
- tmp_file.name,
534
  language="tl", # Force Tagalog to prevent translation to English
535
  task="transcribe", # Transcribe, don't translate to English
536
  beam_size=5, # Better accuracy
 
 
537
  vad_filter=False, # Disabled to avoid cutting off speech
538
  condition_on_previous_text=False, # Faster, no context dependency
539
  )
@@ -546,6 +630,8 @@ async def upload_audio_chunk(session_id: str, file: UploadFile = File(...)):
546
  texts.append(segment.text.strip())
547
 
548
  transcript_text = " ".join(texts).strip()
 
 
549
 
550
  duration_seconds: float | None = None
551
  # Prefer model-reported duration when available.
 
1
  import re
2
+ import math
3
  import socket
4
  import sqlite3
5
  import datetime
6
  import numpy as np
7
+ from scipy.signal import butter, sosfilt
8
+ from scipy.io import wavfile
9
  from fastapi import FastAPI, UploadFile, File
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from pydantic import BaseModel
 
28
  IS_CLOUD = os.environ.get("SPACE_ID") is not None or os.environ.get("RAILWAY_ENVIRONMENT") is not None
29
 
30
 
31
+ # ──────────────────────────────────────────────────────────────
32
+ # Filipino / Taglish vocabulary hint for Whisper initial_prompt.
33
+ # Priming the decoder with real Filipino words dramatically
34
+ # reduces mis-hearings such as "ano" being transcribed as "amo".
35
+ # ──────────────────────────────────────────────────────────────
36
+ FILIPINO_VOCAB_PROMPT = (
37
+ "Ang, ang, mga, na, sa, ng, ko, mo, niya, namin, nila, "
38
+ "ano, ito, iyon, siya, kami, tayo, sila, "
39
+ "hindi, oo, wala, meron, paano, bakit, "
40
+ "kasi, diba, yung, naman, pala, talaga, "
41
+ "po, ho, kuya, ate, "
42
+ "maganda, mabuti, masaya, malaki, maliit, "
43
+ "kumain, uminom, pumunta, naglaro, natulog, "
44
+ "paaralan, bahay, trabaho, kaibigan, pamilya, "
45
+ "salamat, magandang, umaga, hapon, gabi"
46
+ )
47
+
48
+ # Known Whisper misrecognitions for Filipino — extend as needed.
49
+ WHISPER_CORRECTIONS: dict[str, str] = {
50
+ "amo": "ano",
51
+ "cayo": "kayo",
52
+ "yong": "yung",
53
+ "cami": "kami",
54
+ "cum": "kum",
55
+ "naman naman": "naman",
56
+ }
57
+
58
+
59
+ def post_process_transcript(text: str) -> str:
60
+ """Fix known Whisper misrecognitions for Filipino."""
61
+ # Multi-word replacements first
62
+ for wrong, right in WHISPER_CORRECTIONS.items():
63
+ if " " in wrong:
64
+ text = re.sub(re.escape(wrong), right, text, flags=re.IGNORECASE)
65
+
66
+ words = text.split()
67
+ corrected = []
68
+ for word in words:
69
+ lower = word.lower()
70
+ if lower in WHISPER_CORRECTIONS:
71
+ corrected.append(WHISPER_CORRECTIONS[lower])
72
+ else:
73
+ corrected.append(word)
74
+ return " ".join(corrected)
75
+
76
+
77
+ def preprocess_audio(file_path: str) -> str:
78
+ """Apply high-pass filter + normalization to reduce background noise."""
79
+ try:
80
+ sr, audio = wavfile.read(file_path)
81
+ audio = audio.astype(np.float32) / 32768.0
82
+
83
+ # High-pass at 80 Hz — removes low rumble / AC hum
84
+ sos = butter(5, 80, btype="highpass", fs=sr, output="sos")
85
+ audio = sosfilt(sos, audio)
86
+
87
+ # Peak-normalize to 0.95
88
+ peak = np.max(np.abs(audio))
89
+ if peak > 0:
90
+ audio = audio / peak * 0.95
91
+
92
+ processed_path = file_path.replace(".wav", "_clean.wav")
93
+ wavfile.write(processed_path, sr, (audio * 32767).astype(np.int16))
94
+ return processed_path
95
+ except Exception as e:
96
+ print(f"⚠️ Audio preprocessing failed (using raw): {e}")
97
+ return file_path
98
+
99
+
100
  def get_local_ip():
101
  """Get the local IP address of this machine."""
102
  try:
 
134
  if torch.cuda.is_available():
135
  print(f"🔧 GPU Device: {torch.cuda.get_device_name(0)}")
136
  model = WhisperModel(
137
+ "small", # 3x more accurate than 'base'
138
+ device="cuda",
139
  compute_type="float16"
140
  )
141
  else:
142
+ # CPU / free HF Space — small+int8 fits in ~2 GB RAM
143
+ print("🔧 Using CPU mode (small + int8)")
144
+ model = WhisperModel("small", device="cpu", compute_type="int8")
145
+ print("✅ Whisper 'small' model loaded successfully")
146
  except Exception as e:
147
  print(f"❌ Failed to load Whisper model: {e}")
148
+ print("⚠️ Falling back to base/int8...")
149
+ model = WhisperModel("base", device="cpu", compute_type="int8")
150
 
151
  # 2. Load RoBERTa (Tagalog)
152
  print("⏳ Loading RoBERTa (Tagalog) model...")
 
444
 
445
  # PPL 10 -> Score ~8
446
  # PPL 100 -> Score ~3
 
447
  score = max(1.0, min(10.0, 11.0 - math.log(ppl)))
448
  return float(f"{score:.2f}")
449
 
 
519
  audio_bytes = await file.read()
520
 
521
  def _transcribe() -> tuple[str, bool]:
522
+ tmp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
523
  try:
524
  tmp_file.write(audio_bytes)
525
  tmp_file.flush()
526
  tmp_file.close()
527
 
528
+ # Preprocess: high-pass filter + normalize
529
+ audio_path = preprocess_audio(tmp_file.name)
530
+
531
+ # Combine vocab hint + previous context for better accuracy
532
+ if prompt:
533
+ initial_prompt_text = f"{FILIPINO_VOCAB_PROMPT}. {prompt}"
534
+ else:
535
+ initial_prompt_text = FILIPINO_VOCAB_PROMPT
536
 
537
  segments, info = model.transcribe(
538
+ audio_path,
539
  language="tl", # Force Tagalog/Taglish to prevent Spanish detection
540
  task="transcribe",
541
  beam_size=5,
542
+ word_timestamps=True, # Better alignment, fewer hallucinations
543
  vad_filter=True, # Re-enable VAD to help with silence (looping)
544
  vad_parameters=dict(min_silence_duration_ms=500),
545
  initial_prompt=initial_prompt_text,
 
547
  # Filters to reduce hallucinations/looping:
548
  temperature=0.0,
549
  compression_ratio_threshold=2.4, # Filter loops
550
+ log_prob_threshold=-1.0, # Filter uncertain nonsense
551
  no_speech_threshold=0.6, # Filter silence
552
  )
553
 
554
  texts = [seg.text.strip() for seg in segments if seg.text]
555
  transcript = " ".join(texts).strip()
556
+ # Post-process: fix known misrecognitions
557
+ transcript = post_process_transcript(transcript)
558
  # Consider any non-trivial transcript as speech
559
  has_speech = len(transcript) > 2
560
 
 
602
  def _call() -> tuple[str, float | None, list]:
603
  # Use global model instance
604
 
605
+ tmp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
606
  try:
607
  tmp_file.write(audio_content)
608
  tmp_file.flush()
609
  tmp_file.close()
610
 
611
+ # Preprocess: high-pass filter + normalize
612
+ audio_path = preprocess_audio(tmp_file.name)
613
+
614
  segments, info = model.transcribe(
615
+ audio_path,
616
  language="tl", # Force Tagalog to prevent translation to English
617
  task="transcribe", # Transcribe, don't translate to English
618
  beam_size=5, # Better accuracy
619
+ word_timestamps=True, # Better alignment
620
+ initial_prompt=FILIPINO_VOCAB_PROMPT, # Filipino vocab hint
621
  vad_filter=False, # Disabled to avoid cutting off speech
622
  condition_on_previous_text=False, # Faster, no context dependency
623
  )
 
630
  texts.append(segment.text.strip())
631
 
632
  transcript_text = " ".join(texts).strip()
633
+ # Post-process: fix known misrecognitions
634
+ transcript_text = post_process_transcript(transcript_text)
635
 
636
  duration_seconds: float | None = None
637
  # Prefer model-reported duration when available.
requirements.txt CHANGED
@@ -1,11 +1,13 @@
1
- # Hugging Face Spaces specific requirements (CPU-only for free tier)
2
  fastapi
3
  uvicorn[standard]
4
  python-multipart
5
  faster-whisper
6
  numpy
7
  scipy
 
 
8
  zeroconf
 
9
  transformers
10
- --extra-index-url https://download.pytorch.org/whl/cpu
11
- torch
 
 
1
  fastapi
2
  uvicorn[standard]
3
  python-multipart
4
  faster-whisper
5
  numpy
6
  scipy
7
+ pytest
8
+ httpx
9
  zeroconf
10
+ torch --index-url https://download.pytorch.org/whl/cpu
11
  transformers
12
+ pyinstaller
13
+