Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,6 +7,7 @@ import sys
|
|
| 7 |
import logging
|
| 8 |
import re
|
| 9 |
import subprocess
|
|
|
|
| 10 |
from contextlib import asynccontextmanager
|
| 11 |
from datetime import datetime
|
| 12 |
from fastapi import FastAPI, HTTPException, Depends, Security
|
|
@@ -39,7 +40,7 @@ MODEL = stable_whisper.load_faster_whisper(
|
|
| 39 |
'kotoba-tech/kotoba-whisper-bilingual-v1.0-faster',
|
| 40 |
device='cpu',
|
| 41 |
compute_type='int8',
|
| 42 |
-
cpu_threads=
|
| 43 |
num_workers=1,
|
| 44 |
)
|
| 45 |
logger.info("Model loaded successfully.")
|
|
@@ -48,6 +49,11 @@ logger.info("Model loaded successfully.")
|
|
| 48 |
queue: asyncio.Queue = asyncio.Queue()
|
| 49 |
jobs: dict = {}
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
# --- 5. Lifespan (replaces deprecated @app.on_event) ---
|
| 52 |
@asynccontextmanager
|
| 53 |
async def lifespan(app: FastAPI):
|
|
@@ -308,6 +314,16 @@ async def process_job(job_id: str, video_url: str):
|
|
| 308 |
# We get English output directly — no Google Translate needed for EN.
|
| 309 |
jobs[job_id].update({"status": "AI Transcribing & Translating JA→EN...", "progress": 40})
|
| 310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
def process_ai():
|
| 312 |
result = MODEL.transcribe(
|
| 313 |
v_path,
|
|
@@ -318,12 +334,13 @@ async def process_job(job_id: str, video_url: str):
|
|
| 318 |
# --- stable_whisper faster-whisper wrapper params ---
|
| 319 |
vad=True, # Silero VAD: skip silence/music beds
|
| 320 |
vad_threshold=0.35,
|
|
|
|
| 321 |
# --- passed through to faster_whisper.WhisperModel.transcribe ---
|
| 322 |
beam_size=2, # balanced speed/accuracy on CPU
|
| 323 |
temperature=0, # deterministic decode
|
| 324 |
condition_on_previous_text=False, # prevents hallucination snowballing
|
| 325 |
no_speech_threshold=0.3, # catch quiet disfluencies (moans, sighs)
|
| 326 |
-
log_prob_threshold=-2.0,
|
| 327 |
)
|
| 328 |
|
| 329 |
# --- Anime-optimised regrouping ---
|
|
@@ -347,7 +364,8 @@ async def process_job(job_id: str, video_url: str):
|
|
| 347 |
|
| 348 |
result.to_srt_vtt(en_vtt, segment_level=True, word_level=False)
|
| 349 |
|
| 350 |
-
|
|
|
|
| 351 |
|
| 352 |
if not os.path.exists(en_vtt):
|
| 353 |
raise RuntimeError("Transcription finished but EN VTT was not created")
|
|
|
|
| 7 |
import logging
|
| 8 |
import re
|
| 9 |
import subprocess
|
| 10 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 11 |
from contextlib import asynccontextmanager
|
| 12 |
from datetime import datetime
|
| 13 |
from fastapi import FastAPI, HTTPException, Depends, Security
|
|
|
|
| 40 |
'kotoba-tech/kotoba-whisper-bilingual-v1.0-faster',
|
| 41 |
device='cpu',
|
| 42 |
compute_type='int8',
|
| 43 |
+
cpu_threads=4,
|
| 44 |
num_workers=1,
|
| 45 |
)
|
| 46 |
logger.info("Model loaded successfully.")
|
|
|
|
| 49 |
queue: asyncio.Queue = asyncio.Queue()
|
| 50 |
jobs: dict = {}
|
| 51 |
|
| 52 |
+
# Dedicated single-thread executor for Whisper.
|
| 53 |
+
# asyncio's default ThreadPoolExecutor can be starved by concurrent requests;
|
| 54 |
+
# a pinned executor guarantees Whisper always has its own OS thread.
|
| 55 |
+
WHISPER_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix='whisper')
|
| 56 |
+
|
| 57 |
# --- 5. Lifespan (replaces deprecated @app.on_event) ---
|
| 58 |
@asynccontextmanager
|
| 59 |
async def lifespan(app: FastAPI):
|
|
|
|
| 314 |
# We get English output directly — no Google Translate needed for EN.
|
| 315 |
jobs[job_id].update({"status": "AI Transcribing & Translating JA→EN...", "progress": 40})
|
| 316 |
|
| 317 |
+
def _whisper_progress(transcribed: float, total: float) -> None:
    """Publish live Whisper progress on the current job record.

    Handed to ``MODEL.transcribe`` as ``progress_callback``. Maps the
    transcribed/total ratio onto the 40-58% band of the job's overall
    progress and refreshes the human-readable status line.

    Args:
        transcribed: Seconds of audio processed so far.
        total: Total audio duration in seconds. Nothing is written unless
            this is greater than zero.

    Relies on ``jobs`` and ``job_id`` from the enclosing scope.
    NOTE(review): presumably invoked on the thread that runs
    ``MODEL.transcribe`` -- confirm against stable_whisper.
    """
    # `not total > 0` (rather than `total <= 0`) also rejects NaN.
    if not total > 0:
        return

    # Clamp once so the percentage and the status text always agree and
    # never leave the 40-58% band, even if the callback overshoots.
    done = min(max(transcribed, 0.0), total)

    # Single update() call, matching how the job record is written
    # elsewhere, so pollers never see a new progress with a stale status.
    jobs[job_id].update({
        "progress": 40 + int((done / total) * 18),
        "status": (
            f"Transcribing & Translating "
            f"({int(done)}s / {int(total)}s)..."
        ),
    })
|
| 326 |
+
|
| 327 |
def process_ai():
|
| 328 |
result = MODEL.transcribe(
|
| 329 |
v_path,
|
|
|
|
| 334 |
# --- stable_whisper faster-whisper wrapper params ---
|
| 335 |
vad=True, # Silero VAD: skip silence/music beds
|
| 336 |
vad_threshold=0.35,
|
| 337 |
+
progress_callback=_whisper_progress, # live progress updates
|
| 338 |
# --- passed through to faster_whisper.WhisperModel.transcribe ---
|
| 339 |
beam_size=2, # balanced speed/accuracy on CPU
|
| 340 |
temperature=0, # deterministic decode
|
| 341 |
condition_on_previous_text=False, # prevents hallucination snowballing
|
| 342 |
no_speech_threshold=0.3, # catch quiet disfluencies (moans, sighs)
|
| 343 |
+
log_prob_threshold=-2.0, # keep low-confidence disfluency tokens
|
| 344 |
)
|
| 345 |
|
| 346 |
# --- Anime-optimised regrouping ---
|
|
|
|
| 364 |
|
| 365 |
result.to_srt_vtt(en_vtt, segment_level=True, word_level=False)
|
| 366 |
|
| 367 |
+
loop = asyncio.get_event_loop()
|
| 368 |
+
await loop.run_in_executor(WHISPER_EXECUTOR, process_ai)
|
| 369 |
|
| 370 |
if not os.path.exists(en_vtt):
|
| 371 |
raise RuntimeError("Transcription finished but EN VTT was not created")
|