Commit 369d759
Parent(s): bd1a487
OptimezedWhisperWorking

Changed files:
- Dockerfile (+7 -2)
- app.py (+73 -5)
Dockerfile
CHANGED

@@ -2,7 +2,12 @@ FROM python:3.11-slim
 
 ENV PYTHONUNBUFFERED=1 PIP_NO_CACHE_DIR=1 HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH PORT=7860 \
-    WHISPER_MODEL=openai/whisper-large-v3-turbo
+    WHISPER_MODEL=openai/whisper-large-v3-turbo \
+    OMP_NUM_THREADS=2 MKL_NUM_THREADS=2 OPENBLAS_NUM_THREADS=2 NUMEXPR_NUM_THREADS=2 \
+    TOKENIZERS_PARALLELISM=false \
+    WHISPER_CPU_THREADS=2 WHISPER_CPU_INTEROP_THREADS=1 \
+    WHISPER_CHUNK_LENGTH_S=30 WHISPER_BATCH_SIZE=8 WHISPER_NUM_BEAMS=1 \
+    WHISPER_ENABLE_PROMPT=0 WHISPER_PRELOAD_ON_START=1
 
 RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
     && rm -rf /var/lib/apt/lists/* \

@@ -18,4 +23,4 @@ COPY --chown=user app.py ./
 COPY --chown=user extractors/ ./extractors/
 
 EXPOSE 7860
-CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--
+CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--worker-class", "sync", "--workers", "1", "--timeout", "180", "--preload", "app:app"]
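The new ENV block pins every numeric backend (OpenMP, MKL, OpenBLAS, numexpr) plus the app's own WHISPER_CPU_THREADS to two threads, so the single gunicorn worker cannot oversubscribe a small CPU Space. A minimal probe like the following can confirm the caps are visible inside the container; it is a sketch for checking only, not part of the commit:

import os
import torch

for var in ("OMP_NUM_THREADS", "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS",
            "NUMEXPR_NUM_THREADS", "WHISPER_CPU_THREADS", "WHISPER_CPU_INTEROP_THREADS"):
    print(var, "=", os.getenv(var))  # values baked in by the ENV block above

# After app.py's configure_torch_for_cpu() has run, these should report the capped values.
print("torch intra-op threads:", torch.get_num_threads())
print("torch inter-op threads:", torch.get_num_interop_threads())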
app.py
CHANGED

@@ -31,6 +31,7 @@ HF_TOKEN = os.getenv("HF_TOKEN")
 
 _WHISPER_MODEL: Optional[Any] = None
 _WHISPER_PROCESSOR: Optional[Any] = None
+_TORCH_CPU_CONFIGURED = False
 
 
 app = Flask(__name__)

@@ -140,18 +141,48 @@ TEST_PHRASES = [
 ]
 
 
+def env_flag(name: str, default: bool = False) -> bool:
+    """Parses a boolean flag from environment variables."""
+    raw = os.getenv(name)
+    if raw is None:
+        return default
+    return raw.strip().lower() in {"1", "true", "yes", "on"}
+
+
+def configure_torch_for_cpu() -> None:
+    """Configures torch for CPU inference."""
+    global _TORCH_CPU_CONFIGURED
+    if _TORCH_CPU_CONFIGURED:
+        return
+
+    cpu_count = max(1, os.cpu_count() or 1)
+    num_threads = int(os.getenv("WHISPER_CPU_THREADS", str(cpu_count)))
+    num_threads = max(1, min(num_threads, cpu_count))
+
+    interop_threads = int(os.getenv("WHISPER_CPU_INTEROP_THREADS", "1"))
+    interop_threads = max(1, interop_threads)
+
+    torch.set_num_threads(num_threads)
+    torch.set_num_interop_threads(interop_threads)
+    torch.backends.mkldnn.enabled = True
+    _TORCH_CPU_CONFIGURED = True
+    print(f"[INFO] torch cpu threads configured: intra={num_threads}, interop={interop_threads}")
+
+
 def get_whisper_pipeline() -> Any:
     """Returns the Whisper pipeline (lazy loading)."""
     global _WHISPER_MODEL, _WHISPER_PROCESSOR
+    configure_torch_for_cpu()
 
     if _WHISPER_MODEL is None:
         from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 
         model_id = os.getenv("WHISPER_MODEL", "openai/whisper-large-v3-turbo")
+        torch_dtype = torch.float32
 
         model = AutoModelForSpeechSeq2Seq.from_pretrained(
             model_id,
-
+            torch_dtype=torch_dtype,
             low_cpu_mem_usage=True,
             use_safetensors=True,
         )

@@ -159,13 +190,18 @@ def get_whisper_pipeline() -> Any:
 
         _WHISPER_PROCESSOR = AutoProcessor.from_pretrained(model_id)
 
+        chunk_length_s = int(os.getenv("WHISPER_CHUNK_LENGTH_S", "30"))
+        batch_size = int(os.getenv("WHISPER_BATCH_SIZE", "8"))
+
         _WHISPER_MODEL = pipeline(
             "automatic-speech-recognition",
             model=model,
             tokenizer=_WHISPER_PROCESSOR.tokenizer,
             feature_extractor=_WHISPER_PROCESSOR.feature_extractor,
-
+            torch_dtype=torch_dtype,
             device="cpu",
+            chunk_length_s=max(0, chunk_length_s),
+            batch_size=max(1, batch_size),
         )
 
     return _WHISPER_MODEL

@@ -306,22 +342,38 @@ def transcribe_audio_text(audio_path: str, suppliers: list[str] | None = None, u
 
     try:
         t0 = time.time()
+        pipeline_t0 = time.time()
         pipe = get_whisper_pipeline()
+        print(f"[TIMINGS] whisper_pipeline_ready: {round(time.time() - pipeline_t0, 3)}s")
 
         generate_kwargs = {
             "language": "russian",
             "task": "transcribe",
+            "num_beams": int(os.getenv("WHISPER_NUM_BEAMS", "1")),
+            "do_sample": False,
+            "condition_on_prev_text": False,
         }
 
-
-
+        use_prompt = env_flag("WHISPER_ENABLE_PROMPT", default=False)
+        prompt = ""
+        if use_prompt:
+            max_items = int(os.getenv("WHISPER_PROMPT_MAX_ITEMS", "12"))
+            prompt = build_whisper_prompt(suppliers or [], users or [], max_items=max_items)
+
+        if use_prompt and prompt and _WHISPER_PROCESSOR is not None:
             try:
                 generate_kwargs["prompt_ids"] = _WHISPER_PROCESSOR.get_prompt_ids(prompt, return_tensors="pt")
                 print(f"[TIMINGS] whisper_prompt_enabled: suppliers={len(suppliers or [])}, users={len(users or [])}")
             except Exception as prompt_error:
                 print(f"[WARN] Whisper prompt disabled: {prompt_error}")
+        elif not use_prompt:
+            print("[TIMINGS] whisper_prompt_disabled")
+
+        infer_t0 = time.time()
+        with torch.inference_mode():
+            result = pipe(audio_path, generate_kwargs=generate_kwargs)
+        print(f"[TIMINGS] whisper_infer_only: {round(time.time() - infer_t0, 3)}s")
 
-        result = pipe(audio_path, generate_kwargs=generate_kwargs)
         text = result.get("text", "").strip()
         elapsed = round(time.time() - t0, 3)
         print(f"[TIMINGS] whisper_transcribe: {elapsed}s")

@@ -401,6 +453,22 @@ def parse_context(raw: str | None) -> dict[str, Any]:
     return {}
 
 
+def preload_whisper_if_enabled() -> None:
+    """Preloads Whisper at process start to remove the cold start from the first request."""
+    if not env_flag("WHISPER_PRELOAD_ON_START", default=True):
+        return
+
+    started = time.time()
+    try:
+        get_whisper_pipeline()
+        print(f"[TIMINGS] whisper_preload: {round(time.time() - started, 3)}s")
+    except Exception as preload_error:
+        print(f"[WARN] Whisper preload failed: {preload_error}")
+
+
+preload_whisper_if_enabled()
+
+
 # ============================================================================
 # ENDPOINTS
 # ============================================================================
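Taken together, the app.py changes amount to: float32 weights, explicit chunked long-form decoding (chunk_length_s / batch_size), single-beam greedy generation, an opt-in prompt, and inference wrapped in torch.inference_mode(). A condensed standalone sketch of the same pipeline configuration follows; the audio path sample.wav and the inline env-var fallbacks are illustrative stand-ins, not taken from the commit:

import os
import torch
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model=os.getenv("WHISPER_MODEL", "openai/whisper-large-v3-turbo"),
    device="cpu",
    torch_dtype=torch.float32,  # same dtype the commit pins
    chunk_length_s=int(os.getenv("WHISPER_CHUNK_LENGTH_S", "30")),  # long-form chunking window
    batch_size=int(os.getenv("WHISPER_BATCH_SIZE", "8")),  # chunks decoded per forward pass
)

with torch.inference_mode():  # skip autograd bookkeeping during decoding
    out = asr("sample.wav", generate_kwargs={
        "language": "russian",
        "task": "transcribe",
        "num_beams": int(os.getenv("WHISPER_NUM_BEAMS", "1")),  # 1 = greedy, the new default
        "do_sample": False,
    })
print(out["text"])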
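One behavioral detail worth noting: preload_whisper_if_enabled() runs at module import, and the new gunicorn CMD adds --preload, so the model is loaded once while the app module is imported in the master process, before the worker starts serving, rather than on the first request. Whether preloading (and the prompt) is enabled is decided by env_flag, whose truthy tokens are exactly {"1", "true", "yes", "on"}. A small self-contained check of that truth table; the helper is copied from the diff above so the snippet runs on its own:

import os

def env_flag(name: str, default: bool = False) -> bool:
    # Copied verbatim from the app.py hunk above.
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}

os.environ["WHISPER_PRELOAD_ON_START"] = "0"
print(env_flag("WHISPER_PRELOAD_ON_START", default=True))  # False: "0" is not a truthy token
os.environ["WHISPER_PRELOAD_ON_START"] = "On"
print(env_flag("WHISPER_PRELOAD_ON_START"))                # True: matching is case-insensitive
del os.environ["WHISPER_PRELOAD_ON_START"]
print(env_flag("WHISPER_PRELOAD_ON_START", default=True))  # True: unset falls back to the default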