Spaces:

MIP-Tech
/

Speach-To-Text

Sleeping

MIP-Tech Claude Sonnet 4.6 committed on 22 days ago

Commit

9308938

1 Parent(s): 27216ff

Fix deprecation warnings, CPU batch size, and root route

- torch_dtype → dtype in from_pretrained and pipeline (transformers 4.49+)
- Move max_new_tokens to generate_kwargs in pipeline constructor to
silence generation_config conflict warning
- batch_size: 16 → 2 on CPU (16 parallel chunks waste RAM without being faster),
keep 8 on CUDA
- Add GET / → redirect to /docs so HF Space health probes return 200
instead of 404

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show

api/main.py +6 -1
src/inference/transcribe.py +8 -5

api/main.py CHANGED Viewed

@@ -4,7 +4,7 @@ from contextlib import asynccontextmanager
 from fastapi import FastAPI, Request
 from fastapi.concurrency import run_in_threadpool
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
 from api.config import settings
 from api.routers.transcription import router as transcription_router
@@ -81,6 +81,11 @@ app.add_middleware(
 app.include_router(transcription_router)
 @app.get("/health", response_model=HealthResponse, tags=["system"])
 async def health(request: Request) -> HealthResponse:
     transcriber = getattr(request.app.state, "transcriber", None)

 from fastapi import FastAPI, Request
 from fastapi.concurrency import run_in_threadpool
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse, RedirectResponse
 from api.config import settings
 from api.routers.transcription import router as transcription_router
 app.include_router(transcription_router)
+@app.get("/", include_in_schema=False)
+async def root() -> RedirectResponse:
+    return RedirectResponse(url="/docs")
 @app.get("/health", response_model=HealthResponse, tags=["system"])
 async def health(request: Request) -> HealthResponse:
     transcriber = getattr(request.app.state, "transcriber", None)

src/inference/transcribe.py CHANGED Viewed

@@ -24,23 +24,26 @@ class WhisperTranscriber:
         try:
             from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
             self.processor = AutoProcessor.from_pretrained(model_path)
             self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
                 model_path,
-                torch_dtype=torch.float16 if "cuda" in self.device else torch.float32,
                 low_cpu_mem_usage=True,
             ).to(self.device)
             self.pipe = pipeline(
                 "automatic-speech-recognition",
                 model=self.model,
                 tokenizer=self.processor.tokenizer,
                 feature_extractor=self.processor.feature_extractor,
-                max_new_tokens=128,
                 chunk_length_s=30,
-                batch_size=16,
                 return_timestamps=True,
-                torch_dtype=torch.float16 if "cuda" in self.device else torch.float32,
                 device=self.device,
             )
         except Exception as e:
             logger.error("Failed to load Whisper backend: %s", e)

         try:
             from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
             self.processor = AutoProcessor.from_pretrained(model_path)
+            dtype = torch.float16 if "cuda" in self.device else torch.float32
             self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
                 model_path,
+                dtype=dtype,
                 low_cpu_mem_usage=True,
             ).to(self.device)
+            # batch_size=16 is only useful on GPU; CPU benefits from 1-2 chunks at a time
+            batch_size = 8 if "cuda" in self.device else 2
             self.pipe = pipeline(
                 "automatic-speech-recognition",
                 model=self.model,
                 tokenizer=self.processor.tokenizer,
                 feature_extractor=self.processor.feature_extractor,
                 chunk_length_s=30,
+                batch_size=batch_size,
                 return_timestamps=True,
+                dtype=dtype,
                 device=self.device,
+                generate_kwargs={"max_new_tokens": 128},
             )
         except Exception as e:
             logger.error("Failed to load Whisper backend: %s", e)