Spaces:

sohamchitimali
/

whisper

Running

App Files Files Community

sohamchitimali committed on Sep 7, 2025

Commit

b997d93

1 Parent(s): 55bd939

update

Browse files

Files changed (2) hide show

app.py +19 -209
requirements.txt +6 -18

app.py CHANGED Viewed

@@ -1,8 +1,5 @@
 import os
 import tempfile
-import glob
-import shutil
-import subprocess
 import torch
 import whisper
 import gradio as gr
@@ -10,34 +7,12 @@ from fastapi import FastAPI, File, Form, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 import uvicorn
-# -----------------------
-# Configuration / tuning
-# -----------------------
-# Use all CPU cores for PyTorch (must be set before loading the model)
-NUM_CPU = os.cpu_count() or 1
-torch.set_num_threads(NUM_CPU)
-torch.set_num_interop_threads(NUM_CPU)
 MODEL_NAME = os.getenv("WHISPER_MODEL", "base")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# On CPU, always use fp16=False
-FP16 = (DEVICE == "cuda")
-# ffmpeg presence check (used to normalize & chunk audio)
-FFMPEG = shutil.which("ffmpeg")
-# chunk duration (seconds) - smaller chunks help with long audio on CPU
-CHUNK_SECONDS = int(os.getenv("WHUNK_CHUNK_SECONDS", "30"))
-# -----------------------
-# Load model (after threads set)
-# -----------------------
 MODEL = whisper.load_model(MODEL_NAME, device=DEVICE)
-# -----------------------
-# FastAPI app
-# -----------------------
 app = FastAPI(title="Whisper API")
 app.add_middleware(
@@ -48,192 +23,38 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# -----------------------
-# Utilities
-# -----------------------
 def _save_temp(upload: UploadFile) -> str:
-    """Save UploadFile to a temp file and return path."""
     suffix = os.path.splitext(upload.filename or "audio")[1] or ".wav"
     with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
         tmp.write(upload.file.read())
         return tmp.name
-def _ensure_wav_mono_16k(src_path: str) -> str:
-    """
-    Use ffmpeg to convert src_path to mono 16k WAV.
-    If ffmpeg not present, return src_path (best-effort).
-    Returns path to standardized wav (temp file) that caller must remove.
-    """
-    if not FFMPEG:
-        # ffmpeg not available — rely on caller-provided file (best-effort)
-        return src_path
-    out_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    out_path = out_tmp.name
-    out_tmp.close()
-    cmd = [
-        FFMPEG,
-        "-y",
-        "-i", src_path,
-        "-ar", "16000",      # sample rate 16 kHz
-        "-ac", "1",          # mono
-        "-sample_fmt", "s16",# PCM16
-        out_path
-    ]
-    try:
-        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-        return out_path
-    except Exception:
-        # conversion failed — cleanup and fallback
-        try:
-            os.remove(out_path)
-        except Exception:
-            pass
-        return src_path
-def _split_into_chunks(wav_path: str, chunk_seconds: int = CHUNK_SECONDS) -> list:
-    """
-    Split a WAV into chunk files using ffmpeg segmenter.
-    Returns list of chunk file paths (sorted).
-    If ffmpeg missing or splitting fails, returns [wav_path].
-    Caller must remove chunk files after use.
-    """
-    if not FFMPEG:
-        return [wav_path]
-    tmpdir = tempfile.mkdtemp(prefix="whisper_chunks_")
-    # segment into re-encoded WAVs to guarantee compatibility
-    out_pattern = os.path.join(tmpdir, "chunk_%03d.wav")
-    cmd = [
-        FFMPEG,
-        "-y",
-        "-i", wav_path,
-        "-ar", "16000",
-        "-ac", "1",
-        "-f", "segment",
-        "-segment_time", str(chunk_seconds),
-        "-reset_timestamps", "1",
-        out_pattern
-    ]
-    try:
-        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-        # collect chunk files
-        chunks = sorted(glob.glob(os.path.join(tmpdir, "chunk_*.wav")))
-        return chunks or [wav_path]
-    except Exception:
-        # on failure, cleanup and fallback
-        shutil.rmtree(tmpdir, ignore_errors=True)
-        return [wav_path]
-def _cleanup_paths(paths: list):
-    for p in paths:
-        try:
-            if os.path.isdir(p):
-                shutil.rmtree(p, ignore_errors=True)
-            else:
-                os.remove(p)
-        except Exception:
-            pass
-# -----------------------
-# Transcription core
-# -----------------------
-def transcribe_file(path: str, task: str = "transcribe") -> dict:
-    """
-    Transcribe (or translate) the provided file path.
-    This:
-     - normalizes audio to mono-16k WAV (via ffmpeg if available),
-     - splits into CHUNK_SECONDS segments (if ffmpeg present),
-     - transcribes segments sequentially with whisper and concatenates text.
-    Returns a dict: {"text": ..., "language": ..., "duration": ...}
-    """
-    temp_to_cleanup = []
-    try:
-        # Ensure WAV 16k mono
-        std_wav = _ensure_wav_mono_16k(path)
-        if std_wav != path:
-            temp_to_cleanup.append(std_wav)
-        # Split into chunks
-        chunks = _split_into_chunks(std_wav, CHUNK_SECONDS)
-        # if chunking created files in a tempdir, ensure that dir removed later
-        if len(chunks) > 1:
-            temp_to_cleanup.extend(chunks)
-            # note: we added chunk files individually; _split_into_chunks will have created a tmpdir.
-            # we'll remove the chunk files and directory in cleanup below.
-        full_text_parts = []
-        language_detected = None
-        duration_total = 0.0
-        for idx, cpath in enumerate(chunks):
-            # call model.transcribe on each chunk
-            # We use same task (transcribe/translate) and FP16 flag accordingly.
-            try:
-                result = MODEL.transcribe(cpath, task=task, language=None, fp16=FP16)
-            except Exception as e:
-                # If a chunk fails, try once with fp16=False (safe fallback)
-                try:
-                    result = MODEL.transcribe(cpath, task=task, language=None, fp16=False)
-                except Exception as e2:
-                    # give up on this chunk but continue
-                    result = {"text": "", "language": None, "duration": 0.0}
-            text = (result.get("text") or "").strip()
-            if text:
-                full_text_parts.append(text)
-            # populate top-level language/duration from the last successful chunk if available
-            if not language_detected and result.get("language"):
-                language_detected = result.get("language")
-            try:
-                duration_total += float(result.get("duration") or 0.0)
-            except Exception:
-                pass
-        # join with sensible spacing
-        full_text = " ".join([p for p in full_text_parts if p])
-        return {
-            "text": full_text.strip(),
-            "language": language_detected or "",
-            "duration": duration_total
-        }
-    finally:
-        # cleanup any temp files and chunk dirs
-        _cleanup_paths(list(set(temp_to_cleanup)))
-# -----------------------
-# FastAPI endpoints
-# -----------------------
 @app.post("/api/transcribe")
 async def transcribe(audio: UploadFile = File(...)):
     if not audio:
         raise HTTPException(status_code=400, detail="No audio provided")
     path = _save_temp(audio)
     try:
-        result = transcribe_file(path, task="transcribe")
         return {
             "text": result.get("text", "").strip(),
             "language": result.get("language", ""),
             "duration": float(result.get("duration") or 0.0)
         }
     finally:
-        try:
-            os.remove(path)
-        except Exception:
-            pass
 @app.post("/api/translate")
 async def translate(audio: UploadFile = File(...), target_language: str = Form(...)):
     if not audio:
         raise HTTPException(status_code=400, detail="No audio provided")
-    if target_language.strip().lower() not in {"en", "eng", "english"}:
         raise HTTPException(status_code=400, detail="Whisper only translates to English")
     path = _save_temp(audio)
     try:
-        result = transcribe_file(path, task="translate")
         return {
             "text": result.get("text", "").strip(),
             "source_language": result.get("language", ""),
@@ -241,46 +62,35 @@ async def translate(audio: UploadFile = File(...), target_language: str = Form(.
             "duration": float(result.get("duration") or 0.0)
         }
     finally:
-        try:
-            os.remove(path)
-        except Exception:
-            pass
 @app.get("/")
 async def root():
     return {"message": "Whisper API is running. Use /api/transcribe or /api/translate."}
-# -----------------------
-# Gradio UI
-# -----------------------
 def gradio_ui():
     with gr.Blocks() as demo:
         gr.Markdown("## 🎙️ Whisper API Demo")
         with gr.Row():
-            audio_input = gr.Audio(label="Upload audio or record", type="filepath")
-            translate_checkbox = gr.Checkbox(label="Translate to English", value=False)
-        output = gr.Textbox(label="Transcription / Translation", lines=6)
         btn = gr.Button("Transcribe")
-        def transcribe_gr(audio_path, do_translate):
-            if audio_path is None or audio_path == "":
                 return "No audio provided."
-            task = "translate" if do_translate else "transcribe"
-            # use the same internal function used by the API endpoints
-            result = transcribe_file(audio_path, task=task)
             return result.get("text", "").strip()
-        btn.click(fn=transcribe_gr, inputs=[audio_input, translate_checkbox], outputs=output)
     return demo
-# -----------------------
-# Mount Gradio inside FastAPI
-# -----------------------
 demo = gradio_ui()
 gr.mount_gradio_app(app, demo, path="/")
-# -----------------------
-# Run server (local)
-# -----------------------
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)

 import os
 import tempfile
 import torch
 import whisper
 import gradio as gr
 from fastapi.middleware.cors import CORSMiddleware
 import uvicorn
+# 🔹 Load Whisper model
 MODEL_NAME = os.getenv("WHISPER_MODEL", "base")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL = whisper.load_model(MODEL_NAME, device=DEVICE)
+# 🔹 FastAPI app
 app = FastAPI(title="Whisper API")
 app.add_middleware(
     allow_headers=["*"],
 )
+# 🔹 Utility to save uploaded files temporarily
 def _save_temp(upload: UploadFile) -> str:
     suffix = os.path.splitext(upload.filename or "audio")[1] or ".wav"
     with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
         tmp.write(upload.file.read())
         return tmp.name
+# 🔹 API endpoints
 @app.post("/api/transcribe")
 async def transcribe(audio: UploadFile = File(...)):
     if not audio:
         raise HTTPException(status_code=400, detail="No audio provided")
     path = _save_temp(audio)
     try:
+        result = MODEL.transcribe(path, task="transcribe", language=None, fp16=(DEVICE=="cuda"))
         return {
             "text": result.get("text", "").strip(),
             "language": result.get("language", ""),
             "duration": float(result.get("duration") or 0.0)
         }
     finally:
+        os.remove(path)
 @app.post("/api/translate")
 async def translate(audio: UploadFile = File(...), target_language: str = Form(...)):
     if not audio:
         raise HTTPException(status_code=400, detail="No audio provided")
+    if target_language.strip().lower() not in {"en","eng","english"}:
         raise HTTPException(status_code=400, detail="Whisper only translates to English")
     path = _save_temp(audio)
     try:
+        result = MODEL.transcribe(path, task="translate", language=None, fp16=(DEVICE=="cuda"))
         return {
             "text": result.get("text", "").strip(),
             "source_language": result.get("language", ""),
             "duration": float(result.get("duration") or 0.0)
         }
     finally:
+        os.remove(path)
 @app.get("/")
 async def root():
     return {"message": "Whisper API is running. Use /api/transcribe or /api/translate."}
+# 🔹 Gradio UI
 def gradio_ui():
     with gr.Blocks() as demo:
         gr.Markdown("## 🎙️ Whisper API Demo")
         with gr.Row():
+            audio_input = gr.Audio(label="Upload audio", type="filepath")  # fixed: no 'source'
+        output = gr.Textbox(label="Transcription")
         btn = gr.Button("Transcribe")
+        # Directly call Whisper model, no internal HTTP request
+        def transcribe_gr(audio_path):
+            if audio_path is None:
                 return "No audio provided."
+            result = MODEL.transcribe(audio_path, task="transcribe", language=None, fp16=(DEVICE=="cuda"))
             return result.get("text", "").strip()
+        btn.click(fn=transcribe_gr, inputs=audio_input, outputs=output)
     return demo
+# 🔹 Mount Gradio inside FastAPI
 demo = gradio_ui()
 gr.mount_gradio_app(app, demo, path="/")
+# 🔹 Run server locally
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)

requirements.txt CHANGED Viewed

@@ -1,18 +1,6 @@
-# Core app
-fastapi==0.116.1
-uvicorn[standard]==0.35.0
-gradio==5.42.0
-# Whisper (OpenAI's official package)
-openai-whisper==20250625
-# FastAPI file/form parsing
-python-multipart==0.0.20
-# Useful libs
-numpy>=1.25
-soundfile>=0.12.1
-requests>=2.31.0
-# Optional but HIGHLY recommended for CPU performance (see notes)
-faster-whisper>=0.8.0

+fastapi
+uvicorn
+gradio
+openai-whisper
+torch
+python-multipart