Spaces:
Sleeping
Sleeping
Commit
·
fe80760
1
Parent(s):
4f456d8
Add logic to transcribe file chunks
Browse files- app/main.py +8 -2
- app/model.py +33 -0
app/main.py
CHANGED
|
@@ -7,7 +7,7 @@ from pathlib import Path
|
|
| 7 |
import logging
|
| 8 |
from .config import TMP_DIR, MAX_UPLOAD_BYTES, MAX_DURATION_SECS
|
| 9 |
from .audio_utils import save_upload_file, download_file_from_url, get_audio_info, ensure_wav_16k_mono, make_temp_path
|
| 10 |
-
from .model import load_model, transcribe_file
|
| 11 |
|
| 12 |
app = FastAPI(title="PhoWhisper ASR API")
|
| 13 |
|
|
@@ -54,8 +54,14 @@ async def transcribe(file: UploadFile = File(...)):
|
|
| 54 |
if MODEL is None:
|
| 55 |
MODEL = load_model(chunk_length_s=30)
|
| 56 |
text = transcribe_file(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
|
|
|
|
| 57 |
info2 = get_audio_info(tmp_wav) or {}
|
| 58 |
-
return JSONResponse({
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
except HTTPException:
|
| 60 |
raise
|
| 61 |
except Exception as e:
|
|
|
|
| 7 |
import logging
|
| 8 |
from .config import TMP_DIR, MAX_UPLOAD_BYTES, MAX_DURATION_SECS
|
| 9 |
from .audio_utils import save_upload_file, download_file_from_url, get_audio_info, ensure_wav_16k_mono, make_temp_path
|
| 10 |
+
from .model import load_model, transcribe_file, transcribe_file_chunks
|
| 11 |
|
| 12 |
app = FastAPI(title="PhoWhisper ASR API")
|
| 13 |
|
|
|
|
| 54 |
if MODEL is None:
|
| 55 |
MODEL = load_model(chunk_length_s=30)
|
| 56 |
text = transcribe_file(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
|
| 57 |
+
chunks = transcribe_file_chunks(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
|
| 58 |
info2 = get_audio_info(tmp_wav) or {}
|
| 59 |
+
return JSONResponse({
|
| 60 |
+
"text": text,
|
| 61 |
+
"duration": info2.get("duration"),
|
| 62 |
+
"sample_rate": info2.get("samplerate"),
|
| 63 |
+
"chunks": chunks
|
| 64 |
+
})
|
| 65 |
except HTTPException:
|
| 66 |
raise
|
| 67 |
except Exception as e:
|
app/model.py
CHANGED
|
@@ -162,3 +162,36 @@ def transcribe_file(model, wav_path: str, max_chunk_length: float = 30.0, overla
|
|
| 162 |
if isinstance(out, dict):
|
| 163 |
return out.get("text") or ""
|
| 164 |
return str(out)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
if isinstance(out, dict):
|
| 163 |
return out.get("text") or ""
|
| 164 |
return str(out)
|
| 165 |
+
|
| 166 |
+
# Returns a list of dicts with start, end, and text for each transcribed chunk.
def transcribe_file_chunks(model, wav_path: str, max_chunk_length: float = 30.0, overlap_s: float = 5.0):
    """Split the audio into overlapping chunks, transcribe each one, and
    return a list of dicts: ``{"start": s, "end": e, "text": t}``.

    Args:
        model: ASR callable; invoked as ``model(path)`` and expected to return
            either a dict with a ``"text"`` key or something ``str()``-able
            (matches how ``transcribe_file`` treats its output).
        wav_path: Path to the source WAV file.
        max_chunk_length: Maximum chunk duration in seconds.
        overlap_s: Overlap between consecutive chunks in seconds.

    Raises:
        ValueError: If ``max_chunk_length <= overlap_s`` — the window would
            never advance and the loop below would not terminate.
    """
    info = get_audio_info(wav_path) or {}
    # `or 0.0` also guards against an explicit None stored under "duration";
    # the previous info.get("duration", 0.0) would raise TypeError on `t < None`.
    duration = info.get("duration") or 0.0
    step = max_chunk_length - overlap_s
    if step <= 0:
        raise ValueError("max_chunk_length must be > overlap_s")
    # Chunk start times: 0, step, 2*step, ... strictly below duration.
    starts = []
    t = 0.0
    while t < duration:
        starts.append(t)
        t += step
    results = []
    for i, s in enumerate(starts):
        chunk_end = min(s + max_chunk_length, duration)
        dst = make_temp_path(suffix=f".chunk{i}.wav")
        try:
            _ffmpeg_extract_segment(wav_path, s, chunk_end - s, dst)
            out = model(dst)
            text = out.get("text", "") if isinstance(out, dict) else str(out)
            results.append({"start": s, "end": chunk_end, "text": text})
        finally:
            # Best-effort cleanup in finally: previously the temp chunk file
            # leaked whenever extraction or transcription raised.
            try:
                os.remove(dst)
            except OSError:
                pass
    return results
|