Spaces:
Sleeping
Sleeping
Commit
·
fe80760
1
Parent(s):
4f456d8
Add logic to transcribe file chunks
Browse files- app/main.py +8 -2
- app/model.py +33 -0
app/main.py
CHANGED
|
@@ -7,7 +7,7 @@ from pathlib import Path
|
|
| 7 |
import logging
|
| 8 |
from .config import TMP_DIR, MAX_UPLOAD_BYTES, MAX_DURATION_SECS
|
| 9 |
from .audio_utils import save_upload_file, download_file_from_url, get_audio_info, ensure_wav_16k_mono, make_temp_path
|
| 10 |
-
from .model import load_model, transcribe_file
|
| 11 |
|
| 12 |
app = FastAPI(title="PhoWhisper ASR API")
|
| 13 |
|
|
@@ -54,8 +54,14 @@ async def transcribe(file: UploadFile = File(...)):
|
|
| 54 |
if MODEL is None:
|
| 55 |
MODEL = load_model(chunk_length_s=30)
|
| 56 |
text = transcribe_file(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
|
|
|
|
| 57 |
info2 = get_audio_info(tmp_wav) or {}
|
| 58 |
-
return JSONResponse({
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
except HTTPException:
|
| 60 |
raise
|
| 61 |
except Exception as e:
|
|
|
|
| 7 |
import logging
|
| 8 |
from .config import TMP_DIR, MAX_UPLOAD_BYTES, MAX_DURATION_SECS
|
| 9 |
from .audio_utils import save_upload_file, download_file_from_url, get_audio_info, ensure_wav_16k_mono, make_temp_path
|
| 10 |
+
from .model import load_model, transcribe_file, transcribe_file_chunks
|
| 11 |
|
| 12 |
app = FastAPI(title="PhoWhisper ASR API")
|
| 13 |
|
|
|
|
| 54 |
if MODEL is None:
|
| 55 |
MODEL = load_model(chunk_length_s=30)
|
| 56 |
text = transcribe_file(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
|
| 57 |
+
chunks = transcribe_file_chunks(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
|
| 58 |
info2 = get_audio_info(tmp_wav) or {}
|
| 59 |
+
return JSONResponse({
|
| 60 |
+
"text": text,
|
| 61 |
+
"duration": info2.get("duration"),
|
| 62 |
+
"sample_rate": info2.get("samplerate"),
|
| 63 |
+
"chunks": chunks
|
| 64 |
+
})
|
| 65 |
except HTTPException:
|
| 66 |
raise
|
| 67 |
except Exception as e:
|
app/model.py
CHANGED
|
@@ -162,3 +162,36 @@ def transcribe_file(model, wav_path: str, max_chunk_length: float = 30.0, overla
|
|
| 162 |
if isinstance(out, dict):
|
| 163 |
return out.get("text") or ""
|
| 164 |
return str(out)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
if isinstance(out, dict):
|
| 163 |
return out.get("text") or ""
|
| 164 |
return str(out)
|
| 165 |
+
|
| 166 |
+
# Returns a list of dicts with start, end, and text for each transcribed chunk.
def transcribe_file_chunks(model, wav_path: str, max_chunk_length: float = 30.0, overlap_s: float = 5.0):
    """Split the audio into overlapping chunks, transcribe each one, and
    return a list of dicts: ``{"start": s, "end": e, "text": t}``.

    Args:
        model: ASR callable; invoked as ``model(path)`` and expected to return
            either a dict with a ``"text"`` key or something ``str()``-able
            (matches how ``transcribe_file`` treats its output).
        wav_path: Path to the source WAV file.
        max_chunk_length: Maximum chunk duration in seconds.
        overlap_s: Overlap between consecutive chunks in seconds.

    Raises:
        ValueError: If ``max_chunk_length <= overlap_s`` — the window would
            never advance and the loop below would not terminate.
    """
    info = get_audio_info(wav_path) or {}
    # `or 0.0` also guards against an explicit None stored under "duration";
    # the previous info.get("duration", 0.0) would raise TypeError on `t < None`.
    duration = info.get("duration") or 0.0
    step = max_chunk_length - overlap_s
    if step <= 0:
        raise ValueError("max_chunk_length must be > overlap_s")
    # Chunk start times: 0, step, 2*step, ... strictly below duration.
    starts = []
    t = 0.0
    while t < duration:
        starts.append(t)
        t += step
    results = []
    for i, s in enumerate(starts):
        chunk_end = min(s + max_chunk_length, duration)
        dst = make_temp_path(suffix=f".chunk{i}.wav")
        try:
            _ffmpeg_extract_segment(wav_path, s, chunk_end - s, dst)
            out = model(dst)
            text = out.get("text", "") if isinstance(out, dict) else str(out)
            results.append({"start": s, "end": chunk_end, "text": text})
        finally:
            # Best-effort cleanup in finally: previously the temp chunk file
            # leaked whenever extraction or transcription raised.
            try:
                os.remove(dst)
            except OSError:
                pass
    return results
|