bichnhan2701 committed on
Commit
fe80760
·
1 Parent(s): 4f456d8

Add logic transcribe file chunks

Browse files
Files changed (2) hide show
  1. app/main.py +8 -2
  2. app/model.py +33 -0
app/main.py CHANGED
@@ -7,7 +7,7 @@ from pathlib import Path
7
  import logging
8
  from .config import TMP_DIR, MAX_UPLOAD_BYTES, MAX_DURATION_SECS
9
  from .audio_utils import save_upload_file, download_file_from_url, get_audio_info, ensure_wav_16k_mono, make_temp_path
10
- from .model import load_model, transcribe_file
11
 
12
  app = FastAPI(title="PhoWhisper ASR API")
13
 
@@ -54,8 +54,14 @@ async def transcribe(file: UploadFile = File(...)):
54
  if MODEL is None:
55
  MODEL = load_model(chunk_length_s=30)
56
  text = transcribe_file(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
 
57
  info2 = get_audio_info(tmp_wav) or {}
58
- return JSONResponse({"text": text, "duration": info2.get("duration"), "sample_rate": info2.get("samplerate")})
 
 
 
 
 
59
  except HTTPException:
60
  raise
61
  except Exception as e:
 
7
  import logging
8
  from .config import TMP_DIR, MAX_UPLOAD_BYTES, MAX_DURATION_SECS
9
  from .audio_utils import save_upload_file, download_file_from_url, get_audio_info, ensure_wav_16k_mono, make_temp_path
10
+ from .model import load_model, transcribe_file, transcribe_file_chunks
11
 
12
  app = FastAPI(title="PhoWhisper ASR API")
13
 
 
54
  if MODEL is None:
55
  MODEL = load_model(chunk_length_s=30)
56
  text = transcribe_file(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
57
+ chunks = transcribe_file_chunks(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
58
  info2 = get_audio_info(tmp_wav) or {}
59
+ return JSONResponse({
60
+ "text": text,
61
+ "duration": info2.get("duration"),
62
+ "sample_rate": info2.get("samplerate"),
63
+ "chunks": chunks
64
+ })
65
  except HTTPException:
66
  raise
67
  except Exception as e:
app/model.py CHANGED
@@ -162,3 +162,36 @@ def transcribe_file(model, wav_path: str, max_chunk_length: float = 30.0, overla
162
  if isinstance(out, dict):
163
  return out.get("text") or ""
164
  return str(out)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  if isinstance(out, dict):
163
  return out.get("text") or ""
164
  return str(out)
165
+
166
+ # Hàm trả về danh sách dict chứa start, end, text cho từng chunk
167
+ def transcribe_file_chunks(model, wav_path: str, max_chunk_length: float = 30.0, overlap_s: float = 5.0):
168
+ """
169
+ Chia audio thành các chunk, transcribe từng chunk, trả về list dict: {start, end, text}
170
+ """
171
+ info = get_audio_info(wav_path) or {}
172
+ duration = info.get("duration", 0.0)
173
+ # Tính toán các mốc thời gian bắt đầu cho từng chunk
174
+ step = max_chunk_length - overlap_s
175
+ if step <= 0:
176
+ raise ValueError("max_chunk_length must be > overlap_s")
177
+ starts = []
178
+ t = 0.0
179
+ while t < duration:
180
+ starts.append(t)
181
+ t += step
182
+ results = []
183
+ for i, s in enumerate(starts):
184
+ chunk_end = min(s + max_chunk_length, duration)
185
+ dst = make_temp_path(suffix=f".chunk{i}.wav")
186
+ _ffmpeg_extract_segment(wav_path, s, chunk_end - s, dst)
187
+ out = model(dst)
188
+ if isinstance(out, dict):
189
+ text = out.get("text", "")
190
+ else:
191
+ text = str(out)
192
+ results.append({"start": s, "end": chunk_end, "text": text})
193
+ try:
194
+ os.remove(dst)
195
+ except Exception:
196
+ pass
197
+ return results