bichnhan2701 committed
Commit: f84ed9c
Parent(s): efea087

Change logic pipeline
.dockerignore CHANGED
@@ -1,2 +1,10 @@
- tests/
- *.md

+ test/
+ *.md
+ .myvenv
+ __pycache__
+ *.pyc
+ .DS_Store
+ .git
+ .vscode
+ .idea
+ docker-compose.yml
.gitignore CHANGED
@@ -1,6 +1,7 @@
- tests/
  .myvenv/
  __pycache__/
  *.pyc
  .env
- *.md

+ test/
  .myvenv/
  __pycache__/
  *.pyc
  .env
+ *.md
+ docker-compose.yml
Dockerfile CHANGED
@@ -1,22 +1,34 @@
- FROM python:3.10-slim
-
- # install system deps
  RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
-     ffmpeg libsndfile1 git build-essential wget && \
      rm -rf /var/lib/apt/lists/*

  WORKDIR /app

- # copy requirements first for layer caching
  COPY requirements.txt /app/requirements.txt
- RUN pip install --upgrade pip
- RUN pip install --no-cache-dir -r /app/requirements.txt

  # copy app code
  COPY . /app

- ENV PORT=7860
- EXPOSE 7860

- # default command
- CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

+ FROM python:3.11-slim
+
+ ENV PYTHONDONTWRITEBYTECODE=1 \
+     PYTHONUNBUFFERED=1 \
+     TMP_DIR=/tmp/uploads \
+     PORT=7860
+
+ # system deps (single RUN to minimize layers)
  RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+     ffmpeg libsndfile1 git build-essential wget curl && \
      rm -rf /var/lib/apt/lists/*

  WORKDIR /app

+ # install python deps using cached layer
  COPY requirements.txt /app/requirements.txt
+ RUN pip install --upgrade pip && \
+     pip install --no-cache-dir -r /app/requirements.txt

  # copy app code
  COPY . /app

+ # create tmp dir and non-root user
+ RUN mkdir -p ${TMP_DIR} && groupadd -r app && useradd -r -g app app && \
+     chown -R app:app /app ${TMP_DIR}
+
+ USER app
+
+ EXPOSE ${PORT}
+
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=10s \
+     CMD curl -f http://localhost:${PORT}/health || exit 1
+
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app/api/__init__.py ADDED
File without changes
app/api/transcribe.py ADDED
@@ -0,0 +1,191 @@
+ import os
+ import logging
+ import uuid
+ import asyncio
+ from fastapi import APIRouter, UploadFile, File, HTTPException, status
+ from fastapi.responses import JSONResponse
+ from pathlib import Path
+ from typing import Optional
+ import time
+ from app.core.audio_utils import save_upload_file, get_audio_info, ensure_wav_16k_mono, make_temp_path, download_file_from_url
+ from app.core.asr_engine import load_model, transcribe_file, transcribe_file_chunks
+ from app.config import settings
+ from app.services.text_normalizer import normalize_text
+ from app.services.note_client import NoteServiceClient
+ from rq import Queue
+ from app.infra.redis_client import redis_client
+ from app.jobs.transcribe_job import transcribe_job
+ from app.schemas.transcribe import TranscribeResponse
+ from app.infra.metrics import REQUEST_COUNT, REQUEST_LATENCY, ASR_DURATION, NORMALIZE_DURATION, ERROR_COUNT
+
+ router = APIRouter()
+
+ # load model on import/startup to avoid repeated initialization
+ # you may prefer to call load_model in FastAPI startup event
+ ASR_MODEL = None
+
+ @router.on_event("startup")
+ async def _startup():
+     global ASR_MODEL
+     # load model in thread to avoid blocking event loop
+     ASR_MODEL = await asyncio.to_thread(load_model, 30)
+
+ def _ensure_file_limits(path: str):
+     if os.path.getsize(path) > settings.MAX_UPLOAD_BYTES:
+         raise HTTPException(status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, detail="File size exceeds limit")
+     info = get_audio_info(path)
+     if info and info.get("duration", 0) > settings.MAX_DURATION_SECS:
+         raise HTTPException(status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, detail="Audio duration exceeds limit")
+
+ @router.post("/transcribe", response_model=TranscribeResponse)
+ async def transcribe(file: UploadFile = File(...)):
+     tmp_in = make_temp_path(suffix=Path(file.filename).suffix or ".wav")
+     tmp_wav = None
+     note_service = NoteServiceClient()
+     note_id = str(uuid.uuid4())
+     start_time = time.perf_counter()
+     endpoint = "/transcribe"
+     status_label = "success"
+     with REQUEST_LATENCY.labels(endpoint).time():
+         try:
+             # write upload to tmp (blocking) -> run in thread
+             await asyncio.to_thread(save_upload_file, file, tmp_in)
+
+             _ensure_file_limits(tmp_in)
+
+             tmp_wav = make_temp_path(suffix=".wav")
+             # ffmpeg convert is blocking -> run in thread
+             await asyncio.to_thread(ensure_wav_16k_mono, tmp_in, tmp_wav)
+
+             # check the duration to decide between sync and async handling
+             info = get_audio_info(tmp_wav) or {}
+             duration_sec = info.get("duration", 0)
+             ASYNC_THRESHOLD = 120  # 2 minutes, adjustable
+             if duration_sec > ASYNC_THRESHOLD:
+                 # enqueue a background job via RQ
+                 q = Queue("asr", connection=redis_client)
+                 job = q.enqueue(
+                     transcribe_job,
+                     tmp_wav,
+                     note_id,
+                     job_timeout=1800
+                 )
+                 logging.info(f"Enqueued background transcribe job: note_id={note_id} job_id={job.id} duration={duration_sec:.1f}s")
+                 REQUEST_COUNT.labels(endpoint, "queued").inc()
+                 return JSONResponse(status_code=202, content={
+                     "note_id": note_id,
+                     "job_id": job.id,
+                     "status": "queued",
+                     "duration": duration_sec
+                 })
+
+             # short audio: process synchronously as before
+             model = ASR_MODEL or await asyncio.to_thread(load_model, 30)
+             with ASR_DURATION.labels(endpoint).time():
+                 text = await asyncio.to_thread(transcribe_file, model, tmp_wav, 30.0, 5.0)
+                 chunks = await asyncio.to_thread(transcribe_file_chunks, model, tmp_wav, 30.0, 5.0)
+
+             # normalize via Gemini (already async safe in your service)
+             with NORMALIZE_DURATION.labels(endpoint).time():
+                 normalized_text = await normalize_text(text)
+
+             info2 = get_audio_info(tmp_wav) or {}
+             # persist to Note Service (async HTTP)
+             await note_service.save_transcript(
+                 note_id=note_id,
+                 raw_text=text,
+                 normalized_text=normalized_text,
+                 duration=info2.get("duration"),
+                 sample_rate=info2.get("samplerate"),
+                 chunks=chunks,
+                 asr_model="PhoWhisper-base",
+                 normalization_model="gemini-1.5"
+             )
+
+             duration = time.perf_counter() - start_time
+             logging.info(f"/transcribe success note_id={note_id} duration={duration:.2f}s audio_dur={info2.get('duration')}")
+             REQUEST_COUNT.labels(endpoint, status_label).inc()
+             return JSONResponse(status_code=200, content={
+                 "note_id": note_id,
+                 "status": "transcribed",
+                 "duration": info2.get("duration")
+             })
+         except HTTPException:
+             status_label = "http_error"
+             ERROR_COUNT.labels(endpoint, status_label).inc()
+             raise
+         except Exception as e:
+             status_label = "error"
+             ERROR_COUNT.labels(endpoint, status_label).inc()
+             logging.exception(f"/transcribe failed note_id={note_id}")
+             raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
+         finally:
+             # cleanup
+             for p in [tmp_in, tmp_wav]:
+                 try:
+                     if p and os.path.exists(p):
+                         os.remove(p)
+                 except Exception:
+                     pass
+
+
+ @router.post("/transcribe-url", response_model=TranscribeResponse)
+ async def transcribe_url(payload: dict):
+     audio_url = payload.get("audio_url")
+     user_id = payload.get("user_id")
+     if not audio_url:
+         raise HTTPException(status_code=400, detail="audio_url required")
+     if not user_id:
+         raise HTTPException(status_code=400, detail="user_id required")
+
+     tmp_in = make_temp_path(suffix=Path(audio_url).suffix or ".tmp")
+     tmp_wav = None
+     note_service = NoteServiceClient()
+     note_id = str(uuid.uuid4())
+
+     start_time = time.perf_counter()
+     try:
+         # download blocking -> thread
+         await asyncio.to_thread(download_file_from_url, audio_url, tmp_in)
+
+         _ensure_file_limits(tmp_in)
+
+         tmp_wav = make_temp_path(suffix=".wav")
+         await asyncio.to_thread(ensure_wav_16k_mono, tmp_in, tmp_wav)
+
+         model = ASR_MODEL or await asyncio.to_thread(load_model, 30)
+         text = await asyncio.to_thread(transcribe_file, model, tmp_wav, 30.0, 5.0)
+         chunks = await asyncio.to_thread(transcribe_file_chunks, model, tmp_wav, 30.0, 5.0)
+         normalized_text = await normalize_text(text)
+         info2 = get_audio_info(tmp_wav) or {}
+
+         await note_service.save_transcript(
+             note_id=note_id,
+             raw_text=text,
+             normalized_text=normalized_text,
+             duration=info2.get("duration"),
+             sample_rate=info2.get("samplerate"),
+             chunks=chunks,
+             asr_model="PhoWhisper-base",
+             normalization_model="gemini-1.5"
+         )
+
+         duration = time.perf_counter() - start_time
+         logging.info(f"/transcribe-url success note_id={note_id} duration={duration:.2f}s audio_dur={info2.get('duration')}")
+         return JSONResponse(status_code=200, content={
+             "note_id": note_id,
+             "status": "transcribed",
+             "duration": info2.get("duration")
+         })
+     except HTTPException:
+         raise
+     except Exception as e:
+         logging.exception(f"/transcribe-url failed note_id={note_id}")
+         raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
+     finally:
+         for p in [tmp_in, tmp_wav]:
+             try:
+                 if p and os.path.exists(p):
+                     os.remove(p)
+             except Exception:
+                 pass
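For reference, a minimal client sketch against the new endpoint (an assumption, not part of the commit: it presumes the service is reachable on localhost:7860 and a local sample.wav exists; the 202 branch corresponds to the RQ path above):

```python
# hedged sketch: upload a file to /transcribe
import httpx

with open("sample.wav", "rb") as f:
    resp = httpx.post(
        "http://localhost:7860/transcribe",
        files={"file": ("sample.wav", f, "audio/wav")},
        timeout=120.0,
    )
# 200 -> transcribed synchronously; 202 -> enqueued, response carries job_id
print(resp.status_code, resp.json())
```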
app/config/__init__.py ADDED
File without changes
app/{config.py → config/settings.py} RENAMED
@@ -1,3 +1,5 @@
  import os

  # Limits & model setting
@@ -12,3 +14,14 @@ os.makedirs(TMP_DIR, exist_ok=True)
  # Cloud credentials (set as HF Spaces secrets or env)
  # FIREBASE_SERVICE_ACCOUNT = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON") # optional
  # CLOUDINARY_URL = os.getenv("CLOUDINARY_URL") # optional

+ # App settings and configuration
+
  import os

  # Limits & model setting
  # Cloud credentials (set as HF Spaces secrets or env)
  # FIREBASE_SERVICE_ACCOUNT = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON") # optional
  # CLOUDINARY_URL = os.getenv("CLOUDINARY_URL") # optional
+
+ # Gemini API Key (for text normalization)
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
+ # External services
+ NOTE_SERVICE_URL = os.getenv("NOTE_SERVICE_URL", "http://localhost:9000")
+
+ # HTTP timeouts
+ HTTPX_TIMEOUT = float(os.getenv("HTTPX_TIMEOUT", "10.0"))
+
+ # Redis URL
+ REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
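Because these values are read once at import time, the environment must be set before app.config.settings is first imported. A sketch (the URLs and key are placeholders, not values from the commit):

```python
# hypothetical local configuration; real deployments set these as container env vars
import os
os.environ.setdefault("GEMINI_API_KEY", "your-key-here")
os.environ.setdefault("NOTE_SERVICE_URL", "http://note-service:9000")
os.environ.setdefault("REDIS_URL", "redis://redis:6379/0")

from app.config import settings  # values are frozen at import
print(settings.NOTE_SERVICE_URL)
```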
app/core/__init__.py ADDED
File without changes
app/{model.py → core/asr_engine.py} RENAMED
@@ -1,21 +1,14 @@
- import math
- import shlex
- import subprocess
- import os
- from pathlib import Path
- from typing import List
- from .audio_utils import make_temp_path, ensure_wav_16k_mono, get_audio_info
- from transformers import pipeline
  import logging
- from .config import MODEL_NAME

  _model = None

  def load_model(chunk_length_s: int = None):
      global _model
      if _model is None:
-         # This will download weights at runtime (from Hugging Face Hub).
-         # If you are on HF Spaces, the image will download on first run.
          logging.info(f"Loading ASR model {MODEL_NAME} ...")
          kwargs = {}
          if chunk_length_s is not None:
@@ -24,57 +17,13 @@ def load_model(chunk_length_s: int = None):
      logging.info("Model loaded")
      return _model

- # _ffmpeg_extract_segment uses ffmpeg to render each chunk as WAV 16k mono PCM16 (the format the model expects).
- def _ffmpeg_extract_segment(src: str, start: float, duration: float, dst: str):
-     """
-     Extract segment [start, start+duration) using ffmpeg into dst (wav 16k mono pcm16).
-     We call ffmpeg with -ss + -t on input for safety.
-     """
-     # Note: -ss before -i is fast seek but less accurate for some formats. We use -ss after -i for accuracy.
-     cmd = f'ffmpeg -v error -y -ss {start:.3f} -i "{src}" -t {duration:.3f} -ar 16000 -ac 1 -acodec pcm_s16le "{dst}"'
-     proc = subprocess.run(shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-     if proc.returncode != 0:
-         raise RuntimeError(f"ffmpeg extract failed: {proc.stderr.decode(errors='ignore')}")
-     return dst
-
- # _split_audio_to_chunks builds the list of chunk files.
- def _split_audio_to_chunks(src_wav: str, chunk_length_s: float = 30.0, overlap_s: float = 5.0) -> List[str]:
-     """
-     Split src_wav into chunk files (wav 16k mono). Returns list of chunk file paths in order.
-     Overlap is seconds overlapping between consecutive chunks.
-     """
-     info = get_audio_info(src_wav)
-     if not info:
-         raise RuntimeError("Cannot read audio info")
-     duration = info["duration"]
-     step = chunk_length_s - overlap_s
-     if step <= 0:
-         raise ValueError("chunk_length_s must be > overlap_s")
-     starts = []
-     t = 0.0
-     while t < duration:
-         starts.append(t)
-         t += step
-     # ensure last chunk covers end - if last start + chunk_length < duration, we still create chunk that may be shorter
-     chunks = []
-     for i, s in enumerate(starts):
-         dst = make_temp_path(suffix=f".chunk{i}.wav")
-         _ffmpeg_extract_segment(src_wav, s, chunk_length_s, dst)
-         chunks.append(dst)
-     return chunks
-
- # _merge_transcripts is a word-level matching heuristic (exact n-gram match) with a fallback; not perfect, but it removes most overlap-induced repetition.
- def _merge_transcripts(prev_text: str, new_text: str, max_overlap_words: int = 8) -> str:
-     """
-     Heuristic merge: if end of prev_text and start of new_text share overlapping words, remove overlap.
-     We find the longest overlap up to max_overlap_words.
-     """
      if not prev_text:
          return new_text
      p_words = prev_text.strip().split()
      n_words = new_text.strip().split()
      max_ol = min(max_overlap_words, len(p_words), len(n_words))
-     # search for largest k where last k words of prev == first k words of new
      best_k = 0
      for k in range(max_ol, 0, -1):
          if p_words[-k:] == n_words[:k]:
@@ -83,35 +32,22 @@ def _merge_transcripts(prev_text: str, new_text: str, max_overlap_words: int = 8
      if best_k > 0:
          merged = " ".join(p_words + n_words[best_k:])
          return merged
-     # If no exact overlap, try fuzzy overlap by matching word sequences (less strict)
-     # simple heuristic: if last N words of prev appear anywhere at start of new, trim
      for k in range(max_ol, 1, -1):
          seq = " ".join(p_words[-k:])
          if seq in new_text:
              idx = new_text.find(seq)
-             # remove through the sequence
              merged = " ".join(p_words + new_text[idx + len(seq):].strip().split())
              return merged
-     # fallback: just concatenate with space
      return prev_text.rstrip() + " " + new_text.lstrip()

- # transcribe_long_audio runs the model on each chunk (sequential by default); parallel=True enables concurrent processing (be careful with GPU memory).
  def transcribe_long_audio(model, wav_path: str, chunk_length_s: float = 30.0, overlap_s: float = 5.0, parallel: bool = False) -> str:
-     """
-     Transcribe a long wav file by splitting into overlapping chunks,
-     transcribing each, then merging transcripts.
-     - model: loaded pipeline
-     - wav_path: already normalized wav (16k, mono)
-     Returns stitched transcript string.
-     """
-     # 1) split into chunks
-     chunks = _split_audio_to_chunks(wav_path, chunk_length_s=chunk_length_s, overlap_s=overlap_s)
      logging.info(f"Split into {len(chunks)} chunks")
      texts = []
-
-     # 2) process chunks (sequential for safety). Optionally implement limited parallelism.
      if parallel:
-         # optional: implement ThreadPool/ProcessPool with care for GPU memory; by default we do sequential
          from concurrent.futures import ThreadPoolExecutor, as_completed
          def process_chunk(path):
              try:
@@ -122,7 +58,7 @@ def transcribe_long_audio(model, wav_path: str, chunk_length_s: float = 30.0, ov
              except Exception as e:
                  logging.exception("Chunk inference failed")
                  return ""
-         with ThreadPoolExecutor(max_workers=2) as ex:  # limit concurrency
              futures = {ex.submit(process_chunk, c): c for c in chunks}
              for fut in as_completed(futures):
                  texts.append(fut.result() or "")
@@ -133,12 +69,9 @@
              texts.append(out.get("text", "") or "")
          else:
              texts.append(str(out) or "")
-
-     # 3) merge texts to single transcript, removing overlap duplicates
      merged = ""
      for t in texts:
-         merged = _merge_transcripts(merged, t, max_overlap_words=12)
-     # 4) cleanup chunk files
      for c in chunks:
          try:
              os.remove(c)
@@ -146,31 +79,24 @@ def transcribe_long_audio(model, wav_path: str, chunk_length_s: float = 30.0, ov
          pass
      return merged

- # transcribe_file decides whether to chunk or not.
  def transcribe_file(model, wav_path: str, max_chunk_length: float = 30.0, overlap_s: float = 5.0):
-     """
-     Main entry: if audio short, run single pass; if long, call chunked transcribe.
-     """
      info = get_audio_info(wav_path) or {}
      duration = info.get("duration", 0.0)
-     # threshold to decide chunking: if duration > chunk_length -> chunk
-     if duration and duration > max_chunk_length * 1.1:  # slightly bigger than chunk length
          logging.info(f"Long audio detected ({duration}s) -> chunking")
          return transcribe_long_audio(model, wav_path, chunk_length_s=max_chunk_length, overlap_s=overlap_s)
-     # short audio -> direct
      out = model(wav_path)
      if isinstance(out, dict):
          return out.get("text") or ""
      return str(out)

- # returns a list of dicts with start, end, text for each chunk
  def transcribe_file_chunks(model, wav_path: str, max_chunk_length: float = 30.0, overlap_s: float = 5.0):
-     """
-     Split the audio into chunks, transcribe each one, and return a list of dicts: {start, end, text}
-     """
      info = get_audio_info(wav_path) or {}
      duration = info.get("duration", 0.0)
-     # compute the start timestamp for each chunk
      step = max_chunk_length - overlap_s
      if step <= 0:
          raise ValueError("max_chunk_length must be > overlap_s")
@@ -183,7 +109,7 @@ def transcribe_file_chunks(model, wav_path: str, max_chunk_length: float = 30.0,
      for i, s in enumerate(starts):
          chunk_end = min(s + max_chunk_length, duration)
          dst = make_temp_path(suffix=f".chunk{i}.wav")
-         _ffmpeg_extract_segment(wav_path, s, chunk_end - s, dst)
          out = model(dst)
          if isinstance(out, dict):
              text = out.get("text", "")

+ # PhoWhisper inference engine
+
  import logging
+ from transformers import pipeline
+ from app.config.settings import MODEL_NAME

  _model = None

  def load_model(chunk_length_s: int = None):
      global _model
      if _model is None:
          logging.info(f"Loading ASR model {MODEL_NAME} ...")
          kwargs = {}
          if chunk_length_s is not None:
          logging.info("Model loaded")
      return _model

+ # Heuristic merge for chunked transcripts
+ def merge_transcripts(prev_text: str, new_text: str, max_overlap_words: int = 8) -> str:
      if not prev_text:
          return new_text
      p_words = prev_text.strip().split()
      n_words = new_text.strip().split()
      max_ol = min(max_overlap_words, len(p_words), len(n_words))
      best_k = 0
      for k in range(max_ol, 0, -1):
          if p_words[-k:] == n_words[:k]:
      if best_k > 0:
          merged = " ".join(p_words + n_words[best_k:])
          return merged
      for k in range(max_ol, 1, -1):
          seq = " ".join(p_words[-k:])
          if seq in new_text:
              idx = new_text.find(seq)
              merged = " ".join(p_words + new_text[idx + len(seq):].strip().split())
              return merged
      return prev_text.rstrip() + " " + new_text.lstrip()

  def transcribe_long_audio(model, wav_path: str, chunk_length_s: float = 30.0, overlap_s: float = 5.0, parallel: bool = False) -> str:
+     from app.core.chunking import split_audio_to_chunks
+     from app.core.audio_utils import make_temp_path
+     import os
+     chunks = split_audio_to_chunks(wav_path, chunk_length_s=chunk_length_s, overlap_s=overlap_s)
      logging.info(f"Split into {len(chunks)} chunks")
      texts = []
      if parallel:
          from concurrent.futures import ThreadPoolExecutor, as_completed
          def process_chunk(path):
              try:
              except Exception as e:
                  logging.exception("Chunk inference failed")
                  return ""
+         with ThreadPoolExecutor(max_workers=2) as ex:
              futures = {ex.submit(process_chunk, c): c for c in chunks}
              for fut in as_completed(futures):
                  texts.append(fut.result() or "")
              texts.append(out.get("text", "") or "")
          else:
              texts.append(str(out) or "")
      merged = ""
      for t in texts:
+         merged = merge_transcripts(merged, t, max_overlap_words=12)
      for c in chunks:
          try:
              os.remove(c)
          pass
      return merged

  def transcribe_file(model, wav_path: str, max_chunk_length: float = 30.0, overlap_s: float = 5.0):
+     from app.core.audio_utils import get_audio_info
      info = get_audio_info(wav_path) or {}
      duration = info.get("duration", 0.0)
+     if duration and duration > max_chunk_length * 1.1:
          logging.info(f"Long audio detected ({duration}s) -> chunking")
          return transcribe_long_audio(model, wav_path, chunk_length_s=max_chunk_length, overlap_s=overlap_s)
      out = model(wav_path)
      if isinstance(out, dict):
          return out.get("text") or ""
      return str(out)

  def transcribe_file_chunks(model, wav_path: str, max_chunk_length: float = 30.0, overlap_s: float = 5.0):
+     from app.core.audio_utils import get_audio_info, make_temp_path
+     from app.core.chunking import ffmpeg_extract_segment
+     import os
      info = get_audio_info(wav_path) or {}
      duration = info.get("duration", 0.0)
      step = max_chunk_length - overlap_s
      if step <= 0:
          raise ValueError("max_chunk_length must be > overlap_s")
      for i, s in enumerate(starts):
          chunk_end = min(s + max_chunk_length, duration)
          dst = make_temp_path(suffix=f".chunk{i}.wav")
+         ffmpeg_extract_segment(wav_path, s, chunk_end - s, dst)
          out = model(dst)
          if isinstance(out, dict):
              text = out.get("text", "")
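The overlap heuristic in merge_transcripts can be sanity-checked in isolation; with a two-word overlap the duplicated words are dropped once (this mirrors the vocabulary of the deleted unit tests):

```python
# expected behavior of the exact n-gram branch
from app.core.asr_engine import merge_transcripts

merged = merge_transcripts("alpha beta gamma", "beta gamma delta", max_overlap_words=8)
print(merged)  # "alpha beta gamma delta" -- the shared "beta gamma" is kept only once
```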
app/{audio_utils.py → core/audio_utils.py} RENAMED
@@ -1,23 +1,23 @@
- import os
  import subprocess
  import shlex
  import uuid
  import requests
  from pathlib import Path
  import soundfile as sf
- from .config import TMP_DIR, MAX_UPLOAD_BYTES, MAX_DURATION_SECS

  def save_upload_file(upload_file, dest_path: str):
-     """Save fastapi UploadFile to dest_path"""
      with open(dest_path, "wb") as f:
          while True:
-             chunk = upload_file.file.read(1024*1024)
              if not chunk:
                  break
              f.write(chunk)

  def download_file_from_url(url: str, dest_path: str, timeout=30):
-     """Download remote file to dest_path."""
      r = requests.get(url, stream=True, timeout=timeout)
      r.raise_for_status()
      total = 0
@@ -30,26 +30,37 @@ def download_file_from_url(url: str, dest_path: str, timeout=30):
          f.write(chunk)

  def get_audio_info(path: str):
-     """Return duration (s) and sample_rate using soundfile."""
      try:
          info = sf.info(path)
          duration = info.frames / info.samplerate
-         return {"duration": duration, "samplerate": info.samplerate, "channels": info.channels}
      except Exception:
          return None

  def ensure_wav_16k_mono(src_path: str, dest_path: str):
      """
-     Use ffmpeg to convert any audio to wav (PCM16), 16kHz, mono.
-     Returns dest_path if ok, raises exception on error.
      """
-     # Using ffmpeg command line
-     # -y overwrite
-     cmd = f'ffmpeg -v error -y -i "{src_path}" -ar 16000 -ac 1 -acodec pcm_s16le "{dest_path}"'
-     proc = subprocess.run(shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      if proc.returncode != 0:
-         raise RuntimeError(f"ffmpeg convert failed: {proc.stderr.decode(errors='ignore')}")
      return dest_path

  def make_temp_path(suffix=".wav"):
      return str(Path(TMP_DIR) / f"{uuid.uuid4().hex}{suffix}")

+ # Audio utilities: ffmpeg, normalization, etc.
  import subprocess
  import shlex
  import uuid
  import requests
  from pathlib import Path
  import soundfile as sf
+ from app.config.settings import TMP_DIR, MAX_UPLOAD_BYTES

  def save_upload_file(upload_file, dest_path: str):
+     """Save FastAPI UploadFile to dest_path (streaming)."""
      with open(dest_path, "wb") as f:
          while True:
+             chunk = upload_file.file.read(1024 * 1024)
              if not chunk:
                  break
              f.write(chunk)

  def download_file_from_url(url: str, dest_path: str, timeout=30):
+     """Download remote file to dest_path with size limit."""
      r = requests.get(url, stream=True, timeout=timeout)
      r.raise_for_status()
      total = 0
          f.write(chunk)

  def get_audio_info(path: str):
+     """Return duration (s), sample_rate, channels using soundfile."""
      try:
          info = sf.info(path)
          duration = info.frames / info.samplerate
+         return {
+             "duration": duration,
+             "samplerate": info.samplerate,
+             "channels": info.channels,
+         }
      except Exception:
          return None

  def ensure_wav_16k_mono(src_path: str, dest_path: str):
      """
+     Convert any audio to WAV PCM16, 16kHz, mono using ffmpeg.
      """
+     cmd = (
+         f'ffmpeg -v error -y -i "{src_path}" '
+         f'-ar 16000 -ac 1 -acodec pcm_s16le "{dest_path}"'
+     )
+     proc = subprocess.run(
+         shlex.split(cmd),
+         stdout=subprocess.PIPE,
+         stderr=subprocess.PIPE,
+     )
      if proc.returncode != 0:
+         raise RuntimeError(
+             f"ffmpeg convert failed: {proc.stderr.decode(errors='ignore')}"
+         )
      return dest_path

  def make_temp_path(suffix=".wav"):
+     """Generate unique temp file path under TMP_DIR."""
      return str(Path(TMP_DIR) / f"{uuid.uuid4().hex}{suffix}")
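Typical usage of the conversion helpers, as a hedged sketch (assumes ffmpeg on PATH and a local input file sample.mp3, which is not part of the commit):

```python
# normalize arbitrary audio to the 16 kHz mono WAV the model expects
from app.core.audio_utils import ensure_wav_16k_mono, get_audio_info, make_temp_path

wav = make_temp_path(suffix=".wav")
ensure_wav_16k_mono("sample.mp3", wav)
print(get_audio_info(wav))  # e.g. {'duration': 12.3, 'samplerate': 16000, 'channels': 1}
```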
app/core/chunking.py ADDED
@@ -0,0 +1,36 @@
+ # Audio chunking/splitting/merging logic
+
+ import shlex
+ import subprocess
+ from typing import List
+ from app.core.audio_utils import get_audio_info, make_temp_path
+
+ def ffmpeg_extract_segment(src: str, start: float, duration: float, dst: str):
+     """
+     Extract segment [start, start+duration) using ffmpeg into dst (wav 16k mono pcm16).
+     """
+     cmd = f'ffmpeg -v error -y -ss {start:.3f} -i "{src}" -t {duration:.3f} -ar 16000 -ac 1 -acodec pcm_s16le "{dst}"'
+     proc = subprocess.run(shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+     if proc.returncode != 0:
+         raise RuntimeError(f"ffmpeg extract failed: {proc.stderr.decode(errors='ignore')}")
+     return dst
+
+ def split_audio_to_chunks(src_wav: str, chunk_length_s: float = 30.0, overlap_s: float = 5.0) -> List[str]:
+     info = get_audio_info(src_wav)
+     if not info:
+         raise RuntimeError("Cannot read audio info")
+     duration = info["duration"]
+     step = chunk_length_s - overlap_s
+     if step <= 0:
+         raise ValueError("chunk_length_s must be > overlap_s")
+     starts = []
+     t = 0.0
+     while t < duration:
+         starts.append(t)
+         t += step
+     chunks = []
+     for i, s in enumerate(starts):
+         chunk_path = make_temp_path(suffix=f"_chunk{i}.wav")
+         ffmpeg_extract_segment(src_wav, s, min(chunk_length_s, duration - s), chunk_path)
+         chunks.append(chunk_path)
+     return chunks
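A worked example of the chunk timing: with chunk_length_s=30 and overlap_s=5 the step is 25 s, so a 75 s file yields starts at 0, 25, and 50, i.e. three overlapping chunks (the same figures the deleted tests asserted):

```python
# same arithmetic as split_audio_to_chunks, without the ffmpeg calls
duration, chunk_length_s, overlap_s = 75.0, 30.0, 5.0
step = chunk_length_s - overlap_s  # 25.0
starts, t = [], 0.0
while t < duration:
    starts.append(t)
    t += step
print(starts)  # [0.0, 25.0, 50.0] -> chunks [0,30), [25,55), [50,75)
```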
app/infra/metrics.py ADDED
@@ -0,0 +1,32 @@
+ from prometheus_client import Counter, Histogram
+
+ REQUEST_COUNT = Counter(
+     "asr_requests_total",
+     "Total ASR requests",
+     ["endpoint", "status"]
+ )
+
+ REQUEST_LATENCY = Histogram(
+     "asr_request_latency_seconds",
+     "ASR request latency",
+     ["endpoint"]
+ )
+
+ ASR_DURATION = Histogram(
+     "asr_model_duration_seconds",
+     "ASR model inference duration",
+     ["endpoint"]
+ )
+
+ NORMALIZE_DURATION = Histogram(
+     "normalize_duration_seconds",
+     "Text normalization duration",
+     ["endpoint"]
+ )
+
+ ERROR_COUNT = Counter(
+     "asr_error_total",
+     "Total ASR errors",
+     ["endpoint", "error_type"]
+ )
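These metrics are consumed in app/api/transcribe.py; the pattern is the standard prometheus_client label-then-observe idiom, sketched here in isolation:

```python
import time
from app.infra.metrics import REQUEST_COUNT, REQUEST_LATENCY

with REQUEST_LATENCY.labels("/transcribe").time():  # records elapsed seconds
    time.sleep(0.1)  # stand-in for the actual request handling
REQUEST_COUNT.labels("/transcribe", "success").inc()
```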
app/infra/redis_client.py ADDED
@@ -0,0 +1,8 @@
+ import os
+ import redis
+ from app.config.settings import REDIS_URL
+
+ redis_client = redis.Redis.from_url(
+     REDIS_URL,
+     decode_responses=True
+ )
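One caveat worth flagging: decode_responses=True suits the string cache reads in text_normalizer, but RQ expects a binary (non-decoding) connection for its job payloads. A sketch of a separate connection for the queue, under that assumption (this companion connection is not part of the commit):

```python
# hypothetical companion connection for RQ; the decoding client above stays for caching
import redis
from app.config.settings import REDIS_URL

rq_redis = redis.Redis.from_url(REDIS_URL)  # decode_responses defaults to False
```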
app/jobs/transcribe_job.py ADDED
@@ -0,0 +1,27 @@
+ from app.core.asr_engine import load_model, transcribe_file
+ from app.services.text_normalizer import normalize_text
+ from app.services.note_client import NoteServiceClient
+
+ # This function is executed by the RQ worker
+ def transcribe_job(tmp_wav: str, note_id: str):
+     model = load_model()
+     text = transcribe_file(model, tmp_wav, 30.0, 5.0)
+     # normalize_text may be async, but RQ runs jobs synchronously, so drive an event loop when needed
+     import asyncio
+     if asyncio.iscoroutinefunction(normalize_text):
+         normalized = asyncio.run(normalize_text(text))
+     else:
+         normalized = normalize_text(text)
+     note_service = NoteServiceClient()
+     # send the transcript to the Note Service; save_transcript is a coroutine,
+     # so it must likewise be driven to completion here
+     asyncio.run(note_service.save_transcript(
+         note_id=note_id,
+         raw_text=text,
+         normalized_text=normalized,
+         duration=None,
+         sample_rate=None,
+         chunks=None,
+         asr_model="PhoWhisper-base",
+         normalization_model="gemini-1.5"
+     ))
+     return True
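The queued path in /transcribe only completes if a worker is listening on the "asr" queue. A minimal launcher sketch, assuming the binary-connection caveat noted under app/infra/redis_client.py (this file is hypothetical, not part of the commit):

```python
# run_worker.py -- hypothetical entry point
import redis
from rq import Queue, Worker
from app.config.settings import REDIS_URL

conn = redis.Redis.from_url(REDIS_URL)  # RQ wants bytes, not decoded strings
Worker([Queue("asr", connection=conn)], connection=conn).work()
```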
app/main.py CHANGED
@@ -1,16 +1,24 @@
- import os
- import shutil
- from fastapi import FastAPI, UploadFile, File, HTTPException, Request
- from fastapi.responses import JSONResponse
- from fastapi.middleware.cors import CORSMiddleware
- from pathlib import Path
  import logging
- from .config import TMP_DIR, MAX_UPLOAD_BYTES, MAX_DURATION_SECS
- from .audio_utils import save_upload_file, download_file_from_url, get_audio_info, ensure_wav_16k_mono, make_temp_path
- from .model import load_model, transcribe_file, transcribe_file_chunks

  app = FastAPI(title="PhoWhisper ASR API")

  # CORS — tighten in prod
  app.add_middleware(
      CORSMiddleware,
@@ -19,99 +27,22 @@ app.add_middleware(
      allow_headers=["*"],
  )

- # load model lazily on first request to avoid startup heavy cost
- MODEL = None

- @app.on_event("startup")
- def on_startup():
-     # ensure tmp dir exists
-     Path(TMP_DIR).mkdir(parents=True, exist_ok=True)
-     # optionally pre-load model (comment if you want lazy)
-     # global MODEL
-     # MODEL = load_model()
-     logging.info("API startup complete")

  @app.get("/health")
  def health():
      return {"status": "ok"}

- @app.post("/transcribe")
- async def transcribe(file: UploadFile = File(...)):
-     # 1. basic size check: UploadFile does not expose size easily, so stream-write and check
-     tmp_in = make_temp_path(suffix=Path(file.filename).suffix or ".wav")
-     try:
-         save_upload_file(file, tmp_in)
-         if os.path.getsize(tmp_in) > MAX_UPLOAD_BYTES:
-             raise HTTPException(status_code=413, detail="File size exceeds limit")
-         info = get_audio_info(tmp_in)
-         if info and info.get("duration") and info["duration"] > MAX_DURATION_SECS:
-             raise HTTPException(status_code=413, detail="Audio duration exceeds limit")
-         # 2. normalize
-         tmp_wav = make_temp_path(suffix=".wav")
-         ensure_wav_16k_mono(tmp_in, tmp_wav)
-         # 3. load model if needed
-         global MODEL
-         if MODEL is None:
-             MODEL = load_model(chunk_length_s=30)
-         text = transcribe_file(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
-         chunks = transcribe_file_chunks(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
-         info2 = get_audio_info(tmp_wav) or {}
-         return JSONResponse({
-             "text": text,
-             "duration": info2.get("duration"),
-             "sample_rate": info2.get("samplerate"),
-             "chunks": chunks
-         })
-     except HTTPException:
-         raise
-     except Exception as e:
-         logging.exception("Transcribe failed")
-         raise HTTPException(status_code=500, detail=str(e))
-     finally:
-         # cleanup temp files
-         for p in [tmp_in, locals().get("tmp_wav")]:
-             try:
-                 if p and os.path.exists(p):
-                     os.remove(p)
-             except Exception:
-                 pass

- @app.post("/transcribe-url")
- async def transcribe_url(payload: dict):
-     audio_url = payload.get("audio_url")
-     if not audio_url:
-         raise HTTPException(status_code=400, detail="audio_url required")
-     tmp_in = make_temp_path(suffix=Path(audio_url).suffix or ".tmp")
-     try:
-         download_file_from_url(audio_url, tmp_in)
-         if os.path.getsize(tmp_in) > MAX_UPLOAD_BYTES:
-             raise HTTPException(status_code=413, detail="File size exceeds limit")
-         info = get_audio_info(tmp_in)
-         if info and info.get("duration") and info["duration"] > MAX_DURATION_SECS:
-             raise HTTPException(status_code=413, detail="Audio duration exceeds limit")
-         tmp_wav = make_temp_path(suffix=".wav")
-         ensure_wav_16k_mono(tmp_in, tmp_wav)
-         global MODEL
-         if MODEL is None:
-             MODEL = load_model(chunk_length_s=30)
-         text = transcribe_file(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
-         chunks = transcribe_file_chunks(MODEL, tmp_wav, max_chunk_length=30.0, overlap_s=5.0)
-         info2 = get_audio_info(tmp_wav) or {}
-         return JSONResponse({
-             "text": text,
-             "duration": info2.get("duration"),
-             "sample_rate": info2.get("samplerate"),
-             "chunks": chunks
-         })
-     except HTTPException:
-         raise
-     except Exception as e:
-         logging.exception("Transcribe-url failed")
-         raise HTTPException(status_code=500, detail=str(e))
-     finally:
-         for p in [tmp_in, locals().get("tmp_wav")]:
-             try:
-                 if p and os.path.exists(p):
-                     os.remove(p)
-             except Exception:
-                 pass

+
+ from fastapi import FastAPI, Response
+ from prometheus_client import generate_latest
+ import asyncio
  import logging
+ from fastapi.middleware.cors import CORSMiddleware
+ from app.api.transcribe import router as transcribe_router
+ from app.core.asr_engine import load_model
+

  app = FastAPI(title="PhoWhisper ASR API")

+ # Preload ASR model at startup
+ @app.on_event("startup")
+ async def preload_asr_model():
+     # Load model in thread to avoid blocking event loop
+     logging.info("Preloading ASR model at startup...")
+     await asyncio.to_thread(load_model, 30)
+     logging.info("ASR model preloaded.")
+
+
  # CORS — tighten in prod
  app.add_middleware(
      CORSMiddleware,
      allow_headers=["*"],
  )

+ # --- OLD LOGIC: moved to app/api/transcribe.py ---
+ # - endpoints were defined directly in this module
+ # - it contained all of the processing logic
+ # - now refactored into a dedicated router with separate core/service layers

+ # Health check (can be kept here if desired)
  @app.get("/health")
  def health():
      return {"status": "ok"}

+ # Expose /metrics endpoint for Prometheus
+ @app.get("/metrics")
+ def metrics():
+     return Response(generate_latest(), media_type="text/plain")
+
+ # Include API routers
+ app.include_router(transcribe_router)
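For local runs outside Docker, the Dockerfile's CMD translates to a one-liner; a sketch:

```python
# equivalent of: uvicorn app.main:app --host 0.0.0.0 --port 7860 --workers 1
import uvicorn

if __name__ == "__main__":
    uvicorn.run("app.main:app", host="0.0.0.0", port=7860, workers=1)
```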
app/schemas/__init__.py ADDED
File without changes
app/schemas/transcribe.py ADDED
@@ -0,0 +1,14 @@
+ # Request/Response models for transcription
+
+ from pydantic import BaseModel
+ from typing import List, Optional
+
+ class Chunk(BaseModel):
+     start: float
+     end: float
+     text: str
+
+ class TranscribeResponse(BaseModel):
+     note_id: str
+     status: str
+     duration: Optional[float] = None
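The response model shapes the JSON bodies both endpoints return; constructing one directly is a quick sanity check (pydantic v2 shown; on v1 use .dict() instead of .model_dump()):

```python
from app.schemas.transcribe import TranscribeResponse

r = TranscribeResponse(note_id="demo-note", status="queued", duration=134.2)
print(r.model_dump())  # {'note_id': 'demo-note', 'status': 'queued', 'duration': 134.2}
```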
app/services/__init__.py ADDED
File without changes
app/services/note_client.py ADDED
@@ -0,0 +1,48 @@
+ import httpx
+ from app.config.settings import NOTE_SERVICE_URL, HTTPX_TIMEOUT
+ from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception
+
+ class NoteServiceClient:
+     def __init__(self, base_url: str = None):
+         self.base_url = (base_url or NOTE_SERVICE_URL).rstrip("/")
+
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, min=1, max=8),
+         reraise=True,
+         retry=retry_if_exception(
+             lambda e: (
+                 isinstance(e, httpx.RequestError) or
+                 (isinstance(e, httpx.HTTPStatusError) and 500 <= e.response.status_code < 600)
+             )
+         )
+     )
+     async def save_transcript(self, note_id: str, raw_text: str, normalized_text: str,
+                               duration: float, sample_rate: int, chunks: list,
+                               asr_model: str = "PhoWhisper-base",
+                               normalization_model: str = "gemini-1.5"):
+         url = f"{self.base_url}/notes/{note_id}/transcript"
+         payload = {
+             "raw_text": raw_text,
+             "normalized_text": normalized_text,
+             "duration": duration,
+             "sample_rate": sample_rate,
+             "chunks": chunks,
+             "asr_model": asr_model,
+             "normalization_model": normalization_model
+         }
+         timeout = httpx.Timeout(HTTPX_TIMEOUT)
+         async with httpx.AsyncClient(timeout=timeout) as client:
+             try:
+                 resp = await client.post(url, json=payload)
+                 resp.raise_for_status()
+                 return resp.json()
+             except httpx.HTTPStatusError:
+                 # re-raise; the retry predicate above only retries 5xx responses
+                 raise
+             except httpx.RequestError:
+                 # re-raise; network errors are retried by the predicate
+                 raise
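Calling the client outside FastAPI requires driving an event loop; a hedged sketch (assumes a Note Service listening at NOTE_SERVICE_URL; the sample values are illustrative):

```python
import asyncio
from app.services.note_client import NoteServiceClient

async def main():
    client = NoteServiceClient()
    await client.save_transcript(
        note_id="demo-note",
        raw_text="xin chao cac ban",
        normalized_text="Xin chào các bạn.",
        duration=1.8,
        sample_rate=16000,
        chunks=[{"start": 0.0, "end": 1.8, "text": "xin chao cac ban"}],
    )

asyncio.run(main())
```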
app/services/text_normalizer.py ADDED
@@ -0,0 +1,49 @@
+ from app.infra.redis_client import redis_client
+ from app.utils.hashing import sha256
+
+ CACHE_TTL = 60 * 60 * 24 * 3  # 3 days
+
+ # Simple in-memory cache (could be replaced with Redis etc. later)
+ # _normalize_cache = {}
+
+ # --- Real Gemini client (Google GenerativeAI) ---
+
+ import google.generativeai as genai
+ from app.config.settings import GEMINI_API_KEY
+
+ if GEMINI_API_KEY:
+     genai.configure(api_key=GEMINI_API_KEY)
+     _gemini_model = genai.GenerativeModel("gemini-pro")
+ else:
+     _gemini_model = None
+
+ async def normalize_text(raw_text: str) -> str:
+     cache_key = f"normalize:{sha256(raw_text)}"
+     cached = redis_client.get(cache_key)
+     if cached:
+         return cached
+
+     # Vietnamese prompt; it instructs: "You are a Vietnamese transcript
+     # normalization system. Do NOT add new ideas, keep the content unchanged,
+     # only fix spelling, punctuation, and reasonable line breaks."
+     prompt = f"""
+     Bạn là hệ thống chuẩn hóa transcript tiếng Việt.
+     - KHÔNG thêm ý mới
+     - Giữ nguyên nội dung
+     - Chỉ sửa chính tả, dấu câu, xuống dòng hợp lý
+
+     Văn bản:
+     {raw_text}
+     """
+     result = raw_text
+     if _gemini_model:
+         # Google GenerativeAI Gemini API (synchronous, wrap in thread for async)
+         import asyncio
+         loop = asyncio.get_event_loop()
+         def call_gemini():
+             response = _gemini_model.generate_content(prompt)
+             return response.text.strip() if hasattr(response, 'text') else str(response)
+         result = await loop.run_in_executor(None, call_gemini)
+     else:
+         # Gemini not configured: return the original text
+         result = raw_text
+     result = result.strip()
+     redis_client.setex(cache_key, CACHE_TTL, result)
+     return result
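End to end, the normalizer needs a reachable Redis for the cache and optionally GEMINI_API_KEY; without the key it degrades to returning the input unchanged. A hedged sketch:

```python
# exercises the cache-then-Gemini path (assumes Redis is up per REDIS_URL)
import asyncio
from app.services.text_normalizer import normalize_text

print(asyncio.run(normalize_text("xin chao cac ban hom nay")))
```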
app/utils/hashing.py ADDED
@@ -0,0 +1,7 @@
+
+ # Hashing utilities for cache keys, helpers
+
+ import hashlib
+
+ def sha256(text: str) -> str:
+     return hashlib.sha256(text.encode('utf-8')).hexdigest()
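The helper just wraps hashlib, so cache keys are stable hex digests of the input text:

```python
from app.utils.hashing import sha256

print(sha256("hello"))
# 2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824
```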
requirements.txt CHANGED
@@ -5,5 +5,11 @@ torch
  soundfile
  python-multipart
  requests
  numpy
  pytest

  soundfile
  python-multipart
  requests
+ httpx
+ redis
+ rq
+ tenacity
+ prometheus-client
+ google-generativeai
  numpy
  pytest
test/conftest.py CHANGED
@@ -1,57 +1,11 @@
- import numpy as np
- import soundfile as sf
- from pathlib import Path
- import os
- import tempfile
- import re
  import pytest

- # helper to synthesize a sine wave (mono) wav file
- def generate_sine_wav(path: str, duration_s: float, sr: int = 16000, freq: float = 440.0, amplitude: float = 0.2):
-     t = np.linspace(0, duration_s, int(sr * duration_s), endpoint=False)
-     data = amplitude * np.sin(2 * np.pi * freq * t)
-     sf.write(path, data, sr, subtype='PCM_16')
-
- # helper to generate silence (zeros)
- def generate_silence_wav(path: str, duration_s: float, sr: int = 16000):
-     import numpy as np
-     data = np.zeros(int(sr * duration_s), dtype=float)
-     sf.write(path, data, sr, subtype='PCM_16')
-
- @pytest.fixture
- def tmp_audio_dir(tmp_path):
-     d = tmp_path / "audio"
-     d.mkdir()
-     return d
-
- # Fake model: examine file path, if path contains ".chunk{N}." then return predictable text.
- # Example chunk file name "...chunk3.wav" -> "chunk3_text"
- def fake_model_from_path(path: str):
-     name = str(path)
-     m = re.search(r"chunk(\d+)", name)
-     if m:
-         i = int(m.group(1))
-         # produce a text designed to create overlaps:
-         # chunk0 -> "alpha beta gamma"
-         # chunk1 -> "beta gamma delta"
-         # chunk2 -> "gamma delta epsilon"
-         # general pattern derived from i
-         words = [
-             ["alpha", "beta", "gamma"],
-             ["beta", "gamma", "delta"],
-             ["gamma", "delta", "epsilon"],
-             ["delta", "epsilon", "zeta"],
-         ]
-         w = words[i % len(words)]
-         return {"text": " ".join(w)}
-     # fallback: return a simple marker based on filename
-     return {"text": Path(name).stem}
-
- # A fake pipeline object: callable that takes a file path and returns dict {"text": ...}
- class FakePipeline:
-     def __call__(self, path):
-         return fake_model_from_path(path)
-
- @pytest.fixture
- def fake_pipeline():
-     return FakePipeline()

  import pytest
+ import tempfile
+ import os

+ @pytest.fixture(autouse=True)
+ def mock_env(monkeypatch):
+     monkeypatch.setenv("TMP_DIR", tempfile.gettempdir())
+     monkeypatch.setenv("MAX_UPLOAD_BYTES", "1048576")
+     monkeypatch.setenv("MAX_DURATION_SECS", "3600")
+     monkeypatch.setenv("NOTE_SERVICE_URL", "http://note")
+     monkeypatch.setenv("REDIS_URL", "redis://localhost:6379/0")
test/test_long_performance.py DELETED
@@ -1,21 +0,0 @@
- import os
- import time
- import pytest
- from pathlib import Path
- from conftest import generate_sine_wav
- from app.model import transcribe_file
-
- @pytest.mark.skipif(os.getenv("RUN_LONG_TESTS", "0") != "1", reason="long tests disabled by default")
- def test_very_long_audio_runtime(tmp_path, fake_pipeline):
-     # create 10min (600s) wav — heavy; enabled only if RUN_LONG_TESTS=1
-     p = tmp_path / "very_long.wav"
-     generate_sine_wav(str(p), duration_s=600.0)  # 10 minutes
-     # measure time per minute
-     start = time.time()
-     text = transcribe_file(fake_pipeline, str(p), max_chunk_length=30.0, overlap_s=5.0)
-     elapsed = time.time() - start
-     # compute approx seconds per minute of audio processed
-     avg_sec_per_min = elapsed / (600.0 / 60.0)
-     print(f"Elapsed {elapsed:.2f}s; avg seconds per audio-minute: {avg_sec_per_min:.2f}")
-     # Basic assert: completed and returned a string
-     assert isinstance(text, str)
test/test_short_and_chunk.py DELETED
@@ -1,46 +0,0 @@
- import os
- from pathlib import Path
- import pytest
- from app.model import transcribe_file, _split_audio_to_chunks, _merge_transcripts
- from conftest import generate_sine_wav
-
- def test_short_audio_direct(tmp_path, fake_pipeline):
-     # create short wav: 5s
-     p = tmp_path / "short.wav"
-     generate_sine_wav(str(p), duration_s=5.0)
-     # call transcribe_file with chunk threshold 30s -> should not chunk
-     text = transcribe_file(fake_pipeline, str(p), max_chunk_length=30.0, overlap_s=5.0)
-     # fake pipeline returns filename-stem as text for non-chunk files
-     assert "short" in text or len(text) > 0
-
- def test_chunk_split_and_merge(tmp_path):
-     # Create audio ~75s to force chunking into 3 chunks with (L=30, O=5) -> starts 0,25,50
-     p = tmp_path / "long75.wav"
-     # 75s sine; note: generating >60s may be heavy; for CI shorten if needed
-     from conftest import generate_sine_wav
-     generate_sine_wav(str(p), duration_s=75.0)
-     # Use internal split function to inspect chunking
-     chunks = _split_audio_to_chunks(str(p), chunk_length_s=30.0, overlap_s=5.0)
-     # Expect at least 3 chunks
-     assert len(chunks) >= 3
-     # Simulate pipeline outputs for each chunk like in fake_model_from_path
-     simulated_texts = []
-     for idx, c in enumerate(chunks):
-         # derive same pattern as fake_model_from_path: chunk{i} -> list words
-         # We just test merging behavior: create overlapping words
-         if idx == 0:
-             simulated_texts.append("alpha beta gamma")
-         elif idx == 1:
-             simulated_texts.append("beta gamma delta")
-         elif idx == 2:
-             simulated_texts.append("gamma delta epsilon")
-         else:
-             simulated_texts.append(f"chunk{idx}")
-     # Merge one by one
-     merged = ""
-     for t in simulated_texts:
-         merged = _merge_transcripts(merged, t, max_overlap_words=5)
-     # After merging, no duplicate immediate sequences like "beta gamma beta gamma"
-     assert "beta gamma beta" not in merged
-     # Ensure merged contains parts from all chunks
-     assert "alpha" in merged and "epsilon" in merged
test/test_silence_and_overlap.py DELETED
@@ -1,12 +0,0 @@
- from app.model import _merge_transcripts
- import pytest
-
- def test_silence_edge_overlap():
-     # Simulate chunk outputs where chunk1 ends with filler repeated and chunk2 starts with filler
-     a = "hello um um"
-     b = "um um good morning"
-     merged = _merge_transcripts(a, b, max_overlap_words=4)
-     # Should not create triple 'um' (heuristic will remove overlap)
-     assert "um um um" not in merged
-     # Should still contain core words
-     assert "hello" in merged and "good" in merged