bichnhan2701 commited on
Commit
7701a0c
·
1 Parent(s): 7158b5e

update phowhisper version

Browse files
app/config/settings.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
 
3
  MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_BYTES", 100 * 1024 * 1024))
4
  MAX_DURATION_SECS = int(os.getenv("MAX_DURATION_SECS", 60 * 60))
@@ -23,3 +24,10 @@ REDIS_URL = os.getenv(
23
  )
24
 
25
  HTTPX_TIMEOUT = float(os.getenv("HTTPX_TIMEOUT", "10.0"))
 
 
 
 
 
 
 
 
1
  import os
2
+ from pydantic import BaseSettings
3
 
4
  MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_BYTES", 100 * 1024 * 1024))
5
  MAX_DURATION_SECS = int(os.getenv("MAX_DURATION_SECS", 60 * 60))
 
24
  )
25
 
26
  HTTPX_TIMEOUT = float(os.getenv("HTTPX_TIMEOUT", "10.0"))
27
+
28
class Settings(BaseSettings):
    """
    Cloudinary credentials, populated from environment variables by pydantic.

    NOTE(review): `BaseSettings` moved to the separate `pydantic-settings`
    package in pydantic v2 — confirm the pinned pydantic version supports
    this import.
    """

    CLOUDINARY_CLOUD_NAME: str
    CLOUDINARY_API_KEY: str
    CLOUDINARY_API_SECRET: str


# Instantiated at import time; raises a validation error if any of the
# required environment variables is missing.
settings = Settings()
app/core/asr_engine.py CHANGED
@@ -1,189 +1,112 @@
1
- # PhoWhisper inference engine
2
-
3
  import logging
 
 
 
4
  from transformers import pipeline
5
- from app.config.settings import MODEL_NAME
6
- from app.core.chunking import split_audio_to_chunks, ffmpeg_extract_segment
7
- from app.core.audio_utils import make_temp_path
8
- import os
9
- from concurrent.futures import ThreadPoolExecutor, as_completed
10
- from app.core.audio_utils import get_audio_info, make_temp_path
11
-
12
- _model = None
13
-
14
- def load_model(chunk_length_s: int = None):
15
- global _model
16
- if _model is None:
17
- logging.info(f"Loading ASR model {MODEL_NAME} ...")
18
- kwargs = {}
19
- if chunk_length_s is not None:
20
- kwargs["chunk_length_s"] = chunk_length_s
21
- _model = pipeline("automatic-speech-recognition", MODEL_NAME, **kwargs)
22
- logging.info("Model loaded")
23
- return _model
24
-
25
- def merge_chunks(chunks, max_overlap_words=12):
26
- merged = []
27
-
28
- for ch in chunks:
29
- if not merged:
30
- merged.append(ch)
31
- continue
32
 
33
- prev = merged[-1]
34
- merged_text = merge_transcripts(
35
- prev["text"],
36
- ch["text"],
37
- max_overlap_words=max_overlap_words
38
- )
39
 
40
- if merged_text != prev["text"]:
41
- prev["text"] = merged_text
42
- prev["end"] = ch["end"]
43
- else:
44
- merged.append(ch)
45
 
46
- return merged
47
 
48
- def normalize_chunks(chunks):
49
- normalized = []
50
- last_end = 0.0
 
 
 
 
 
51
 
52
- for ch in chunks:
53
- start = max(ch["start"], last_end)
54
- end = max(start, ch["end"])
55
 
56
- text = ch["text"].strip()
57
- if not text:
58
- continue
59
 
60
- normalized.append({
61
- "start": round(start, 3),
62
- "end": round(end, 3),
63
- "text": text
64
- })
65
-
66
- last_end = end
67
-
68
- return normalized
69
-
70
- # Heuristic merge for chunked transcripts
71
- def merge_transcripts(prev_text: str, new_text: str, max_overlap_words: int = 8) -> str:
72
- if not prev_text:
73
- return new_text
74
- p_words = prev_text.strip().split()
75
- n_words = new_text.strip().split()
76
- max_ol = min(max_overlap_words, len(p_words), len(n_words))
77
- best_k = 0
78
- for k in range(max_ol, 0, -1):
79
- if p_words[-k:] == n_words[:k]:
80
- best_k = k
81
- break
82
- if best_k > 0:
83
- merged = " ".join(p_words + n_words[best_k:])
84
- return merged
85
- for k in range(max_ol, 1, -1):
86
- seq = " ".join(p_words[-k:])
87
- if seq in new_text:
88
- idx = new_text.find(seq)
89
- merged = " ".join(p_words + new_text[idx + len(seq):].strip().split())
90
- return merged
91
- return prev_text.rstrip() + " " + new_text.lstrip()
92
-
93
- def transcribe_long_audio(model, wav_path: str, chunk_length_s: float = 30.0, overlap_s: float = 5.0, parallel: bool = False) -> str:
94
- chunks = split_audio_to_chunks(wav_path, chunk_length_s=chunk_length_s, overlap_s=overlap_s)
95
- logging.info(f"Split into {len(chunks)} chunks")
96
- texts = []
97
- if parallel:
98
- def process_chunk(path):
99
- try:
100
- out = model(path)
101
- if isinstance(out, dict):
102
- return out.get("text", "")
103
- return str(out)
104
- except Exception as e:
105
- logging.exception("Chunk inference failed")
106
- return ""
107
- with ThreadPoolExecutor(max_workers=2) as ex:
108
- futures = {ex.submit(process_chunk, c): c for c in chunks}
109
- for fut in as_completed(futures):
110
- texts.append(fut.result() or "")
111
- else:
112
- for c in chunks:
113
- out = model(c)
114
- if isinstance(out, dict):
115
- texts.append(out.get("text", "") or "")
116
- else:
117
- texts.append(str(out) or "")
118
- merged = ""
119
- for t in texts:
120
- merged = merge_transcripts(merged, t, max_overlap_words=12)
121
- for c in chunks:
122
- try:
123
- os.remove(c)
124
- except Exception:
125
- pass
126
- return merged
127
-
128
- def transcribe_file(model, wav_path: str, max_chunk_length: float = 30.0, overlap_s: float = 5.0):
129
- info = get_audio_info(wav_path) or {}
130
- duration = info.get("duration", 0.0)
131
- if duration and duration > max_chunk_length * 1.1:
132
- logging.info(f"Long audio detected ({duration}s) -> chunking")
133
- return transcribe_long_audio(model, wav_path, chunk_length_s=max_chunk_length, overlap_s=overlap_s)
134
- out = model(wav_path)
135
- if isinstance(out, dict):
136
- return out.get("text") or ""
137
- return str(out)
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  def transcribe_file_chunks(
140
  model,
141
  wav_path: str,
142
- max_chunk_length: float = 30.0,
143
- overlap_s: float = 5.0,
144
- ):
145
- info = get_audio_info(wav_path) or {}
146
- duration = info.get("duration", 0.0)
147
-
148
- step = max_chunk_length - overlap_s
149
- if step <= 0:
150
- raise ValueError("max_chunk_length must be > overlap_s")
151
-
152
- starts = []
153
- t = 0.0
154
- while t < duration:
155
- starts.append(t)
156
- t += step
157
-
158
- raw_chunks = []
159
-
160
- for i, s in enumerate(starts):
161
- chunk_end = min(s + max_chunk_length, duration)
162
- dst = make_temp_path(suffix=f".chunk{i}.wav")
163
-
164
- ffmpeg_extract_segment(wav_path, s, chunk_end - s, dst)
165
-
166
- out = model(dst)
167
- text = out.get("text", "") if isinstance(out, dict) else str(out)
168
-
169
- raw_chunks.append({
170
- "start": s,
171
- "end": chunk_end,
172
- "text": text
173
- })
174
-
175
- try:
176
- os.remove(dst)
177
- except Exception:
178
- pass
179
-
180
- # 🔽 CHUỖI XỬ LÝ CHUẨN
181
- merged = merge_chunks(raw_chunks)
182
- normalized = normalize_chunks(merged)
183
- logging.info(
184
- "ASR result: raw=%d merged=%d normalized=%d",
185
- len(raw_chunks),
186
- len(merged),
187
- len(normalized),
188
  )
189
- return normalized
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import logging
2
+ from typing import List, Dict
3
+
4
+ import torch
5
  from transformers import pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ logger = logging.getLogger(__name__)
 
 
 
 
 
8
 
9
+ # ===============================
10
+ # Global model cache
11
+ # ===============================
12
+ _ASR_MODEL = None
 
13
 
 
14
 
15
def load_model(
    chunk_length_s: float = 30.0,
    model_name: str = "vinai/PhoWhisper-base",
):
    """
    Load the ASR pipeline once and cache it at module level.

    Safe to call multiple times. NOTE: arguments are only honored on the
    first call — subsequent calls return the already-cached pipeline
    regardless of the values passed.

    Args:
        chunk_length_s: chunk size in seconds used by the pipeline.
        model_name: Hugging Face model id to load. Parameterized (was
            hard-coded) so alternative PhoWhisper checkpoints can be used;
            the default preserves the previous behavior.

    Returns:
        A `transformers` automatic-speech-recognition pipeline.
    """
    global _ASR_MODEL
    if _ASR_MODEL is not None:
        return _ASR_MODEL

    logger.info("Loading ASR model %s", model_name)

    # Prefer GPU with half precision when available; fall back to CPU fp32.
    use_cuda = torch.cuda.is_available()
    device = 0 if use_cuda else -1
    torch_dtype = torch.float16 if use_cuda else torch.float32

    _ASR_MODEL = pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        device=device,
        torch_dtype=torch_dtype,
        chunk_length_s=chunk_length_s,
        return_timestamps=True,
    )

    logger.info(
        "ASR model loaded (device=%s)", "cuda" if use_cuda else "cpu"
    )
    return _ASR_MODEL
42
+
43
+
44
+ # ===============================
45
+ # Transcribe full text
46
+ # ===============================
47
def transcribe_file(
    model,
    wav_path: str,
    chunk_length_s: float = 30.0,
    stride_s: float = 5.0,
) -> str:
    """
    Run the ASR pipeline over a file and return the full transcript text.

    Args:
        model: a callable ASR pipeline accepting (path, chunk_length_s=...,
            stride_length_s=...) and returning a dict with a "text" key.
        wav_path: path to the audio file; empty/falsy paths short-circuit.
        chunk_length_s: chunk size in seconds forwarded to the pipeline.
        stride_s: stride/overlap in seconds forwarded to the pipeline.

    Returns:
        The stripped transcript, or "" when the path is empty or the
        pipeline produced no text.
    """
    # Guard clause: nothing to transcribe.
    if not wav_path:
        return ""

    result = model(
        wav_path,
        chunk_length_s=chunk_length_s,
        stride_length_s=stride_s,
    )

    transcript = result.get("text", "")
    if not transcript:
        return ""
    return transcript.strip()
67
+
68
+
69
+ # ===============================
70
+ # Transcribe chunks with timestamps
71
+ # ===============================
72
def transcribe_file_chunks(
    model,
    wav_path: str,
    chunk_length_s: float = 30.0,
    stride_s: float = 5.0,
) -> List[Dict]:
    """
    Run the ASR pipeline with timestamps and return a list of segments.

    Each segment has the shape ``{"start": float, "end": float, "text": str}``.
    Segments with empty text or a missing start/end timestamp are dropped.

    Args:
        model: a callable ASR pipeline returning a dict with a "chunks" list,
            where each chunk carries "timestamp" (start, end) and "text".
        wav_path: path to the audio file; empty/falsy paths short-circuit.
        chunk_length_s: chunk size in seconds forwarded to the pipeline.
        stride_s: stride/overlap in seconds forwarded to the pipeline.
    """
    # Guard clause: nothing to transcribe.
    if not wav_path:
        return []

    result = model(
        wav_path,
        chunk_length_s=chunk_length_s,
        stride_length_s=stride_s,
        return_timestamps=True,
    )

    segments = []
    for raw in result.get("chunks", []) or []:
        span = raw.get("timestamp") or [None, None]
        if len(span) == 2:
            begin, finish = span
        else:
            begin, finish = None, None

        cleaned = (raw.get("text") or "").strip()
        if not cleaned:
            continue

        # Skip open-ended segments (e.g. a trailing chunk without an end).
        if begin is None or finish is None:
            continue

        segments.append(
            {
                "start": float(begin),
                "end": float(finish),
                "text": cleaned,
            }
        )

    return segments
app/core/audio_utils.py CHANGED
@@ -1,11 +1,15 @@
1
  # Audio utilities: ffmpeg, normalization, etc.
 
2
  import subprocess
3
  import shlex
4
  import uuid
5
  import requests
6
  from pathlib import Path
7
  import soundfile as sf
8
- from app.config.settings import TMP_DIR, MAX_UPLOAD_BYTES
 
 
 
9
 
10
  def save_upload_file(upload_file, dest_path: str):
11
  """Save FastAPI UploadFile to dest_path (streaming)."""
@@ -64,3 +68,45 @@ def ensure_wav_16k_mono(src_path: str, dest_path: str):
64
  def make_temp_path(suffix=".wav"):
65
  """Generate unique temp file path under TMP_DIR."""
66
  return str(Path(TMP_DIR) / f"{uuid.uuid4().hex}{suffix}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Audio utilities: ffmpeg, normalization, etc.
2
+ from asyncio.log import logger
3
  import subprocess
4
  import shlex
5
  import uuid
6
  import requests
7
  from pathlib import Path
8
  import soundfile as sf
9
+ from app.config.settings import TMP_DIR, MAX_UPLOAD_BYTES, settings
10
+ import cloudinary
11
+ import cloudinary.uploader
12
+ import os
13
 
14
  def save_upload_file(upload_file, dest_path: str):
15
  """Save FastAPI UploadFile to dest_path (streaming)."""
 
68
  def make_temp_path(suffix=".wav"):
69
  """Generate unique temp file path under TMP_DIR."""
70
  return str(Path(TMP_DIR) / f"{uuid.uuid4().hex}{suffix}")
71
+
72
# Use a logger named for this module instead of the file-level
# `from asyncio.log import logger`, which borrows asyncio's internal logger.
import logging

_logger = logging.getLogger(__name__)

# Configure Cloudinary once at import time using validated settings.
cloudinary.config(
    cloud_name=settings.CLOUDINARY_CLOUD_NAME,
    api_key=settings.CLOUDINARY_API_KEY,
    api_secret=settings.CLOUDINARY_API_SECRET,
    secure=True,
)


def upload_temp_audio(
    local_path: str,
    *,
    folder: str = "asr_uploads",
    public_id: str | None = None,
    ttl: int = 3600,
) -> str:
    """
    Upload an audio file to Cloudinary and return its public HTTPS URL.

    The local file can safely be deleted after this returns.

    Args:
        local_path: path to the audio file on disk.
        folder: Cloudinary folder to upload into.
        public_id: explicit Cloudinary public id; auto-generated when None.
        ttl: intended lifetime in seconds. NOTE(review): currently unused —
            no expiry is applied to the upload; wire this up or remove it.

    Returns:
        The secure URL of the uploaded asset.

    Raises:
        FileNotFoundError: if local_path does not exist.
        RuntimeError: if Cloudinary does not return a secure URL.
    """
    if not os.path.exists(local_path):
        raise FileNotFoundError(local_path)

    _logger.info("Uploading audio to Cloudinary: %s", local_path)

    result = cloudinary.uploader.upload(
        local_path,
        resource_type="video",  # Cloudinary requires "video" for audio assets
        folder=folder,
        public_id=public_id,
        overwrite=True,
        invalidate=True,
    )

    url = result.get("secure_url")
    if not url:
        raise RuntimeError("Cloudinary upload failed")

    _logger.info("Uploaded audio -> %s", url)
    return url
app/jobs/transcribe_job.py CHANGED
@@ -1,41 +1,65 @@
1
  import asyncio
 
 
 
 
2
  from app.core.asr_engine import load_model, transcribe_file, transcribe_file_chunks
3
  from app.services.note_client import NoteServiceClient
4
  from app.core.audio_utils import get_audio_info
5
 
6
- def transcribe_job(wav_path: str, note_id: str, user_id: str | None = None):
 
 
 
 
 
 
 
 
 
 
 
 
7
  model = load_model()
8
 
9
- # 🔥 ASR giống hệt API sync
10
- text = transcribe_file(model, wav_path, 30.0, 5.0)
11
- chunks = transcribe_file_chunks(model, wav_path, 30.0, 5.0)
12
-
13
- # drop invalid chunks (defensive)
14
- chunks = [
15
- c for c in chunks
16
- if c.get("text", "").strip() and c.get("end", 0) > c.get("start", 0)
17
- ]
18
-
19
- note_status = "transcribed" if chunks else "error"
20
-
21
- info = get_audio_info(wav_path) or {}
22
-
23
- payload = {
24
- "note_id": note_id,
25
- "type": "audio",
26
- "status": note_status,
27
- "raw_text": text,
28
- "metadata": {
29
- "audio": {
30
- "duration": info.get("duration"),
31
- "sample_rate": info.get("samplerate"),
32
- "chunks": chunks,
33
- "asr_model": "PhoWhisper-base",
 
 
 
 
 
34
  },
35
- "client": {"user_id": user_id},
36
- },
37
- "generate": ["normalize", "keywords", "summary", "mindmap"],
38
- }
 
39
 
40
- client = NoteServiceClient()
41
- asyncio.run(client.create_audio_note(payload))
 
 
 
1
  import asyncio
2
+ import tempfile
3
+ import os
4
+ import requests
5
+
6
  from app.core.asr_engine import load_model, transcribe_file, transcribe_file_chunks
7
  from app.services.note_client import NoteServiceClient
8
  from app.core.audio_utils import get_audio_info
9
 
10
+
11
def download_audio(audio_url: str) -> str:
    """
    Download audio from a URL into a local temp file and return its path.

    Streams the response to disk in chunks instead of buffering the whole
    body in memory (the previous `r.content` approach holds the entire
    file in RAM, which is costly for long recordings). The temp file is
    removed if the download fails part-way, so failures don't leak files.

    Args:
        audio_url: HTTP(S) URL of the audio to fetch.

    Returns:
        Path to a temp ``.wav`` file; the caller owns and must delete it.

    Raises:
        requests.HTTPError: on a non-2xx response.
        requests.RequestException: on connection/timeout errors.
    """
    with requests.get(audio_url, timeout=30, stream=True) as resp:
        resp.raise_for_status()

        fd, path = tempfile.mkstemp(suffix=".wav")
        try:
            with os.fdopen(fd, "wb") as f:
                for chunk in resp.iter_content(chunk_size=1 << 20):
                    if chunk:
                        f.write(chunk)
        except Exception:
            # Don't leave a partial temp file behind on failure.
            try:
                os.remove(path)
            except OSError:
                pass
            raise

    return path
20
+
21
+
22
def transcribe_job(audio_url: str, note_id: str, user_id: str | None = None):
    """
    Background job: fetch audio from a URL, run ASR, and push the result
    to the note service.

    The downloaded temp file is always removed, even when transcription
    or the service call fails.

    Args:
        audio_url: public URL of the audio to transcribe.
        note_id: id of the note the transcript belongs to.
        user_id: optional id of the requesting user, forwarded as metadata.
    """
    model = load_model()

    wav_path = None
    try:
        # The worker fetches the audio itself and owns the temp file.
        wav_path = download_audio(audio_url)

        # Run ASR for the full transcript and for timestamped segments.
        text = transcribe_file(model, wav_path, 30.0, 5.0)
        segments = transcribe_file_chunks(model, wav_path, 30.0, 5.0)

        # Defensive filtering: keep only non-empty, well-ordered segments.
        segments = [
            seg for seg in segments
            if seg.get("text", "").strip() and seg.get("end", 0) > seg.get("start", 0)
        ]

        note_status = "transcribed" if segments else "error"
        info = get_audio_info(wav_path) or {}

        payload = {
            "note_id": note_id,
            "type": "audio",
            "status": note_status,
            "raw_text": text,
            "metadata": {
                "audio": {
                    "duration": info.get("duration"),
                    "sample_rate": info.get("samplerate"),
                    "chunks": segments,
                    "asr_model": "PhoWhisper-base",
                },
                "client": {"user_id": user_id},
            },
            "generate": ["normalize", "keywords", "summary", "mindmap"],
        }

        asyncio.run(NoteServiceClient().create_audio_note(payload))
    finally:
        # Clean up the downloaded temp file regardless of outcome.
        if wav_path and os.path.exists(wav_path):
            os.remove(wav_path)
+ os.remove(wav_path)
requirements.txt CHANGED
@@ -13,4 +13,5 @@ prometheus-client
13
  google-generativeai
14
  google-genai
15
  numpy
16
- pytest
 
 
13
  google-generativeai
14
  google-genai
15
  numpy
16
+ pytest
17
+ cloudinary