Spaces:

vyluong
/

PoC_ASR_v5

Sleeping

App Files Community

colab-user committed on Jan 27

Commit

fe21ffa

1 Parent(s): 8ce75a0

fix model transcription

Browse files

Files changed (4) hide show

app/api/routes.py +1 -1
app/core/config.py +1 -6
app/main.py +1 -1
app/services/transcription.py +71 -137

app/api/routes.py CHANGED Viewed

@@ -37,7 +37,7 @@ async def get_models():
     """Get available Whisper models."""
     return {
         "models": list(AVAILABLE_MODELS.keys()),
-        "default": settings.default_whisper_model
     }

     """Get available Whisper models."""
     return {
         "models": list(AVAILABLE_MODELS.keys()),
+        "default": settings.whisper_lora_model_dir
     }

app/core/config.py CHANGED Viewed

@@ -30,12 +30,7 @@ class Settings(BaseSettings):
     enable_vocal_separation: bool = True
     mdx_model: str = "Kim_Vocal_2.onnx"  # High quality vocal isolation
-    # Available Whisper models
-    available_whisper_models: Dict[str, str] = {
-        "EraX-WoW-Turbo": "erax-ai/EraX-WoW-Turbo-V1.1-CT2",
-        "PhoWhisper Large": "kiendt/PhoWhisper-large-ct2"
-    }
-    default_whisper_model: str = "PhoWhisper Large"
     # Diarization model
     diarization_model: str = "pyannote/speaker-diarization-community-1"

     enable_vocal_separation: bool = True
     mdx_model: str = "Kim_Vocal_2.onnx"  # High quality vocal isolation
+    whisper_lora_model_dir: str = "vyluong/pho-whisper-vi-lora-v5"
     # Diarization model
     diarization_model: str = "pyannote/speaker-diarization-community-1"

app/main.py CHANGED Viewed

@@ -35,7 +35,7 @@ async def lifespan(app: FastAPI):
     """
     logger.info("Starting PrecisionVoice application...")
     logger.info(f"Device: {settings.resolved_device}")
-    logger.info(f"Default Whisper model: {settings.default_whisper_model}")
     logger.info(f"Diarization model: {settings.diarization_model}")
     # Preload default Whisper model

     """
     logger.info("Starting PrecisionVoice application...")
     logger.info(f"Device: {settings.resolved_device}")
+    logger.info(f"Default Whisper model: {settings.whisper_lora_model_dir}")
     logger.info(f"Diarization model: {settings.diarization_model}")
     # Preload default Whisper model

app/services/transcription.py CHANGED Viewed

@@ -3,11 +3,14 @@ Transcription service using faster-whisper.
 Supports multiple Vietnamese Whisper models with caching.
 """
 import logging
 from typing import Dict, Optional, List
 from dataclasses import dataclass
 import numpy as np
-from faster_whisper import WhisperModel
 from app.core.config import get_settings
@@ -17,8 +20,9 @@ settings = get_settings()
 # Available Whisper models for Vietnamese
 AVAILABLE_MODELS = {
-    "EraX-WoW-Turbo": "erax-ai/EraX-WoW-Turbo-V1.1-CT2",
-    "PhoWhisper Large": "kiendt/PhoWhisper-large-ct2"
 }
@@ -36,138 +40,88 @@ class TranscriptionService:
     Supports multiple models with caching.
     """
-    _models: Dict[str, WhisperModel] = {}
     @classmethod
-    def get_model(cls, model_name: str = None) -> WhisperModel:
-        """
-        Get or load a Whisper model (lazy loading with caching).
-        Args:
-            model_name: Name of the model from AVAILABLE_MODELS
-        Returns:
-            Loaded WhisperModel instance
-        """
-        if model_name is None:
-            model_name = settings.default_whisper_model
-        cache_key = f"{model_name}_{settings.resolved_compute_type}"
-        if cache_key in cls._models:
-            return cls._models[cache_key]
-        # Get model path
-        if model_name in AVAILABLE_MODELS:
-            model_path = AVAILABLE_MODELS[model_name]
-        else:
-            # Fallback to first available model
-            model_name = list(AVAILABLE_MODELS.keys())[0]
-            model_path = AVAILABLE_MODELS[model_name]
-        logger.info(f"Loading Whisper model: {model_name} ({model_path})")
-        logger.debug(f"Device: {settings.resolved_device}, Compute type: {settings.resolved_compute_type}")
-        model = WhisperModel(
-            model_path,
-            device=settings.resolved_device,
-            compute_type=settings.resolved_compute_type,
-        )
-        cls._models[cache_key] = model
-        logger.info(f"Whisper model loaded: {model_name}")
-        return model
     @classmethod
-    def is_loaded(cls, model_name: str = None) -> bool:
-        if model_name is None:
-            model_name = settings.default_whisper_model
-        """Check if a model is loaded."""
-        cache_key = f"{model_name}_{settings.resolved_compute_type}"
-        return cache_key in cls._models
     @classmethod
-    def preload_model(cls, model_name: str = None) -> None:
-        """Preload a model during startup."""
-        if model_name is None:
-            model_name = settings.default_whisper_model
-        try:
-            cls.get_model(model_name)
-        except Exception as e:
-            logger.error(f"Failed to preload Whisper model: {e}")
-            raise
     @classmethod
     def transcribe_with_words(
         cls,
         audio_array: np.ndarray,
-        model_name: str = None,
         language: str = "vi",
-        vad_options: Optional[dict] = None,
         beam_size: int = 5,
-        temperature: float = 0.2,
-        best_of: int = 5,
-        initial_prompt: Optional[str] = None,
     ) -> Dict:
-        """
-        Transcribe audio and return word-level timestamps.
-        """
-        model = cls.get_model(model_name)
-        vad_filter = vad_options if vad_options else False
-        prompt = initial_prompt.strip() if initial_prompt and initial_prompt.strip() else None
-        segments_gen, info = model.transcribe(
             audio_array,
-            language=language if language != "auto" else None,
-            beam_size=beam_size,
-            temperature=temperature,
-            best_of=best_of,
-            # QA / Stability
-            condition_on_previous_text=False,
-            no_speech_threshold=0.6,
-            # hallucination
-            compression_ratio_threshold=2.4,
-            log_prob_threshold=-1.0,
-            word_timestamps=True,
-            # VAD
-            vad_filter=vad_filter,
-            vad_parameters=dict(
-                threshold=settings.vad_threshold,
-                min_speech_duration_ms=settings.vad_min_speech_duration_ms,
-                min_silence_duration_ms=settings.vad_min_silence_duration_ms,
-            ),
-            initial_prompt=prompt,
-        )
-        words = []
-        full_text = []
-        for seg in segments_gen:
-            if seg.text:
-                full_text.append(seg.text.strip())
-            if hasattr(seg, "words") and seg.words:
-                for w in seg.words:
-                    if not w.word.strip():
-                        continue
-                    words.append({
-                        "word": w.word.strip(),
-                        "start": float(w.start),
-                        "end": float(w.end),
-                    })
         return {
-            "text": " ".join(full_text).strip(),
-            "words": words,
-            "info": info,
         }
@@ -175,35 +129,15 @@ class TranscriptionService:
     async def transcribe_with_words_async(
         cls,
         audio_array: np.ndarray,
-        model_name: str = None,
-        language: str = "vi",
-        vad_options: Optional[dict] = None,
-        beam_size: int = 5,
-        temperature: float = 0.0,
-        best_of: int = 5,
-        initial_prompt: Optional[str] = None,
-    ) -> str:
-        """
-        Async wrapper for transcription (runs in thread pool).
-        """
         import asyncio
         loop = asyncio.get_event_loop()
         return await loop.run_in_executor(
             None,
-            lambda: cls.transcribe_with_words(
-                audio_array,
-                model_name=model_name,
-                language=language,
-                vad_options=vad_options,
-                beam_size=beam_size,
-                temperature=temperature,
-                best_of=best_of,
-                initial_prompt=initial_prompt
-            )
         )
     @classmethod
     def get_available_models(cls) -> Dict[str, str]:
-        """Return list of available models."""
         return AVAILABLE_MODELS.copy()

 Supports multiple Vietnamese Whisper models with caching.
 """
 import logging
+import torch
 from typing import Dict, Optional, List
 from dataclasses import dataclass
 import numpy as np
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+from peft import PeftModel
 from app.core.config import get_settings
 # Available Whisper models for Vietnamese
 AVAILABLE_MODELS = {
+    "Whisper-LoRA": settings.whisper_lora_model_dir
 }
     Supports multiple models with caching.
     """
+    _model = None
+    _processor = None
+    _device = "cuda" if torch.cuda.is_available() else "cpu"
     @classmethod
+    def get_model(cls):
+        if cls._model is not None:
+            return cls._model, cls._processor
+        model_dir = AVAILABLE_MODELS["Whisper-LoRA"]
+        logger.info(f"Loading Whisper + LoRA from {model_dir}")
+        logger.info(f"Device: {cls._device}")
+        base_model = WhisperForConditionalGeneration.from_pretrained(model_dir)
+        model = PeftModel.from_pretrained(base_model, model_dir)
+        model.to(cls._device)
+        model.eval()
+        processor = WhisperProcessor.from_pretrained(model_dir)
+        cls._model = model
+        cls._processor = processor
+        logger.info("Whisper + LoRA loaded successfully")
+        return model, processor
     @classmethod
+    def is_loaded(cls) -> bool:
+        return cls._model is not None
     @classmethod
+    def preload_model(cls) -> None:
+        cls.get_model()
     @classmethod
     def transcribe_with_words(
         cls,
         audio_array: np.ndarray,
         language: str = "vi",
         beam_size: int = 5,
+        temperature: float = 0.0,
     ) -> Dict:
+        model, processor = cls.get_model()
+        if audio_array.ndim > 1:
+            audio_array = np.mean(audio_array, axis=0)
+        inputs = processor(
             audio_array,
+            sampling_rate=16000,
+            return_tensors="pt"
+        ).input_features.to(cls._device)
+        forced_decoder_ids = processor.get_decoder_prompt_ids(
+            language=language,
+            task="transcribe"
+        )
+        with torch.no_grad():
+            generated_ids = model.generate(
+                inputs,
+                forced_decoder_ids=forced_decoder_ids,
+                num_beams=beam_size,
+                temperature=temperature,
+                max_new_tokens=settings.whisper_max_new_tokens,
+            )
+        text = processor.batch_decode(
+            generated_ids,
+            skip_special_tokens=True
+        )[0].strip()
         return {
+            "text": text,
+            "words": [],
+            "info": {
+                "engine": "transformers-whisper-lora",
+                "language": language,
+                "beam_size": beam_size,
+            },
         }
     async def transcribe_with_words_async(
         cls,
         audio_array: np.ndarray,
+        **kwargs
+    ) -> Dict:
         import asyncio
         loop = asyncio.get_event_loop()
         return await loop.run_in_executor(
             None,
+            lambda: cls.transcribe_with_words(audio_array, **kwargs)
         )
     @classmethod
     def get_available_models(cls) -> Dict[str, str]:
         return AVAILABLE_MODELS.copy()