Spaces:

vyluong
/

PoC_PrecisionVoice_test

Sleeping

App Files Community

colab-user committed on Feb 9

Commit

d8c95b8

1 Parent(s): 832e106

test model & pipeline

Browse files

Files changed (4) hide show

app/core/config.py +10 -8
app/services/orchestrator.py +1 -1
app/services/transcription.py +158 -72
requirements.txt +18 -3

app/core/config.py CHANGED Viewed

@@ -32,13 +32,13 @@ class Settings(BaseSettings):
     # Model settings
     whisper_model: str = "vyluong/pho-whisper-vi-ct2"
-    diarization_model: str = "pyannote/speaker-diarization-3.1"
     # Device settings
     device: Literal["cuda", "cpu", "auto"] = "auto"
     compute_type: str = "float16"  # float16 for GPU, int8 for CPU
-    # Upload settings
     max_upload_size_mb: int = 100
     allowed_extensions: list[str] = ["mp3", "wav", "m4a", "ogg", "flac", "webm"]
@@ -50,14 +50,16 @@ class Settings(BaseSettings):
     noise_reduction_level: float = 12.0  # Used by anlmdn
     enable_loudnorm: bool = True
-    # VAD parameters
-    vad_threshold: float = 0.5
-    vad_min_speech_duration_ms: int = 250
-    vad_min_silence_duration_ms: int = 500
     # Post-processing
-    merge_threshold_s: float = 0.5  # Merge segments from same speaker if gap < this
-    min_segment_duration_s: float = 0.3  # Remove segments shorter than this
     # Server settings
     host: str = "0.0.0.0"

     # Model settings
     whisper_model: str = "vyluong/pho-whisper-vi-ct2"
+    diarization_model: str = "pyannote/speaker-diarization-community-1"
     # Device settings
     device: Literal["cuda", "cpu", "auto"] = "auto"
     compute_type: str = "float16"  # float16 for GPU, int8 for CPU
+    # Upload settings
     max_upload_size_mb: int = 100
     allowed_extensions: list[str] = ["mp3", "wav", "m4a", "ogg", "flac", "webm"]
     noise_reduction_level: float = 12.0  # Used by anlmdn
     enable_loudnorm: bool = True
+      # VAD parameters
+    vad_threshold: float = 0.55
+    vad_min_speech_duration_ms: int = 200
+    vad_min_silence_duration_ms: int = 450
+    vad_speech_pad_ms: int = 250
     # Post-processing
+    merge_threshold_s: float = 0.35  # Merge segments from same speaker if gap < this
+    min_segment_duration_s: float = 0.85  # Remove segments shorter than this
     # Server settings
     host: str = "0.0.0.0"

app/services/orchestrator.py CHANGED Viewed

@@ -43,7 +43,7 @@ class PipelineOrchestrator:
         # Step 2: AI Processing (Transcription & Diarization)
         logger.info(f"[Step 2/4] Starting AI models (Whisper + Pyannote) for: {wav_path.name}")
-        transcription_task = TranscriptionService.transcribe_async(wav_path)
         diarization_task = DiarizationService.diarize_async(wav_path)
         try:

         # Step 2: AI Processing (Transcription & Diarization)
         logger.info(f"[Step 2/4] Starting AI models (Whisper + Pyannote) for: {wav_path.name}")
+        transcription_task = TranscriptionService.transcribe_with_words_async(wav_path)
         diarization_task = DiarizationService.diarize_async(wav_path)
         try:

app/services/transcription.py CHANGED Viewed

@@ -5,8 +5,9 @@ Returns word-level timestamps for precision alignment.
 """
 import logging
 from pathlib import Path
-from typing import List, Optional
 from dataclasses import dataclass
 from faster_whisper import WhisperModel
@@ -77,92 +78,177 @@ class TranscriptionService:
         return cls._model is not None
     @classmethod
-    def transcribe(
         cls,
-        audio_path: Path,
         language: str = "vi",
-        initial_prompt: Optional[str] = None
-    ) -> List[WordTimestamp]:
-        """
-        Transcribe audio file with word-level timestamps.
-        Args:
-            audio_path: Path to WAV audio file
-            language: Language code (default: Vietnamese)
-            initial_prompt: Optional prompt for context
-        Returns:
-            List of WordTimestamp with precise timing for each word
         """
-        model = cls.get_model()
-        logger.debug(f"Transcribing: {audio_path}")
-        # Run transcription with word timestamps - CRITICAL for precision alignment
-        segments_generator, info = model.transcribe(
-            str(audio_path),
-            language=language,
-            initial_prompt=initial_prompt,
-            word_timestamps=True,  # CRITICAL: Enable word-level timestamps
-            vad_filter=True,  # Re-enabled for optimization
-            vad_parameters=dict(
-                threshold=settings.vad_threshold,
-                min_speech_duration_ms=settings.vad_min_speech_duration_ms,
-                min_silence_duration_ms=settings.vad_min_silence_duration_ms,
-            ),
-            beam_size=5,
-            best_of=5,
         )
-        # Extract all words with timestamps
-        all_words = []
-        segment_count = 0
-        for segment in segments_generator:
-            segment_count += 1
-            if segment.words:
-                for word in segment.words:
-                    all_words.append(WordTimestamp(
-                        word=word.word.strip(),
-                        start=word.start,
-                        end=word.end
-                    ))
-        logger.info(f"Transcription complete: {segment_count} segments, {len(all_words)} words, detected language: {info.language}")
-        return all_words
     @classmethod
-    async def transcribe_async(
         cls,
-        audio_path: Path,
         language: str = "vi",
-        initial_prompt: Optional[str] = None
-    ) -> List[WordTimestamp]:
         """
         Async wrapper for transcription (runs in thread pool).
-        Args:
-            audio_path: Path to WAV audio file
-            language: Language code
-            initial_prompt: Optional prompt
-        Returns:
-            List of WordTimestamp
         """
         import asyncio
-        loop = asyncio.get_event_loop()
         return await loop.run_in_executor(
             None,
-            lambda: cls.transcribe(audio_path, language, initial_prompt)
         )
-    @classmethod
-    def preload_model(cls) -> None:
-        """Preload the model during startup."""
-        try:
-            cls.get_model()
-        except Exception as e:
-            logger.error(f"Failed to preload Whisper model: {e}")
-            raise

 """
 import logging
 from pathlib import Path
+from typing import List, Optional, Dict
 from dataclasses import dataclass
+import numpy as np
 from faster_whisper import WhisperModel
         return cls._model is not None
     @classmethod
+    def preload_model(cls) -> None:
+        """Preload the model during startup."""
+        try:
+            cls.get_model()
+        except Exception as e:
+            logger.error(f"Failed to preload Whisper model: {e}")
+            raise
+    @classmethod
+    def transcribe_with_words(
         cls,
+        audio_array: np.ndarray,
+        model_name: str = None,
         language: str = "vi",
+        vad_options: Optional[dict | bool] = None,
+        beam_size: int = 3,
+        temperature: float = 0.0,
+        best_of: int = 5,
+        patience: float = 1.0,
+        length_penalty: float = 1.0,
+        no_repeat_ngram_size: int = 3,
+        # Prompting
+        initial_prompt: str = "Hội thoại tổng đài. Chỉ ghi lại đúng lời nói trong audio.",
+        prefix_text: Optional[str] = None,
+        # Stability / filtering
+        condition_on_previous_text: bool = False,
+        no_speech_threshold: float = 0.70,
+        log_prob_threshold: float = -1.0,
+        compression_ratio_threshold: float = 2.4
+    ) -> Dict:
+        """
+        Transcribe audio and return word-level timestamps.
         """
+        model = cls.get_model(model_name)
+        if vad_options is None or vad_options is False:
+            use_vad = False
+            vad_parameters = None
+        elif vad_options is True:
+            use_vad = True
+            vad_parameters = {
+                "threshold": settings.vad_threshold,
+                "min_speech_duration_ms": settings.vad_min_speech_duration_ms,
+                "min_silence_duration_ms": settings.vad_min_silence_duration_ms,
+            }
+        elif isinstance(vad_options, dict):
+            use_vad = True
+            vad_parameters = vad_options
+        else:
+            use_vad = False
+            vad_parameters = None
+        prompt = (
+            initial_prompt.strip()
+            if isinstance(initial_prompt, str) and initial_prompt.strip()
+            else None
         )
+        prefix = (
+            prefix_text.strip()
+            if isinstance(prefix_text, str) and prefix_text.strip()
+            else None
+        )
+        segments_gen, info = model.transcribe(
+            audio_array,
+            language=language if language != "auto" else None,
+            # decoding
+            beam_size=beam_size,
+            temperature=temperature,
+            best_of=best_of,
+            patience=patience,
+            length_penalty=length_penalty,
+            no_repeat_ngram_size=no_repeat_ngram_size,
+            # prompting
+            prefix=prefix,
+            # QA / Stability
+            condition_on_previous_text=condition_on_previous_text,
+            no_speech_threshold=no_speech_threshold,
+            log_prob_threshold=log_prob_threshold,
+            compression_ratio_threshold=compression_ratio_threshold,
+            word_timestamps=True,
+            # VAD
+            vad_filter=use_vad,
+            vad_parameters=vad_parameters,
+            initial_prompt=prompt,
+        )
+        words = []
+        full_text = []
+        for seg in segments_gen:
+            if seg.text:
+                full_text.append(seg.text.strip())
+            if hasattr(seg, "words") and seg.words:
+                for w in seg.words:
+                    if not w.word.strip():
+                        continue
+                    words.append({
+                        "word": w.word.strip(),
+                        "start": float(w.start),
+                        "end": float(w.end),
+                    })
+        return {
+            "text": " ".join(full_text).strip(),
+            "words": words,
+            "info": info,
+        }
     @classmethod
+    async def transcribe_with_words_async(
         cls,
+        audio_array: np.ndarray,
+        model_name: str = None,
         language: str = "vi",
+        vad_options: Optional[dict | bool] = None,
+        beam_size: int = 5,
+        temperature: float = 0.0,
+        best_of: int = 5,
+        patience: float = 1.0,
+        length_penalty: float = 1.0,
+        no_repeat_ngram_size: int = 3,
+        initial_prompt: Optional[str] = None,
+        prefix_text: Optional[str] = None,
+        condition_on_previous_text: bool = False,
+        no_speech_threshold: float = 0.70,
+        log_prob_threshold: float = -1.0,
+        # text repetitive / nonsense
+        compression_ratio_threshold: float = 2.4
+    ) -> Dict:
         """
         Async wrapper for transcription (runs in thread pool).
         """
         import asyncio
+        loop = asyncio.get_running_loop()
         return await loop.run_in_executor(
             None,
+            lambda: cls.transcribe_with_words(
+                audio_array=audio_array,
+                model_name=model_name,
+                language=language,
+                vad_options=vad_options,
+                beam_size=beam_size,
+                temperature=temperature,
+                best_of=best_of,
+                patience=patience,
+                length_penalty=length_penalty,
+                no_repeat_ngram_size=no_repeat_ngram_size,
+                initial_prompt=initial_prompt,
+                prefix_text=prefix_text,
+                condition_on_previous_text=condition_on_previous_text,
+                no_speech_threshold=no_speech_threshold,
+                log_prob_threshold=log_prob_threshold,
+                compression_ratio_threshold=compression_ratio_threshold
+            )
         )

requirements.txt CHANGED Viewed

@@ -9,16 +9,29 @@ aiofiles>=23.2.1
 faster-whisper>=1.0.0
 ctranslate2>=4.0.0
-# AI/ML - Speaker Diarization
-pyannote.audio>=3.1.0
 torch>=2.1.0
 torchaudio>=2.1.0
 # AI/ML - Vocal Separation
 audio-separator[cpu]>=0.17.0
 denoiser>=0.1.4
 # Audio processing
 ffmpeg-python>=0.2.0
 pydub>=0.25.1
@@ -27,5 +40,7 @@ pydantic-settings>=2.1.0
 python-dotenv>=1.0.0
 # Utilities
-aiohttp>=3.9.0
 numpy>=1.24.0

 faster-whisper>=1.0.0
 ctranslate2>=4.0.0
+# AI/ML - Speaker Diarization (from notebook cell #2)
+pyannote.audio>=3.3.1
 torch>=2.1.0
 torchaudio>=2.1.0
+torchvision
+lightning
+torchmetrics
+# Transformers Whisper + LoRA
+transformers>=4.39.0,<5
+accelerate>=0.26.0
+peft>=0.8.0
+huggingface-hub>=0.20.0
+safetensors>=0.4.0
 # AI/ML - Vocal Separation
 audio-separator[cpu]>=0.17.0
 denoiser>=0.1.4
 # Audio processing
+librosa>=0.10.0
 ffmpeg-python>=0.2.0
 pydub>=0.25.1
 python-dotenv>=1.0.0
 # Utilities
 numpy>=1.24.0