colab-user committed on
Commit
d8c95b8
·
1 Parent(s): 832e106

test model & pipeline

Browse files
app/core/config.py CHANGED
@@ -32,13 +32,13 @@ class Settings(BaseSettings):
32
 
33
  # Model settings
34
  whisper_model: str = "vyluong/pho-whisper-vi-ct2"
35
- diarization_model: str = "pyannote/speaker-diarization-3.1"
36
 
37
  # Device settings
38
  device: Literal["cuda", "cpu", "auto"] = "auto"
39
  compute_type: str = "float16" # float16 for GPU, int8 for CPU
40
 
41
- # Upload settings
42
  max_upload_size_mb: int = 100
43
  allowed_extensions: list[str] = ["mp3", "wav", "m4a", "ogg", "flac", "webm"]
44
 
@@ -50,14 +50,16 @@ class Settings(BaseSettings):
50
  noise_reduction_level: float = 12.0 # Used by anlmdn
51
  enable_loudnorm: bool = True
52
 
53
- # VAD parameters
54
- vad_threshold: float = 0.5
55
- vad_min_speech_duration_ms: int = 250
56
- vad_min_silence_duration_ms: int = 500
 
 
57
 
58
  # Post-processing
59
- merge_threshold_s: float = 0.5 # Merge segments from same speaker if gap < this
60
- min_segment_duration_s: float = 0.3 # Remove segments shorter than this
61
 
62
  # Server settings
63
  host: str = "0.0.0.0"
 
32
 
33
  # Model settings
34
  whisper_model: str = "vyluong/pho-whisper-vi-ct2"
35
+ diarization_model: str = "pyannote/speaker-diarization-community-1"
36
 
37
  # Device settings
38
  device: Literal["cuda", "cpu", "auto"] = "auto"
39
  compute_type: str = "float16" # float16 for GPU, int8 for CPU
40
 
41
+ # Upload settings
42
  max_upload_size_mb: int = 100
43
  allowed_extensions: list[str] = ["mp3", "wav", "m4a", "ogg", "flac", "webm"]
44
 
 
50
  noise_reduction_level: float = 12.0 # Used by anlmdn
51
  enable_loudnorm: bool = True
52
 
53
+ # VAD parameters
54
+ vad_threshold: float = 0.55
55
+ vad_min_speech_duration_ms: int = 200
56
+ vad_min_silence_duration_ms: int = 450
57
+ vad_speech_pad_ms: int = 250
58
+
59
 
60
  # Post-processing
61
+ merge_threshold_s: float = 0.35 # Merge segments from same speaker if gap < this
62
+ min_segment_duration_s: float = 0.85 # Remove segments shorter than this
63
 
64
  # Server settings
65
  host: str = "0.0.0.0"
app/services/orchestrator.py CHANGED
@@ -43,7 +43,7 @@ class PipelineOrchestrator:
43
  # Step 2: AI Processing (Transcription & Diarization)
44
  logger.info(f"[Step 2/4] Starting AI models (Whisper + Pyannote) for: {wav_path.name}")
45
 
46
- transcription_task = TranscriptionService.transcribe_async(wav_path)
47
  diarization_task = DiarizationService.diarize_async(wav_path)
48
 
49
  try:
 
43
  # Step 2: AI Processing (Transcription & Diarization)
44
  logger.info(f"[Step 2/4] Starting AI models (Whisper + Pyannote) for: {wav_path.name}")
45
 
46
+ transcription_task = TranscriptionService.transcribe_with_words_async(wav_path)
47
  diarization_task = DiarizationService.diarize_async(wav_path)
48
 
49
  try:
app/services/transcription.py CHANGED
@@ -5,8 +5,9 @@ Returns word-level timestamps for precision alignment.
5
  """
6
  import logging
7
  from pathlib import Path
8
- from typing import List, Optional
9
  from dataclasses import dataclass
 
10
 
11
  from faster_whisper import WhisperModel
12
 
@@ -77,92 +78,177 @@ class TranscriptionService:
77
  return cls._model is not None
78
 
79
  @classmethod
80
- def transcribe(
 
 
 
 
 
 
 
 
 
81
  cls,
82
- audio_path: Path,
 
83
  language: str = "vi",
84
- initial_prompt: Optional[str] = None
85
- ) -> List[WordTimestamp]:
86
- """
87
- Transcribe audio file with word-level timestamps.
 
 
 
 
 
 
 
 
88
 
89
- Args:
90
- audio_path: Path to WAV audio file
91
- language: Language code (default: Vietnamese)
92
- initial_prompt: Optional prompt for context
93
-
94
- Returns:
95
- List of WordTimestamp with precise timing for each word
 
 
96
  """
97
- model = cls.get_model()
98
 
99
- logger.debug(f"Transcribing: {audio_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- # Run transcription with word timestamps - CRITICAL for precision alignment
102
- segments_generator, info = model.transcribe(
103
- str(audio_path),
104
- language=language,
105
- initial_prompt=initial_prompt,
106
- word_timestamps=True, # CRITICAL: Enable word-level timestamps
107
- vad_filter=True, # Re-enabled for optimization
108
- vad_parameters=dict(
109
- threshold=settings.vad_threshold,
110
- min_speech_duration_ms=settings.vad_min_speech_duration_ms,
111
- min_silence_duration_ms=settings.vad_min_silence_duration_ms,
112
- ),
113
- beam_size=5,
114
- best_of=5,
115
  )
116
-
117
- # Extract all words with timestamps
118
- all_words = []
119
- segment_count = 0
120
-
121
- for segment in segments_generator:
122
- segment_count += 1
123
- if segment.words:
124
- for word in segment.words:
125
- all_words.append(WordTimestamp(
126
- word=word.word.strip(),
127
- start=word.start,
128
- end=word.end
129
- ))
130
-
131
- logger.info(f"Transcription complete: {segment_count} segments, {len(all_words)} words, detected language: {info.language}")
132
-
133
- return all_words
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  @classmethod
136
- async def transcribe_async(
137
  cls,
138
- audio_path: Path,
 
139
  language: str = "vi",
140
- initial_prompt: Optional[str] = None
141
- ) -> List[WordTimestamp]:
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  """
143
  Async wrapper for transcription (runs in thread pool).
144
-
145
- Args:
146
- audio_path: Path to WAV audio file
147
- language: Language code
148
- initial_prompt: Optional prompt
149
-
150
- Returns:
151
- List of WordTimestamp
152
  """
153
  import asyncio
154
-
155
- loop = asyncio.get_event_loop()
156
  return await loop.run_in_executor(
157
  None,
158
- lambda: cls.transcribe(audio_path, language, initial_prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  )
160
-
161
- @classmethod
162
- def preload_model(cls) -> None:
163
- """Preload the model during startup."""
164
- try:
165
- cls.get_model()
166
- except Exception as e:
167
- logger.error(f"Failed to preload Whisper model: {e}")
168
- raise
 
5
  """
6
  import logging
7
  from pathlib import Path
8
+ from typing import List, Optional, Dict
9
  from dataclasses import dataclass
10
+ import numpy as np
11
 
12
  from faster_whisper import WhisperModel
13
 
 
78
  return cls._model is not None
79
 
80
  @classmethod
81
+ def preload_model(cls) -> None:
82
+ """Preload the model during startup."""
83
+ try:
84
+ cls.get_model()
85
+ except Exception as e:
86
+ logger.error(f"Failed to preload Whisper model: {e}")
87
+ raise
88
+
89
+ @classmethod
90
+ def transcribe_with_words(
91
  cls,
92
+ audio_array: np.ndarray,
93
+ model_name: str = None,
94
  language: str = "vi",
95
+ vad_options: Optional[dict | bool] = None,
96
+ beam_size: int = 3,
97
+ temperature: float = 0.0,
98
+ best_of: int = 5,
99
+ patience: float = 1.0,
100
+ length_penalty: float = 1.0,
101
+ no_repeat_ngram_size: int = 3,
102
+
103
+ # Prompting
104
+ initial_prompt: str = "Hội thoại tổng đài. Chỉ ghi lại đúng lời nói trong audio.",
105
+
106
+ prefix_text: Optional[str] = None,
107
 
108
+ # Stability / filtering
109
+ condition_on_previous_text: bool = False,
110
+ no_speech_threshold: float = 0.70,
111
+ log_prob_threshold: float = -1.0,
112
+ compression_ratio_threshold: float = 2.4
113
+
114
+ ) -> Dict:
115
+ """
116
+ Transcribe audio and return word-level timestamps.
117
  """
118
+ model = cls.get_model(model_name)
119
 
120
+ if vad_options is None or vad_options is False:
121
+ use_vad = False
122
+ vad_parameters = None
123
+
124
+ elif vad_options is True:
125
+ use_vad = True
126
+ vad_parameters = {
127
+ "threshold": settings.vad_threshold,
128
+ "min_speech_duration_ms": settings.vad_min_speech_duration_ms,
129
+ "min_silence_duration_ms": settings.vad_min_silence_duration_ms,
130
+ }
131
+
132
+ elif isinstance(vad_options, dict):
133
+ use_vad = True
134
+ vad_parameters = vad_options
135
+
136
+ else:
137
+ use_vad = False
138
+ vad_parameters = None
139
+
140
 
141
+ prompt = (
142
+ initial_prompt.strip()
143
+ if isinstance(initial_prompt, str) and initial_prompt.strip()
144
+ else None
 
 
 
 
 
 
 
 
 
 
145
  )
146
+
147
+ prefix = (
148
+ prefix_text.strip()
149
+ if isinstance(prefix_text, str) and prefix_text.strip()
150
+ else None
151
+ )
152
+
153
+ segments_gen, info = model.transcribe(
154
+ audio_array,
155
+ language=language if language != "auto" else None,
156
+
157
+ # decoding
158
+ beam_size=beam_size,
159
+ temperature=temperature,
160
+ best_of=best_of,
161
+ patience=patience,
162
+ length_penalty=length_penalty,
163
+ no_repeat_ngram_size=no_repeat_ngram_size,
164
+
165
+ # prompting
166
+ prefix=prefix,
167
+
168
+ # QA / Stability
169
+ condition_on_previous_text=condition_on_previous_text,
170
+ no_speech_threshold=no_speech_threshold,
171
+ log_prob_threshold=log_prob_threshold,
172
+ compression_ratio_threshold=compression_ratio_threshold,
173
+
174
+ word_timestamps=True,
175
+
176
+ # VAD
177
+ vad_filter=use_vad,
178
+ vad_parameters=vad_parameters,
179
+ initial_prompt=prompt,
180
+ )
181
+
182
+ words = []
183
+ full_text = []
184
+
185
+ for seg in segments_gen:
186
+ if seg.text:
187
+ full_text.append(seg.text.strip())
188
+
189
+ if hasattr(seg, "words") and seg.words:
190
+ for w in seg.words:
191
+ if not w.word.strip():
192
+ continue
193
+ words.append({
194
+ "word": w.word.strip(),
195
+ "start": float(w.start),
196
+ "end": float(w.end),
197
+ })
198
+
199
+ return {
200
+ "text": " ".join(full_text).strip(),
201
+ "words": words,
202
+ "info": info,
203
+ }
204
+
205
 
206
  @classmethod
207
+ async def transcribe_with_words_async(
208
  cls,
209
+ audio_array: np.ndarray,
210
+ model_name: str = None,
211
  language: str = "vi",
212
+ vad_options: Optional[dict | bool] = None,
213
+ beam_size: int = 5,
214
+ temperature: float = 0.0,
215
+ best_of: int = 5,
216
+ patience: float = 1.0,
217
+ length_penalty: float = 1.0,
218
+ no_repeat_ngram_size: int = 3,
219
+ initial_prompt: Optional[str] = None,
220
+ prefix_text: Optional[str] = None,
221
+ condition_on_previous_text: bool = False,
222
+ no_speech_threshold: float = 0.70,
223
+ log_prob_threshold: float = -1.0,
224
+ # text repetitive / nonsense
225
+ compression_ratio_threshold: float = 2.4
226
+ ) -> Dict:
227
  """
228
  Async wrapper for transcription (runs in thread pool).
 
 
 
 
 
 
 
 
229
  """
230
  import asyncio
231
+
232
+ loop = asyncio.get_running_loop()
233
  return await loop.run_in_executor(
234
  None,
235
+ lambda: cls.transcribe_with_words(
236
+ audio_array=audio_array,
237
+ model_name=model_name,
238
+ language=language,
239
+ vad_options=vad_options,
240
+ beam_size=beam_size,
241
+ temperature=temperature,
242
+ best_of=best_of,
243
+ patience=patience,
244
+ length_penalty=length_penalty,
245
+ no_repeat_ngram_size=no_repeat_ngram_size,
246
+ initial_prompt=initial_prompt,
247
+ prefix_text=prefix_text,
248
+ condition_on_previous_text=condition_on_previous_text,
249
+ no_speech_threshold=no_speech_threshold,
250
+ log_prob_threshold=log_prob_threshold,
251
+ compression_ratio_threshold=compression_ratio_threshold
252
+
253
+ )
254
  )
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -9,16 +9,29 @@ aiofiles>=23.2.1
9
  faster-whisper>=1.0.0
10
  ctranslate2>=4.0.0
11
 
12
- # AI/ML - Speaker Diarization
13
- pyannote.audio>=3.1.0
14
  torch>=2.1.0
15
  torchaudio>=2.1.0
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # AI/ML - Vocal Separation
18
  audio-separator[cpu]>=0.17.0
19
  denoiser>=0.1.4
20
 
21
  # Audio processing
 
22
  ffmpeg-python>=0.2.0
23
  pydub>=0.25.1
24
 
@@ -27,5 +40,7 @@ pydantic-settings>=2.1.0
27
  python-dotenv>=1.0.0
28
 
29
  # Utilities
30
- aiohttp>=3.9.0
31
  numpy>=1.24.0
 
 
 
 
9
  faster-whisper>=1.0.0
10
  ctranslate2>=4.0.0
11
 
12
+ # AI/ML - Speaker Diarization (from notebook cell #2)
13
+ pyannote.audio>=3.3.1
14
  torch>=2.1.0
15
  torchaudio>=2.1.0
16
+ torchvision
17
+ lightning
18
+ torchmetrics
19
+
20
+
21
+ # Transformers Whisper + LoRA
22
+ transformers>=4.39.0,<5
23
+ accelerate>=0.26.0
24
+ peft>=0.8.0
25
+ huggingface-hub>=0.20.0
26
+ safetensors>=0.4.0
27
+
28
 
29
  # AI/ML - Vocal Separation
30
  audio-separator[cpu]>=0.17.0
31
  denoiser>=0.1.4
32
 
33
  # Audio processing
34
+ librosa>=0.10.0
35
  ffmpeg-python>=0.2.0
36
  pydub>=0.25.1
37
 
 
40
  python-dotenv>=1.0.0
41
 
42
  # Utilities
 
43
  numpy>=1.24.0
44
+
45
+
46
+