colab-user committed on
Commit
9f2d61a
·
1 Parent(s): 70edd34

fix transcription & post-processing

Browse files
Files changed (1) hide show
  1. app/services/transcription.py +20 -20
app/services/transcription.py CHANGED
@@ -168,12 +168,12 @@ class TranscriptionService:
168
  model_name: str = None,
169
  language: str = "vi",
170
  vad_options: Optional[dict] = None,
171
- beam_size: int = 3,
172
  temperature: float = 0.0,
173
- best_of: int = 2,
174
- patience: float = 0.7,
175
- length_penalty: float = 0.8,
176
- no_repeat_ngram_size=4,
177
 
178
  # Prompting
179
  initial_prompt: str = "Đây là hội thoại điện thoại giữa nhân viên và khách hàng.\
@@ -183,10 +183,9 @@ class TranscriptionService:
183
 
184
  # Stability / filtering
185
  condition_on_previous_text: bool = False,
186
- no_speech_threshold: float = 0.6,
187
  log_prob_threshold: float = -0.5,
188
- compression_ratio_threshold: float = 1.9,
189
-
190
  ) -> Dict:
191
  """
192
  Transcribe audio and return word-level timestamps.
@@ -273,18 +272,18 @@ class TranscriptionService:
273
  model_name: str = None,
274
  language: str = "vi",
275
  vad_options: Optional[dict] = None,
276
- beam_size: int = 3,
277
  temperature: float = 0.0,
278
- best_of: int = 2,
279
- patience: float = 0.7,
280
- length_penalty: float = 0.8,
281
- no_repeat_ngram_size =4,
282
  initial_prompt: Optional[str] = None,
283
  prefix_text: Optional[str] = None,
284
  condition_on_previous_text: bool = False,
285
- no_speech_threshold: float = 0.6,
286
  log_prob_threshold: float = -0.5,
287
- compression_ratio_threshold: float = 1.8,
288
  ) -> Dict:
289
  """
290
  Async wrapper for transcription (runs in thread pool).
@@ -310,7 +309,8 @@ class TranscriptionService:
310
  condition_on_previous_text=condition_on_previous_text,
311
  no_speech_threshold=no_speech_threshold,
312
  log_prob_threshold=log_prob_threshold,
313
- compression_ratio_threshold=compression_ratio_threshold,
 
314
  )
315
  )
316
 
@@ -443,12 +443,12 @@ class TranscriptionService:
443
  res = await cls.transcribe_with_words_async(
444
  chunk,
445
  model_name=model_name,
446
- beam_size=3 if not is_tail else 5,
447
- best_of=2,
448
  temperature=0,
449
- patience=0.7,
450
  condition_on_previous_text=False,
451
- no_speech_threshold=0.6,
452
  compression_ratio_threshold=1.8,
453
  )
454
 
 
168
  model_name: str = None,
169
  language: str = "vi",
170
  vad_options: Optional[dict] = None,
171
+ beam_size: int = 8,
172
  temperature: float = 0.0,
173
+ best_of: int = 5,
174
+ patience: float = 1.2,
175
+ length_penalty: float = 1.0,
176
+ no_repeat_ngram_size: int = 3,
177
 
178
  # Prompting
179
  initial_prompt: str = "Đây là hội thoại điện thoại giữa nhân viên và khách hàng.\
 
183
 
184
  # Stability / filtering
185
  condition_on_previous_text: bool = False,
186
+ no_speech_threshold: float = 0.3,
187
  log_prob_threshold: float = -0.5,
188
+ compression_ratio_threshold: float = 1.8
 
189
  ) -> Dict:
190
  """
191
  Transcribe audio and return word-level timestamps.
 
272
  model_name: str = None,
273
  language: str = "vi",
274
  vad_options: Optional[dict] = None,
275
+ beam_size: int = 8,
276
  temperature: float = 0.0,
277
+ best_of: int = 5,
278
+ patience: float = 1.2,
279
+ length_penalty: float = 1.0,
280
+ no_repeat_ngram_size: int = 3,
281
  initial_prompt: Optional[str] = None,
282
  prefix_text: Optional[str] = None,
283
  condition_on_previous_text: bool = False,
284
+ no_speech_threshold: float = 0.3,
285
  log_prob_threshold: float = -0.5,
286
+ compression_ratio_threshold: float = 1.8
287
  ) -> Dict:
288
  """
289
  Async wrapper for transcription (runs in thread pool).
 
309
  condition_on_previous_text=condition_on_previous_text,
310
  no_speech_threshold=no_speech_threshold,
311
  log_prob_threshold=log_prob_threshold,
312
+ compression_ratio_threshold=compression_ratio_threshold
313
+
314
  )
315
  )
316
 
 
443
  res = await cls.transcribe_with_words_async(
444
  chunk,
445
  model_name=model_name,
446
+ beam_size=8 if not is_tail else 5,
447
+ best_of=5,
448
  temperature=0,
449
+ patience=1.2,
450
  condition_on_previous_text=False,
451
+ no_speech_threshold=0.3,
452
  compression_ratio_threshold=1.8,
453
  )
454