vyluong committed on
Commit
18541e0
·
verified ·
1 Parent(s): 1763dd8

Update app/services/transcription.py

Browse files
Files changed (1) hide show
  1. app/services/transcription.py +16 -12
app/services/transcription.py CHANGED
@@ -211,20 +211,22 @@ class TranscriptionService:
211
  model_name: str = None,
212
  language: str = "vi",
213
  vad_options: Optional[dict | bool] = None,
214
- beam_size: int = 3,
215
  temperature: float = 0.0,
216
- best_of: int = 5,
217
  patience: float = 1.0,
218
  length_penalty: float = 1.0,
219
  no_repeat_ngram_size: int = 3,
220
  # Prompting
221
- initial_prompt: str = "Hội thoại tổng đài. Chỉ ghi lại đúng lời nói trong audio.",
222
  prefix_text: Optional[str] = None,
223
  # Stability / filtering
224
- condition_on_previous_text: bool = False,
225
  no_speech_threshold: float = 0.70,
226
- log_prob_threshold: float = -1.0,
227
- compression_ratio_threshold: float = 2.4,
 
 
228
  ) -> Dict:
229
  """
230
  Transcribe audio and return word-level timestamps.
@@ -319,19 +321,21 @@ class TranscriptionService:
319
  model_name: str = None,
320
  language: str = "vi",
321
  vad_options: Optional[dict | bool] = None,
322
- beam_size: int = 5,
323
  temperature: float = 0.0,
324
- best_of: int = 5,
325
  patience: float = 1.0,
326
  length_penalty: float = 1.0,
327
  no_repeat_ngram_size: int = 3,
328
  initial_prompt: Optional[str] = None,
329
  prefix_text: Optional[str] = None,
330
- condition_on_previous_text: bool = False,
331
- no_speech_threshold: float = 0.70,
332
- log_prob_threshold: float = -1.0,
333
  # text repetitive / nonsense
334
- compression_ratio_threshold: float = 2.4,
 
 
335
  ) -> Dict:
336
  """
337
  Async wrapper for transcription (runs in thread pool).
 
211
  model_name: str = None,
212
  language: str = "vi",
213
  vad_options: Optional[dict | bool] = None,
214
+ beam_size: int = 1,
215
  temperature: float = 0.0,
216
+ best_of: int = 1,
217
  patience: float = 1.0,
218
  length_penalty: float = 1.0,
219
  no_repeat_ngram_size: int = 3,
220
  # Prompting
221
+ initial_prompt: str = None,
222
  prefix_text: Optional[str] = None,
223
  # Stability / filtering
224
+ condition_on_previous_text: bool = True,
225
  no_speech_threshold: float = 0.70,
226
+ log_prob_threshold: float = -0.5,
227
+ compression_ratio_threshold: float = 1.8,
228
+ word_timestamps=True
229
+
230
  ) -> Dict:
231
  """
232
  Transcribe audio and return word-level timestamps.
 
321
  model_name: str = None,
322
  language: str = "vi",
323
  vad_options: Optional[dict | bool] = None,
324
+ beam_size: int = 1,
325
  temperature: float = 0.0,
326
+ best_of: int = 1,
327
  patience: float = 1.0,
328
  length_penalty: float = 1.0,
329
  no_repeat_ngram_size: int = 3,
330
  initial_prompt: Optional[str] = None,
331
  prefix_text: Optional[str] = None,
332
+ condition_on_previous_text: bool = True,
333
+ no_speech_threshold: float = 0.60,
334
+ log_prob_threshold: float = -0.5,
335
  # text repetitive / nonsense
336
+ compression_ratio_threshold: float = 1.8,
337
+ word_timestamps=True
338
+
339
  ) -> Dict:
340
  """
341
  Async wrapper for transcription (runs in thread pool).