Spaces:
Running
Running
Update app/services/transcription.py
Browse files
- app/services/transcription.py (+16 -12)
app/services/transcription.py
CHANGED
|
@@ -211,20 +211,22 @@ class TranscriptionService:
|
|
| 211 |
model_name: str = None,
|
| 212 |
language: str = "vi",
|
| 213 |
vad_options: Optional[dict | bool] = None,
|
| 214 |
-
beam_size: int =
|
| 215 |
temperature: float = 0.0,
|
| 216 |
-
best_of: int =
|
| 217 |
patience: float = 1.0,
|
| 218 |
length_penalty: float = 1.0,
|
| 219 |
no_repeat_ngram_size: int = 3,
|
| 220 |
# Prompting
|
| 221 |
-
initial_prompt: str =
|
| 222 |
prefix_text: Optional[str] = None,
|
| 223 |
# Stability / filtering
|
| 224 |
-
condition_on_previous_text: bool =
|
| 225 |
no_speech_threshold: float = 0.70,
|
| 226 |
-
log_prob_threshold: float = -
|
| 227 |
-
compression_ratio_threshold: float =
|
|
|
|
|
|
|
| 228 |
) -> Dict:
|
| 229 |
"""
|
| 230 |
Transcribe audio and return word-level timestamps.
|
|
@@ -319,19 +321,21 @@ class TranscriptionService:
|
|
| 319 |
model_name: str = None,
|
| 320 |
language: str = "vi",
|
| 321 |
vad_options: Optional[dict | bool] = None,
|
| 322 |
-
beam_size: int =
|
| 323 |
temperature: float = 0.0,
|
| 324 |
-
best_of: int =
|
| 325 |
patience: float = 1.0,
|
| 326 |
length_penalty: float = 1.0,
|
| 327 |
no_repeat_ngram_size: int = 3,
|
| 328 |
initial_prompt: Optional[str] = None,
|
| 329 |
prefix_text: Optional[str] = None,
|
| 330 |
-
condition_on_previous_text: bool =
|
| 331 |
-
no_speech_threshold: float = 0.
|
| 332 |
-
log_prob_threshold: float = -
|
| 333 |
# text repetitive / nonsense
|
| 334 |
-
compression_ratio_threshold: float =
|
|
|
|
|
|
|
| 335 |
) -> Dict:
|
| 336 |
"""
|
| 337 |
Async wrapper for transcription (runs in thread pool).
|
|
|
|
| 211 |
model_name: str = None,
|
| 212 |
language: str = "vi",
|
| 213 |
vad_options: Optional[dict | bool] = None,
|
| 214 |
+
beam_size: int = 1,
|
| 215 |
temperature: float = 0.0,
|
| 216 |
+
best_of: int = 1,
|
| 217 |
patience: float = 1.0,
|
| 218 |
length_penalty: float = 1.0,
|
| 219 |
no_repeat_ngram_size: int = 3,
|
| 220 |
# Prompting
|
| 221 |
+
initial_prompt: str = None,
|
| 222 |
prefix_text: Optional[str] = None,
|
| 223 |
# Stability / filtering
|
| 224 |
+
condition_on_previous_text: bool = True,
|
| 225 |
no_speech_threshold: float = 0.70,
|
| 226 |
+
log_prob_threshold: float = -0.5,
|
| 227 |
+
compression_ratio_threshold: float = 1.8,
|
| 228 |
+
word_timestamps=True
|
| 229 |
+
|
| 230 |
) -> Dict:
|
| 231 |
"""
|
| 232 |
Transcribe audio and return word-level timestamps.
|
|
|
|
| 321 |
model_name: str = None,
|
| 322 |
language: str = "vi",
|
| 323 |
vad_options: Optional[dict | bool] = None,
|
| 324 |
+
beam_size: int = 1,
|
| 325 |
temperature: float = 0.0,
|
| 326 |
+
best_of: int = 1,
|
| 327 |
patience: float = 1.0,
|
| 328 |
length_penalty: float = 1.0,
|
| 329 |
no_repeat_ngram_size: int = 3,
|
| 330 |
initial_prompt: Optional[str] = None,
|
| 331 |
prefix_text: Optional[str] = None,
|
| 332 |
+
condition_on_previous_text: bool = True,
|
| 333 |
+
no_speech_threshold: float = 0.60,
|
| 334 |
+
log_prob_threshold: float = -0.5,
|
| 335 |
# text repetitive / nonsense
|
| 336 |
+
compression_ratio_threshold: float = 1.8,
|
| 337 |
+
word_timestamps=True
|
| 338 |
+
|
| 339 |
) -> Dict:
|
| 340 |
"""
|
| 341 |
Async wrapper for transcription (runs in thread pool).
|