Spaces:
Sleeping
Sleeping
colab-user committed on
Commit ·
9f2d61a
1
Parent(s): 70edd34
fix transcription & post-processing
Browse files - app/services/transcription.py +20 -20
app/services/transcription.py
CHANGED
|
@@ -168,12 +168,12 @@ class TranscriptionService:
|
|
| 168 |
model_name: str = None,
|
| 169 |
language: str = "vi",
|
| 170 |
vad_options: Optional[dict] = None,
|
| 171 |
-
beam_size: int =
|
| 172 |
temperature: float = 0.0,
|
| 173 |
-
best_of: int =
|
| 174 |
-
patience: float =
|
| 175 |
-
length_penalty: float =
|
| 176 |
-
no_repeat_ngram_size=
|
| 177 |
|
| 178 |
# Prompting
|
| 179 |
initial_prompt: str = "Đây là hội thoại điện thoại giữa nhân viên và khách hàng.\
|
|
@@ -183,10 +183,9 @@ class TranscriptionService:
|
|
| 183 |
|
| 184 |
# Stability / filtering
|
| 185 |
condition_on_previous_text: bool = False,
|
| 186 |
-
no_speech_threshold: float = 0.
|
| 187 |
log_prob_threshold: float = -0.5,
|
| 188 |
-
compression_ratio_threshold: float = 1.
|
| 189 |
-
|
| 190 |
) -> Dict:
|
| 191 |
"""
|
| 192 |
Transcribe audio and return word-level timestamps.
|
|
@@ -273,18 +272,18 @@ class TranscriptionService:
|
|
| 273 |
model_name: str = None,
|
| 274 |
language: str = "vi",
|
| 275 |
vad_options: Optional[dict] = None,
|
| 276 |
-
beam_size: int =
|
| 277 |
temperature: float = 0.0,
|
| 278 |
-
best_of: int =
|
| 279 |
-
patience: float =
|
| 280 |
-
length_penalty: float =
|
| 281 |
-
no_repeat_ngram_size =
|
| 282 |
initial_prompt: Optional[str] = None,
|
| 283 |
prefix_text: Optional[str] = None,
|
| 284 |
condition_on_previous_text: bool = False,
|
| 285 |
-
no_speech_threshold: float = 0.
|
| 286 |
log_prob_threshold: float = -0.5,
|
| 287 |
-
compression_ratio_threshold: float = 1.8
|
| 288 |
) -> Dict:
|
| 289 |
"""
|
| 290 |
Async wrapper for transcription (runs in thread pool).
|
|
@@ -310,7 +309,8 @@ class TranscriptionService:
|
|
| 310 |
condition_on_previous_text=condition_on_previous_text,
|
| 311 |
no_speech_threshold=no_speech_threshold,
|
| 312 |
log_prob_threshold=log_prob_threshold,
|
| 313 |
-
compression_ratio_threshold=compression_ratio_threshold
|
|
|
|
| 314 |
)
|
| 315 |
)
|
| 316 |
|
|
@@ -443,12 +443,12 @@ class TranscriptionService:
|
|
| 443 |
res = await cls.transcribe_with_words_async(
|
| 444 |
chunk,
|
| 445 |
model_name=model_name,
|
| 446 |
-
beam_size=
|
| 447 |
-
best_of=
|
| 448 |
temperature=0,
|
| 449 |
-
patience=
|
| 450 |
condition_on_previous_text=False,
|
| 451 |
-
no_speech_threshold=0.
|
| 452 |
compression_ratio_threshold=1.8,
|
| 453 |
)
|
| 454 |
|
|
|
|
| 168 |
model_name: str = None,
|
| 169 |
language: str = "vi",
|
| 170 |
vad_options: Optional[dict] = None,
|
| 171 |
+
beam_size: int = 8,
|
| 172 |
temperature: float = 0.0,
|
| 173 |
+
best_of: int = 5,
|
| 174 |
+
patience: float = 1.2,
|
| 175 |
+
length_penalty: float = 1.0,
|
| 176 |
+
no_repeat_ngram_size: int = 3,
|
| 177 |
|
| 178 |
# Prompting
|
| 179 |
initial_prompt: str = "Đây là hội thoại điện thoại giữa nhân viên và khách hàng.\
|
|
|
|
| 183 |
|
| 184 |
# Stability / filtering
|
| 185 |
condition_on_previous_text: bool = False,
|
| 186 |
+
no_speech_threshold: float = 0.3,
|
| 187 |
log_prob_threshold: float = -0.5,
|
| 188 |
+
compression_ratio_threshold: float = 1.8
|
|
|
|
| 189 |
) -> Dict:
|
| 190 |
"""
|
| 191 |
Transcribe audio and return word-level timestamps.
|
|
|
|
| 272 |
model_name: str = None,
|
| 273 |
language: str = "vi",
|
| 274 |
vad_options: Optional[dict] = None,
|
| 275 |
+
beam_size: int = 8,
|
| 276 |
temperature: float = 0.0,
|
| 277 |
+
best_of: int = 5,
|
| 278 |
+
patience: float = 1.2,
|
| 279 |
+
length_penalty: float = 1.0,
|
| 280 |
+
no_repeat_ngram_size: int = 3,
|
| 281 |
initial_prompt: Optional[str] = None,
|
| 282 |
prefix_text: Optional[str] = None,
|
| 283 |
condition_on_previous_text: bool = False,
|
| 284 |
+
no_speech_threshold: float = 0.3,
|
| 285 |
log_prob_threshold: float = -0.5,
|
| 286 |
+
compression_ratio_threshold: float = 1.8
|
| 287 |
) -> Dict:
|
| 288 |
"""
|
| 289 |
Async wrapper for transcription (runs in thread pool).
|
|
|
|
| 309 |
condition_on_previous_text=condition_on_previous_text,
|
| 310 |
no_speech_threshold=no_speech_threshold,
|
| 311 |
log_prob_threshold=log_prob_threshold,
|
| 312 |
+
compression_ratio_threshold=compression_ratio_threshold
|
| 313 |
+
|
| 314 |
)
|
| 315 |
)
|
| 316 |
|
|
|
|
| 443 |
res = await cls.transcribe_with_words_async(
|
| 444 |
chunk,
|
| 445 |
model_name=model_name,
|
| 446 |
+
beam_size=8 if not is_tail else 5,
|
| 447 |
+
best_of=5,
|
| 448 |
temperature=0,
|
| 449 |
+
patience=1.2,
|
| 450 |
condition_on_previous_text=False,
|
| 451 |
+
no_speech_threshold=0.3,
|
| 452 |
compression_ratio_threshold=1.8,
|
| 453 |
)
|
| 454 |
|