Spaces:

vyluong
/

PoC_ASR_v5

Sleeping

App Files Community

colab-user committed on Feb 5

Commit

9f2d61a

1 Parent(s): 70edd34

fix transcription & post-processing

Browse files

Files changed (1) hide show

app/services/transcription.py +20 -20

app/services/transcription.py CHANGED Viewed

@@ -168,12 +168,12 @@ class TranscriptionService:
         model_name: str = None,
         language: str = "vi",
         vad_options: Optional[dict] = None,
-        beam_size: int = 3,
         temperature: float = 0.0,
-        best_of: int = 2,
-        patience: float = 0.7,
-        length_penalty: float = 0.8,
-        no_repeat_ngram_size=4,
         # Prompting
         initial_prompt: str = "Đây là hội thoại điện thoại giữa nhân viên và khách hàng.\
@@ -183,10 +183,9 @@ class TranscriptionService:
         # Stability / filtering
         condition_on_previous_text: bool = False,
-        no_speech_threshold: float = 0.6,
         log_prob_threshold: float = -0.5,
-        compression_ratio_threshold: float = 1.9,
     ) -> Dict:
         """
         Transcribe audio and return word-level timestamps.
@@ -273,18 +272,18 @@ class TranscriptionService:
         model_name: str = None,
         language: str = "vi",
         vad_options: Optional[dict] = None,
-        beam_size: int = 3,
         temperature: float = 0.0,
-        best_of: int = 2,
-        patience: float = 0.7,
-        length_penalty: float = 0.8,
-        no_repeat_ngram_size =4,
         initial_prompt: Optional[str] = None,
         prefix_text: Optional[str] = None,
         condition_on_previous_text: bool = False,
-        no_speech_threshold: float = 0.6,
         log_prob_threshold: float = -0.5,
-        compression_ratio_threshold: float = 1.8,
     ) -> Dict:
         """
         Async wrapper for transcription (runs in thread pool).
@@ -310,7 +309,8 @@ class TranscriptionService:
                 condition_on_previous_text=condition_on_previous_text,
                 no_speech_threshold=no_speech_threshold,
                 log_prob_threshold=log_prob_threshold,
-                compression_ratio_threshold=compression_ratio_threshold,
             )
         )
@@ -443,12 +443,12 @@ class TranscriptionService:
                 res = await cls.transcribe_with_words_async(
                     chunk,
                     model_name=model_name,
-                    beam_size=3 if not is_tail else 5,
-                    best_of=2,
                     temperature=0,
-                    patience=0.7,
                     condition_on_previous_text=False,
-                    no_speech_threshold=0.6,
                     compression_ratio_threshold=1.8,
                 )

         model_name: str = None,
         language: str = "vi",
         vad_options: Optional[dict] = None,
+        beam_size: int = 8,
         temperature: float = 0.0,
+        best_of: int = 5,
+        patience: float = 1.2,
+        length_penalty: float = 1.0,
+        no_repeat_ngram_size: int = 3,
         # Prompting
         initial_prompt: str = "Đây là hội thoại điện thoại giữa nhân viên và khách hàng.\
         # Stability / filtering
         condition_on_previous_text: bool = False,
+        no_speech_threshold: float = 0.3,
         log_prob_threshold: float = -0.5,
+        compression_ratio_threshold: float = 1.8
     ) -> Dict:
         """
         Transcribe audio and return word-level timestamps.
         model_name: str = None,
         language: str = "vi",
         vad_options: Optional[dict] = None,
+        beam_size: int = 8,
         temperature: float = 0.0,
+        best_of: int = 5,
+        patience: float = 1.2,
+        length_penalty: float = 1.0,
+        no_repeat_ngram_size: int = 3,
         initial_prompt: Optional[str] = None,
         prefix_text: Optional[str] = None,
         condition_on_previous_text: bool = False,
+        no_speech_threshold: float = 0.3,
         log_prob_threshold: float = -0.5,
+        compression_ratio_threshold: float = 1.8
     ) -> Dict:
         """
         Async wrapper for transcription (runs in thread pool).
                 condition_on_previous_text=condition_on_previous_text,
                 no_speech_threshold=no_speech_threshold,
                 log_prob_threshold=log_prob_threshold,
+                compression_ratio_threshold=compression_ratio_threshold
             )
         )
                 res = await cls.transcribe_with_words_async(
                     chunk,
                     model_name=model_name,
+                    beam_size=8 if not is_tail else 5,
+                    best_of=5,
                     temperature=0,
+                    patience=1.2,
                     condition_on_previous_text=False,
+                    no_speech_threshold=0.3,
                     compression_ratio_threshold=1.8,
                 )