Spaces:

vyluong
/

PoC_ASR_v6_dev

Running

App Files Community

vyluong committed on Apr 2

Commit

18541e0

verified ·

1 Parent(s): 1763dd8

Update app/services/transcription.py

Browse files

Files changed (1) hide show

app/services/transcription.py +16 -12

app/services/transcription.py CHANGED Viewed

@@ -211,20 +211,22 @@ class TranscriptionService:
         model_name: str = None,
         language: str = "vi",
         vad_options: Optional[dict | bool] = None,
-        beam_size: int = 3,
         temperature: float = 0.0,
-        best_of: int = 5,
         patience: float = 1.0,
         length_penalty: float = 1.0,
         no_repeat_ngram_size: int = 3,
         # Prompting
-        initial_prompt: str = "Hội thoại tổng đài. Chỉ ghi lại đúng lời nói trong audio.",
         prefix_text: Optional[str] = None,
         # Stability / filtering
-        condition_on_previous_text: bool = False,
         no_speech_threshold: float = 0.70,
-        log_prob_threshold: float = -1.0,
-        compression_ratio_threshold: float = 2.4,
     ) -> Dict:
         """
         Transcribe audio and return word-level timestamps.
@@ -319,19 +321,21 @@ class TranscriptionService:
         model_name: str = None,
         language: str = "vi",
         vad_options: Optional[dict | bool] = None,
-        beam_size: int = 5,
         temperature: float = 0.0,
-        best_of: int = 5,
         patience: float = 1.0,
         length_penalty: float = 1.0,
         no_repeat_ngram_size: int = 3,
         initial_prompt: Optional[str] = None,
         prefix_text: Optional[str] = None,
-        condition_on_previous_text: bool = False,
-        no_speech_threshold: float = 0.70,
-        log_prob_threshold: float = -1.0,
         # text repetitive / nonsense
-        compression_ratio_threshold: float = 2.4,
     ) -> Dict:
         """
         Async wrapper for transcription (runs in thread pool).

         model_name: str = None,
         language: str = "vi",
         vad_options: Optional[dict | bool] = None,
+        beam_size: int = 1,
         temperature: float = 0.0,
+        best_of: int = 1,
         patience: float = 1.0,
         length_penalty: float = 1.0,
         no_repeat_ngram_size: int = 3,
         # Prompting
+        initial_prompt: str = None,
         prefix_text: Optional[str] = None,
         # Stability / filtering
+        condition_on_previous_text: bool = True,
         no_speech_threshold: float = 0.70,
+        log_prob_threshold: float = -0.5,
+        compression_ratio_threshold: float = 1.8,
+        word_timestamps=True
     ) -> Dict:
         """
         Transcribe audio and return word-level timestamps.
         model_name: str = None,
         language: str = "vi",
         vad_options: Optional[dict | bool] = None,
+        beam_size: int = 1,
         temperature: float = 0.0,
+        best_of: int = 1,
         patience: float = 1.0,
         length_penalty: float = 1.0,
         no_repeat_ngram_size: int = 3,
         initial_prompt: Optional[str] = None,
         prefix_text: Optional[str] = None,
+        condition_on_previous_text: bool = True,
+        no_speech_threshold: float = 0.60,
+        log_prob_threshold: float = -0.5,
         # text repetitive / nonsense
+        compression_ratio_threshold: float = 1.8,
+        word_timestamps=True
     ) -> Dict:
         """
         Async wrapper for transcription (runs in thread pool).