Update custom model files, README, and requirements
Browse files- .gitattributes +0 -1
- asr_pipeline.py +42 -6
- handler.py +19 -0
.gitattributes
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 2 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 3 |
tokenizer_config.json -filter -diff -merge text
|
| 4 |
-
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 2 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 3 |
tokenizer_config.json -filter -diff -merge text
|
|
|
asr_pipeline.py
CHANGED
|
@@ -523,6 +523,13 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
|
|
| 523 |
text = self._post_process_prediction(text)
|
| 524 |
return {"text": text}
|
| 525 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 526 |
def _post_process_prediction(self, text: str) -> str:
|
| 527 |
"""Post-process model output to fix common issues."""
|
| 528 |
if not text:
|
|
@@ -531,22 +538,29 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
|
|
| 531 |
# 1. LOWERCASE
|
| 532 |
text = text.lower()
|
| 533 |
|
| 534 |
-
# 2.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
# Merge consecutive single letters into one word (e.g., "u s a" -> "usa")
|
| 536 |
text = re.sub(r"\b([a-z])((?:\s+[a-z])+)\b", lambda m: m.group(0).replace(" ", ""), text)
|
| 537 |
|
| 538 |
-
#
|
| 539 |
# Convert "eur X" to "X euros" for Whisper normalizer compatibility
|
| 540 |
text = re.sub(r"\beur\s+(\d+)", r"\1 euros", text)
|
| 541 |
|
| 542 |
-
#
|
|
|
|
|
|
|
|
|
|
| 543 |
text = self._truncate_trailing_repeats(text)
|
| 544 |
|
| 545 |
-
#
|
| 546 |
return re.sub(r"\s+", " ", text).strip()
|
| 547 |
|
| 548 |
-
def _truncate_trailing_repeats(self, text: str, max_ngram: int =
|
| 549 |
-
"""Remove trailing repeated n-grams (1-
|
| 550 |
words = text.split()
|
| 551 |
if len(words) < 2:
|
| 552 |
return text
|
|
@@ -566,3 +580,25 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
|
|
| 566 |
break # Restart from largest n-gram
|
| 567 |
|
| 568 |
return " ".join(words)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 523 |
text = self._post_process_prediction(text)
|
| 524 |
return {"text": text}
|
| 525 |
|
| 526 |
+
# Known hallucination outputs; a prediction matching one of these exactly
# (after lowercasing/stripping) is discarded entirely.
HALLUCINATION_PATTERNS = frozenset({"and gt and gt"})
|
| 532 |
+
|
| 533 |
def _post_process_prediction(self, text: str) -> str:
|
| 534 |
"""Post-process model output to fix common issues."""
|
| 535 |
if not text:
|
|
|
|
| 538 |
# 1. LOWERCASE
|
| 539 |
text = text.lower()
|
| 540 |
|
| 541 |
+
# 2. CHECK FOR KNOWN HALLUCINATIONS (delete entirely)
|
| 542 |
+
if text.strip() in self.HALLUCINATION_PATTERNS:
|
| 543 |
+
return ""
|
| 544 |
+
|
| 545 |
+
# 3. COMBINE ACRONYMS
|
| 546 |
# Merge consecutive single letters into one word (e.g., "u s a" -> "usa")
|
| 547 |
text = re.sub(r"\b([a-z])((?:\s+[a-z])+)\b", lambda m: m.group(0).replace(" ", ""), text)
|
| 548 |
|
| 549 |
+
# 4. NORMALIZE CURRENCY
|
| 550 |
# Convert "eur X" to "X euros" for Whisper normalizer compatibility
|
| 551 |
text = re.sub(r"\beur\s+(\d+)", r"\1 euros", text)
|
| 552 |
|
| 553 |
+
# 5. TRUNCATE CHARACTER REPETITIONS (e.g., "uhhhhhh" -> "uhhh")
|
| 554 |
+
text = self._truncate_character_repetitions(text)
|
| 555 |
+
|
| 556 |
+
# 6. TRUNCATE TRAILING REPEATS (word-level)
|
| 557 |
text = self._truncate_trailing_repeats(text)
|
| 558 |
|
| 559 |
+
# 7. STRIP WHITESPACE
|
| 560 |
return re.sub(r"\s+", " ", text).strip()
|
| 561 |
|
| 562 |
+
def _truncate_trailing_repeats(self, text: str, max_ngram: int = 10) -> str:
|
| 563 |
+
"""Remove trailing repeated n-grams (1-10 words)."""
|
| 564 |
words = text.split()
|
| 565 |
if len(words) < 2:
|
| 566 |
return text
|
|
|
|
| 580 |
break # Restart from largest n-gram
|
| 581 |
|
| 582 |
return " ".join(words)
|
| 583 |
+
|
| 584 |
+
def _truncate_character_repetitions(self, text: str, max_repeats: int = 3) -> str:
|
| 585 |
+
"""Remove excessive character repetitions (e.g., 'uhhhhhh' -> 'uhh').
|
| 586 |
+
|
| 587 |
+
Handles hallucinations where the model outputs the same character many times,
|
| 588 |
+
like "uhhhhhhhhhhhhhhhhhhhhhhhhh" at the end of a prediction.
|
| 589 |
+
|
| 590 |
+
Args:
|
| 591 |
+
text: Input text to clean
|
| 592 |
+
max_repeats: Maximum allowed consecutive repetitions of a character
|
| 593 |
+
|
| 594 |
+
Returns:
|
| 595 |
+
Text with character repetitions truncated
|
| 596 |
+
"""
|
| 597 |
+
if not text:
|
| 598 |
+
return text
|
| 599 |
+
|
| 600 |
+
# Replace any character repeated more than max_repeats times with max_repeats
|
| 601 |
+
# Pattern: any character followed by itself N+ times
|
| 602 |
+
pattern = rf"(.)\1{{{max_repeats},}}"
|
| 603 |
+
replacement = r"\1" * max_repeats
|
| 604 |
+
return re.sub(pattern, replacement, text)
|
handler.py
CHANGED
|
@@ -15,7 +15,18 @@ except ImportError:
|
|
| 15 |
|
| 16 |
|
| 17 |
class EndpointHandler:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
def __init__(self, path: str = ""):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
import os
|
| 20 |
|
| 21 |
import nltk
|
|
@@ -104,6 +115,14 @@ class EndpointHandler:
|
|
| 104 |
print(f"Warmup skipped due to: {e}")
|
| 105 |
|
| 106 |
def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
inputs = data.get("inputs")
|
| 108 |
if inputs is None:
|
| 109 |
raise ValueError("Missing 'inputs' in request data")
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
class EndpointHandler:
|
| 18 |
+
"""HuggingFace Inference Endpoints handler for ASR model.
|
| 19 |
+
|
| 20 |
+
Handles model loading, warmup, and inference requests for deployment
|
| 21 |
+
on HuggingFace Inference Endpoints or similar services.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
def __init__(self, path: str = ""):
|
| 25 |
+
"""Initialize the endpoint handler.
|
| 26 |
+
|
| 27 |
+
Args:
|
| 28 |
+
path: Path to model directory or HuggingFace model ID
|
| 29 |
+
"""
|
| 30 |
import os
|
| 31 |
|
| 32 |
import nltk
|
|
|
|
| 115 |
print(f"Warmup skipped due to: {e}")
|
| 116 |
|
| 117 |
def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
|
| 118 |
+
"""Process an inference request.
|
| 119 |
+
|
| 120 |
+
Args:
|
| 121 |
+
data: Request data containing 'inputs' (audio path/bytes) and optional 'parameters'
|
| 122 |
+
|
| 123 |
+
Returns:
|
| 124 |
+
Transcription result with 'text' key
|
| 125 |
+
"""
|
| 126 |
inputs = data.get("inputs")
|
| 127 |
if inputs is None:
|
| 128 |
raise ValueError("Missing 'inputs' in request data")
|