Spaces:

ranamhamoud
/

Authenticity

Sleeping

App Files Files Community

Ranam Hamoud committed on Dec 2, 2025

Commit

67597e5

1 Parent(s): 8b3fa78

Add audio validation and fix tensor reshape error for short/invalid audio

Browse files

Files changed (1) hide show

speech_recognizer.py +87 -41

speech_recognizer.py CHANGED Viewed

@@ -4,6 +4,7 @@ import numpy as np
 import re
 from typing import Dict, Optional, List
 import warnings
 warnings.filterwarnings("ignore")
@@ -19,6 +20,26 @@ class SpeechRecognizer:
         print(f"Whisper model loaded successfully.")
         self.model_size = model_size
     def transcribe(
         self,
@@ -26,8 +47,13 @@ class SpeechRecognizer:
         language: Optional[str] = None,
         task: str = "transcribe"
     ) -> Dict[str, any]:
-        # Transcribe with Whisper (with word-level timestamps for better pause detection)
-        # Using fp16=False to avoid KV cache issues in production environments
         try:
             result = self.model.transcribe(
                 audio_path,
@@ -38,21 +64,36 @@ class SpeechRecognizer:
                 fp16=False  # Disable fp16 to avoid KV cache KeyError
             )
         except (KeyError, RuntimeError) as e:
-            # Fallback: transcribe without word timestamps if KV cache fails
-            print(f"Warning: Word-level timestamps failed ({e}), retrying without them...")
-            result = self.model.transcribe(
-                audio_path,
-                language=language,
-                task=task,
-                verbose=False,
-                word_timestamps=False,
-                fp16=False
-            )
         transcription = result['text'].strip()
         detected_language = result.get('language', 'unknown')
         segments = result.get('segments', [])
         analysis = self._analyze_transcription(transcription, segments)
         duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
@@ -76,6 +117,39 @@ class SpeechRecognizer:
             'interpretation': self._interpret_speech_patterns(analysis, kopparapu_features, kopparapu_score)
         }
     def _analyze_transcription(self, text: str, segments: List[Dict]) -> Dict:
         words = text.split()
         word_count = len(words)
@@ -160,10 +234,6 @@ class SpeechRecognizer:
         self, text: str, duration_sec: float,
         segments: List[Dict] = None, pause_patterns: Dict = None
     ) -> Dict:
-        """
-        Extract enhanced Kopparapu-like linguistic features for read speech detection.
-        Based on: https://arxiv.org/pdf/2306.08012 with extensions.
-        """
         text = text.strip()
         if len(text) == 0:
             return {
@@ -259,11 +329,6 @@ class SpeechRecognizer:
         }
     def _compute_rate_variability(self, segments: List[Dict]) -> float:
-        """
-        Compute speech rate variability across segments.
-        Read speech has consistent rate; spontaneous varies with thinking.
-        Returns 0-1 where higher = more variable = more spontaneous.
-        """
         if not segments or len(segments) < 3:
             return 0.0
@@ -287,11 +352,6 @@ class SpeechRecognizer:
         return float(min(1.0, cv / 0.5))  # CV of 0.5+ maps to 1.0
     def _compute_sentence_variance(self, text: str) -> float:
-        """
-        Compute variance in sentence lengths.
-        Read/scripted text tends to have more uniform sentence structure.
-        Returns 0-1 where higher = more variance = more spontaneous.
-        """
         # Split into sentences
         sentences = re.split(r'[.!?]+', text)
         sentences = [s.strip() for s in sentences if s.strip()]
@@ -307,23 +367,9 @@ class SpeechRecognizer:
         cv = std_len / mean_len if mean_len > 0 else 0
         return float(min(1.0, cv / 0.6))  # CV of 0.6+ maps to 1.0
-    def _logistic(self, x: float, a: float, b: float) -> float:
-        """Sigmoid function centered at 'a' with steepness 'b'."""
-        return 1.0 / (1.0 + np.exp(-(x - a) / b))
     def _calculate_kopparapu_score(self, features: Dict) -> float:
-        """
-        Calculate enhanced Kopparapu score for read vs spontaneous classification.
-        Score closer to 1 = more likely READ, closer to 0 = more likely SPONTANEOUS.
-        Key signals for READ speech:
-        - Higher chars_per_word (formal vocabulary)
-        - Faster, steadier words_per_sec
-        - Lower filler rate and disfluencies
-        - Regular pause patterns (pause_regularity high)
-        - Low speech rate variability
-        - Uniform sentence lengths
-        """
         # L1: Vocabulary complexity - higher chars/word = more formal = read
         f1 = features['chars_per_word']
         L1 = self._logistic(f1, a=4.8, b=1.2)

 import re
 from typing import Dict, Optional, List
 import warnings
+import librosa
 warnings.filterwarnings("ignore")
         print(f"Whisper model loaded successfully.")
         self.model_size = model_size
+    def _validate_audio(self, audio_path: str) -> tuple[bool, str, float]:
+        """Check that an audio file is usable before handing it to the model.
+
+        The file is loaded with ``librosa.load(audio_path, sr=16000)`` and
+        rejected when it lasts less than 0.1 seconds or when its peak
+        absolute amplitude is below 0.001.
+
+        Returns:
+            A ``(is_valid, message, duration_seconds)`` tuple. Any exception
+            raised while loading or inspecting the audio is reported as
+            ``(False, "Failed to load audio: ...", 0.0)`` instead of
+            propagating to the caller.
+        """
+        try:
+            samples, sample_rate = librosa.load(audio_path, sr=16000)
+            length_sec = len(samples) / sample_rate
+            if length_sec < 0.1:
+                # Too little signal to transcribe.
+                verdict = (False, "Audio is too short (< 0.1 seconds)")
+            elif np.max(np.abs(samples)) < 0.001:
+                # Peak amplitude is effectively zero: silent or empty input.
+                verdict = (False, "Audio appears to be silent or empty")
+            else:
+                verdict = (True, "Valid")
+            return (*verdict, length_sec)
+        except Exception as err:
+            # Deliberate best-effort: report the failure rather than raise.
+            return False, f"Failed to load audio: {str(err)}", 0.0
     def transcribe(
         self,
         language: Optional[str] = None,
         task: str = "transcribe"
     ) -> Dict[str, any]:
+        # Validate audio first
+        is_valid, message, audio_duration = self._validate_audio(audio_path)
+        if not is_valid:
+            print(f"Audio validation failed: {message}")
+            # Return minimal valid response for invalid audio
+            return self._get_empty_response(message, audio_duration)
         try:
             result = self.model.transcribe(
                 audio_path,
                 fp16=False  # Disable fp16 to avoid KV cache KeyError
             )
         except (KeyError, RuntimeError) as e:
+            error_msg = str(e)
+            # Check if it's a tensor shape error (empty audio issue)
+            if "reshape tensor of 0 elements" in error_msg or "ambiguous" in error_msg:
+                print(f"Audio processing failed: Audio may be too short or corrupted")
+                return self._get_empty_response("Audio too short or corrupted", audio_duration)
+            # Fallback: transcribe without word timestamps for other errors
+            print(f"Warning: Transcription failed ({error_msg[:100]}), retrying without word timestamps...")
+            try:
+                result = self.model.transcribe(
+                    audio_path,
+                    language=language,
+                    task=task,
+                    verbose=False,
+                    word_timestamps=False,
+                    fp16=False
+                )
+            except Exception as e2:
+                print(f"Transcription completely failed: {e2}")
+                return self._get_empty_response(f"Transcription failed: {str(e2)[:100]}", audio_duration)
         transcription = result['text'].strip()
         detected_language = result.get('language', 'unknown')
         segments = result.get('segments', [])
+        # Handle empty transcription
+        if not transcription or len(transcription.strip()) == 0:
+            print("Warning: Transcription is empty")
+            return self._get_empty_response("No speech detected in audio", audio_duration)
         analysis = self._analyze_transcription(transcription, segments)
         duration = analysis['duration'] if analysis['duration'] > 0 else 1.0
             'interpretation': self._interpret_speech_patterns(analysis, kopparapu_features, kopparapu_score)
         }
+    def _get_empty_response(self, reason: str, duration: float = 0.0) -> Dict[str, any]:
+        """Build the placeholder result returned when audio cannot be analysed.
+
+        All metrics are zeroed, the Kopparapu score is set to the neutral
+        value 0.5 with classification 'unknown', and ``reason`` is embedded
+        in both the 'transcription' and 'interpretation' texts.
+
+        Args:
+            reason: Short description of why processing failed.
+            duration: Audio duration to report under 'duration'.
+
+        Returns:
+            A dict with zeroed metrics and the failure reason.
+        """
+        no_pauses = {
+            'avg_pause': 0.0,
+            'max_pause': 0.0,
+            'num_pauses': 0,
+            'pause_variability': 0.0
+        }
+        no_fillers = {
+            'count': 0,
+            'ratio': 0.0,
+            'details': {}
+        }
+        zero_features = {
+            'chars_per_word': 0.0,
+            'words_per_sec': 0.0,
+            'nonalpha_per_sec': 0.0,
+            'filler_rate': 0.0,
+            'repetition_count': 0,
+            'alpha_ratio': 0.0
+        }
+        # User-facing explanation, including hints on how to fix the input.
+        guidance = f"⚠️ Audio processing failed: {reason}\n\nPlease ensure:\n- Audio is at least 1 second long\n- Audio contains actual speech\n- Audio file is not corrupted"
+        return {
+            'transcription': f"[Error: {reason}]",
+            'language': 'unknown',
+            'segments': [],
+            'word_count': 0,
+            'duration': duration,
+            'speech_rate': 0.0,
+            'pause_patterns': no_pauses,
+            'filler_words': no_fillers,
+            'kopparapu_features': zero_features,
+            'kopparapu_score': 0.5,
+            'kopparapu_classification': 'unknown',
+            'interpretation': guidance
+        }
     def _analyze_transcription(self, text: str, segments: List[Dict]) -> Dict:
         words = text.split()
         word_count = len(words)
         self, text: str, duration_sec: float,
         segments: List[Dict] = None, pause_patterns: Dict = None
     ) -> Dict:
         text = text.strip()
         if len(text) == 0:
             return {
         }
     def _compute_rate_variability(self, segments: List[Dict]) -> float:
         if not segments or len(segments) < 3:
             return 0.0
         return float(min(1.0, cv / 0.5))  # CV of 0.5+ maps to 1.0
     def _compute_sentence_variance(self, text: str) -> float:
         # Split into sentences
         sentences = re.split(r'[.!?]+', text)
         sentences = [s.strip() for s in sentences if s.strip()]
         cv = std_len / mean_len if mean_len > 0 else 0
         return float(min(1.0, cv / 0.6))  # CV of 0.6+ maps to 1.0
+    def _logistic(self, x: float, a: float, b: float) -> float:
+        """Evaluate the logistic curve 1 / (1 + exp(-(x - a) / b)) at ``x``.
+
+        ``a`` is the midpoint (the result is 0.5 when ``x == a``) and ``b``
+        scales how quickly the curve moves between 0 and 1.
+        """
+        offset = (x - a) / b
+        return 1.0 / (1.0 + np.exp(-offset))
     def _calculate_kopparapu_score(self, features: Dict) -> float:
         # L1: Vocabulary complexity - higher chars/word = more formal = read
         f1 = features['chars_per_word']
         L1 = self._logistic(f1, a=4.8, b=1.2)