Upload folder using huggingface_hub
Browse files
- asr_diarization/pipeline.py +66 -25
- requirements.txt +2 -0
asr_diarization/pipeline.py
CHANGED

@@ -187,8 +187,26 @@ class ASR_Diarization:
         print(f"🎯 Final: {len(filtered_segments)} segments for Whisper")
         return filtered_segments
 
+    def map_speaker_labels(self, segments, original_speakers=['A', 'B', 'C', 'D']):
+        """NEW: Map SPEAKER_XX labels to A, B, C, D format to match original"""
+        unique_speakers = list(set([seg['speaker'] for seg in segments]))
+        speaker_map = {}
+
+        # Create mapping from SPEAKER_00 -> A, SPEAKER_01 -> B, etc.
+        for i, spk in enumerate(sorted(unique_speakers)):
+            if i < len(original_speakers):
+                speaker_map[spk] = original_speakers[i]
+            else:
+                speaker_map[spk] = f"SPK_{i}"
+
+        # Apply mapping to all segments
+        for seg in segments:
+            seg['speaker'] = speaker_map[seg['speaker']]
+
+        return segments, list(speaker_map.values())
+
     def run_transcription(self, audio_path, diar_json):
-        """FIXED: Transcription with proper timestamp"""
+        """FIXED: Transcription with proper word-level timestamp extraction"""
         # FIX: Load and standardize audio
         audio, sr = torchaudio.load(audio_path)
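
The new `map_speaker_labels` helper is deterministic given the diarizer's label order. A quick sketch of what it produces; the segment dicts here are hypothetical stand-ins for what `run_transcription` emits:

    from asr_diarization.pipeline import ASR_Diarization

    # Hypothetical segments, in the shape run_transcription produces.
    segments = [
        {"speaker": "SPEAKER_01", "text": "hi"},
        {"speaker": "SPEAKER_00", "text": "hello"},
        {"speaker": "SPEAKER_01", "text": "how are you"},
    ]

    # The method only reads its arguments, so it can be called unbound
    # without constructing the model-loading pipeline object.
    mapped, speakers = ASR_Diarization.map_speaker_labels(None, segments)
    print(speakers)                            # ['A', 'B']
    print([seg["speaker"] for seg in mapped])  # ['B', 'A', 'B']

One thing to note: `sorted(unique_speakers)` ties the letters to the diarizer's arbitrary numbering, so "A" is whichever voice happened to get SPEAKER_00, not necessarily the first or dominant speaker.
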
@@ -236,42 +254,62 @@ class ASR_Diarization:
                 reduced = chunk
 
             try:
+                # FIX: Force word-level timestamps and better configuration
+                result = self.asr_pipeline(
+                    reduced,
+                    return_timestamps="word",  # FORCE word-level timestamps
+                    generate_kwargs={
+                        "task": "transcribe",
+                        "language": "en"
+                    }
+                )
             except Exception as e:
                 print(f"⚠️ Whisper failed on segment {start:.2f}-{end:.2f}: {e}")
                 continue
 
             tokens = []
             segment_text = ""
+
+            # FIXED: Proper word-level timestamp extraction
             if "chunks" in result:
-                for word_info in result["chunks"]:
-                    text = word_info.get("text", "").strip()
-
-                    if text:
-                        ...
-                    else:
-                        # Invalid timestamps, use segment boundaries
-                        abs_start = start
-                        abs_end = end
+                for chunk_info in result["chunks"]:
+                    timestamp = chunk_info.get("timestamp")
+                    text = chunk_info.get("text", "").strip()
+
+                    if text and timestamp:
+                        chunk_start, chunk_end = timestamp
+
+                        # Validate and convert to absolute time
+                        if 0 <= chunk_start <= chunk_end <= (end - start):
+                            abs_start = start + chunk_start
+                            abs_end = start + chunk_end
                         else:
+                            # Fallback: use segment boundaries
                             abs_start = start
                             abs_end = end
 
+                        # NEW: Split into individual words with distributed timestamps
+                        words = text.split()
+                        if len(words) == 1:
+                            # Single word - use original timestamp
+                            tokens.append({
+                                "start": abs_start,
+                                "end": abs_end,
+                                "text": text,
+                                "tag": "w"
+                            })
+                        else:
+                            # Multiple words - distribute time evenly
+                            word_duration = (abs_end - abs_start) / len(words)
+                            for i, word in enumerate(words):
+                                word_start = abs_start + (i * word_duration)
+                                word_end = word_start + word_duration
+                                tokens.append({
+                                    "start": word_start,
+                                    "end": word_end,
+                                    "text": word,
+                                    "tag": "w"
+                                })
 
                         segment_text += text + " "
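
Reviewer note on the new extraction logic: with `return_timestamps="word"` the transformers ASR pipeline returns a dict shaped like `{"text": ..., "chunks": [{"text": ..., "timestamp": (start, end)}]}`, and each chunk is normally a single word, so the even-split branch above mostly guards against coarser, phrase-level chunks (e.g., a backend falling back to segment timestamps). Below is a condensed, self-contained sketch of the same arithmetic, run on a hand-built result; the segment bounds and chunk values are made up:

    # Stand-in for an ASR pipeline result; the second chunk mimics a coarse,
    # multi-word chunk that triggers the even time split.
    result = {
        "chunks": [
            {"text": " hello", "timestamp": (0.0, 0.4)},
            {"text": " there general Kenobi", "timestamp": (0.4, 1.6)},
        ],
    }

    start, end = 10.0, 12.0  # absolute bounds of the diarized segment
    tokens = []

    for chunk_info in result["chunks"]:
        text = chunk_info.get("text", "").strip()
        chunk_start, chunk_end = chunk_info["timestamp"]
        abs_start, abs_end = start + chunk_start, start + chunk_end  # to absolute time

        # Distribute the chunk's span evenly across its words
        # (a single word simply keeps the whole span).
        words = text.split()
        word_duration = (abs_end - abs_start) / len(words)
        for i, word in enumerate(words):
            tokens.append({
                "start": abs_start + i * word_duration,
                "end": abs_start + (i + 1) * word_duration,
                "text": word,
                "tag": "w",
            })

    for t in tokens:
        print(f"{t['start']:.2f}-{t['end']:.2f} {t['text']}")
    # 10.00-10.40 hello
    # 10.40-10.80 there
    # 10.80-11.20 general
    # 11.20-11.60 Kenobi

The diff's version computes `word_end = word_start + word_duration` incrementally, which is equivalent up to float rounding.
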
@@ -316,6 +354,9 @@ class ASR_Diarization:
         diar_json = self.run_diarization(audio_path)
         merged_segments, speakers = self.run_transcription(audio_path, diar_json)
 
+        # NEW: Map speaker labels to match original format (A, B, C, D)
+        merged_segments, speakers = self.map_speaker_labels(merged_segments)
+
         # NEW: Combine ASR segments with NSE events if provided
         if nse_events:
             print(f"🔊 Combining {len(merged_segments)} ASR segments with {len(nse_events)} NSE events")

requirements.txt
CHANGED

@@ -5,3 +5,5 @@ transformers
 noisereduce
 jiwer
 librosa
+webrtcvad
+resampy
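
Neither new dependency is exercised in the hunks above; presumably webrtcvad backs the segment filtering upstream (the "segments for Whisper" step) and resampy handles the sample-rate conversion it requires. For reference, a minimal sketch of that pairing; the helper name, frame size, and aggressiveness are illustrative, not taken from this repo:

    import numpy as np
    import resampy
    import webrtcvad

    def voiced_flags(audio, sr, frame_ms=30, mode=2):
        """Per-frame speech flags for a float waveform in [-1, 1] (illustrative helper)."""
        if sr != 16000:
            audio = resampy.resample(audio, sr, 16000)  # webrtcvad accepts 8/16/32/48 kHz
            sr = 16000
        # webrtcvad consumes 16-bit mono PCM in 10/20/30 ms frames
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
        vad = webrtcvad.Vad(mode)  # 0 = least aggressive, 3 = most
        frame_bytes = int(16000 * frame_ms / 1000) * 2
        return [
            vad.is_speech(pcm[i:i + frame_bytes], sr)
            for i in range(0, len(pcm) - frame_bytes + 1, frame_bytes)
        ]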