liuyang committed on
Commit 7bde45c · 1 Parent(s): 36812ab

Refactor speaker assignment logic in transcription: enhance the `assign_speakers_to_transcription` method to detect unmatched diarization segments, introduce a second pass that splits segments containing speaker changes, improve handling of speaker transitions, and re-transcribe unmatched diarization-only regions so they can be merged back into the results.
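The core detection step is easiest to see in isolation. Below is a minimal, self-contained sketch of how "unmatched" diarization turns are identified (an illustration, not the app's code: `find_unmatched` is a hypothetical name, and `interval_overlap` is assumed to behave like the helper the diff calls):

    # Sketch: a diarization turn with no overlapping transcription segment
    # is queued for re-transcription (this mirrors the check the commit adds).
    def interval_overlap(a0, a1, b0, b1):
        return max(0.0, min(a1, b1) - max(a0, b0))

    def find_unmatched(diarization_segments, transcription_results):
        unmatched = []
        for dseg in diarization_segments:
            if not any(
                interval_overlap(dseg["start"], dseg["end"], seg["start"], seg["end"]) > 1e-6
                for seg in transcription_results
            ):
                unmatched.append(dseg)
        return unmatched

    # Example: the 5.0-7.0 turn has no transcription, so it gets re-processed.
    trans = [{"start": 0.0, "end": 2.0, "text": "hello there"}]
    diar = [{"start": 0.0, "end": 2.0, "speaker": "SPEAKER_00"},
            {"start": 5.0, "end": 7.0, "speaker": "SPEAKER_01"}]
    print(find_unmatched(diar, trans))  # [{'start': 5.0, 'end': 7.0, 'speaker': 'SPEAKER_01'}]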

Files changed (1)
  1. app.py  +210 −50
app.py CHANGED
@@ -565,9 +565,13 @@ class WhisperTranscriber:
        return _embedder

    def assign_speakers_to_transcription(self, transcription_results, diarization_segments):
-        """Assign speakers to words and segments based on overlap with diarization segments."""
+        """Assign speakers to words and segments based on overlap with diarization segments.
+
+        Also detects diarization segments that do not overlap any transcription segment and
+        returns them so they can be re-processed (e.g., re-transcribed) later.
+        """
        if not diarization_segments:
-            return transcription_results
+            return transcription_results, []
        # Helper: find the diarization speaker active at time t, or closest
        def speaker_at(t: float):
            for dseg in diarization_segments:
@@ -607,8 +611,8 @@ class WhisperTranscriber:
            mid = (float(start_t) + float(end_t)) / 2.0
            return speaker_at(mid)

+        # First pass: assign speakers to words and apply smoothing
        for seg in transcription_results:
-            # Assign per-word speakers using overlap, then smooth and stabilize boundaries
            if seg.get("words"):
                words = seg["words"]
                # 1) Initial assignment by overlap
@@ -628,55 +632,165 @@ class WhisperTranscriber:
                    smoothed[i] = prev_spk
                for i in range(len(words)):
                    words[i]["speaker"] = smoothed[i]
-
-                # 3) Determine dominant speaker by summed word durations
-                speaker_dur = {}
-                total_word_dur = 0.0
-                for w in words:
-                    dur = max(0.0, float(w["end"]) - float(w["start"]))
-                    total_word_dur += dur
-                    spk = w.get("speaker", "SPEAKER_00")
-                    speaker_dur[spk] = speaker_dur.get(spk, 0.0) + dur
-                if speaker_dur:
-                    dominant_speaker = max(speaker_dur.items(), key=lambda kv: kv[1])[0]
-                else:
-                    dominant_speaker = speaker_at((float(seg["start"]) + float(seg["end"])) / 2.0)
-
-                # 4) Boundary stabilization: relabel tiny prefix/suffix runs to dominant
-                seg_duration = max(1e-6, float(seg["end"]) - float(seg["start"]))
-                max_boundary_sec = 0.5  # hard cap for how much to relabel at edges
-                max_boundary_frac = 0.2  # or up to 20% of the segment duration
-
-                # prefix
-                prefix_dur = 0.0
-                prefix_count = 0
-                for w in words:
-                    if w.get("speaker") == dominant_speaker:
-                        break
-                    prefix_dur += max(0.0, float(w["end"]) - float(w["start"]))
-                    prefix_count += 1
-                if prefix_count > 0 and prefix_dur <= min(max_boundary_sec, max_boundary_frac * seg_duration):
-                    for i in range(prefix_count):
-                        words[i]["speaker"] = dominant_speaker
-
-                # suffix
-                suffix_dur = 0.0
-                suffix_count = 0
-                for w in reversed(words):
-                    if w.get("speaker") == dominant_speaker:
-                        break
-                    suffix_dur += max(0.0, float(w["end"]) - float(w["start"]))
-                    suffix_count += 1
-                if suffix_count > 0 and suffix_dur <= min(max_boundary_sec, max_boundary_frac * seg_duration):
-                    for i in range(len(words) - suffix_count, len(words)):
-                        words[i]["speaker"] = dominant_speaker
-
-                # 5) Final segment speaker
-                seg["speaker"] = dominant_speaker
            else:
                # No word timings: choose by overlap with diarization over the whole segment
                seg["speaker"] = best_speaker_for_interval(float(seg["start"]), float(seg["end"]))
-        return transcription_results
+
+        # Second pass: split segments that have speaker changes within them
+        split_segments = []
+        for seg in transcription_results:
+            words = seg.get("words", [])
+            if not words or len(words) <= 1:
+                # No words or single word - can't split, assign speaker directly
+                if not words:
+                    seg["speaker"] = best_speaker_for_interval(float(seg["start"]), float(seg["end"]))
+                else:
+                    seg["speaker"] = words[0].get("speaker", "SPEAKER_00")
+                split_segments.append(seg)
+                continue
+
+            # Find speaker transition points with minimum duration filter
+            current_speaker = words[0].get("speaker", "SPEAKER_00")
+            split_points = [0]  # Always start with first word
+            min_segment_duration = 0.5  # Minimum 0.5 seconds per segment
+
+            for i in range(1, len(words)):
+                word_speaker = words[i].get("speaker", "SPEAKER_00")
+                if word_speaker != current_speaker:
+                    # Check if this would create a segment that's too short
+                    if split_points:
+                        last_split = split_points[-1]
+                        segment_start_time = float(words[last_split]["start"])
+                        current_word_time = float(words[i-1]["end"])
+                        segment_duration = current_word_time - segment_start_time
+
+                        # Only split if the previous segment would be long enough
+                        if segment_duration >= min_segment_duration:
+                            split_points.append(i)
+                            current_speaker = word_speaker
+                        # If too short, continue without splitting (speaker will be resolved by dominant speaker logic)
+                    else:
+                        split_points.append(i)
+                        current_speaker = word_speaker
+
+            split_points.append(len(words))  # End point
+
+            # Create sub-segments if we found speaker changes
+            if len(split_points) <= 2:
+                # No splits needed - process as single segment
+                self._assign_dominant_speaker_to_segment(seg, speaker_at, best_speaker_for_interval)
+                split_segments.append(seg)
+            else:
+                # Split into multiple segments
+                for i in range(len(split_points) - 1):
+                    start_idx = split_points[i]
+                    end_idx = split_points[i + 1]
+
+                    if end_idx <= start_idx:
+                        continue
+
+                    subseg_words = words[start_idx:end_idx]
+                    if not subseg_words:
+                        continue
+
+                    # Calculate segment timing and text from words
+                    subseg_start = float(subseg_words[0]["start"])
+                    subseg_end = float(subseg_words[-1]["end"])
+                    subseg_text = " ".join(w.get("word", "").strip() for w in subseg_words if w.get("word", "").strip())
+
+                    # Create new sub-segment
+                    new_seg = {
+                        "start": subseg_start,
+                        "end": subseg_end,
+                        "text": subseg_text,
+                        "words": subseg_words,
+                        "duration": subseg_end - subseg_start,
+                    }
+
+                    # Copy over other fields from original segment if they exist
+                    for key in ["avg_logprob"]:
+                        if key in seg:
+                            new_seg[key] = seg[key]
+
+                    # Assign dominant speaker to this sub-segment
+                    self._assign_dominant_speaker_to_segment(new_seg, speaker_at, best_speaker_for_interval)
+                    split_segments.append(new_seg)
+
+        # Update transcription_results with split segments
+        transcription_results = split_segments
+
+        # Identify diarization segments that have no overlapping transcription segments
+        unmatched_diarization_segments = []
+        for dseg in diarization_segments:
+            d_start = float(dseg["start"])
+            d_end = float(dseg["end"])
+            has_overlap = False
+            for seg in transcription_results:
+                if interval_overlap(d_start, d_end, float(seg["start"]), float(seg["end"])) > 1e-6:
+                    has_overlap = True
+                    break
+            if not has_overlap:
+                unmatched_diarization_segments.append({
+                    "start": d_start,
+                    "end": d_end,
+                    "speaker": dseg["speaker"],
+                })
+
+        return transcription_results, unmatched_diarization_segments
+
+    def _assign_dominant_speaker_to_segment(self, seg, speaker_at_func, best_speaker_for_interval_func):
+        """Assign dominant speaker to a segment based on word durations and boundary stabilization."""
+        words = seg.get("words", [])
+        if not words:
+            # No words: use segment-level overlap
+            seg["speaker"] = best_speaker_for_interval_func(float(seg["start"]), float(seg["end"]))
+            return
+
+        # 1) Determine dominant speaker by summed word durations
+        speaker_dur = {}
+        total_word_dur = 0.0
+        for w in words:
+            dur = max(0.0, float(w["end"]) - float(w["start"]))
+            total_word_dur += dur
+            spk = w.get("speaker", "SPEAKER_00")
+            speaker_dur[spk] = speaker_dur.get(spk, 0.0) + dur
+
+        if speaker_dur:
+            dominant_speaker = max(speaker_dur.items(), key=lambda kv: kv[1])[0]
+        else:
+            dominant_speaker = speaker_at_func((float(seg["start"]) + float(seg["end"])) / 2.0)
+
+        # 2) Boundary stabilization: relabel tiny prefix/suffix runs to dominant
+        seg_duration = max(1e-6, float(seg["end"]) - float(seg["start"]))
+        max_boundary_sec = 0.5  # hard cap for how much to relabel at edges
+        max_boundary_frac = 0.2  # or up to 20% of the segment duration
+
+        # prefix
+        prefix_dur = 0.0
+        prefix_count = 0
+        for w in words:
+            if w.get("speaker") == dominant_speaker:
+                break
+            prefix_dur += max(0.0, float(w["end"]) - float(w["start"]))
+            prefix_count += 1
+        if prefix_count > 0 and prefix_dur <= min(max_boundary_sec, max_boundary_frac * seg_duration):
+            for i in range(prefix_count):
+                words[i]["speaker"] = dominant_speaker
+
+        # suffix
+        suffix_dur = 0.0
+        suffix_count = 0
+        for w in reversed(words):
+            if w.get("speaker") == dominant_speaker:
+                break
+            suffix_dur += max(0.0, float(w["end"]) - float(w["start"]))
+            suffix_count += 1
+        if suffix_count > 0 and suffix_dur <= min(max_boundary_sec, max_boundary_frac * seg_duration):
+            for i in range(len(words) - suffix_count, len(words)):
+                words[i]["speaker"] = dominant_speaker
+
+        # 3) Final segment speaker
+        seg["speaker"] = dominant_speaker

    def group_segments_by_speaker(self, segments, max_gap=1.0, max_duration=30.0):
        """Group consecutive segments from the same speaker"""
@@ -801,7 +915,53 @@ class WhisperTranscriber:
        )

        # Step 4: Merge diarization into transcription (assign speakers)
-        transcription_results = self.assign_speakers_to_transcription(transcription_results, diarization_segments)
+        transcription_results, unmatched_diarization_segments = self.assign_speakers_to_transcription(
+            transcription_results, diarization_segments
+        )
+
+        # Step 4.1: Transcribe diarization-only regions and merge
+        if unmatched_diarization_segments:
+            waveform, sample_rate = torchaudio.load(wav_path)
+            extra_segments = []
+            for dseg in unmatched_diarization_segments:
+                d_start = float(dseg["start"])  # global seconds
+                d_end = float(dseg["end"])  # global seconds
+                if d_end <= d_start:
+                    continue
+                # Map global time to local file time
+                local_start = max(0.0, d_start - float(base_offset_s))
+                local_end = max(local_start, d_end - float(base_offset_s))
+                start_sample = max(0, int(local_start * sample_rate))
+                end_sample = min(waveform.shape[1], int(local_end * sample_rate))
+                if end_sample <= start_sample:
+                    continue
+                seg_wav = waveform[:, start_sample:end_sample].contiguous()
+                tmp_f = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+                tmp_path = tmp_f.name
+                tmp_f.close()
+                try:
+                    torchaudio.save(tmp_path, seg_wav.cpu(), sample_rate)
+                    seg_transcription, _ = self.transcribe_full_audio(
+                        tmp_path,
+                        language=language if language is not None else None,
+                        translate=translate,
+                        prompt=prompt,
+                        batch_size=batch_size,
+                        base_offset_s=d_start,
+                    )
+                    extra_segments.extend(seg_transcription)
+                finally:
+                    try:
+                        os.unlink(tmp_path)
+                    except Exception:
+                        pass
+            if extra_segments:
+                transcription_results.extend(extra_segments)
+                transcription_results.sort(key=lambda s: float(s.get("start", 0.0)))
+                # Re-assign speakers on the combined set
+                transcription_results, _ = self.assign_speakers_to_transcription(
+                    transcription_results, diarization_segments
+                )

        # Step 5: Group segments if requested
        if group_segments:
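For intuition about the new second pass, here is a minimal, self-contained sketch of its split rule (a re-implementation for illustration only, not the app's code; `find_split_points` is a hypothetical name, word dicts follow the keys used in the diff, and the 0.5 s threshold mirrors `min_segment_duration`):

    # Sketch of the second-pass rule: a speaker change only opens a new
    # sub-segment when the run before it is at least 0.5 s long; shorter
    # runs are absorbed and left to the dominant-speaker logic.
    def find_split_points(words, min_segment_duration=0.5):
        # Assumes a non-empty word list, as the diff guards len(words) <= 1 earlier.
        split_points = [0]
        current_speaker = words[0].get("speaker", "SPEAKER_00")
        for i in range(1, len(words)):
            spk = words[i].get("speaker", "SPEAKER_00")
            if spk != current_speaker:
                run = float(words[i - 1]["end"]) - float(words[split_points[-1]]["start"])
                if run >= min_segment_duration:
                    split_points.append(i)
                    current_speaker = spk
        split_points.append(len(words))
        return split_points

    # A 0.2 s interjection by SPEAKER_01 is too short to split on:
    words = [
        {"word": "so", "start": 0.0, "end": 0.3, "speaker": "SPEAKER_00"},
        {"word": "yeah", "start": 0.3, "end": 0.5, "speaker": "SPEAKER_01"},
        {"word": "anyway", "start": 0.5, "end": 1.2, "speaker": "SPEAKER_00"},
    ]
    print(find_split_points(words))  # [0, 3] -> segment is kept whole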