Spaces:

vyluong
/

PoC_ASR_v6_dev

Sleeping

App Files Files Community

vyluong committed 6 days ago

Commit

b09cd72

verified ·

1 Parent(s): 77f83aa

Update app/services/alignment.py

Browse files

Files changed (1) hide show

app/services/alignment.py +429 -138

app/services/alignment.py CHANGED Viewed

@@ -1,11 +1,19 @@
 """
-Precision alignment service - Word-center-based speaker assignment.
 Merges word-level transcription with speaker diarization using precise timestamps.
 """
 import logging
 from pathlib import Path
 from typing import List, Tuple, Optional
 from dataclasses import dataclass
 from app.core.config import get_settings
 from app.services.transcription import WordTimestamp
@@ -25,6 +33,7 @@ class WordWithSpeaker:
     start: float
     end: float
     speaker: str
 class AlignmentService:
@@ -32,12 +41,30 @@ class AlignmentService:
     Precision alignment service.
     Uses word-center-based algorithm for accurate speaker-to-text mapping.
     """
-    PAUSE_THRESHOLD = 0.45
-    CENTER_TOL = 0.15 # s (150 ms)
-    OVERLAP_TH = 0.12 # > x% segments
-    DIA_MERGE_GAP = 0.25
-    MAX_SEGMENT_DURATION = 7.5
     @staticmethod
     def get_word_center(word: WordTimestamp) -> float:
@@ -52,25 +79,77 @@ class AlignmentService:
         return overlap / dur
-    # Diarization merge
     @classmethod
-    def merge_dia_segments(cls, segments: List[SpeakerSegment]) -> List[SpeakerSegment]:
         if not segments:
             return []
-        segments = sorted(segments, key=lambda s: s.start)
-        merged = [segments[0]]
-        for s in segments[1:]:
-            p = merged[-1]
-            if s.speaker == p.speaker and (s.start - p.end) <= cls.DIA_MERGE_GAP:
-                p.end = s.end
             else:
-                merged.append(s)
         return merged
     @classmethod
     def find_speaker_center(
         cls,
@@ -79,26 +158,43 @@ class AlignmentService:
     ) -> Optional[str]:
         for seg in speaker_segments:
-            if seg.start - cls.CENTER_TOL <= time <= seg.end + cls.CENTER_TOL:
                 return seg.speaker
         return None
     @staticmethod
-    def find_closest_speaker(time: float, speaker_segments: List[SpeakerSegment]) -> str:
         if not speaker_segments:
-            return "Unknown"
-        min_dist = float("inf")
-        closest = "Unknown"
         for seg in speaker_segments:
-            d = min(abs(time - seg.start), abs(time - seg.end))
-            if d < min_dist:
-                min_dist = d
-                closest = seg.speaker
-        return closest
     @classmethod
     def assign_speakers_to_words(
@@ -107,161 +203,342 @@ class AlignmentService:
         speaker_segments: List[SpeakerSegment],
     ) -> List[WordWithSpeaker]:
-        words = [w for w in words if w.word and w.word.strip()]
         if not speaker_segments:
-            logger.warning("No diarization, fallback single speaker")
             return [
-                WordWithSpeaker(w.word, w.start, w.end, "Speaker 1")
                 for w in words
             ]
-        speaker_segments = cls.merge_dia_segments(speaker_segments)
         results = []
         for word in words:
             center = cls.get_word_center(word)
-            # 1. CENTER
-            speaker = cls.find_speaker_center(center, speaker_segments)
             if speaker is None:
-                # 2. OVERLAP
-                best_ratio = 0
                 best_spk = None
                 for seg in speaker_segments:
-                    r = cls.overlap_ratio(word.start, word.end, seg.start, seg.end)
                     if r > best_ratio:
                         best_ratio = r
                         best_spk = seg.speaker
                 if best_ratio >= cls.OVERLAP_TH:
                     speaker = best_spk
-                else:
-                    # 3. CLOSEST
-                    speaker = cls.find_closest_speaker(center, speaker_segments)
             results.append(
-                WordWithSpeaker(word.word, word.start, word.end, speaker)
             )
         return results
     @classmethod
     def reconstruct_segments(
         cls,
-        words_with_speakers: List[WordWithSpeaker]
     ) -> List[TranscriptSegment]:
-        """
-        Step 3d: Reconstruct sentence segments from words.
-        Groups consecutive words of the same speaker into segments.
-        Creates new segment when:
-        - Speaker changes
-        - Pause > PAUSE_THRESHOLD between words
-        Args:
-            words_with_speakers: List of words with speaker assignments
-        Returns:
-            List of TranscriptSegment with complete sentences
-        """
         if not words_with_speakers:
             return []
         segments = []
-        # Start first segment
-        current_speaker = words_with_speakers[0].speaker
-        current_start = words_with_speakers[0].start
-        current_end = words_with_speakers[0].end
-        current_words = [words_with_speakers[0].word]
         for i in range(1, len(words_with_speakers)):
-            word = words_with_speakers[i]
-            prev_word = words_with_speakers[i - 1]
-            # Calculate pause between words
-            pause = word.start - prev_word.end
-            # Check if we need to start a new segment
-            speaker_changed = word.speaker != current_speaker
-            significant_pause = pause > cls.PAUSE_THRESHOLD
-            segment_duration = current_end - current_start
-            too_long = segment_duration > cls.MAX_SEGMENT_DURATION and pause > 0.15
-            if speaker_changed or significant_pause or too_long:
-                # Save current segment
-                segments.append(TranscriptSegment(
-                    start=current_start,
-                    end=current_end,
-                    speaker=current_speaker,
-                    role="UNKNOWN",
-                    text=" ".join(current_words)
-                ))
-                # Start new segment
-                current_speaker = word.speaker
-                current_start = word.start
-                current_end = word.end
-                current_words = [word.word]
             else:
-                # Continue current segment
-                current_end = word.end
-                current_words.append(word.word)
-        if current_words:
-            segments.append(TranscriptSegment(
-                start=current_start,
-                end=current_end,
-                speaker=current_speaker,
-                role="UNKNOWN",
-                text=" ".join(current_words)
-            ))
-        logger.debug(f"Reconstructed {len(segments)} segments from {len(words_with_speakers)} words")
         return segments
     @classmethod
     def resize_and_merge_segments(
         cls,
-        segments: List[TranscriptSegment]
     ) -> List[TranscriptSegment]:
-        """
-        Merge consecutive segments of the same speaker if the gap is small.
-        Also filters out extremely short segments.
-        """
-        if not segments:
-            return []
-        # Filter 1: Remove extremely short blips (noise)
-        segments = [s for s in segments if (s.end - s.start) >= settings.min_segment_duration_s]
         if not segments:
             return []
-        merged = []
-        curr = segments[0]
-        for i in range(1, len(segments)):
-            next_seg = segments[i]
-            # If same speaker and gap is small, merge
-            gap = next_seg.start - curr.end
-            if next_seg.speaker == curr.speaker and gap < settings.merge_threshold_s:
-                curr.end = next_seg.end
-                curr.text += " " + next_seg.text
             else:
-                merged.append(curr)
-                curr = next_seg
-        merged.append(curr)
-        logger.debug(f"Merged segments: {len(segments)} -> {len(merged)}")
         return merged
     @classmethod
@@ -280,15 +557,29 @@ class AlignmentService:
         Returns:
             List of TranscriptSegment with proper speaker assignments
         """
-        # Step 3c: Assign speakers to words
         words_with_speakers = cls.assign_speakers_to_words(words, speaker_segments)
-        # Step 3d: Reconstruct segments
         segments = cls.reconstruct_segments(words_with_speakers)
-        # Step 3e: Clustering/Merging (Optimization)
         segments = cls.resize_and_merge_segments(segments)
         return segments

 """
+- Precision alignment service - Word-center-based speaker assignment.
+- Keep softformer diarization service
+- Remove diarization noise using conf + duration
+- Preserve DOUBLE_TALK word by word
+- Reduce transcript fragmentation
+- Better KH/NV continuity
+- Stable realtime transcript rendering
 Merges word-level transcription with speaker diarization using precise timestamps.
 """
 import logging
 from pathlib import Path
 from typing import List, Tuple, Optional
 from dataclasses import dataclass
+from collections import Counter
 from app.core.config import get_settings
 from app.services.transcription import WordTimestamp
     start: float
     end: float
     speaker: str
+    confidence: float = 1.0
 class AlignmentService:
     Precision alignment service.
     Uses word-center-based algorithm for accurate speaker-to-text mapping.
     """
+    CENTER_TOL = 0.18 # 180 ms
+    OVERLAP_TH = 0.10 # > x% segments
+    # diarization
+    DIA_MERGE_GAP = 0.35
+    MIN_DIAR_DURATION = 0.12
+    MIN_DIAR_CONFIDENCE = 0.45
+    # segment
+    PAUSE_THRESHOLD = 0.65
+    MAX_SEGMENT_DURATION = 12.0
+    # merge
+    MERGE_GAP = 0.55
+    MAX_MERGED_DURATION = 10.0
+    # noise
+    MIN_SEGMENT_DURATION = 0.35
+    MIN_SEGMENT_AVG_CONF = 0.28
+    # interruption
+    SHORT_INTERRUPT_MAX_WORDS = 2
+    SHORT_INTERRUPT_MAX_DURATION = 1.25
     @staticmethod
     def get_word_center(word: WordTimestamp) -> float:
         return overlap / dur
     @classmethod
+    def clean_diarization_segments(
+        cls,
+        segments: List[SpeakerSegment],
+    ) -> List[SpeakerSegment]:
         if not segments:
             return []
+        segments = sorted(
+            segments,
+            key=lambda x: x.start
+        )
+        cleaned = []
+        for seg in segments:
+            dur = seg.end - seg.start
+            conf = getattr(
+                seg,
+                "confidence",
+                1.0
+            )
+            # obvious diarization noise
+            if (
+                dur < cls.MIN_DIAR_DURATION
+                and conf < cls.MIN_DIAR_CONFIDENCE
+            ):
+                continue
+            cleaned.append(seg)
+        if not cleaned:
+            return []
+        merged = [cleaned[0]]
+        for seg in cleaned[1:]:
+            prev = merged[-1]
+            gap = seg.start - prev.end
+            if (
+                seg.speaker == prev.speaker
+                and gap <= cls.DIA_MERGE_GAP
+            ):
+                prev.end = max(
+                    prev.end,
+                    seg.end
+                )
+                if hasattr(prev, "confidence"):
+                    prev.confidence = max(
+                        getattr(prev, "confidence", 1.0),
+                        getattr(seg, "confidence", 1.0)
+                    )
             else:
+                merged.append(seg)
         return merged
+    # FIND SPEAKER
     @classmethod
     def find_speaker_center(
         cls,
     ) -> Optional[str]:
         for seg in speaker_segments:
+            if (
+                seg.start - cls.CENTER_TOL
+                <= time
+                <= seg.end + cls.CENTER_TOL
+            ):
                 return seg.speaker
         return None
     @staticmethod
+    def find_closest_speaker(
+        time: float,
+        speaker_segments: List[SpeakerSegment],
+    ) -> str:
         if not speaker_segments:
+            return "UNKNOWN"
+        best_dist = float("inf")
+        best_spk = "UNKNOWN"
         for seg in speaker_segments:
+            d = min(
+                abs(time - seg.start),
+                abs(time - seg.end)
+            )
+            if d < best_dist:
+                best_dist = d
+                best_spk = seg.speaker
+        return best_spk
+    # ASSIGN SPEAKER TO WORDS
     @classmethod
     def assign_speakers_to_words(
         speaker_segments: List[SpeakerSegment],
     ) -> List[WordWithSpeaker]:
+        words = [
+            w for w in words
+            if w.word and w.word.strip()
+        ]
+        if not words:
+            return []
+        speaker_segments = cls.clean_diarization_segments(
+            speaker_segments
+        )
+        # fallback
         if not speaker_segments:
             return [
+                WordWithSpeaker(
+                    word=w.word,
+                    start=w.start,
+                    end=w.end,
+                    speaker="Speaker 1",
+                    confidence=getattr(w, "confidence", 1.0)
+                )
                 for w in words
             ]
         results = []
         for word in words:
             center = cls.get_word_center(word)
+            speaker = cls.find_speaker_center(
+                center,
+                speaker_segments
+            )
+            # overlap fallback
             if speaker is None:
+                best_ratio = 0.0
                 best_spk = None
                 for seg in speaker_segments:
+                    r = cls.overlap_ratio(
+                        word.start,
+                        word.end,
+                        seg.start,
+                        seg.end
+                    )
                     if r > best_ratio:
                         best_ratio = r
                         best_spk = seg.speaker
                 if best_ratio >= cls.OVERLAP_TH:
                     speaker = best_spk
+            # nearest fallback
+            if speaker is None:
+                speaker = cls.find_closest_speaker(
+                    center,
+                    speaker_segments
+                )
             results.append(
+                WordWithSpeaker(
+                    word=word.word,
+                    start=word.start,
+                    end=word.end,
+                    speaker=speaker,
+                    confidence=getattr(word, "confidence", 1.0)
+                )
             )
         return results
+    # ========================================================
+    # BUILD SEGMENT
+    # ========================================================
+    @classmethod
+    def build_segment(
+        cls,
+        words: List[WordWithSpeaker],
+    ) -> TranscriptSegment:
+        if not words:
+            return None
+        speaker_votes = [
+            w.speaker for w in words
+        ]
+        speaker = Counter(
+            speaker_votes
+        ).most_common(1)[0][0]
+        avg_conf = (
+            sum(w.confidence for w in words)
+            / max(1, len(words))
+        )
+        segment = TranscriptSegment(
+            start=words[0].start,
+            end=words[-1].end,
+            speaker=speaker,
+            role="UNKNOWN",
+            text=" ".join(
+                w.word for w in words
+            ),
+        )
+        # INTERNAL ONLY
+        setattr(segment, "_avg_conf", avg_conf)
+        setattr(segment, "_word_count", len(words))
+        return segment
     @classmethod
     def reconstruct_segments(
         cls,
         words_with_speakers: List[WordWithSpeaker],
     ) -> List[TranscriptSegment]:
         """
         Group speaker-labelled words into transcript segments.

         A new segment is started when:
         - the pause before a word exceeds PAUSE_THRESHOLD,
         - the current segment is longer than MAX_SEGMENT_DURATION and the
           pause before the word exceeds 0.25 s, or
         - the speaker changes and the change is not a short interruption.

         A short interruption is a run of at most SHORT_INTERRUPT_MAX_WORDS
         consecutive words from another speaker, lasting at most
         SHORT_INTERRUPT_MAX_DURATION, after which the segment's speaker
         resumes. Such a run is kept inside the current segment so that a
         brief interjection does not fragment the transcript; the segment's
         speaker is later decided by majority vote in ``build_segment``.

         Speaker changes are measured against the segment's speaker, not
         against the previous word. Otherwise the word following an absorbed
         interjection would itself look like a speaker change and force the
         split that the absorption was meant to avoid.

         Args:
             words_with_speakers: Words in time order, each exposing
                 ``start``, ``end`` and ``speaker``.

         Returns:
             Segments in time order, built with ``cls.build_segment``.
         """
         if not words_with_speakers:
             return []

         total = len(words_with_speakers)
         segments = []
         cur_words = [words_with_speakers[0]]
         seg_speaker = cur_words[0].speaker

         i = 1
         while i < total:
             prev = words_with_speakers[i - 1]
             curr = words_with_speakers[i]

             pause = curr.start - prev.end
             long_pause = pause > cls.PAUSE_THRESHOLD
             too_long = (
                 cur_words[-1].end - cur_words[0].start
                 > cls.MAX_SEGMENT_DURATION
                 and pause > 0.25
             )
             must_split = long_pause or too_long

             if curr.speaker != seg_speaker:
                 # A split that is due anyway is never suppressed.
                 if not must_split:
                     # Measure the run of words spoken by the interrupter,
                     # looking at most one word past the allowed length.
                     run_end = i
                     run_cap = min(
                         total,
                         i + cls.SHORT_INTERRUPT_MAX_WORDS + 1,
                     )
                     while (
                         run_end < run_cap
                         and words_with_speakers[run_end].speaker
                         == curr.speaker
                     ):
                         run_end += 1

                     short_interrupt = (
                         run_end - i <= cls.SHORT_INTERRUPT_MAX_WORDS
                         # the segment's speaker must take over again
                         and run_end < total
                         and words_with_speakers[run_end].speaker
                         == seg_speaker
                         and words_with_speakers[run_end - 1].end
                         - curr.start
                         <= cls.SHORT_INTERRUPT_MAX_DURATION
                     )

                     if short_interrupt:
                         # Preserve continuity: keep the interjection inside
                         # the current segment and resume after it.
                         cur_words.extend(words_with_speakers[i:run_end])
                         i = run_end
                         continue

                 # Real speaker switch.
                 segments.append(cls.build_segment(cur_words))
                 cur_words = [curr]
                 seg_speaker = curr.speaker
             elif must_split:
                 segments.append(cls.build_segment(cur_words))
                 cur_words = [curr]
             else:
                 cur_words.append(curr)

             i += 1

         # cur_words always holds at least one word at this point.
         segments.append(cls.build_segment(cur_words))

         return segments
+    # ========================================================
+    # FILTER NOISE
+    # ========================================================
+    @classmethod
+    def filter_noise_segments(
+        cls,
+        segments: List[TranscriptSegment],
+    ) -> List[TranscriptSegment]:
+        filtered = []
+        for seg in segments:
+            duration = seg.end - seg.start
+            avg_conf = getattr(
+                seg,
+                "_avg_conf",
+                1.0
+            )
+            word_count = getattr(
+                seg,
+                "_word_count",
+                len(seg.text.split())
+            )
+            # hallucination/noise
+            if (
+                duration < cls.MIN_SEGMENT_DURATION
+                and avg_conf < cls.MIN_SEGMENT_AVG_CONF
+            ):
+                continue
+            # single-word garbage
+            if (
+                word_count <= 1
+                and avg_conf < 0.20
+            ):
+                continue
+            filtered.append(seg)
+        return filtered
+    # ========================================================
+    # REDUCE FRAGMENTATION
+    # ========================================================
     @classmethod
     def resize_and_merge_segments(
         cls,
         segments: List[TranscriptSegment],
     ) -> List[TranscriptSegment]:
         """
         Merge neighbouring segments of the same speaker to cut fragmentation.

         Segments are processed in start-time order. A segment is folded into
         the previous one when the speaker matches, the gap between them is
         at most MERGE_GAP and the merged span does not exceed
         MAX_MERGED_DURATION. The merged text is both texts stripped and
         joined by one space.

         The merged end is the later of the two ends, so a segment that
         overlaps or is contained in the previous one can never pull the
         end backwards.

         Note: merging updates the previous segment object in place
         (``text`` and ``end``).

         Args:
             segments: Transcript segments exposing ``start``, ``end``,
                 ``speaker`` and ``text``.

         Returns:
             The merged segments sorted by start time; [] for empty input.
         """
         if not segments:
             return []

         ordered = sorted(segments, key=lambda x: x.start)
         merged = [ordered[0]]

         for seg in ordered[1:]:
             prev = merged[-1]

             gap = seg.start - prev.end
             merged_end = max(prev.end, seg.end)

             can_merge = (
                 seg.speaker == prev.speaker
                 and gap <= cls.MERGE_GAP
                 and merged_end - prev.start <= cls.MAX_MERGED_DURATION
             )

             if can_merge:
                 prev.text = (
                     prev.text.strip()
                     + " "
                     + seg.text.strip()
                 ).strip()
                 prev.end = merged_end
             else:
                 merged.append(seg)

         return merged
     @classmethod
         Returns:
             List of TranscriptSegment with proper speaker assignments
         """
+        # Step 1: Assign speakers to words
         words_with_speakers = cls.assign_speakers_to_words(words, speaker_segments)
+        # Step 2: Reconstruct segments
         segments = cls.reconstruct_segments(words_with_speakers)
+        # Step 3: Remove noise
+        segments = cls.filter_noise_segments(
+            segments,
+            words_with_speakers
+        )
+        # Step 4: Clustering/Merging (Optimization)
         segments = cls.resize_and_merge_segments(segments)
+        logger.info(
+            f"Alignment output segments = {len(segments)}"
+        )
         return segments