Spaces:

vyluong
/

PoC_ASR_v5

Sleeping

App Files Community

colab-user committed on Feb 4

Commit

bad4ead

1 Parent(s): 1a0c8e0

fix ASR after diarization

Browse files

Files changed (2) hide show

app/services/processor.py +15 -22
app/services/transcription.py +112 -97

app/services/processor.py CHANGED Viewed

@@ -56,6 +56,7 @@ def normalize_asr_result(result: dict):
             "word": w["word"].strip(),
             "start": float(w["start"]),
             "end": float(w["end"]),
         })
     text = result.get("text", "").strip()
@@ -111,21 +112,6 @@ def overlap_prefix(a: str, b: str, n: int = 12) -> bool:
     return a[:n] in b or b[:n] in a
-MAX_SEGMENT_LEN = 15.0   # seconds
-MERGE_GAP = 0.4
-def split_long_segment(start, end, max_len=MAX_SEGMENT_LEN):
-    segments = []
-    t = start
-    while t < end:
-        segments.append((t, min(end, t + max_len)))
-        t += max_len
-    return segments
-# =========================
-# Processor
-# =========================
 class Processor:
     @classmethod
     async def process_audio(
@@ -206,12 +192,13 @@ class Processor:
         logger.info(f"roles(mapped) = {roles}")
-        # 7: Transcribe using batch external vad
         logger.info("Step 7: Running ASR with external VAD batch...")
-        asr_result = await TranscriptionService.transcribe_batch_with_external_vad(
             audio_array=y,
             sr=sr,
             model_name=model_name,
             vad_options={
                 "threshold": vad_threshold,
@@ -247,6 +234,7 @@ class Processor:
                             word=w["word"],
                             start=w["start"],
                             end=w["end"],
                         )
                     )
                 except:
@@ -267,10 +255,9 @@ class Processor:
                 else:
                     mid = duration / 2
-                label = guess_speaker_by_time(mid, mid, diarization_segments)
-                label = speaker_map.get(label, label)
                 processed_segments.append(
                     TranscriptSegment(
                         start=0,
@@ -283,7 +270,13 @@ class Processor:
             else:
                 for seg in aligned_segments:
-                    label = speaker_map.get(seg.speaker, seg.speaker)
                     role = roles.get(label, "KH")
                     processed_segments.append(

             "word": w["word"].strip(),
             "start": float(w["start"]),
             "end": float(w["end"]),
+            "speaker": w.get("speaker")
         })
     text = result.get("text", "").strip()
     return a[:n] in b or b[:n] in a
 class Processor:
     @classmethod
     async def process_audio(
         logger.info(f"roles(mapped) = {roles}")
+        # 7: Transcribe segments after diarization
         logger.info("Step 7: Running ASR with external VAD batch...")
+        asr_result = await TranscriptionService.transcribe_after_diarization(
             audio_array=y,
             sr=sr,
+            diarization_segments=diarization_segments,
             model_name=model_name,
             vad_options={
                 "threshold": vad_threshold,
                             word=w["word"],
                             start=w["start"],
                             end=w["end"],
+                            speaker=w.get("speaker")
                         )
                     )
                 except:
                 else:
                     mid = duration / 2
+                raw_spk = guess_speaker_by_time(mid, mid, diarization_segments)
+                label = speaker_map.get(raw_spk, "Speaker 1")
                 processed_segments.append(
                     TranscriptSegment(
                         start=0,
             else:
                 for seg in aligned_segments:
+                    raw_spk = seg.speaker
+                    if not raw_spk and hasattr(seg, "words"):
+                        raw_spk = seg.words[0].speaker if seg.words else None
+                    label = speaker_map.get(raw_spk, "Speaker 1")
                     role = roles.get(label, "KH")
                     processed_segments.append(

app/services/transcription.py CHANGED Viewed

@@ -30,6 +30,7 @@ class WordTimestamp:
     word: str
     start: float
     end: float
 class TranscriptionService:
     """
@@ -109,52 +110,52 @@ class TranscriptionService:
         if not segments:
             return []
-        normalized = []
-        buf_start, buf_end = segments[0]
-        for start, end in segments[1:]:
-            gap = start - buf_end
-            new_dur = end - buf_start
             if (
-                new_dur < settings.MIN_SEGMENT_SEC
                 and gap < settings.MERGE_GAP_SEC
             ):
-                buf_end = end
-                continue
-            normalized.append((buf_start, buf_end))
-            buf_start, buf_end = start, end
-        normalized.append((buf_start, buf_end))
-        # -------- Split long segment --------
-        final_segments = []
-        for start, end in normalized:
-            dur = end - start
             if dur <= settings.MAX_SEGMENT_SEC:
-                final_segments.append((start, end))
                 continue
-            cur = start
-            while cur < end:
-                split_end = min(cur + settings.TARGET_SEGMENT_SEC, end)
-                final_segments.append((cur, split_end))
-                cur = split_end
-        return final_segments
     @staticmethod
     def _slice_audio(audio: np.ndarray, sr: int, start: float, end: float):
-        s = int(start * sr)
-        e = int(end * sr)
         return audio[s:e]
@@ -167,9 +168,9 @@ class TranscriptionService:
         vad_options: Optional[dict] = None,
         beam_size: int = 6,
         temperature: float = 0.0,
-        best_of: int = 1,
         patience: float = 1.2,
-        length_penalty: float = 1.0,
         no_repeat_ngram_size=3,
         # Prompting
@@ -300,7 +301,7 @@ class TranscriptionService:
                 patience=patience,
                 length_penalty=length_penalty,
                 no_repeat_ngram_size=no_repeat_ngram_size,
-                initial_prompt=None,
                 prefix_text=prefix_text,
                 condition_on_previous_text=condition_on_previous_text,
                 no_speech_threshold=no_speech_threshold,
@@ -390,91 +391,105 @@ class TranscriptionService:
         return await loop.run_in_executor(None, _run_vad)
     @classmethod
-    async def transcribe_batch_with_external_vad(
         cls,
         audio_array: np.ndarray,
-        sr: int = 16000,
         model_name: Optional[str] = None,
         vad_options: Optional[dict] = None,
-    ) -> Dict:
-        vad_segments = await cls.get_vad_segments_async(
-            audio_array,
-            sr,
-            vad_options
-        )
-        vad_segments = cls._normalize_segments(vad_segments)
         all_words = []
-        texts = []
         total_audio_dur = len(audio_array) / sr
-        for start, end in vad_segments:
-            chunk = cls._slice_audio(audio_array, sr, start, end)
-            if len(chunk) < sr * 0.5:
-                continue
-            energy = float(np.sqrt(np.mean(chunk ** 2)))
-            if energy < 0.008:
                 continue
-            is_tail = (total_audio_dur - end) < 1.5
-            beam_size = 6
-            best_of = 5
-            patience = 1.2
-            length_penalty = 1.1
-            if is_tail:
-                beam_size = 4
-                best_of = 3
-                patience = 1.0
-            res = await cls.transcribe_with_words_async(
-                audio_array=chunk,
-                model_name=model_name,
-                language="vi",
-                beam_size=beam_size,
-                best_of=best_of,
-                temperature=0,
-                patience=patience,
-                length_penalty=length_penalty,
-                # reset prompt at tail
-                initial_prompt=None if is_tail else "Cuộc gọi thu hồi nợ.",
-                condition_on_previous_text=False,
-                no_speech_threshold=0.4 if not is_tail else 0.5,
-                log_prob_threshold=-2.0,
-                compression_ratio_threshold=2.2,
-            )
-            text = res["text"].strip()
-            if len(text.split()) > 6:
-                uniq_ratio = len(set(text.split())) / len(text.split())
-            if uniq_ratio < 0.45:
-                    logger.debug("Drop repetitive hallucination segment")
                     continue
-            if is_tail and len(text) > 80:
-                logger.debug("Drop suspicious tail text")
-                continue
-            for w in res["words"]:
-                w["start"] += start
-                w["end"] += start
-            texts.append(res["text"])
-            all_words.extend(res["words"])
         return {
-            "text": " ".join(texts),
-            "words": all_words
-        }

     word: str
     start: float
     end: float
+    speaker: Optional[str] = None
 class TranscriptionService:
     """
         if not segments:
             return []
+        merged = []
+        cur_s, cur_e = segments[0]
+        for s, e in segments[1:]:
+            gap = s - cur_e
+            new_dur = e - cur_s
             if (
+                new_dur < settings.MAX_SEGMENT_SEC
                 and gap < settings.MERGE_GAP_SEC
             ):
+                cur_e = e
+            else:
+                merged.append((cur_s, cur_e))
+                cur_s, cur_e = s, e
+        merged.append((cur_s, cur_e))
+        # split long segments
+        final = []
+        for s, e in merged:
+            dur = e - s
             if dur <= settings.MAX_SEGMENT_SEC:
+                final.append((s, e))
                 continue
+            cur = s
+            while cur < e:
+                nxt = min(cur + settings.TARGET_SEGMENT_SEC, e)
+                final.append((cur, nxt))
+                cur = nxt
+        return final
     @staticmethod
     def _slice_audio(audio: np.ndarray, sr: int, start: float, end: float):
+        s = max(0, int(start * sr))
+        e = min(len(audio), int(end * sr))
+        if e <= s:
+            return np.zeros(1, dtype=np.float32)
         return audio[s:e]
         vad_options: Optional[dict] = None,
         beam_size: int = 6,
         temperature: float = 0.0,
+        best_of: int = 5,
         patience: float = 1.2,
+        length_penalty: float = 0.95,
         no_repeat_ngram_size=3,
         # Prompting
                 patience=patience,
                 length_penalty=length_penalty,
                 no_repeat_ngram_size=no_repeat_ngram_size,
+                initial_prompt=initial_prompt,
                 prefix_text=prefix_text,
                 condition_on_previous_text=condition_on_previous_text,
                 no_speech_threshold=no_speech_threshold,
         return await loop.run_in_executor(None, _run_vad)
     @classmethod
+    async def transcribe_after_diarization(
         cls,
         audio_array: np.ndarray,
+        sr: int,
+        diarization_segments: List,
         model_name: Optional[str] = None,
         vad_options: Optional[dict] = None,
+    ):
         all_words = []
+        segments_text = []
         total_audio_dur = len(audio_array) / sr
+        for diar in diarization_segments:
+            spk = diar.speaker
+            seg_start = diar.start
+            seg_end = diar.end
+            speaker_audio = cls._slice_audio(
+                audio_array, sr, seg_start, seg_end
+            )
+            if len(speaker_audio) < sr * 0.5:
                 continue
+            vad_segments = await cls.get_vad_segments_async(
+                speaker_audio, sr, vad_options
+            )
+            vad_segments = cls._normalize_segments(vad_segments)
+            for v_start, v_end in vad_segments:
+                g_start = seg_start + v_start
+                g_end = seg_start + v_end
+                chunk = cls._slice_audio(audio_array, sr, g_start, g_end)
+                if len(chunk) < sr * 0.5:
                     continue
+                # energy filter
+                energy = float(np.sqrt(np.mean(chunk ** 2)))
+                if not np.isfinite(energy) or energy < 0.006:
+                    continue
+                is_tail = (total_audio_dur - g_end) < 1.5
+                res = await cls.transcribe_with_words_async(
+                    chunk,
+                    model_name=model_name,
+                    beam_size=6 if not is_tail else 4,
+                    best_of=5,
+                    temperature=0,
+                    patience=1.2,
+                    condition_on_previous_text=False,
+                    no_speech_threshold=0.4,
+                    compression_ratio_threshold=2.2,
+                )
+                if not res:
+                    continue
+                text = res.get("text")
+                if not text:
+                    continue
+                # hallucination filter
+                tokens = text.split()
+                if len(tokens) > 6:
+                    uniq_ratio = len(set(tokens)) / len(tokens)
+                    if uniq_ratio < 0.45:
+                        continue
+                for w in res.get("words", []):
+                    w["start"] += g_start
+                    w["end"] += g_start
+                    w["speaker"] = spk
+                    all_words.append(w)
+                segments_text.append(
+                    {
+                        "speaker": spk,
+                        "start": g_start,
+                        "end": g_end,
+                        "text": text,
+                    }
+                )
+        # ===== sort results =====
+        all_words.sort(key=lambda x: x["start"])
+        segments_text.sort(key=lambda x: x["start"])
+        full_text = " ".join(seg["text"] for seg in segments_text)
         return {
+            "segments": segments_text,
+            "words": all_words,
+            "text": full_text
+        }