Spaces:

vyluong
/

PoC_ASR_v5

Sleeping

App Files Files Community

colab-user committed on Feb 3

Commit

9b7234d

1 Parent(s): 0184e12

fix align words & text

Browse files

Files changed (2) hide show

app/services/processor.py +31 -17
app/services/transcription.py +2 -2

app/services/processor.py CHANGED Viewed

@@ -46,24 +46,28 @@ class ProcessingResult:
     csv_content: str = ""
-def normalize_asr_result(result):
-    """
-    Build text ONLY from words of THIS segment
-    """
     words = []
-    for w in result.words or []:
-        if not w.word.strip():
             continue
         words.append({
-            "word": w.word.strip(),
-            "start": float(w.start),
-            "end": float(w.end),
         })
-    text = " ".join(w["word"] for w in words)
-    return text.strip(), words
@@ -167,6 +171,7 @@ class Processor:
         # 4. Normalize speakers
         raw_speakers = sorted({seg.speaker for seg in diarization_segments})
         speaker_map = {
             spk: f"Speaker {i+1}"
@@ -247,17 +252,13 @@ class Processor:
                 )
                 text, raw_words = normalize_asr_result(result)
-                if not raw_words:
-                    speaker = diarization_segments[0].speaker
-                    label = speaker_map.get(speaker, speaker)
-                    role = roles.get(label, "KH")
                     processed_segments.append(
                         TranscriptSegment(
                             start=w_start,
                             end=w_end,
                             speaker=label,
-                            role=role,
                             text=text
                         )
                     )
@@ -289,6 +290,19 @@ class Processor:
                 word_objs,
                 diarization_segments
             )
             # ===== MAP WORD → ROLE =====
             for seg in aligned_segments:

     csv_content: str = ""
+def normalize_asr_result(result: dict):
     words = []
+    for w in result.get("words", []):
+        if not w.get("word", "").strip():
             continue
         words.append({
+            "word": w["word"].strip(),
+            "start": float(w["start"]),
+            "end": float(w["end"]),
         })
+    text = result.get("text", "").strip()
+    return text, words
+def guess_speaker_by_time(start, end, diarization_segments):
+    mid = (start + end) / 2
+    for d in diarization_segments:
+        if d.start <= mid <= d.end:
+            return d.speaker
+    return diarization_segments[0].speaker
         # 4. Normalize speakers
         raw_speakers = sorted({seg.speaker for seg in diarization_segments})
+        raw_speakers = guess_speaker_by_time(w_start, w_end, diarization_segments)
         speaker_map = {
             spk: f"Speaker {i+1}"
                 )
                 text, raw_words = normalize_asr_result(result)
+                if text and not raw_words:
                     processed_segments.append(
                         TranscriptSegment(
                             start=w_start,
                             end=w_end,
                             speaker=label,
+                            role=roles.get(label, "KH"),
                             text=text
                         )
                     )
                 word_objs,
                 diarization_segments
             )
+            if raw_words and not aligned_segments:
+                processed_segments.append(
+                    TranscriptSegment(
+                        start=w_start,
+                        end=w_end,
+                        speaker=speakers[0],
+                        role=roles[speakers[0]],
+                        text=text
+                    )
+                )
+                continue
             # ===== MAP WORD → ROLE =====
             for seg in aligned_segments:

app/services/transcription.py CHANGED Viewed

@@ -217,8 +217,8 @@ class TranscriptionService:
         initial_prompt: Optional[str] = None,
         prefix_text: Optional[str] = None,
         condition_on_previous_text: bool = False,
-        no_speech_threshold: float = 0.7,
-        log_prob_threshold: float = -1.4,
         compression_ratio_threshold: float = 2.3,
     ) -> Dict:
         """

         initial_prompt: Optional[str] = None,
         prefix_text: Optional[str] = None,
         condition_on_previous_text: bool = False,
+        no_speech_threshold: float = 0.3,
+        log_prob_threshold: float = -2.0,
         compression_ratio_threshold: float = 2.3,
     ) -> Dict:
         """