Spaces:

vyluong
/

PoC_ASR_v5

Sleeping

App Files Files Community

colab-user committed on Jan 30

Commit

12cdd04

1 Parent(s): 45711db

fix word alignment

Browse files

Files changed (1) hide show

app/services/processor.py +7 -37

app/services/processor.py CHANGED Viewed

@@ -76,26 +76,6 @@ def normalize_asr_result(result):
     return str(result), []
-def offset_words(words: List[dict], offset_s: float) -> List[dict]:
-    new_words = []
-    if not words:
-        return new_words
-    for w in words:
-        try:
-            w2 = dict(w)
-            start = float(w.get("start", 0.0))
-            end   = float(w.get("end", start))
-            w2["start"] = start + offset_s
-            w2["end"]   = end + offset_s
-            new_words.append(w2)
-        except Exception:
-            continue
-    return new_words
 def convert_audio_to_wav(audio_path: Path) -> Path:
     """Convert any audio to WAV 16kHz Mono using ffmpeg."""
     output_path = audio_path.parent / f"{audio_path.stem}_processed.wav"
@@ -344,27 +324,17 @@ class Processor:
                     initial_prompt=prev_prompt[-settings.PROMPT_MEMORY_CHARS:] if prev_prompt else None
                 )
-                text, words = normalize_asr_result(result)
             except Exception as e:
                 logger.error(f"Context transcribe error: {e}")
                 continue
-            if not words and text:
-                words = [{
-                    "word": text,
-                    "start": 0.0,
-                    "end": w_end - w_start
-                }]
-            if not words and not text:
-                continue
-            words = []
-            for w in words:
                 try:
-                    words.append(
                         WordTimestamp(
                             word=str(w.get("word", "")).strip(),
                             start=float(w.get("start", 0)) + w_start,
@@ -374,12 +344,12 @@ class Processor:
                 except:
                     pass
-            if not words:
                 continue
             # ===== ALIGNMENT =====
             aligned_segments = AlignmentService.align_precision(
-                words,
                 refined_segments
             )

     return str(result), []
 def convert_audio_to_wav(audio_path: Path) -> Path:
     """Convert any audio to WAV 16kHz Mono using ffmpeg."""
     output_path = audio_path.parent / f"{audio_path.stem}_processed.wav"
                     initial_prompt=prev_prompt[-settings.PROMPT_MEMORY_CHARS:] if prev_prompt else None
                 )
+                text, raw_words = normalize_asr_result(result)
             except Exception as e:
                 logger.error(f"Context transcribe error: {e}")
                 continue
+            word_objs: List[WordTimestamp] = []
+            for w in raw_words:
                 try:
+                    word_objs.append(
                         WordTimestamp(
                             word=str(w.get("word", "")).strip(),
                             start=float(w.get("start", 0)) + w_start,
                 except:
                     pass
+            if not word_objs:
                 continue
             # ===== ALIGNMENT =====
             aligned_segments = AlignmentService.align_precision(
+                word_objs,
                 refined_segments
             )