Spaces:
Sleeping
Sleeping
colab-user committed on
Commit ·
12cdd04
1
Parent(s): 45711db
fix word alignment
Browse files
- app/services/processor.py +7 -37
app/services/processor.py
CHANGED
|
@@ -76,26 +76,6 @@ def normalize_asr_result(result):
|
|
| 76 |
return str(result), []
|
| 77 |
|
| 78 |
|
| 79 |
-
def offset_words(words: List[dict], offset_s: float) -> List[dict]:
|
| 80 |
-
new_words = []
|
| 81 |
-
if not words:
|
| 82 |
-
return new_words
|
| 83 |
-
|
| 84 |
-
for w in words:
|
| 85 |
-
try:
|
| 86 |
-
w2 = dict(w)
|
| 87 |
-
start = float(w.get("start", 0.0))
|
| 88 |
-
end = float(w.get("end", start))
|
| 89 |
-
|
| 90 |
-
w2["start"] = start + offset_s
|
| 91 |
-
w2["end"] = end + offset_s
|
| 92 |
-
new_words.append(w2)
|
| 93 |
-
except Exception:
|
| 94 |
-
continue
|
| 95 |
-
|
| 96 |
-
return new_words
|
| 97 |
-
|
| 98 |
-
|
| 99 |
def convert_audio_to_wav(audio_path: Path) -> Path:
|
| 100 |
"""Convert any audio to WAV 16kHz Mono using ffmpeg."""
|
| 101 |
output_path = audio_path.parent / f"{audio_path.stem}_processed.wav"
|
|
@@ -344,27 +324,17 @@ class Processor:
|
|
| 344 |
initial_prompt=prev_prompt[-settings.PROMPT_MEMORY_CHARS:] if prev_prompt else None
|
| 345 |
)
|
| 346 |
|
| 347 |
-
text,
|
| 348 |
|
| 349 |
except Exception as e:
|
| 350 |
logger.error(f"Context transcribe error: {e}")
|
| 351 |
continue
|
| 352 |
-
|
| 353 |
-
if not words and text:
|
| 354 |
-
words = [{
|
| 355 |
-
"word": text,
|
| 356 |
-
"start": 0.0,
|
| 357 |
-
"end": w_end - w_start
|
| 358 |
-
}]
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
if not words and not text:
|
| 362 |
-
continue
|
| 363 |
|
| 364 |
-
|
| 365 |
-
|
|
|
|
| 366 |
try:
|
| 367 |
-
|
| 368 |
WordTimestamp(
|
| 369 |
word=str(w.get("word", "")).strip(),
|
| 370 |
start=float(w.get("start", 0)) + w_start,
|
|
@@ -374,12 +344,12 @@ class Processor:
|
|
| 374 |
except:
|
| 375 |
pass
|
| 376 |
|
| 377 |
-
if not
|
| 378 |
continue
|
| 379 |
|
| 380 |
# ===== ALIGNMENT =====
|
| 381 |
aligned_segments = AlignmentService.align_precision(
|
| 382 |
-
|
| 383 |
refined_segments
|
| 384 |
)
|
| 385 |
|
|
|
|
| 76 |
return str(result), []
|
| 77 |
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
def convert_audio_to_wav(audio_path: Path) -> Path:
|
| 80 |
"""Convert any audio to WAV 16kHz Mono using ffmpeg."""
|
| 81 |
output_path = audio_path.parent / f"{audio_path.stem}_processed.wav"
|
|
|
|
| 324 |
initial_prompt=prev_prompt[-settings.PROMPT_MEMORY_CHARS:] if prev_prompt else None
|
| 325 |
)
|
| 326 |
|
| 327 |
+
text, raw_words = normalize_asr_result(result)
|
| 328 |
|
| 329 |
except Exception as e:
|
| 330 |
logger.error(f"Context transcribe error: {e}")
|
| 331 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
+
word_objs: List[WordTimestamp] = []
|
| 334 |
+
|
| 335 |
+
for w in raw_words:
|
| 336 |
try:
|
| 337 |
+
word_objs.append(
|
| 338 |
WordTimestamp(
|
| 339 |
word=str(w.get("word", "")).strip(),
|
| 340 |
start=float(w.get("start", 0)) + w_start,
|
|
|
|
| 344 |
except:
|
| 345 |
pass
|
| 346 |
|
| 347 |
+
if not word_objs:
|
| 348 |
continue
|
| 349 |
|
| 350 |
# ===== ALIGNMENT =====
|
| 351 |
aligned_segments = AlignmentService.align_precision(
|
| 352 |
+
word_objs,
|
| 353 |
refined_segments
|
| 354 |
)
|
| 355 |
|