colab-user committed on
Commit
12cdd04
·
1 Parent(s): 45711db

fix word alignment

Browse files
Files changed (1) hide show
  1. app/services/processor.py +7 -37
app/services/processor.py CHANGED
@@ -76,26 +76,6 @@ def normalize_asr_result(result):
76
  return str(result), []
77
 
78
 
79
- def offset_words(words: List[dict], offset_s: float) -> List[dict]:
80
- new_words = []
81
- if not words:
82
- return new_words
83
-
84
- for w in words:
85
- try:
86
- w2 = dict(w)
87
- start = float(w.get("start", 0.0))
88
- end = float(w.get("end", start))
89
-
90
- w2["start"] = start + offset_s
91
- w2["end"] = end + offset_s
92
- new_words.append(w2)
93
- except Exception:
94
- continue
95
-
96
- return new_words
97
-
98
-
99
  def convert_audio_to_wav(audio_path: Path) -> Path:
100
  """Convert any audio to WAV 16kHz Mono using ffmpeg."""
101
  output_path = audio_path.parent / f"{audio_path.stem}_processed.wav"
@@ -344,27 +324,17 @@ class Processor:
344
  initial_prompt=prev_prompt[-settings.PROMPT_MEMORY_CHARS:] if prev_prompt else None
345
  )
346
 
347
- text, words = normalize_asr_result(result)
348
 
349
  except Exception as e:
350
  logger.error(f"Context transcribe error: {e}")
351
  continue
352
-
353
- if not words and text:
354
- words = [{
355
- "word": text,
356
- "start": 0.0,
357
- "end": w_end - w_start
358
- }]
359
-
360
-
361
- if not words and not text:
362
- continue
363
 
364
- words = []
365
- for w in words:
 
366
  try:
367
- words.append(
368
  WordTimestamp(
369
  word=str(w.get("word", "")).strip(),
370
  start=float(w.get("start", 0)) + w_start,
@@ -374,12 +344,12 @@ class Processor:
374
  except:
375
  pass
376
 
377
- if not words:
378
  continue
379
 
380
  # ===== ALIGNMENT =====
381
  aligned_segments = AlignmentService.align_precision(
382
- words,
383
  refined_segments
384
  )
385
 
 
76
  return str(result), []
77
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def convert_audio_to_wav(audio_path: Path) -> Path:
80
  """Convert any audio to WAV 16kHz Mono using ffmpeg."""
81
  output_path = audio_path.parent / f"{audio_path.stem}_processed.wav"
 
324
  initial_prompt=prev_prompt[-settings.PROMPT_MEMORY_CHARS:] if prev_prompt else None
325
  )
326
 
327
+ text, raw_words = normalize_asr_result(result)
328
 
329
  except Exception as e:
330
  logger.error(f"Context transcribe error: {e}")
331
  continue
 
 
 
 
 
 
 
 
 
 
 
332
 
333
+ word_objs: List[WordTimestamp] = []
334
+
335
+ for w in raw_words:
336
  try:
337
+ word_objs.append(
338
  WordTimestamp(
339
  word=str(w.get("word", "")).strip(),
340
  start=float(w.get("start", 0)) + w_start,
 
344
  except:
345
  pass
346
 
347
+ if not word_objs:
348
  continue
349
 
350
  # ===== ALIGNMENT =====
351
  aligned_segments = AlignmentService.align_precision(
352
+ word_objs,
353
  refined_segments
354
  )
355