colab-user committed on
Commit
9b7234d
·
1 Parent(s): 0184e12

fix align words & text

Browse files
app/services/processor.py CHANGED
@@ -46,24 +46,28 @@ class ProcessingResult:
46
  csv_content: str = ""
47
 
48
 
49
- def normalize_asr_result(result):
50
- """
51
- Build text ONLY from words of THIS segment
52
- """
53
  words = []
54
 
55
- for w in result.words or []:
56
- if not w.word.strip():
57
  continue
58
  words.append({
59
- "word": w.word.strip(),
60
- "start": float(w.start),
61
- "end": float(w.end),
62
  })
63
 
64
- text = " ".join(w["word"] for w in words)
 
65
 
66
- return text.strip(), words
 
 
 
 
 
 
67
 
68
 
69
 
@@ -167,6 +171,7 @@ class Processor:
167
 
168
  # 4. Normalize speakers
169
  raw_speakers = sorted({seg.speaker for seg in diarization_segments})
 
170
 
171
  speaker_map = {
172
  spk: f"Speaker {i+1}"
@@ -247,17 +252,13 @@ class Processor:
247
  )
248
 
249
  text, raw_words = normalize_asr_result(result)
250
- if not raw_words:
251
- speaker = diarization_segments[0].speaker
252
- label = speaker_map.get(speaker, speaker)
253
- role = roles.get(label, "KH")
254
-
255
  processed_segments.append(
256
  TranscriptSegment(
257
  start=w_start,
258
  end=w_end,
259
  speaker=label,
260
- role=role,
261
  text=text
262
  )
263
  )
@@ -289,6 +290,19 @@ class Processor:
289
  word_objs,
290
  diarization_segments
291
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  # ===== MAP WORD → ROLE =====
294
  for seg in aligned_segments:
 
46
  csv_content: str = ""
47
 
48
 
49
def normalize_asr_result(result: dict):
    """Extract the segment text and its word timings from an ASR result dict.

    Returns a 2-tuple ``(text, words)`` where ``text`` is the stripped
    segment-level transcript (``result["text"]``) and ``words`` is a list of
    ``{"word", "start", "end"}`` dicts, one per non-blank word, with the word
    stripped and the timestamps coerced to float.  Missing keys yield
    ``("", [])``.
    """
    # Keep only words that are non-empty after stripping whitespace.
    words = [
        {
            "word": entry["word"].strip(),
            "start": float(entry["start"]),
            "end": float(entry["end"]),
        }
        for entry in result.get("words", [])
        if entry.get("word", "").strip()
    ]
    # Segment text comes from the ASR result itself, not re-joined from words.
    return result.get("text", "").strip(), words
63
 
64
+
65
def guess_speaker_by_time(start, end, diarization_segments):
    """Pick the diarization speaker whose segment contains the midpoint of
    ``[start, end]``.

    Falls back to the first segment's speaker when the midpoint lands in no
    segment.  Assumes ``diarization_segments`` is non-empty (an empty list
    raises ``IndexError``, matching the original behavior).
    """
    midpoint = start + (end - start) / 2
    matching = next(
        (seg.speaker for seg in diarization_segments
         if seg.start <= midpoint <= seg.end),
        None,
    )
    if matching is not None:
        return matching
    # No segment covers the midpoint — default to the first speaker.
    return diarization_segments[0].speaker
71
 
72
 
73
 
 
171
 
172
  # 4. Normalize speakers
173
  raw_speakers = sorted({seg.speaker for seg in diarization_segments})
174
+ raw_speakers = guess_speaker_by_time(w_start, w_end, diarization_segments)
175
 
176
  speaker_map = {
177
  spk: f"Speaker {i+1}"
 
252
  )
253
 
254
  text, raw_words = normalize_asr_result(result)
255
+ if text and not raw_words:
 
 
 
 
256
  processed_segments.append(
257
  TranscriptSegment(
258
  start=w_start,
259
  end=w_end,
260
  speaker=label,
261
+ role=roles.get(label, "KH"),
262
  text=text
263
  )
264
  )
 
290
  word_objs,
291
  diarization_segments
292
  )
293
+
294
+ if raw_words and not aligned_segments:
295
+ processed_segments.append(
296
+ TranscriptSegment(
297
+ start=w_start,
298
+ end=w_end,
299
+ speaker=speakers[0],
300
+ role=roles[speakers[0]],
301
+ text=text
302
+ )
303
+ )
304
+ continue
305
+
306
 
307
  # ===== MAP WORD → ROLE =====
308
  for seg in aligned_segments:
app/services/transcription.py CHANGED
@@ -217,8 +217,8 @@ class TranscriptionService:
217
  initial_prompt: Optional[str] = None,
218
  prefix_text: Optional[str] = None,
219
  condition_on_previous_text: bool = False,
220
- no_speech_threshold: float = 0.7,
221
- log_prob_threshold: float = -1.4,
222
  compression_ratio_threshold: float = 2.3,
223
  ) -> Dict:
224
  """
 
217
  initial_prompt: Optional[str] = None,
218
  prefix_text: Optional[str] = None,
219
  condition_on_previous_text: bool = False,
220
+ no_speech_threshold: float = 0.3,
221
+ log_prob_threshold: float = -2.0,
222
  compression_ratio_threshold: float = 2.3,
223
  ) -> Dict:
224
  """