colab-user committed on
Commit
02fb5b8
·
1 Parent(s): f2126e0

fix transcribe settings

Browse files
Files changed (1) hide show
  1. app/services/processor.py +48 -0
app/services/processor.py CHANGED
@@ -284,6 +284,7 @@ class Processor:
284
  )
285
  ]
286
 
 
287
  processing_time = time.time() - t0
288
 
289
  txt_content = cls._generate_txt(
@@ -306,6 +307,53 @@ class Processor:
306
  txt_content=txt_content,
307
  csv_content=csv_content,
308
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  @classmethod
310
  def _generate_txt(
311
  cls,
 
284
  )
285
  ]
286
 
287
+ processed_segments = cls._filter_segments_with_context(processed_segments)
288
  processing_time = time.time() - t0
289
 
290
  txt_content = cls._generate_txt(
 
307
  txt_content=txt_content,
308
  csv_content=csv_content,
309
  )
310
+
311
+
312
@staticmethod
def _is_meaningful_segment(
    seg: TranscriptSegment,
    min_duration_s: float = 0.6,
    min_words: int = 3
) -> bool:
    """Decide whether a single transcript segment is worth keeping on its own.

    A segment qualifies as soon as ANY one of these holds:

    * its duration (``seg.end - seg.start``) is at least ``min_duration_s``;
    * its text has at least ``min_words`` whitespace-separated words;
    * its ``role`` equals ``"KH"``.

    Args:
        seg: Segment exposing ``start``, ``end``, ``text`` and ``role``.
        min_duration_s: Minimum duration for the segment to qualify by length
            alone (presumably seconds, going by the parameter name).
        min_words: Minimum word count for the segment to qualify by content.

    Returns:
        ``True`` when the segment passes at least one of the checks above,
        ``False`` otherwise.
    """
    if seg.end - seg.start >= min_duration_s:
        return True

    # A missing text counts as zero words instead of raising AttributeError;
    # the words are only counted when the duration test did not already pass.
    if len((seg.text or "").split()) >= min_words:
        return True

    # NOTE(review): "KH" is presumably the customer role, whose short
    # utterances are always kept -- confirm against the role definitions.
    if seg.role == "KH":
        return True

    return False
329
+
330
@classmethod
def _filter_segments_with_context(
    cls,
    segments: List[TranscriptSegment]
) -> List[TranscriptSegment]:
    """Drop insignificant segments unless their neighbours give them context.

    Segments are ordered by ``start`` and each one is kept when either:

    * ``cls._is_meaningful_segment`` accepts it, or
    * it has a segment on both sides and all three (previous, current,
      next) share the same ``speaker``.

    A non-meaningful segment at the very beginning or end of the ordered
    list has only one neighbour and is therefore always dropped.

    Args:
        segments: Transcript segments in any order.

    Returns:
        A new list, sorted by ``start``, containing the surviving segments.
        An empty (or falsy) input is returned unchanged.
    """
    if not segments:
        return segments

    ordered = sorted(segments, key=lambda s: s.start)
    last_index = len(ordered) - 1
    kept = []

    for i, seg in enumerate(ordered):
        if cls._is_meaningful_segment(seg):
            kept.append(seg)
            continue

        # Whether a neighbour exists is decided by position, not by the
        # truthiness of the neighbour object, so a segment type defining
        # __bool__/__len__ cannot be mistaken for "no neighbour".
        if i == 0 or i == last_index:
            continue

        # Neighbours are taken from the unfiltered ordering: a short segment
        # sandwiched inside one speaker's turn is kept as part of that turn.
        if ordered[i - 1].speaker == seg.speaker == ordered[i + 1].speaker:
            kept.append(seg)

    return kept
355
+
356
+
357
  @classmethod
358
  def _generate_txt(
359
  cls,