Spaces:
Sleeping
Sleeping
colab-user committed on
Commit ·
02fb5b8
1
Parent(s): f2126e0
fix transcribe settings
Browse files
- app/services/processor.py +48 -0
app/services/processor.py
CHANGED
|
@@ -284,6 +284,7 @@ class Processor:
|
|
| 284 |
)
|
| 285 |
]
|
| 286 |
|
|
|
|
| 287 |
processing_time = time.time() - t0
|
| 288 |
|
| 289 |
txt_content = cls._generate_txt(
|
|
@@ -306,6 +307,53 @@ class Processor:
|
|
| 306 |
txt_content=txt_content,
|
| 307 |
csv_content=csv_content,
|
| 308 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
@classmethod
|
| 310 |
def _generate_txt(
|
| 311 |
cls,
|
|
|
|
| 284 |
)
|
| 285 |
]
|
| 286 |
|
| 287 |
+
processed_segments = cls._filter_segments_with_context(processed_segments)
|
| 288 |
processing_time = time.time() - t0
|
| 289 |
|
| 290 |
txt_content = cls._generate_txt(
|
|
|
|
| 307 |
txt_content=txt_content,
|
| 308 |
csv_content=csv_content,
|
| 309 |
)
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
@staticmethod
|
| 313 |
+
def _is_meaningful_segment(
|
| 314 |
+
seg: TranscriptSegment,
|
| 315 |
+
min_duration_s: float = 0.6,
|
| 316 |
+
min_words: int = 3
|
| 317 |
+
) -> bool:
|
| 318 |
+
duration = seg.end - seg.start
|
| 319 |
+
word_count = len(seg.text.split())
|
| 320 |
+
|
| 321 |
+
if duration >= min_duration_s:
|
| 322 |
+
return True
|
| 323 |
+
if word_count >= min_words:
|
| 324 |
+
return True
|
| 325 |
+
if seg.role == "KH":
|
| 326 |
+
return True
|
| 327 |
+
|
| 328 |
+
return False
|
| 329 |
+
|
| 330 |
+
@classmethod
|
| 331 |
+
def _filter_segments_with_context(
|
| 332 |
+
cls,
|
| 333 |
+
segments: List[TranscriptSegment]
|
| 334 |
+
) -> List[TranscriptSegment]:
|
| 335 |
+
if not segments:
|
| 336 |
+
return segments
|
| 337 |
+
|
| 338 |
+
segments = sorted(segments, key=lambda s: s.start)
|
| 339 |
+
result = []
|
| 340 |
+
n = len(segments)
|
| 341 |
+
|
| 342 |
+
for i, seg in enumerate(segments):
|
| 343 |
+
prev_seg = segments[i - 1] if i > 0 else None
|
| 344 |
+
next_seg = segments[i + 1] if i < n - 1 else None
|
| 345 |
+
|
| 346 |
+
if cls._is_meaningful_segment(seg):
|
| 347 |
+
result.append(seg)
|
| 348 |
+
continue
|
| 349 |
+
|
| 350 |
+
if prev_seg and next_seg:
|
| 351 |
+
if prev_seg.speaker == seg.speaker == next_seg.speaker:
|
| 352 |
+
result.append(seg)
|
| 353 |
+
|
| 354 |
+
return result
|
| 355 |
+
|
| 356 |
+
|
| 357 |
@classmethod
|
| 358 |
def _generate_txt(
|
| 359 |
cls,
|