unknown commited on
Commit
7273fb2
·
1 Parent(s): 9a5efb7
Files changed (1) hide show
  1. app.py +468 -213
app.py CHANGED
@@ -1,6 +1,272 @@
1
- # import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  # import re
3
- # import math
4
  # import tempfile
5
  # from dataclasses import dataclass
6
  # from typing import List, Dict
@@ -11,7 +277,7 @@
11
 
12
 
13
  # # =========================================================
14
- # # 基础配置(通用规则)
15
  # # =========================================================
16
  # MEDIA_EXTS = (".mp4", ".m4a", ".mp3", ".wav", ".flac", ".ogg", ".aac", ".mov", ".avi")
17
  # VTT_EXTS = (".vtt",)
@@ -53,16 +319,28 @@
53
  # with open(path, "r", encoding="utf-8") as f:
54
  # content = f.read()
55
 
56
- # blocks = re.split(r"\r?\n\r?\n", content.replace("\ufeff", "").strip())
57
- # cues = []
 
 
 
 
58
 
59
  # for block in blocks:
60
- # lines = [l for l in block.splitlines() if l.strip()]
61
- # time_line = next((l for l in lines if "-->" in l), None)
62
- # if not time_line:
 
 
 
 
 
 
 
 
63
  # continue
64
 
65
- # m = _VTT_TIME_RE.search(time_line)
66
  # if not m:
67
  # continue
68
 
@@ -71,13 +349,21 @@
71
  # if end <= start:
72
  # continue
73
 
74
- # text = _strip_tags("\n".join(lines[1:]))
75
- # if text:
76
- # cues.append(Cue(start, end, text))
 
 
 
 
 
 
 
77
 
78
  # return sorted(cues, key=lambda x: x.start)
79
 
80
 
 
81
  # # =========================================================
82
  # # 对齐逻辑
83
  # # =========================================================
@@ -87,13 +373,19 @@
87
  # ma = (a[i].start + a[i].end) / 2
88
  # mb = (b[j].start + b[j].end) / 2
89
  # if abs(ma - mb) <= th:
90
- # out.append({
91
- # "idx": idx,
92
- # "start": min(a[i].start, b[j].start),
93
- # "end": max(a[i].end, b[j].end),
94
- # "a": a[i],
95
- # "b": b[j],
96
- # })
 
 
 
 
 
 
97
  # idx += 1
98
  # i += 1
99
  # j += 1
@@ -108,21 +400,41 @@
108
  # # 播放工具
109
  # # =========================================================
110
  # def export_segment(audio: AudioSegment, start: float, end: float) -> str:
111
- # seg = audio[int(start * 1000): int(end * 1000)]
 
 
112
  # tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
113
  # seg.export(tmp.name, format="wav")
114
  # return tmp.name
115
 
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  # # =========================================================
118
- # # Gradio 回调
119
  # # =========================================================
120
  # def scan_dataset(repo_id: str, repo_type: str):
121
  # if not repo_id:
122
  # raise gr.Error("请填写 Dataset / Repo 名称。")
123
 
124
  # files = list_repo_files(repo_id, repo_type=repo_type)
125
-
126
  # media_files = sorted([f for f in files if f.lower().endswith(MEDIA_EXTS)])
127
  # vtt_files = sorted([f for f in files if f.lower().endswith(VTT_EXTS)])
128
 
@@ -131,6 +443,7 @@
131
  # if not vtt_files:
132
  # raise gr.Error("Dataset 中未发现 VTT 文件。")
133
 
 
134
  # return (
135
  # gr.update(choices=media_files, value=media_files[0]),
136
  # gr.update(choices=media_files, value=media_files[0]),
@@ -139,6 +452,9 @@
139
  # )
140
 
141
 
 
 
 
142
  # def load_and_align(
143
  # repo_id,
144
  # repo_type,
@@ -151,28 +467,33 @@
151
  # if not all([media_a_path, media_b_path, vtt_a_path, vtt_b_path]):
152
  # raise gr.Error("请为 Track A / B 分别选择媒体文件和 VTT 文件。")
153
 
154
- # media_a = AudioSegment.from_file(hf_hub_download(repo_id, media_a_path, repo_type=repo_type))
155
- # media_b = AudioSegment.from_file(hf_hub_download(repo_id, media_b_path, repo_type=repo_type))
 
 
156
 
157
- # cues_a = parse_vtt_file(hf_hub_download(repo_id, vtt_a_path, repo_type=repo_type))
158
- # cues_b = parse_vtt_file(hf_hub_download(repo_id, vtt_b_path, repo_type=repo_type))
 
 
 
159
 
160
  # if not cues_a or not cues_b:
161
- # raise gr.Error("VTT 解析为空,请检查字幕文件。")
162
 
163
  # aligned = align_by_time(cues_a, cues_b, th)
164
  # if not aligned:
165
  # raise gr.Error("未对齐到任何片段,请尝试增大对齐阈值。")
166
 
167
  # rows = [
168
- # [x["idx"], f'{x["start"]:.2f}-{x["end"]:.2f}', x["a"].text, x["b"].text]
169
  # for x in aligned
170
  # ]
171
 
172
  # state = {
173
  # "aligned": aligned,
174
- # "audio_a": media_a,
175
- # "audio_b": media_b,
176
  # }
177
 
178
  # stats = {
@@ -184,39 +505,76 @@
184
  # "max_mid_diff_sec": th,
185
  # }
186
 
187
- # return rows, stats, state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
 
 
189
 
190
- # def play_selected(df, row, state):
191
- # if row is None:
192
- # raise gr.Error("请先点击表格选择一行。")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
- # seg = state["aligned"][int(df[row][0]) - 1]
195
- # a_wav = export_segment(state["audio_a"], seg["start"], seg["end"])
196
- # b_wav = export_segment(state["audio_b"], seg["start"], seg["end"])
197
- # return a_wav, b_wav
198
 
199
 
200
  # # =========================================================
201
  # # UI
202
  # # =========================================================
203
- # with gr.Blocks(title="双语音频字幕对比(UI 指定 Dataset / A-B 文件)") as demo:
204
  # gr.Markdown(
205
- # "# 🎧 双语音频字幕对比\n"
206
- # " UI 中选择 Hugging Face Dataset,并分别指定 Track A / Track B 的媒体与字幕文件。"
207
  # )
208
 
209
  # state = gr.State()
210
- # selected_row = gr.State()
211
 
212
  # with gr.Row():
213
- # repo_id = gr.Textbox(label="Dataset / Repo 名称", placeholder="org/dataset")
214
  # repo_type = gr.Radio(["dataset", "model"], value="dataset", label="Repo 类型")
215
 
216
  # btn_scan = gr.Button("扫描 Dataset", variant="primary")
217
 
218
  # gr.Markdown("## Track A / Track B 文件选择(来自 Dataset)")
219
-
220
  # with gr.Row():
221
  # media_a = gr.Dropdown(label="Track A 媒体文件")
222
  # media_b = gr.Dropdown(label="Track B 媒体文件")
@@ -243,24 +601,28 @@
243
 
244
  # stats = gr.JSON(label="统计信息")
245
 
246
- # btn_align.click(
247
- # load_and_align,
248
- # inputs=[repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th],
249
- # outputs=[df, stats, state],
250
- # )
251
-
252
- # df.select(lambda e: e.index, None, selected_row)
253
-
254
- # btn_play = gr.Button("播放选中片段", variant="primary")
255
 
256
  # with gr.Row():
257
  # a_out = gr.Audio(label="Track A 片段")
258
  # b_out = gr.Audio(label="Track B 片段")
 
259
 
260
- # btn_play.click(
261
- # play_selected,
262
- # inputs=[df, selected_row, state],
263
- # outputs=[a_out, b_out],
 
 
 
 
 
 
 
264
  # )
265
 
266
  # if __name__ == "__main__":
@@ -277,16 +639,15 @@ from pydub import AudioSegment
277
 
278
 
279
  # =========================================================
280
- # 通用配置
281
  # =========================================================
282
  MEDIA_EXTS = (".mp4", ".m4a", ".mp3", ".wav", ".flac", ".ogg", ".aac", ".mov", ".avi")
283
  VTT_EXTS = (".vtt",)
284
-
285
  DEFAULT_MAX_MID_DIFF = 1.5
286
 
287
 
288
  # =========================================================
289
- # 数据结构 & VTT 解析
290
  # =========================================================
291
  @dataclass
292
  class Cue:
@@ -295,6 +656,9 @@ class Cue:
295
  text: str
296
 
297
 
 
 
 
298
  _TAG_RE = re.compile(r"</?[^>]+?>", re.IGNORECASE)
299
  _VTT_TIME_RE = re.compile(
300
  r"(?P<start>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})\s*-->\s*"
@@ -310,16 +674,13 @@ def _time_to_seconds(t: str) -> float:
310
  parts = t.split(":")
311
  if len(parts) == 3:
312
  return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
313
- if len(parts) == 2:
314
- return int(parts[0]) * 60 + float(parts[1])
315
- raise ValueError(t)
316
 
317
 
318
  def parse_vtt_file(path: str) -> List[Cue]:
319
  with open(path, "r", encoding="utf-8") as f:
320
  content = f.read()
321
 
322
- # 去 BOM / WEBVTT 头
323
  content = content.replace("\ufeff", "")
324
  content = re.sub(r"^\s*WEBVTT.*?\n", "", content, flags=re.IGNORECASE)
325
 
@@ -331,7 +692,6 @@ def parse_vtt_file(path: str) -> List[Cue]:
331
  if not lines:
332
  continue
333
 
334
- # 找时间轴行(必须包含 -->)
335
  time_idx = None
336
  for i, line in enumerate(lines):
337
  if "-->" in line:
@@ -349,23 +709,19 @@ def parse_vtt_file(path: str) -> List[Cue]:
349
  if end <= start:
350
  continue
351
 
352
- # ✅ 只取时间轴行之后的内容作为字幕
353
  text_lines = lines[time_idx + 1 :]
354
  if not text_lines:
355
  continue
356
 
357
- text = _strip_tags("\n".join(text_lines)).strip()
358
- if not text:
359
- continue
360
-
361
- cues.append(Cue(start=start, end=end, text=text))
362
 
363
  return sorted(cues, key=lambda x: x.start)
364
 
365
 
366
-
367
  # =========================================================
368
- # 对齐逻辑
369
  # =========================================================
370
  def align_by_time(a: List[Cue], b: List[Cue], th: float) -> List[Dict]:
371
  out, i, j, idx = [], 0, 0, 1
@@ -378,10 +734,6 @@ def align_by_time(a: List[Cue], b: List[Cue], th: float) -> List[Dict]:
378
  "idx": idx,
379
  "start": min(a[i].start, b[j].start),
380
  "end": max(a[i].end, b[j].end),
381
- "a_start": a[i].start,
382
- "a_end": a[i].end,
383
- "b_start": b[j].start,
384
- "b_end": b[j].end,
385
  "a_text": a[i].text,
386
  "b_text": b[j].text,
387
  }
@@ -397,53 +749,29 @@ def align_by_time(a: List[Cue], b: List[Cue], th: float) -> List[Dict]:
397
 
398
 
399
  # =========================================================
400
- # 播放工具
401
  # =========================================================
402
  def export_segment(audio: AudioSegment, start: float, end: float) -> str:
403
- start_ms = int(max(0.0, start) * 1000)
404
- end_ms = int(max(start + 0.01, end) * 1000)
405
- seg = audio[start_ms:end_ms]
406
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
407
  seg.export(tmp.name, format="wav")
408
  return tmp.name
409
 
410
 
411
- def pick_window(seg: Dict, mode: str, off_a: float, off_b: float):
412
- if mode == "global":
413
- a_start, a_end = seg["start"], seg["end"]
414
- b_start, b_end = seg["start"], seg["end"]
415
- else:
416
- a_start, a_end = seg["a_start"], seg["a_end"]
417
- b_start, b_end = seg["b_start"], seg["b_end"]
418
-
419
- a_start, a_end = a_start + off_a, a_end + off_a
420
- b_start, b_end = b_start + off_b, b_end + off_b
421
-
422
- a_start = max(0.0, a_start)
423
- b_start = max(0.0, b_start)
424
- a_end = max(a_start + 0.01, a_end)
425
- b_end = max(b_start + 0.01, b_end)
426
-
427
- return a_start, a_end, b_start, b_end
428
-
429
-
430
  # =========================================================
431
- # Gradio 回调:扫描 Dataset
432
  # =========================================================
433
  def scan_dataset(repo_id: str, repo_type: str):
434
  if not repo_id:
435
  raise gr.Error("请填写 Dataset / Repo 名称。")
436
 
437
  files = list_repo_files(repo_id, repo_type=repo_type)
438
- media_files = sorted([f for f in files if f.lower().endswith(MEDIA_EXTS)])
439
- vtt_files = sorted([f for f in files if f.lower().endswith(VTT_EXTS)])
440
 
441
- if not media_files:
442
- raise gr.Error("Dataset 中未发现媒体文件。")
443
- if not vtt_files:
444
- raise gr.Error("Dataset 中未发现 VTT 文件。")
445
 
446
- # 默认都选第一个(用户可再改)
447
  return (
448
  gr.update(choices=media_files, value=media_files[0]),
449
  gr.update(choices=media_files, value=media_files[0]),
@@ -452,38 +780,16 @@ def scan_dataset(repo_id: str, repo_type: str):
452
  )
453
 
454
 
455
- # =========================================================
456
- # Gradio 回调:加载并对齐
457
- # =========================================================
458
- def load_and_align(
459
- repo_id,
460
- repo_type,
461
- media_a_path,
462
- media_b_path,
463
- vtt_a_path,
464
- vtt_b_path,
465
- th,
466
- ):
467
- if not all([media_a_path, media_b_path, vtt_a_path, vtt_b_path]):
468
- raise gr.Error("请为 Track A / B 分别选择媒体文件和 VTT 文件。")
469
-
470
- local_media_a = hf_hub_download(repo_id, media_a_path, repo_type=repo_type)
471
- local_media_b = hf_hub_download(repo_id, media_b_path, repo_type=repo_type)
472
- local_vtt_a = hf_hub_download(repo_id, vtt_a_path, repo_type=repo_type)
473
- local_vtt_b = hf_hub_download(repo_id, vtt_b_path, repo_type=repo_type)
474
-
475
- audio_a = AudioSegment.from_file(local_media_a)
476
- audio_b = AudioSegment.from_file(local_media_b)
477
-
478
- cues_a = parse_vtt_file(local_vtt_a)
479
- cues_b = parse_vtt_file(local_vtt_b)
480
-
481
- if not cues_a or not cues_b:
482
- raise gr.Error("VTT 解析为空,请检查字幕文件内容。")
483
 
484
  aligned = align_by_time(cues_a, cues_b, th)
485
  if not aligned:
486
- raise gr.Error("未对齐到任何片段,请尝试增大对齐阈值。")
487
 
488
  rows = [
489
  [x["idx"], f'{x["start"]:.2f}-{x["end"]:.2f}', x["a_text"], x["b_text"]]
@@ -496,92 +802,49 @@ def load_and_align(
496
  "audio_b": audio_b,
497
  }
498
 
499
- stats = {
500
- "track_a_media": media_a_path,
501
- "track_b_media": media_b_path,
502
- "track_a_vtt": vtt_a_path,
503
- "track_b_vtt": vtt_b_path,
504
- "aligned_segments": len(aligned),
505
- "max_mid_diff_sec": th,
506
- }
507
-
508
- # 注意:对齐后清空播放器 & 播放信息,避免旧内容误导
509
- return rows, stats, state, None, None, {}
510
 
511
 
512
- # =========================================================
513
- # Gradio 回调:选择即播放
514
- # =========================================================
515
- def play_on_select(
516
- evt: gr.SelectData,
517
- df_value,
518
- play_mode,
519
- crop_mode,
520
- offset_a,
521
- offset_b,
522
- state,
523
- ):
524
- if not state or "aligned" not in state:
525
  raise gr.Error("请先加载并对齐。")
526
 
527
- idx_raw = evt.index
528
- row = int(idx_raw[0] if isinstance(idx_raw, (tuple, list)) else idx_raw)
529
-
530
- if not df_value or row < 0 or row >= len(df_value):
531
- raise gr.Error("无法读取选中行,请重试。")
532
-
533
  seg_idx = int(df_value[row][0])
534
  seg = state["aligned"][seg_idx - 1]
535
 
536
- a_start, a_end, b_start, b_end = pick_window(
537
- seg, crop_mode, float(offset_a), float(offset_b)
538
- )
539
-
540
- a_wav = export_segment(state["audio_a"], a_start, a_end)
541
- b_wav = export_segment(state["audio_b"], b_start, b_end)
542
 
543
  info = {
544
  "segment": seg_idx,
545
- "play_mode": play_mode,
546
- "A_time": f"{a_start:.2f}-{a_end:.2f}",
547
- "B_time": f"{b_start:.2f}-{b_end:.2f}",
548
  }
549
 
550
- # 关键修复:不要返回 None
551
- if play_mode == "A":
552
- return a_wav, gr.update(value=None), info
553
- elif play_mode == "B":
554
- return gr.update(value=None), b_wav, info
555
- else:
556
- return a_wav, b_wav, info
557
-
558
 
559
 
560
  # =========================================================
561
  # UI
562
  # =========================================================
563
- with gr.Blocks(title="双语音频字幕对比(选择即播放)") as demo:
564
- gr.Markdown(
565
- "# 🎧 双语音频字幕对比(选择即播放)\n"
566
- "步骤:扫描 Dataset → 分别选择 A/B 媒体与字幕 → 加载并对齐 → **点击表格任意单元格即可播放片段**"
567
- )
568
 
569
  state = gr.State()
570
 
571
  with gr.Row():
572
- repo_id = gr.Textbox(label="Dataset / Repo 名称", placeholder="org/dataset 或 org/repo")
573
  repo_type = gr.Radio(["dataset", "model"], value="dataset", label="Repo 类型")
574
 
575
  btn_scan = gr.Button("扫描 Dataset", variant="primary")
576
 
577
- gr.Markdown("## Track A / Track B 文件选择(来自 Dataset)")
578
  with gr.Row():
579
- media_a = gr.Dropdown(label="Track A 媒体文件")
580
- media_b = gr.Dropdown(label="Track B 媒体文件")
581
 
582
  with gr.Row():
583
- vtt_a = gr.Dropdown(label="Track A VTT 文件")
584
- vtt_b = gr.Dropdown(label="Track B VTT 文件")
585
 
586
  btn_scan.click(
587
  scan_dataset,
@@ -589,39 +852,31 @@ with gr.Blocks(title="双语音频字幕对比(选择即播放)") as demo:
589
  outputs=[media_a, media_b, vtt_a, vtt_b],
590
  )
591
 
592
- th = gr.Slider(0.3, 5.0, value=DEFAULT_MAX_MID_DIFF, step=0.1, label="字幕对齐阈值(秒)")
593
  btn_align = gr.Button("加载并对齐", variant="primary")
594
 
595
  df = gr.Dataframe(
596
- headers=["#", "Global Time", "Track A", "Track B"],
597
  interactive=True,
598
  wrap=True,
599
  max_height=520,
600
  )
601
 
602
- stats = gr.JSON(label="统计信息")
603
-
604
- gr.Markdown("## 播放参数(点击表格即可按这些参数播放)")
605
- play_mode = gr.Radio(["A", "B", "同时"], value="同时", label="播放模式")
606
- crop_mode = gr.Radio(["global", "per_track"], value="global", label="裁剪方式")
607
- offset_a = gr.Slider(-10, 10, value=0, step=0.1, label="Track A 偏移(s)")
608
- offset_b = gr.Slider(-10, 10, value=0, step=0.1, label="Track B 偏移(s)")
609
 
610
  with gr.Row():
611
  a_out = gr.Audio(label="Track A 片段")
612
  b_out = gr.Audio(label="Track B 片段")
613
- play_info = gr.JSON(label="播放信息")
614
 
615
- btn_align.click(
616
- load_and_align,
617
- inputs=[repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th],
618
- outputs=[df, stats, state, a_out, b_out, play_info],
619
- )
620
 
621
- # ✅ 关键:选择即播放(不再需要“播放选中片段”按钮)
622
  df.select(
623
- fn=play_on_select,
624
- inputs=[df, play_mode, crop_mode, offset_a, offset_b, state],
625
  outputs=[a_out, b_out, play_info],
626
  )
627
 
 
1
+ # # import os
2
+ # # import re
3
+ # # import math
4
+ # # import tempfile
5
+ # # from dataclasses import dataclass
6
+ # # from typing import List, Dict
7
+
8
+ # # import gradio as gr
9
+ # # from huggingface_hub import list_repo_files, hf_hub_download
10
+ # # from pydub import AudioSegment
11
+
12
+
13
+ # # # =========================================================
14
+ # # # 基础配置(通用规则)
15
+ # # # =========================================================
16
+ # # MEDIA_EXTS = (".mp4", ".m4a", ".mp3", ".wav", ".flac", ".ogg", ".aac", ".mov", ".avi")
17
+ # # VTT_EXTS = (".vtt",)
18
+
19
+ # # DEFAULT_MAX_MID_DIFF = 1.5
20
+
21
+
22
+ # # # =========================================================
23
+ # # # 数据结构 & VTT 解析
24
+ # # # =========================================================
25
+ # # @dataclass
26
+ # # class Cue:
27
+ # # start: float
28
+ # # end: float
29
+ # # text: str
30
+
31
+
32
+ # # _TAG_RE = re.compile(r"</?[^>]+?>", re.IGNORECASE)
33
+ # # _VTT_TIME_RE = re.compile(
34
+ # # r"(?P<start>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})\s*-->\s*"
35
+ # # r"(?P<end>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})"
36
+ # # )
37
+
38
+
39
+ # # def _strip_tags(text: str) -> str:
40
+ # # return _TAG_RE.sub("", text).strip()
41
+
42
+
43
+ # # def _time_to_seconds(t: str) -> float:
44
+ # # parts = t.split(":")
45
+ # # if len(parts) == 3:
46
+ # # return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
47
+ # # if len(parts) == 2:
48
+ # # return int(parts[0]) * 60 + float(parts[1])
49
+ # # raise ValueError(t)
50
+
51
+
52
+ # # def parse_vtt_file(path: str) -> List[Cue]:
53
+ # # with open(path, "r", encoding="utf-8") as f:
54
+ # # content = f.read()
55
+
56
+ # # blocks = re.split(r"\r?\n\r?\n", content.replace("\ufeff", "").strip())
57
+ # # cues = []
58
+
59
+ # # for block in blocks:
60
+ # # lines = [l for l in block.splitlines() if l.strip()]
61
+ # # time_line = next((l for l in lines if "-->" in l), None)
62
+ # # if not time_line:
63
+ # # continue
64
+
65
+ # # m = _VTT_TIME_RE.search(time_line)
66
+ # # if not m:
67
+ # # continue
68
+
69
+ # # start = _time_to_seconds(m.group("start"))
70
+ # # end = _time_to_seconds(m.group("end"))
71
+ # # if end <= start:
72
+ # # continue
73
+
74
+ # # text = _strip_tags("\n".join(lines[1:]))
75
+ # # if text:
76
+ # # cues.append(Cue(start, end, text))
77
+
78
+ # # return sorted(cues, key=lambda x: x.start)
79
+
80
+
81
+ # # # =========================================================
82
+ # # # 对齐逻辑
83
+ # # # =========================================================
84
+ # # def align_by_time(a: List[Cue], b: List[Cue], th: float) -> List[Dict]:
85
+ # # out, i, j, idx = [], 0, 0, 1
86
+ # # while i < len(a) and j < len(b):
87
+ # # ma = (a[i].start + a[i].end) / 2
88
+ # # mb = (b[j].start + b[j].end) / 2
89
+ # # if abs(ma - mb) <= th:
90
+ # # out.append({
91
+ # # "idx": idx,
92
+ # # "start": min(a[i].start, b[j].start),
93
+ # # "end": max(a[i].end, b[j].end),
94
+ # # "a": a[i],
95
+ # # "b": b[j],
96
+ # # })
97
+ # # idx += 1
98
+ # # i += 1
99
+ # # j += 1
100
+ # # elif ma < mb:
101
+ # # i += 1
102
+ # # else:
103
+ # # j += 1
104
+ # # return out
105
+
106
+
107
+ # # # =========================================================
108
+ # # # 播放工具
109
+ # # # =========================================================
110
+ # # def export_segment(audio: AudioSegment, start: float, end: float) -> str:
111
+ # # seg = audio[int(start * 1000): int(end * 1000)]
112
+ # # tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
113
+ # # seg.export(tmp.name, format="wav")
114
+ # # return tmp.name
115
+
116
+
117
+ # # # =========================================================
118
+ # # # Gradio 回调
119
+ # # # =========================================================
120
+ # # def scan_dataset(repo_id: str, repo_type: str):
121
+ # # if not repo_id:
122
+ # # raise gr.Error("请填写 Dataset / Repo 名称。")
123
+
124
+ # # files = list_repo_files(repo_id, repo_type=repo_type)
125
+
126
+ # # media_files = sorted([f for f in files if f.lower().endswith(MEDIA_EXTS)])
127
+ # # vtt_files = sorted([f for f in files if f.lower().endswith(VTT_EXTS)])
128
+
129
+ # # if not media_files:
130
+ # # raise gr.Error("Dataset 中未发现媒体文件。")
131
+ # # if not vtt_files:
132
+ # # raise gr.Error("Dataset 中未发现 VTT 文件。")
133
+
134
+ # # return (
135
+ # # gr.update(choices=media_files, value=media_files[0]),
136
+ # # gr.update(choices=media_files, value=media_files[0]),
137
+ # # gr.update(choices=vtt_files, value=vtt_files[0]),
138
+ # # gr.update(choices=vtt_files, value=vtt_files[0]),
139
+ # # )
140
+
141
+
142
+ # # def load_and_align(
143
+ # # repo_id,
144
+ # # repo_type,
145
+ # # media_a_path,
146
+ # # media_b_path,
147
+ # # vtt_a_path,
148
+ # # vtt_b_path,
149
+ # # th,
150
+ # # ):
151
+ # # if not all([media_a_path, media_b_path, vtt_a_path, vtt_b_path]):
152
+ # # raise gr.Error("请为 Track A / B 分别选择媒体文件和 VTT 文件。")
153
+
154
+ # # media_a = AudioSegment.from_file(hf_hub_download(repo_id, media_a_path, repo_type=repo_type))
155
+ # # media_b = AudioSegment.from_file(hf_hub_download(repo_id, media_b_path, repo_type=repo_type))
156
+
157
+ # # cues_a = parse_vtt_file(hf_hub_download(repo_id, vtt_a_path, repo_type=repo_type))
158
+ # # cues_b = parse_vtt_file(hf_hub_download(repo_id, vtt_b_path, repo_type=repo_type))
159
+
160
+ # # if not cues_a or not cues_b:
161
+ # # raise gr.Error("VTT 解析为空,请检查字幕文件。")
162
+
163
+ # # aligned = align_by_time(cues_a, cues_b, th)
164
+ # # if not aligned:
165
+ # # raise gr.Error("未对齐到任何片段,请尝试增大对齐阈值。")
166
+
167
+ # # rows = [
168
+ # # [x["idx"], f'{x["start"]:.2f}-{x["end"]:.2f}', x["a"].text, x["b"].text]
169
+ # # for x in aligned
170
+ # # ]
171
+
172
+ # # state = {
173
+ # # "aligned": aligned,
174
+ # # "audio_a": media_a,
175
+ # # "audio_b": media_b,
176
+ # # }
177
+
178
+ # # stats = {
179
+ # # "track_a_media": media_a_path,
180
+ # # "track_b_media": media_b_path,
181
+ # # "track_a_vtt": vtt_a_path,
182
+ # # "track_b_vtt": vtt_b_path,
183
+ # # "aligned_segments": len(aligned),
184
+ # # "max_mid_diff_sec": th,
185
+ # # }
186
+
187
+ # # return rows, stats, state
188
+
189
+
190
+ # # def play_selected(df, row, state):
191
+ # # if row is None:
192
+ # # raise gr.Error("请先点击表格选择一行。")
193
+
194
+ # # seg = state["aligned"][int(df[row][0]) - 1]
195
+ # # a_wav = export_segment(state["audio_a"], seg["start"], seg["end"])
196
+ # # b_wav = export_segment(state["audio_b"], seg["start"], seg["end"])
197
+ # # return a_wav, b_wav
198
+
199
+
200
+ # # # =========================================================
201
+ # # # UI
202
+ # # # =========================================================
203
+ # # with gr.Blocks(title="双语音频字幕对比(UI 指定 Dataset / A-B 文件)") as demo:
204
+ # # gr.Markdown(
205
+ # # "# 🎧 双语音频字幕对比\n"
206
+ # # "在 UI 中选择 Hugging Face Dataset,并分别指定 Track A / Track B 的媒体与字幕文件。"
207
+ # # )
208
+
209
+ # # state = gr.State()
210
+ # # selected_row = gr.State()
211
+
212
+ # # with gr.Row():
213
+ # # repo_id = gr.Textbox(label="Dataset / Repo 名称", placeholder="org/dataset")
214
+ # # repo_type = gr.Radio(["dataset", "model"], value="dataset", label="Repo 类型")
215
+
216
+ # # btn_scan = gr.Button("扫描 Dataset", variant="primary")
217
+
218
+ # # gr.Markdown("## Track A / Track B 文件选择(来自 Dataset)")
219
+
220
+ # # with gr.Row():
221
+ # # media_a = gr.Dropdown(label="Track A 媒体文件")
222
+ # # media_b = gr.Dropdown(label="Track B 媒体文件")
223
+
224
+ # # with gr.Row():
225
+ # # vtt_a = gr.Dropdown(label="Track A VTT 文件")
226
+ # # vtt_b = gr.Dropdown(label="Track B VTT 文件")
227
+
228
+ # # btn_scan.click(
229
+ # # scan_dataset,
230
+ # # inputs=[repo_id, repo_type],
231
+ # # outputs=[media_a, media_b, vtt_a, vtt_b],
232
+ # # )
233
+
234
+ # # th = gr.Slider(0.3, 5.0, value=DEFAULT_MAX_MID_DIFF, step=0.1, label="字幕对齐阈值(秒)")
235
+ # # btn_align = gr.Button("加载并对齐", variant="primary")
236
+
237
+ # # df = gr.Dataframe(
238
+ # # headers=["#", "Global Time", "Track A", "Track B"],
239
+ # # interactive=True,
240
+ # # wrap=True,
241
+ # # max_height=520,
242
+ # # )
243
+
244
+ # # stats = gr.JSON(label="统计信息")
245
+
246
+ # # btn_align.click(
247
+ # # load_and_align,
248
+ # # inputs=[repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th],
249
+ # # outputs=[df, stats, state],
250
+ # # )
251
+
252
+ # # df.select(lambda e: e.index, None, selected_row)
253
+
254
+ # # btn_play = gr.Button("播放选中片段", variant="primary")
255
+
256
+ # # with gr.Row():
257
+ # # a_out = gr.Audio(label="Track A 片段")
258
+ # # b_out = gr.Audio(label="Track B 片段")
259
+
260
+ # # btn_play.click(
261
+ # # play_selected,
262
+ # # inputs=[df, selected_row, state],
263
+ # # outputs=[a_out, b_out],
264
+ # # )
265
+
266
+ # # if __name__ == "__main__":
267
+ # # demo.launch()
268
+
269
  # import re
 
270
  # import tempfile
271
  # from dataclasses import dataclass
272
  # from typing import List, Dict
 
277
 
278
 
279
  # # =========================================================
280
+ # # 通用配置
281
  # # =========================================================
282
  # MEDIA_EXTS = (".mp4", ".m4a", ".mp3", ".wav", ".flac", ".ogg", ".aac", ".mov", ".avi")
283
  # VTT_EXTS = (".vtt",)
 
319
  # with open(path, "r", encoding="utf-8") as f:
320
  # content = f.read()
321
 
322
+ # # BOM / WEBVTT 头
323
+ # content = content.replace("\ufeff", "")
324
+ # content = re.sub(r"^\s*WEBVTT.*?\n", "", content, flags=re.IGNORECASE)
325
+
326
+ # blocks = re.split(r"\r?\n\r?\n", content.strip())
327
+ # cues: List[Cue] = []
328
 
329
  # for block in blocks:
330
+ # lines = [l.strip() for l in block.splitlines() if l.strip()]
331
+ # if not lines:
332
+ # continue
333
+
334
+ # # 找时间轴行(必须包含 -->)
335
+ # time_idx = None
336
+ # for i, line in enumerate(lines):
337
+ # if "-->" in line:
338
+ # time_idx = i
339
+ # break
340
+ # if time_idx is None:
341
  # continue
342
 
343
+ # m = _VTT_TIME_RE.search(lines[time_idx])
344
  # if not m:
345
  # continue
346
 
 
349
  # if end <= start:
350
  # continue
351
 
352
+ # # 只取时间轴行之后的内容作为字幕
353
+ # text_lines = lines[time_idx + 1 :]
354
+ # if not text_lines:
355
+ # continue
356
+
357
+ # text = _strip_tags("\n".join(text_lines)).strip()
358
+ # if not text:
359
+ # continue
360
+
361
+ # cues.append(Cue(start=start, end=end, text=text))
362
 
363
  # return sorted(cues, key=lambda x: x.start)
364
 
365
 
366
+
367
  # # =========================================================
368
  # # 对齐逻辑
369
  # # =========================================================
 
373
  # ma = (a[i].start + a[i].end) / 2
374
  # mb = (b[j].start + b[j].end) / 2
375
  # if abs(ma - mb) <= th:
376
+ # out.append(
377
+ # {
378
+ # "idx": idx,
379
+ # "start": min(a[i].start, b[j].start),
380
+ # "end": max(a[i].end, b[j].end),
381
+ # "a_start": a[i].start,
382
+ # "a_end": a[i].end,
383
+ # "b_start": b[j].start,
384
+ # "b_end": b[j].end,
385
+ # "a_text": a[i].text,
386
+ # "b_text": b[j].text,
387
+ # }
388
+ # )
389
  # idx += 1
390
  # i += 1
391
  # j += 1
 
400
  # # 播放工具
401
  # # =========================================================
402
  # def export_segment(audio: AudioSegment, start: float, end: float) -> str:
403
+ # start_ms = int(max(0.0, start) * 1000)
404
+ # end_ms = int(max(start + 0.01, end) * 1000)
405
+ # seg = audio[start_ms:end_ms]
406
  # tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
407
  # seg.export(tmp.name, format="wav")
408
  # return tmp.name
409
 
410
 
411
+ # def pick_window(seg: Dict, mode: str, off_a: float, off_b: float):
412
+ # if mode == "global":
413
+ # a_start, a_end = seg["start"], seg["end"]
414
+ # b_start, b_end = seg["start"], seg["end"]
415
+ # else:
416
+ # a_start, a_end = seg["a_start"], seg["a_end"]
417
+ # b_start, b_end = seg["b_start"], seg["b_end"]
418
+
419
+ # a_start, a_end = a_start + off_a, a_end + off_a
420
+ # b_start, b_end = b_start + off_b, b_end + off_b
421
+
422
+ # a_start = max(0.0, a_start)
423
+ # b_start = max(0.0, b_start)
424
+ # a_end = max(a_start + 0.01, a_end)
425
+ # b_end = max(b_start + 0.01, b_end)
426
+
427
+ # return a_start, a_end, b_start, b_end
428
+
429
+
430
  # # =========================================================
431
+ # # Gradio 回调:扫描 Dataset
432
  # # =========================================================
433
  # def scan_dataset(repo_id: str, repo_type: str):
434
  # if not repo_id:
435
  # raise gr.Error("请填写 Dataset / Repo 名称。")
436
 
437
  # files = list_repo_files(repo_id, repo_type=repo_type)
 
438
  # media_files = sorted([f for f in files if f.lower().endswith(MEDIA_EXTS)])
439
  # vtt_files = sorted([f for f in files if f.lower().endswith(VTT_EXTS)])
440
 
 
443
  # if not vtt_files:
444
  # raise gr.Error("Dataset 中未发现 VTT 文件。")
445
 
446
+ # # 默认都选第一个(用户可再改)
447
  # return (
448
  # gr.update(choices=media_files, value=media_files[0]),
449
  # gr.update(choices=media_files, value=media_files[0]),
 
452
  # )
453
 
454
 
455
+ # # =========================================================
456
+ # # Gradio 回调:加载并对齐
457
+ # # =========================================================
458
  # def load_and_align(
459
  # repo_id,
460
  # repo_type,
 
467
  # if not all([media_a_path, media_b_path, vtt_a_path, vtt_b_path]):
468
  # raise gr.Error("请为 Track A / B 分别选择媒体文件和 VTT 文件。")
469
 
470
+ # local_media_a = hf_hub_download(repo_id, media_a_path, repo_type=repo_type)
471
+ # local_media_b = hf_hub_download(repo_id, media_b_path, repo_type=repo_type)
472
+ # local_vtt_a = hf_hub_download(repo_id, vtt_a_path, repo_type=repo_type)
473
+ # local_vtt_b = hf_hub_download(repo_id, vtt_b_path, repo_type=repo_type)
474
 
475
+ # audio_a = AudioSegment.from_file(local_media_a)
476
+ # audio_b = AudioSegment.from_file(local_media_b)
477
+
478
+ # cues_a = parse_vtt_file(local_vtt_a)
479
+ # cues_b = parse_vtt_file(local_vtt_b)
480
 
481
  # if not cues_a or not cues_b:
482
+ # raise gr.Error("VTT 解析为空,请检查字幕文件内容。")
483
 
484
  # aligned = align_by_time(cues_a, cues_b, th)
485
  # if not aligned:
486
  # raise gr.Error("未对齐到任何片段,请尝试增大对齐阈值。")
487
 
488
  # rows = [
489
+ # [x["idx"], f'{x["start"]:.2f}-{x["end"]:.2f}', x["a_text"], x["b_text"]]
490
  # for x in aligned
491
  # ]
492
 
493
  # state = {
494
  # "aligned": aligned,
495
+ # "audio_a": audio_a,
496
+ # "audio_b": audio_b,
497
  # }
498
 
499
  # stats = {
 
505
  # "max_mid_diff_sec": th,
506
  # }
507
 
508
+ # # 注意:对齐后清空播放器 & 播放信息,避免旧内容误导
509
+ # return rows, stats, state, None, None, {}
510
+
511
+
512
+ # # =========================================================
513
+ # # Gradio 回调:选择即播放
514
+ # # =========================================================
515
+ # def play_on_select(
516
+ # evt: gr.SelectData,
517
+ # df_value,
518
+ # play_mode,
519
+ # crop_mode,
520
+ # offset_a,
521
+ # offset_b,
522
+ # state,
523
+ # ):
524
+ # if not state or "aligned" not in state:
525
+ # raise gr.Error("请先加载并对齐。")
526
+
527
+ # idx_raw = evt.index
528
+ # row = int(idx_raw[0] if isinstance(idx_raw, (tuple, list)) else idx_raw)
529
 
530
+ # if not df_value or row < 0 or row >= len(df_value):
531
+ # raise gr.Error("无法读取选中行,请重试。")
532
 
533
+ # seg_idx = int(df_value[row][0])
534
+ # seg = state["aligned"][seg_idx - 1]
535
+
536
+ # a_start, a_end, b_start, b_end = pick_window(
537
+ # seg, crop_mode, float(offset_a), float(offset_b)
538
+ # )
539
+
540
+ # a_wav = export_segment(state["audio_a"], a_start, a_end)
541
+ # b_wav = export_segment(state["audio_b"], b_start, b_end)
542
+
543
+ # info = {
544
+ # "segment": seg_idx,
545
+ # "play_mode": play_mode,
546
+ # "A_time": f"{a_start:.2f}-{a_end:.2f}",
547
+ # "B_time": f"{b_start:.2f}-{b_end:.2f}",
548
+ # }
549
+
550
+ # # ✅ 关键修复:不要返回 None
551
+ # if play_mode == "A":
552
+ # return a_wav, gr.update(value=None), info
553
+ # elif play_mode == "B":
554
+ # return gr.update(value=None), b_wav, info
555
+ # else:
556
+ # return a_wav, b_wav, info
557
 
 
 
 
 
558
 
559
 
560
  # # =========================================================
561
  # # UI
562
  # # =========================================================
563
+ # with gr.Blocks(title="双语音频字幕对比(选择即播放)") as demo:
564
  # gr.Markdown(
565
+ # "# 🎧 双语音频字幕对比(选择即播放)\n"
566
+ # "步骤:扫描 Dataset → 分别选择 A/B 媒体与字幕 → 加载并对齐 → **点击表格任意单元格即可播放片段**"
567
  # )
568
 
569
  # state = gr.State()
 
570
 
571
  # with gr.Row():
572
+ # repo_id = gr.Textbox(label="Dataset / Repo 名称", placeholder="org/dataset 或 org/repo")
573
  # repo_type = gr.Radio(["dataset", "model"], value="dataset", label="Repo 类型")
574
 
575
  # btn_scan = gr.Button("扫描 Dataset", variant="primary")
576
 
577
  # gr.Markdown("## Track A / Track B 文件选择(来自 Dataset)")
 
578
  # with gr.Row():
579
  # media_a = gr.Dropdown(label="Track A 媒体文件")
580
  # media_b = gr.Dropdown(label="Track B 媒体文件")
 
601
 
602
  # stats = gr.JSON(label="统计信息")
603
 
604
+ # gr.Markdown("## ▶ 播放参数(点击表格即可按这些参数播放)")
605
+ # play_mode = gr.Radio(["A", "B", "同时"], value="同时", label="播放模式")
606
+ # crop_mode = gr.Radio(["global", "per_track"], value="global", label="裁剪方式")
607
+ # offset_a = gr.Slider(-10, 10, value=0, step=0.1, label="Track A 偏移(s)")
608
+ # offset_b = gr.Slider(-10, 10, value=0, step=0.1, label="Track B 偏移(s)")
 
 
 
 
609
 
610
  # with gr.Row():
611
  # a_out = gr.Audio(label="Track A 片段")
612
  # b_out = gr.Audio(label="Track B 片段")
613
+ # play_info = gr.JSON(label="播放信息")
614
 
615
+ # btn_align.click(
616
+ # load_and_align,
617
+ # inputs=[repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th],
618
+ # outputs=[df, stats, state, a_out, b_out, play_info],
619
+ # )
620
+
621
+ # # ✅ 关键:选择即播放(不再需要“播放选中片段”按钮)
622
+ # df.select(
623
+ # fn=play_on_select,
624
+ # inputs=[df, play_mode, crop_mode, offset_a, offset_b, state],
625
+ # outputs=[a_out, b_out, play_info],
626
  # )
627
 
628
  # if __name__ == "__main__":
 
639
 
640
 
641
  # =========================================================
642
+ # 基本配置
643
  # =========================================================
644
# File extensions recognized when scanning a repo; compared case-insensitively
# (callers match against the lowercased filename via str.endswith).
MEDIA_EXTS = (".mp4", ".m4a", ".mp3", ".wav", ".flac", ".ogg", ".aac", ".mov", ".avi")
645
VTT_EXTS = (".vtt",)

646
# Default alignment threshold in seconds — per the "align by time midpoint"
# section, the max allowed difference between two cues' midpoints.
DEFAULT_MAX_MID_DIFF = 1.5
647
 
648
 
649
  # =========================================================
650
+ # 数据结构
651
  # =========================================================
652
  @dataclass
653
  class Cue:
 
656
  text: str
657
 
658
 
659
+ # =========================================================
660
+ # VTT 解析(只保留纯字幕)
661
+ # =========================================================
662
  _TAG_RE = re.compile(r"</?[^>]+?>", re.IGNORECASE)
663
  _VTT_TIME_RE = re.compile(
664
  r"(?P<start>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})\s*-->\s*"
 
674
  parts = t.split(":")
675
  if len(parts) == 3:
676
  return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
677
+ return int(parts[0]) * 60 + float(parts[1])
 
 
678
 
679
 
680
  def parse_vtt_file(path: str) -> List[Cue]:
681
  with open(path, "r", encoding="utf-8") as f:
682
  content = f.read()
683
 
 
684
  content = content.replace("\ufeff", "")
685
  content = re.sub(r"^\s*WEBVTT.*?\n", "", content, flags=re.IGNORECASE)
686
 
 
692
  if not lines:
693
  continue
694
 
 
695
  time_idx = None
696
  for i, line in enumerate(lines):
697
  if "-->" in line:
 
709
  if end <= start:
710
  continue
711
 
 
712
  text_lines = lines[time_idx + 1 :]
713
  if not text_lines:
714
  continue
715
 
716
+ text = _strip_tags("\n".join(text_lines))
717
+ if text:
718
+ cues.append(Cue(start=start, end=end, text=text))
 
 
719
 
720
  return sorted(cues, key=lambda x: x.start)
721
 
722
 
 
723
  # =========================================================
724
+ # 字幕对齐(按时间中点)
725
  # =========================================================
726
  def align_by_time(a: List[Cue], b: List[Cue], th: float) -> List[Dict]:
727
  out, i, j, idx = [], 0, 0, 1
 
734
  "idx": idx,
735
  "start": min(a[i].start, b[j].start),
736
  "end": max(a[i].end, b[j].end),
 
 
 
 
737
  "a_text": a[i].text,
738
  "b_text": b[j].text,
739
  }
 
749
 
750
 
751
  # =========================================================
752
+ # 音频切片
753
  # =========================================================
754
def export_segment(audio: AudioSegment, start: float, end: float) -> str:
    """Cut the [start, end] window (in seconds) out of *audio* as a WAV file.

    Returns the path of a temporary ``.wav`` file. The file is created with
    ``delete=False`` so Gradio can stream it after this function returns;
    cleanup is left to the OS temp directory.
    """
    # pydub slices by milliseconds. Clamp so a slightly negative start
    # (possible after user-applied offsets) cannot wrap around to the tail
    # of the clip via Python slice semantics, and so end >= start.
    start_ms = max(0, int(start * 1000))
    end_ms = max(start_ms, int(end * 1000))
    seg = audio[start_ms:end_ms]

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    # Close our handle before pydub re-opens the path for writing —
    # required on Windows (can't reopen an open NamedTemporaryFile),
    # harmless on POSIX.
    tmp.close()
    seg.export(tmp.name, format="wav")
    return tmp.name
759
 
760
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761
  # =========================================================
762
+ # Gradio 回调
763
  # =========================================================
764
  def scan_dataset(repo_id: str, repo_type: str):
765
  if not repo_id:
766
  raise gr.Error("请填写 Dataset / Repo 名称。")
767
 
768
  files = list_repo_files(repo_id, repo_type=repo_type)
769
+ media_files = [f for f in files if f.lower().endswith(MEDIA_EXTS)]
770
+ vtt_files = [f for f in files if f.lower().endswith(VTT_EXTS)]
771
 
772
+ if not media_files or not vtt_files:
773
+ raise gr.Error("Dataset 中未找到媒体文件或 VTT 文件。")
 
 
774
 
 
775
  return (
776
  gr.update(choices=media_files, value=media_files[0]),
777
  gr.update(choices=media_files, value=media_files[0]),
 
780
  )
781
 
782
 
783
+ def load_and_align(repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th):
784
+ audio_a = AudioSegment.from_file(hf_hub_download(repo_id, media_a, repo_type=repo_type))
785
+ audio_b = AudioSegment.from_file(hf_hub_download(repo_id, media_b, repo_type=repo_type))
786
+
787
+ cues_a = parse_vtt_file(hf_hub_download(repo_id, vtt_a, repo_type=repo_type))
788
+ cues_b = parse_vtt_file(hf_hub_download(repo_id, vtt_b, repo_type=repo_type))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
789
 
790
  aligned = align_by_time(cues_a, cues_b, th)
791
  if not aligned:
792
+ raise gr.Error("未对齐到任何字幕片段。")
793
 
794
  rows = [
795
  [x["idx"], f'{x["start"]:.2f}-{x["end"]:.2f}', x["a_text"], x["b_text"]]
 
802
  "audio_b": audio_b,
803
  }
804
 
805
+ return rows, state, None, None
 
 
 
 
 
 
 
 
 
 
806
 
807
 
808
def play_on_select(evt: gr.SelectData, df_value, state):
    """Gradio callback: play the aligned audio segment for the clicked row.

    Returns ``(a_wav_path, b_wav_path, info_dict)`` for the two Audio
    components and the playback-info JSON display.

    Raises gr.Error (shown as a toast) instead of leaking IndexError when
    the selection is stale or the state is missing.
    """
    if not state or "aligned" not in state:
        raise gr.Error("请先加载并对齐。")

    # evt.index is (row, col) for a Dataframe select event; guard against
    # older Gradio versions that may deliver a bare int.
    raw_idx = evt.index
    row = int(raw_idx[0] if isinstance(raw_idx, (tuple, list)) else raw_idx)

    # Validate the row against the current table contents before indexing,
    # so a stale/out-of-range click surfaces as a user-facing error.
    if not df_value or row < 0 or row >= len(df_value):
        raise gr.Error("无法读取选中行,请重试。")

    # Column 0 of the table holds the 1-based segment index.
    seg_idx = int(df_value[row][0])
    seg = state["aligned"][seg_idx - 1]

    a_wav = export_segment(state["audio_a"], seg["start"], seg["end"])
    b_wav = export_segment(state["audio_b"], seg["start"], seg["end"])

    info = {
        "segment": seg_idx,
        "time": f'{seg["start"]:.2f} - {seg["end"]:.2f}',
    }

    return a_wav, b_wav, info
 
 
 
 
 
 
 
825
 
826
 
827
  # =========================================================
828
  # UI
829
  # =========================================================
830
+ with gr.Blocks(title="双语音频字幕对齐(点击即播放)") as demo:
831
+ gr.Markdown("# 🎧 双语音频字幕对齐(点击表格即播放)")
 
 
 
832
 
833
  state = gr.State()
834
 
835
  with gr.Row():
836
+ repo_id = gr.Textbox(label="Dataset / Repo 名称", placeholder="org/dataset")
837
  repo_type = gr.Radio(["dataset", "model"], value="dataset", label="Repo 类型")
838
 
839
  btn_scan = gr.Button("扫描 Dataset", variant="primary")
840
 
 
841
  with gr.Row():
842
+ media_a = gr.Dropdown(label="Track A 媒体")
843
+ media_b = gr.Dropdown(label="Track B 媒体")
844
 
845
  with gr.Row():
846
+ vtt_a = gr.Dropdown(label="Track A 字幕")
847
+ vtt_b = gr.Dropdown(label="Track B 字幕")
848
 
849
  btn_scan.click(
850
  scan_dataset,
 
852
  outputs=[media_a, media_b, vtt_a, vtt_b],
853
  )
854
 
855
+ th = gr.Slider(0.3, 5.0, value=DEFAULT_MAX_MID_DIFF, step=0.1, label="对齐阈值(秒)")
856
  btn_align = gr.Button("加载并对齐", variant="primary")
857
 
858
  df = gr.Dataframe(
859
+ headers=["#", "Time", "Track A", "Track B"],
860
  interactive=True,
861
  wrap=True,
862
  max_height=520,
863
  )
864
 
865
+ btn_align.click(
866
+ load_and_align,
867
+ inputs=[repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th],
868
+ outputs=[df, state, gr.Audio(), gr.Audio()],
869
+ )
 
 
870
 
871
  with gr.Row():
872
  a_out = gr.Audio(label="Track A 片段")
873
  b_out = gr.Audio(label="Track B 片段")
 
874
 
875
+ play_info = gr.JSON(label="当前片段")
 
 
 
 
876
 
 
877
  df.select(
878
+ play_on_select,
879
+ inputs=[df, state],
880
  outputs=[a_out, b_out, play_info],
881
  )
882