unknown committed on
Commit
fa56eeb
·
1 Parent(s): 50d1858
Files changed (1) hide show
  1. app.py +382 -1
app.py CHANGED
@@ -628,6 +628,388 @@
628
  # if __name__ == "__main__":
629
  # demo.launch()
630
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
631
  import re
632
  from dataclasses import dataclass
633
  from typing import Any, Dict, List, Tuple, Optional
@@ -1009,4 +1391,3 @@ with gr.Blocks(title="双语音频字幕对齐(点击即播放)") as demo:
1009
 
1010
  if __name__ == "__main__":
1011
  demo.launch()
1012
-
 
628
  # if __name__ == "__main__":
629
  # demo.launch()
630
 
631
+ # import re
632
+ # from dataclasses import dataclass
633
+ # from typing import Any, Dict, List, Tuple, Optional
634
+
635
+ # import gradio as gr
636
+ # from huggingface_hub import list_repo_files, hf_hub_download
637
+ # from pydub import AudioSegment
638
+ # import numpy as np
639
+
640
+ # # =========================================================
641
+ # # Config
642
+ # # =========================================================
643
+ # MEDIA_EXTS = (".mp4", ".m4a", ".mp3", ".wav", ".flac", ".ogg", ".aac", ".mov", ".avi")
644
+ # VTT_EXTS = (".vtt",)
645
+
646
+ # DEFAULT_MAX_MID_DIFF = 1.5
647
+
648
+ # # Normalize audio for stable playback in browsers
649
+ # TARGET_SR = 48000
650
+ # TARGET_CH = 1 # mono
651
+ # TARGET_SW = 2 # 16-bit PCM
652
+
653
+
654
+ # # =========================================================
655
+ # # Data structures
656
+ # # =========================================================
657
+ # @dataclass
658
+ # class Cue:
659
+ # start: float
660
+ # end: float
661
+ # text: str
662
+
663
+
664
+ # # =========================================================
665
+ # # VTT parsing
666
+ # # =========================================================
667
+ # _TAG_RE = re.compile(r"</?[^>]+?>", re.IGNORECASE)
668
+ # _VTT_TIME_RE = re.compile(
669
+ # r"(?P<start>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})\s*-->\s*"
670
+ # r"(?P<end>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})"
671
+ # )
672
+
673
+
674
+ # def _strip_tags(text: str) -> str:
675
+ # return _TAG_RE.sub("", text).strip()
676
+
677
+
678
+ # def _time_to_seconds(t: str) -> float:
679
+ # parts = t.split(":")
680
+ # if len(parts) == 3:
681
+ # return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
682
+ # if len(parts) == 2:
683
+ # return int(parts[0]) * 60 + float(parts[1])
684
+ # raise ValueError(f"Bad VTT timestamp: {t}")
685
+
686
+
687
+ # def parse_vtt_file(path: str) -> List[Cue]:
688
+ # with open(path, "r", encoding="utf-8") as f:
689
+ # content = f.read()
690
+
691
+ # # Remove BOM / WEBVTT header (if any)
692
+ # content = content.replace("\ufeff", "")
693
+ # content = re.sub(r"^\s*WEBVTT.*?\n", "", content, flags=re.IGNORECASE)
694
+
695
+ # blocks = re.split(r"\r?\n\r?\n", content.strip())
696
+ # cues: List[Cue] = []
697
+
698
+ # for block in blocks:
699
+ # lines = [l.strip() for l in block.splitlines() if l.strip()]
700
+ # if not lines:
701
+ # continue
702
+
703
+ # # Locate the timestamp line (must contain "-->")
704
+ # time_idx: Optional[int] = None
705
+ # for i, line in enumerate(lines):
706
+ # if "-->" in line:
707
+ # time_idx = i
708
+ # break
709
+ # if time_idx is None:
710
+ # continue
711
+
712
+ # m = _VTT_TIME_RE.search(lines[time_idx])
713
+ # if not m:
714
+ # continue
715
+
716
+ # start = _time_to_seconds(m.group("start"))
717
+ # end = _time_to_seconds(m.group("end"))
718
+ # if end <= start:
719
+ # continue
720
+
721
+ # # Only take lines after the timestamp line as subtitle text
722
+ # text_lines = lines[time_idx + 1 :]
723
+ # if not text_lines:
724
+ # continue
725
+
726
+ # text = _strip_tags("\n".join(text_lines))
727
+ # if text:
728
+ # cues.append(Cue(start=start, end=end, text=text))
729
+
730
+ # return sorted(cues, key=lambda x: x.start)
731
+
732
+
733
+ # # =========================================================
734
+ # # Alignment (match by mid time), preserve per-track windows
735
+ # # =========================================================
736
+ # def align_by_time(a: List[Cue], b: List[Cue], th: float) -> List[Dict[str, Any]]:
737
+ # out: List[Dict[str, Any]] = []
738
+ # i, j, idx = 0, 0, 1
739
+
740
+ # while i < len(a) and j < len(b):
741
+ # ma = (a[i].start + a[i].end) / 2
742
+ # mb = (b[j].start + b[j].end) / 2
743
+
744
+ # if abs(ma - mb) <= th:
745
+ # out.append(
746
+ # {
747
+ # "idx": idx,
748
+ # # Per-track time window (recommended for playback)
749
+ # "a_start": a[i].start,
750
+ # "a_end": a[i].end,
751
+ # "b_start": b[j].start,
752
+ # "b_end": b[j].end,
753
+ # # Optional global time window (for comparison/debug)
754
+ # "start": min(a[i].start, b[j].start),
755
+ # "end": max(a[i].end, b[j].end),
756
+ # "a_text": a[i].text,
757
+ # "b_text": b[j].text,
758
+ # }
759
+ # )
760
+ # idx += 1
761
+ # i += 1
762
+ # j += 1
763
+ # elif ma < mb:
764
+ # i += 1
765
+ # else:
766
+ # j += 1
767
+
768
+ # return out
769
+
770
+
771
+ # # =========================================================
772
+ # # Audio slicing -> return (sr, np.int16) for gr.Audio(type="numpy")
773
+ # # =========================================================
774
+ # def export_segment_numpy(audio: AudioSegment, start: float, end: float) -> Tuple[int, np.ndarray]:
775
+ # start = max(0.0, float(start))
776
+ # end = max(start + 0.05, float(end)) # avoid too-short segments
777
+
778
+ # seg = audio[int(start * 1000) : int(end * 1000)]
779
+
780
+ # # Normalize to mono/48k/int16 for stable browser playback
781
+ # seg = seg.set_channels(TARGET_CH).set_frame_rate(TARGET_SR).set_sample_width(TARGET_SW)
782
+
783
+ # arr = np.asarray(seg.get_array_of_samples())
784
+ # if arr.dtype != np.int16:
785
+ # arr = arr.astype(np.int16, copy=False)
786
+
787
+ # # Shape should be (n,) for mono
788
+ # arr = np.ascontiguousarray(arr)
789
+ # return TARGET_SR, arr
790
+
791
+
792
+ # # =========================================================
793
+ # # Helper: robustly read seg_idx from gr.Dataframe value
794
+ # # =========================================================
795
+ # def _get_seg_idx_from_df(df_value: Any, row: int) -> Optional[int]:
796
+ # if df_value is None:
797
+ # return None
798
+
799
+ # # pandas DataFrame in some Gradio versions
800
+ # try:
801
+ # import pandas as pd # type: ignore
802
+ # if isinstance(df_value, pd.DataFrame):
803
+ # if row < 0 or row >= len(df_value.index) or df_value.shape[1] < 1:
804
+ # return None
805
+ # return int(df_value.iloc[row, 0])
806
+ # except Exception:
807
+ # pass
808
+
809
+ # # list-of-lists
810
+ # try:
811
+ # if isinstance(df_value, list) and row >= 0 and row < len(df_value) and len(df_value[row]) >= 1:
812
+ # return int(df_value[row][0])
813
+ # except Exception:
814
+ # return None
815
+
816
+ # return None
817
+
818
+
819
+ # # =========================================================
820
+ # # Gradio callbacks
821
+ # # =========================================================
822
+ # def scan_dataset(repo_id: str, repo_type: str):
823
+ # if not repo_id:
824
+ # raise gr.Error("请填写 Dataset / Repo 名称(例如 org/dataset)。")
825
+
826
+ # files = list_repo_files(repo_id, repo_type=repo_type)
827
+ # media_files = sorted([f for f in files if f.lower().endswith(MEDIA_EXTS)])
828
+ # vtt_files = sorted([f for f in files if f.lower().endswith(VTT_EXTS)])
829
+
830
+ # if not media_files:
831
+ # raise gr.Error("未找到媒体文件(mp4/mp3/wav 等)。")
832
+ # if not vtt_files:
833
+ # raise gr.Error("未找到 VTT 字幕文件。")
834
+
835
+ # return (
836
+ # gr.update(choices=media_files, value=media_files[0]),
837
+ # gr.update(choices=media_files, value=media_files[0]),
838
+ # gr.update(choices=vtt_files, value=vtt_files[0]),
839
+ # gr.update(choices=vtt_files, value=vtt_files[0]),
840
+ # )
841
+
842
+
843
+ # def load_and_align(repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th):
844
+ # if not all([repo_id, repo_type, media_a, media_b, vtt_a, vtt_b]):
845
+ # raise gr.Error("请先选择 A/B 的媒体文件与 VTT 文件。")
846
+
847
+ # local_media_a = hf_hub_download(repo_id, media_a, repo_type=repo_type)
848
+ # local_media_b = hf_hub_download(repo_id, media_b, repo_type=repo_type)
849
+ # local_vtt_a = hf_hub_download(repo_id, vtt_a, repo_type=repo_type)
850
+ # local_vtt_b = hf_hub_download(repo_id, vtt_b, repo_type=repo_type)
851
+
852
+ # try:
853
+ # audio_a = AudioSegment.from_file(local_media_a)
854
+ # audio_b = AudioSegment.from_file(local_media_b)
855
+ # except Exception as e:
856
+ # raise gr.Error(
857
+ # "媒体解码失败。若是 mp4/m4a,通常需要 ffmpeg。\n"
858
+ # f"原始错误: {repr(e)}"
859
+ # )
860
+
861
+ # cues_a = parse_vtt_file(local_vtt_a)
862
+ # cues_b = parse_vtt_file(local_vtt_b)
863
+ # if not cues_a or not cues_b:
864
+ # raise gr.Error("VTT 解析为空,请检查字幕文件内容。")
865
+
866
+ # aligned = align_by_time(cues_a, cues_b, float(th))
867
+ # if not aligned:
868
+ # raise gr.Error("未对齐到任何字幕片段,请尝试增大对齐阈值。")
869
+
870
+ # rows = [
871
+ # [
872
+ # x["idx"],
873
+ # f'{x["a_start"]:.2f}-{x["a_end"]:.2f}',
874
+ # f'{x["b_start"]:.2f}-{x["b_end"]:.2f}',
875
+ # x["a_text"],
876
+ # x["b_text"],
877
+ # ]
878
+ # for x in aligned
879
+ # ]
880
+
881
+ # # Critical: build idx -> seg map to survive dataframe sorting/reordering
882
+ # idx_map = {int(x["idx"]): x for x in aligned}
883
+
884
+ # state = {
885
+ # "aligned": aligned,
886
+ # "idx_map": idx_map,
887
+ # "audio_a": audio_a,
888
+ # "audio_b": audio_b,
889
+ # }
890
+
891
+ # # Clear old playback outputs
892
+ # return rows, state, None, None, {}
893
+
894
+
895
+ # def play_on_select(evt: gr.SelectData, df_value, crop_mode, offset_a, offset_b, state):
896
+ # if not state or "aligned" not in state:
897
+ # raise gr.Error("请先加载并对齐。")
898
+
899
+ # # evt.index: int or (row, col)
900
+ # idx_raw = evt.index
901
+ # row = int(idx_raw[0] if isinstance(idx_raw, (tuple, list)) else idx_raw)
902
+
903
+ # offset_a = float(offset_a)
904
+ # offset_b = float(offset_b)
905
+
906
+ # # Prefer seg_idx from the clicked row's first column; then resolve via idx_map.
907
+ # seg_idx = _get_seg_idx_from_df(df_value, row)
908
+ # seg = None
909
+ # idx_map = state.get("idx_map", {}) or {}
910
+ # if seg_idx is not None and seg_idx in idx_map:
911
+ # seg = idx_map[seg_idx]
912
+ # else:
913
+ # # Fallback to row->aligned if idx missing (should be rare)
914
+ # aligned = state["aligned"]
915
+ # if row < 0 or row >= len(aligned):
916
+ # raise gr.Error("选中行越界,请重试或重新对齐。")
917
+ # seg = aligned[row]
918
+ # seg_idx = int(seg.get("idx", row + 1))
919
+
920
+ # if crop_mode == "global":
921
+ # a_start, a_end = seg["start"] + offset_a, seg["end"] + offset_a
922
+ # b_start, b_end = seg["start"] + offset_b, seg["end"] + offset_b
923
+ # else:
924
+ # # per_track playback (recommended)
925
+ # a_start, a_end = seg["a_start"] + offset_a, seg["a_end"] + offset_a
926
+ # b_start, b_end = seg["b_start"] + offset_b, seg["b_end"] + offset_b
927
+
928
+ # a_np = export_segment_numpy(state["audio_a"], a_start, a_end)
929
+ # b_np = export_segment_numpy(state["audio_b"], b_start, b_end)
930
+
931
+ # info = {
932
+ # "segment": seg_idx,
933
+ # "row": row,
934
+ # "crop_mode": crop_mode,
935
+ # "A_time": f"{a_start:.2f}-{a_end:.2f}",
936
+ # "B_time": f"{b_start:.2f}-{b_end:.2f}",
937
+ # }
938
+ # return a_np, b_np, info
939
+
940
+
941
+ # # =========================================================
942
+ # # UI
943
+ # # =========================================================
944
+ # with gr.Blocks(title="双语音频字幕对齐(点击即播放)") as demo:
945
+ # gr.Markdown(
946
+ # "# 双语音频字幕对齐(点击表格即播放)\n"
947
+ # "流程:扫描 Dataset → 选择 A/B 媒体与字幕 → 加载并对齐 → 点击表格任意单元格播放对应片段。\n"
948
+ # "若字幕与音频整体存在固定延迟,可用 Track A/B 偏移进行校正。"
949
+ # )
950
+
951
+ # state = gr.State()
952
+
953
+ # with gr.Row():
954
+ # repo_id = gr.Textbox(label="Dataset / Repo 名称", placeholder="org/dataset")
955
+ # repo_type = gr.Radio(["dataset", "model"], value="dataset", label="Repo 类型")
956
+
957
+ # btn_scan = gr.Button("扫描 Dataset", variant="primary")
958
+
959
+ # with gr.Row():
960
+ # media_a = gr.Dropdown(label="Track A 媒体")
961
+ # media_b = gr.Dropdown(label="Track B 媒体")
962
+
963
+ # with gr.Row():
964
+ # vtt_a = gr.Dropdown(label="Track A 字幕")
965
+ # vtt_b = gr.Dropdown(label="Track B 字幕")
966
+
967
+ # btn_scan.click(
968
+ # scan_dataset,
969
+ # inputs=[repo_id, repo_type],
970
+ # outputs=[media_a, media_b, vtt_a, vtt_b],
971
+ # )
972
+
973
+ # th = gr.Slider(0.3, 5.0, value=DEFAULT_MAX_MID_DIFF, step=0.1, label="对齐阈值(秒)")
974
+ # btn_align = gr.Button("加载并对齐", variant="primary")
975
+
976
+ # df = gr.Dataframe(
977
+ # headers=["#", "A Time", "B Time", "Track A", "Track B"],
978
+ # interactive=True, # can be sorted/edited; mapping is stable due to idx_map
979
+ # wrap=True,
980
+ # max_height=520,
981
+ # )
982
+
983
+ # with gr.Row():
984
+ # crop_mode = gr.Radio(
985
+ # choices=["per_track", "global"],
986
+ # value="per_track",
987
+ # label="裁剪方式(建议 per_track)",
988
+ # )
989
+ # offset_a = gr.Slider(-20, 20, value=0.0, step=0.05, label="Track A 时间偏移(s)")
990
+ # offset_b = gr.Slider(-20, 20, value=0.0, step=0.05, label="Track B 时间偏移(s)")
991
+
992
+ # with gr.Row():
993
+ # a_out = gr.Audio(label="Track A 片段", type="numpy")
994
+ # b_out = gr.Audio(label="Track B 片段", type="numpy")
995
+
996
+ # play_info = gr.JSON(label="当前片段")
997
+
998
+ # btn_align.click(
999
+ # load_and_align,
1000
+ # inputs=[repo_id, repo_type, media_a, media_b, vtt_a, vtt_b, th],
1001
+ # outputs=[df, state, a_out, b_out, play_info],
1002
+ # )
1003
+
1004
+ # df.select(
1005
+ # play_on_select,
1006
+ # inputs=[df, crop_mode, offset_a, offset_b, state],
1007
+ # outputs=[a_out, b_out, play_info],
1008
+ # )
1009
+
1010
+ # if __name__ == "__main__":
1011
+ # demo.launch()
1012
+
1013
  import re
1014
  from dataclasses import dataclass
1015
  from typing import Any, Dict, List, Tuple, Optional
 
1391
 
1392
  if __name__ == "__main__":
1393
  demo.launch()