ZTXRiley commited on
Commit
28bef82
·
verified ·
1 Parent(s): 67223f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1057 -429
app.py CHANGED
@@ -1,481 +1,1097 @@
1
- # import json
2
- # import io
3
  # import os
4
- # from typing import Dict, Any, Tuple, Optional, List
 
 
 
 
5
 
6
  # import gradio as gr
7
-
8
- # # Optional deps
9
  # import numpy as np
10
-
11
- # try:
12
- # import pandas as pd
13
- # except Exception as e:
14
- # pd = None
15
-
16
- # # Audio backends
17
- # _AUDIO_BACKEND = None
18
- # try:
19
- # import soundfile as sf # best for wav/flac/ogg
20
- # _AUDIO_BACKEND = "soundfile"
21
- # except Exception:
22
- # sf = None
23
-
24
- # try:
25
- # from pydub import AudioSegment # fallback, works well with ffmpeg
26
- # _AUDIO_BACKEND = _AUDIO_BACKEND or "pydub"
27
- # except Exception:
28
- # AudioSegment = None
29
-
30
-
31
- # def _load_json(json_path: str) -> Dict[str, Any]:
32
- # with open(json_path, "r", encoding="utf-8") as f:
33
- # return json.load(f)
34
-
35
-
36
- # def _normalize_segments(data: Dict[str, Any]) -> List[Dict[str, Any]]:
37
- # segs = data.get("segments", [])
38
- # norm = []
39
- # for i, s in enumerate(segs):
40
- # # Some files have index only for valid segments; keep a stable row id
41
- # row_id = s.get("index", None)
42
- # if row_id is None:
43
- # row_id = i # fallback
44
-
45
- # norm.append({
46
- # "row_id": row_id,
47
- # "start": float(s.get("start", 0.0)),
48
- # "end": float(s.get("end", 0.0)),
49
- # "dur": float(s.get("end", 0.0)) - float(s.get("start", 0.0)),
50
- # "status": s.get("status", ""),
51
- # "speaker": s.get("speaker", ""),
52
- # "gender": s.get("gender", ""),
53
- # "age_group": s.get("age_group", ""),
54
- # "emotion": s.get("emotion", ""),
55
- # "text": s.get("text", "") or "",
56
- # })
57
- # return norm
58
-
59
-
60
- # def _load_audio_to_np(audio_path: str) -> Tuple[np.ndarray, int]:
61
- # """
62
- # Returns mono float32 audio in [-1, 1], sample_rate
63
- # """
64
- # if sf is not None:
65
- # try:
66
- # y, sr = sf.read(audio_path, always_2d=False)
67
- # if y.ndim == 2:
68
- # y = np.mean(y, axis=1) # to mono
69
- # y = y.astype(np.float32, copy=False)
70
- # # If integer dtype, normalize (soundfile typically returns float already)
71
- # if y.size > 0:
72
- # mx = np.max(np.abs(y))
73
- # if mx > 1.5: # heuristic for int-like
74
- # y = y / mx
75
- # return y, sr
76
- # except Exception:
77
- # pass
78
-
79
- # if AudioSegment is not None:
80
- # # Convert via pydub -> raw samples
81
- # seg = AudioSegment.from_file(audio_path)
82
- # sr = seg.frame_rate
83
- # samples = np.array(seg.get_array_of_samples())
84
- # if seg.channels > 1:
85
- # samples = samples.reshape((-1, seg.channels)).mean(axis=1)
86
- # # normalize by sample width
87
- # max_val = float(1 << (8 * seg.sample_width - 1))
88
- # y = (samples / max_val).astype(np.float32)
89
- # return y, sr
90
-
91
- # raise RuntimeError(
92
- # "Cannot load audio: install soundfile or pydub (with ffmpeg)."
93
- # )
94
-
95
-
96
- # def _slice_audio(y: np.ndarray, sr: int, start_s: float, end_s: float) -> Tuple[int, np.ndarray]:
97
- # start_i = max(0, int(round(start_s * sr)))
98
- # end_i = min(len(y), int(round(end_s * sr)))
99
- # if end_i <= start_i:
100
- # return sr, np.zeros((0,), dtype=np.float32)
101
- # return sr, y[start_i:end_i]
102
-
103
-
104
- # def _safe_unique(values: List[str]) -> List[str]:
105
- # out = []
106
- # seen = set()
107
- # for v in values:
108
- # v = v or ""
109
- # if v not in seen:
110
- # out.append(v)
111
- # seen.add(v)
112
- # return out
113
-
114
-
115
- # def _build_df(segments: List[Dict[str, Any]]):
116
- # if pd is None:
117
- # # Fallback: return list-of-lists for gr.Dataframe
118
- # headers = ["row_id", "start", "end", "dur", "status", "speaker", "gender", "age_group", "emotion", "text"]
119
- # rows = [[s.get(h, "") for h in headers] for s in segments]
120
- # return headers, rows, None
121
- # df = pd.DataFrame(segments)
122
- # # sort by start time
123
- # df = df.sort_values(["start", "end"]).reset_index(drop=True)
124
- # return df.columns.tolist(), df.values.tolist(), df
125
-
126
-
127
- # def _filter_segments(segments: List[Dict[str, Any]],
128
- # speaker: str, gender: str, age_group: str, emotion: str, status: str,
129
- # text_query: str) -> List[Dict[str, Any]]:
130
- # def ok(v: str, sel: str) -> bool:
131
- # if sel in (None, "", "ALL"):
132
- # return True
133
- # return (v or "") == sel
134
-
135
- # tq = (text_query or "").strip().lower()
136
-
137
- # out = []
138
- # for s in segments:
139
- # if not ok(s.get("speaker", ""), speaker):
140
  # continue
141
- # if not ok(s.get("gender", ""), gender):
 
 
 
 
 
142
  # continue
143
- # if not ok(s.get("age_group", ""), age_group):
 
144
  # continue
145
- # if not ok(s.get("emotion", ""), emotion):
 
 
146
  # continue
147
- # if not ok(s.get("status", ""), status):
 
 
148
  # continue
149
- # if tq:
150
- # if tq not in (s.get("text", "") or "").lower():
151
- # continue
152
- # out.append(s)
153
- # return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
- # def _make_waveform_plot(y: np.ndarray, sr: int, segments: List[Dict[str, Any]]):
157
- # # Optional: simple waveform plot with vertical lines on segment boundaries
158
- # try:
159
- # import matplotlib.pyplot as plt
160
- # except Exception:
161
- # return None
 
 
 
 
 
 
 
 
 
162
 
163
- # if y.size == 0:
164
- # return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
- # # downsample for plotting speed
167
- # max_points = 20000
168
- # if len(y) > max_points:
169
- # step = len(y) // max_points
170
- # y_plot = y[::step]
171
- # t = np.linspace(0, len(y) / sr, num=len(y_plot))
172
- # else:
173
- # y_plot = y
174
- # t = np.linspace(0, len(y) / sr, num=len(y))
175
-
176
- # fig = plt.figure(figsize=(10, 3))
177
- # ax = fig.add_subplot(111)
178
- # ax.plot(t, y_plot, linewidth=0.7)
179
- # ax.set_title("Waveform (segment boundaries)")
180
- # ax.set_xlabel("Time (s)")
181
- # ax.set_ylabel("Amplitude")
182
-
183
- # # add some boundaries (avoid too many lines)
184
- # if segments:
185
- # # show up to 120 boundaries to avoid clutter
186
- # boundaries = []
187
- # for s in segments:
188
- # boundaries.append(float(s.get("start", 0.0)))
189
- # boundaries.append(float(s.get("end", 0.0)))
190
- # boundaries = sorted(set(boundaries))
191
- # if len(boundaries) > 120:
192
- # # evenly sample
193
- # idx = np.linspace(0, len(boundaries) - 1, 120).astype(int)
194
- # boundaries = [boundaries[i] for i in idx]
195
-
196
- # for b in boundaries:
197
- # ax.axvline(b, linewidth=0.5, alpha=0.25)
198
-
199
- # fig.tight_layout()
200
- # return fig
201
-
202
-
203
- # def load_assets(audio_file, json_file):
204
- # if audio_file is None or json_file is None:
205
- # raise gr.Error("请同时上传音频文件和JSON字幕文件。")
206
-
207
- # audio_path = audio_file
208
- # json_path = json_file
209
-
210
- # data = _load_json(json_path)
211
- # segments = _normalize_segments(data)
212
-
213
- # # load audio
214
- # y, sr = _load_audio_to_np(audio_path)
215
-
216
- # # build options
217
- # speakers = ["ALL"] + _safe_unique([s.get("speaker", "") for s in segments if s.get("speaker", "")])
218
- # genders = ["ALL"] + _safe_unique([s.get("gender", "") for s in segments if s.get("gender", "")])
219
- # ages = ["ALL"] + _safe_unique([s.get("age_group", "") for s in segments if s.get("age_group", "")])
220
- # emotions = ["ALL"] + _safe_unique([s.get("emotion", "") for s in segments if s.get("emotion", "")])
221
- # statuses = ["ALL"] + _safe_unique([s.get("status", "") for s in segments if s.get("status", "")])
222
-
223
- # headers, rows, _ = _build_df(segments)
224
-
225
- # # waveform plot (optional)
226
- # fig = _make_waveform_plot(y, sr, segments)
227
-
228
- # # store in state
229
- # state = {
230
- # "audio_path": audio_path,
231
- # "json_path": json_path,
232
- # "audio_sr": sr,
233
- # "audio_y": y,
234
- # "segments": segments,
235
- # "headers": headers,
236
- # "rows_full": rows,
237
- # }
238
 
239
- # info_md = (
240
- # f"**audio_name**: `{data.get('audio_name','')}` \n"
241
- # f"**segments**: {len(segments)} \n"
242
- # f"**audio**: sr={sr}, duration≈{len(y)/sr:.2f}s \n"
243
- # f"**backend**: `{_AUDIO_BACKEND}`"
244
- # )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
- # return (
247
- # state,
248
- # gr.update(value=info_md),
249
- # gr.update(choices=speakers, value="ALL"),
250
- # gr.update(choices=genders, value="ALL"),
251
- # gr.update(choices=ages, value="ALL"),
252
- # gr.update(choices=emotions, value="ALL"),
253
- # gr.update(choices=statuses, value="ALL"),
254
- # gr.update(value=rows, headers=headers),
255
- # fig
256
- # )
 
 
 
 
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
- # def apply_filters(state, speaker, gender, age_group, emotion, status, text_query):
260
- # if not state:
261
- # raise gr.Error("请先加载音频和JSON。")
 
 
 
 
 
262
 
263
- # segments = state["segments"]
264
- # filtered = _filter_segments(segments, speaker, gender, age_group, emotion, status, text_query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
- # headers, rows, _ = _build_df(filtered)
 
 
 
 
 
 
267
 
268
- # state2 = dict(state)
269
- # state2["rows_filtered"] = rows
270
- # state2["headers_filtered"] = headers
271
- # state2["segments_filtered"] = filtered
272
 
273
- # return state2, gr.update(value=rows, headers=headers)
 
 
274
 
 
 
 
 
 
 
275
 
276
- # def _get_segment_by_row(state, row_data: List[Any]) -> Dict[str, Any]:
277
- # # row_data is one row from the displayed dataframe
278
- # # We reconstruct by column names
279
- # headers = None
280
- # if "headers_filtered" in state:
281
- # headers = state["headers_filtered"]
282
- # else:
283
- # headers = state["headers"]
284
-
285
- # row_map = {headers[i]: row_data[i] for i in range(min(len(headers), len(row_data)))}
286
-
287
- # # We try to match by (start,end,text,speaker) or row_id
288
- # target_row_id = row_map.get("row_id", None)
289
- # start = float(row_map.get("start", 0.0))
290
- # end = float(row_map.get("end", 0.0))
291
- # text = (row_map.get("text", "") or "")
292
-
293
- # candidates = state.get("segments_filtered", state["segments"])
294
- # for s in candidates:
295
- # if target_row_id is not None and str(s.get("row_id")) == str(target_row_id):
296
- # return s
297
-
298
- # # fallback match by time+text
299
- # for s in candidates:
300
- # if abs(float(s.get("start", 0.0)) - start) < 1e-3 and abs(float(s.get("end", 0.0)) - end) < 1e-3:
301
- # if (s.get("text", "") or "") == text:
302
- # return s
303
-
304
- # # last resort: return nearest start
305
- # best = None
306
- # best_d = 1e9
307
- # for s in candidates:
308
- # d = abs(float(s.get("start", 0.0)) - start)
309
- # if d < best_d:
310
- # best_d = d
311
- # best = s
312
- # return best or candidates[0]
313
-
314
-
315
- # def on_row_select(evt: gr.SelectData, state):
316
  # """
317
- # evt.value: selected cell value (not enough)
318
- # evt.index: (row, col)
319
- # We need to access the dataframe contents; Gradio passes it through component state in newer versions.
320
- # We'll rely on evt.row_value if available; else use a hidden state approach via gr.Dataframe 'value'.
321
  # """
322
- # if not state:
323
- # raise gr.Error("请先加载音频和JSON。")
324
-
325
- # row_value = getattr(evt, "row_value", None)
326
- # if row_value is None:
327
- # # If not available, we cannot slice; return a hint
328
- # return (
329
- # None,
330
- # gr.update(value="无法读取选中行数据(你的Gradio版本可能较旧)。建议升级gradio>=4。"),
331
- # gr.update(value=""),
332
- # )
333
 
334
- # seg = _get_segment_by_row(state, row_value)
335
-
336
- # y = state["audio_y"]
337
- # sr = state["audio_sr"]
338
- # start = float(seg.get("start", 0.0))
339
- # end = float(seg.get("end", 0.0))
340
- # sr_out, y_seg = _slice_audio(y, sr, start, end)
341
-
342
- # # Build display
343
- # md = (
344
- # f"### 选中片段\n"
345
- # f"- **time**: `{start:.3f}s` → `{end:.3f}s` (dur≈{end-start:.3f}s)\n"
346
- # f"- **status**: `{seg.get('status','')}`\n"
347
- # f"- **speaker**: `{seg.get('speaker','')}`\n"
348
- # f"- **gender**: `{seg.get('gender','')}`\n"
349
- # f"- **age_group**: `{seg.get('age_group','')}`\n"
350
- # f"- **emotion**: `{seg.get('emotion','')}`\n"
351
  # )
352
 
353
- # text = seg.get("text", "") or ""
354
- # if not text.strip():
355
- # text = "(empty)"
356
 
357
- # # Gradio Audio can take (sr, np.ndarray)
358
- # return ( (sr_out, y_seg), gr.update(value=md), gr.update(value=text) )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
 
 
360
 
361
- # def build_app():
362
- # with gr.Blocks(title="Audio Segment Explorer") as demo:
363
- # gr.Markdown(
364
- # "# 音频分段可视化与分段播放(Speaker / Gender / Age / Emotion)\n"
365
- # "上传音频 + JSON(包含 `segments`)即可查看分段列表,点击任意分段可直接播放该段音频。"
366
- # )
 
 
 
 
 
 
367
 
368
- # state = gr.State(value={})
369
-
370
- # with gr.Row():
371
- # audio_in = gr.Audio(label="上传音频文件", type="filepath")
372
- # json_in = gr.File(label="上传字幕JSON文件", file_types=[".json"])
373
-
374
- # load_btn = gr.Button("加载 / 解析", variant="primary")
375
- # info = gr.Markdown()
376
-
377
- # with gr.Row():
378
- # speaker_dd = gr.Dropdown(label="Speaker", choices=["ALL"], value="ALL")
379
- # gender_dd = gr.Dropdown(label="Gender", choices=["ALL"], value="ALL")
380
- # age_dd = gr.Dropdown(label="Age group", choices=["ALL"], value="ALL")
381
- # emotion_dd = gr.Dropdown(label="Emotion", choices=["ALL"], value="ALL")
382
- # status_dd = gr.Dropdown(label="Status", choices=["ALL"], value="ALL")
383
-
384
- # text_query = gr.Textbox(label="Text contains(可选)", placeholder="输入关键词过滤 text(支持阿拉伯语/英文)")
385
- # filter_btn = gr.Button("应用筛选")
386
-
387
- # with gr.Row():
388
- # df = gr.Dataframe(
389
- # label="Segments(点击行以播放该段)",
390
- # headers=["row_id", "start", "end", "dur", "status", "speaker", "gender", "age_group", "emotion", "text"],
391
- # datatype=["number", "number", "number", "number", "str", "str", "str", "str", "str", "str"],
392
- # wrap=True,
393
- # interactive=False,
394
- # max_height=420,
395
  # )
396
 
397
- # plot = gr.Plot(label="Waveform(可选)")
 
 
 
 
 
 
 
 
 
398
 
399
- # with gr.Row():
400
- # seg_audio = gr.Audio(label="分段播放", type="numpy")
401
- # seg_meta = gr.Markdown()
402
- # seg_text = gr.Textbox(label="该段转写文本", lines=4)
 
 
 
 
 
 
 
403
 
404
- # # events
405
- # load_btn.click(
406
- # fn=load_assets,
407
- # inputs=[audio_in, json_in],
408
- # outputs=[state, info, speaker_dd, gender_dd, age_dd, emotion_dd, status_dd, df, plot],
409
- # )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
 
411
- # filter_btn.click(
412
- # fn=apply_filters,
413
- # inputs=[state, speaker_dd, gender_dd, age_dd, emotion_dd, status_dd, text_query],
414
- # outputs=[state, df],
415
- # )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
 
417
- # df.select(
418
- # fn=on_row_select,
419
- # inputs=[state],
420
- # outputs=[seg_audio, seg_meta, seg_text],
421
- # )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
 
423
- # gr.Markdown(
424
- # "### 小提示\n"
425
- # "- `invalid` 片段通常是静音/噪声/无文本,你可以用 **Status=valid** 过滤。\n"
426
- # "- 如果音频是 mp3 且 `soundfile` 无法读取,代码会自动尝试 `pydub`(Spaces 一般有 ffmpeg)。"
427
- # )
428
 
429
- # return demo
 
 
 
 
 
430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
 
432
  # if __name__ == "__main__":
433
- # demo = build_app()
434
  # demo.launch()
435
 
 
436
  import json
437
  import numpy as np
438
  import gradio as gr
439
 
440
- from huggingface_hub import hf_hub_download, list_repo_files
 
 
 
441
 
442
  import soundfile as sf
443
 
444
- # =====================
445
- # 固定配置(你的数据)
446
- # =====================
 
447
  REPO_ID = "AlexTYJ/Multilingual-ASR-Benchmark"
 
 
448
  AUDIO_DIR = "audio/testbatch/ARE"
449
  JSON_DIR = "text/ref/testbatch/ARE"
450
 
451
 
452
- # =====================
453
  # 工具函数
454
- # =====================
455
 
456
  def list_are_audio_files():
457
- files = list_repo_files(REPO_ID)
 
 
 
 
 
 
 
458
  audio_files = [
459
  f for f in files
460
- if f.startswith(AUDIO_DIR) and f.lower().endswith((".wav", ".mp3", ".flac"))
 
461
  ]
462
  audio_files.sort()
463
  return audio_files
464
 
465
 
466
- def load_audio_and_json(audio_path):
467
- # ---- 推导 json 路径 ----
 
 
 
 
 
468
  filename = audio_path.split("/")[-1]
469
- json_path = f"{JSON_DIR}/{filename.replace('.wav', '.json').replace('.mp3', '.json').replace('.flac', '.json')}"
 
470
 
471
  # ---- 下载 ----
472
- local_audio = hf_hub_download(REPO_ID, audio_path)
473
- local_json = hf_hub_download(REPO_ID, json_path)
 
 
 
 
 
 
 
 
 
474
 
475
  # ---- 读音频 ----
476
  audio, sr = sf.read(local_audio)
477
  if audio.ndim == 2:
478
  audio = audio.mean(axis=1)
 
479
 
480
  # ---- 读 JSON ----
481
  with open(local_json, "r", encoding="utf-8") as f:
@@ -485,9 +1101,9 @@ def load_audio_and_json(audio_path):
485
  for i, s in enumerate(data["segments"]):
486
  segments.append({
487
  "row_id": s.get("index", i),
488
- "start": float(s["start"]),
489
- "end": float(s["end"]),
490
- "dur": float(s["end"] - s["start"]),
491
  "status": s.get("status", ""),
492
  "speaker": s.get("speaker", ""),
493
  "gender": s.get("gender", ""),
@@ -500,14 +1116,16 @@ def load_audio_and_json(audio_path):
500
 
501
 
502
  def slice_audio(audio, sr, start, end):
503
- return sr, audio[int(start * sr): int(end * sr)]
 
 
504
 
505
 
506
- # =====================
507
- # Gradio 交互逻辑
508
- # =====================
509
 
510
- def on_select_file(audio_path):
511
  audio, sr, segments, audio_name = load_audio_and_json(audio_path)
512
 
513
  rows = [
@@ -529,7 +1147,7 @@ def on_select_file(audio_path):
529
  state = {
530
  "audio": audio,
531
  "sr": sr,
532
- "segments": segments
533
  }
534
 
535
  return state, rows, info
@@ -537,28 +1155,39 @@ def on_select_file(audio_path):
537
 
538
  def on_select_segment(evt: gr.SelectData, state):
539
  row = evt.row_value
540
- start, end = float(row[1]), float(row[2])
541
 
542
- sr, audio_seg = slice_audio(state["audio"], state["sr"], start, end)
 
 
 
 
 
 
 
 
543
 
544
  meta = (
545
- f"- **speaker**: {row[5]}\n"
546
- f"- **gender**: {row[6]}\n"
547
- f"- **age_group**: {row[7]}\n"
548
- f"- **emotion**: {row[8]}"
 
549
  )
550
 
551
- return (sr, audio_seg), meta, row[9]
 
 
552
 
553
 
554
- # =====================
555
  # UI
556
- # =====================
557
 
558
  with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
559
  gr.Markdown(
560
- "# 🎧 ARE 音频 & 字幕可视化(Hugging Face Dataset)\n"
561
- "数据来源:`AlexTYJ/Multilingual-ASR-Benchmark`"
 
562
  )
563
 
564
  state = gr.State()
@@ -566,12 +1195,12 @@ with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
566
  audio_files = list_are_audio_files()
567
 
568
  audio_selector = gr.Dropdown(
569
- choices=audio_files,
570
  label="选择音频文件(ARE)",
571
- value=audio_files[0] if audio_files else None
 
572
  )
573
 
574
- load_btn = gr.Button("加载", variant="primary")
575
  info = gr.Markdown()
576
 
577
  df = gr.Dataframe(
@@ -592,7 +1221,7 @@ with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
592
  text = gr.Textbox(label="字幕文本", lines=4)
593
 
594
  load_btn.click(
595
- on_select_file,
596
  inputs=audio_selector,
597
  outputs=[state, df, info],
598
  )
@@ -603,5 +1232,4 @@ with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
603
  outputs=[audio_out, meta, text],
604
  )
605
 
606
- demo.launch()
607
-
 
1
+ # import re
 
2
  # import os
3
+ # import json
4
+ # import math
5
+ # import tempfile
6
+ # from dataclasses import dataclass
7
+ # from typing import List, Optional, Tuple, Dict, Any
8
 
9
  # import gradio as gr
 
 
10
  # import numpy as np
11
+ # import srt
12
+ # from pydub import AudioSegment
13
+
14
+ # from langdetect import DetectorFactory, detect_langs
15
+ # DetectorFactory.seed = 0 # deterministic
16
+
17
+
18
+ # # -----------------------------
19
+ # # Data structures
20
+ # # -----------------------------
21
+ # @dataclass
22
+ # class Cue:
23
+ # start: float
24
+ # end: float
25
+ # text: str
26
+
27
+
28
+ # # -----------------------------
29
+ # # Language utilities
30
+ # # -----------------------------
31
+ # LANG_LABELS = ["原文(不指定)", "中文", "English", "日本語"]
32
+
33
+ # def normalize_lang_code(code: str) -> str:
34
+ # c = (code or "").lower()
35
+ # if c.startswith("zh"):
36
+ # return "中文"
37
+ # if c == "en":
38
+ # return "English"
39
+ # if c == "ja":
40
+ # return "日本語"
41
+ # return "其他"
42
+
43
+ # def detect_language_label(text: str) -> Dict[str, Any]:
44
+ # t = (text or "").strip()
45
+ # if not t:
46
+ # return {"raw_code": None, "raw_prob": None, "label": None, "normalized_label": None}
47
+ # t = t.replace("\n", " ").strip()[:400]
48
+ # try:
49
+ # langs = detect_langs(t)
50
+ # if not langs:
51
+ # return {"raw_code": None, "raw_prob": None, "label": None, "normalized_label": None}
52
+ # top = langs[0]
53
+ # raw_code = getattr(top, "lang", None)
54
+ # raw_prob = float(getattr(top, "prob", 0.0)) if top else None
55
+ # norm = normalize_lang_code(raw_code)
56
+ # return {
57
+ # "raw_code": raw_code,
58
+ # "raw_prob": round(raw_prob, 3) if raw_prob is not None else None,
59
+ # "label": norm if norm != "其他" else "原文(不指定)",
60
+ # "normalized_label": norm,
61
+ # }
62
+ # except Exception:
63
+ # return {"raw_code": None, "raw_prob": None, "label": None, "normalized_label": None}
64
+
65
+ # def check_declared_mismatch(declared: str, detected_norm: Optional[str]) -> bool:
66
+ # if declared == "原文(不指定)":
67
+ # return False
68
+ # if detected_norm is None:
69
+ # return True
70
+ # if detected_norm == "其他":
71
+ # return True
72
+ # return detected_norm != declared
73
+
74
+ # def analyze_language_for_cues(cues: List[Cue]) -> Dict[str, Any]:
75
+ # counts = {"中文": 0, "English": 0, "日本語": 0, "其他": 0}
76
+ # for c in cues:
77
+ # d = detect_language_label(c.text)
78
+ # norm = d.get("normalized_label")
79
+ # if norm in counts:
80
+ # counts[norm] += 1
81
+ # dominant = max(counts.items(), key=lambda x: x[1])[0] if cues else None
82
+ # return {"counts": counts, "dominant_norm": dominant}
83
+
84
+
85
+ # # -----------------------------
86
+ # # Subtitle parsing
87
+ # # -----------------------------
88
+ # _TAG_RE = re.compile(r"</?[^>]+?>", re.IGNORECASE)
89
+ # _VTT_TIME_RE = re.compile(
90
+ # r"(?P<start>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})\s*-->\s*"
91
+ # r"(?P<end>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})"
92
+ # )
93
+
94
+ # def _strip_tags(text: str) -> str:
95
+ # text = _TAG_RE.sub("", text)
96
+ # text = text.replace("<c>", "").replace("</c>", "")
97
+ # return text.strip()
98
+
99
+ # def _time_to_seconds(t: str) -> float:
100
+ # t = t.strip().split()[0]
101
+ # parts = t.split(":")
102
+ # if len(parts) == 3:
103
+ # h = int(parts[0])
104
+ # m = int(parts[1])
105
+ # s = float(parts[2])
106
+ # return h * 3600 + m * 60 + s
107
+ # if len(parts) == 2:
108
+ # m = int(parts[0])
109
+ # s = float(parts[1])
110
+ # return m * 60 + s
111
+ # raise ValueError(f"Unsupported time format: {t}")
112
+
113
+ # def parse_vtt(content: str) -> List[Cue]:
114
+ # content = content.replace("\ufeff", "")
115
+ # content = re.sub(r"^\s*WEBVTT.*?\n", "", content, flags=re.IGNORECASE)
116
+ # blocks = re.split(r"\r?\n\r?\n", content.strip())
117
+ # cues: List[Cue] = []
118
+ # for block in blocks:
119
+ # lines = [ln.rstrip("\n") for ln in re.split(r"\r?\n", block) if ln.strip() != ""]
120
+ # if not lines:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  # continue
122
+ # time_line_idx = None
123
+ # for idx in range(min(2, len(lines))):
124
+ # if "-->" in lines[idx]:
125
+ # time_line_idx = idx
126
+ # break
127
+ # if time_line_idx is None:
128
  # continue
129
+ # m = _VTT_TIME_RE.search(lines[time_line_idx])
130
+ # if not m:
131
  # continue
132
+ # start = _time_to_seconds(m.group("start"))
133
+ # end = _time_to_seconds(m.group("end"))
134
+ # if not (math.isfinite(start) and math.isfinite(end)) or end <= start:
135
  # continue
136
+ # text_lines = lines[time_line_idx + 1 :]
137
+ # text = _strip_tags("\n".join(text_lines)).strip()
138
+ # if not text:
139
  # continue
140
+ # cues.append(Cue(start=start, end=end, text=text))
141
+ # cues.sort(key=lambda x: x.start)
142
+ # return cues
143
+
144
+ # def parse_srt(content: str) -> List[Cue]:
145
+ # content = content.replace("\ufeff", "")
146
+ # subs = list(srt.parse(content))
147
+ # cues: List[Cue] = []
148
+ # for sub in subs:
149
+ # cues.append(
150
+ # Cue(
151
+ # start=sub.start.total_seconds(),
152
+ # end=sub.end.total_seconds(),
153
+ # text=sub.content.strip(),
154
+ # )
155
+ # )
156
+ # cues.sort(key=lambda x: x.start)
157
+ # return cues
158
+
159
+ # def parse_subtitle_file(path: Optional[str]) -> List[Cue]:
160
+ # if not path:
161
+ # return []
162
+ # with open(path, "r", encoding="utf-8") as f:
163
+ # content = f.read()
164
+ # head = content.lstrip()[:80].upper()
165
+ # ext = os.path.splitext(path)[1].lower()
166
+ # if "WEBVTT" in head or ext == ".vtt":
167
+ # return parse_vtt(content)
168
+ # return parse_srt(content)
169
+
170
+
171
+ # # -----------------------------
172
+ # # Alignment
173
+ # # -----------------------------
174
+ # def align_by_time(
175
+ # a: List[Cue],
176
+ # b: List[Cue],
177
+ # max_mid_diff: float = 1.5,
178
+ # ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
179
+ # aligned: List[Dict[str, Any]] = []
180
+ # i, j = 0, 0
181
+ # matched_a = set()
182
+ # matched_b = set()
183
+
184
+ # while i < len(a) and j < len(b):
185
+ # x = a[i]
186
+ # y = b[j]
187
+ # mid_x = (x.start + x.end) / 2
188
+ # mid_y = (y.start + y.end) / 2
189
+ # diff = mid_x - mid_y
190
+
191
+ # if abs(diff) <= max_mid_diff:
192
+ # # global window for "corresponding part"
193
+ # g_start = min(x.start, y.start)
194
+ # g_end = max(x.end, y.end)
195
+ # aligned.append(
196
+ # {
197
+ # "idx": len(aligned) + 1,
198
+ # "start": g_start,
199
+ # "end": g_end,
200
+ # "en_start": x.start,
201
+ # "en_end": x.end,
202
+ # "zh_start": y.start,
203
+ # "zh_end": y.end,
204
+ # "text_en": x.text,
205
+ # "text_zh": y.text,
206
+ # }
207
+ # )
208
+ # matched_a.add(i)
209
+ # matched_b.add(j)
210
+ # i += 1
211
+ # j += 1
212
+ # elif diff < 0:
213
+ # i += 1
214
+ # else:
215
+ # j += 1
216
+
217
+ # stats = {
218
+ # "trackA_total": len(a),
219
+ # "trackB_total": len(b),
220
+ # "aligned_pairs": len(aligned),
221
+ # "trackA_unmatched": len(a) - len(matched_a),
222
+ # "trackB_unmatched": len(b) - len(matched_b),
223
+ # "max_mid_diff_sec": max_mid_diff,
224
+ # }
225
+ # return aligned, stats
226
+
227
+
228
+ # # -----------------------------
229
+ # # Audio processing / QC
230
+ # # -----------------------------
231
+ # def load_audio(path: str) -> AudioSegment:
232
+ # return AudioSegment.from_file(path)
233
+
234
+ # def segment_to_wav_file(audio: AudioSegment, start_s: float, end_s: float) -> str:
235
+ # start_ms = max(0, int(start_s * 1000))
236
+ # end_ms = max(start_ms + 1, int(end_s * 1000))
237
+ # seg = audio[start_ms:end_ms]
238
+ # tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
239
+ # tmp.close()
240
+ # seg.export(tmp.name, format="wav")
241
+ # return tmp.name
242
+
243
+ # def compute_dbfs(audio: AudioSegment) -> float:
244
+ # v = audio.dBFS
245
+ # if v == float("-inf"):
246
+ # return -120.0
247
+ # return float(v)
248
+
249
+ # def qc_on_aligned_segments(
250
+ # audio_a: AudioSegment,
251
+ # audio_b: AudioSegment,
252
+ # aligned: List[Dict[str, Any]],
253
+ # silence_dbfs_threshold: float = -50.0,
254
+ # low_dbfs_threshold: float = -40.0,
255
+ # ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
256
+ # issues: List[Dict[str, Any]] = []
257
+ # a_levels = []
258
+ # b_levels = []
259
+
260
+ # for seg in aligned:
261
+ # a_start, a_end = seg["en_start"], seg["en_end"]
262
+ # b_start, b_end = seg["zh_start"], seg["zh_end"]
263
+
264
+ # a = audio_a[int(a_start * 1000) : int(a_end * 1000)]
265
+ # b = audio_b[int(b_start * 1000) : int(b_end * 1000)]
266
+
267
+ # a_dbfs = compute_dbfs(a)
268
+ # b_dbfs = compute_dbfs(b)
269
+
270
+ # a_levels.append(a_dbfs)
271
+ # b_levels.append(b_dbfs)
272
+
273
+ # problems = []
274
+ # if a_dbfs <= silence_dbfs_threshold:
275
+ # problems.append("Track A 静音/近静音")
276
+ # elif a_dbfs <= low_dbfs_threshold:
277
+ # problems.append("Track A 音量偏低")
278
+
279
+ # if b_dbfs <= silence_dbfs_threshold:
280
+ # problems.append("Track B 静音/近静音")
281
+ # elif b_dbfs <= low_dbfs_threshold:
282
+ # problems.append("Track B 音量偏低")
283
+
284
+ # if abs(a_dbfs - b_dbfs) >= 12.0:
285
+ # problems.append("两路音量差异过大(≥12dB)")
286
+
287
+ # if problems:
288
+ # issues.append(
289
+ # {
290
+ # "segment": seg["idx"],
291
+ # "time": f"{seg['start']:.2f}-{seg['end']:.2f}s",
292
+ # "A_time": f"{a_start:.2f}-{a_end:.2f}s",
293
+ # "B_time": f"{b_start:.2f}-{b_end:.2f}s",
294
+ # "problems": problems,
295
+ # "text_A": seg["text_en"][:120] + ("..." if len(seg["text_en"]) > 120 else ""),
296
+ # "text_B": seg["text_zh"][:120] + ("..." if len(seg["text_zh"]) > 120 else ""),
297
+ # "A_dbfs": round(a_dbfs, 1),
298
+ # "B_dbfs": round(b_dbfs, 1),
299
+ # }
300
+ # )
301
 
302
+ # def _safe_mean(xs):
303
+ # return float(np.mean(xs)) if xs else 0.0
304
+
305
+ # qc_stats = {
306
+ # "aligned_pairs": len(aligned),
307
+ # "issue_segments": len(issues),
308
+ # "A_avg_dbfs": round(_safe_mean(a_levels), 1),
309
+ # "B_avg_dbfs": round(_safe_mean(b_levels), 1),
310
+ # "A_min_dbfs": round(float(np.min(a_levels)), 1) if a_levels else None,
311
+ # "B_min_dbfs": round(float(np.min(b_levels)), 1) if b_levels else None,
312
+ # "silence_dbfs_threshold": silence_dbfs_threshold,
313
+ # "low_dbfs_threshold": low_dbfs_threshold,
314
+ # }
315
+ # return issues, qc_stats
316
+
317
+
318
+ # # -----------------------------
319
+ # # Interactive HTML table (buttons per row)
320
+ # # -----------------------------
321
+ # def _escape_html(s: str) -> str:
322
+ # return (s or "").replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
323
+
324
+ # def build_interactive_table_html(aligned: List[Dict[str, Any]]) -> str:
325
+ # max_rows = 2000
326
+ # aligned_view = aligned[:max_rows]
327
+
328
+ # rows_html = []
329
+ # for seg in aligned_view:
330
+ # idx = seg["idx"]
331
+ # t = f'{seg["start"]:.2f}-{seg["end"]:.2f}'
332
+ # a_t = f'{seg["en_start"]:.2f}-{seg["en_end"]:.2f}'
333
+ # b_t = f'{seg["zh_start"]:.2f}-{seg["zh_end"]:.2f}'
334
+ # a_txt = _escape_html(seg["text_en"])
335
+ # b_txt = _escape_html(seg["text_zh"])
336
+
337
+ # a_lang = _escape_html(seg.get("en_lang_label") or "-")
338
+ # b_lang = _escape_html(seg.get("zh_lang_label") or "-")
339
+ # a_conf = seg.get("en_lang_prob")
340
+ # b_conf = seg.get("zh_lang_prob")
341
+ # a_conf_s = f"{a_conf:.3f}" if isinstance(a_conf, (int, float)) else "-"
342
+ # b_conf_s = f"{b_conf:.3f}" if isinstance(b_conf, (int, float)) else "-"
343
+
344
+ # a_bad = seg.get("en_lang_mismatch", False)
345
+ # b_bad = seg.get("zh_lang_mismatch", False)
346
+ # a_cls = "bad" if a_bad else "ok"
347
+ # b_cls = "bad" if b_bad else "ok"
348
+
349
+ # btns = (
350
+ # f'<button class="segbtn a" onclick="__playSeg({idx}, \'a\')">A</button>'
351
+ # f'<button class="segbtn b" onclick="__playSeg({idx}, \'b\')">B</button>'
352
+ # f'<button class="segbtn both" onclick="__playSeg({idx}, \'both\')">同时</button>'
353
+ # )
354
 
355
+ # rows_html.append(
356
+ # f"""
357
+ # <tr>
358
+ # <td class="mono">{idx:04d}</td>
359
+ # <td class="mono">{t}</td>
360
+ # <td class="btncell">{btns}</td>
361
+ # <td class="textcell">{a_txt}</td>
362
+ # <td class="textcell">{b_txt}</td>
363
+ # <td class="mono">{a_t}</td>
364
+ # <td class="mono">{b_t}</td>
365
+ # <td class="mono {a_cls}">{a_lang} ({a_conf_s})</td>
366
+ # <td class="mono {b_cls}">{b_lang} ({b_conf_s})</td>
367
+ # </tr>
368
+ # """.strip()
369
+ # )
370
 
371
+ # # IMPORTANT FIX:
372
+ # # - __qs must be `#${id}`, not `#${{id}}`
373
+ # # - action_box/action_btn must exist in DOM (we keep them and hide via CSS)
374
+ # # - Wait for audio src to change before play (avoid playing old segment)
375
+ # table_html = f"""
376
+ # <div class="segwrap">
377
+ # <div class="note">
378
+ # 点击每行按钮可生成并播放片段(A / B / 同时)。若浏览器阻止自动播放,请在右侧播放器手动点一次播放键。
379
+ # </div>
380
+ # <div class="tablewrap">
381
+ # <table class="segtable">
382
+ # <thead>
383
+ # <tr>
384
+ # <th>#</th>
385
+ # <th>Global(s)</th>
386
+ # <th>Play</th>
387
+ # <th>Track A</th>
388
+ # <th>Track B</th>
389
+ # <th>A time</th>
390
+ # <th>B time</th>
391
+ # <th>A Lang</th>
392
+ # <th>B Lang</th>
393
+ # </tr>
394
+ # </thead>
395
+ # <tbody>
396
+ # {"".join(rows_html)}
397
+ # </tbody>
398
+ # </table>
399
+ # </div>
400
+ # </div>
401
+
402
+ # <style>
403
+ # .segwrap {{ margin-top: 8px; }}
404
+ # .note {{ font-size: 12px; opacity: 0.85; margin-bottom: 8px; }}
405
+ # .tablewrap {{ overflow: auto; max-height: 560px; border: 1px solid rgba(127,127,127,0.25); border-radius: 8px; }}
406
+ # table.segtable {{ width: 100%; border-collapse: collapse; }}
407
+ # table.segtable th, table.segtable td {{ border-bottom: 1px solid rgba(127,127,127,0.18); padding: 8px; vertical-align: top; }}
408
+ # table.segtable thead th {{ position: sticky; top: 0; background: rgba(250,250,250,0.95); z-index: 1; }}
409
+ # .mono {{ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; white-space: nowrap; }}
410
+ # .textcell {{ min-width: 320px; white-space: pre-wrap; }}
411
+ # .btncell {{ white-space: nowrap; min-width: 150px; }}
412
+ # .segbtn {{
413
+ # padding: 4px 10px;
414
+ # margin-right: 6px;
415
+ # border-radius: 8px;
416
+ # border: 1px solid rgba(127,127,127,0.35);
417
+ # background: white;
418
+ # cursor: pointer;
419
+ # font-size: 12px;
420
+ # }}
421
+ # .segbtn:hover {{ background: rgba(0,0,0,0.04); }}
422
+ # .segbtn.both {{ font-weight: 800; }}
423
+ # .ok {{ color: inherit; }}
424
+ # .bad {{ color: #b00020; font-weight: 800; }}
425
+ # </style>
426
+
427
+ # <script>
428
+ # function __gradioAppRoot() {{
429
+ # const ga = document.querySelector("gradio-app");
430
+ # return ga ? ga.shadowRoot : document;
431
+ # }}
432
+
433
+ # function __qs(id) {{
434
+ # const root = __gradioAppRoot();
435
+ # return root.querySelector(`#${{id}}`);
436
+ # }}
437
+
438
+ # function __setTextboxValue(elemId, value) {{
439
+ # const box = __qs(elemId);
440
+ # if (!box) return false;
441
+ # const input = box.querySelector("textarea, input");
442
+ # if (!input) return false;
443
+ # input.value = value;
444
+ # input.dispatchEvent(new Event("input", {{ bubbles: true }}));
445
+ # return true;
446
+ # }}
447
+
448
+ # function __clickButton(elemId) {{
449
+ # const btn = __qs(elemId);
450
+ # if (!btn) return false;
451
+ # const realBtn = btn.querySelector("button");
452
+ # if (!realBtn) return false;
453
+ # realBtn.click();
454
+ # return true;
455
+ # }}
456
+
457
+ # function __getAudioTag(containerElemId) {{
458
+ # const c = __qs(containerElemId);
459
+ # if (!c) return null;
460
+ # return c.querySelector("audio");
461
+ # }}
462
+
463
+ # window.__desiredPlayMode = null; // "a" | "b" | "both"
464
+ # window.__desiredPlayNonce = 0;
465
+
466
+ # function __waitAndPlay(nonce, prevASrc, prevBSrc) {{
467
+ # let tries = 0;
468
+ # const maxTries = 80; // ~16s
469
+ # const intervalMs = 200;
470
+
471
+ # const timer = setInterval(() => {{
472
+ # tries += 1;
473
+ # if (window.__desiredPlayNonce !== nonce) {{
474
+ # clearInterval(timer);
475
+ # return;
476
+ # }}
477
+
478
+ # const aAudio = __getAudioTag("a_audio_seg");
479
+ # const bAudio = __getAudioTag("b_audio_seg");
480
+
481
+ # if (!aAudio && !bAudio) {{
482
+ # if (tries >= maxTries) clearInterval(timer);
483
+ # return;
484
+ # }}
485
+
486
+ # const mode = window.__desiredPlayMode;
487
+
488
+ # const aReady = aAudio && (aAudio.src && aAudio.src !== prevASrc) && aAudio.readyState >= 2;
489
+ # const bReady = bAudio && (bAudio.src && bAudio.src !== prevBSrc) && bAudio.readyState >= 2;
490
+
491
+ # const canPlay =
492
+ # (mode === "a" && aReady) ||
493
+ # (mode === "b" && bReady) ||
494
+ # (mode === "both" && aReady && bReady);
495
+
496
+ # if (canPlay) {{
497
+ # try {{
498
+ # if (mode === "a") {{
499
+ # if (bAudio) bAudio.pause();
500
+ # aAudio.currentTime = 0;
501
+ # aAudio.play();
502
+ # }} else if (mode === "b") {{
503
+ # if (aAudio) aAudio.pause();
504
+ # bAudio.currentTime = 0;
505
+ # bAudio.play();
506
+ # }} else if (mode === "both") {{
507
+ # aAudio.currentTime = 0;
508
+ # bAudio.currentTime = 0;
509
+ # aAudio.play();
510
+ # bAudio.play();
511
+ # }}
512
+ # }} catch (e) {{}}
513
+ # clearInterval(timer);
514
+ # return;
515
+ # }}
516
+
517
+ # if (tries >= maxTries) {{
518
+ # clearInterval(timer);
519
+ # }}
520
+ # }}, intervalMs);
521
+ # }}
522
+
523
+ # function __playSeg(idx, mode) {{
524
+ # window.__desiredPlayMode = mode;
525
+ # window.__desiredPlayNonce += 1;
526
+ # const nonce = window.__desiredPlayNonce;
527
+
528
+ # const aAudio = __getAudioTag("a_audio_seg");
529
+ # const bAudio = __getAudioTag("b_audio_seg");
530
+ # const prevASrc = aAudio ? aAudio.src : "";
531
+ # const prevBSrc = bAudio ? bAudio.src : "";
532
+
533
+ # const payload = JSON.stringify({{ idx: idx, mode: mode }});
534
+ # const ok1 = __setTextboxValue("action_box", payload);
535
+ # const ok2 = __clickButton("action_btn");
536
+
537
+ # if (ok1 && ok2) {{
538
+ # __waitAndPlay(nonce, prevASrc, prevBSrc);
539
+ # }} else {{
540
+ # console.warn("Failed to trigger backend action.", ok1, ok2);
541
+ # }}
542
+ # }}
543
+ # </script>
544
+ # """.strip()
545
+
546
+ # if len(aligned) > max_rows:
547
+ # table_html = (
548
+ # f"<div style='margin:8px 0;font-size:12px;opacity:.85;'>"
549
+ # f"提示:对齐片段数为 {len(aligned)},为保证渲染性能仅展示前 {max_rows} 行。</div>\n"
550
+ # + table_html
551
+ # )
552
+ # return table_html
553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
 
555
+ # # -----------------------------
556
+ # # Segment generation (core fix: global window + offset)
557
+ # # -----------------------------
558
+ # def _pick_window(seg: Dict[str, Any], mode: str, offset_a: float, offset_b: float) -> Tuple[float, float, float, float]:
559
+ # """
560
+ # mode:
561
+ # - "global": use seg["start"]/["end"] for BOTH tracks
562
+ # - "per_track": use seg["en_start"]/["en_end"] for A, seg["zh_start"]/["zh_end"] for B
563
+ # offsets are applied per track.
564
+ # Returns: (a_start, a_end, b_start, b_end)
565
+ # """
566
+ # if mode == "per_track":
567
+ # a_start, a_end = seg["en_start"], seg["en_end"]
568
+ # b_start, b_end = seg["zh_start"], seg["zh_end"]
569
+ # else:
570
+ # # global window (recommended)
571
+ # a_start, a_end = seg["start"], seg["end"]
572
+ # b_start, b_end = seg["start"], seg["end"]
573
+
574
+ # a_start = max(0.0, a_start + float(offset_a))
575
+ # a_end = max(a_start + 0.01, a_end + float(offset_a))
576
+ # b_start = max(0.0, b_start + float(offset_b))
577
+ # b_end = max(b_start + 0.01, b_end + float(offset_b))
578
+ # return a_start, a_end, b_start, b_end
579
+
580
+
581
+ # # -----------------------------
582
+ # # Gradio callbacks
583
+ # # -----------------------------
584
+ # def parse_align_and_qc(
585
+ # audio_a_path: Optional[str],
586
+ # audio_b_path: Optional[str],
587
+ # sub_a_path: Optional[str],
588
+ # sub_b_path: Optional[str],
589
+ # declared_a_lang: str,
590
+ # declared_b_lang: str,
591
+ # crop_mode: str,
592
+ # offset_a: float,
593
+ # offset_b: float,
594
+ # max_mid_diff: float,
595
+ # silence_th: float,
596
+ # low_th: float,
597
+ # ):
598
+ # if not audio_a_path or not audio_b_path:
599
+ # return (
600
+ # [],
601
+ # "",
602
+ # {"error": "请同时上传 Track A 与 Track B 音频。"},
603
+ # [],
604
+ # None,
605
+ # None,
606
+ # gr.update(choices=[], value=None),
607
+ # None,
608
+ # )
609
 
610
+ # try:
611
+ # cues_a = parse_subtitle_file(sub_a_path) if sub_a_path else []
612
+ # cues_b = parse_subtitle_file(sub_b_path) if sub_b_path else []
613
+
614
+ # if not cues_a or not cues_b:
615
+ # return (
616
+ # [],
617
+ # "",
618
+ # {"error": "请同时提供两路字幕(SRT/VTT)。", "A_cues": len(cues_a), "B_cues": len(cues_b)},
619
+ # [],
620
+ # audio_a_path,
621
+ # audio_b_path,
622
+ # gr.update(choices=[], value=None),
623
+ # None,
624
+ # )
625
 
626
+ # # overall language stats
627
+ # a_lang_stats = analyze_language_for_cues(cues_a)
628
+ # b_lang_stats = analyze_language_for_cues(cues_b)
629
+
630
+ # aligned, align_stats = align_by_time(cues_a, cues_b, max_mid_diff=max_mid_diff)
631
+
632
+ # # per-segment language + mismatch issues
633
+ # lang_mismatch_issues = []
634
+ # for seg in aligned:
635
+ # da = detect_language_label(seg["text_en"])
636
+ # db = detect_language_label(seg["text_zh"])
637
+ # seg["en_lang_label"] = da.get("label")
638
+ # seg["en_lang_prob"] = da.get("raw_prob")
639
+ # seg["en_lang_norm"] = da.get("normalized_label")
640
+ # seg["zh_lang_label"] = db.get("label")
641
+ # seg["zh_lang_prob"] = db.get("raw_prob")
642
+ # seg["zh_lang_norm"] = db.get("normalized_label")
643
+
644
+ # seg["en_lang_mismatch"] = check_declared_mismatch(declared_a_lang, seg["en_lang_norm"])
645
+ # seg["zh_lang_mismatch"] = check_declared_mismatch(declared_b_lang, seg["zh_lang_norm"])
646
+
647
+ # if seg["en_lang_mismatch"] or seg["zh_lang_mismatch"]:
648
+ # problems = []
649
+ # if seg["en_lang_mismatch"]:
650
+ # problems.append(f"Track A 声明={declared_a_lang} 检测={seg['en_lang_label']}")
651
+ # if seg["zh_lang_mismatch"]:
652
+ # problems.append(f"Track B 声明={declared_b_lang} 检测={seg['zh_lang_label']}")
653
+ # lang_mismatch_issues.append(
654
+ # {
655
+ # "segment": seg["idx"],
656
+ # "time": f"{seg['start']:.2f}-{seg['end']:.2f}s",
657
+ # "type": "LanguageMismatch",
658
+ # "problems": problems,
659
+ # "text_A": seg["text_en"][:120] + ("..." if len(seg["text_en"]) > 120 else ""),
660
+ # "text_B": seg["text_zh"][:120] + ("..." if len(seg["text_zh"]) > 120 else ""),
661
+ # }
662
+ # )
663
+
664
+ # # dataframe rows
665
+ # rows = []
666
+ # for seg in aligned:
667
+ # rows.append(
668
+ # [
669
+ # seg["idx"],
670
+ # f'{seg["start"]:.2f}',
671
+ # f'{seg["end"]:.2f}',
672
+ # seg["text_en"],
673
+ # seg["text_zh"],
674
+ # f'{seg["en_start"]:.2f}-{seg["en_end"]:.2f}',
675
+ # f'{seg["zh_start"]:.2f}-{seg["zh_end"]:.2f}',
676
+ # ]
677
+ # )
678
 
679
+ # # audio QC (still based on per-track subtitle times; keep original semantics)
680
+ # audio_a = load_audio(audio_a_path)
681
+ # audio_b = load_audio(audio_b_path)
682
+ # issues_qc, qc_stats = qc_on_aligned_segments(
683
+ # audio_a, audio_b, aligned,
684
+ # silence_dbfs_threshold=silence_th,
685
+ # low_dbfs_threshold=low_th
686
+ # )
687
 
688
+ # issues_all = lang_mismatch_issues + issues_qc
689
+
690
+ # stats = {
691
+ # "alignment": align_stats,
692
+ # "qc": qc_stats,
693
+ # "language": {
694
+ # "declared": {"TrackA": declared_a_lang, "TrackB": declared_b_lang},
695
+ # "detected_overall": {
696
+ # "TrackA_dominant_norm": a_lang_stats["dominant_norm"],
697
+ # "TrackB_dominant_norm": b_lang_stats["dominant_norm"],
698
+ # "TrackA_counts": a_lang_stats["counts"],
699
+ # "TrackB_counts": b_lang_stats["counts"],
700
+ # },
701
+ # "segment_mismatch_count": len(lang_mismatch_issues),
702
+ # },
703
+ # "segment_crop": {
704
+ # "mode": crop_mode,
705
+ # "offset_a_sec": offset_a,
706
+ # "offset_b_sec": offset_b,
707
+ # "note": "若对比播放不对应:优先使用“对齐全局时间(推荐)”,并微调 Track A/B 时间偏移。",
708
+ # }
709
+ # }
710
+
711
+ # choices = [f'{seg["idx"]:04d} | {seg["start"]:.2f}-{seg["end"]:.2f}s' for seg in aligned]
712
+ # selector_update = gr.update(choices=choices, value=(choices[0] if choices else None))
713
+
714
+ # state = {
715
+ # "aligned": aligned,
716
+ # "audio_a_path": audio_a_path,
717
+ # "audio_b_path": audio_b_path,
718
+ # "declared": {"TrackA": declared_a_lang, "TrackB": declared_b_lang},
719
+ # "crop_mode": crop_mode,
720
+ # "offset_a": float(offset_a),
721
+ # "offset_b": float(offset_b),
722
+ # }
723
+
724
+ # html_table = build_interactive_table_html(aligned)
725
+
726
+ # return rows, html_table, stats, issues_all, audio_a_path, audio_b_path, selector_update, state
727
+
728
+ # except Exception as e:
729
+ # return (
730
+ # [],
731
+ # "",
732
+ # {"error": f"解析/对齐失败: {str(e)}"},
733
+ # [{"error": str(e)}],
734
+ # audio_a_path,
735
+ # audio_b_path,
736
+ # gr.update(choices=[], value=None),
737
+ # None,
738
+ # )
739
 
740
+ # def _parse_selector_value(v: Optional[str]) -> Optional[int]:
741
+ # if not v:
742
+ # return None
743
+ # m = re.match(r"^\s*(\d+)\s*\|", v)
744
+ # if not m:
745
+ # return None
746
+ # return int(m.group(1))
747
 
748
+ # def make_segment_audio_by_idx(idx: int, state: Optional[Dict[str, Any]]):
749
+ # if not state or "aligned" not in state:
750
+ # return None, None, {"error": "请先完成字幕解析与对齐。"}
 
751
 
752
+ # aligned = state["aligned"]
753
+ # if idx < 1 or idx > len(aligned):
754
+ # return None, None, {"error": "片段索引越界。"}
755
 
756
+ # seg = aligned[idx - 1]
757
+ # audio_a_path = state["audio_a_path"]
758
+ # audio_b_path = state["audio_b_path"]
759
+ # crop_mode = state.get("crop_mode", "global")
760
+ # offset_a = float(state.get("offset_a", 0.0))
761
+ # offset_b = float(state.get("offset_b", 0.0))
762
 
763
+ # try:
764
+ # audio_a = load_audio(audio_a_path)
765
+ # audio_b = load_audio(audio_b_path)
766
+
767
+ # a_start, a_end, b_start, b_end = _pick_window(seg, crop_mode, offset_a, offset_b)
768
+
769
+ # a_wav = segment_to_wav_file(audio_a, a_start, a_end)
770
+ # b_wav = segment_to_wav_file(audio_b, b_start, b_end)
771
+
772
+ # info = {
773
+ # "segment": idx,
774
+ # "global_time": f'{seg["start"]:.2f}-{seg["end"]:.2f}s',
775
+ # "crop_mode": crop_mode,
776
+ # "offset_a_sec": offset_a,
777
+ # "offset_b_sec": offset_b,
778
+ # "crop_A_time": f"{a_start:.2f}-{a_end:.2f}s",
779
+ # "crop_B_time": f"{b_start:.2f}-{b_end:.2f}s",
780
+ # "subtitle_A_time": f'{seg["en_start"]:.2f}-{seg["en_end"]:.2f}s',
781
+ # "subtitle_B_time": f'{seg["zh_start"]:.2f}-{seg["zh_end"]:.2f}s',
782
+ # "text_A": seg["text_en"],
783
+ # "text_B": seg["text_zh"],
784
+ # }
785
+ # return a_wav, b_wav, info
786
+ # except Exception as e:
787
+ # return None, None, {"error": f"生成片段音频失败: {str(e)}"}
788
+
789
+ # def make_segment_audio(selector_value: Optional[str], state: Optional[Dict[str, Any]]):
790
+ # idx = _parse_selector_value(selector_value)
791
+ # if idx is None:
792
+ # return None, None, {"error": "请选择一个有效片段。"}
793
+ # return make_segment_audio_by_idx(idx, state)
794
+
795
+ # def make_segment_audio_from_action(action_json: str, state: Optional[Dict[str, Any]]):
 
 
 
 
 
 
 
796
  # """
797
+ # action_json: {"idx": <int>, "mode": "a"/"b"/"both"}
798
+ # mode is only used by frontend for play policy; backend always returns both segment audios.
 
 
799
  # """
800
+ # try:
801
+ # payload = json.loads(action_json or "{}")
802
+ # idx = int(payload.get("idx"))
803
+ # except Exception:
804
+ # return None, None, {"error": "动作解析失败(action_json 无效)。"}
805
+ # return make_segment_audio_by_idx(idx, state)
 
 
 
 
 
806
 
807
+ # def clear_all():
808
+ # return (
809
+ # None, None, None, None,
810
+ # "原文(不指定)", "原文(不指定)",
811
+ # "global", 0.0, 0.0,
812
+ # [], "", {}, [],
813
+ # None, None,
814
+ # gr.update(choices=[], value=None),
815
+ # None,
816
+ # None, None, {},
817
+ # "",
 
 
 
 
 
 
818
  # )
819
 
 
 
 
820
 
821
+ # # -----------------------------
822
+ # # UI
823
+ # # -----------------------------
824
+ # CSS = """
825
+ # /* Keep components in DOM for injected JS to locate them */
826
+ # .dom-hidden { display: none !important; }
827
+ # """
828
+
829
+ # with gr.Blocks(css=CSS) as demo:
830
+ # gr.Markdown(
831
+ # """
832
+ # # 可视化语音质检平台(交互逐行播放 + 语言一致性 + 对齐裁剪)
833
+ # - 交互表格每行播放:A / B / 同时
834
+ # - “对齐全局时间(推荐)”可显著减少“对比不对应”的听感问题
835
+ # - “时间偏移”用于修正音频头部静音/时间轴偏差
836
+ # """.strip()
837
+ # )
838
 
839
+ # state = gr.State(value=None)
840
 
841
+ # # Must exist in DOM (NOT visible=False)
842
+ # action_box = gr.Textbox(
843
+ # label="__action_box",
844
+ # value="",
845
+ # elem_id="action_box",
846
+ # elem_classes=["dom-hidden"],
847
+ # )
848
+ # action_btn = gr.Button(
849
+ # "__action_btn",
850
+ # elem_id="action_btn",
851
+ # elem_classes=["dom-hidden"],
852
+ # )
853
 
854
+ # with gr.Row():
855
+ # with gr.Column(scale=1):
856
+ # gr.Markdown("## 1) 上传文件")
857
+ # audio_a = gr.File(
858
+ # label="Track A 音频/视频",
859
+ # file_types=[".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac", ".mp4", ".mov", ".avi"],
860
+ # type="filepath",
861
+ # )
862
+ # audio_b = gr.File(
863
+ # label="Track B 音频/视频",
864
+ # file_types=[".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac", ".mp4", ".mov", ".avi"],
865
+ # type="filepath",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
866
  # )
867
 
868
+ # sub_a = gr.File(
869
+ # label="Track A 字幕(.srt/.vtt)",
870
+ # file_types=[".srt", ".vtt", ".txt"],
871
+ # type="filepath",
872
+ # )
873
+ # sub_b = gr.File(
874
+ # label="Track B 字幕(.srt/.vtt)",
875
+ # file_types=[".srt", ".vtt", ".txt"],
876
+ # type="filepath",
877
+ # )
878
 
879
+ # gr.Markdown("## 2) 声明语言(用于一致性检查)")
880
+ # declared_a_lang = gr.Dropdown(
881
+ # label="Track A 声明语言",
882
+ # choices=LANG_LABELS,
883
+ # value="原文(不指定)",
884
+ # )
885
+ # declared_b_lang = gr.Dropdown(
886
+ # label="Track B 声明语言",
887
+ # choices=LANG_LABELS,
888
+ # value="原文(不指定)",
889
+ # )
890
 
891
+ # gr.Markdown("## 3) 片段裁剪设置(解决“不对应”)")
892
+ # crop_mode = gr.Radio(
893
+ # label="裁剪基准",
894
+ # choices=[
895
+ # ("对齐全局时间(推荐)", "global"),
896
+ # ("各自字幕时间", "per_track"),
897
+ # ],
898
+ # value="global",
899
+ # )
900
+ # offset_a = gr.Slider(
901
+ # label="Track A 时间偏移(秒,可为负)",
902
+ # minimum=-30.0,
903
+ # maximum=30.0,
904
+ # value=0.0,
905
+ # step=0.1,
906
+ # )
907
+ # offset_b = gr.Slider(
908
+ # label="Track B 时间偏移(秒,可为负)",
909
+ # minimum=-30.0,
910
+ # maximum=30.0,
911
+ # value=0.0,
912
+ # step=0.1,
913
+ # )
914
 
915
+ # gr.Markdown("## 4) 对齐与质检参数")
916
+ # max_mid_diff = gr.Slider(
917
+ # label="两路字幕对齐阈值(中点时间差,秒)",
918
+ # minimum=0.3,
919
+ # maximum=5.0,
920
+ # value=1.5,
921
+ # step=0.1,
922
+ # )
923
+ # silence_th = gr.Slider(
924
+ # label="静音/近静音阈值(dBFS)",
925
+ # minimum=-80,
926
+ # maximum=-20,
927
+ # value=-50,
928
+ # step=1,
929
+ # )
930
+ # low_th = gr.Slider(
931
+ # label="音量偏低阈值(dBFS)",
932
+ # minimum=-80,
933
+ # maximum=-20,
934
+ # value=-40,
935
+ # step=1,
936
+ # )
937
 
938
+ # with gr.Row():
939
+ # btn_run = gr.Button("解析对齐 + 质检", variant="primary")
940
+ # btn_clear = gr.Button("清空", variant="secondary")
941
+
942
+ # with gr.Column(scale=2):
943
+ # with gr.Tabs():
944
+ # with gr.TabItem("交互播放表格(推荐)"):
945
+ # interactive_table = gr.HTML()
946
+
947
+ # with gr.TabItem("分段对照表(Dataframe)"):
948
+ # seg_table = gr.Dataframe(
949
+ # headers=["#", "start(s)", "end(s)", "Track A", "Track B", "A time", "B time"],
950
+ # datatype=["number", "str", "str", "str", "str", "str", "str"],
951
+ # interactive=False,
952
+ # wrap=True,
953
+ # )
954
+
955
+ # with gr.TabItem("统计概览"):
956
+ # stats = gr.JSON(label="统计信息(对齐 + QC + 语言一致性 + 裁剪设置)")
957
+
958
+ # with gr.TabItem("问题片段"):
959
+ # issues = gr.JSON(label="问题列表(含 LanguageMismatch)")
960
+
961
+ # with gr.TabItem("音频对比播放"):
962
+ # gr.Markdown("### 原始音频(整段)")
963
+ # player_a_full = gr.Audio(label="Track A 原始音频", interactive=False)
964
+ # player_b_full = gr.Audio(label="Track B 原始音频", interactive=False)
965
+
966
+ # gr.Markdown("### 选择片段并生成对比音频(备用方式)")
967
+ # selector = gr.Dropdown(label="片段选择", choices=[], value=None)
968
+ # btn_make = gr.Button("生成所选片段(两路)", variant="primary")
969
+
970
+ # # elem_id used by injected JS
971
+ # player_a_seg = gr.Audio(label="Track A 片段", interactive=False, elem_id="a_audio_seg")
972
+ # player_b_seg = gr.Audio(label="Track B 片段", interactive=False, elem_id="b_audio_seg")
973
+ # seg_info = gr.JSON(label="片段信息(含裁剪窗口)")
974
+
975
+ # btn_run.click(
976
+ # fn=parse_align_and_qc,
977
+ # inputs=[
978
+ # audio_a, audio_b, sub_a, sub_b,
979
+ # declared_a_lang, declared_b_lang,
980
+ # crop_mode, offset_a, offset_b,
981
+ # max_mid_diff, silence_th, low_th
982
+ # ],
983
+ # outputs=[
984
+ # seg_table, interactive_table, stats, issues,
985
+ # player_a_full, player_b_full,
986
+ # selector, state
987
+ # ],
988
+ # )
989
 
990
+ # btn_make.click(
991
+ # fn=make_segment_audio,
992
+ # inputs=[selector, state],
993
+ # outputs=[player_a_seg, player_b_seg, seg_info],
994
+ # )
995
 
996
+ # # triggered by table buttons
997
+ # action_btn.click(
998
+ # fn=make_segment_audio_from_action,
999
+ # inputs=[action_box, state],
1000
+ # outputs=[player_a_seg, player_b_seg, seg_info],
1001
+ # )
1002
 
1003
+ # btn_clear.click(
1004
+ # fn=clear_all,
1005
+ # inputs=[],
1006
+ # outputs=[
1007
+ # audio_a, audio_b, sub_a, sub_b,
1008
+ # declared_a_lang, declared_b_lang,
1009
+ # crop_mode, offset_a, offset_b,
1010
+ # seg_table, interactive_table, stats, issues,
1011
+ # player_a_full, player_b_full,
1012
+ # selector, state,
1013
+ # player_a_seg, player_b_seg, seg_info,
1014
+ # action_box
1015
+ # ],
1016
+ # )
1017
 
1018
  # if __name__ == "__main__":
 
1019
  # demo.launch()
1020
 
1021
+
1022
  import json
1023
  import numpy as np
1024
  import gradio as gr
1025
 
1026
+ from huggingface_hub import (
1027
+ list_repo_files,
1028
+ hf_hub_download,
1029
+ )
1030
 
1031
  import soundfile as sf
1032
 
1033
+
1034
+ # =====================================================
1035
+ # 固定配置(你的 Dataset 真实路径)
1036
+ # =====================================================
1037
  REPO_ID = "AlexTYJ/Multilingual-ASR-Benchmark"
1038
+ REPO_TYPE = "dataset"
1039
+
1040
  AUDIO_DIR = "audio/testbatch/ARE"
1041
  JSON_DIR = "text/ref/testbatch/ARE"
1042
 
1043
 
1044
+ # =====================================================
1045
  # 工具函数
1046
+ # =====================================================
1047
 
1048
def list_are_audio_files():
    """Return a sorted list of audio file paths under the ARE directory.

    Fetches the complete file listing of the dataset repo from the
    Hugging Face Hub, then keeps only entries that live inside
    ``AUDIO_DIR`` and carry a recognised audio extension.

    Returns:
        list[str]: repo-relative audio paths, sorted lexicographically.
    """
    repo_files = list_repo_files(repo_id=REPO_ID, repo_type=REPO_TYPE)

    # Extension check is case-insensitive so ".WAV" etc. are not dropped.
    wanted_exts = (".wav", ".mp3", ".flac")
    return sorted(
        path
        for path in repo_files
        if path.startswith(AUDIO_DIR) and path.lower().endswith(wanted_exts)
    )
1064
 
1065
 
1066
+ def load_audio_and_json(audio_path: str):
1067
+ """
1068
+ 给定 audio 路径:
1069
+ - 下载音频
1070
+ - 推导并下载对应 json
1071
+ - 解析 segments
1072
+ """
1073
  filename = audio_path.split("/")[-1]
1074
+ base = filename.rsplit(".", 1)[0]
1075
+ json_path = f"{JSON_DIR}/{base}.json"
1076
 
1077
  # ---- 下载 ----
1078
+ local_audio = hf_hub_download(
1079
+ repo_id=REPO_ID,
1080
+ filename=audio_path,
1081
+ repo_type=REPO_TYPE,
1082
+ )
1083
+
1084
+ local_json = hf_hub_download(
1085
+ repo_id=REPO_ID,
1086
+ filename=json_path,
1087
+ repo_type=REPO_TYPE,
1088
+ )
1089
 
1090
  # ---- 读音频 ----
1091
  audio, sr = sf.read(local_audio)
1092
  if audio.ndim == 2:
1093
  audio = audio.mean(axis=1)
1094
+ audio = audio.astype(np.float32)
1095
 
1096
  # ---- 读 JSON ----
1097
  with open(local_json, "r", encoding="utf-8") as f:
 
1101
  for i, s in enumerate(data["segments"]):
1102
  segments.append({
1103
  "row_id": s.get("index", i),
1104
+ "start": float(s.get("start", 0.0)),
1105
+ "end": float(s.get("end", 0.0)),
1106
+ "dur": float(s.get("end", 0.0) - s.get("start", 0.0)),
1107
  "status": s.get("status", ""),
1108
  "speaker": s.get("speaker", ""),
1109
  "gender": s.get("gender", ""),
 
1116
 
1117
 
1118
def slice_audio(audio, sr, start, end):
    """Cut the ``[start, end)`` window (seconds) out of a waveform.

    Sample indices are clamped to the valid range of ``audio``; a window
    outside the waveform yields an empty slice rather than raising.

    Args:
        audio: 1-D sample array.
        sr: sample rate in Hz.
        start: window start in seconds.
        end: window end in seconds.

    Returns:
        (sr, samples) tuple, ready to feed a ``gr.Audio`` component.
    """
    first = int(start * sr)
    if first < 0:
        first = 0
    last = int(end * sr)
    if last > len(audio):
        last = len(audio)
    return sr, audio[first:last]
1122
 
1123
 
1124
+ # =====================================================
1125
+ # Gradio 回调
1126
+ # =====================================================
1127
 
1128
+ def on_load_audio(audio_path):
1129
  audio, sr, segments, audio_name = load_audio_and_json(audio_path)
1130
 
1131
  rows = [
 
1147
  state = {
1148
  "audio": audio,
1149
  "sr": sr,
1150
+ "segments": segments,
1151
  }
1152
 
1153
  return state, rows, info
 
1155
 
1156
  def on_select_segment(evt: gr.SelectData, state):
1157
  row = evt.row_value
 
1158
 
1159
+ start = float(row[1])
1160
+ end = float(row[2])
1161
+
1162
+ sr, audio_seg = slice_audio(
1163
+ state["audio"],
1164
+ state["sr"],
1165
+ start,
1166
+ end
1167
+ )
1168
 
1169
  meta = (
1170
+ f"- **speaker**: `{row[5]}`\n"
1171
+ f"- **gender**: `{row[6]}`\n"
1172
+ f"- **age_group**: `{row[7]}`\n"
1173
+ f"- **emotion**: `{row[8]}`\n"
1174
+ f"- **status**: `{row[4]}`"
1175
  )
1176
 
1177
+ text = row[9] if row[9].strip() else "(empty)"
1178
+
1179
+ return (sr, audio_seg), meta, text
1180
 
1181
 
1182
+ # =====================================================
1183
  # UI
1184
+ # =====================================================
1185
 
1186
  with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
1187
  gr.Markdown(
1188
+ "# 🎧 ARE 音频分段可视化(Hugging Face Dataset)\n"
1189
+ "**Dataset**: `AlexTYJ/Multilingual-ASR-Benchmark`\n\n"
1190
+ "从 Hugging Face Hub 直接读取音频与字幕,不需要上传文件。"
1191
  )
1192
 
1193
  state = gr.State()
 
1195
  audio_files = list_are_audio_files()
1196
 
1197
  audio_selector = gr.Dropdown(
 
1198
  label="选择音频文件(ARE)",
1199
+ choices=audio_files,
1200
+ value=audio_files[0] if audio_files else None,
1201
  )
1202
 
1203
+ load_btn = gr.Button("加载音频 & 字幕", variant="primary")
1204
  info = gr.Markdown()
1205
 
1206
  df = gr.Dataframe(
 
1221
  text = gr.Textbox(label="字幕文本", lines=4)
1222
 
1223
  load_btn.click(
1224
+ on_load_audio,
1225
  inputs=audio_selector,
1226
  outputs=[state, df, info],
1227
  )
 
1232
  outputs=[audio_out, meta, text],
1233
  )
1234
 
1235
+ demo.launch()