Update app.py
Browse files
app.py
CHANGED
|
@@ -1,481 +1,1097 @@
|
|
| 1 |
-
# import
|
| 2 |
-
# import io
|
| 3 |
# import os
|
| 4 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
# import gradio as gr
|
| 7 |
-
|
| 8 |
-
# # Optional deps
|
| 9 |
# import numpy as np
|
| 10 |
-
|
| 11 |
-
#
|
| 12 |
-
|
| 13 |
-
#
|
| 14 |
-
#
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
#
|
| 18 |
-
#
|
| 19 |
-
#
|
| 20 |
-
#
|
| 21 |
-
#
|
| 22 |
-
#
|
| 23 |
-
|
| 24 |
-
#
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
#
|
| 28 |
-
#
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
#
|
| 33 |
-
#
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
#
|
| 37 |
-
#
|
| 38 |
-
#
|
| 39 |
-
#
|
| 40 |
-
#
|
| 41 |
-
|
| 42 |
-
#
|
| 43 |
-
#
|
| 44 |
-
|
| 45 |
-
#
|
| 46 |
-
#
|
| 47 |
-
#
|
| 48 |
-
#
|
| 49 |
-
#
|
| 50 |
-
# "
|
| 51 |
-
#
|
| 52 |
-
#
|
| 53 |
-
#
|
| 54 |
-
#
|
| 55 |
-
#
|
| 56 |
-
#
|
| 57 |
-
#
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
#
|
| 61 |
-
#
|
| 62 |
-
#
|
| 63 |
-
|
| 64 |
-
#
|
| 65 |
-
#
|
| 66 |
-
#
|
| 67 |
-
#
|
| 68 |
-
#
|
| 69 |
-
#
|
| 70 |
-
#
|
| 71 |
-
#
|
| 72 |
-
|
| 73 |
-
#
|
| 74 |
-
#
|
| 75 |
-
#
|
| 76 |
-
#
|
| 77 |
-
#
|
| 78 |
-
|
| 79 |
-
#
|
| 80 |
-
#
|
| 81 |
-
#
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
#
|
| 85 |
-
#
|
| 86 |
-
#
|
| 87 |
-
#
|
| 88 |
-
#
|
| 89 |
-
#
|
| 90 |
-
|
| 91 |
-
#
|
| 92 |
-
|
| 93 |
-
#
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
#
|
| 97 |
-
|
| 98 |
-
#
|
| 99 |
-
#
|
| 100 |
-
#
|
| 101 |
-
#
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
#
|
| 105 |
-
#
|
| 106 |
-
#
|
| 107 |
-
#
|
| 108 |
-
#
|
| 109 |
-
#
|
| 110 |
-
#
|
| 111 |
-
|
| 112 |
-
#
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
#
|
| 116 |
-
#
|
| 117 |
-
#
|
| 118 |
-
#
|
| 119 |
-
#
|
| 120 |
-
# return headers, rows, None
|
| 121 |
-
# df = pd.DataFrame(segments)
|
| 122 |
-
# # sort by start time
|
| 123 |
-
# df = df.sort_values(["start", "end"]).reset_index(drop=True)
|
| 124 |
-
# return df.columns.tolist(), df.values.tolist(), df
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
# def _filter_segments(segments: List[Dict[str, Any]],
|
| 128 |
-
# speaker: str, gender: str, age_group: str, emotion: str, status: str,
|
| 129 |
-
# text_query: str) -> List[Dict[str, Any]]:
|
| 130 |
-
# def ok(v: str, sel: str) -> bool:
|
| 131 |
-
# if sel in (None, "", "ALL"):
|
| 132 |
-
# return True
|
| 133 |
-
# return (v or "") == sel
|
| 134 |
-
|
| 135 |
-
# tq = (text_query or "").strip().lower()
|
| 136 |
-
|
| 137 |
-
# out = []
|
| 138 |
-
# for s in segments:
|
| 139 |
-
# if not ok(s.get("speaker", ""), speaker):
|
| 140 |
# continue
|
| 141 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
# continue
|
| 143 |
-
#
|
|
|
|
| 144 |
# continue
|
| 145 |
-
#
|
|
|
|
|
|
|
| 146 |
# continue
|
| 147 |
-
#
|
|
|
|
|
|
|
| 148 |
# continue
|
| 149 |
-
#
|
| 150 |
-
#
|
| 151 |
-
#
|
| 152 |
-
|
| 153 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
-
#
|
| 157 |
-
#
|
| 158 |
-
#
|
| 159 |
-
#
|
| 160 |
-
#
|
| 161 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
-
#
|
| 164 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
-
# # downsample for plotting speed
|
| 167 |
-
# max_points = 20000
|
| 168 |
-
# if len(y) > max_points:
|
| 169 |
-
# step = len(y) // max_points
|
| 170 |
-
# y_plot = y[::step]
|
| 171 |
-
# t = np.linspace(0, len(y) / sr, num=len(y_plot))
|
| 172 |
-
# else:
|
| 173 |
-
# y_plot = y
|
| 174 |
-
# t = np.linspace(0, len(y) / sr, num=len(y))
|
| 175 |
-
|
| 176 |
-
# fig = plt.figure(figsize=(10, 3))
|
| 177 |
-
# ax = fig.add_subplot(111)
|
| 178 |
-
# ax.plot(t, y_plot, linewidth=0.7)
|
| 179 |
-
# ax.set_title("Waveform (segment boundaries)")
|
| 180 |
-
# ax.set_xlabel("Time (s)")
|
| 181 |
-
# ax.set_ylabel("Amplitude")
|
| 182 |
-
|
| 183 |
-
# # add some boundaries (avoid too many lines)
|
| 184 |
-
# if segments:
|
| 185 |
-
# # show up to 120 boundaries to avoid clutter
|
| 186 |
-
# boundaries = []
|
| 187 |
-
# for s in segments:
|
| 188 |
-
# boundaries.append(float(s.get("start", 0.0)))
|
| 189 |
-
# boundaries.append(float(s.get("end", 0.0)))
|
| 190 |
-
# boundaries = sorted(set(boundaries))
|
| 191 |
-
# if len(boundaries) > 120:
|
| 192 |
-
# # evenly sample
|
| 193 |
-
# idx = np.linspace(0, len(boundaries) - 1, 120).astype(int)
|
| 194 |
-
# boundaries = [boundaries[i] for i in idx]
|
| 195 |
-
|
| 196 |
-
# for b in boundaries:
|
| 197 |
-
# ax.axvline(b, linewidth=0.5, alpha=0.25)
|
| 198 |
-
|
| 199 |
-
# fig.tight_layout()
|
| 200 |
-
# return fig
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
# def load_assets(audio_file, json_file):
|
| 204 |
-
# if audio_file is None or json_file is None:
|
| 205 |
-
# raise gr.Error("请同时上传音频文件和JSON字幕文件。")
|
| 206 |
-
|
| 207 |
-
# audio_path = audio_file
|
| 208 |
-
# json_path = json_file
|
| 209 |
-
|
| 210 |
-
# data = _load_json(json_path)
|
| 211 |
-
# segments = _normalize_segments(data)
|
| 212 |
-
|
| 213 |
-
# # load audio
|
| 214 |
-
# y, sr = _load_audio_to_np(audio_path)
|
| 215 |
-
|
| 216 |
-
# # build options
|
| 217 |
-
# speakers = ["ALL"] + _safe_unique([s.get("speaker", "") for s in segments if s.get("speaker", "")])
|
| 218 |
-
# genders = ["ALL"] + _safe_unique([s.get("gender", "") for s in segments if s.get("gender", "")])
|
| 219 |
-
# ages = ["ALL"] + _safe_unique([s.get("age_group", "") for s in segments if s.get("age_group", "")])
|
| 220 |
-
# emotions = ["ALL"] + _safe_unique([s.get("emotion", "") for s in segments if s.get("emotion", "")])
|
| 221 |
-
# statuses = ["ALL"] + _safe_unique([s.get("status", "") for s in segments if s.get("status", "")])
|
| 222 |
-
|
| 223 |
-
# headers, rows, _ = _build_df(segments)
|
| 224 |
-
|
| 225 |
-
# # waveform plot (optional)
|
| 226 |
-
# fig = _make_waveform_plot(y, sr, segments)
|
| 227 |
-
|
| 228 |
-
# # store in state
|
| 229 |
-
# state = {
|
| 230 |
-
# "audio_path": audio_path,
|
| 231 |
-
# "json_path": json_path,
|
| 232 |
-
# "audio_sr": sr,
|
| 233 |
-
# "audio_y": y,
|
| 234 |
-
# "segments": segments,
|
| 235 |
-
# "headers": headers,
|
| 236 |
-
# "rows_full": rows,
|
| 237 |
-
# }
|
| 238 |
|
| 239 |
-
#
|
| 240 |
-
#
|
| 241 |
-
#
|
| 242 |
-
#
|
| 243 |
-
#
|
| 244 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
-
#
|
| 247 |
-
#
|
| 248 |
-
#
|
| 249 |
-
|
| 250 |
-
#
|
| 251 |
-
#
|
| 252 |
-
#
|
| 253 |
-
#
|
| 254 |
-
#
|
| 255 |
-
#
|
| 256 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
|
| 259 |
-
#
|
| 260 |
-
#
|
| 261 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
-
#
|
| 264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
-
#
|
| 269 |
-
#
|
| 270 |
-
#
|
| 271 |
-
# state2["segments_filtered"] = filtered
|
| 272 |
|
| 273 |
-
#
|
|
|
|
|
|
|
| 274 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
|
| 276 |
-
#
|
| 277 |
-
#
|
| 278 |
-
#
|
| 279 |
-
|
| 280 |
-
#
|
| 281 |
-
|
| 282 |
-
#
|
| 283 |
-
#
|
| 284 |
-
|
| 285 |
-
#
|
| 286 |
-
|
| 287 |
-
#
|
| 288 |
-
#
|
| 289 |
-
#
|
| 290 |
-
#
|
| 291 |
-
#
|
| 292 |
-
|
| 293 |
-
#
|
| 294 |
-
#
|
| 295 |
-
#
|
| 296 |
-
#
|
| 297 |
-
|
| 298 |
-
#
|
| 299 |
-
#
|
| 300 |
-
#
|
| 301 |
-
|
| 302 |
-
#
|
| 303 |
-
|
| 304 |
-
#
|
| 305 |
-
#
|
| 306 |
-
#
|
| 307 |
-
|
| 308 |
-
#
|
| 309 |
-
# if d < best_d:
|
| 310 |
-
# best_d = d
|
| 311 |
-
# best = s
|
| 312 |
-
# return best or candidates[0]
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
# def on_row_select(evt: gr.SelectData, state):
|
| 316 |
# """
|
| 317 |
-
#
|
| 318 |
-
#
|
| 319 |
-
# We need to access the dataframe contents; Gradio passes it through component state in newer versions.
|
| 320 |
-
# We'll rely on evt.row_value if available; else use a hidden state approach via gr.Dataframe 'value'.
|
| 321 |
# """
|
| 322 |
-
#
|
| 323 |
-
#
|
| 324 |
-
|
| 325 |
-
#
|
| 326 |
-
#
|
| 327 |
-
#
|
| 328 |
-
# return (
|
| 329 |
-
# None,
|
| 330 |
-
# gr.update(value="无法读取选中行数据(你的Gradio版本可能较旧)。建议升级gradio>=4。"),
|
| 331 |
-
# gr.update(value=""),
|
| 332 |
-
# )
|
| 333 |
|
| 334 |
-
#
|
| 335 |
-
|
| 336 |
-
#
|
| 337 |
-
#
|
| 338 |
-
#
|
| 339 |
-
#
|
| 340 |
-
#
|
| 341 |
-
|
| 342 |
-
#
|
| 343 |
-
#
|
| 344 |
-
#
|
| 345 |
-
# f"- **time**: `{start:.3f}s` → `{end:.3f}s` (dur≈{end-start:.3f}s)\n"
|
| 346 |
-
# f"- **status**: `{seg.get('status','')}`\n"
|
| 347 |
-
# f"- **speaker**: `{seg.get('speaker','')}`\n"
|
| 348 |
-
# f"- **gender**: `{seg.get('gender','')}`\n"
|
| 349 |
-
# f"- **age_group**: `{seg.get('age_group','')}`\n"
|
| 350 |
-
# f"- **emotion**: `{seg.get('emotion','')}`\n"
|
| 351 |
# )
|
| 352 |
|
| 353 |
-
# text = seg.get("text", "") or ""
|
| 354 |
-
# if not text.strip():
|
| 355 |
-
# text = "(empty)"
|
| 356 |
|
| 357 |
-
#
|
| 358 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
|
|
|
|
| 360 |
|
| 361 |
-
#
|
| 362 |
-
#
|
| 363 |
-
#
|
| 364 |
-
#
|
| 365 |
-
#
|
| 366 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
|
| 368 |
-
#
|
| 369 |
-
|
| 370 |
-
#
|
| 371 |
-
#
|
| 372 |
-
#
|
| 373 |
-
|
| 374 |
-
#
|
| 375 |
-
#
|
| 376 |
-
|
| 377 |
-
#
|
| 378 |
-
#
|
| 379 |
-
#
|
| 380 |
-
# age_dd = gr.Dropdown(label="Age group", choices=["ALL"], value="ALL")
|
| 381 |
-
# emotion_dd = gr.Dropdown(label="Emotion", choices=["ALL"], value="ALL")
|
| 382 |
-
# status_dd = gr.Dropdown(label="Status", choices=["ALL"], value="ALL")
|
| 383 |
-
|
| 384 |
-
# text_query = gr.Textbox(label="Text contains(可选)", placeholder="输入关键词过滤 text(支持阿拉伯语/英文)")
|
| 385 |
-
# filter_btn = gr.Button("应用筛选")
|
| 386 |
-
|
| 387 |
-
# with gr.Row():
|
| 388 |
-
# df = gr.Dataframe(
|
| 389 |
-
# label="Segments(点击行以播放该段)",
|
| 390 |
-
# headers=["row_id", "start", "end", "dur", "status", "speaker", "gender", "age_group", "emotion", "text"],
|
| 391 |
-
# datatype=["number", "number", "number", "number", "str", "str", "str", "str", "str", "str"],
|
| 392 |
-
# wrap=True,
|
| 393 |
-
# interactive=False,
|
| 394 |
-
# max_height=420,
|
| 395 |
# )
|
| 396 |
|
| 397 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
|
| 399 |
-
#
|
| 400 |
-
#
|
| 401 |
-
#
|
| 402 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
|
| 404 |
-
#
|
| 405 |
-
#
|
| 406 |
-
#
|
| 407 |
-
#
|
| 408 |
-
#
|
| 409 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
|
| 411 |
-
#
|
| 412 |
-
#
|
| 413 |
-
#
|
| 414 |
-
#
|
| 415 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
|
| 417 |
-
#
|
| 418 |
-
#
|
| 419 |
-
#
|
| 420 |
-
|
| 421 |
-
# )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
-
#
|
| 424 |
-
#
|
| 425 |
-
#
|
| 426 |
-
#
|
| 427 |
-
#
|
| 428 |
|
| 429 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 430 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
|
| 432 |
# if __name__ == "__main__":
|
| 433 |
-
# demo = build_app()
|
| 434 |
# demo.launch()
|
| 435 |
|
|
|
|
| 436 |
import json
|
| 437 |
import numpy as np
|
| 438 |
import gradio as gr
|
| 439 |
|
| 440 |
-
from huggingface_hub import
|
|
|
|
|
|
|
|
|
|
| 441 |
|
| 442 |
import soundfile as sf
|
| 443 |
|
| 444 |
-
|
| 445 |
-
#
|
| 446 |
-
#
|
|
|
|
| 447 |
REPO_ID = "AlexTYJ/Multilingual-ASR-Benchmark"
|
|
|
|
|
|
|
| 448 |
AUDIO_DIR = "audio/testbatch/ARE"
|
| 449 |
JSON_DIR = "text/ref/testbatch/ARE"
|
| 450 |
|
| 451 |
|
| 452 |
-
# =====================
|
| 453 |
# 工具函数
|
| 454 |
-
# =====================
|
| 455 |
|
| 456 |
def list_are_audio_files():
|
| 457 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
audio_files = [
|
| 459 |
f for f in files
|
| 460 |
-
if f.startswith(AUDIO_DIR)
|
|
|
|
| 461 |
]
|
| 462 |
audio_files.sort()
|
| 463 |
return audio_files
|
| 464 |
|
| 465 |
|
| 466 |
-
def load_audio_and_json(audio_path):
|
| 467 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
filename = audio_path.split("/")[-1]
|
| 469 |
-
|
|
|
|
| 470 |
|
| 471 |
# ---- 下载 ----
|
| 472 |
-
local_audio = hf_hub_download(
|
| 473 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
|
| 475 |
# ---- 读音频 ----
|
| 476 |
audio, sr = sf.read(local_audio)
|
| 477 |
if audio.ndim == 2:
|
| 478 |
audio = audio.mean(axis=1)
|
|
|
|
| 479 |
|
| 480 |
# ---- 读 JSON ----
|
| 481 |
with open(local_json, "r", encoding="utf-8") as f:
|
|
@@ -485,9 +1101,9 @@ def load_audio_and_json(audio_path):
|
|
| 485 |
for i, s in enumerate(data["segments"]):
|
| 486 |
segments.append({
|
| 487 |
"row_id": s.get("index", i),
|
| 488 |
-
"start": float(s
|
| 489 |
-
"end": float(s
|
| 490 |
-
"dur": float(s
|
| 491 |
"status": s.get("status", ""),
|
| 492 |
"speaker": s.get("speaker", ""),
|
| 493 |
"gender": s.get("gender", ""),
|
|
@@ -500,14 +1116,16 @@ def load_audio_and_json(audio_path):
|
|
| 500 |
|
| 501 |
|
| 502 |
def slice_audio(audio, sr, start, end):
|
| 503 |
-
|
|
|
|
|
|
|
| 504 |
|
| 505 |
|
| 506 |
-
# =====================
|
| 507 |
-
# Gradio
|
| 508 |
-
# =====================
|
| 509 |
|
| 510 |
-
def
|
| 511 |
audio, sr, segments, audio_name = load_audio_and_json(audio_path)
|
| 512 |
|
| 513 |
rows = [
|
|
@@ -529,7 +1147,7 @@ def on_select_file(audio_path):
|
|
| 529 |
state = {
|
| 530 |
"audio": audio,
|
| 531 |
"sr": sr,
|
| 532 |
-
"segments": segments
|
| 533 |
}
|
| 534 |
|
| 535 |
return state, rows, info
|
|
@@ -537,28 +1155,39 @@ def on_select_file(audio_path):
|
|
| 537 |
|
| 538 |
def on_select_segment(evt: gr.SelectData, state):
|
| 539 |
row = evt.row_value
|
| 540 |
-
start, end = float(row[1]), float(row[2])
|
| 541 |
|
| 542 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
|
| 544 |
meta = (
|
| 545 |
-
f"- **speaker**: {row[5]}\n"
|
| 546 |
-
f"- **gender**: {row[6]}\n"
|
| 547 |
-
f"- **age_group**: {row[7]}\n"
|
| 548 |
-
f"- **emotion**: {row[8]}"
|
|
|
|
| 549 |
)
|
| 550 |
|
| 551 |
-
|
|
|
|
|
|
|
| 552 |
|
| 553 |
|
| 554 |
-
# =====================
|
| 555 |
# UI
|
| 556 |
-
# =====================
|
| 557 |
|
| 558 |
with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
|
| 559 |
gr.Markdown(
|
| 560 |
-
"# 🎧 ARE 音频
|
| 561 |
-
"
|
|
|
|
| 562 |
)
|
| 563 |
|
| 564 |
state = gr.State()
|
|
@@ -566,12 +1195,12 @@ with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
|
|
| 566 |
audio_files = list_are_audio_files()
|
| 567 |
|
| 568 |
audio_selector = gr.Dropdown(
|
| 569 |
-
choices=audio_files,
|
| 570 |
label="选择音频文件(ARE)",
|
| 571 |
-
|
|
|
|
| 572 |
)
|
| 573 |
|
| 574 |
-
load_btn = gr.Button("加载", variant="primary")
|
| 575 |
info = gr.Markdown()
|
| 576 |
|
| 577 |
df = gr.Dataframe(
|
|
@@ -592,7 +1221,7 @@ with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
|
|
| 592 |
text = gr.Textbox(label="字幕文本", lines=4)
|
| 593 |
|
| 594 |
load_btn.click(
|
| 595 |
-
|
| 596 |
inputs=audio_selector,
|
| 597 |
outputs=[state, df, info],
|
| 598 |
)
|
|
@@ -603,5 +1232,4 @@ with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
|
|
| 603 |
outputs=[audio_out, meta, text],
|
| 604 |
)
|
| 605 |
|
| 606 |
-
demo.launch()
|
| 607 |
-
|
|
|
|
| 1 |
+
# import re
|
|
|
|
| 2 |
# import os
|
| 3 |
+
# import json
|
| 4 |
+
# import math
|
| 5 |
+
# import tempfile
|
| 6 |
+
# from dataclasses import dataclass
|
| 7 |
+
# from typing import List, Optional, Tuple, Dict, Any
|
| 8 |
|
| 9 |
# import gradio as gr
|
|
|
|
|
|
|
| 10 |
# import numpy as np
|
| 11 |
+
# import srt
|
| 12 |
+
# from pydub import AudioSegment
|
| 13 |
+
|
| 14 |
+
# from langdetect import DetectorFactory, detect_langs
|
| 15 |
+
# DetectorFactory.seed = 0 # deterministic
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# # -----------------------------
|
| 19 |
+
# # Data structures
|
| 20 |
+
# # -----------------------------
|
| 21 |
+
# @dataclass
|
| 22 |
+
# class Cue:
|
| 23 |
+
# start: float
|
| 24 |
+
# end: float
|
| 25 |
+
# text: str
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# # -----------------------------
|
| 29 |
+
# # Language utilities
|
| 30 |
+
# # -----------------------------
|
| 31 |
+
# LANG_LABELS = ["原文(不指定)", "中文", "English", "日本語"]
|
| 32 |
+
|
| 33 |
+
# def normalize_lang_code(code: str) -> str:
|
| 34 |
+
# c = (code or "").lower()
|
| 35 |
+
# if c.startswith("zh"):
|
| 36 |
+
# return "中文"
|
| 37 |
+
# if c == "en":
|
| 38 |
+
# return "English"
|
| 39 |
+
# if c == "ja":
|
| 40 |
+
# return "日本語"
|
| 41 |
+
# return "其他"
|
| 42 |
+
|
| 43 |
+
# def detect_language_label(text: str) -> Dict[str, Any]:
|
| 44 |
+
# t = (text or "").strip()
|
| 45 |
+
# if not t:
|
| 46 |
+
# return {"raw_code": None, "raw_prob": None, "label": None, "normalized_label": None}
|
| 47 |
+
# t = t.replace("\n", " ").strip()[:400]
|
| 48 |
+
# try:
|
| 49 |
+
# langs = detect_langs(t)
|
| 50 |
+
# if not langs:
|
| 51 |
+
# return {"raw_code": None, "raw_prob": None, "label": None, "normalized_label": None}
|
| 52 |
+
# top = langs[0]
|
| 53 |
+
# raw_code = getattr(top, "lang", None)
|
| 54 |
+
# raw_prob = float(getattr(top, "prob", 0.0)) if top else None
|
| 55 |
+
# norm = normalize_lang_code(raw_code)
|
| 56 |
+
# return {
|
| 57 |
+
# "raw_code": raw_code,
|
| 58 |
+
# "raw_prob": round(raw_prob, 3) if raw_prob is not None else None,
|
| 59 |
+
# "label": norm if norm != "其他" else "原文(不指定)",
|
| 60 |
+
# "normalized_label": norm,
|
| 61 |
+
# }
|
| 62 |
+
# except Exception:
|
| 63 |
+
# return {"raw_code": None, "raw_prob": None, "label": None, "normalized_label": None}
|
| 64 |
+
|
| 65 |
+
# def check_declared_mismatch(declared: str, detected_norm: Optional[str]) -> bool:
|
| 66 |
+
# if declared == "原文(不指定)":
|
| 67 |
+
# return False
|
| 68 |
+
# if detected_norm is None:
|
| 69 |
+
# return True
|
| 70 |
+
# if detected_norm == "其他":
|
| 71 |
+
# return True
|
| 72 |
+
# return detected_norm != declared
|
| 73 |
+
|
| 74 |
+
# def analyze_language_for_cues(cues: List[Cue]) -> Dict[str, Any]:
|
| 75 |
+
# counts = {"中文": 0, "English": 0, "日本語": 0, "其他": 0}
|
| 76 |
+
# for c in cues:
|
| 77 |
+
# d = detect_language_label(c.text)
|
| 78 |
+
# norm = d.get("normalized_label")
|
| 79 |
+
# if norm in counts:
|
| 80 |
+
# counts[norm] += 1
|
| 81 |
+
# dominant = max(counts.items(), key=lambda x: x[1])[0] if cues else None
|
| 82 |
+
# return {"counts": counts, "dominant_norm": dominant}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# # -----------------------------
|
| 86 |
+
# # Subtitle parsing
|
| 87 |
+
# # -----------------------------
|
| 88 |
+
# _TAG_RE = re.compile(r"</?[^>]+?>", re.IGNORECASE)
|
| 89 |
+
# _VTT_TIME_RE = re.compile(
|
| 90 |
+
# r"(?P<start>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})\s*-->\s*"
|
| 91 |
+
# r"(?P<end>\d{2}:\d{2}:\d{2}\.\d{3}|\d{1,2}:\d{2}\.\d{3})"
|
| 92 |
+
# )
|
| 93 |
+
|
| 94 |
+
# def _strip_tags(text: str) -> str:
|
| 95 |
+
# text = _TAG_RE.sub("", text)
|
| 96 |
+
# text = text.replace("<c>", "").replace("</c>", "")
|
| 97 |
+
# return text.strip()
|
| 98 |
+
|
| 99 |
+
# def _time_to_seconds(t: str) -> float:
|
| 100 |
+
# t = t.strip().split()[0]
|
| 101 |
+
# parts = t.split(":")
|
| 102 |
+
# if len(parts) == 3:
|
| 103 |
+
# h = int(parts[0])
|
| 104 |
+
# m = int(parts[1])
|
| 105 |
+
# s = float(parts[2])
|
| 106 |
+
# return h * 3600 + m * 60 + s
|
| 107 |
+
# if len(parts) == 2:
|
| 108 |
+
# m = int(parts[0])
|
| 109 |
+
# s = float(parts[1])
|
| 110 |
+
# return m * 60 + s
|
| 111 |
+
# raise ValueError(f"Unsupported time format: {t}")
|
| 112 |
+
|
| 113 |
+
# def parse_vtt(content: str) -> List[Cue]:
|
| 114 |
+
# content = content.replace("\ufeff", "")
|
| 115 |
+
# content = re.sub(r"^\s*WEBVTT.*?\n", "", content, flags=re.IGNORECASE)
|
| 116 |
+
# blocks = re.split(r"\r?\n\r?\n", content.strip())
|
| 117 |
+
# cues: List[Cue] = []
|
| 118 |
+
# for block in blocks:
|
| 119 |
+
# lines = [ln.rstrip("\n") for ln in re.split(r"\r?\n", block) if ln.strip() != ""]
|
| 120 |
+
# if not lines:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
# continue
|
| 122 |
+
# time_line_idx = None
|
| 123 |
+
# for idx in range(min(2, len(lines))):
|
| 124 |
+
# if "-->" in lines[idx]:
|
| 125 |
+
# time_line_idx = idx
|
| 126 |
+
# break
|
| 127 |
+
# if time_line_idx is None:
|
| 128 |
# continue
|
| 129 |
+
# m = _VTT_TIME_RE.search(lines[time_line_idx])
|
| 130 |
+
# if not m:
|
| 131 |
# continue
|
| 132 |
+
# start = _time_to_seconds(m.group("start"))
|
| 133 |
+
# end = _time_to_seconds(m.group("end"))
|
| 134 |
+
# if not (math.isfinite(start) and math.isfinite(end)) or end <= start:
|
| 135 |
# continue
|
| 136 |
+
# text_lines = lines[time_line_idx + 1 :]
|
| 137 |
+
# text = _strip_tags("\n".join(text_lines)).strip()
|
| 138 |
+
# if not text:
|
| 139 |
# continue
|
| 140 |
+
# cues.append(Cue(start=start, end=end, text=text))
|
| 141 |
+
# cues.sort(key=lambda x: x.start)
|
| 142 |
+
# return cues
|
| 143 |
+
|
| 144 |
+
# def parse_srt(content: str) -> List[Cue]:
|
| 145 |
+
# content = content.replace("\ufeff", "")
|
| 146 |
+
# subs = list(srt.parse(content))
|
| 147 |
+
# cues: List[Cue] = []
|
| 148 |
+
# for sub in subs:
|
| 149 |
+
# cues.append(
|
| 150 |
+
# Cue(
|
| 151 |
+
# start=sub.start.total_seconds(),
|
| 152 |
+
# end=sub.end.total_seconds(),
|
| 153 |
+
# text=sub.content.strip(),
|
| 154 |
+
# )
|
| 155 |
+
# )
|
| 156 |
+
# cues.sort(key=lambda x: x.start)
|
| 157 |
+
# return cues
|
| 158 |
+
|
| 159 |
+
# def parse_subtitle_file(path: Optional[str]) -> List[Cue]:
|
| 160 |
+
# if not path:
|
| 161 |
+
# return []
|
| 162 |
+
# with open(path, "r", encoding="utf-8") as f:
|
| 163 |
+
# content = f.read()
|
| 164 |
+
# head = content.lstrip()[:80].upper()
|
| 165 |
+
# ext = os.path.splitext(path)[1].lower()
|
| 166 |
+
# if "WEBVTT" in head or ext == ".vtt":
|
| 167 |
+
# return parse_vtt(content)
|
| 168 |
+
# return parse_srt(content)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# # -----------------------------
|
| 172 |
+
# # Alignment
|
| 173 |
+
# # -----------------------------
|
| 174 |
+
# def align_by_time(
|
| 175 |
+
# a: List[Cue],
|
| 176 |
+
# b: List[Cue],
|
| 177 |
+
# max_mid_diff: float = 1.5,
|
| 178 |
+
# ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
|
| 179 |
+
# aligned: List[Dict[str, Any]] = []
|
| 180 |
+
# i, j = 0, 0
|
| 181 |
+
# matched_a = set()
|
| 182 |
+
# matched_b = set()
|
| 183 |
+
|
| 184 |
+
# while i < len(a) and j < len(b):
|
| 185 |
+
# x = a[i]
|
| 186 |
+
# y = b[j]
|
| 187 |
+
# mid_x = (x.start + x.end) / 2
|
| 188 |
+
# mid_y = (y.start + y.end) / 2
|
| 189 |
+
# diff = mid_x - mid_y
|
| 190 |
+
|
| 191 |
+
# if abs(diff) <= max_mid_diff:
|
| 192 |
+
# # global window for "corresponding part"
|
| 193 |
+
# g_start = min(x.start, y.start)
|
| 194 |
+
# g_end = max(x.end, y.end)
|
| 195 |
+
# aligned.append(
|
| 196 |
+
# {
|
| 197 |
+
# "idx": len(aligned) + 1,
|
| 198 |
+
# "start": g_start,
|
| 199 |
+
# "end": g_end,
|
| 200 |
+
# "en_start": x.start,
|
| 201 |
+
# "en_end": x.end,
|
| 202 |
+
# "zh_start": y.start,
|
| 203 |
+
# "zh_end": y.end,
|
| 204 |
+
# "text_en": x.text,
|
| 205 |
+
# "text_zh": y.text,
|
| 206 |
+
# }
|
| 207 |
+
# )
|
| 208 |
+
# matched_a.add(i)
|
| 209 |
+
# matched_b.add(j)
|
| 210 |
+
# i += 1
|
| 211 |
+
# j += 1
|
| 212 |
+
# elif diff < 0:
|
| 213 |
+
# i += 1
|
| 214 |
+
# else:
|
| 215 |
+
# j += 1
|
| 216 |
+
|
| 217 |
+
# stats = {
|
| 218 |
+
# "trackA_total": len(a),
|
| 219 |
+
# "trackB_total": len(b),
|
| 220 |
+
# "aligned_pairs": len(aligned),
|
| 221 |
+
# "trackA_unmatched": len(a) - len(matched_a),
|
| 222 |
+
# "trackB_unmatched": len(b) - len(matched_b),
|
| 223 |
+
# "max_mid_diff_sec": max_mid_diff,
|
| 224 |
+
# }
|
| 225 |
+
# return aligned, stats
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
# # -----------------------------
|
| 229 |
+
# # Audio processing / QC
|
| 230 |
+
# # -----------------------------
|
| 231 |
+
# def load_audio(path: str) -> AudioSegment:
|
| 232 |
+
# return AudioSegment.from_file(path)
|
| 233 |
+
|
| 234 |
+
# def segment_to_wav_file(audio: AudioSegment, start_s: float, end_s: float) -> str:
|
| 235 |
+
# start_ms = max(0, int(start_s * 1000))
|
| 236 |
+
# end_ms = max(start_ms + 1, int(end_s * 1000))
|
| 237 |
+
# seg = audio[start_ms:end_ms]
|
| 238 |
+
# tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
| 239 |
+
# tmp.close()
|
| 240 |
+
# seg.export(tmp.name, format="wav")
|
| 241 |
+
# return tmp.name
|
| 242 |
+
|
| 243 |
+
# def compute_dbfs(audio: AudioSegment) -> float:
|
| 244 |
+
# v = audio.dBFS
|
| 245 |
+
# if v == float("-inf"):
|
| 246 |
+
# return -120.0
|
| 247 |
+
# return float(v)
|
| 248 |
+
|
| 249 |
+
# def qc_on_aligned_segments(
|
| 250 |
+
# audio_a: AudioSegment,
|
| 251 |
+
# audio_b: AudioSegment,
|
| 252 |
+
# aligned: List[Dict[str, Any]],
|
| 253 |
+
# silence_dbfs_threshold: float = -50.0,
|
| 254 |
+
# low_dbfs_threshold: float = -40.0,
|
| 255 |
+
# ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
|
| 256 |
+
# issues: List[Dict[str, Any]] = []
|
| 257 |
+
# a_levels = []
|
| 258 |
+
# b_levels = []
|
| 259 |
+
|
| 260 |
+
# for seg in aligned:
|
| 261 |
+
# a_start, a_end = seg["en_start"], seg["en_end"]
|
| 262 |
+
# b_start, b_end = seg["zh_start"], seg["zh_end"]
|
| 263 |
+
|
| 264 |
+
# a = audio_a[int(a_start * 1000) : int(a_end * 1000)]
|
| 265 |
+
# b = audio_b[int(b_start * 1000) : int(b_end * 1000)]
|
| 266 |
+
|
| 267 |
+
# a_dbfs = compute_dbfs(a)
|
| 268 |
+
# b_dbfs = compute_dbfs(b)
|
| 269 |
+
|
| 270 |
+
# a_levels.append(a_dbfs)
|
| 271 |
+
# b_levels.append(b_dbfs)
|
| 272 |
+
|
| 273 |
+
# problems = []
|
| 274 |
+
# if a_dbfs <= silence_dbfs_threshold:
|
| 275 |
+
# problems.append("Track A 静音/近静音")
|
| 276 |
+
# elif a_dbfs <= low_dbfs_threshold:
|
| 277 |
+
# problems.append("Track A 音量偏低")
|
| 278 |
+
|
| 279 |
+
# if b_dbfs <= silence_dbfs_threshold:
|
| 280 |
+
# problems.append("Track B 静音/近静音")
|
| 281 |
+
# elif b_dbfs <= low_dbfs_threshold:
|
| 282 |
+
# problems.append("Track B 音量偏低")
|
| 283 |
+
|
| 284 |
+
# if abs(a_dbfs - b_dbfs) >= 12.0:
|
| 285 |
+
# problems.append("两路音量差异过大(≥12dB)")
|
| 286 |
+
|
| 287 |
+
# if problems:
|
| 288 |
+
# issues.append(
|
| 289 |
+
# {
|
| 290 |
+
# "segment": seg["idx"],
|
| 291 |
+
# "time": f"{seg['start']:.2f}-{seg['end']:.2f}s",
|
| 292 |
+
# "A_time": f"{a_start:.2f}-{a_end:.2f}s",
|
| 293 |
+
# "B_time": f"{b_start:.2f}-{b_end:.2f}s",
|
| 294 |
+
# "problems": problems,
|
| 295 |
+
# "text_A": seg["text_en"][:120] + ("..." if len(seg["text_en"]) > 120 else ""),
|
| 296 |
+
# "text_B": seg["text_zh"][:120] + ("..." if len(seg["text_zh"]) > 120 else ""),
|
| 297 |
+
# "A_dbfs": round(a_dbfs, 1),
|
| 298 |
+
# "B_dbfs": round(b_dbfs, 1),
|
| 299 |
+
# }
|
| 300 |
+
# )
|
| 301 |
|
| 302 |
+
# def _safe_mean(xs):
|
| 303 |
+
# return float(np.mean(xs)) if xs else 0.0
|
| 304 |
+
|
| 305 |
+
# qc_stats = {
|
| 306 |
+
# "aligned_pairs": len(aligned),
|
| 307 |
+
# "issue_segments": len(issues),
|
| 308 |
+
# "A_avg_dbfs": round(_safe_mean(a_levels), 1),
|
| 309 |
+
# "B_avg_dbfs": round(_safe_mean(b_levels), 1),
|
| 310 |
+
# "A_min_dbfs": round(float(np.min(a_levels)), 1) if a_levels else None,
|
| 311 |
+
# "B_min_dbfs": round(float(np.min(b_levels)), 1) if b_levels else None,
|
| 312 |
+
# "silence_dbfs_threshold": silence_dbfs_threshold,
|
| 313 |
+
# "low_dbfs_threshold": low_dbfs_threshold,
|
| 314 |
+
# }
|
| 315 |
+
# return issues, qc_stats
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
# # -----------------------------
|
| 319 |
+
# # Interactive HTML table (buttons per row)
|
| 320 |
+
# # -----------------------------
|
| 321 |
+
# def _escape_html(s: str) -> str:
|
| 322 |
+
# return (s or "").replace("&", "&").replace("<", "<").replace(">", ">")
|
| 323 |
+
|
| 324 |
+
# def build_interactive_table_html(aligned: List[Dict[str, Any]]) -> str:
|
| 325 |
+
# max_rows = 2000
|
| 326 |
+
# aligned_view = aligned[:max_rows]
|
| 327 |
+
|
| 328 |
+
# rows_html = []
|
| 329 |
+
# for seg in aligned_view:
|
| 330 |
+
# idx = seg["idx"]
|
| 331 |
+
# t = f'{seg["start"]:.2f}-{seg["end"]:.2f}'
|
| 332 |
+
# a_t = f'{seg["en_start"]:.2f}-{seg["en_end"]:.2f}'
|
| 333 |
+
# b_t = f'{seg["zh_start"]:.2f}-{seg["zh_end"]:.2f}'
|
| 334 |
+
# a_txt = _escape_html(seg["text_en"])
|
| 335 |
+
# b_txt = _escape_html(seg["text_zh"])
|
| 336 |
+
|
| 337 |
+
# a_lang = _escape_html(seg.get("en_lang_label") or "-")
|
| 338 |
+
# b_lang = _escape_html(seg.get("zh_lang_label") or "-")
|
| 339 |
+
# a_conf = seg.get("en_lang_prob")
|
| 340 |
+
# b_conf = seg.get("zh_lang_prob")
|
| 341 |
+
# a_conf_s = f"{a_conf:.3f}" if isinstance(a_conf, (int, float)) else "-"
|
| 342 |
+
# b_conf_s = f"{b_conf:.3f}" if isinstance(b_conf, (int, float)) else "-"
|
| 343 |
+
|
| 344 |
+
# a_bad = seg.get("en_lang_mismatch", False)
|
| 345 |
+
# b_bad = seg.get("zh_lang_mismatch", False)
|
| 346 |
+
# a_cls = "bad" if a_bad else "ok"
|
| 347 |
+
# b_cls = "bad" if b_bad else "ok"
|
| 348 |
+
|
| 349 |
+
# btns = (
|
| 350 |
+
# f'<button class="segbtn a" onclick="__playSeg({idx}, \'a\')">A</button>'
|
| 351 |
+
# f'<button class="segbtn b" onclick="__playSeg({idx}, \'b\')">B</button>'
|
| 352 |
+
# f'<button class="segbtn both" onclick="__playSeg({idx}, \'both\')">同时</button>'
|
| 353 |
+
# )
|
| 354 |
|
| 355 |
+
# rows_html.append(
|
| 356 |
+
# f"""
|
| 357 |
+
# <tr>
|
| 358 |
+
# <td class="mono">{idx:04d}</td>
|
| 359 |
+
# <td class="mono">{t}</td>
|
| 360 |
+
# <td class="btncell">{btns}</td>
|
| 361 |
+
# <td class="textcell">{a_txt}</td>
|
| 362 |
+
# <td class="textcell">{b_txt}</td>
|
| 363 |
+
# <td class="mono">{a_t}</td>
|
| 364 |
+
# <td class="mono">{b_t}</td>
|
| 365 |
+
# <td class="mono {a_cls}">{a_lang} ({a_conf_s})</td>
|
| 366 |
+
# <td class="mono {b_cls}">{b_lang} ({b_conf_s})</td>
|
| 367 |
+
# </tr>
|
| 368 |
+
# """.strip()
|
| 369 |
+
# )
|
| 370 |
|
| 371 |
+
# # IMPORTANT FIX:
|
| 372 |
+
# # - __qs must be `#${id}`, not `#${{id}}`
|
| 373 |
+
# # - action_box/action_btn must exist in DOM (we keep them and hide via CSS)
|
| 374 |
+
# # - Wait for audio src to change before play (avoid playing old segment)
|
| 375 |
+
# table_html = f"""
|
| 376 |
+
# <div class="segwrap">
|
| 377 |
+
# <div class="note">
|
| 378 |
+
# 点击每行按钮可生成并播放片段(A / B / 同时)。若浏览器阻止自动播放,请在右侧播放器手动点一次播放键。
|
| 379 |
+
# </div>
|
| 380 |
+
# <div class="tablewrap">
|
| 381 |
+
# <table class="segtable">
|
| 382 |
+
# <thead>
|
| 383 |
+
# <tr>
|
| 384 |
+
# <th>#</th>
|
| 385 |
+
# <th>Global(s)</th>
|
| 386 |
+
# <th>Play</th>
|
| 387 |
+
# <th>Track A</th>
|
| 388 |
+
# <th>Track B</th>
|
| 389 |
+
# <th>A time</th>
|
| 390 |
+
# <th>B time</th>
|
| 391 |
+
# <th>A Lang</th>
|
| 392 |
+
# <th>B Lang</th>
|
| 393 |
+
# </tr>
|
| 394 |
+
# </thead>
|
| 395 |
+
# <tbody>
|
| 396 |
+
# {"".join(rows_html)}
|
| 397 |
+
# </tbody>
|
| 398 |
+
# </table>
|
| 399 |
+
# </div>
|
| 400 |
+
# </div>
|
| 401 |
+
|
| 402 |
+
# <style>
|
| 403 |
+
# .segwrap {{ margin-top: 8px; }}
|
| 404 |
+
# .note {{ font-size: 12px; opacity: 0.85; margin-bottom: 8px; }}
|
| 405 |
+
# .tablewrap {{ overflow: auto; max-height: 560px; border: 1px solid rgba(127,127,127,0.25); border-radius: 8px; }}
|
| 406 |
+
# table.segtable {{ width: 100%; border-collapse: collapse; }}
|
| 407 |
+
# table.segtable th, table.segtable td {{ border-bottom: 1px solid rgba(127,127,127,0.18); padding: 8px; vertical-align: top; }}
|
| 408 |
+
# table.segtable thead th {{ position: sticky; top: 0; background: rgba(250,250,250,0.95); z-index: 1; }}
|
| 409 |
+
# .mono {{ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; white-space: nowrap; }}
|
| 410 |
+
# .textcell {{ min-width: 320px; white-space: pre-wrap; }}
|
| 411 |
+
# .btncell {{ white-space: nowrap; min-width: 150px; }}
|
| 412 |
+
# .segbtn {{
|
| 413 |
+
# padding: 4px 10px;
|
| 414 |
+
# margin-right: 6px;
|
| 415 |
+
# border-radius: 8px;
|
| 416 |
+
# border: 1px solid rgba(127,127,127,0.35);
|
| 417 |
+
# background: white;
|
| 418 |
+
# cursor: pointer;
|
| 419 |
+
# font-size: 12px;
|
| 420 |
+
# }}
|
| 421 |
+
# .segbtn:hover {{ background: rgba(0,0,0,0.04); }}
|
| 422 |
+
# .segbtn.both {{ font-weight: 800; }}
|
| 423 |
+
# .ok {{ color: inherit; }}
|
| 424 |
+
# .bad {{ color: #b00020; font-weight: 800; }}
|
| 425 |
+
# </style>
|
| 426 |
+
|
| 427 |
+
# <script>
|
| 428 |
+
# function __gradioAppRoot() {{
|
| 429 |
+
# const ga = document.querySelector("gradio-app");
|
| 430 |
+
# return ga ? ga.shadowRoot : document;
|
| 431 |
+
# }}
|
| 432 |
+
|
| 433 |
+
# function __qs(id) {{
|
| 434 |
+
# const root = __gradioAppRoot();
|
| 435 |
+
# return root.querySelector(`#${{id}}`);
|
| 436 |
+
# }}
|
| 437 |
+
|
| 438 |
+
# function __setTextboxValue(elemId, value) {{
|
| 439 |
+
# const box = __qs(elemId);
|
| 440 |
+
# if (!box) return false;
|
| 441 |
+
# const input = box.querySelector("textarea, input");
|
| 442 |
+
# if (!input) return false;
|
| 443 |
+
# input.value = value;
|
| 444 |
+
# input.dispatchEvent(new Event("input", {{ bubbles: true }}));
|
| 445 |
+
# return true;
|
| 446 |
+
# }}
|
| 447 |
+
|
| 448 |
+
# function __clickButton(elemId) {{
|
| 449 |
+
# const btn = __qs(elemId);
|
| 450 |
+
# if (!btn) return false;
|
| 451 |
+
# const realBtn = btn.querySelector("button");
|
| 452 |
+
# if (!realBtn) return false;
|
| 453 |
+
# realBtn.click();
|
| 454 |
+
# return true;
|
| 455 |
+
# }}
|
| 456 |
+
|
| 457 |
+
# function __getAudioTag(containerElemId) {{
|
| 458 |
+
# const c = __qs(containerElemId);
|
| 459 |
+
# if (!c) return null;
|
| 460 |
+
# return c.querySelector("audio");
|
| 461 |
+
# }}
|
| 462 |
+
|
| 463 |
+
# window.__desiredPlayMode = null; // "a" | "b" | "both"
|
| 464 |
+
# window.__desiredPlayNonce = 0;
|
| 465 |
+
|
| 466 |
+
# function __waitAndPlay(nonce, prevASrc, prevBSrc) {{
|
| 467 |
+
# let tries = 0;
|
| 468 |
+
# const maxTries = 80; // ~16s
|
| 469 |
+
# const intervalMs = 200;
|
| 470 |
+
|
| 471 |
+
# const timer = setInterval(() => {{
|
| 472 |
+
# tries += 1;
|
| 473 |
+
# if (window.__desiredPlayNonce !== nonce) {{
|
| 474 |
+
# clearInterval(timer);
|
| 475 |
+
# return;
|
| 476 |
+
# }}
|
| 477 |
+
|
| 478 |
+
# const aAudio = __getAudioTag("a_audio_seg");
|
| 479 |
+
# const bAudio = __getAudioTag("b_audio_seg");
|
| 480 |
+
|
| 481 |
+
# if (!aAudio && !bAudio) {{
|
| 482 |
+
# if (tries >= maxTries) clearInterval(timer);
|
| 483 |
+
# return;
|
| 484 |
+
# }}
|
| 485 |
+
|
| 486 |
+
# const mode = window.__desiredPlayMode;
|
| 487 |
+
|
| 488 |
+
# const aReady = aAudio && (aAudio.src && aAudio.src !== prevASrc) && aAudio.readyState >= 2;
|
| 489 |
+
# const bReady = bAudio && (bAudio.src && bAudio.src !== prevBSrc) && bAudio.readyState >= 2;
|
| 490 |
+
|
| 491 |
+
# const canPlay =
|
| 492 |
+
# (mode === "a" && aReady) ||
|
| 493 |
+
# (mode === "b" && bReady) ||
|
| 494 |
+
# (mode === "both" && aReady && bReady);
|
| 495 |
+
|
| 496 |
+
# if (canPlay) {{
|
| 497 |
+
# try {{
|
| 498 |
+
# if (mode === "a") {{
|
| 499 |
+
# if (bAudio) bAudio.pause();
|
| 500 |
+
# aAudio.currentTime = 0;
|
| 501 |
+
# aAudio.play();
|
| 502 |
+
# }} else if (mode === "b") {{
|
| 503 |
+
# if (aAudio) aAudio.pause();
|
| 504 |
+
# bAudio.currentTime = 0;
|
| 505 |
+
# bAudio.play();
|
| 506 |
+
# }} else if (mode === "both") {{
|
| 507 |
+
# aAudio.currentTime = 0;
|
| 508 |
+
# bAudio.currentTime = 0;
|
| 509 |
+
# aAudio.play();
|
| 510 |
+
# bAudio.play();
|
| 511 |
+
# }}
|
| 512 |
+
# }} catch (e) {{}}
|
| 513 |
+
# clearInterval(timer);
|
| 514 |
+
# return;
|
| 515 |
+
# }}
|
| 516 |
+
|
| 517 |
+
# if (tries >= maxTries) {{
|
| 518 |
+
# clearInterval(timer);
|
| 519 |
+
# }}
|
| 520 |
+
# }}, intervalMs);
|
| 521 |
+
# }}
|
| 522 |
+
|
| 523 |
+
# function __playSeg(idx, mode) {{
|
| 524 |
+
# window.__desiredPlayMode = mode;
|
| 525 |
+
# window.__desiredPlayNonce += 1;
|
| 526 |
+
# const nonce = window.__desiredPlayNonce;
|
| 527 |
+
|
| 528 |
+
# const aAudio = __getAudioTag("a_audio_seg");
|
| 529 |
+
# const bAudio = __getAudioTag("b_audio_seg");
|
| 530 |
+
# const prevASrc = aAudio ? aAudio.src : "";
|
| 531 |
+
# const prevBSrc = bAudio ? bAudio.src : "";
|
| 532 |
+
|
| 533 |
+
# const payload = JSON.stringify({{ idx: idx, mode: mode }});
|
| 534 |
+
# const ok1 = __setTextboxValue("action_box", payload);
|
| 535 |
+
# const ok2 = __clickButton("action_btn");
|
| 536 |
+
|
| 537 |
+
# if (ok1 && ok2) {{
|
| 538 |
+
# __waitAndPlay(nonce, prevASrc, prevBSrc);
|
| 539 |
+
# }} else {{
|
| 540 |
+
# console.warn("Failed to trigger backend action.", ok1, ok2);
|
| 541 |
+
# }}
|
| 542 |
+
# }}
|
| 543 |
+
# </script>
|
| 544 |
+
# """.strip()
|
| 545 |
+
|
| 546 |
+
# if len(aligned) > max_rows:
|
| 547 |
+
# table_html = (
|
| 548 |
+
# f"<div style='margin:8px 0;font-size:12px;opacity:.85;'>"
|
| 549 |
+
# f"提示:对齐片段数为 {len(aligned)},为保证渲染性能仅展示前 {max_rows} 行。</div>\n"
|
| 550 |
+
# + table_html
|
| 551 |
+
# )
|
| 552 |
+
# return table_html
|
| 553 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 554 |
|
| 555 |
+
# # -----------------------------
|
| 556 |
+
# # Segment generation (core fix: global window + offset)
|
| 557 |
+
# # -----------------------------
|
| 558 |
+
# def _pick_window(seg: Dict[str, Any], mode: str, offset_a: float, offset_b: float) -> Tuple[float, float, float, float]:
|
| 559 |
+
# """
|
| 560 |
+
# mode:
|
| 561 |
+
# - "global": use seg["start"]/["end"] for BOTH tracks
|
| 562 |
+
# - "per_track": use seg["en_start"]/["en_end"] for A, seg["zh_start"]/["zh_end"] for B
|
| 563 |
+
# offsets are applied per track.
|
| 564 |
+
# Returns: (a_start, a_end, b_start, b_end)
|
| 565 |
+
# """
|
| 566 |
+
# if mode == "per_track":
|
| 567 |
+
# a_start, a_end = seg["en_start"], seg["en_end"]
|
| 568 |
+
# b_start, b_end = seg["zh_start"], seg["zh_end"]
|
| 569 |
+
# else:
|
| 570 |
+
# # global window (recommended)
|
| 571 |
+
# a_start, a_end = seg["start"], seg["end"]
|
| 572 |
+
# b_start, b_end = seg["start"], seg["end"]
|
| 573 |
+
|
| 574 |
+
# a_start = max(0.0, a_start + float(offset_a))
|
| 575 |
+
# a_end = max(a_start + 0.01, a_end + float(offset_a))
|
| 576 |
+
# b_start = max(0.0, b_start + float(offset_b))
|
| 577 |
+
# b_end = max(b_start + 0.01, b_end + float(offset_b))
|
| 578 |
+
# return a_start, a_end, b_start, b_end
|
| 579 |
+
|
| 580 |
+
|
| 581 |
+
# # -----------------------------
|
| 582 |
+
# # Gradio callbacks
|
| 583 |
+
# # -----------------------------
|
| 584 |
+
# def parse_align_and_qc(
|
| 585 |
+
# audio_a_path: Optional[str],
|
| 586 |
+
# audio_b_path: Optional[str],
|
| 587 |
+
# sub_a_path: Optional[str],
|
| 588 |
+
# sub_b_path: Optional[str],
|
| 589 |
+
# declared_a_lang: str,
|
| 590 |
+
# declared_b_lang: str,
|
| 591 |
+
# crop_mode: str,
|
| 592 |
+
# offset_a: float,
|
| 593 |
+
# offset_b: float,
|
| 594 |
+
# max_mid_diff: float,
|
| 595 |
+
# silence_th: float,
|
| 596 |
+
# low_th: float,
|
| 597 |
+
# ):
|
| 598 |
+
# if not audio_a_path or not audio_b_path:
|
| 599 |
+
# return (
|
| 600 |
+
# [],
|
| 601 |
+
# "",
|
| 602 |
+
# {"error": "请同时上传 Track A 与 Track B 音频。"},
|
| 603 |
+
# [],
|
| 604 |
+
# None,
|
| 605 |
+
# None,
|
| 606 |
+
# gr.update(choices=[], value=None),
|
| 607 |
+
# None,
|
| 608 |
+
# )
|
| 609 |
|
| 610 |
+
# try:
|
| 611 |
+
# cues_a = parse_subtitle_file(sub_a_path) if sub_a_path else []
|
| 612 |
+
# cues_b = parse_subtitle_file(sub_b_path) if sub_b_path else []
|
| 613 |
+
|
| 614 |
+
# if not cues_a or not cues_b:
|
| 615 |
+
# return (
|
| 616 |
+
# [],
|
| 617 |
+
# "",
|
| 618 |
+
# {"error": "请同时提供两路字幕(SRT/VTT)。", "A_cues": len(cues_a), "B_cues": len(cues_b)},
|
| 619 |
+
# [],
|
| 620 |
+
# audio_a_path,
|
| 621 |
+
# audio_b_path,
|
| 622 |
+
# gr.update(choices=[], value=None),
|
| 623 |
+
# None,
|
| 624 |
+
# )
|
| 625 |
|
| 626 |
+
# # overall language stats
|
| 627 |
+
# a_lang_stats = analyze_language_for_cues(cues_a)
|
| 628 |
+
# b_lang_stats = analyze_language_for_cues(cues_b)
|
| 629 |
+
|
| 630 |
+
# aligned, align_stats = align_by_time(cues_a, cues_b, max_mid_diff=max_mid_diff)
|
| 631 |
+
|
| 632 |
+
# # per-segment language + mismatch issues
|
| 633 |
+
# lang_mismatch_issues = []
|
| 634 |
+
# for seg in aligned:
|
| 635 |
+
# da = detect_language_label(seg["text_en"])
|
| 636 |
+
# db = detect_language_label(seg["text_zh"])
|
| 637 |
+
# seg["en_lang_label"] = da.get("label")
|
| 638 |
+
# seg["en_lang_prob"] = da.get("raw_prob")
|
| 639 |
+
# seg["en_lang_norm"] = da.get("normalized_label")
|
| 640 |
+
# seg["zh_lang_label"] = db.get("label")
|
| 641 |
+
# seg["zh_lang_prob"] = db.get("raw_prob")
|
| 642 |
+
# seg["zh_lang_norm"] = db.get("normalized_label")
|
| 643 |
+
|
| 644 |
+
# seg["en_lang_mismatch"] = check_declared_mismatch(declared_a_lang, seg["en_lang_norm"])
|
| 645 |
+
# seg["zh_lang_mismatch"] = check_declared_mismatch(declared_b_lang, seg["zh_lang_norm"])
|
| 646 |
+
|
| 647 |
+
# if seg["en_lang_mismatch"] or seg["zh_lang_mismatch"]:
|
| 648 |
+
# problems = []
|
| 649 |
+
# if seg["en_lang_mismatch"]:
|
| 650 |
+
# problems.append(f"Track A 声明={declared_a_lang} 检测={seg['en_lang_label']}")
|
| 651 |
+
# if seg["zh_lang_mismatch"]:
|
| 652 |
+
# problems.append(f"Track B 声明={declared_b_lang} 检测={seg['zh_lang_label']}")
|
| 653 |
+
# lang_mismatch_issues.append(
|
| 654 |
+
# {
|
| 655 |
+
# "segment": seg["idx"],
|
| 656 |
+
# "time": f"{seg['start']:.2f}-{seg['end']:.2f}s",
|
| 657 |
+
# "type": "LanguageMismatch",
|
| 658 |
+
# "problems": problems,
|
| 659 |
+
# "text_A": seg["text_en"][:120] + ("..." if len(seg["text_en"]) > 120 else ""),
|
| 660 |
+
# "text_B": seg["text_zh"][:120] + ("..." if len(seg["text_zh"]) > 120 else ""),
|
| 661 |
+
# }
|
| 662 |
+
# )
|
| 663 |
+
|
| 664 |
+
# # dataframe rows
|
| 665 |
+
# rows = []
|
| 666 |
+
# for seg in aligned:
|
| 667 |
+
# rows.append(
|
| 668 |
+
# [
|
| 669 |
+
# seg["idx"],
|
| 670 |
+
# f'{seg["start"]:.2f}',
|
| 671 |
+
# f'{seg["end"]:.2f}',
|
| 672 |
+
# seg["text_en"],
|
| 673 |
+
# seg["text_zh"],
|
| 674 |
+
# f'{seg["en_start"]:.2f}-{seg["en_end"]:.2f}',
|
| 675 |
+
# f'{seg["zh_start"]:.2f}-{seg["zh_end"]:.2f}',
|
| 676 |
+
# ]
|
| 677 |
+
# )
|
| 678 |
|
| 679 |
+
# # audio QC (still based on per-track subtitle times; keep original semantics)
|
| 680 |
+
# audio_a = load_audio(audio_a_path)
|
| 681 |
+
# audio_b = load_audio(audio_b_path)
|
| 682 |
+
# issues_qc, qc_stats = qc_on_aligned_segments(
|
| 683 |
+
# audio_a, audio_b, aligned,
|
| 684 |
+
# silence_dbfs_threshold=silence_th,
|
| 685 |
+
# low_dbfs_threshold=low_th
|
| 686 |
+
# )
|
| 687 |
|
| 688 |
+
# issues_all = lang_mismatch_issues + issues_qc
|
| 689 |
+
|
| 690 |
+
# stats = {
|
| 691 |
+
# "alignment": align_stats,
|
| 692 |
+
# "qc": qc_stats,
|
| 693 |
+
# "language": {
|
| 694 |
+
# "declared": {"TrackA": declared_a_lang, "TrackB": declared_b_lang},
|
| 695 |
+
# "detected_overall": {
|
| 696 |
+
# "TrackA_dominant_norm": a_lang_stats["dominant_norm"],
|
| 697 |
+
# "TrackB_dominant_norm": b_lang_stats["dominant_norm"],
|
| 698 |
+
# "TrackA_counts": a_lang_stats["counts"],
|
| 699 |
+
# "TrackB_counts": b_lang_stats["counts"],
|
| 700 |
+
# },
|
| 701 |
+
# "segment_mismatch_count": len(lang_mismatch_issues),
|
| 702 |
+
# },
|
| 703 |
+
# "segment_crop": {
|
| 704 |
+
# "mode": crop_mode,
|
| 705 |
+
# "offset_a_sec": offset_a,
|
| 706 |
+
# "offset_b_sec": offset_b,
|
| 707 |
+
# "note": "若对比播放不对应:优先使用“对齐全局时间(推荐)”,并微调 Track A/B 时间偏移。",
|
| 708 |
+
# }
|
| 709 |
+
# }
|
| 710 |
+
|
| 711 |
+
# choices = [f'{seg["idx"]:04d} | {seg["start"]:.2f}-{seg["end"]:.2f}s' for seg in aligned]
|
| 712 |
+
# selector_update = gr.update(choices=choices, value=(choices[0] if choices else None))
|
| 713 |
+
|
| 714 |
+
# state = {
|
| 715 |
+
# "aligned": aligned,
|
| 716 |
+
# "audio_a_path": audio_a_path,
|
| 717 |
+
# "audio_b_path": audio_b_path,
|
| 718 |
+
# "declared": {"TrackA": declared_a_lang, "TrackB": declared_b_lang},
|
| 719 |
+
# "crop_mode": crop_mode,
|
| 720 |
+
# "offset_a": float(offset_a),
|
| 721 |
+
# "offset_b": float(offset_b),
|
| 722 |
+
# }
|
| 723 |
+
|
| 724 |
+
# html_table = build_interactive_table_html(aligned)
|
| 725 |
+
|
| 726 |
+
# return rows, html_table, stats, issues_all, audio_a_path, audio_b_path, selector_update, state
|
| 727 |
+
|
| 728 |
+
# except Exception as e:
|
| 729 |
+
# return (
|
| 730 |
+
# [],
|
| 731 |
+
# "",
|
| 732 |
+
# {"error": f"解析/对齐失败: {str(e)}"},
|
| 733 |
+
# [{"error": str(e)}],
|
| 734 |
+
# audio_a_path,
|
| 735 |
+
# audio_b_path,
|
| 736 |
+
# gr.update(choices=[], value=None),
|
| 737 |
+
# None,
|
| 738 |
+
# )
|
| 739 |
|
| 740 |
+
# def _parse_selector_value(v: Optional[str]) -> Optional[int]:
|
| 741 |
+
# if not v:
|
| 742 |
+
# return None
|
| 743 |
+
# m = re.match(r"^\s*(\d+)\s*\|", v)
|
| 744 |
+
# if not m:
|
| 745 |
+
# return None
|
| 746 |
+
# return int(m.group(1))
|
| 747 |
|
| 748 |
+
# def make_segment_audio_by_idx(idx: int, state: Optional[Dict[str, Any]]):
|
| 749 |
+
# if not state or "aligned" not in state:
|
| 750 |
+
# return None, None, {"error": "请先完成字幕解析与对齐。"}
|
|
|
|
| 751 |
|
| 752 |
+
# aligned = state["aligned"]
|
| 753 |
+
# if idx < 1 or idx > len(aligned):
|
| 754 |
+
# return None, None, {"error": "片段索引越界。"}
|
| 755 |
|
| 756 |
+
# seg = aligned[idx - 1]
|
| 757 |
+
# audio_a_path = state["audio_a_path"]
|
| 758 |
+
# audio_b_path = state["audio_b_path"]
|
| 759 |
+
# crop_mode = state.get("crop_mode", "global")
|
| 760 |
+
# offset_a = float(state.get("offset_a", 0.0))
|
| 761 |
+
# offset_b = float(state.get("offset_b", 0.0))
|
| 762 |
|
| 763 |
+
# try:
|
| 764 |
+
# audio_a = load_audio(audio_a_path)
|
| 765 |
+
# audio_b = load_audio(audio_b_path)
|
| 766 |
+
|
| 767 |
+
# a_start, a_end, b_start, b_end = _pick_window(seg, crop_mode, offset_a, offset_b)
|
| 768 |
+
|
| 769 |
+
# a_wav = segment_to_wav_file(audio_a, a_start, a_end)
|
| 770 |
+
# b_wav = segment_to_wav_file(audio_b, b_start, b_end)
|
| 771 |
+
|
| 772 |
+
# info = {
|
| 773 |
+
# "segment": idx,
|
| 774 |
+
# "global_time": f'{seg["start"]:.2f}-{seg["end"]:.2f}s',
|
| 775 |
+
# "crop_mode": crop_mode,
|
| 776 |
+
# "offset_a_sec": offset_a,
|
| 777 |
+
# "offset_b_sec": offset_b,
|
| 778 |
+
# "crop_A_time": f"{a_start:.2f}-{a_end:.2f}s",
|
| 779 |
+
# "crop_B_time": f"{b_start:.2f}-{b_end:.2f}s",
|
| 780 |
+
# "subtitle_A_time": f'{seg["en_start"]:.2f}-{seg["en_end"]:.2f}s',
|
| 781 |
+
# "subtitle_B_time": f'{seg["zh_start"]:.2f}-{seg["zh_end"]:.2f}s',
|
| 782 |
+
# "text_A": seg["text_en"],
|
| 783 |
+
# "text_B": seg["text_zh"],
|
| 784 |
+
# }
|
| 785 |
+
# return a_wav, b_wav, info
|
| 786 |
+
# except Exception as e:
|
| 787 |
+
# return None, None, {"error": f"生成片段音频失败: {str(e)}"}
|
| 788 |
+
|
| 789 |
+
# def make_segment_audio(selector_value: Optional[str], state: Optional[Dict[str, Any]]):
|
| 790 |
+
# idx = _parse_selector_value(selector_value)
|
| 791 |
+
# if idx is None:
|
| 792 |
+
# return None, None, {"error": "请选择一个有效片段。"}
|
| 793 |
+
# return make_segment_audio_by_idx(idx, state)
|
| 794 |
+
|
| 795 |
+
# def make_segment_audio_from_action(action_json: str, state: Optional[Dict[str, Any]]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 796 |
# """
|
| 797 |
+
# action_json: {"idx": <int>, "mode": "a"/"b"/"both"}
|
| 798 |
+
# mode is only used by frontend for play policy; backend always returns both segment audios.
|
|
|
|
|
|
|
| 799 |
# """
|
| 800 |
+
# try:
|
| 801 |
+
# payload = json.loads(action_json or "{}")
|
| 802 |
+
# idx = int(payload.get("idx"))
|
| 803 |
+
# except Exception:
|
| 804 |
+
# return None, None, {"error": "动作解析失败(action_json 无效)。"}
|
| 805 |
+
# return make_segment_audio_by_idx(idx, state)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 806 |
|
| 807 |
+
# def clear_all():
|
| 808 |
+
# return (
|
| 809 |
+
# None, None, None, None,
|
| 810 |
+
# "原文(不指定)", "原文(不指定)",
|
| 811 |
+
# "global", 0.0, 0.0,
|
| 812 |
+
# [], "", {}, [],
|
| 813 |
+
# None, None,
|
| 814 |
+
# gr.update(choices=[], value=None),
|
| 815 |
+
# None,
|
| 816 |
+
# None, None, {},
|
| 817 |
+
# "",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
# )
|
| 819 |
|
|
|
|
|
|
|
|
|
|
| 820 |
|
| 821 |
+
# # -----------------------------
|
| 822 |
+
# # UI
|
| 823 |
+
# # -----------------------------
|
| 824 |
+
# CSS = """
|
| 825 |
+
# /* Keep components in DOM for injected JS to locate them */
|
| 826 |
+
# .dom-hidden { display: none !important; }
|
| 827 |
+
# """
|
| 828 |
+
|
| 829 |
+
# with gr.Blocks(css=CSS) as demo:
|
| 830 |
+
# gr.Markdown(
|
| 831 |
+
# """
|
| 832 |
+
# # 可视化语音质检平台(交互逐行播放 + 语言一致性 + 对齐裁剪)
|
| 833 |
+
# - 交互表格每行播放:A / B / 同时
|
| 834 |
+
# - “对齐全局时间(推荐)”可显著减少“对比不对应”的听感问题
|
| 835 |
+
# - “时间偏移”用于修正音频头部静音/时间轴偏差
|
| 836 |
+
# """.strip()
|
| 837 |
+
# )
|
| 838 |
|
| 839 |
+
# state = gr.State(value=None)
|
| 840 |
|
| 841 |
+
# # Must exist in DOM (NOT visible=False)
|
| 842 |
+
# action_box = gr.Textbox(
|
| 843 |
+
# label="__action_box",
|
| 844 |
+
# value="",
|
| 845 |
+
# elem_id="action_box",
|
| 846 |
+
# elem_classes=["dom-hidden"],
|
| 847 |
+
# )
|
| 848 |
+
# action_btn = gr.Button(
|
| 849 |
+
# "__action_btn",
|
| 850 |
+
# elem_id="action_btn",
|
| 851 |
+
# elem_classes=["dom-hidden"],
|
| 852 |
+
# )
|
| 853 |
|
| 854 |
+
# with gr.Row():
|
| 855 |
+
# with gr.Column(scale=1):
|
| 856 |
+
# gr.Markdown("## 1) 上传文件")
|
| 857 |
+
# audio_a = gr.File(
|
| 858 |
+
# label="Track A 音频/视频",
|
| 859 |
+
# file_types=[".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac", ".mp4", ".mov", ".avi"],
|
| 860 |
+
# type="filepath",
|
| 861 |
+
# )
|
| 862 |
+
# audio_b = gr.File(
|
| 863 |
+
# label="Track B 音频/视频",
|
| 864 |
+
# file_types=[".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac", ".mp4", ".mov", ".avi"],
|
| 865 |
+
# type="filepath",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 866 |
# )
|
| 867 |
|
| 868 |
+
# sub_a = gr.File(
|
| 869 |
+
# label="Track A 字幕(.srt/.vtt)",
|
| 870 |
+
# file_types=[".srt", ".vtt", ".txt"],
|
| 871 |
+
# type="filepath",
|
| 872 |
+
# )
|
| 873 |
+
# sub_b = gr.File(
|
| 874 |
+
# label="Track B 字幕(.srt/.vtt)",
|
| 875 |
+
# file_types=[".srt", ".vtt", ".txt"],
|
| 876 |
+
# type="filepath",
|
| 877 |
+
# )
|
| 878 |
|
| 879 |
+
# gr.Markdown("## 2) 声明语言(用于一致性检查)")
|
| 880 |
+
# declared_a_lang = gr.Dropdown(
|
| 881 |
+
# label="Track A 声明语言",
|
| 882 |
+
# choices=LANG_LABELS,
|
| 883 |
+
# value="原文(不指定)",
|
| 884 |
+
# )
|
| 885 |
+
# declared_b_lang = gr.Dropdown(
|
| 886 |
+
# label="Track B 声明语言",
|
| 887 |
+
# choices=LANG_LABELS,
|
| 888 |
+
# value="原文(不指定)",
|
| 889 |
+
# )
|
| 890 |
|
| 891 |
+
# gr.Markdown("## 3) 片段裁剪设置(解决“不对应”)")
|
| 892 |
+
# crop_mode = gr.Radio(
|
| 893 |
+
# label="裁剪基准",
|
| 894 |
+
# choices=[
|
| 895 |
+
# ("对齐全局时间(推荐)", "global"),
|
| 896 |
+
# ("各自字幕时间", "per_track"),
|
| 897 |
+
# ],
|
| 898 |
+
# value="global",
|
| 899 |
+
# )
|
| 900 |
+
# offset_a = gr.Slider(
|
| 901 |
+
# label="Track A 时间偏移(秒,可为负)",
|
| 902 |
+
# minimum=-30.0,
|
| 903 |
+
# maximum=30.0,
|
| 904 |
+
# value=0.0,
|
| 905 |
+
# step=0.1,
|
| 906 |
+
# )
|
| 907 |
+
# offset_b = gr.Slider(
|
| 908 |
+
# label="Track B 时间偏移(秒,可为负)",
|
| 909 |
+
# minimum=-30.0,
|
| 910 |
+
# maximum=30.0,
|
| 911 |
+
# value=0.0,
|
| 912 |
+
# step=0.1,
|
| 913 |
+
# )
|
| 914 |
|
| 915 |
+
# gr.Markdown("## 4) 对齐与质检参数")
|
| 916 |
+
# max_mid_diff = gr.Slider(
|
| 917 |
+
# label="两路字幕对齐阈值(中点时间差,秒)",
|
| 918 |
+
# minimum=0.3,
|
| 919 |
+
# maximum=5.0,
|
| 920 |
+
# value=1.5,
|
| 921 |
+
# step=0.1,
|
| 922 |
+
# )
|
| 923 |
+
# silence_th = gr.Slider(
|
| 924 |
+
# label="静音/近静音阈值(dBFS)",
|
| 925 |
+
# minimum=-80,
|
| 926 |
+
# maximum=-20,
|
| 927 |
+
# value=-50,
|
| 928 |
+
# step=1,
|
| 929 |
+
# )
|
| 930 |
+
# low_th = gr.Slider(
|
| 931 |
+
# label="音量偏低阈值(dBFS)",
|
| 932 |
+
# minimum=-80,
|
| 933 |
+
# maximum=-20,
|
| 934 |
+
# value=-40,
|
| 935 |
+
# step=1,
|
| 936 |
+
# )
|
| 937 |
|
| 938 |
+
# with gr.Row():
|
| 939 |
+
# btn_run = gr.Button("解析对齐 + 质检", variant="primary")
|
| 940 |
+
# btn_clear = gr.Button("清空", variant="secondary")
|
| 941 |
+
|
| 942 |
+
# with gr.Column(scale=2):
|
| 943 |
+
# with gr.Tabs():
|
| 944 |
+
# with gr.TabItem("交互播放表格(推荐)"):
|
| 945 |
+
# interactive_table = gr.HTML()
|
| 946 |
+
|
| 947 |
+
# with gr.TabItem("分段对照表(Dataframe)"):
|
| 948 |
+
# seg_table = gr.Dataframe(
|
| 949 |
+
# headers=["#", "start(s)", "end(s)", "Track A", "Track B", "A time", "B time"],
|
| 950 |
+
# datatype=["number", "str", "str", "str", "str", "str", "str"],
|
| 951 |
+
# interactive=False,
|
| 952 |
+
# wrap=True,
|
| 953 |
+
# )
|
| 954 |
+
|
| 955 |
+
# with gr.TabItem("统计概览"):
|
| 956 |
+
# stats = gr.JSON(label="统计信息(对齐 + QC + 语言一致性 + 裁剪设置)")
|
| 957 |
+
|
| 958 |
+
# with gr.TabItem("问题片段"):
|
| 959 |
+
# issues = gr.JSON(label="问题列表(含 LanguageMismatch)")
|
| 960 |
+
|
| 961 |
+
# with gr.TabItem("音频对比播放"):
|
| 962 |
+
# gr.Markdown("### 原始音频(整段)")
|
| 963 |
+
# player_a_full = gr.Audio(label="Track A 原始音频", interactive=False)
|
| 964 |
+
# player_b_full = gr.Audio(label="Track B 原始音频", interactive=False)
|
| 965 |
+
|
| 966 |
+
# gr.Markdown("### 选择片段并生成对比音频(备用方式)")
|
| 967 |
+
# selector = gr.Dropdown(label="片段选择", choices=[], value=None)
|
| 968 |
+
# btn_make = gr.Button("生成所选片段(两路)", variant="primary")
|
| 969 |
+
|
| 970 |
+
# # elem_id used by injected JS
|
| 971 |
+
# player_a_seg = gr.Audio(label="Track A 片段", interactive=False, elem_id="a_audio_seg")
|
| 972 |
+
# player_b_seg = gr.Audio(label="Track B 片段", interactive=False, elem_id="b_audio_seg")
|
| 973 |
+
# seg_info = gr.JSON(label="片段信息(含裁剪窗口)")
|
| 974 |
+
|
| 975 |
+
# btn_run.click(
|
| 976 |
+
# fn=parse_align_and_qc,
|
| 977 |
+
# inputs=[
|
| 978 |
+
# audio_a, audio_b, sub_a, sub_b,
|
| 979 |
+
# declared_a_lang, declared_b_lang,
|
| 980 |
+
# crop_mode, offset_a, offset_b,
|
| 981 |
+
# max_mid_diff, silence_th, low_th
|
| 982 |
+
# ],
|
| 983 |
+
# outputs=[
|
| 984 |
+
# seg_table, interactive_table, stats, issues,
|
| 985 |
+
# player_a_full, player_b_full,
|
| 986 |
+
# selector, state
|
| 987 |
+
# ],
|
| 988 |
+
# )
|
| 989 |
|
| 990 |
+
# btn_make.click(
|
| 991 |
+
# fn=make_segment_audio,
|
| 992 |
+
# inputs=[selector, state],
|
| 993 |
+
# outputs=[player_a_seg, player_b_seg, seg_info],
|
| 994 |
+
# )
|
| 995 |
|
| 996 |
+
# # triggered by table buttons
|
| 997 |
+
# action_btn.click(
|
| 998 |
+
# fn=make_segment_audio_from_action,
|
| 999 |
+
# inputs=[action_box, state],
|
| 1000 |
+
# outputs=[player_a_seg, player_b_seg, seg_info],
|
| 1001 |
+
# )
|
| 1002 |
|
| 1003 |
+
# btn_clear.click(
|
| 1004 |
+
# fn=clear_all,
|
| 1005 |
+
# inputs=[],
|
| 1006 |
+
# outputs=[
|
| 1007 |
+
# audio_a, audio_b, sub_a, sub_b,
|
| 1008 |
+
# declared_a_lang, declared_b_lang,
|
| 1009 |
+
# crop_mode, offset_a, offset_b,
|
| 1010 |
+
# seg_table, interactive_table, stats, issues,
|
| 1011 |
+
# player_a_full, player_b_full,
|
| 1012 |
+
# selector, state,
|
| 1013 |
+
# player_a_seg, player_b_seg, seg_info,
|
| 1014 |
+
# action_box
|
| 1015 |
+
# ],
|
| 1016 |
+
# )
|
| 1017 |
|
| 1018 |
# if __name__ == "__main__":
|
|
|
|
| 1019 |
# demo.launch()
|
| 1020 |
|
| 1021 |
+
|
| 1022 |
import json
|
| 1023 |
import numpy as np
|
| 1024 |
import gradio as gr
|
| 1025 |
|
| 1026 |
+
from huggingface_hub import (
|
| 1027 |
+
list_repo_files,
|
| 1028 |
+
hf_hub_download,
|
| 1029 |
+
)
|
| 1030 |
|
| 1031 |
import soundfile as sf
|
| 1032 |
|
| 1033 |
+
|
| 1034 |
+
# =====================================================
# Fixed configuration (real paths inside the dataset repo)
# =====================================================
REPO_ID = "AlexTYJ/Multilingual-ASR-Benchmark"   # Hugging Face Hub dataset repository id
REPO_TYPE = "dataset"                            # repo_type passed to every hub API call

AUDIO_DIR = "audio/testbatch/ARE"    # repo-relative directory holding the audio files
JSON_DIR = "text/ref/testbatch/ARE"  # repo-relative directory holding per-audio reference JSON
|
| 1042 |
|
| 1043 |
|
| 1044 |
+
# =====================================================
|
| 1045 |
# 工具函数
|
| 1046 |
+
# =====================================================
|
| 1047 |
|
| 1048 |
def list_are_audio_files():
    """
    List all audio files under the ARE directory of the dataset repo.

    Returns a sorted list of repo-relative paths whose extension is
    .wav / .mp3 / .flac (case-insensitive).
    """
    files = list_repo_files(
        repo_id=REPO_ID,
        repo_type=REPO_TYPE,
    )

    # Match the directory prefix exactly: a bare startswith(AUDIO_DIR)
    # would also accept sibling directories such as "audio/testbatch/AREX/...".
    prefix = AUDIO_DIR.rstrip("/") + "/"
    audio_files = [
        f for f in files
        if f.startswith(prefix)
        and f.lower().endswith((".wav", ".mp3", ".flac"))
    ]
    audio_files.sort()
    return audio_files
|
| 1064 |
|
| 1065 |
|
| 1066 |
+
def load_audio_and_json(audio_path: str):
|
| 1067 |
+
"""
|
| 1068 |
+
给定 audio 路径:
|
| 1069 |
+
- 下载音频
|
| 1070 |
+
- 推导并下载对应 json
|
| 1071 |
+
- 解析 segments
|
| 1072 |
+
"""
|
| 1073 |
filename = audio_path.split("/")[-1]
|
| 1074 |
+
base = filename.rsplit(".", 1)[0]
|
| 1075 |
+
json_path = f"{JSON_DIR}/{base}.json"
|
| 1076 |
|
| 1077 |
# ---- 下载 ----
|
| 1078 |
+
local_audio = hf_hub_download(
|
| 1079 |
+
repo_id=REPO_ID,
|
| 1080 |
+
filename=audio_path,
|
| 1081 |
+
repo_type=REPO_TYPE,
|
| 1082 |
+
)
|
| 1083 |
+
|
| 1084 |
+
local_json = hf_hub_download(
|
| 1085 |
+
repo_id=REPO_ID,
|
| 1086 |
+
filename=json_path,
|
| 1087 |
+
repo_type=REPO_TYPE,
|
| 1088 |
+
)
|
| 1089 |
|
| 1090 |
# ---- 读音频 ----
|
| 1091 |
audio, sr = sf.read(local_audio)
|
| 1092 |
if audio.ndim == 2:
|
| 1093 |
audio = audio.mean(axis=1)
|
| 1094 |
+
audio = audio.astype(np.float32)
|
| 1095 |
|
| 1096 |
# ---- 读 JSON ----
|
| 1097 |
with open(local_json, "r", encoding="utf-8") as f:
|
|
|
|
| 1101 |
for i, s in enumerate(data["segments"]):
|
| 1102 |
segments.append({
|
| 1103 |
"row_id": s.get("index", i),
|
| 1104 |
+
"start": float(s.get("start", 0.0)),
|
| 1105 |
+
"end": float(s.get("end", 0.0)),
|
| 1106 |
+
"dur": float(s.get("end", 0.0) - s.get("start", 0.0)),
|
| 1107 |
"status": s.get("status", ""),
|
| 1108 |
"speaker": s.get("speaker", ""),
|
| 1109 |
"gender": s.get("gender", ""),
|
|
|
|
| 1116 |
|
| 1117 |
|
| 1118 |
def slice_audio(audio, sr, start, end):
    """
    Cut the [start, end) window (in seconds) out of a sample array.

    The window is clamped to the valid sample range, so out-of-bounds
    times never raise.  Returns (sr, samples) ready for a gr.Audio value.
    """
    first = int(start * sr)
    if first < 0:
        first = 0
    last = int(end * sr)
    if last > len(audio):
        last = len(audio)
    return sr, audio[first:last]
|
| 1122 |
|
| 1123 |
|
| 1124 |
+
# =====================================================
|
| 1125 |
+
# Gradio 回调
|
| 1126 |
+
# =====================================================
|
| 1127 |
|
| 1128 |
+
def on_load_audio(audio_path):
|
| 1129 |
audio, sr, segments, audio_name = load_audio_and_json(audio_path)
|
| 1130 |
|
| 1131 |
rows = [
|
|
|
|
| 1147 |
state = {
|
| 1148 |
"audio": audio,
|
| 1149 |
"sr": sr,
|
| 1150 |
+
"segments": segments,
|
| 1151 |
}
|
| 1152 |
|
| 1153 |
return state, rows, info
|
|
|
|
| 1155 |
|
| 1156 |
def on_select_segment(evt: gr.SelectData, state):
    """
    Dataframe row-select callback: play the chosen segment.

    Reads start/end seconds from the selected row, slices the cached
    waveform in *state*, and returns (audio, metadata markdown, text).
    Row layout assumed: [row_id, start, end, dur, status, speaker,
    gender, age_group, emotion, text] — matches on_load_audio's rows.
    """
    row = evt.row_value

    seg_start = float(row[1])
    seg_end = float(row[2])

    sr, audio_seg = slice_audio(
        state["audio"],
        state["sr"],
        seg_start,
        seg_end,
    )

    meta_parts = [
        f"- **speaker**: `{row[5]}`",
        f"- **gender**: `{row[6]}`",
        f"- **age_group**: `{row[7]}`",
        f"- **emotion**: `{row[8]}`",
        f"- **status**: `{row[4]}`",
    ]
    meta = "\n".join(meta_parts)

    # Fall back to a placeholder when the subtitle text is blank.
    text = row[9] if row[9].strip() else "(empty)"

    return (sr, audio_seg), meta, text
|
| 1180 |
|
| 1181 |
|
| 1182 |
+
# =====================================================
|
| 1183 |
# UI
|
| 1184 |
+
# =====================================================
|
| 1185 |
|
| 1186 |
with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
|
| 1187 |
gr.Markdown(
|
| 1188 |
+
"# 🎧 ARE 音频分段可视化(Hugging Face Dataset)\n"
|
| 1189 |
+
"**Dataset**: `AlexTYJ/Multilingual-ASR-Benchmark`\n\n"
|
| 1190 |
+
"从 Hugging Face Hub 直接读取音频与字幕,不需要上传文件。"
|
| 1191 |
)
|
| 1192 |
|
| 1193 |
state = gr.State()
|
|
|
|
| 1195 |
audio_files = list_are_audio_files()
|
| 1196 |
|
| 1197 |
audio_selector = gr.Dropdown(
|
|
|
|
| 1198 |
label="选择音频文件(ARE)",
|
| 1199 |
+
choices=audio_files,
|
| 1200 |
+
value=audio_files[0] if audio_files else None,
|
| 1201 |
)
|
| 1202 |
|
| 1203 |
+
load_btn = gr.Button("加载音频 & 字幕", variant="primary")
|
| 1204 |
info = gr.Markdown()
|
| 1205 |
|
| 1206 |
df = gr.Dataframe(
|
|
|
|
| 1221 |
text = gr.Textbox(label="字幕文本", lines=4)
|
| 1222 |
|
| 1223 |
load_btn.click(
|
| 1224 |
+
on_load_audio,
|
| 1225 |
inputs=audio_selector,
|
| 1226 |
outputs=[state, df, info],
|
| 1227 |
)
|
|
|
|
| 1232 |
outputs=[audio_out, meta, text],
|
| 1233 |
)
|
| 1234 |
|
| 1235 |
+
demo.launch()
|
|
|