unknown committed on
Commit ·
66849b2
1
Parent(s): b6be6a8
Dataset上传
Browse files
app.py
CHANGED
|
@@ -433,78 +433,82 @@
|
|
| 433 |
# demo = build_app()
|
| 434 |
# demo.launch()
|
| 435 |
|
| 436 |
-
import
|
| 437 |
import numpy as np
|
| 438 |
-
|
| 439 |
|
| 440 |
-
|
| 441 |
|
| 442 |
-
|
| 443 |
-
out = []
|
| 444 |
-
for i, s in enumerate(segments):
|
| 445 |
-
out.append({
|
| 446 |
-
"row_id": s.get("index", i),
|
| 447 |
-
"start": float(s.get("start", 0.0)),
|
| 448 |
-
"end": float(s.get("end", 0.0)),
|
| 449 |
-
"dur": float(s.get("end", 0.0)) - float(s.get("start", 0.0)),
|
| 450 |
-
"status": s.get("status", ""),
|
| 451 |
-
"speaker": s.get("speaker", ""),
|
| 452 |
-
"gender": s.get("gender", ""),
|
| 453 |
-
"age_group": s.get("age_group", ""),
|
| 454 |
-
"emotion": s.get("emotion", ""),
|
| 455 |
-
"text": s.get("text", "") or "",
|
| 456 |
-
})
|
| 457 |
-
return out
|
| 458 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
|
| 460 |
-
def slice_audio(audio, sr, start, end):
|
| 461 |
-
s = int(start * sr)
|
| 462 |
-
e = int(end * sr)
|
| 463 |
-
return sr, audio[s:e]
|
| 464 |
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
-
def load_dataset_meta(dataset_name, split):
|
| 469 |
-
ds = load_dataset(dataset_name, split=split)
|
| 470 |
-
return ds, len(ds)
|
| 471 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
|
| 473 |
-
|
| 474 |
-
|
|
|
|
| 475 |
|
| 476 |
-
|
|
|
|
|
|
|
|
|
|
| 477 |
|
| 478 |
-
# ----
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
|
| 486 |
-
|
| 487 |
-
if "segments" in sample:
|
| 488 |
-
segments = sample["segments"]
|
| 489 |
-
elif "transcript" in sample and "segments" in sample["transcript"]:
|
| 490 |
-
segments = sample["transcript"]["segments"]
|
| 491 |
-
else:
|
| 492 |
-
raise ValueError("未找到 segments")
|
| 493 |
|
| 494 |
-
segments = normalize_segments(segments)
|
| 495 |
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
"sr": sr,
|
| 499 |
-
"segments": segments,
|
| 500 |
-
"sample_id": sample.get("id", index),
|
| 501 |
-
}
|
| 502 |
|
| 503 |
|
| 504 |
-
# ==========
|
|
|
|
|
|
|
| 505 |
|
| 506 |
-
def
|
| 507 |
-
|
| 508 |
|
| 509 |
rows = [
|
| 510 |
[
|
|
@@ -512,17 +516,22 @@ def on_load_sample(dataset_name, split, index):
|
|
| 512 |
s["status"], s["speaker"], s["gender"],
|
| 513 |
s["age_group"], s["emotion"], s["text"]
|
| 514 |
]
|
| 515 |
-
for s in
|
| 516 |
]
|
| 517 |
|
| 518 |
info = (
|
| 519 |
-
f"**
|
| 520 |
-
f"**
|
| 521 |
-
f"**
|
| 522 |
-
f"**
|
| 523 |
-
f"**Sample rate**: {state['sr']} Hz"
|
| 524 |
)
|
| 525 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 526 |
return state, rows, info
|
| 527 |
|
| 528 |
|
|
@@ -530,9 +539,7 @@ def on_select_segment(evt: gr.SelectData, state):
|
|
| 530 |
row = evt.row_value
|
| 531 |
start, end = float(row[1]), float(row[2])
|
| 532 |
|
| 533 |
-
sr, audio_seg = slice_audio(
|
| 534 |
-
state["audio"], state["sr"], start, end
|
| 535 |
-
)
|
| 536 |
|
| 537 |
meta = (
|
| 538 |
f"- **speaker**: {row[5]}\n"
|
|
@@ -544,22 +551,27 @@ def on_select_segment(evt: gr.SelectData, state):
|
|
| 544 |
return (sr, audio_seg), meta, row[9]
|
| 545 |
|
| 546 |
|
| 547 |
-
# ==========
|
|
|
|
|
|
|
| 548 |
|
| 549 |
-
with gr.Blocks(title="
|
| 550 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
| 551 |
|
| 552 |
state = gr.State()
|
| 553 |
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
load_btn = gr.Button("加载
|
| 563 |
info = gr.Markdown()
|
| 564 |
|
| 565 |
df = gr.Dataframe(
|
|
@@ -568,20 +580,20 @@ with gr.Blocks(title="HF Dataset Audio Segment Explorer") as demo:
|
|
| 568 |
"status", "speaker", "gender",
|
| 569 |
"age_group", "emotion", "text"
|
| 570 |
],
|
| 571 |
-
interactive=False,
|
| 572 |
wrap=True,
|
| 573 |
-
|
|
|
|
| 574 |
)
|
| 575 |
|
| 576 |
with gr.Row():
|
| 577 |
audio_out = gr.Audio(label="分段播放", type="numpy")
|
| 578 |
meta = gr.Markdown()
|
| 579 |
|
| 580 |
-
text = gr.Textbox(label="
|
| 581 |
|
| 582 |
load_btn.click(
|
| 583 |
-
|
| 584 |
-
inputs=
|
| 585 |
outputs=[state, df, info],
|
| 586 |
)
|
| 587 |
|
|
@@ -592,3 +604,4 @@ with gr.Blocks(title="HF Dataset Audio Segment Explorer") as demo:
|
|
| 592 |
)
|
| 593 |
|
| 594 |
demo.launch()
|
|
|
|
|
|
| 433 |
# demo = build_app()
|
| 434 |
# demo.launch()
|
| 435 |
|
| 436 |
+
import json
|
| 437 |
import numpy as np
|
| 438 |
+
import gradio as gr
|
| 439 |
|
| 440 |
+
from huggingface_hub import hf_hub_download, list_repo_files
|
| 441 |
|
| 442 |
+
import soundfile as sf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
|
| 444 |
+
# =====================
# Fixed configuration (this app's data source)
# =====================
# Hugging Face dataset repo holding the audio and reference transcripts.
REPO_ID = "AlexTYJ/Multilingual-ASR-Benchmark"
# Repo folder containing the ARE audio files (.wav/.mp3/.flac).
AUDIO_DIR = "audio/testbatch/ARE"
# Repo folder with the per-file segment JSON (same basenames as the audio).
JSON_DIR = "text/ref/testbatch/ARE"
|
| 450 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
|
| 452 |
+
# =====================
|
| 453 |
+
# 工具函数
|
| 454 |
+
# =====================
|
| 455 |
|
| 456 |
+
def list_are_audio_files():
    """List the ARE audio files available in the dataset repo.

    Returns:
        list[str]: repo-relative paths under ``AUDIO_DIR`` whose extension is
        .wav/.mp3/.flac (case-insensitive), sorted lexicographically.
    """
    # repo_type="dataset" is required: list_repo_files defaults to the model
    # hub, where this dataset repo does not exist (RepositoryNotFoundError).
    files = list_repo_files(REPO_ID, repo_type="dataset")
    audio_files = [
        f for f in files
        if f.startswith(AUDIO_DIR) and f.lower().endswith((".wav", ".mp3", ".flac"))
    ]
    return sorted(audio_files)
|
| 464 |
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
+
def load_audio_and_json(audio_path):
    """Download one audio file plus its segment JSON and normalize both.

    Args:
        audio_path: repo-relative audio path (as returned by
            ``list_are_audio_files``).

    Returns:
        tuple: ``(audio, sr, segments, audio_name)`` where ``audio`` is a mono
        1-D sample array, ``sr`` the sample rate, ``segments`` a list of dicts
        with keys row_id/start/end/dur/status/speaker/gender/age_group/
        emotion/text, and ``audio_name`` the display name from the JSON
        (falls back to the file name).

    Raises:
        KeyError: if the JSON has no "segments" key or a segment lacks
            "start"/"end".
    """
    # ---- derive the JSON path ----
    # Strip only the LAST extension: chained .replace('.wav', ...) would also
    # rewrite a matching substring elsewhere in the name and silently misses
    # upper-case suffixes that the audio-file filter accepts.
    filename = audio_path.split("/")[-1]
    stem = filename.rsplit(".", 1)[0]
    json_path = f"{JSON_DIR}/{stem}.json"

    # ---- download (repo_type="dataset": default is the model hub) ----
    local_audio = hf_hub_download(REPO_ID, audio_path, repo_type="dataset")
    local_json = hf_hub_download(REPO_ID, json_path, repo_type="dataset")

    # ---- read audio, downmix stereo to mono ----
    audio, sr = sf.read(local_audio)
    if audio.ndim == 2:
        audio = audio.mean(axis=1)

    # ---- read segment metadata ----
    with open(local_json, "r", encoding="utf-8") as f:
        data = json.load(f)

    segments = []
    for i, s in enumerate(data["segments"]):
        segments.append({
            "row_id": s.get("index", i),
            "start": float(s["start"]),
            "end": float(s["end"]),
            "dur": float(s["end"] - s["start"]),
            "status": s.get("status", ""),
            "speaker": s.get("speaker", ""),
            "gender": s.get("gender", ""),
            "age_group": s.get("age_group", ""),
            "emotion": s.get("emotion", ""),
            # coalesce explicit null text to ""
            "text": s.get("text", "") or "",
        })

    return audio, sr, segments, data.get("audio_name", filename)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
|
|
|
|
| 501 |
|
| 502 |
+
def slice_audio(audio, sr, start, end):
    """Cut the sample window covering [start, end) seconds.

    Returns a ``(sr, samples)`` pair suitable for a numpy-typed gr.Audio.
    """
    first = int(start * sr)
    last = int(end * sr)
    return sr, audio[first:last]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
|
| 505 |
|
| 506 |
+
# =====================
|
| 507 |
+
# Gradio 交互逻辑
|
| 508 |
+
# =====================
|
| 509 |
|
| 510 |
+
def on_select_file(audio_path):
|
| 511 |
+
audio, sr, segments, audio_name = load_audio_and_json(audio_path)
|
| 512 |
|
| 513 |
rows = [
|
| 514 |
[
|
|
|
|
| 516 |
s["status"], s["speaker"], s["gender"],
|
| 517 |
s["age_group"], s["emotion"], s["text"]
|
| 518 |
]
|
| 519 |
+
for s in segments
|
| 520 |
]
|
| 521 |
|
| 522 |
info = (
|
| 523 |
+
f"**Repo**: `{REPO_ID}` \n"
|
| 524 |
+
f"**Audio**: `{audio_name}` \n"
|
| 525 |
+
f"**Segments**: {len(segments)} \n"
|
| 526 |
+
f"**Sample rate**: {sr} Hz"
|
|
|
|
| 527 |
)
|
| 528 |
|
| 529 |
+
state = {
|
| 530 |
+
"audio": audio,
|
| 531 |
+
"sr": sr,
|
| 532 |
+
"segments": segments
|
| 533 |
+
}
|
| 534 |
+
|
| 535 |
return state, rows, info
|
| 536 |
|
| 537 |
|
|
|
|
| 539 |
row = evt.row_value
|
| 540 |
start, end = float(row[1]), float(row[2])
|
| 541 |
|
| 542 |
+
sr, audio_seg = slice_audio(state["audio"], state["sr"], start, end)
|
|
|
|
|
|
|
| 543 |
|
| 544 |
meta = (
|
| 545 |
f"- **speaker**: {row[5]}\n"
|
|
|
|
| 551 |
return (sr, audio_seg), meta, row[9]
|
| 552 |
|
| 553 |
|
| 554 |
+
# =====================
|
| 555 |
+
# UI
|
| 556 |
+
# =====================
|
| 557 |
|
| 558 |
+
with gr.Blocks(title="ARE Audio Segment Explorer") as demo:
|
| 559 |
+
gr.Markdown(
|
| 560 |
+
"# 🎧 ARE 音频 & 字幕可视化(Hugging Face Dataset)\n"
|
| 561 |
+
"数据来源:`AlexTYJ/Multilingual-ASR-Benchmark`"
|
| 562 |
+
)
|
| 563 |
|
| 564 |
state = gr.State()
|
| 565 |
|
| 566 |
+
audio_files = list_are_audio_files()
|
| 567 |
+
|
| 568 |
+
audio_selector = gr.Dropdown(
|
| 569 |
+
choices=audio_files,
|
| 570 |
+
label="选择音频文件(ARE)",
|
| 571 |
+
value=audio_files[0] if audio_files else None
|
| 572 |
+
)
|
| 573 |
+
|
| 574 |
+
load_btn = gr.Button("加载", variant="primary")
|
| 575 |
info = gr.Markdown()
|
| 576 |
|
| 577 |
df = gr.Dataframe(
|
|
|
|
| 580 |
"status", "speaker", "gender",
|
| 581 |
"age_group", "emotion", "text"
|
| 582 |
],
|
|
|
|
| 583 |
wrap=True,
|
| 584 |
+
interactive=False,
|
| 585 |
+
max_height=420,
|
| 586 |
)
|
| 587 |
|
| 588 |
with gr.Row():
|
| 589 |
audio_out = gr.Audio(label="分段播放", type="numpy")
|
| 590 |
meta = gr.Markdown()
|
| 591 |
|
| 592 |
+
text = gr.Textbox(label="字幕文本", lines=4)
|
| 593 |
|
| 594 |
load_btn.click(
|
| 595 |
+
on_select_file,
|
| 596 |
+
inputs=audio_selector,
|
| 597 |
outputs=[state, df, info],
|
| 598 |
)
|
| 599 |
|
|
|
|
| 604 |
)
|
| 605 |
|
| 606 |
demo.launch()
|
| 607 |
+
|