# Juna190825's picture
# Update app.py
# 15f0c84 verified
# app.py
import csv
import hashlib
import json
import logging
import os
import shutil
import tempfile

import gradio as gr
from huggingface_hub import HfApi, hf_hub_download, login
# Authenticate against the Hugging Face Hub once at start-up using the token
# supplied through the HF_TOKEN environment variable.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    # An empty value is just as unusable as a missing one, so reject both
    # here instead of letting login() fail later with a less obvious error.
    raise ValueError("HF_TOKEN environment variable not set")
login(token=hf_token)

# Shared Hub client used to list repository files.
api = HfApi()

# Lower-case file extensions (with leading dot) that count as audio files.
AUDIO_EXTS = {".wav", ".mp3", ".flac", ".ogg", ".m4a"}
def list_dataset_files(repo_id: str):
    """List the files of a Hub dataset repo and single out the audio ones.

    Args:
        repo_id: Identifier of the dataset repository to inspect.

    Returns:
        A ``(audio_files, text)`` tuple. ``audio_files`` holds the repo paths
        whose extension (compared case-insensitively) is in ``AUDIO_EXTS``.
        ``text`` is the newline-joined listing of every file, or
        ``"No files found."`` for an empty repo, or ``"Error: ..."`` when
        anything goes wrong (in which case ``audio_files`` is empty).
    """
    try:
        repo_files = api.list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            token=os.getenv("HF_TOKEN"),
        )
        if not repo_files:
            return [], "No files found."

        playable = []
        for name in repo_files:
            _, extension = os.path.splitext(name)
            if extension.lower() in AUDIO_EXTS:
                playable.append(name)

        listing = "\n".join(repo_files)
        return playable, listing
    except Exception as err:
        # Any failure is reported as text instead of being raised.
        return [], f"Error: {err}"
# -----------------------------
# METADATA LOADING
# -----------------------------
def _metadata_key(record):
    """Return the audio path named by a metadata record, or None if it has none."""
    if not isinstance(record, dict):
        return None
    # "audio" and "file" are the keys this app has always accepted;
    # "file_name" is the column used by AudioFolder-style metadata files.
    for field in ("audio", "file", "file_name"):
        value = record.get(field)
        if isinstance(value, str) and value:
            return value
    return None


def _read_metadata_records(path, ext):
    """Yield the raw records stored in a ``.jsonl``, ``.json`` or ``.csv`` file."""
    if ext == ".jsonl":
        with open(path, "r", encoding="utf-8") as f:
            for lineno, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no record.
                    continue
                try:
                    yield json.loads(line)
                except json.JSONDecodeError as err:
                    # One bad line must not discard the rest of the file.
                    logging.getLogger(__name__).warning(
                        "Skipping malformed line %d of %s: %s", lineno, path, err
                    )
    elif ext == ".json":
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # A top-level object is treated as a single record.
        yield from (data if isinstance(data, list) else [data])
    elif ext == ".csv":
        # newline="" is what the csv module requires for correct parsing of
        # quoted fields that contain line breaks.
        with open(path, "r", encoding="utf-8", newline="") as f:
            yield from csv.DictReader(f)


def load_metadata(repo_id):
    """Download and parse the dataset's metadata files, if they exist.

    Args:
        repo_id: Identifier of the dataset repository.

    Returns:
        A dict mapping each record's audio path (taken from its ``"audio"``,
        ``"file"`` or ``"file_name"`` field, in that order) to the full
        record. Empty when no metadata file is available or parseable;
        this function is best-effort and reports problems only via logging.
    """
    log = logging.getLogger(__name__)
    metadata = {}

    # Only the JSON Lines file is probed; the parser below also understands
    # "metadata.json" and "metadata.csv" should they be added to this list.
    candidates = ["metadata.jsonl"]

    for fname in candidates:
        try:
            path = hf_hub_download(
                repo_id=repo_id,
                filename=fname,
                repo_type="dataset",
                token=os.getenv("HF_TOKEN"),
            )
        except Exception as err:
            # Usually the file simply does not exist in the repo: skip it.
            log.debug("Metadata file %s unavailable for %s: %s", fname, repo_id, err)
            continue

        ext = os.path.splitext(fname)[1]
        try:
            for record in _read_metadata_records(path, ext):
                key = _metadata_key(record)
                if key is not None:
                    metadata[key] = record
        except (OSError, ValueError, csv.Error) as err:
            # Keep whatever was parsed before the failure.
            log.warning("Could not fully parse metadata file %s: %s", fname, err)

    return metadata
# -----------------------------
# AUDIO LOADING
# -----------------------------
def load_audio(repo_id, file_path):
    """Download one file from a dataset repo and stage a copy under the temp dir.

    Args:
        repo_id: Identifier of the dataset repository.
        file_path: Path of the audio file inside the repository.

    Returns:
        Path of the staged copy, or ``None`` if the download or copy failed.
    """
    try:
        local_path = hf_hub_download(
            repo_id=repo_id,
            filename=file_path,
            repo_type="dataset",
            token=os.getenv("HF_TOKEN"),
        )

        # Files that share a basename (e.g. "train/001.wav" and
        # "test/001.wav", or the same name in two repos) would overwrite each
        # other if copied straight into the temp dir, so every
        # (repo, path) pair gets its own deterministic sub-directory.
        digest = hashlib.sha256(
            f"{repo_id}/{file_path}".encode("utf-8")
        ).hexdigest()[:16]
        stage_dir = os.path.join(tempfile.gettempdir(), "hf_audio_preview", digest)
        os.makedirs(stage_dir, exist_ok=True)

        # NOTE(review): the copy presumably exists so the UI can serve the
        # file from the temp dir rather than from the Hub cache - confirm.
        safe_path = os.path.join(stage_dir, os.path.basename(local_path))
        shutil.copy(local_path, safe_path)
        return safe_path
    except Exception as err:
        # Callers treat None as "no audio"; log the cause instead of hiding it.
        logging.getLogger(__name__).warning(
            "Could not load %s from %s: %s", file_path, repo_id, err
        )
        return None
# -----------------------------
# COMBINED HANDLER
# -----------------------------
def load_audio_and_metadata(repo_id, file_path):
    """Load an audio file together with the best available description of it.

    The description is looked up in the repo's metadata first; if nothing is
    found there, a text file with the same name but a ``.txt`` extension is
    tried instead.

    Args:
        repo_id: Identifier of the dataset repository.
        file_path: Path of the audio file inside the repository.

    Returns:
        An ``(audio, text)`` tuple. ``audio`` is whatever ``load_audio``
        returned (``None`` on failure or when no file is selected). ``text``
        is the metadata record as pretty-printed JSON, the content of the
        companion ``.txt`` file, or an explanatory message.
    """
    if not file_path:
        return None, "No file selected."

    audio = load_audio(repo_id, file_path)
    metadata = load_metadata(repo_id)

    # Try to match metadata by the file's repo path.
    info = metadata.get(file_path, "")

    if not info:
        # Swap only the final extension. This covers every audio format
        # (and upper-case extensions) and never touches a ".wav"/".mp3"
        # that happens to appear earlier in the path.
        txt_candidate = os.path.splitext(file_path)[0] + ".txt"
        try:
            txt_path = hf_hub_download(
                repo_id=repo_id,
                filename=txt_candidate,
                repo_type="dataset",
                token=os.getenv("HF_TOKEN"),
            )
            with open(txt_path, "r", encoding="utf-8") as f:
                info = f.read()
        except Exception:
            # Best effort: a missing or unreadable companion file just means
            # there is nothing to show.
            info = "No metadata found."

    if isinstance(info, dict):
        # ensure_ascii=False keeps non-Latin transcripts readable instead of
        # turning them into \uXXXX escapes.
        return audio, json.dumps(info, indent=2, ensure_ascii=False)
    return audio, info
# -----------------------------
# NAVIGATION FUNCTIONS
# -----------------------------
def navigate_files(current_file, direction, audio_files):
    """Return the neighbour of ``current_file`` in ``audio_files``, wrapping around.

    Args:
        current_file: The entry currently selected.
        direction: ``"next"`` to step forward, ``"prev"`` to step backward.
        audio_files: Ordered list of selectable entries.

    Returns:
        The entry one step away in the requested direction (the list is
        treated as circular). ``current_file`` is returned unchanged when the
        list is empty, the current entry is not in it, or the direction is
        not recognised.
    """
    # Nothing to navigate without a list or a known starting point.
    if not audio_files:
        return current_file
    if current_file not in audio_files:
        return current_file

    if direction == "next":
        step = 1
    elif direction == "prev":
        step = -1
    else:
        return current_file

    total = len(audio_files)
    position = audio_files.index(current_file)
    # Modulo makes both ends wrap (Python's % is non-negative here).
    return audio_files[(position + step) % total]
def update_file_selection(repo_id, current_file, direction, audio_files_state, autoplay):
    """Step to the previous/next file and load its audio and metadata.

    Args:
        repo_id: Identifier of the dataset repository.
        current_file: The entry currently selected.
        direction: Passed through to ``navigate_files``.
        audio_files_state: List of audio files that can be navigated.
        autoplay: Accepted for the callers' convenience; not used here.

    Returns:
        A ``(new_file, audio, metadata_text)`` tuple. When there are no audio
        files the selection is left as is, ``audio`` is ``None`` and the text
        explains why.
    """
    if audio_files_state:
        target = navigate_files(current_file, direction, audio_files_state)
        audio, metadata = load_audio_and_metadata(repo_id, target)
        return target, audio, metadata

    return current_file, None, "No audio files available."
# -----------------------------
# UI
# -----------------------------
# NOTE(review): the nesting of components inside the rows below was
# reconstructed from statement order - confirm it matches the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🎧 List, Play & View Metadata for HF Dataset Audio")
    repo_input = gr.Textbox(label="Dataset repo_id", value="username/dataset_name")

    # Holds the list of audio files so the Previous/Next buttons can step
    # through it.
    audio_files_state = gr.State([])

    with gr.Row():
        file_list = gr.Dropdown(label="Audio files", choices=[], interactive=True)

    with gr.Row():
        prev_btn = gr.Button("◀ Previous")
        next_btn = gr.Button("Next ▶")
        autoplay_toggle = gr.Checkbox(label="Autoplay", value=True)

    with gr.Row():
        play_audio = gr.Audio(label="Audio Player", autoplay=True)
        metadata_box = gr.Textbox(label="Metadata / Text", lines=10)

    output = gr.Textbox(label="All Files", lines=20)
    btn = gr.Button("List files")

    def update_files(repo_id):
        """List the repo and refresh the dropdown, the file listing and the state."""
        audio_files, all_files_text = list_dataset_files(repo_id)
        return (
            gr.Dropdown(
                choices=audio_files,
                value=audio_files[0] if audio_files else None,
            ),
            all_files_text,
            audio_files,  # new content of audio_files_state
        )

    btn.click(
        update_files,
        inputs=repo_input,
        outputs=[file_list, output, audio_files_state],
    )

    def dropdown_change_handler(repo_id, file_path, autoplay):
        """Load the selected file and apply the current autoplay setting."""
        audio, metadata = load_audio_and_metadata(repo_id, file_path)
        return gr.Audio(value=audio, autoplay=autoplay), metadata

    file_list.change(
        dropdown_change_handler,
        inputs=[repo_input, file_list, autoplay_toggle],
        outputs=[play_audio, metadata_box],
    )

    # NOTE(review): the two button handlers below also write to file_list,
    # which presumably fires file_list.change as well and loads the same file
    # a second time - confirm against the Gradio version in use.
    def prev_button_handler(repo_id, current_file, audio_files_state, autoplay):
        """Select and load the file before the current one."""
        new_file, audio, metadata = update_file_selection(
            repo_id, current_file, "prev", audio_files_state, autoplay
        )
        return new_file, gr.Audio(value=audio, autoplay=autoplay), metadata

    prev_btn.click(
        prev_button_handler,
        inputs=[repo_input, file_list, audio_files_state, autoplay_toggle],
        outputs=[file_list, play_audio, metadata_box],
    )

    def next_button_handler(repo_id, current_file, audio_files_state, autoplay):
        """Select and load the file after the current one."""
        new_file, audio, metadata = update_file_selection(
            repo_id, current_file, "next", audio_files_state, autoplay
        )
        return new_file, gr.Audio(value=audio, autoplay=autoplay), metadata

    next_btn.click(
        next_button_handler,
        inputs=[repo_input, file_list, audio_files_state, autoplay_toggle],
        outputs=[file_list, play_audio, metadata_box],
    )

    def autoplay_changed(autoplay):
        """Apply the autoplay setting to the player without touching its audio.

        Only the ``autoplay`` property is sent back; the player's current
        value is left as it is, so the loaded audio is not pulled through the
        component's input/output conversion just to flip a flag.
        """
        return gr.Audio(autoplay=autoplay)

    autoplay_toggle.change(
        autoplay_changed,
        inputs=[autoplay_toggle],
        outputs=[play_audio],
    )

# Launch only when run as a script so importing this module (tests, tooling)
# does not start a server.
if __name__ == "__main__":
    demo.launch()