# Scraped page header (not part of the program): "Spaces: Sleeping Sleeping"
# app.py
import csv
import hashlib
import json
import os
import shutil
import tempfile

import gradio as gr
from huggingface_hub import HfApi, hf_hub_download, login
# File extensions (lower-case, with the leading dot) treated as audio.
AUDIO_EXTS = {".wav", ".mp3", ".flac", ".ogg", ".m4a"}

# Fail fast when the token is missing, then log in to the Hub and create
# the API client used for repository listings.
hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    raise ValueError("HF_TOKEN environment variable not set")
login(token=hf_token)
api = HfApi()
def list_dataset_files(repo_id: str):
    """List the files of a dataset repo and pick out the audio ones.

    Returns a ``(audio_files, text)`` tuple:

    * on success, ``audio_files`` holds every path whose extension
      (case-insensitive) is in ``AUDIO_EXTS`` and ``text`` is all repo
      paths joined with newlines;
    * for an empty repo, ``([], "No files found.")``;
    * on any error, ``([], "Error: <message>")``.
    """
    try:
        repo_files = api.list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            token=os.getenv("HF_TOKEN"),
        )
        if not repo_files:
            return [], "No files found."

        audio_only = []
        for name in repo_files:
            _, extension = os.path.splitext(name)
            if extension.lower() in AUDIO_EXTS:
                audio_only.append(name)

        listing = "\n".join(repo_files)
        return audio_only, listing
    except Exception as e:
        return [], f"Error: {e}"
# -----------------------------
# METADATA LOADING
# -----------------------------
def _index_metadata_records(records, metadata):
    """Store each dict record in *metadata*, keyed by its audio path."""
    for record in records:
        if not isinstance(record, dict):
            continue  # e.g. a bare string or list line; nothing to index
        # "file_name" is the key used by the Hub's metadata.jsonl convention;
        # it is tried last so existing "audio"/"file" matches are unchanged.
        audio = (
            record.get("audio")
            or record.get("file")
            or record.get("file_name")
        )
        if isinstance(audio, str) and audio:
            metadata[audio] = record


def _iter_jsonl_records(handle):
    """Yield one parsed object per non-blank, well-formed JSONL line."""
    for line in handle:
        line = line.strip()
        if not line:
            continue
        try:
            yield json.loads(line)
        except json.JSONDecodeError:
            # One bad line must not discard every record after it.
            continue


def load_metadata(repo_id):
    """Download and parse the dataset's metadata files, if they exist.

    Args:
        repo_id: Hub dataset repository id.

    Returns:
        A dict mapping an audio path (taken from a record's ``audio``,
        ``file`` or ``file_name`` field) to the full record. The dict is
        empty when no metadata file could be downloaded or parsed.
    """
    metadata = {}
    # Only metadata.jsonl is probed; the .json and .csv parsers below are
    # kept so those names can be added to this list later.
    candidates = ["metadata.jsonl"]
    for fname in candidates:
        try:
            path = hf_hub_download(
                repo_id=repo_id,
                filename=fname,
                repo_type="dataset",
                token=os.getenv("HF_TOKEN"),
            )
        except Exception:
            # Best effort: a missing file (or any Hub error) means
            # "no metadata from this candidate".
            continue

        ext = os.path.splitext(fname)[1]
        try:
            # newline="" is what the csv module expects; it is harmless
            # for the JSON readers.
            with open(path, "r", encoding="utf-8", newline="") as f:
                if ext == ".jsonl":
                    _index_metadata_records(_iter_jsonl_records(f), metadata)
                elif ext == ".json":
                    data = json.load(f)
                    records = data if isinstance(data, list) else []
                    _index_metadata_records(records, metadata)
                elif ext == ".csv":
                    _index_metadata_records(csv.DictReader(f), metadata)
        except (OSError, ValueError, csv.Error):
            # Unreadable or undecodable file: keep whatever was indexed
            # so far and move on to the next candidate.
            continue
    return metadata
# -----------------------------
# AUDIO LOADING
# -----------------------------
def load_audio(repo_id, file_path):
    """Download one audio file and copy it into the system temp directory.

    The copy goes into a subdirectory named after a hash of
    ``repo_id`` and ``file_path``, so files that share a basename
    (``a/001.wav`` vs ``b/001.wav``, or the same name in another repo)
    no longer overwrite each other.

    Args:
        repo_id: Hub dataset repository id.
        file_path: Path of the audio file inside the repository.

    Returns:
        The path of the temp copy, or ``None`` if the download or the
        copy failed for any reason.
    """
    try:
        local_path = hf_hub_download(
            repo_id=repo_id,
            filename=file_path,
            repo_type="dataset",
            token=os.getenv("HF_TOKEN"),
        )
        source_key = f"{repo_id}/{file_path}".encode("utf-8")
        digest = hashlib.sha256(source_key).hexdigest()[:16]
        dest_dir = os.path.join(tempfile.gettempdir(), f"hf_audio_{digest}")
        os.makedirs(dest_dir, exist_ok=True)
        # NOTE(review): the copy out of the Hub cache is presumably there so
        # Gradio serves the file from the temp directory — confirm.
        safe_path = os.path.join(dest_dir, os.path.basename(local_path))
        shutil.copy(local_path, safe_path)
        return safe_path
    except Exception:
        # Best effort for the UI: the caller treats None as "no audio".
        return None
# -----------------------------
# COMBINED HANDLER
# -----------------------------
def load_audio_and_metadata(repo_id, file_path):
    """Load an audio file together with the text to show next to it.

    The text is, in order of preference: the metadata record whose key
    equals ``file_path`` (pretty-printed as JSON), the contents of a
    sibling ``.txt`` file with the same stem, or ``"No metadata found."``.

    Args:
        repo_id: Hub dataset repository id.
        file_path: Path of the audio file inside the repository.

    Returns:
        ``(audio_path_or_None, text)``; ``(None, "No file selected.")``
        when ``file_path`` is empty.
    """
    if not file_path:
        return None, "No file selected."

    audio = load_audio(repo_id, file_path)
    metadata = load_metadata(repo_id)

    info = metadata.get(file_path, "")
    if not info:
        # Swap only the real extension. Chained str.replace() would also
        # rewrite ".wav"/".mp3" elsewhere in the path and would leave
        # .flac/.ogg/.m4a (and upper-case extensions) unchanged, which made
        # the audio file itself get fetched as the "transcript".
        txt_candidate = os.path.splitext(file_path)[0] + ".txt"
        try:
            txt_path = hf_hub_download(
                repo_id=repo_id,
                filename=txt_candidate,
                repo_type="dataset",
                token=os.getenv("HF_TOKEN"),
            )
            with open(txt_path, "r", encoding="utf-8") as f:
                info = f.read()
        except Exception:
            info = "No metadata found."

    if isinstance(info, dict):
        # ensure_ascii=False keeps non-ASCII text readable in the UI.
        return audio, json.dumps(info, indent=2, ensure_ascii=False)
    return audio, info
# -----------------------------
# NAVIGATION FUNCTIONS
# -----------------------------
def navigate_files(current_file, direction, audio_files):
    """Return the neighbour of *current_file* in *audio_files*, wrapping around.

    ``direction`` is ``"next"`` or ``"prev"``. ``current_file`` is returned
    unchanged when the list is empty, when the file is not in the list, or
    when the direction is anything else.
    """
    if not audio_files:
        return current_file
    if current_file not in audio_files:
        return current_file

    if direction == "next":
        step = 1
    elif direction == "prev":
        step = -1
    else:
        return current_file

    position = audio_files.index(current_file)
    # The modulo makes both ends of the list wrap.
    return audio_files[(position + step) % len(audio_files)]
def update_file_selection(repo_id, current_file, direction, audio_files_state, autoplay):
    """Step to the previous/next file and load its audio and metadata.

    Returns ``(new_file, audio, metadata_text)``. With an empty file list
    the selection is left as is and the text explains why. ``autoplay`` is
    accepted for the callers' convenience but not used here.
    """
    if audio_files_state:
        target = navigate_files(current_file, direction, audio_files_state)
        clip, details = load_audio_and_metadata(repo_id, target)
        return target, clip, details
    return current_file, None, "No audio files available."
# -----------------------------
# UI
# -----------------------------
# NOTE(review): the original indentation was lost, so which components sit
# inside each gr.Row() is reconstructed from statement order — confirm layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🎧 List, Play & View Metadata for HF Dataset Audio")
    repo_input = gr.Textbox(label="Dataset repo_id", value="username/dataset_name")
    # Holds the audio file list so the prev/next buttons can step through it.
    audio_files_state = gr.State([])

    with gr.Row():
        file_list = gr.Dropdown(label="Audio files", choices=[], interactive=True)
    with gr.Row():
        prev_btn = gr.Button("◀ Previous")
        next_btn = gr.Button("Next ▶")
        autoplay_toggle = gr.Checkbox(label="Autoplay", value=True)
    with gr.Row():
        play_audio = gr.Audio(label="Audio Player", autoplay=True)
        metadata_box = gr.Textbox(label="Metadata / Text", lines=10)
    output = gr.Textbox(label="All Files", lines=20)
    btn = gr.Button("List files")

    def update_files(repo_id):
        """Refresh the dropdown, the full listing and the stored file list."""
        audio_files, all_files_text = list_dataset_files(repo_id)
        return (
            gr.Dropdown(choices=audio_files, value=audio_files[0] if audio_files else None),
            all_files_text,
            audio_files,  # new value for audio_files_state
        )

    btn.click(
        update_files,
        inputs=repo_input,
        outputs=[file_list, output, audio_files_state],
    )

    def dropdown_change_handler(repo_id, file_path, autoplay):
        """Load the selected file; autoplay follows the checkbox."""
        audio, metadata = load_audio_and_metadata(repo_id, file_path)
        return gr.Audio(value=audio, autoplay=autoplay), metadata

    file_list.change(
        dropdown_change_handler,
        inputs=[repo_input, file_list, autoplay_toggle],
        outputs=[play_audio, metadata_box],
    )

    def _make_nav_handler(direction):
        """Build the click handler for one navigation direction."""

        def handler(repo_id, current_file, audio_files, autoplay):
            target = navigate_files(current_file, direction, audio_files)
            if audio_files and target != current_file:
                # Changing the dropdown value fires file_list.change, which
                # loads audio and metadata. Loading here as well would fetch
                # and render the same file twice per click.
                return target, gr.update(), gr.update()
            # Selection did not move (empty list, unknown or single file):
            # keep the original behaviour, including its status messages.
            new_file, audio, metadata = update_file_selection(
                repo_id, current_file, direction, audio_files, autoplay
            )
            return new_file, gr.Audio(value=audio, autoplay=autoplay), metadata

        return handler

    prev_button_handler = _make_nav_handler("prev")
    next_button_handler = _make_nav_handler("next")

    prev_btn.click(
        prev_button_handler,
        inputs=[repo_input, file_list, audio_files_state, autoplay_toggle],
        outputs=[file_list, play_audio, metadata_box],
    )
    next_btn.click(
        next_button_handler,
        inputs=[repo_input, file_list, audio_files_state, autoplay_toggle],
        outputs=[file_list, play_audio, metadata_box],
    )

    def autoplay_changed(autoplay, current_audio):
        """Re-emit the current audio with the new autoplay setting."""
        return gr.Audio(value=current_audio, autoplay=autoplay)

    autoplay_toggle.change(
        autoplay_changed,
        inputs=[autoplay_toggle, play_audio],
        outputs=[play_audio],
    )

demo.launch()