from functools import lru_cache
import traceback

import gradio as gr
from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError

DEFAULT_REPO_ID = "mlfoundations-cua-dev/human_eval"

api = HfApi()


@lru_cache(maxsize=16)
def _list_repo_files(repo_id: str) -> list[str]:
    """Return all file paths contained in a Hugging Face dataset repository.

    The listing is memoised per ``repo_id`` (at most 16 repositories), so
    repeated calls for the same repository reuse the first successful result
    instead of querying the Hub again. Calls that raise are not cached.

    NOTE: the cached list object is shared between callers; treat it as
    read-only. Because of the cache, files added to a repository after its
    first listing are not seen until the entry is evicted or the process
    restarts.
    """
    return api.list_repo_files(repo_id=repo_id, repo_type="dataset")


def _extract_top_level(repo_id: str) -> tuple[list[str], list[str]]:
    """Split the repository listing into top-level folders and root files.

    Returns:
        A ``(folders, files)`` pair, each sorted. ``folders`` holds the
        distinct first path segment of every path that contains a ``/``;
        ``files`` holds the paths that contain no ``/`` at all.
    """
    files = _list_repo_files(repo_id)
    top_level_dirs = sorted({path.split("/", 1)[0] for path in files if "/" in path})
    top_level_files = sorted(path for path in files if "/" not in path)
    return top_level_dirs, top_level_files


def _summarize_directory(repo_id: str, directory: str) -> dict:
    """Return a lightweight summary of the contents of a top-level directory.

    Args:
        repo_id: Dataset repository to inspect.
        directory: Folder name to summarise; a falsy value yields ``{}``.

    Returns:
        A dict with the folder name, the number of files anywhere below it,
        the sorted names of its immediate sub-folders, and up to ten of its
        immediate files (followed by ``"..."`` when more exist).
    """
    if not directory:
        return {}

    prefix = f"{directory}/"
    files = [path for path in _list_repo_files(repo_id) if path.startswith(prefix)]
    relative_paths = [path[len(prefix):] for path in files]

    child_dirs = sorted({rel.split("/", 1)[0] for rel in relative_paths if "/" in rel})
    child_files = sorted(rel for rel in relative_paths if rel and "/" not in rel)

    # Only a small sample of direct files is shown to keep the JSON view short.
    sample_files = child_files[:10]
    has_more_files = len(child_files) > len(sample_files)

    return {
        "folder": directory,
        "total_files": len(files),
        "direct_subdirectories": child_dirs,
        "sample_files": sample_files + (["..."] if has_more_files else []),
    }


def _failed_refresh(message: str) -> tuple:
    """Build the output triple used whenever a repository cannot be shown.

    The dropdown is emptied and disabled, the status text is set to
    ``message``, and the folder summary is cleared.
    """
    return (
        gr.update(choices=[], value=None, interactive=False),
        gr.update(value=message),
        {},
    )


def refresh_repo(repo_id: str) -> tuple:
    """Load a repository and produce updates for the dropdown, status and summary.

    Args:
        repo_id: Dataset repository ID as typed by the user. Surrounding
            whitespace is ignored; an empty value is reported to the user
            without contacting the Hub.

    Returns:
        A 3-tuple ``(dropdown_update, status_update, folder_summary)`` matching
        the ``outputs`` wired to this callback. On failure the dropdown is
        disabled, the status carries the error text, and the summary is ``{}``.
    """
    # Pasted IDs often carry stray spaces or newlines; normalising here also
    # keeps the cache key stable between this callback and update_directory.
    repo_id = (repo_id or "").strip()
    if not repo_id:
        return _failed_refresh("❌ Please enter a dataset repo ID (e.g. `owner/dataset`).")

    try:
        top_dirs, top_files = _extract_top_level(repo_id)
    except HfHubHTTPError as error:
        print(f"[refresh_repo] Hub HTTP error for {repo_id}: {error}", flush=True)
        print(traceback.format_exc(), flush=True)
        return _failed_refresh(f"❌ Unable to load repo `{repo_id}`: {error}")
    except Exception as error:  # pragma: no cover - network and auth edge cases
        # Top-level UI boundary: log the traceback and show the error instead
        # of letting the callback crash.
        print(f"[refresh_repo] Unexpected error for {repo_id}: {error}", flush=True)
        print(traceback.format_exc(), flush=True)
        return _failed_refresh(f"❌ Unexpected error loading `{repo_id}`: {error}")

    status_lines = [
        f"✅ Loaded `{repo_id}`",
        f"• Top-level folders: {len(top_dirs)}",
    ]
    if top_files:
        status_lines.append(f"• Loose files at root: {len(top_files)}")
    if not top_dirs:
        status_lines.append("• No sub-folders found at root.")

    # Pre-select the first folder so the summary panel is populated right away.
    dropdown_value = top_dirs[0] if top_dirs else None
    dropdown_update = gr.update(
        choices=top_dirs,
        value=dropdown_value,
        interactive=bool(top_dirs),
        label="Top-level folders",
        info="Choose a folder to explore",
    )

    folder_summary = _summarize_directory(repo_id, dropdown_value) if dropdown_value else {}

    return dropdown_update, gr.update(value="\n".join(status_lines)), folder_summary


def update_directory(repo_id: str, directory: str) -> dict:
    """Return the summary for ``directory``, or ``{}`` if it cannot be built.

    Any exception is logged with its traceback and swallowed so that a failed
    lookup clears the summary panel instead of breaking the UI callback.
    """
    # Same normalisation as refresh_repo so both hit the same cache entry.
    repo_id = (repo_id or "").strip()
    try:
        return _summarize_directory(repo_id, directory)
    except Exception as error:
        print(f"[update_directory] Error for {repo_id}/{directory}: {error}", flush=True)
        print(traceback.format_exc(), flush=True)
        return {}


with gr.Blocks(title="HF Dataset Explorer") as demo:
    # The heading must end with a newline; otherwise the whole sentence is
    # rendered as part of the level-1 heading.
    gr.Markdown(
        "# Hugging Face Dataset Explorer\n"
        "Provide a dataset repository ID (e.g. `org/dataset`) "
        "to list its top-level folders."
    )

    with gr.Row():
        repo_id_input = gr.Textbox(
            value=DEFAULT_REPO_ID,
            label="Dataset repo ID",
            placeholder="owner/dataset",
            info="Any public dataset on the Hugging Face Hub",
        )
        reload_button = gr.Button("Load repo", variant="primary")

    status_display = gr.Markdown()
    folder_dropdown = gr.Dropdown(label="Top-level folders", interactive=False)
    folder_details = gr.JSON(label="Folder summary")

    # Explicit reload from the button.
    reload_button.click(
        refresh_repo,
        inputs=repo_id_input,
        outputs=[folder_dropdown, status_display, folder_details],
    )

    # Selecting a different folder only refreshes the summary panel.
    folder_dropdown.change(
        update_directory,
        inputs=[repo_id_input, folder_dropdown],
        outputs=folder_details,
    )

    # Populate the UI for the default repository when the page first loads.
    demo.load(
        refresh_repo,
        inputs=repo_id_input,
        outputs=[folder_dropdown, status_display, folder_details],
    )


if __name__ == "__main__":
    demo.launch()