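# Hugging Face Hub page header captured together with this file (not part of the program):
#   human_eval_app / app.py · mertunsall · Fix · ddc5859 · raw · history blame · 19.5 kB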
from functools import lru_cache
import re
import traceback
from typing import Optional
import gradio as gr
from huggingface_hub import HfApi, hf_hub_url
from huggingface_hub.utils import HfHubHTTPError
# Dataset repository pre-filled in the repo-ID textbox.
DEFAULT_REPO_ID = "mlfoundations-cua-dev/human_eval"

# Filename suffixes (compared against the lower-cased path) treated as images.
IMAGE_EXTENSIONS = (
    ".jpg", ".jpeg", ".png", ".bmp",
    ".gif", ".webp", ".tif", ".tiff",
)

# Filename prefixes that are ordered ahead of every other image.
# NOTE(review): "intial_screenshot" looks like a misspelling that is kept so
# files already stored under that name still match -- confirm before removing.
INIT_SCREENSHOT_NAMES = {"intial_screenshot", "initial_screenshot"}

# "step_<digits>" with at most one trailing ".<ext>", e.g. "step_12.png".
STEP_FILENAME_PATTERN = re.compile(r"^step_(\d+)(?:\.[^.]+)?$", re.IGNORECASE)

# Hub client shared by every repository listing call in this module.
api = HfApi()
@lru_cache(maxsize=16)
def _list_repo_files(repo_id: str) -> list[str]:
    """List every file path stored in the dataset repository ``repo_id``.

    The listing is memoised per repository ID (at most 16 entries), so
    repeated calls with the same ID reuse the first result instead of
    querying the Hub again.
    """
    listing = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    return listing
def _extract_top_level(repo_id: str) -> tuple[list[str], list[str]]:
    """Separate the repository root into folder names and loose files.

    Returns:
        ``(folders, files)`` -- both sorted. ``folders`` holds the distinct
        first path component of every nested path; ``files`` holds the paths
        that contain no ``/`` at all.
    """
    folder_names: set[str] = set()
    root_files: list[str] = []
    for entry in _list_repo_files(repo_id):
        head, slash, _ = entry.partition("/")
        if slash:
            # The path is nested, so its first component is a root folder.
            folder_names.add(head)
        else:
            root_files.append(entry)
    root_files.sort()
    return sorted(folder_names), root_files
def _get_subdirectories(repo_id: str, directory: str) -> list[str]:
    """List the immediate child folders of ``directory``, sorted by name.

    An empty ``directory`` yields an empty list without consulting the
    repository listing.
    """
    if not directory:
        return []

    prefix = f"{directory}/"
    children: set[str] = set()
    for path in _list_repo_files(repo_id):
        if not path.startswith(prefix):
            continue
        # Only entries with something below their first component are
        # folders; a bare filename directly inside ``directory`` is skipped.
        child, slash, _ = path[len(prefix):].partition("/")
        if slash:
            children.add(child)
    return sorted(children)
def _build_path(*parts) -> str:
"""Join path parts while skipping empty values."""
return "/".join(part for part in parts if part)
def _get_image_urls(repo_id: str, directory: str) -> list[str]:
    """Build Hub URLs for the image files that sit directly in ``directory``.

    Files in nested sub-folders are ignored. The result is ordered by
    ``_image_sort_key``. An empty ``directory`` yields an empty list.
    """
    if not directory:
        return []

    prefix = f"{directory}/"
    direct_images: list[str] = []
    for path in _list_repo_files(repo_id):
        if not path.startswith(prefix):
            continue
        if "/" in path[len(prefix):]:
            # Lives in a deeper folder, not directly in ``directory``.
            continue
        if path.lower().endswith(IMAGE_EXTENSIONS):
            direct_images.append(path)

    direct_images.sort(key=_image_sort_key)
    return [
        hf_hub_url(repo_id=repo_id, filename=path, repo_type="dataset")
        for path in direct_images
    ]
def _image_sort_key(path: str) -> tuple:
    """Return the ordering key for an image path.

    Keys fall into three groups, compared by their first element:
    ``(0, 0)`` for names starting with one of ``INIT_SCREENSHOT_NAMES``,
    ``(1, <step number>)`` for names matching ``STEP_FILENAME_PATTERN``,
    and ``(2, <lower-cased filename>)`` for everything else.
    """
    name = path.rpartition("/")[2].lower()
    if name.startswith(tuple(INIT_SCREENSHOT_NAMES)):
        return (0, 0)

    step = STEP_FILENAME_PATTERN.match(name)
    if step is None:
        return (2, name)
    # Compare step numbers numerically so step_10 sorts after step_2.
    return (1, int(step.group(1)))
def _dropdown_update(
    *,
    choices: list[str],
    value: Optional[str],
    label: str,
    filled_info: str,
    empty_info: str,
):
    """Build a ``gr.update`` payload for one folder dropdown.

    With at least one choice the dropdown is interactive, keeps ``value``
    and shows ``filled_info``. With no choices it is disabled, its value is
    cleared and ``empty_info`` is shown instead.
    """
    if choices:
        return gr.update(
            choices=choices,
            value=value,
            interactive=True,
            label=label,
            info=filled_info,
        )
    return gr.update(
        choices=choices,
        value=None,
        interactive=False,
        label=label,
        info=empty_info,
    )
def _refresh_failure_outputs(message: str):
    """Build the six outputs shown when a repository cannot be loaded."""
    cleared_dropdowns = tuple(
        gr.update(choices=[], value=None, interactive=False) for _ in range(4)
    )
    return (*cleared_dropdowns, gr.update(value=[]), gr.update(value=message))


def refresh_repo(repo_id: str):
    """Load ``repo_id`` and pre-select the first folder at every level.

    Returns a 6-tuple of Gradio updates: the four folder dropdowns, the
    image gallery, and the status markdown. When listing fails, the
    dropdowns are cleared and disabled, the gallery is emptied, and the
    status carries the error message.
    """
    # The listing is memoised by ``_list_repo_files``; drop it so an explicit
    # (re)load reflects the current state of the repository on the Hub
    # instead of the first listing fetched by this process.
    _list_repo_files.cache_clear()

    try:
        top_dirs, top_files = _extract_top_level(repo_id)
    except HfHubHTTPError as error:
        print(f"[refresh_repo] Hub HTTP error for {repo_id}: {error}", flush=True)
        print(traceback.format_exc(), flush=True)
        return _refresh_failure_outputs(
            f"❌ Unable to load repo `{repo_id}`: {error}"
        )
    except Exception as error:  # pragma: no cover - network and auth edge cases
        print(f"[refresh_repo] Unexpected error for {repo_id}: {error}", flush=True)
        print(traceback.format_exc(), flush=True)
        return _refresh_failure_outputs(
            f"❌ Unexpected error loading `{repo_id}`: {error}"
        )

    status_lines = [
        f"✅ Loaded `{repo_id}`",
        f"• Top-level folders: {len(top_dirs)}",
    ]
    if top_files:
        status_lines.append(f"• Loose files at root: {len(top_files)}")
    if not top_dirs:
        status_lines.append("• No sub-folders found at root.")

    # Walk down the tree, always selecting the first folder; a level with no
    # folders leaves every deeper level (and the gallery) empty.
    top_value = top_dirs[0] if top_dirs else None
    second_dirs = _get_subdirectories(repo_id, top_value) if top_value else []
    second_value = second_dirs[0] if second_dirs else None
    third_dirs = (
        _get_subdirectories(repo_id, _build_path(top_value, second_value))
        if second_value
        else []
    )
    third_value = third_dirs[0] if third_dirs else None
    fourth_dirs = (
        _get_subdirectories(repo_id, _build_path(top_value, second_value, third_value))
        if third_value
        else []
    )
    fourth_value = fourth_dirs[0] if fourth_dirs else None
    # Images are only looked up once a fourth-level folder is selected.
    image_urls = (
        _get_image_urls(
            repo_id, _build_path(top_value, second_value, third_value, fourth_value)
        )
        if fourth_value
        else []
    )

    first_dropdown_update = _dropdown_update(
        choices=top_dirs,
        value=top_value,
        label="Top-level folders",
        filled_info="Choose a folder to explore",
        empty_info="No folders found at the repository root",
    )
    second_dropdown_update = _dropdown_update(
        choices=second_dirs,
        value=second_value,
        label="Second-level folders",
        filled_info="Choose a second-level folder",
        empty_info="No subdirectories under the selected folder",
    )
    third_dropdown_update = _dropdown_update(
        choices=third_dirs,
        value=third_value,
        label="Third-level folders",
        filled_info="Choose a third-level folder",
        empty_info="No third-level folders under the selection",
    )
    fourth_dropdown_update = _dropdown_update(
        choices=fourth_dirs,
        value=fourth_value,
        label="Fourth-level folders",
        filled_info="Choose a fourth-level folder",
        empty_info="No fourth-level folders under the selection",
    )
    gallery_update = gr.update(value=image_urls)

    return (
        first_dropdown_update,
        second_dropdown_update,
        third_dropdown_update,
        fourth_dropdown_update,
        gallery_update,
        gr.update(value="\n".join(status_lines)),
    )
def update_second_dropdown(repo_id: str, top_level_dir: str):
    """Recompute everything below the top-level dropdown after it changes.

    Returns a 4-tuple of Gradio updates: the second-, third- and
    fourth-level dropdowns followed by the image gallery. The first folder
    found at each level is pre-selected. Any exception is logged and
    answered with cleared dropdowns and an empty gallery.
    """
    # (label, filled_info) for the three dropdowns this handler controls.
    captions = (
        ("Second-level folders", "Choose a second-level folder"),
        ("Third-level folders", "Choose a third-level folder"),
        ("Fourth-level folders", "Choose a fourth-level folder"),
    )

    def render(options, picks, empty_infos, images):
        dropdowns = [
            _dropdown_update(
                choices=opts,
                value=pick,
                label=label,
                filled_info=filled,
                empty_info=empty,
            )
            for (label, filled), opts, pick, empty in zip(
                captions, options, picks, empty_infos
            )
        ]
        return (*dropdowns, gr.update(value=images))

    try:
        if not top_level_dir:
            return render(
                ([], [], []),
                (None, None, None),
                (
                    "Select a top-level folder first",
                    "Select a higher-level folder first",
                    "Select a higher-level folder first",
                ),
                [],
            )

        # ``trail`` holds the selected folder at each depth; a level is only
        # listed when the level above it produced a selection.
        trail = [top_level_dir]
        options, picks = [], []
        for _ in captions:
            found = (
                _get_subdirectories(repo_id, _build_path(*trail))
                if trail[-1]
                else []
            )
            pick = found[0] if found else None
            options.append(found)
            picks.append(pick)
            trail.append(pick)

        # Images are only looked up once a fourth-level folder is selected.
        images = _get_image_urls(repo_id, _build_path(*trail)) if trail[-1] else []

        return render(
            options,
            picks,
            (
                "No subdirectories under the selected folder",
                "No third-level folders under the selection",
                "No fourth-level folders under the selection",
            ),
            images,
        )
    except Exception as error:
        print(f"[update_second_dropdown] Error for {repo_id}/{top_level_dir}: {error}", flush=True)
        print(traceback.format_exc(), flush=True)
        return render(
            ([], [], []),
            (None, None, None),
            (
                "Unable to load subdirectories",
                "Unable to load subdirectories",
                "Unable to load subdirectories",
            ),
            [],
        )
def update_third_dropdown(repo_id: str, top_level_dir: str, second_level_dir: str):
    """Recompute the third/fourth dropdowns and gallery after a second-level change.

    Returns a 3-tuple of Gradio updates: third-level dropdown, fourth-level
    dropdown, image gallery. Any exception is logged and answered with
    cleared dropdowns and an empty gallery.
    """

    def third_update(options, pick, empty_info):
        return _dropdown_update(
            choices=options,
            value=pick,
            label="Third-level folders",
            filled_info="Choose a third-level folder",
            empty_info=empty_info,
        )

    def fourth_update(options, pick, empty_info):
        return _dropdown_update(
            choices=options,
            value=pick,
            label="Fourth-level folders",
            filled_info="Choose a fourth-level folder",
            empty_info=empty_info,
        )

    def cleared(reason):
        return (
            third_update([], None, reason),
            fourth_update([], None, reason),
            gr.update(value=[]),
        )

    try:
        if not (top_level_dir and second_level_dir):
            return cleared("Select higher-level folders first")

        third_dirs = _get_subdirectories(
            repo_id, _build_path(top_level_dir, second_level_dir)
        )
        third_value = third_dirs[0] if third_dirs else None

        fourth_dirs = []
        if third_value:
            fourth_dirs = _get_subdirectories(
                repo_id, _build_path(top_level_dir, second_level_dir, third_value)
            )
        fourth_value = fourth_dirs[0] if fourth_dirs else None

        # Images are only looked up once a fourth-level folder is selected.
        image_urls = []
        if fourth_value:
            image_urls = _get_image_urls(
                repo_id,
                _build_path(top_level_dir, second_level_dir, third_value, fourth_value),
            )

        return (
            third_update(
                third_dirs, third_value, "No third-level folders under the selection"
            ),
            fourth_update(
                fourth_dirs, fourth_value, "No fourth-level folders under the selection"
            ),
            gr.update(value=image_urls),
        )
    except Exception as error:
        print(
            f"[update_third_dropdown] Error for {repo_id}/{top_level_dir}/{second_level_dir}: {error}",
            flush=True,
        )
        print(traceback.format_exc(), flush=True)
        return cleared("Unable to load subdirectories")
def update_fourth_dropdown(
    repo_id: str,
    top_level_dir: str,
    second_level_dir: str,
    third_level_dir: str,
):
    """Recompute the fourth-level dropdown and gallery after a third-level change.

    Returns a 2-tuple of Gradio updates: fourth-level dropdown, image
    gallery. Any exception is logged and answered with a cleared dropdown
    and an empty gallery.
    """

    def fourth_update(options, pick, empty_info):
        return _dropdown_update(
            choices=options,
            value=pick,
            label="Fourth-level folders",
            filled_info="Choose a fourth-level folder",
            empty_info=empty_info,
        )

    try:
        if not (top_level_dir and second_level_dir and third_level_dir):
            return (
                fourth_update([], None, "Select higher-level folders first"),
                gr.update(value=[]),
            )

        parent = _build_path(top_level_dir, second_level_dir, third_level_dir)
        fourth_dirs = _get_subdirectories(repo_id, parent)
        fourth_value = fourth_dirs[0] if fourth_dirs else None

        # Images are only looked up once a fourth-level folder is selected.
        image_urls = []
        if fourth_value:
            image_urls = _get_image_urls(
                repo_id,
                _build_path(
                    top_level_dir,
                    second_level_dir,
                    third_level_dir,
                    fourth_value,
                ),
            )

        return (
            fourth_update(
                fourth_dirs, fourth_value, "No fourth-level folders under the selection"
            ),
            gr.update(value=image_urls),
        )
    except Exception as error:
        print(
            "[update_fourth_dropdown] Error for "
            f"{repo_id}/{top_level_dir}/{second_level_dir}/{third_level_dir}: {error}",
            flush=True,
        )
        print(traceback.format_exc(), flush=True)
        return (
            fourth_update([], None, "Unable to load subdirectories"),
            gr.update(value=[]),
        )
def update_gallery(
    repo_id: str,
    top_level_dir: str,
    second_level_dir: str,
    third_level_dir: str,
    fourth_level_dir: str,
):
    """Show the images of the selected fourth-level folder.

    Returns a single gallery update. The gallery is emptied when any of the
    four folder levels is unselected, or when loading the images raises
    (the error is logged).
    """
    selection = (top_level_dir, second_level_dir, third_level_dir, fourth_level_dir)
    try:
        if all(selection):
            image_urls = _get_image_urls(repo_id, _build_path(*selection))
            return gr.update(value=image_urls)
        return gr.update(value=[])
    except Exception as error:
        print(
            "[update_gallery] Error for "
            f"{repo_id}/{top_level_dir}/{second_level_dir}/{third_level_dir}/{fourth_level_dir}: {error}",
            flush=True,
        )
        print(traceback.format_exc(), flush=True)
        return gr.update(value=[])
# UI layout and event wiring for the dataset explorer.
with gr.Blocks(title="HF Dataset Explorer") as demo:
    gr.Markdown(
        """# Hugging Face Dataset Explorer
Provide a dataset repository ID (e.g. `org/dataset`) to list its top-level folders."""
    )

    # NOTE(review): the nesting below was reconstructed; it assumes the row
    # holds only the repo-ID textbox and the load button -- confirm.
    with gr.Row():
        repo_id_input = gr.Textbox(
            label="Dataset repo ID",
            value=DEFAULT_REPO_ID,
            placeholder="owner/dataset",
            info="Any public dataset on the Hugging Face Hub",
        )
        reload_button = gr.Button("Load repo", variant="primary")

    status_display = gr.Markdown()
    folder_dropdown = gr.Dropdown(label="Top-level folders", interactive=False)
    second_level_dropdown = gr.Dropdown(label="Second-level folders", interactive=False)
    third_level_dropdown = gr.Dropdown(label="Third-level folders", interactive=False)
    fourth_level_dropdown = gr.Dropdown(label="Fourth-level folders", interactive=False)
    image_gallery = gr.Gallery(label="Images", columns=4)

    # Dropdowns ordered from the repository root downwards.
    selection_chain = [
        folder_dropdown,
        second_level_dropdown,
        third_level_dropdown,
        fourth_level_dropdown,
    ]
    # Everything ``refresh_repo`` writes to, in the order of its return tuple.
    refresh_targets = (*selection_chain, image_gallery, status_display)

    reload_button.click(
        refresh_repo,
        inputs=repo_id_input,
        outputs=list(refresh_targets),
    )

    # A change at depth N reads the repo ID plus the selections down to N and
    # rewrites every dropdown below N together with the gallery.
    cascade_handlers = (
        update_second_dropdown,
        update_third_dropdown,
        update_fourth_dropdown,
        update_gallery,
    )
    for depth, handler in enumerate(cascade_handlers, start=1):
        selection_chain[depth - 1].change(
            handler,
            inputs=[repo_id_input, *selection_chain[:depth]],
            outputs=[*selection_chain[depth:], image_gallery],
        )

    # Populate the page with the default repository when it first loads.
    demo.load(
        refresh_repo,
        inputs=repo_id_input,
        outputs=list(refresh_targets),
    )

if __name__ == "__main__":
    demo.launch()