Spaces:

piekenius123
/

Amaze-Visualization

Sleeping

App Files Files Community

piekenius123 committed 17 days ago

Commit

eaf430a

verified ·

1 Parent(s): 553a2bb

Create app.py

Browse files

Files changed (1) hide show

app.py +438 -0

app.py ADDED Viewed

	@@ -0,0 +1,438 @@

+# app.py
+import io
+import json
+import base64
+import random
+from typing import Optional, Dict, Any, List, Tuple
+import pandas as pd
+from PIL import Image
+import gradio as gr
+from huggingface_hub import HfApi, hf_hub_download
+# =========================
+# Hugging Face Dataset Repo
+# =========================
+# Repo id on the Hugging Face Hub that this app lists and downloads from.
+DATASET_REPO_ID = "piekenius123/Amaze"  # your dataset
+# Passed as repo_type to both list_repo_files and hf_hub_download.
+REPO_TYPE = "dataset"
+# Directory names looked for in repo paths; also the choices of the "Shape" dropdown.
+SHAPES = ["circle", "hexagon", "square", "triangle"]
+# Choices of the "Split" dropdown; the values infer_split_from_repo_path can return.
+SPLITS = ["train", "val", "test"]
+# Row columns that are decoded as base64 images and summarised in the field table.
+IMAGE_COLS = ["original_img", "m_original_img", "sol_img", "mask_img", "cell_map"]
+# =========================
+# Helpers
+# =========================
+def infer_shape_from_repo_path(path: str) -> Optional[str]:
+    """Return the first entry of SHAPES that names a directory in ``path``.
+
+    Backslashes are normalised to ``/`` and the path is lower-cased before
+    matching. Returns ``None`` when no shape directory is found.
+    """
+    # Prefixing "/" lets a single substring test cover both "shape/..." at
+    # the very start of the path and ".../shape/..." further in.
+    anchored = "/" + path.replace("\\", "/").lower()
+    return next((shape for shape in SHAPES if f"/{shape}/" in anchored), None)
+def infer_split_from_repo_path(path: str) -> Optional[str]:
+    """Infer the dataset split ("train", "val" or "test") from a repo path.
+
+    Rules:
+    - file name ``maze_dataset_train.parquet``            => "train"
+    - file name ``maze_dataset_test.parquet`` located
+        * under a ``maze-dataset_train`` directory        => "val"
+        * otherwise under a ``maze-dataset`` directory    => "test"
+    Anything else yields ``None``.
+
+    Directories are matched as whole path segments, so a directory at the
+    repo root (e.g. ``maze-dataset/maze_dataset_test.parquet``) is
+    recognised just like a nested one. Backslashes are normalised to ``/``
+    and matching is done on the lower-cased path.
+    """
+    *dirs, filename = path.replace("\\", "/").lower().split("/")
+    if filename == "maze_dataset_train.parquet":
+        return "train"
+    if filename != "maze_dataset_test.parquet":
+        return None
+    # Check the more specific directory name first, as the original rules do.
+    if "maze-dataset_train" in dirs:
+        return "val"
+    if "maze-dataset" in dirs:
+        return "test"
+    return None
+def decode_base64_image(base64_str: Any) -> Optional[Image.Image]:
+    """Decode a base64 string (optionally a ``data:`` URL) into a PIL image.
+
+    Returns ``None`` for anything that is not a string, for blank or
+    "null" strings, and whenever decoding or image loading fails.
+    """
+    if not isinstance(base64_str, str):
+        # Covers None, NaN and every other non-string cell value.
+        return None
+    payload = base64_str.strip()
+    if payload == "" or payload.lower() == "null":
+        return None
+    try:
+        if payload.startswith("data:"):
+            # Keep only what follows the first comma of the data URL.
+            payload = payload.split(",", 1)[1]
+        image = Image.open(io.BytesIO(base64.b64decode(payload)))
+        # Force the pixel data to be read now so a corrupt image fails here.
+        image.load()
+    except Exception:
+        return None
+    return image
+def safe_json_loads(s: Any) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
+    """Parse a JSON string without raising.
+
+    Returns a ``(value, error)`` pair: ``(parsed, None)`` on success,
+    ``(None, None)`` for missing input (None, NaN, blank or "null"), and
+    ``(None, message)`` for a non-string input or a parse failure.
+    """
+    is_missing = s is None or (isinstance(s, float) and pd.isna(s))
+    if is_missing:
+        return None, None
+    if not isinstance(s, str):
+        return None, f"metadata is not a string, got type={type(s)}"
+    text = s.strip()
+    if text.lower() in ("", "null"):
+        return None, None
+    try:
+        parsed = json.loads(text)
+    except Exception as exc:
+        return None, str(exc)
+    return parsed, None
+def summarize_df(df: pd.DataFrame) -> str:
+    """Return a three-line text summary: row count, column count, column names.
+
+    Column labels are converted with ``str`` before joining, so frames
+    whose labels are not strings (e.g. integer labels) are summarised
+    instead of raising ``TypeError``.
+    """
+    labels = [str(col) for col in df.columns]
+    return f"Rows: {len(df)}\nCols: {len(labels)}\nColumns: {', '.join(labels)}"
+def row_to_kv_table(row: pd.Series) -> pd.DataFrame:
+    """Turn one row into a two-column ("field", "value") table.
+
+    Image columns and the ``metadata`` column are replaced by a short
+    placeholder carrying the string length; other string values longer
+    than 500 characters are truncated; everything else is kept as is.
+    """
+    def _display(field, value):
+        size = len(value) if isinstance(value, str) else "NA"
+        if field in IMAGE_COLS:
+            return f"<base64 image str> len={size}"
+        if field == "metadata":
+            return f"<json str> len={size}"
+        if isinstance(value, str) and len(value) > 500:
+            return value[:500] + " ... (truncated)"
+        return value
+    pairs = [(field, _display(field, value)) for field, value in row.items()]
+    return pd.DataFrame(pairs, columns=["field", "value"])
+def _cell_text(value: Any) -> str:
+    """Return ``value`` as display text, mapping None and NaN to ""."""
+    if value is None or (isinstance(value, float) and pd.isna(value)):
+        return ""
+    return value if isinstance(value, str) else str(value)
+def render_sample(df: pd.DataFrame, index: int):
+    """Build the 11 output values that display one row of ``df``.
+
+    ``index`` is converted with ``int`` and clamped to ``[0, len(df) - 1]``.
+
+    Returns a tuple of:
+        (clamped index, status text, instruction text,
+         original_img, m_original_img, sol_img, mask_img, cell_map,
+         parsed metadata, raw metadata text, field/value table)
+
+    The five image entries are PIL images or ``None``. Parsed metadata is
+    ``{"_parse_error": msg}`` when parsing failed and ``{}`` when there is
+    nothing to parse. Missing instruction / metadata cells (None or NaN)
+    are rendered as an empty string rather than the text "None" / "nan".
+    For an empty frame a placeholder tuple with the status
+    "Empty dataframe." is returned.
+    """
+    if len(df) == 0:
+        return (
+            0, "Empty dataframe.", "",
+            None, None, None, None, None,
+            {}, "", pd.DataFrame(columns=["field", "value"])
+        )
+    index = max(0, min(int(index), len(df) - 1))
+    row = df.iloc[index]
+    sample_id = str(row.get("id", f"maze_{index}"))
+    imgs = {col: decode_base64_image(row.get(col, None)) for col in IMAGE_COLS}
+    # Fetch the metadata cell once; it feeds both the parsed and the raw view.
+    raw_metadata = row.get("metadata", None)
+    meta_dict, meta_err = safe_json_loads(raw_metadata)
+    if meta_err:
+        meta_json = {"_parse_error": meta_err}
+    else:
+        meta_json = meta_dict if meta_dict is not None else {}
+    return (
+        index,
+        f"Index: {index} / {len(df)-1} | id: {sample_id}",
+        _cell_text(row.get("instruction", None)),
+        imgs["original_img"],
+        imgs["m_original_img"],
+        imgs["sol_img"],
+        imgs["mask_img"],
+        imgs["cell_map"],
+        meta_json,
+        _cell_text(raw_metadata),
+        row_to_kv_table(row),
+    )
+def find_index_by_id(df: pd.DataFrame, sample_id: str) -> Optional[int]:
+    """Return the positional row index of the first row whose id matches.
+
+    An exact match on the ``id`` column (compared as text) is tried first,
+    then a literal substring match. Rows with a missing id never match.
+    The result is always a 0-based position suitable for ``df.iloc``,
+    whatever index labels ``df`` carries.
+
+    Returns ``None`` when ``df`` has no ``id`` column, when ``sample_id``
+    is empty, or when nothing matches.
+    """
+    if "id" not in df.columns or not sample_id:
+        return None
+    needle = str(sample_id)
+    ids = df["id"]
+    present = ids.notna().to_numpy(dtype=bool)
+    as_text = ids.astype(str)
+    exact = (as_text == needle).to_numpy(dtype=bool) & present
+    if exact.any():
+        # argmax on a boolean array is the position of the first True.
+        return int(exact.argmax())
+    # regex=False: the query is user text (e.g. a UUID fragment), not a pattern.
+    partial = as_text.str.contains(needle, regex=False, na=False).to_numpy(dtype=bool) & present
+    if partial.any():
+        return int(partial.argmax())
+    return None
+# =========================
+# HF repo indexing + caching
+# =========================
+def build_repo_index() -> List[Dict[str, str]]:
+    """Index the parquet files of the dataset repo.
+
+    Lists every file in DATASET_REPO_ID, keeps those ending in ".parquet"
+    for which both a shape and a split can be inferred from the path, and
+    returns one {"repo_path", "shape", "split"} record per file, sorted by
+    repo path.
+    """
+    repo_files = HfApi().list_repo_files(repo_id=DATASET_REPO_ID, repo_type=REPO_TYPE)
+    index: List[Dict[str, str]] = []
+    for repo_path in repo_files:
+        if repo_path.lower().endswith(".parquet"):
+            shape = infer_shape_from_repo_path(repo_path)
+            split = infer_split_from_repo_path(repo_path)
+            # Files whose shape or split cannot be inferred are left out.
+            if shape and split:
+                index.append({"repo_path": repo_path, "shape": shape, "split": split})
+    return sorted(index, key=lambda rec: rec["repo_path"])
+# In-process cache of loaded frames, keyed by the local path hf_hub_download returns.
+_DF_CACHE: Dict[str, pd.DataFrame] = {}
+def download_and_load_df(repo_path: str) -> pd.DataFrame:
+    """Fetch one parquet file from the dataset repo and return it as a DataFrame.
+
+    The file is obtained with ``hf_hub_download``; the resulting frame is
+    memoised in ``_DF_CACHE`` under the returned local path, so each local
+    file is parsed at most once per process. The same DataFrame object is
+    handed back on every call for that path.
+    """
+    local_path = hf_hub_download(
+        filename=repo_path,
+        repo_id=DATASET_REPO_ID,
+        repo_type=REPO_TYPE,
+    )
+    frame = _DF_CACHE.get(local_path)
+    if frame is None:
+        frame = pd.read_parquet(local_path)
+        _DF_CACHE[local_path] = frame
+    return frame
+def get_repo_paths(records: List[Dict[str, str]], shape: str, split: str) -> List[str]:
+    """Return the sorted repo paths of the records matching ``shape`` and ``split``.
+
+    A falsy ``records`` (``None`` or an empty list) yields an empty list.
+    """
+    return sorted(
+        rec["repo_path"]
+        for rec in records or []
+        if rec["shape"] == shape and rec["split"] == split
+    )
+# =========================
+# Gradio callbacks
+# =========================
+def init_app():
+    """Build the repo index when the page loads.
+
+    Returns ``(records, status_text)``. If indexing raises, ``records`` is
+    an empty list and the status text carries the error message.
+    """
+    try:
+        records = build_repo_index()
+    except Exception as exc:
+        return [], f"Failed to index dataset repo: {exc}"
+    return records, f"Dataset: {DATASET_REPO_ID}\nParquet files indexed: {len(records)}"
+def on_shape_split_change(records: List[Dict[str, str]], shape: str, split: str):
+    """Refresh the parquet dropdown for the chosen shape and split.
+
+    Returns a ``gr.Dropdown`` carrying the matching repo paths (with the
+    first one selected, or no selection when nothing matches) and a text
+    reporting how many files matched.
+    """
+    matches = get_repo_paths(records, shape, split)
+    if matches:
+        selected = matches[0]
+    else:
+        selected = None
+    dropdown = gr.Dropdown(choices=matches, value=selected)
+    return dropdown, f"Matched parquet files: {len(matches)}"
+def on_select_parquet(repo_path: str):
+    """Load the selected parquet file and reset the row slider for it.
+
+    Returns ``(summary_text, slider_update, slider_update)``. build_ui
+    routes both the second and the third value to ``idx_slider``, so the
+    same update is returned twice: it sets the slider's ``maximum`` to the
+    last valid row index and its ``value`` to 0. Returning the bare
+    number would only be applied as the slider's value and leave its
+    maximum at the initial 0.
+
+    With no file selected, the summary is "No parquet selected." and the
+    slider is reset to maximum 0 / value 0.
+    """
+    if not repo_path:
+        reset = gr.update(maximum=0, value=0)
+        return "No parquet selected.", reset, reset
+    df = download_and_load_df(repo_path)
+    max_idx = max(0, len(df) - 1)
+    slider_update = gr.update(maximum=max_idx, value=0)
+    return summarize_df(df), slider_update, slider_update
+def on_show(repo_path: str, index: int):
+    """Render row ``index`` of the selected parquet file.
+
+    Returns the 11-value tuple produced by render_sample, or a placeholder
+    tuple with the status "No parquet selected." when ``repo_path`` is empty.
+    """
+    if repo_path:
+        return render_sample(download_and_load_df(repo_path), index)
+    no_images = (None,) * 5
+    return (
+        0, "No parquet selected.", "",
+        *no_images,
+        {}, "", pd.DataFrame(columns=["field", "value"])
+    )
+def on_random(repo_path: str):
+    """Render a uniformly chosen random row of the selected parquet file.
+
+    Falls back to ``on_show(repo_path, 0)`` when no file is selected or
+    the file has no rows.
+    """
+    frame = download_and_load_df(repo_path) if repo_path else None
+    if frame is None or len(frame) == 0:
+        return on_show(repo_path, 0)
+    return render_sample(frame, random.randint(0, len(frame) - 1))
+def on_find_id(repo_path: str, query_id: str):
+    """Render the row whose id matches ``query_id``.
+
+    The query is stripped before searching (a non-string query is treated
+    as empty). When nothing matches, row 0 is rendered and the status text
+    gets a "NOT FOUND" note appended. With no file selected this behaves
+    like ``on_show(repo_path, 0)``.
+    """
+    if not repo_path:
+        return on_show(repo_path, 0)
+    frame = download_and_load_df(repo_path)
+    needle = query_id.strip() if isinstance(query_id, str) else ""
+    position = find_index_by_id(frame, needle)
+    if position is not None:
+        return render_sample(frame, position)
+    # The note quotes the query exactly as the user typed it, not the stripped form.
+    first, status, *rest = render_sample(frame, 0)
+    return (first, status + f" | id search '{query_id}' NOT FOUND", *rest)
+# =========================
+# UI
+# =========================
+def build_ui():
+    """Create the Gradio Blocks app: widgets plus all event wiring.
+
+    Layout, top to bottom: intro text, repo index status, shape/split
+    dropdowns, matched-parquet dropdown, file summary with row slider,
+    Show / Random / Find controls, status and instruction boxes, and
+    three tabs (Images, Metadata, Row fields).
+
+    Returns the ``gr.Blocks`` instance; the caller is expected to launch it.
+    """
+    with gr.Blocks(title="Amaze Parquet Viewer (HF Dataset)") as demo:
+        gr.Markdown(
+            "# Amaze Benchmark Parquet Viewer (HF Space)\n"
+            f"数据来自 Hugging Face Dataset：`{DATASET_REPO_ID}`。\n\n"
+            "选择 **shape / split(train/val/test)** 后，Space 会按需下载对应 parquet 并可视化每条样本。"
+        )
+        # Holds the list of repo records; filled by init_app on page load.
+        records_state = gr.State([])
+        scan_info = gr.Textbox(label="Repo index status", interactive=False)
+        with gr.Row():
+            shape_dd = gr.Dropdown(label="Shape", choices=SHAPES, value=SHAPES[0])
+            split_dd = gr.Dropdown(label="Split", choices=SPLITS, value="test")
+        parquet_tip = gr.Markdown(value="Matched parquet files: 0")
+        # Starts empty; its choices are supplied by on_shape_split_change.
+        parquet_dd = gr.Dropdown(label="Matched parquet files (repo path)", choices=[], value=None, interactive=True)
+        with gr.Row():
+            file_summary = gr.Textbox(label="Selected parquet summary", interactive=False)
+            idx_slider = gr.Slider(label="Row index", minimum=0, maximum=0, value=0, step=1, interactive=True)
+        with gr.Row():
+            show_btn = gr.Button("Show")
+            random_btn = gr.Button("Random")
+            id_query = gr.Textbox(label="Find by id (exact or substring)", placeholder="paste UUID or substring")
+            find_btn = gr.Button("Find")
+        status = gr.Textbox(label="Status", interactive=False)
+        instruction = gr.Textbox(label="Instruction", lines=4, interactive=False)
+        with gr.Tabs():
+            with gr.Tab("Images"):
+                # One image widget per entry of IMAGE_COLS.
+                with gr.Row():
+                    original_img = gr.Image(label="original_img", type="pil")
+                    m_original_img = gr.Image(label="m_original_img", type="pil")
+                with gr.Row():
+                    sol_img = gr.Image(label="sol_img", type="pil")
+                    mask_img = gr.Image(label="mask_img", type="pil")
+                with gr.Row():
+                    cell_map = gr.Image(label="cell_map", type="pil")
+            with gr.Tab("Metadata"):
+                meta_json = gr.JSON(label="metadata (parsed)")
+                meta_raw = gr.Textbox(label="metadata (raw)", lines=8, interactive=False)
+            with gr.Tab("Row fields"):
+                kv_table = gr.Dataframe(
+                    label="All fields (base64 summarized)",
+                    headers=["field", "value"],
+                    wrap=True,
+                    interactive=False,
+                )
+        # Events
+        # Page-load chain: index the repo, fill the parquet dropdown for the
+        # default shape/split, load the selected file, then show its row 0.
+        demo.load(
+            fn=init_app,
+            inputs=None,
+            outputs=[records_state, scan_info],
+        ).then(
+            fn=on_shape_split_change,
+            inputs=[records_state, shape_dd, split_dd],
+            outputs=[parquet_dd, parquet_tip],
+        ).then(
+            # idx_slider appears twice because on_select_parquet returns two
+            # slider-related values after the summary text.
+            fn=lambda p: on_select_parquet(p) if p else ("No parquet selected.", 0, 0),
+            inputs=[parquet_dd],
+            outputs=[file_summary, idx_slider, idx_slider],
+        ).then(
+            # The fallback tuple has the same 11-slot layout as the outputs below.
+            fn=lambda p: on_show(p, 0) if p else (
+                0, "No parquet selected.", "",
+                None, None, None, None, None,
+                {}, "", pd.DataFrame(columns=["field", "value"])
+            ),
+            inputs=[parquet_dd],
+            outputs=[
+                idx_slider, status, instruction,
+                original_img, m_original_img, sol_img, mask_img, cell_map,
+                meta_json, meta_raw, kv_table
+            ],
+        )
+        # Changing shape or split re-filters the parquet dropdown.
+        shape_dd.change(
+            fn=on_shape_split_change,
+            inputs=[records_state, shape_dd, split_dd],
+            outputs=[parquet_dd, parquet_tip],
+        )
+        split_dd.change(
+            fn=on_shape_split_change,
+            inputs=[records_state, shape_dd, split_dd],
+            outputs=[parquet_dd, parquet_tip],
+        )
+        # Selecting a file loads it and updates the summary and the row slider.
+        parquet_dd.change(
+            fn=on_select_parquet,
+            inputs=[parquet_dd],
+            outputs=[file_summary, idx_slider, idx_slider],
+        )
+        # The four handlers below all write the same 11 outputs, in the
+        # order of the tuple returned by render_sample.
+        show_btn.click(
+            fn=on_show,
+            inputs=[parquet_dd, idx_slider],
+            outputs=[
+                idx_slider, status, instruction,
+                original_img, m_original_img, sol_img, mask_img, cell_map,
+                meta_json, meta_raw, kv_table
+            ],
+        )
+        # Wired to the slider's release event rather than to every value change.
+        idx_slider.release(
+            fn=on_show,
+            inputs=[parquet_dd, idx_slider],
+            outputs=[
+                idx_slider, status, instruction,
+                original_img, m_original_img, sol_img, mask_img, cell_map,
+                meta_json, meta_raw, kv_table
+            ],
+        )
+        random_btn.click(
+            fn=on_random,
+            inputs=[parquet_dd],
+            outputs=[
+                idx_slider, status, instruction,
+                original_img, m_original_img, sol_img, mask_img, cell_map,
+                meta_json, meta_raw, kv_table
+            ],
+        )
+        find_btn.click(
+            fn=on_find_id,
+            inputs=[parquet_dd, id_query],
+            outputs=[
+                idx_slider, status, instruction,
+                original_img, m_original_img, sol_img, mask_img, cell_map,
+                meta_json, meta_raw, kv_table
+            ],
+        )
+    return demo
+# Script entry point: build the Blocks app and start the Gradio server.
+if __name__ == "__main__":
+    demo = build_ui()
+    demo.launch()