"""Post-merge actions: update manifest.json and dataset card on PR merge."""

import io
import json
import logging
import os
from datetime import datetime, timezone

from huggingface_hub import HfApi, hf_hub_download, snapshot_download
from huggingface_hub.utils import EntryNotFoundError

from dedup import DATASET_REPO_ID, compute_fingerprint, compute_sha256

logger = logging.getLogger(__name__)

api = HfApi()
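
# manifest.json, as maintained by the functions below, maps repo-relative
# file paths to their content hashes:
# {
#   "files": {
#     "data/<config>/<file>.json": {
#       "sha256": <dedup.compute_sha256 of the raw file bytes>,
#       "fingerprint": <dedup.compute_fingerprint for .json files, else the sha256>,
#       "added_at": <UTC ISO 8601 timestamp of when the entry was written>
#     }
#   }
# }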


def _list_data_dirs(api: HfApi) -> list[str]:
    """List top-level directory names under data/ (these become config/split names)."""
    dirs: list[str] = []
    for entry in api.list_repo_tree(
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        revision="main",
        path_in_repo="data",
    ):
        if not hasattr(entry, "rfilename"):  # it's a directory
            # entry.path is like "data/global-mmlu-lite"
            name = entry.path.split("/", 1)[-1]
            dirs.append(name)
    return sorted(dirs)


def _build_dataset_card(configs: list[str]) -> str:
    """Build a dataset card README.md with YAML frontmatter for the viewer."""
    yaml_configs = []
    for config in configs:
        yaml_configs.append(f"  - config_name: {config}")
        yaml_configs.append(f"    data_files:")
        yaml_configs.append(f"      - split: train")
        yaml_configs.append(f"        path: data/{config}/**/*.json")

    yaml_block = "\n".join(yaml_configs)

    return f"""---
configs:
{yaml_block}
license: mit
---

# EEE Datastore

Evaluation data for the EEE project.
"""


def update_manifest(api: HfApi, merged_files: list[str]) -> None:
    """Download merged files from main, compute hashes, and update manifest.json."""
    # Load existing manifest
    try:
        manifest_path = hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename="manifest.json",
            repo_type="dataset",
            revision="main",
        )
        with open(manifest_path, "r") as f:
            manifest = json.load(f)
    except EntryNotFoundError:
        # No manifest.json on main yet; start from an empty manifest.
        manifest = {"files": {}}
    except Exception:
        logger.exception("Failed to load existing manifest; starting fresh")
        manifest = {"files": {}}

    now = datetime.now(timezone.utc).isoformat()

    for file_path in merged_files:
        try:
            local_path = hf_hub_download(
                repo_id=DATASET_REPO_ID,
                filename=file_path,
                repo_type="dataset",
                revision="main",
            )
            with open(local_path, "rb") as f:
                content = f.read()

            sha256 = compute_sha256(content)

            if file_path.endswith(".json"):
                fingerprint = compute_fingerprint(content)
            else:
                fingerprint = sha256

            manifest["files"][file_path] = {
                "sha256": sha256,
                "fingerprint": fingerprint,
                "added_at": now,
            }
            logger.info("Added %s to manifest", file_path)
        except Exception:
            logger.exception("Failed to process %s for manifest", file_path)

    # Upload updated manifest
    manifest_bytes = json.dumps(manifest, indent=2, sort_keys=True).encode()
    api.upload_file(
        path_or_fileobj=io.BytesIO(manifest_bytes),
        path_in_repo="manifest.json",
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        commit_message="Update manifest.json after PR merge",
    )
    logger.info("Uploaded updated manifest.json (%d files)", len(manifest["files"]))


def update_dataset_card(api: HfApi) -> None:
    """Regenerate the dataset card README.md with configs for all data/ subdirs."""
    configs = _list_data_dirs(api)
    if not configs:
        logger.warning("No data directories found, skipping dataset card update")
        return

    card_content = _build_dataset_card(configs)
    api.upload_file(
        path_or_fileobj=io.BytesIO(card_content.encode()),
        path_in_repo="README.md",
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        commit_message="Update dataset card with configs for viewer",
    )
    logger.info("Updated dataset card with %d configs: %s", len(configs), configs)


def full_rebuild() -> dict:
    """Rebuild manifest.json from scratch and regenerate the dataset card.

    Downloads the entire dataset at once via snapshot_download, then walks
    the local directory to compute hashes/fingerprints for all data files.
    """
    logger.info("Starting full rebuild of manifest + dataset card")

    # Download entire dataset in one shot
    local_dir = snapshot_download(
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        revision="main",
    )
    logger.info("Downloaded dataset snapshot to %s", local_dir)

    # Walk local data/ directory to find all data files
    data_root = os.path.join(local_dir, "data")
    now = datetime.now(timezone.utc).isoformat()
    manifest = {"files": {}}

    if not os.path.isdir(data_root):
        logger.warning("No data/ directory found in snapshot")
    else:
        for dirpath, _, filenames in os.walk(data_root):
            for filename in filenames:
                if not (filename.endswith(".json") or filename.endswith(".jsonl")):
                    continue
                local_path = os.path.join(dirpath, filename)
                # Convert to a repo-relative, POSIX-style path ("data/...") so
                # manifest keys match the paths returned by the Hub API.
                repo_path = os.path.relpath(local_path, local_dir).replace(os.sep, "/")
                try:
                    with open(local_path, "rb") as f:
                        content = f.read()

                    sha256 = compute_sha256(content)
                    if filename.endswith(".json"):
                        fingerprint = compute_fingerprint(content)
                    else:
                        fingerprint = sha256

                    manifest["files"][repo_path] = {
                        "sha256": sha256,
                        "fingerprint": fingerprint,
                        "added_at": now,
                    }
                    logger.info("Indexed %s", repo_path)
                except Exception:
                    logger.exception("Failed to index %s", repo_path)

    logger.info("Indexed %d data files total", len(manifest["files"]))

    # Upload fresh manifest
    manifest_bytes = json.dumps(manifest, indent=2, sort_keys=True).encode()
    api.upload_file(
        path_or_fileobj=io.BytesIO(manifest_bytes),
        path_in_repo="manifest.json",
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        commit_message="Full rebuild of manifest.json",
    )
    logger.info("Uploaded rebuilt manifest.json (%d files)", len(manifest["files"]))

    # Regenerate dataset card
    update_dataset_card(api)

    return {
        "status": "ok",
        "action": "full_rebuild",
        "files_indexed": len(manifest["files"]),
    }


def handle_merge(pr_num: int) -> dict:
    """Run all post-merge actions for a PR."""
    logger.info("Handling merge for PR #%d", pr_num)

    # Find which data files were added/changed by this PR
    # After merge, everything is on main, so we list all data files
    # and update manifest for any that aren't already tracked
    try:
        manifest_path = hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename="manifest.json",
            repo_type="dataset",
            revision="main",
        )
        with open(manifest_path, "r") as f:
            manifest = json.load(f)
    except EntryNotFoundError:
        # No manifest.json on main yet; start from an empty manifest.
        manifest = {"files": {}}
    except Exception:
        logger.exception("Failed to load existing manifest; starting fresh")
        manifest = {"files": {}}

    # List all data files on main and find untracked ones
    all_data_files: list[str] = []
    for entry in api.list_repo_tree(
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        revision="main",
        recursive=True,
    ):
        if not hasattr(entry, "rfilename"):
            continue
        path = entry.rfilename
        if path.startswith("data/") and (path.endswith(".json") or path.endswith(".jsonl")):
            all_data_files.append(path)

    untracked = [f for f in all_data_files if f not in manifest.get("files", {})]
    logger.info("Found %d untracked data files after merge", len(untracked))

    if untracked:
        update_manifest(api, untracked)

    update_dataset_card(api)

    return {
        "status": "ok",
        "action": "post_merge",
        "pr": pr_num,
        "files_added_to_manifest": len(untracked),
    }


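# Running this module directly performs a full rebuild. Uploading requires write
# access to DATASET_REPO_ID (for example a token from `huggingface-cli login`
# or the HF_TOKEN environment variable, which HfApi picks up automatically).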
if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
    result = full_rebuild()
    print(json.dumps(result, indent=2))