# NOTE(review): the three lines that were here ("Spaces:", "Runtime error",
# "Runtime error") are residue from a Hugging Face Spaces error page capture,
# not source code; preserved as this comment so the module stays valid Python.
from typing import Any, Dict

from datasets import load_dataset
from huggingface_hub import DatasetInfo, HfApi
def load_dataset_from_hub(model_repo: str, hf_token: str) -> Dict[str, Any]:
    """
    Load a dataset from the Hugging Face Hub and return it with its metadata.

    Securely loads a dataset using an access token, then fetches repo-level
    metadata from the Hub: revision sha, latest tag (if any), split sizes,
    total size, and the dataset card data.

    Parameters
    ----------
    model_repo : str
        The name or path of the dataset repository on the Hugging Face Hub,
        e.g. "username/dataset_name".
    hf_token : str
        A Hugging Face access token with read permission for the dataset.

    Returns
    -------
    dict
        {
            "dataset": datasets.DatasetDict or datasets.Dataset,
            "metadata": {
                "dataset_repo_id": str,
                "dataset_version_tag": str,   # latest tag name, or "no-tag"
                "dataset_size": int,          # total rows across all splits
                "dataset_splits": dict,       # {"train": int, "test": int, ...}
                "dataset_sha": str,           # revision sha, or "unknown"
                "dataset_card_data": dict,    # dataset card data, or {}
            }
        }

    Raises
    ------
    ValueError
        If the dataset cannot be found or loaded; the original exception is
        attached as the cause (``__cause__``).
    """
    try:
        # Load the dataset; the token grants access to private/gated repos.
        dataset = load_dataset(model_repo, token=hf_token)

        # Fetch repo-level metadata (revision sha, dataset card) from the Hub.
        api = HfApi()
        ds_info: DatasetInfo = api.dataset_info(repo_id=model_repo, token=hf_token)
        sha = ds_info.sha or "unknown"
        card_data = ds_info.card_data or {}

        # Best-effort lookup of the most recent tag; many repos have none,
        # and a failed refs call should not abort the whole load.
        latest_tag = "no-tag"
        try:
            repo_refs = api.list_repo_refs(repo_id=model_repo, repo_type="dataset")
            if repo_refs.tags:
                latest_tag = repo_refs.tags[0].name  # e.g. "v1.0", "stable"
        except Exception:
            latest_tag = "no-tag"

        # A DatasetDict is a dict subclass keyed by split name; a bare
        # Dataset (single split requested) is reported under "default".
        if isinstance(dataset, dict):
            splits = {name: len(split) for name, split in dataset.items()}
        else:
            splits = {"default": len(dataset)}
        size = sum(splits.values())

        metadata = {
            "dataset_repo_id": model_repo,
            "dataset_version_tag": latest_tag,
            "dataset_size": size,
            "dataset_splits": splits,
            # Previously fetched but silently dropped; now exposed as the
            # docstring promises.
            "dataset_sha": sha,
            "dataset_card_data": card_data,
        }
        return {"dataset": dataset, "metadata": metadata}
    except Exception as e:
        # Chain the original exception so callers can inspect the real cause.
        raise ValueError(
            f"Failed to load dataset '{model_repo}' from Hugging Face Hub: {e}"
        ) from e