from datasets import load_dataset
from huggingface_hub import HfApi, DatasetInfo
from typing import Dict, Any


def load_dataset_from_hub(model_repo: str, hf_token: str) -> Dict[str, Any]:
    """
    Load a dataset from the Hugging Face Hub and return it with summary metadata.

    The dataset is loaded with ``datasets.load_dataset`` using the supplied
    token. The Hub is then queried for the repository's commit sha and its
    tags, and the number of rows per split is computed from the loaded object.

    Parameters
    ----------
    model_repo : str
        The id of the *dataset* repository on the Hugging Face Hub
        (despite the parameter name). Example: "username/dataset_name".
    hf_token : str
        Hugging Face access token with permission to read the dataset.
        It is passed to every Hub call made by this function.

    Returns
    -------
    result : dict
        {
            "dataset": the object returned by ``load_dataset``,
            "metadata": {
                "dataset_repo_id": str,      # ``model_repo`` as given
                "dataset_version_tag": str,  # first tag listed, or "no-tag"
                "dataset_sha": str,          # repo commit sha, or "unknown"
                "dataset_size": int,         # total rows across all splits
                "dataset_splits": {split_name: row_count, ...},
            }
        }
        When the loaded object is not dict-like, ``dataset_splits`` holds a
        single ``"default"`` entry.

    Raises
    ------
    ValueError
        If loading the dataset or fetching its repository info fails for any
        reason. The original exception is attached as ``__cause__``.
    """
    try:
        dataset = load_dataset(model_repo, token=hf_token)

        api = HfApi()
        ds_info: DatasetInfo = api.dataset_info(repo_id=model_repo, token=hf_token)
        sha = ds_info.sha or "unknown"

        # Tag lookup is deliberately best-effort: a failure here must not
        # prevent the dataset from being returned, so it degrades to "no-tag".
        # The token is passed so private/gated repos can be queried too.
        # NOTE(review): this takes the first tag the Hub returns; whether that
        # is the most recent tag depends on the Hub's ordering -- confirm.
        try:
            repo_refs = api.list_repo_refs(
                repo_id=model_repo, repo_type="dataset", token=hf_token
            )
            latest_tag = repo_refs.tags[0].name if repo_refs.tags else "no-tag"
        except Exception:
            latest_tag = "no-tag"

        # A dict-like result maps split name -> split; anything else is
        # treated as one unnamed split.
        if isinstance(dataset, dict):
            splits = {name: len(split) for name, split in dataset.items()}
        else:
            splits = {"default": len(dataset)}

        metadata = {
            "dataset_repo_id": model_repo,
            "dataset_version_tag": latest_tag,
            "dataset_sha": sha,
            "dataset_size": sum(splits.values()),
            "dataset_splits": splits,
        }

        return {"dataset": dataset, "metadata": metadata}

    except Exception as e:
        # Chain the cause so the underlying Hub/network error stays visible.
        raise ValueError(
            f"Failed to load dataset '{model_repo}' from Hugging Face Hub: {e}"
        ) from e