mr-kush committed on
Commit
f7d6bbd
·
1 Parent(s): 625fb41

Enhance dataset loading function to include metadata retrieval and improve documentation

Browse files
Files changed (1) hide show
  1. load_dataset.py +59 -20
load_dataset.py CHANGED
@@ -1,43 +1,82 @@
1
  from datasets import load_dataset
 
 
2
 
3
def load_dataset_from_hub(model_repo: str, hf_token: str):
    """
    Fetch a dataset from the Hugging Face Hub with temporary authentication.

    The access token is handed directly to ``load_dataset`` for the single
    call, so no persistent ``huggingface-cli login`` session is required.

    Parameters
    ----------
    model_repo : str
        The name or path of the dataset repository on the Hugging Face Hub.
        Example: "username/dataset_name" or "glue".
    hf_token : str
        Your Hugging Face access token with permission to read the dataset.

    Returns
    -------
    dataset : datasets.DatasetDict or datasets.Dataset
        The loaded Hugging Face dataset object. Depending on the dataset,
        it may be a `DatasetDict` (with splits such as "train", "test",
        etc.) or a single `Dataset` object.

    Raises
    ------
    ValueError
        If the dataset cannot be found or loaded from the Hub.

    Examples
    --------
    >>> ds = load_dataset_from_hub("imdb", hf_token="hf_xxx")
    >>> print(ds["train"][0])
    {'text': 'An amazing movie...', 'label': 1}
    """
    try:
        # Authenticate per-call via the token; nothing is persisted locally.
        return load_dataset(model_repo, token=hf_token)
    except Exception as e:
        # Surface any loading failure to the caller as a single ValueError.
        raise ValueError(f"Failed to load dataset '{model_repo}' from the Hugging Face Hub: {e}")
 
1
  from datasets import load_dataset
2
+ from huggingface_hub import HfApi, DatasetInfo
3
+ from typing import Dict, Any
4
 
5
+
6
def load_dataset_from_hub(model_repo: str, hf_token: str) -> Dict[str, Any]:
    """
    Load a dataset from the Hugging Face Hub and return it with its metadata.

    The dataset is loaded with the provided token (no persistent login), and
    repository metadata — revision SHA, latest version tag, split sizes, and
    dataset-card data — is fetched via the Hub API.

    Parameters
    ----------
    model_repo : str
        The name or path of the dataset repository on the Hugging Face Hub.
        Example: "username/dataset_name".
    hf_token : str
        Your Hugging Face access token with permission to read the dataset.

    Returns
    -------
    result : dict
        {
            "dataset": datasets.DatasetDict or datasets.Dataset,
            "metadata": {
                "dataset_repo_id": str,       # the `model_repo` argument
                "dataset_sha": str,           # revision SHA, "unknown" if absent
                "dataset_version_tag": str,   # latest tag name, "no-tag" if none
                "dataset_size": int,          # total rows across all splits
                "dataset_splits": dict,       # {split_name: num_rows}
                "card_data": object,          # dataset-card metadata, {} if absent
            }
        }

    Raises
    ------
    ValueError
        If the dataset cannot be found or loaded.
    """
    try:
        # Load dataset securely using the provided token.
        dataset = load_dataset(model_repo, token=hf_token)

        # Fetch repository-level metadata from the Hub.
        api = HfApi()
        ds_info: DatasetInfo = api.dataset_info(repo_id=model_repo, token=hf_token)
        sha = ds_info.sha or "unknown"
        card_data = ds_info.card_data or {}

        # Best-effort lookup of the latest tag (e.g. "v1.0"); repos without
        # tags, or a failed refs call, fall back to "no-tag".
        try:
            repo_refs = api.list_repo_refs(repo_id=model_repo, repo_type="dataset")
            latest_tag = repo_refs.tags[0].name if repo_refs.tags else "no-tag"
        except Exception:
            latest_tag = "no-tag"

        # Split sizes from the loaded dataset. A DatasetDict is a dict
        # subclass (one entry per split); a single Dataset is reported
        # under the "default" key.
        if isinstance(dataset, dict):
            splits = {split: len(dataset[split]) for split in dataset.keys()}
        else:
            splits = {"default": len(dataset)}
        size = sum(splits.values())

        metadata = {
            "dataset_repo_id": model_repo,
            "dataset_sha": sha,
            "dataset_version_tag": latest_tag,
            "dataset_size": size,
            "dataset_splits": splits,
            "card_data": card_data,
        }

        return {"dataset": dataset, "metadata": metadata}

    except Exception as e:
        raise ValueError(f"Failed to load dataset '{model_repo}' from Hugging Face Hub: {e}")