issdandavis committed on
Commit
f47473f
·
1 Parent(s): 46d2791

feat: add hugging face datasets setup tooling

Browse files
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ data/
README.md CHANGED
@@ -45,8 +45,44 @@ vector = embedder.encode("Book a flight from SFO to NYC")
45
  # Returns: 21D numpy array in Poincare Ball coordinates
46
  ```
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  ## Related
49
 
50
  - [SCBE-AETHERMOORE GitHub](https://github.com/issdandavis/SCBE-AETHERMOORE)
51
  - [Knowledge Base Dataset](https://huggingface.co/datasets/issdandavis/scbe-aethermoore-knowledge-base)
52
- - [Interaction Logs Dataset](https://huggingface.co/datasets/issdandavis/scbe-interaction-logs)
 
45
  # Returns: 21D numpy array in Poincare Ball coordinates
46
  ```
47
 
48
+ ## Dataset Setup (PowerShell)
49
+
50
+ Use this repo as a working directory for Hugging Face datasets:
51
+
52
+ ```powershell
53
+ cd <path-to-your-clone>\phdm-21d-embedding
54
+ python -m venv .venv
55
+ .\.venv\Scripts\Activate.ps1
56
+ python -m pip install --upgrade pip
57
+ python -m pip install -r requirements-datasets.txt
58
+ ```
59
+
60
+ Set your token in the current shell session:
61
+
62
+ ```powershell
63
+ $env:HF_TOKEN="hf_your_token_here"
64
+ ```
65
+
66
+ Load and preview a dataset split:
67
+
68
+ ```powershell
69
+ python scripts/load_hf_dataset.py --dataset-id issdandavis/scbe-aethermoore-knowledge-base --split train --limit 3
70
+ ```
71
+
72
+ Push local JSONL files to a dataset repo:
73
+
74
+ ```powershell
75
+ python scripts/push_jsonl_dataset.py --dataset-id issdandavis/scbe-aethermoore-knowledge-base --train .\data\train.jsonl --validation .\data\validation.jsonl
76
+ ```
77
+
78
+ Expected JSONL row format example:
79
+
80
+ ```json
81
+ {"text":"Example source content","source":"notion","category":"policy"}
82
+ ```
83
+
84
  ## Related
85
 
86
  - [SCBE-AETHERMOORE GitHub](https://github.com/issdandavis/SCBE-AETHERMOORE)
87
  - [Knowledge Base Dataset](https://huggingface.co/datasets/issdandavis/scbe-aethermoore-knowledge-base)
88
+ - [Interaction Logs Dataset](https://huggingface.co/datasets/issdandavis/scbe-interaction-logs)
requirements-datasets.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ datasets>=2.19.0
2
+ huggingface_hub>=0.24.0
3
+ pyarrow>=15.0.0
4
+ pandas>=2.2.0
scripts/load_hf_dataset.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """Quick dataset loader for Hugging Face Hub datasets."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import os
8
+
9
+ from datasets import load_dataset
10
+ from huggingface_hub import HfApi
11
+
12
+
13
def parse_args() -> argparse.Namespace:
    """Return parsed command-line options for previewing a Hub dataset split."""
    cli = argparse.ArgumentParser(
        description="Load and preview a dataset split from Hugging Face Hub."
    )
    cli.add_argument(
        "--dataset-id",
        default="issdandavis/scbe-aethermoore-knowledge-base",
        help="Dataset repo id on Hugging Face Hub (for example: username/dataset-name).",
    )
    cli.add_argument("--split", default="train", help="Split to load.")
    cli.add_argument(
        "--limit",
        type=int,
        default=3,
        help="How many examples to print from the split.",
    )
    cli.add_argument(
        "--streaming",
        action="store_true",
        help="Stream examples without downloading the full dataset.",
    )
    # Default is read from the environment at parse time, so tests/sessions
    # that change HF_TOKEN before calling parse_args() are honored.
    cli.add_argument(
        "--token",
        default=os.environ.get("HF_TOKEN"),
        help="HF access token. Defaults to HF_TOKEN env var.",
    )
    return cli.parse_args()
40
+
41
+
42
def main() -> None:
    """Load a dataset split from the Hub and print a short preview.

    Verifies the provided token via ``whoami`` when present, falling back to
    anonymous access if the check fails. Exits via ``SystemExit`` with a
    readable message when the dataset or split cannot be loaded.
    """
    args = parse_args()
    token = args.token

    if token:
        try:
            user = HfApi(token=token).whoami()["name"]
            print(f"Authenticated as: {user}")
        except Exception as exc:  # pragma: no cover - network/auth failure path
            print(f"Token check failed ({exc}). Retrying without token for public access.")
            token = None
    else:
        print("No HF token provided. Public datasets only.")

    try:
        ds = load_dataset(
            path=args.dataset_id,
            split=args.split,
            token=token,
            streaming=args.streaming,
        )
    except Exception as exc:  # pragma: no cover - network/hub failure path
        raise SystemExit(
            f"Failed to load dataset '{args.dataset_id}' split '{args.split}': {exc}"
        ) from exc

    if args.streaming:
        print(f"Loaded streaming split '{args.split}' from '{args.dataset_id}'.")
        # BUGFIX: check the limit BEFORE printing, so --limit 0 (or a negative
        # value) prints nothing — matching the non-streaming branch below,
        # which uses min(limit, len(ds)) and already handles this correctly.
        for idx, row in enumerate(ds):
            if idx >= args.limit:
                break
            print(f"[{idx}] {row}")
        return

    print(f"Loaded split '{args.split}' from '{args.dataset_id}'.")
    print(f"Rows: {len(ds)}")
    print(f"Columns: {ds.column_names}")
    print(f"Features: {ds.features}")
    for idx in range(min(args.limit, len(ds))):
        print(f"[{idx}] {ds[idx]}")


if __name__ == "__main__":
    main()
scripts/push_jsonl_dataset.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """Build and push a DatasetDict from local JSONL files."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import os
8
+ from pathlib import Path
9
+
10
+ from datasets import DatasetDict, load_dataset
11
+ from huggingface_hub import create_repo
12
+
13
+
14
def parse_args() -> argparse.Namespace:
    """Return parsed command-line options for pushing JSONL splits to the Hub."""
    cli = argparse.ArgumentParser(
        description="Push train/validation/test JSONL files to a Hugging Face dataset repo."
    )
    cli.add_argument(
        "--dataset-id",
        required=True,
        help="Dataset repo id (for example: username/dataset-name).",
    )
    # One optional path flag per supported split.
    for split in ("train", "validation", "test"):
        cli.add_argument(f"--{split}", help=f"Path to {split} JSONL file.")
    cli.add_argument(
        "--private",
        action="store_true",
        help="Create/update the dataset as private.",
    )
    cli.add_argument(
        "--token",
        default=os.environ.get("HF_TOKEN"),
        help="HF access token. Defaults to HF_TOKEN env var.",
    )
    return cli.parse_args()
37
+
38
+
39
def validate_split_path(name: str, path: str | None) -> Path | None:
    """Resolve *path* and ensure it points at an existing ``.jsonl`` file.

    Returns ``None`` when no path was supplied. Raises ``FileNotFoundError``
    for a missing file and ``ValueError`` for a wrong file extension; the
    split *name* is included in the message for easier diagnosis.
    """
    if not path:
        return None

    resolved = Path(path).expanduser().resolve()
    # Existence is checked first so a missing non-.jsonl file reports
    # FileNotFoundError rather than the extension error.
    if not resolved.exists():
        raise FileNotFoundError(f"{name} file not found: {resolved}")
    if resolved.suffix.lower() != ".jsonl":
        raise ValueError(f"{name} file must be a .jsonl file: {resolved}")
    return resolved
49
+
50
+
51
def main() -> None:
    """Validate the supplied JSONL splits and push them as a DatasetDict."""
    args = parse_args()

    # Keep only the splits the caller actually provided, as string paths.
    requested = (
        ("train", args.train),
        ("validation", args.validation),
        ("test", args.test),
    )
    split_paths: dict[str, str] = {}
    for split_name, raw_path in requested:
        checked = validate_split_path(split_name, raw_path)
        if checked is not None:
            split_paths[split_name] = str(checked)

    if not split_paths:
        raise ValueError("Provide at least one split: --train, --validation, or --test.")

    if not args.token:
        raise ValueError("Set HF_TOKEN or pass --token to push a dataset.")

    # Ensure the destination repo exists (no-op when it already does).
    try:
        create_repo(
            repo_id=args.dataset_id,
            repo_type="dataset",
            private=args.private,
            exist_ok=True,
            token=args.token,
        )
    except Exception as exc:  # pragma: no cover - network/hub failure path
        raise SystemExit(f"Failed to create/access dataset repo '{args.dataset_id}': {exc}") from exc

    # Parse each JSONL file into its own split via the built-in json builder.
    split_datasets = {}
    for split_name, path in split_paths.items():
        loaded = load_dataset(
            "json",
            data_files={split_name: path},
            split=split_name,
        )
        split_datasets[split_name] = loaded
        print(f"Loaded {split_name}: {len(loaded)} rows from {path}")

    try:
        DatasetDict(split_datasets).push_to_hub(
            repo_id=args.dataset_id,
            private=args.private,
            token=args.token,
        )
    except Exception as exc:  # pragma: no cover - network/hub failure path
        raise SystemExit(f"Failed to push dataset '{args.dataset_id}': {exc}") from exc

    print(f"Pushed dataset to: https://huggingface.co/datasets/{args.dataset_id}")


if __name__ == "__main__":
    main()