Vaishnav14220 committed
Commit fdbfba8 · Parent: bd4ecf7

Push datasets via push_to_hub and load from Hub on resume

Files changed (2)
  1. app.py +9 -41
  2. src/dataset_prepare.py +8 -7
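This commit swaps a raw file sync (snapshot_download / upload_folder) for the datasets library's native round-trip: push_to_hub uploads each split as Parquet shards at build time, and load_dataset pulls them back on resume before re-materialising them with save_to_disk. A minimal sketch of that round-trip, assuming a hypothetical repo id (user/my-dataset stands in for FORWARD_DATASET_NAME / RETRO_DATASET_NAME from src.config):

from datasets import DatasetDict, load_dataset, load_from_disk

def push_then_resume(dsd: DatasetDict, repo_id: str, local_dir: str) -> DatasetDict:
    # Build time: upload every split of the DatasetDict to the Hub as Parquet shards.
    dsd.push_to_hub(repo_id, max_shard_size="2GB")
    # Resume time: pull the splits back from the Hub...
    restored = load_dataset(repo_id)
    # ...and materialise them on disk so later code can reopen them with load_from_disk().
    restored.save_to_disk(local_dir)
    return load_from_disk(local_dir)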
app.py CHANGED
@@ -10,7 +10,8 @@ from pathlib import Path
 from datetime import datetime
 from typing import List, Tuple
 
-from huggingface_hub import login, snapshot_download, hf_hub_download, HfApi
+from huggingface_hub import login, hf_hub_download, HfApi
+from datasets import load_dataset, DatasetDict
 from src.config import (
     FORWARD_DATASET_NAME,
     RETRO_DATASET_NAME,
@@ -72,52 +73,19 @@ def _ensure_clean_dir(path: Path):
 
 
 def _download_dataset(repo_id: str, target_dir: Path) -> bool:
-    if _dir_has_arrow_files(target_dir) and (target_dir / "dataset_dict.json").exists():
+    if (target_dir / "dataset_dict.json").exists() and _dir_has_arrow_files(target_dir):
         return True
     if not HF_MODEL_TOKEN:
         print(f"⚠️ Cannot download dataset {repo_id}: HF_MODEL_TOKEN not set.")
         return False
     try:
-        print(f"⬇️ Downloading cached dataset from {repo_id}...")
+        print(f"⬇️ Loading dataset {repo_id} from Hugging Face Hub...")
+        ds = load_dataset(repo_id)
+        if not isinstance(ds, DatasetDict):
+            ds = DatasetDict({k: v for k, v in ds.items()})
         _ensure_clean_dir(target_dir)
-        downloaded_path = Path(
-            snapshot_download(
-                repo_id=repo_id,
-                repo_type="dataset",
-                local_dir=str(target_dir),
-                local_dir_use_symlinks=False,
-                token=HF_MODEL_TOKEN,
-                allow_patterns=["*"],
-            )
-        )
-        if downloaded_path != target_dir:
-            for item in downloaded_path.iterdir():
-                dest = target_dir / item.name
-                if dest.exists():
-                    if dest.is_dir():
-                        shutil.rmtree(dest)
-                    else:
-                        dest.unlink()
-                shutil.move(str(item), str(dest))
-        dataset_file = target_dir / "dataset_dict.json"
-        if not dataset_file.exists():
-            nested = list(target_dir.glob("**/dataset_dict.json"))
-            for cand in nested:
-                if cand.parent == target_dir:
-                    dataset_file = cand
-                    break
-                # move nested dataset up one level
-                for child in cand.parent.iterdir():
-                    dest = target_dir / child.name
-                    if dest.exists():
-                        if dest.is_dir():
-                            shutil.rmtree(dest)
-                        else:
-                            dest.unlink()
-                    shutil.move(str(child), str(dest))
-                dataset_file = target_dir / "dataset_dict.json"
-                break
-        return dataset_file.exists() and _dir_has_arrow_files(target_dir)
+        ds.save_to_disk(str(target_dir))
+        return (target_dir / "dataset_dict.json").exists() and _dir_has_arrow_files(target_dir)
     except Exception as exc:
         print(f"⚠️ Could not download dataset {repo_id}: {exc}")
         return False
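One caveat, flagged here as an assumption rather than part of the commit: the old snapshot_download path passed token=HF_MODEL_TOKEN, while the new load_dataset(repo_id) call relies on ambient credentials (e.g. a prior login()). If the dataset repos are private, the token would need to be threaded through explicitly, roughly like this sketch:

import os
from datasets import load_dataset

# HF_MODEL_TOKEN mirrors the value app.py reads from its config (an assumption
# about its source); "user/private-dataset" is a hypothetical repo id.
HF_MODEL_TOKEN = os.environ.get("HF_MODEL_TOKEN")

# Recent datasets releases accept token=...; older ones used use_auth_token=...
ds = load_dataset("user/private-dataset", token=HF_MODEL_TOKEN)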
src/dataset_prepare.py CHANGED
@@ -24,7 +24,7 @@ HF_API = HfApi(token=HF_TOKEN)
 CACHE_DIR.mkdir(parents=True, exist_ok=True)
 
 
-def upload_split(local_dir: Path, repo_id: str, label: str):
+def push_dataset(dataset: DatasetDict, repo_id: str, label: str):
     if not UPLOAD_DATASETS:
         print(f"Skipping upload of {label} dataset (ORD_UPLOAD_DATASETS disabled).")
         return
@@ -34,12 +34,13 @@ def upload_split(local_dir: Path, repo_id: str, label: str):
         return
 
     try:
-        print(f"Uploading {label} dataset to {repo_id}...")
+        print(f"Uploading {label} dataset to {repo_id} via push_to_hub...")
         create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True, token=HF_TOKEN)
-        HF_API.upload_folder(
-            folder_path=str(local_dir),
+        dataset.push_to_hub(
             repo_id=repo_id,
-            repo_type="dataset",
+            token=HF_TOKEN,
+            max_shard_size="2GB",
+            private=False,
             commit_message=f"Update {label} dataset",
         )
         print(f"✅ Uploaded {label} dataset to Hugging Face Hub.")
@@ -97,9 +98,9 @@ def build_dataset(map_fn, name: str, max_samples=None):
     dsd.save_to_disk(str(save_path))
 
     if name == "forward":
-        upload_split(save_path, FORWARD_DATASET_NAME, "forward")
+        push_dataset(dsd, FORWARD_DATASET_NAME, "forward")
     elif name == "retro":
-        upload_split(save_path, RETRO_DATASET_NAME, "retro")
+        push_dataset(dsd, RETRO_DATASET_NAME, "retro")
 
     print(f"\n{name} dataset statistics:")
     for split_name, ds in dsd.items():
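After push_dataset uploads the splits, the resume path in app.py should see the same splits and row counts. A quick check, sketched with a hypothetical repo id in place of FORWARD_DATASET_NAME:

from datasets import load_dataset

# Inspect what the Hub now serves, to compare against the locally built DatasetDict.
remote = load_dataset("user/ord-forward")  # hypothetical repo id
for split_name, split in remote.items():
    print(f"{split_name}: {split.num_rows} rows, columns={split.column_names}")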