Spaces:
Sleeping
Sleeping
File size: 1,967 Bytes
"""Upload the generated dataset to the Hugging Face Hub.

Usage:
    python -m src.data.upload_dataset --repo-id Mkaru/structural-mechanics-analytical-100k
"""
import argparse
import logging
from pathlib import Path
from huggingface_hub import HfApi, create_repo
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def upload_dataset(
    data_dir: str = "data/generated",
    repo_id: str = "Mkaru/structural-mechanics-analytical-100k",
    dataset_card_path: str = "docs/DATASET_CARD.md",
) -> None:
    """Upload Parquet files and the dataset card to the Hugging Face Hub.

    Args:
        data_dir: Directory whose top-level ``*.parquet`` files are uploaded
            under ``data/`` in the repository.
        repo_id: Target dataset repository on the Hub.
        dataset_card_path: File uploaded as the repository's ``README.md``.
            When it is not an existing file, the card upload is skipped and
            a warning is logged.

    Raises:
        FileNotFoundError: If ``data_dir`` is not an existing directory or
            contains no ``*.parquet`` files.
    """
    # Validate local inputs first so a bad path fails before any Hub call
    # (otherwise an empty repo is created and "success" is reported).
    data_path = Path(data_dir)
    if not data_path.is_dir():
        raise FileNotFoundError(f"Data directory not found: {data_path}")

    # Sorted for a deterministic upload order; directories that happen to
    # match the pattern are ignored because upload_file expects a file.
    parquet_files = sorted(
        candidate
        for candidate in data_path.glob("*.parquet")
        if candidate.is_file()
    )
    if not parquet_files:
        raise FileNotFoundError(f"No .parquet files found in {data_path}")

    api = HfApi()

    # Create repo if it doesn't exist
    create_repo(repo_id, repo_type="dataset", exist_ok=True)
    logger.info("Repository: %s", repo_id)

    # Upload Parquet files
    for split_file in parquet_files:
        api.upload_file(
            path_or_fileobj=str(split_file),
            path_in_repo=f"data/{split_file.name}",
            repo_id=repo_id,
            repo_type="dataset",
        )
        logger.info("Uploaded %s", split_file.name)

    # Upload dataset card as README
    card_path = Path(dataset_card_path)
    if card_path.is_file():
        api.upload_file(
            path_or_fileobj=str(card_path),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="dataset",
        )
        logger.info("Uploaded dataset card")
    else:
        # The card is optional, but make the omission visible.
        logger.warning(
            "Dataset card not found at %s; skipping README upload", card_path
        )

    logger.info(
        "Dataset available at: https://huggingface.co/datasets/%s", repo_id
    )
def main() -> None:
    """Command-line entry point: parse the options and run the upload."""
    cli = argparse.ArgumentParser(description="Upload dataset to HF Hub")

    # Each option is a plain string flag with a default value.
    option_defaults = (
        ("--data-dir", "data/generated"),
        ("--repo-id", "Mkaru/structural-mechanics-analytical-100k"),
        ("--card", "docs/DATASET_CARD.md"),
    )
    for flag, fallback in option_defaults:
        cli.add_argument(flag, default=fallback)
    opts = cli.parse_args()

    logging.basicConfig(level=logging.INFO)
    upload_dataset(
        data_dir=opts.data_dir,
        repo_id=opts.repo_id,
        dataset_card_path=opts.card,
    )
# Run the CLI only when executed as a script (e.g. via ``python -m``),
# not when this module is imported.
if __name__ == "__main__":
    main()
|