File size: 1,967 Bytes
8e5ba9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""Upload generated dataset to Hugging Face Hub.

Usage:
    python -m src.data.upload_dataset --repo-id Mkaru/structural-mechanics-analytical-100k
"""

import argparse
import logging
from pathlib import Path

from huggingface_hub import HfApi, create_repo

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


def upload_dataset(
    data_dir: str = "data/generated",
    repo_id: str = "Mkaru/structural-mechanics-analytical-100k",
    dataset_card_path: str = "docs/DATASET_CARD.md",
) -> None:
    """Upload Parquet files and dataset card to HF Hub.

    Ensures the dataset repository exists, uploads every ``*.parquet`` file
    found directly inside ``data_dir`` to ``data/<file name>`` in the repo,
    and uploads the dataset card as the repo's ``README.md``.

    Problems that leave nothing to upload (no Parquet files, no card file)
    are reported as warnings instead of being silently ignored, so a wrong
    path is visible in the logs.

    Args:
        data_dir: Local directory searched (non-recursively) for
            ``*.parquet`` files.
        repo_id: Identifier of the target dataset repository on the Hub.
        dataset_card_path: Local file uploaded as ``README.md``; skipped
            with a warning when it is not an existing regular file.
    """
    api = HfApi()

    # Create repo if it doesn't exist
    create_repo(repo_id, repo_type="dataset", exist_ok=True)
    logger.info("Repository: %s", repo_id)

    # Sort so the upload order is reproducible; raw glob order depends on
    # the filesystem.
    parquet_files = sorted(Path(data_dir).glob("*.parquet"))
    if not parquet_files:
        logger.warning("No Parquet files found in %s; nothing to upload", data_dir)

    # Upload Parquet files
    for split_file in parquet_files:
        api.upload_file(
            path_or_fileobj=str(split_file),
            path_in_repo=f"data/{split_file.name}",
            repo_id=repo_id,
            repo_type="dataset",
        )
        logger.info("Uploaded %s", split_file.name)

    # Upload dataset card as README
    card_path = Path(dataset_card_path)
    if card_path.is_file():
        api.upload_file(
            path_or_fileobj=str(card_path),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="dataset",
        )
        logger.info("Uploaded dataset card")
    else:
        logger.warning("Dataset card not found at %s; skipping README upload", card_path)

    logger.info("Dataset available at: https://huggingface.co/datasets/%s", repo_id)


def main() -> None:
    """Command-line entry point: parse options, set up logging, upload."""
    cli = argparse.ArgumentParser(description="Upload dataset to HF Hub")

    # Each option is a plain string flag with a default value.
    option_defaults = (
        ("--data-dir", "data/generated"),
        ("--repo-id", "Mkaru/structural-mechanics-analytical-100k"),
        ("--card", "docs/DATASET_CARD.md"),
    )
    for flag, fallback in option_defaults:
        cli.add_argument(flag, default=fallback)

    options = cli.parse_args()

    # Emit INFO-level progress messages from the upload step.
    logging.basicConfig(level=logging.INFO)
    upload_dataset(options.data_dir, options.repo_id, options.card)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()