# fea-surrogate / src / data / upload_dataset.py
# WolfDavid's picture
# Upload folder using huggingface_hub
# 8e5ba9e verified
"""Upload generated dataset to Hugging Face Hub.
Usage:
python -m src.data.upload_dataset --repo-id Mkaru/structural-mechanics-analytical-100k
"""
import argparse
import logging
from pathlib import Path
from huggingface_hub import HfApi, create_repo
# Module-level logger named after this module; main() enables INFO output via logging.basicConfig.
logger = logging.getLogger(__name__)
def upload_dataset(
    data_dir: str = "data/generated",
    repo_id: str = "Mkaru/structural-mechanics-analytical-100k",
    dataset_card_path: str = "docs/DATASET_CARD.md",
) -> None:
    """Upload Parquet files and dataset card to HF Hub.

    Creates the dataset repository ``repo_id`` if it does not exist yet, then
    uploads every ``*.parquet`` file found directly inside ``data_dir`` to
    ``data/<file name>`` in that repository. If ``dataset_card_path`` exists,
    it is uploaded as the repository's ``README.md``.

    A data directory without Parquet files, or a missing dataset card, does
    not raise; each case is reported with a warning so that an upload which
    transferred nothing is visible in the logs.

    Args:
        data_dir: Local directory searched (non-recursively) for
            ``*.parquet`` files.
        repo_id: Identifier of the target dataset repository on the Hub.
        dataset_card_path: Local file to publish as ``README.md``.
    """
    api = HfApi()

    # Create repo if it doesn't exist
    create_repo(repo_id, repo_type="dataset", exist_ok=True)
    logger.info("Repository: %s", repo_id)

    data_path = Path(data_dir)

    # Sort so the upload (and resulting commit) order is reproducible;
    # glob() yields entries in filesystem-dependent order.
    split_files = sorted(data_path.glob("*.parquet"))
    if not split_files:
        logger.warning("No .parquet files found in %s; nothing to upload", data_path)

    # Upload Parquet files
    for split_file in split_files:
        api.upload_file(
            path_or_fileobj=str(split_file),
            path_in_repo=f"data/{split_file.name}",
            repo_id=repo_id,
            repo_type="dataset",
        )
        logger.info("Uploaded %s", split_file.name)

    # Upload dataset card as README
    card_path = Path(dataset_card_path)
    if card_path.exists():
        api.upload_file(
            path_or_fileobj=str(card_path),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="dataset",
        )
        logger.info("Uploaded dataset card")
    else:
        logger.warning(
            "Dataset card not found at %s; README.md was not uploaded", card_path
        )

    logger.info("Dataset available at: https://huggingface.co/datasets/%s", repo_id)
def main() -> None:
    """Command-line entry point.

    Reads ``--data-dir``, ``--repo-id`` and ``--card`` from the command line,
    turns on INFO-level logging, and hands the parsed values to
    ``upload_dataset``.
    """
    cli = argparse.ArgumentParser(description="Upload dataset to HF Hub")

    # (flag, default) pairs; every option is a plain string with a default.
    option_defaults = (
        ("--data-dir", "data/generated"),
        ("--repo-id", "Mkaru/structural-mechanics-analytical-100k"),
        ("--card", "docs/DATASET_CARD.md"),
    )
    for flag, fallback in option_defaults:
        cli.add_argument(flag, default=fallback)

    opts = cli.parse_args()

    logging.basicConfig(level=logging.INFO)
    upload_dataset(
        data_dir=opts.data_dir,
        repo_id=opts.repo_id,
        dataset_card_path=opts.card,
    )
# Run the CLI only when this file is executed as a script (e.g. ``python -m src.data.upload_dataset``), not on import.
if __name__ == "__main__":
    main()