"""Upload generated dataset to Hugging Face Hub. Usage: python -m src.data.upload_dataset --repo-id Mkaru/structural-mechanics-analytical-100k """ import argparse import logging from pathlib import Path from huggingface_hub import HfApi, create_repo logger = logging.getLogger(__name__) def upload_dataset( data_dir: str = "data/generated", repo_id: str = "Mkaru/structural-mechanics-analytical-100k", dataset_card_path: str = "docs/DATASET_CARD.md", ) -> None: """Upload Parquet files and dataset card to HF Hub.""" api = HfApi() # Create repo if it doesn't exist create_repo(repo_id, repo_type="dataset", exist_ok=True) logger.info(f"Repository: {repo_id}") data_path = Path(data_dir) # Upload Parquet files for split_file in data_path.glob("*.parquet"): api.upload_file( path_or_fileobj=str(split_file), path_in_repo=f"data/{split_file.name}", repo_id=repo_id, repo_type="dataset", ) logger.info(f"Uploaded {split_file.name}") # Upload dataset card as README card_path = Path(dataset_card_path) if card_path.exists(): api.upload_file( path_or_fileobj=str(card_path), path_in_repo="README.md", repo_id=repo_id, repo_type="dataset", ) logger.info("Uploaded dataset card") logger.info(f"Dataset available at: https://huggingface.co/datasets/{repo_id}") def main() -> None: parser = argparse.ArgumentParser(description="Upload dataset to HF Hub") parser.add_argument("--data-dir", default="data/generated") parser.add_argument("--repo-id", default="Mkaru/structural-mechanics-analytical-100k") parser.add_argument("--card", default="docs/DATASET_CARD.md") args = parser.parse_args() logging.basicConfig(level=logging.INFO) upload_dataset(args.data_dir, args.repo_id, args.card) if __name__ == "__main__": main()