Spaces:
Sleeping
Sleeping
| # tools/upload_to_hf.py | |
| import os | |
| from huggingface_hub import HfApi, create_repo | |
| import json | |
| from pathlib import Path | |
def upload_dataset() -> None:
    """Upload the SAP dataset and a small metadata card to Hugging Face.

    Credentials are read from the ``HF_WRITE_TOKEN`` and ``HF_USERNAME``
    environment variables (expected to be populated from GitHub secrets).
    The target dataset repository is ``<HF_USERNAME>/sap-dataset``; it is
    created first if it does not exist yet.

    If ``data/sap_dataset.json`` is missing, a message is printed and the
    function returns without uploading anything (no exception is raised).

    Side effects:
        Writes ``data/dataset_card.json`` locally before uploading it.

    Raises:
        ValueError: If ``HF_WRITE_TOKEN`` or ``HF_USERNAME`` is unset or empty.
    """
    # Fail fast, before any network call, when credentials are missing.
    hf_token = os.getenv("HF_WRITE_TOKEN")
    hf_username = os.getenv("HF_USERNAME")
    if not hf_token:
        raise ValueError("❌ HF_WRITE_TOKEN secret not found in GitHub")
    if not hf_username:
        raise ValueError("❌ HF_USERNAME secret not found in GitHub")

    # The repository always lives under the configured user's namespace.
    hf_repo = f"{hf_username}/sap-dataset"
    print(f"📤 Uploading to Hugging Face: {hf_repo}")

    api = HfApi(token=hf_token)

    # Best effort: a failure here is reported but does not abort the run;
    # the uploads below are still attempted and will surface a hard error
    # themselves if the repository is really unusable.
    try:
        create_repo(
            repo_id=hf_repo,
            repo_type="dataset",
            exist_ok=True,
            token=hf_token,
        )
        print("✅ Repository ready")
    except Exception as e:
        print(f"⚠️ Note: {e}")

    # Guard clause: nothing to upload without the dataset file.
    dataset_path = "data/sap_dataset.json"
    if not Path(dataset_path).exists():
        print(f"❌ Dataset file {dataset_path} not found")
        return

    api.upload_file(
        path_or_fileobj=dataset_path,
        path_in_repo="sap_dataset.json",
        repo_id=hf_repo,
        repo_type="dataset",
        token=hf_token,
    )
    print(f"✅ Dataset uploaded successfully to {hf_repo}")

    # Also upload a small JSON metadata card describing the dataset.
    dataset_card = {
        "dataset_name": "SAP Knowledge Base",
        "description": "Multi-source SAP dataset (Community, StackOverflow, GitHub, Dev.to, Medium, SAP Developers tutorials)",
        "language": "en",
        "task_categories": ["question-answering", "text-generation"],
        "tags": ["sap", "basis", "abap", "hana", "btp", "fiori", "ui5", "qa"],
    }
    card_path = "data/dataset_card.json"
    # Explicit encoding so the written file does not depend on the
    # platform's default locale.
    with open(card_path, "w", encoding="utf-8") as f:
        json.dump(dataset_card, f, indent=2)

    api.upload_file(
        path_or_fileobj=card_path,
        path_in_repo="dataset_card.json",
        repo_id=hf_repo,
        repo_type="dataset",
        token=hf_token,
    )
    print("✅ Dataset card uploaded")
# Script entry point: perform the upload only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    upload_dataset()