sap-chatbot / tools /upload_to_hf.py
github-actions[bot]
Deploy from GitHub Actions 2025-12-11_00:05:39
0f77bc1
# tools/upload_to_hf.py
import os
from huggingface_hub import HfApi, create_repo
import json
from pathlib import Path
def upload_dataset():
    """Push the local SAP dataset and a small JSON card to the Hugging Face Hub.

    Credentials are read from the ``HF_WRITE_TOKEN`` and ``HF_USERNAME``
    environment variables. The target is the dataset repository
    ``<HF_USERNAME>/sap-dataset``; ``create_repo`` is called with
    ``exist_ok=True`` before uploading.

    When ``data/sap_dataset.json`` is missing, a message is printed and
    nothing is uploaded; no exception is raised in that case. Errors from
    the upload calls themselves are not caught here.

    Raises:
        ValueError: If ``HF_WRITE_TOKEN`` or ``HF_USERNAME`` is unset or empty.
    """
    write_token = os.environ.get("HF_WRITE_TOKEN")
    hf_username = os.environ.get("HF_USERNAME")

    # Fail fast: both secrets are required before anything else is attempted.
    if not write_token:
        raise ValueError("❌ HF_WRITE_TOKEN secret not found in GitHub")
    if not hf_username:
        raise ValueError("❌ HF_USERNAME secret not found in GitHub")

    hf_repo = f"{hf_username}/sap-dataset"
    # NOTE(review): the emoji prefixes in some messages below look mis-encoded
    # (mojibake); they are kept verbatim -- confirm against the file's encoding.
    print(f"πŸ“€ Uploading to Hugging Face: {hf_repo}")

    hub = HfApi(token=write_token)
    # Keyword arguments shared by the repo-creation call and both uploads.
    target = {"repo_id": hf_repo, "repo_type": "dataset", "token": write_token}

    try:
        create_repo(exist_ok=True, **target)
        print("βœ… Repository ready")
    except Exception as e:
        # Best effort: report the problem and still attempt the upload below.
        print(f"⚠️ Note: {e}")

    dataset_path = "data/sap_dataset.json"
    if not Path(dataset_path).exists():
        print(f"❌ Dataset file {dataset_path} not found")
        return

    hub.upload_file(
        path_or_fileobj=dataset_path,
        path_in_repo="sap_dataset.json",
        **target,
    )
    print(f"βœ… Dataset uploaded successfully to {hf_repo}")

    # Write the card next to the dataset, then upload it as a second file.
    card_path = "data/dataset_card.json"
    card = {
        "dataset_name": "SAP Knowledge Base",
        "description": "Multi-source SAP dataset (Community, StackOverflow, GitHub, Dev.to, Medium, SAP Developers tutorials)",
        "language": "en",
        "task_categories": ["question-answering", "text-generation"],
        "tags": ["sap", "basis", "abap", "hana", "btp", "fiori", "ui5", "qa"],
    }
    Path(card_path).write_text(json.dumps(card, indent=2))
    hub.upload_file(
        path_or_fileobj=card_path,
        path_in_repo="dataset_card.json",
        **target,
    )
    print("βœ… Dataset card uploaded")
# Script entry point: run the upload only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    upload_dataset()