# File size: 2,334 Bytes
# Revision: 0f77bc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73

# tools/upload_to_hf.py
import os
from huggingface_hub import HfApi, create_repo
import json
from pathlib import Path

def upload_dataset(
    *,
    dataset_path: str = "data/sap_dataset.json",
    repo_name: str = "sap-dataset",
) -> bool:
    """Upload the dataset file and a small JSON card to the Hugging Face Hub.

    Credentials come from the ``HF_WRITE_TOKEN`` and ``HF_USERNAME``
    environment variables (expected to be provided as GitHub secrets, per the
    error messages below). The target is the dataset repository
    ``<HF_USERNAME>/<repo_name>``; a best-effort attempt is made to create it
    before uploading.

    Args:
        dataset_path: Local path of the dataset JSON file. It is stored in
            the repository under its file name.
        repo_name: Repository name appended to the username.

    Returns:
        ``True`` when the dataset and its card were uploaded, ``False`` when
        ``dataset_path`` does not exist (a message is printed and no file is
        uploaded).

    Raises:
        ValueError: If ``HF_WRITE_TOKEN`` or ``HF_USERNAME`` is unset or empty.

    Exceptions raised by the upload calls themselves are not caught here.
    """
    # Get credentials from GitHub secrets (exposed to the job as env vars).
    hf_token = os.getenv("HF_WRITE_TOKEN")
    hf_username = os.getenv("HF_USERNAME")

    if not hf_token:
        raise ValueError("❌ HF_WRITE_TOKEN secret not found in GitHub")
    if not hf_username:
        raise ValueError("❌ HF_USERNAME secret not found in GitHub")

    # Build repo ID from username
    hf_repo = f"{hf_username}/{repo_name}"

    print(f"📤 Uploading to Hugging Face: {hf_repo}")

    # Initialize HF API
    api = HfApi(token=hf_token)

    # Create repo if it doesn't exist. This stays best-effort on purpose: a
    # failure here is reported but not fatal, and a genuinely unusable repo
    # will surface as an error from the upload below.
    try:
        create_repo(repo_id=hf_repo, repo_type="dataset", exist_ok=True, token=hf_token)
        print("✅ Repository ready")
    except Exception as e:
        print(f"⚠️  Note: {e}")

    # Guard clause: nothing to upload when the dataset file is absent.
    local_dataset = Path(dataset_path)
    if not local_dataset.exists():
        print(f"❌ Dataset file {dataset_path} not found")
        return False

    # Upload dataset file
    api.upload_file(
        path_or_fileobj=str(local_dataset),
        path_in_repo=local_dataset.name,
        repo_id=hf_repo,
        repo_type="dataset",
        token=hf_token,
    )
    print(f"✅ Dataset uploaded successfully to {hf_repo}")

    # Also upload a dataset card
    dataset_card = {
        "dataset_name": "SAP Knowledge Base",
        "description": "Multi-source SAP dataset (Community, StackOverflow, GitHub, Dev.to, Medium, SAP Developers tutorials)",
        "language": "en",
        "task_categories": ["question-answering", "text-generation"],
        "tags": ["sap", "basis", "abap", "hana", "btp", "fiori", "ui5", "qa"],
    }

    # The card is written next to the dataset file; with the default path
    # this is data/dataset_card.json. An explicit encoding keeps the output
    # independent of the platform's locale.
    card_path = local_dataset.with_name("dataset_card.json")
    with open(card_path, "w", encoding="utf-8") as f:
        json.dump(dataset_card, f, indent=2)

    api.upload_file(
        path_or_fileobj=str(card_path),
        path_in_repo="dataset_card.json",
        repo_id=hf_repo,
        repo_type="dataset",
        token=hf_token,
    )
    print("✅ Dataset card uploaded")
    return True

# Run the upload only when this file is executed as a script, so importing
# the module does not trigger any network activity.
if __name__ == "__main__":
    upload_dataset()