File size: 4,240 Bytes
b6bdef5
b3387cc
84255b6
cf7c487
 
 
 
b6bdef5
 
4b94183
 
84255b6
b3387cc
 
b6bdef5
b3387cc
 
b6bdef5
b3387cc
 
b6bdef5
b3387cc
3cc7351
 
 
 
 
 
b3387cc
 
 
 
c692f5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6bdef5
b3387cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6bdef5
 
b3387cc
 
 
 
 
 
 
 
 
84255b6
 
 
 
 
b3387cc
 
 
4b94183
 
 
 
b3387cc
 
 
 
cf7c487
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import argparse
from dotenv import load_dotenv

# Enable hf_transfer for faster uploads
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import HfApi, create_repo

# Load environment variables from .env.local, overriding system envs if present
load_dotenv(".env.local", override=True)

def _upload_file_with_lfs_retry(api, local_path, path_in_repo, repo_id, token):
    """Upload one file to a dataset repo, retrying once after deleting the remote copy.

    The retry is attempted only when the first failure's message contains
    "LFS pointer" or "400"; any other failure is re-raised unchanged. If the
    delete or the second upload fails, the ORIGINAL upload error is raised.
    """
    upload_kwargs = {
        "path_or_fileobj": local_path,
        "path_in_repo": path_in_repo,
        "repo_id": repo_id,
        "repo_type": "dataset",
        "token": token,
    }
    try:
        api.upload_file(**upload_kwargs)
    except Exception as e:
        print(f"Upload failed: {e}")
        message = str(e)
        if "LFS pointer" not in message and "400" not in message:
            raise
        print("Attempting to fix LFS state by deleting remote file first...")
        try:
            api.delete_file(
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
            )
            print("Remote file deleted. Retrying upload...")
            api.upload_file(**upload_kwargs)
        except Exception as delete_error:
            print(f"Failed to delete/retry: {delete_error}")
            # Surface the first failure; the delete/retry error was printed above.
            raise e


def upload_data(
    repo_id: str,
    token: str,
    data_dir: str = ".",
    db_file: str = "rag-kb.db",
    vector_store_dir: str = "vector_store",
) -> None:
    """Upload the database file and vector store folder to a Hugging Face dataset repo.

    Args:
        repo_id: Dataset repository ID to upload into.
        token: Hugging Face token passed to every hub call.
        data_dir: Local directory that ``db_file`` and ``vector_store_dir`` are
            resolved against. Absolute values for those two parameters are
            used as given.
        db_file: Database file name; also used as its path inside the repo.
        vector_store_dir: Vector store folder name; also used as its path
            inside the repo.

    Returns:
        None. Returns early, after printing guidance, when repo creation fails
        with a message containing "401" or "Unauthorized".

    Raises:
        Exception: Whatever the hub client raises when the database upload
            (including its delete-and-retry fallback) or the folder upload fails.
    """
    print(f"Starting upload to {repo_id}...")
    api = HfApi(token=token)

    # Create repo if not exists (private by default for safety)
    try:
        create_repo(repo_id, repo_type="dataset", private=True, token=token, exist_ok=True)
        print(f"Repository {repo_id} ensures.")
    except Exception as e:
        print(f"Note: Repo creation check: {e}")
        if "401" in str(e) or "Unauthorized" in str(e):
            print("\n❌ Error: Authentication failed. Your HF_TOKEN is invalid or expired.")
            print("Please check your token at https://huggingface.co/settings/tokens")
            print("You can run this script with a specific token:")
            print("  python3 scripts/upload_data.py --token hf_YOUR_TOKEN")
            return
        # Any other creation error is treated as non-fatal; the uploads below
        # will raise if the repo is genuinely unusable.

    # Resolve local paths against data_dir; the in-repo paths stay as the
    # caller-supplied names so the remote layout does not depend on data_dir.
    local_db_path = os.path.join(data_dir, db_file)
    local_vector_path = os.path.join(data_dir, vector_store_dir)

    # Upload database
    if os.path.exists(local_db_path):
        print(f"Uploading {db_file}...")
        _upload_file_with_lfs_retry(api, local_db_path, db_file, repo_id, token)
    else:
        print(f"Warning: {db_file} not found locally.")

    # Upload vector store
    if os.path.exists(local_vector_path):
        print(f"Uploading {vector_store_dir}...")
        api.upload_folder(
            folder_path=local_vector_path,
            path_in_repo=vector_store_dir,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
        )
    else:
        print(f"Warning: {vector_store_dir} not found locally.")

    print("Upload complete!")

if __name__ == "__main__":
    # Command-line entry point: resolve the token and repo ID (flags first,
    # then environment / interactive prompt), run the upload, and report the
    # outcome through the process exit status.
    parser = argparse.ArgumentParser(description="Upload RAG data to Hugging Face Dataset")
    parser.add_argument("--repo", type=str, default="duqing2026/rag-kb-data", help="Dataset repository ID")
    parser.add_argument("--token", type=str, help="Hugging Face Token (write access)")

    args = parser.parse_args()

    # Try to get token from env if not provided
    token = args.token or os.environ.get("HF_TOKEN")

    # Check for dummy token and ignore it
    if token == "hf_XXXXXXXXXXXXXXXX":
        print("Warning: Detected dummy HF_TOKEN 'hf_XXXXXXXXXXXXXXXX'. Ignoring it.")
        token = None

    if not token:
        token = input("Enter your Hugging Face Token (Write permission): ").strip()

    if not token:
        print("\n❌ Error: No token provided. Cannot proceed.")
        # SystemExit instead of exit(): the exit() builtin is injected by the
        # site module and is not guaranteed to exist (e.g. under `python -S`).
        raise SystemExit(1)

    repo_id = args.repo
    if not repo_id:
        repo_id = input("Enter Dataset Repo ID (e.g. username/rag-data): ").strip()

    # An empty repo ID can still reach this point (e.g. `--repo ""` followed
    # by an empty answer); stop instead of calling the hub with it.
    if not repo_id:
        print("\n❌ Error: No dataset repo ID provided. Cannot proceed.")
        raise SystemExit(1)

    try:
        upload_data(repo_id, token)
    except KeyboardInterrupt:
        print("\n\nUpload cancelled by user.")
        # 128 + SIGINT, the conventional status for a Ctrl-C termination.
        raise SystemExit(130) from None
    except Exception as e:
        print(f"\n\n❌ An unexpected error occurred: {e}")
        # Report failure to the calling shell / CI instead of exiting with 0.
        raise SystemExit(1) from None