Spaces:
Build error
Build error
File size: 4,240 Bytes
b6bdef5 b3387cc 84255b6 cf7c487 b6bdef5 4b94183 84255b6 b3387cc b6bdef5 b3387cc b6bdef5 b3387cc b6bdef5 b3387cc 3cc7351 b3387cc c692f5b b6bdef5 b3387cc b6bdef5 b3387cc 84255b6 b3387cc 4b94183 b3387cc cf7c487 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 | import os
import argparse
from dotenv import load_dotenv
# Enable hf_transfer for faster uploads, but only when the optional
# `hf_transfer` package is actually installed: huggingface_hub requires that
# package whenever the flag is set, so forcing the flag on a machine without
# it makes transfers fail instead of falling back to the standard path.
# This stays ahead of the huggingface_hub import, as in the original ordering
# (presumably the library reads the flag at import time).
import importlib.util

if importlib.util.find_spec("hf_transfer") is not None:
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
from huggingface_hub import HfApi, create_repo
# Load environment variables from .env.local. override=True makes values from
# the file replace variables of the same name that are already set in the
# process environment.
load_dotenv(".env.local", override=True)
def upload_data(repo_id, token, data_dir=".", db_file="rag-kb.db", vector_store_dir="vector_store"):
    """Upload the database file and the vector-store folder to a dataset repo.

    The dataset repository is created as private if it does not exist yet.
    Local artifacts that are missing are skipped with a warning.

    Args:
        repo_id: Target dataset repository ID (e.g. ``"username/rag-data"``).
        token: Hugging Face access token passed to every Hub call.
        data_dir: Local directory that ``db_file`` and ``vector_store_dir``
            are resolved against. Defaults to the current directory.
        db_file: Database file path relative to ``data_dir``; the same value
            is used as the file's path inside the repository.
        vector_store_dir: Vector-store folder relative to ``data_dir``; the
            same value is used as the folder's path inside the repository.

    Returns:
        None. Returns early, after printing guidance, when repository
        creation fails with what looks like an authentication error.

    Raises:
        Exception: The original database upload error when the upload fails
            and cannot be recovered; any error raised by the folder upload.
    """
    print(f"Starting upload to {repo_id}...")
    api = HfApi(token=token)

    # Resolve local paths against data_dir. normpath keeps the default
    # data_dir="." case identical to using the bare names.
    local_db = os.path.normpath(os.path.join(data_dir, db_file))
    local_vector_store = os.path.normpath(os.path.join(data_dir, vector_store_dir))

    # Create repo if not exists (private by default for safety)
    try:
        create_repo(repo_id, repo_type="dataset", private=True, token=token, exist_ok=True)
        print(f"Repository {repo_id} ensures.")
    except Exception as e:
        # Best effort: anything other than an auth failure is only reported,
        # and the uploads below are still attempted.
        print(f"Note: Repo creation check: {e}")
        message = str(e)
        if "401" in message or "Unauthorized" in message:
            print("\n❌ Error: Authentication failed. Your HF_TOKEN is invalid or expired.")
            print("Please check your token at https://huggingface.co/settings/tokens")
            print("You can run this script with a specific token:")
            print(" python3 scripts/upload_data.py --token hf_YOUR_TOKEN")
            return

    # Upload database
    if os.path.exists(local_db):
        print(f"Uploading {local_db}...")
        _upload_db_file(api, repo_id, token, local_db, db_file)
    else:
        print(f"Warning: {local_db} not found locally.")

    # Upload vector store
    if os.path.exists(local_vector_store):
        print(f"Uploading {local_vector_store}...")
        api.upload_folder(
            folder_path=local_vector_store,
            path_in_repo=vector_store_dir,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
        )
    else:
        print(f"Warning: {local_vector_store} not found locally.")

    print("Upload complete!")


def _upload_db_file(api, repo_id, token, local_path, path_in_repo):
    """Upload one file; on an LFS/400-looking error, delete the remote copy and retry once."""

    def push():
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=path_in_repo,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
        )

    try:
        push()
    except Exception as e:
        print(f"Upload failed: {e}")
        message = str(e)
        if "LFS pointer" not in message and "400" not in message:
            # Not a recoverable case: re-raise with the original traceback.
            raise
        print("Attempting to fix LFS state by deleting remote file first...")
        try:
            api.delete_file(
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
            )
            print("Remote file deleted. Retrying upload...")
            push()
        except Exception as delete_error:
            print(f"Failed to delete/retry: {delete_error}")
            # Surface the original upload error, keeping the recovery failure
            # attached as its cause for debugging.
            raise e from delete_error
def main(argv=None):
    """Parse CLI arguments, resolve the token and repo ID, and run the upload.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 when the upload call returned, 1 on missing
        input or an unexpected error, 130 when the upload was interrupted.
    """
    # Local import: only needed for the interactive token prompt.
    import getpass

    parser = argparse.ArgumentParser(description="Upload RAG data to Hugging Face Dataset")
    parser.add_argument("--repo", type=str, default="duqing2026/rag-kb-data", help="Dataset repository ID")
    parser.add_argument("--token", type=str, help="Hugging Face Token (write access)")
    args = parser.parse_args(argv)

    # Try to get token from env if not provided
    token = args.token or os.environ.get("HF_TOKEN")

    # Check for dummy token and ignore it
    if token == "hf_XXXXXXXXXXXXXXXX":
        print("Warning: Detected dummy HF_TOKEN 'hf_XXXXXXXXXXXXXXXX'. Ignoring it.")
        token = None

    if not token:
        try:
            # getpass keeps the secret from being echoed to the terminal.
            token = getpass.getpass("Enter your Hugging Face Token (Write permission): ").strip()
        except EOFError:
            # Non-interactive run with no input available.
            token = ""
    if not token:
        print("\n❌ Error: No token provided. Cannot proceed.")
        return 1

    repo_id = args.repo
    if not repo_id:
        try:
            repo_id = input("Enter Dataset Repo ID (e.g. username/rag-data): ").strip()
        except EOFError:
            repo_id = ""
    if not repo_id:
        # Without this guard an empty ID would be passed to the upload.
        print("\n❌ Error: No repository ID provided. Cannot proceed.")
        return 1

    try:
        upload_data(repo_id, token)
    except KeyboardInterrupt:
        print("\n\nUpload cancelled by user.")
        return 130
    except Exception as e:
        print(f"\n\n❌ An unexpected error occurred: {e}")
        # Report failure to the calling shell instead of exiting with 0.
        return 1
    return 0


if __name__ == "__main__":
    # SystemExit works even where the site-provided exit() is unavailable.
    raise SystemExit(main())
|