rag-kb-system / scripts /upload_data.py
duqing2026's picture
加速数据库上传:hf_transfer加速库
cf7c487
import os
import argparse
from dotenv import load_dotenv
# Enable hf_transfer for faster uploads.
# NOTE: the flag is deliberately set BEFORE huggingface_hub is imported below;
# presumably the library reads it at import time, so keep this ordering.
# NOTE(review): this likely requires the `hf_transfer` package to be installed — confirm.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
from huggingface_hub import HfApi, create_repo
# Load environment variables from .env.local; override=True lets values from
# that file replace variables already present in the process environment.
load_dotenv(".env.local", override=True)
def _upload_file_with_lfs_retry(api, local_path, path_in_repo, repo_id, token):
    """Upload one file; on an LFS-pointer/400 failure delete the remote copy and retry once."""
    upload_kwargs = {
        "path_or_fileobj": local_path,
        "path_in_repo": path_in_repo,
        "repo_id": repo_id,
        "repo_type": "dataset",
        "token": token,
    }
    try:
        api.upload_file(**upload_kwargs)
    except Exception as upload_error:
        print(f"Upload failed: {upload_error}")
        message = str(upload_error)
        if "LFS pointer" not in message and "400" not in message:
            # Not a failure this recovery targets: propagate with the
            # original traceback intact.
            raise
        print("Attempting to fix LFS state by deleting remote file first...")
        try:
            api.delete_file(
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
            )
            print("Remote file deleted. Retrying upload...")
            api.upload_file(**upload_kwargs)
        except Exception as delete_error:
            print(f"Failed to delete/retry: {delete_error}")
            # Surface the ORIGINAL upload failure to the caller, but keep the
            # recovery failure attached as its cause for debugging.
            raise upload_error from delete_error


def upload_data(repo_id, token, data_dir=".", db_file="rag-kb.db", vector_store_dir="vector_store"):
    """Upload the RAG database file and the vector store folder to a Hugging Face dataset repo.

    Args:
        repo_id: Target dataset repository ID (e.g. ``"username/rag-data"``).
        token: Hugging Face access token passed to every Hub call.
        data_dir: Local directory that contains ``db_file`` and
            ``vector_store_dir``. Defaults to the current directory.
        db_file: Database file name, relative to ``data_dir``; the same value
            is used as the file's path inside the repository.
        vector_store_dir: Vector store folder name, relative to ``data_dir``;
            the same value is used as the folder's path inside the repository.

    Returns:
        None. Returns early, after printing guidance, when the repository
        creation check fails with an authentication (401/Unauthorized) error.

    Raises:
        Exception: Any error raised by the Hub client while uploading the
            database file (after the delete-and-retry recovery, if attempted)
            or the vector store folder.
    """
    print(f"Starting upload to {repo_id}...")
    api = HfApi(token=token)

    # Create repo if not exists (private by default for safety)
    try:
        create_repo(repo_id, repo_type="dataset", private=True, token=token, exist_ok=True)
        print(f"Repository {repo_id} ensures.")
    except Exception as e:
        print(f"Note: Repo creation check: {e}")
        error_text = str(e)
        if "401" in error_text or "Unauthorized" in error_text:
            print("\n❌ Error: Authentication failed. Your HF_TOKEN is invalid or expired.")
            print("Please check your token at https://huggingface.co/settings/tokens")
            print("You can run this script with a specific token:")
            print(" python3 scripts/upload_data.py --token hf_YOUR_TOKEN")
            return
        # Any other failure of the creation check is treated as non-fatal:
        # the uploads below are still attempted.

    # Resolve local paths against data_dir (previously this parameter was
    # accepted but ignored). Paths inside the repo stay relative names.
    local_db_path = os.path.join(data_dir, db_file)
    local_vector_store_path = os.path.join(data_dir, vector_store_dir)

    # Upload database
    if os.path.exists(local_db_path):
        print(f"Uploading {db_file}...")
        _upload_file_with_lfs_retry(api, local_db_path, db_file, repo_id, token)
    else:
        print(f"Warning: {db_file} not found locally.")

    # Upload vector store
    if os.path.exists(local_vector_store_path):
        print(f"Uploading {vector_store_dir}...")
        api.upload_folder(
            folder_path=local_vector_store_path,
            path_in_repo=vector_store_dir,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
        )
    else:
        print(f"Warning: {vector_store_dir} not found locally.")

    print("Upload complete!")
def main():
    """Parse CLI arguments, resolve the token and repo ID, and run the upload.

    The token is taken from ``--token``, then the ``HF_TOKEN`` environment
    variable, then an interactive prompt. A known placeholder token value is
    ignored.

    Returns:
        Process exit status: 0 when ``upload_data`` returns normally, 1 when
        required input is missing or the upload raises, 130 when the upload is
        interrupted from the keyboard.
    """
    parser = argparse.ArgumentParser(description="Upload RAG data to Hugging Face Dataset")
    parser.add_argument("--repo", type=str, default="duqing2026/rag-kb-data", help="Dataset repository ID")
    parser.add_argument("--token", type=str, help="Hugging Face Token (write access)")
    args = parser.parse_args()

    # Try to get token from env if not provided
    token = args.token or os.environ.get("HF_TOKEN")

    # Check for dummy token and ignore it
    if token == "hf_XXXXXXXXXXXXXXXX":
        print("Warning: Detected dummy HF_TOKEN 'hf_XXXXXXXXXXXXXXXX'. Ignoring it.")
        token = None

    if not token:
        token = input("Enter your Hugging Face Token (Write permission): ").strip()
    if not token:
        print("\n❌ Error: No token provided. Cannot proceed.")
        return 1

    repo_id = args.repo
    if not repo_id:
        repo_id = input("Enter Dataset Repo ID (e.g. username/rag-data): ").strip()
    if not repo_id:
        # An empty repo ID can only fail later inside the Hub client; stop early.
        print("\n❌ Error: No dataset repo ID provided. Cannot proceed.")
        return 1

    try:
        upload_data(repo_id, token)
    except KeyboardInterrupt:
        print("\n\nUpload cancelled by user.")
        return 130  # conventional status for termination by Ctrl-C
    except Exception as e:
        print(f"\n\n❌ An unexpected error occurred: {e}")
        return 1  # report failure to the calling shell / CI instead of exiting 0
    return 0


if __name__ == "__main__":
    # SystemExit is used instead of the site-provided exit() helper, which is
    # not guaranteed to be available in every interpreter configuration.
    raise SystemExit(main())