"""Upload RAG knowledge-base data (SQLite DB + vector store) to a Hugging Face dataset repo."""

import argparse
import os
import sys

from dotenv import load_dotenv

# Enable hf_transfer for faster uploads.
# NOTE: must be set BEFORE importing huggingface_hub, which reads it at import time.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import HfApi, create_repo

# Load environment variables from .env.local, overriding system envs if present
load_dotenv(".env.local", override=True)


def upload_data(repo_id, token, data_dir=".", db_file="rag-kb.db", vector_store_dir="vector_store"):
    """Upload the local RAG database and vector store to a Hugging Face dataset repo.

    Creates the (private) dataset repository if it does not exist, then uploads
    ``db_file`` and the ``vector_store_dir`` folder. If the DB upload fails with
    an LFS-pointer/400 error, the remote file is deleted and the upload retried.

    Args:
        repo_id: Dataset repository ID, e.g. "username/rag-data".
        token: Hugging Face token with write access.
        data_dir: Unused; kept for backward compatibility with existing callers.
        db_file: Path of the SQLite database file to upload.
        vector_store_dir: Path of the vector-store directory to upload.
    """
    print(f"Starting upload to {repo_id}...")
    api = HfApi(token=token)

    # Create repo if not exists (private by default for safety)
    try:
        create_repo(repo_id, repo_type="dataset", private=True, token=token, exist_ok=True)
        print(f"Repository {repo_id} ensured.")
    except Exception as e:
        print(f"Note: Repo creation check: {e}")
        # Auth failures are fatal — nothing below can succeed without a valid token.
        if "401" in str(e) or "Unauthorized" in str(e):
            print("\n❌ Error: Authentication failed. Your HF_TOKEN is invalid or expired.")
            print("Please check your token at https://huggingface.co/settings/tokens")
            print("You can run this script with a specific token:")
            print(" python3 scripts/upload_data.py --token hf_YOUR_TOKEN")
            return

    # Upload database
    if os.path.exists(db_file):
        print(f"Uploading {db_file}...")
        try:
            api.upload_file(
                path_or_fileobj=db_file,
                path_in_repo=db_file,
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
            )
        except Exception as e:
            print(f"Upload failed: {e}")
            # A stale LFS pointer on the remote can make re-uploads fail with a 400;
            # deleting the remote copy and retrying clears that state.
            if "LFS pointer" in str(e) or "400" in str(e):
                print("Attempting to fix LFS state by deleting remote file first...")
                try:
                    api.delete_file(
                        path_in_repo=db_file,
                        repo_id=repo_id,
                        repo_type="dataset",
                        token=token,
                    )
                    print("Remote file deleted. Retrying upload...")
                    api.upload_file(
                        path_or_fileobj=db_file,
                        path_in_repo=db_file,
                        repo_id=repo_id,
                        repo_type="dataset",
                        token=token,
                    )
                except Exception as delete_error:
                    # Re-raise the ORIGINAL upload error; the delete/retry failure
                    # is reported but secondary.
                    print(f"Failed to delete/retry: {delete_error}")
                    raise e
            else:
                raise
    else:
        print(f"Warning: {db_file} not found locally.")

    # Upload vector store
    if os.path.exists(vector_store_dir):
        print(f"Uploading {vector_store_dir}...")
        api.upload_folder(
            folder_path=vector_store_dir,
            path_in_repo=vector_store_dir,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
        )
    else:
        print(f"Warning: {vector_store_dir} not found locally.")

    print("Upload complete!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Upload RAG data to Hugging Face Dataset")
    parser.add_argument("--repo", type=str, default="duqing2026/rag-kb-data", help="Dataset repository ID")
    parser.add_argument("--token", type=str, help="Hugging Face Token (write access)")
    args = parser.parse_args()

    # Try to get token from env if not provided
    token = args.token or os.environ.get("HF_TOKEN")

    # Check for dummy token and ignore it
    if token == "hf_XXXXXXXXXXXXXXXX":
        print("Warning: Detected dummy HF_TOKEN 'hf_XXXXXXXXXXXXXXXX'. Ignoring it.")
        token = None

    # Last resort: prompt interactively; abort if still no token.
    if not token:
        token = input("Enter your Hugging Face Token (Write permission): ").strip()
        if not token:
            print("\n❌ Error: No token provided. Cannot proceed.")
            sys.exit(1)

    repo_id = args.repo
    if not repo_id:
        repo_id = input("Enter Dataset Repo ID (e.g. username/rag-data): ").strip()

    try:
        upload_data(repo_id, token)
    except KeyboardInterrupt:
        print("\n\nUpload cancelled by user.")
    except Exception as e:
        print(f"\n\n❌ An unexpected error occurred: {e}")