Spaces:
Build error
Build error
import argparse
import getpass
import os
import sys

# Enable hf_transfer for faster uploads.
# Kept ahead of the huggingface_hub import, matching the original statement order
# (the library presumably reads this flag when it is imported - confirm).
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from dotenv import load_dotenv  # noqa: E402
from huggingface_hub import HfApi, create_repo  # noqa: E402

# Load environment variables from .env.local, overriding system envs if present
load_dotenv(".env.local", override=True)
def _upload_file_with_lfs_retry(api, local_path, path_in_repo, repo_id, token):
    """Upload one file to the dataset repo, retrying once after deleting the remote copy.

    If the first upload fails with a message containing "LFS pointer" or "400",
    the remote file is deleted and the upload is attempted a second time.
    Any other failure, or a failure of the delete/retry step, re-raises the
    original upload exception.
    """

    def _push():
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=path_in_repo,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
        )

    try:
        _push()
    except Exception as e:
        print(f"Upload failed: {e}")
        message = str(e)
        # Substring matching on the error text is the only signal used here;
        # anything that does not look like the LFS/400 case is not retried.
        if "LFS pointer" not in message and "400" not in message:
            raise
        print("Attempting to fix LFS state by deleting remote file first...")
        try:
            api.delete_file(
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type="dataset",
                token=token,
            )
            print("Remote file deleted. Retrying upload...")
            _push()
        except Exception as delete_error:
            print(f"Failed to delete/retry: {delete_error}")
            # Surface the original upload failure, keeping the retry failure as its cause.
            raise e from delete_error


def upload_data(repo_id, token, data_dir=".", db_file="rag-kb.db", vector_store_dir="vector_store"):
    """Upload the local database file and vector store folder to a Hugging Face dataset repo.

    Args:
        repo_id: Dataset repository ID to upload into.
        token: Hugging Face token used for repo creation and uploads.
        data_dir: Local directory under which ``db_file`` and
            ``vector_store_dir`` are looked up. Defaults to the current directory.
        db_file: Database file name; also used as its path inside the repo.
        vector_store_dir: Vector store folder name; also used as its path
            inside the repo.

    Returns:
        None. When repo creation fails with an authentication error
        ("401"/"Unauthorized" in the message), guidance is printed and the
        function returns without uploading anything.

    Raises:
        Exception: Whatever the upload calls raise, after the single
            delete-and-retry attempt for the database file has been exhausted.
    """
    print(f"Starting upload to {repo_id}...")
    api = HfApi(token=token)

    # Create repo if not exists (private by default for safety)
    try:
        create_repo(repo_id, repo_type="dataset", private=True, token=token, exist_ok=True)
        print(f"Repository {repo_id} ensures.")
    except Exception as e:
        print(f"Note: Repo creation check: {e}")
        if "401" in str(e) or "Unauthorized" in str(e):
            print("\n❌ Error: Authentication failed. Your HF_TOKEN is invalid or expired.")
            print("Please check your token at https://huggingface.co/settings/tokens")
            print("You can run this script with a specific token:")
            print(" python3 scripts/upload_data.py --token hf_YOUR_TOKEN")
            return
        # Any other creation error is treated as non-fatal; the uploads below
        # will fail on their own if the repo is really unusable.

    # Resolve local paths under data_dir. normpath keeps the default
    # data_dir="." producing exactly the bare file/folder names.
    db_path = os.path.normpath(os.path.join(data_dir, db_file))
    vector_store_path = os.path.normpath(os.path.join(data_dir, vector_store_dir))

    # Upload database
    if os.path.exists(db_path):
        print(f"Uploading {db_path}...")
        _upload_file_with_lfs_retry(api, db_path, db_file, repo_id, token)
    else:
        print(f"Warning: {db_path} not found locally.")

    # Upload vector store
    if os.path.exists(vector_store_path):
        print(f"Uploading {vector_store_path}...")
        api.upload_folder(
            folder_path=vector_store_path,
            path_in_repo=vector_store_dir,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
        )
    else:
        print(f"Warning: {vector_store_path} not found locally.")

    print("Upload complete!")
def main(argv=None):
    """Parse command-line options, resolve credentials, and run the upload.

    The token is taken from ``--token``, then the ``HF_TOKEN`` environment
    variable, then an interactive prompt. The placeholder value
    ``hf_XXXXXXXXXXXXXXXX`` is ignored.

    Args:
        argv: Argument list to parse; ``None`` means use ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 when ``upload_data`` returns normally, 1 when the
        token or repo ID is missing or the upload raised, 130 when the user
        interrupted the upload.
    """
    parser = argparse.ArgumentParser(description="Upload RAG data to Hugging Face Dataset")
    parser.add_argument("--repo", type=str, default="duqing2026/rag-kb-data", help="Dataset repository ID")
    parser.add_argument("--token", type=str, help="Hugging Face Token (write access)")
    args = parser.parse_args(argv)

    # Try to get token from env if not provided
    token = args.token or os.environ.get("HF_TOKEN")

    # Check for dummy token and ignore it
    if token == "hf_XXXXXXXXXXXXXXXX":
        print("Warning: Detected dummy HF_TOKEN 'hf_XXXXXXXXXXXXXXXX'. Ignoring it.")
        token = None

    if not token:
        # getpass keeps the write-scoped token from being echoed to the terminal.
        token = getpass.getpass("Enter your Hugging Face Token (Write permission): ").strip()
    if not token:
        print("\n❌ Error: No token provided. Cannot proceed.")
        return 1

    repo_id = args.repo
    if not repo_id:
        repo_id = input("Enter Dataset Repo ID (e.g. username/rag-data): ").strip()
    if not repo_id:
        print("\n❌ Error: No repository ID provided. Cannot proceed.")
        return 1

    try:
        upload_data(repo_id, token)
    except KeyboardInterrupt:
        print("\n\nUpload cancelled by user.")
        return 130
    except Exception as e:
        # Top-level boundary: report the failure and signal it via the exit code.
        print(f"\n\n❌ An unexpected error occurred: {e}")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())