"""Best-effort download of the RAG knowledge-base data from a Hugging Face dataset repo.

Run as a script, it reads ``HF_DATASET_REPO`` and ``HF_TOKEN`` from the
environment (after loading ``.env.local``) and fetches ``rag-kb.db`` plus
everything matching ``vector_store/*`` into the current directory.
"""

import argparse
import os
from typing import Optional, Union

from dotenv import load_dotenv
from huggingface_hub import hf_hub_download, snapshot_download

# Load environment variables from .env.local, overriding system envs if present
load_dotenv(".env.local", override=True)

# Placeholder value that is treated as "no token configured".
_PLACEHOLDER_TOKEN = "hf_XXXXXXXXXXXXXXXX"


def download_data(repo_id, token: Optional[Union[str, bool]] = None, output_dir="."):
    """Download ``rag-kb.db`` and ``vector_store/*`` from a dataset repo.

    Both downloads are best-effort: any failure is reported on stdout and
    swallowed, so the caller can carry on with whatever is already present
    locally.

    Args:
        repo_id: Hugging Face dataset repository id (``"user/name"``).
        token: Access token forwarded unchanged to ``huggingface_hub``.
        output_dir: Passed as ``local_dir`` to both download calls.
    """
    print(f"Checking for data in {repo_id}...", flush=True)

    # Download database
    try:
        print("Downloading rag-kb.db...", flush=True)
        hf_hub_download(
            repo_id=repo_id,
            filename="rag-kb.db",
            repo_type="dataset",
            local_dir=output_dir,
            token=token,
        )
        print("rag-kb.db downloaded.", flush=True)
    except Exception as e:
        # Deliberately broad: a missing file, missing repo or network error
        # must not abort startup.
        print(f"Could not download rag-kb.db: {e}", flush=True)
        print("Starting with empty/new database if not present.", flush=True)

    # Download vector store
    try:
        print("Downloading vector_store...", flush=True)
        snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",
            allow_patterns="vector_store/*",
            local_dir=output_dir,
            token=token,
        )
        print("vector_store downloaded.", flush=True)
    except Exception as e:
        # Same best-effort policy as for the database above.
        print(f"Could not download vector_store: {e}", flush=True)


def main():
    """Read the configuration from the environment and run the download."""
    repo_id = os.environ.get("HF_DATASET_REPO", "duqing2026/rag-kb-data")

    # An empty or whitespace-only HF_TOKEN means "no token"; passing it on as
    # a string would make it an explicit (and invalid) credential.
    token = (os.environ.get("HF_TOKEN") or "").strip() or None

    # Check for dummy token and ignore it
    if token == _PLACEHOLDER_TOKEN:
        print(
            f"Warning: Detected dummy HF_TOKEN '{_PLACEHOLDER_TOKEN}'. Ignoring it.",
            flush=True,
        )
        token = None
        # With token=None huggingface_hub falls back to the HF_TOKEN
        # environment variable, so the placeholder has to be removed from
        # the environment as well or it would still be sent.
        os.environ.pop("HF_TOKEN", None)

    if not repo_id:
        # Only reachable when HF_DATASET_REPO is set to an empty string,
        # because an unset variable falls back to the default repo above.
        print(
            "No HF_DATASET_REPO environment variable set. Skipping download.",
            flush=True,
        )
    else:
        download_data(repo_id, token)


if __name__ == "__main__":
    main()