# rag-kb-system/scripts/download_data.py
# Author: duqing2026
# Last commit: "Enhance startup script logging and fix CMD execution" (2f8a316)
import os
import argparse
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download, snapshot_download
# Load variables from .env.local into the process environment before
# HF_DATASET_REPO / HF_TOKEN are read below. override=True lets values from
# the file replace variables that are already set in the system environment.
load_dotenv(".env.local", override=True)
def download_data(repo_id, token=None, output_dir="."):
    """Fetch the knowledge-base files from a Hugging Face dataset repo.

    Two independent, best-effort steps are performed: the ``rag-kb.db``
    file is downloaded first, then everything matching ``vector_store/*``.
    A failure in either step is printed and swallowed so that the caller
    can continue without the data.

    Args:
        repo_id: Dataset repository identifier passed to huggingface_hub.
        token: Access token forwarded to huggingface_hub (may be None).
        output_dir: Local directory handed to huggingface_hub as
            ``local_dir``.
    """
    print(f"Checking for data in {repo_id}...", flush=True)

    # Keyword arguments shared by both download calls.
    shared_kwargs = {
        "repo_id": repo_id,
        "repo_type": "dataset",
        "local_dir": output_dir,
        "token": token,
    }

    # Step 1: the database file.
    try:
        print("Downloading rag-kb.db...", flush=True)
        hf_hub_download(filename="rag-kb.db", **shared_kwargs)
        print("rag-kb.db downloaded.", flush=True)
    except Exception as db_error:
        print(f"Could not download rag-kb.db: {db_error}", flush=True)
        print("Starting with empty/new database if not present.", flush=True)

    # Step 2: the vector store directory.
    try:
        print("Downloading vector_store...", flush=True)
        snapshot_download(allow_patterns="vector_store/*", **shared_kwargs)
        print("vector_store downloaded.", flush=True)
    except Exception as store_error:
        print(f"Could not download vector_store: {store_error}", flush=True)
if __name__ == "__main__":
    # Script entry point: resolve the dataset repo and token from the
    # environment, then download the data (best effort).
    DUMMY_TOKEN = "hf_XXXXXXXXXXXXXXXX"

    repo_id = os.environ.get("HF_DATASET_REPO", "duqing2026/rag-kb-data")
    # An empty HF_TOKEN is treated like an unset one instead of being passed
    # on as an explicit (and invalid) empty token.
    token = os.environ.get("HF_TOKEN") or None

    # Check for the placeholder token from the example config and ignore it.
    if token == DUMMY_TOKEN:
        print(
            f"Warning: Detected dummy HF_TOKEN '{DUMMY_TOKEN}'. Ignoring it.",
            flush=True,
        )
        token = None
        # Passing token=None alone is not enough: huggingface_hub falls back
        # to the HF_TOKEN environment variable when no explicit token is
        # given, so the placeholder must also be removed from the environment.
        os.environ.pop("HF_TOKEN", None)

    # repo_id can still be empty when HF_DATASET_REPO is set to "".
    # flush=True keeps these messages ordered with download_data's output
    # when stdout is piped (e.g. container logs).
    if not repo_id:
        print(
            "No HF_DATASET_REPO environment variable set. Skipping download.",
            flush=True,
        )
    else:
        download_data(repo_id, token)