"""Prepare the local data directory tree and optionally sync it from a Hub dataset repo.

When executed as a script, the data sub-directories are created first and
``download_dataset`` is then called.
"""

from __future__ import annotations

import os
from pathlib import Path

from huggingface_hub import snapshot_download

PROJECT_ROOT = Path(__file__).resolve().parent
DATA_DIR = PROJECT_ROOT / "data"

# Sub-directories (relative to DATA_DIR) that must exist before the app runs.
_DATA_SUBDIRECTORIES = (
    "raw/html",
    "raw/text",
    "raw/csv",
    "raw/pdf",
    "raw/images",
    "raw/metadata",
    "processed/chunks",
    "evaluation/reports",
    "logs",
)


def ensure_data_directories() -> None:
    """Create every expected sub-directory under ``DATA_DIR``.

    Missing parents are created as well, and directories that already exist
    are left untouched, so the call can be repeated safely.
    """
    for relative_path in _DATA_SUBDIRECTORIES:
        (DATA_DIR / relative_path).mkdir(parents=True, exist_ok=True)


def download_dataset() -> None:
    """Download the dataset snapshot named by ``CHATVNS_DATASET_REPO``.

    Environment variables read:
        CHATVNS_DATASET_REPO: dataset repo id. When unset or blank, a notice
            is printed and nothing is downloaded.
        HF_DATASET_TOKEN: optional access token. When unset or blank, ``None``
            is passed as the token.

    Only files matching ``data/raw/**``, ``data/processed/**`` and
    ``data/evaluation/**`` are requested, and they are written relative to
    ``PROJECT_ROOT``.
    """
    repo_id = os.getenv("CHATVNS_DATASET_REPO", "").strip()
    if not repo_id:
        print("CHATVNS_DATASET_REPO is not configured; starting with bundled data.")
        return

    # Normalize the token the same way as the repo id: surrounding whitespace
    # (e.g. a trailing newline from an injected secret) is dropped, and a
    # blank value becomes None instead of being sent as a credential.
    token = (os.getenv("HF_DATASET_TOKEN") or "").strip() or None

    print(f"Syncing ChatVNS data from dataset repo: {repo_id}")
    snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
        local_dir=PROJECT_ROOT,
        allow_patterns=[
            "data/raw/**",
            "data/processed/**",
            "data/evaluation/**",
        ],
    )


if __name__ == "__main__":
    ensure_data_directories()
    download_dataset()