| from __future__ import annotations | |
| import os | |
| from pathlib import Path | |
| from huggingface_hub import snapshot_download | |
# Directory that contains this script; resolved so it is absolute even when
# the script is launched through a relative path.
PROJECT_ROOT = Path(__file__).resolve().parent

# Root of the data tree that the functions in this file create and populate.
DATA_DIR = PROJECT_ROOT.joinpath("data")
def ensure_data_directories() -> None:
    """Create the expected directory layout underneath ``DATA_DIR``.

    Missing parent directories are created along the way, and directories
    that already exist are left untouched, so the call is safe to repeat.
    """
    layout = (
        "raw/html",
        "raw/text",
        "raw/csv",
        "raw/pdf",
        "raw/images",
        "raw/metadata",
        "processed/chunks",
        "evaluation/reports",
        "logs",
    )
    for subdir in layout:
        target = DATA_DIR / subdir
        target.mkdir(parents=True, exist_ok=True)
def download_dataset() -> None:
    """Sync the ChatVNS data folders from a Hugging Face dataset repository.

    The repository id is read from the ``CHATVNS_DATASET_REPO`` environment
    variable. When it is unset or blank, a notice is printed and the function
    returns without downloading anything.

    Otherwise ``snapshot_download`` is called with ``local_dir=PROJECT_ROOT``
    and is restricted to files matching ``data/raw/**``,
    ``data/processed/**`` and ``data/evaluation/**``.

    ``HF_DATASET_TOKEN`` is optional. Surrounding whitespace is removed, and
    an unset or blank value is passed to ``snapshot_download`` as ``None``.

    Raises:
        Whatever ``snapshot_download`` raises; nothing is caught here.
    """
    repo_id = os.getenv("CHATVNS_DATASET_REPO", "").strip()
    if not repo_id:
        print("CHATVNS_DATASET_REPO is not configured; starting with bundled data.")
        return

    # Normalise the token the same way as the repo id: secrets pasted into an
    # environment often carry a trailing newline, and a whitespace-only value
    # is truthy, so without stripping it would be sent as a bogus credential.
    token = os.getenv("HF_DATASET_TOKEN", "").strip() or None

    print(f"Syncing ChatVNS data from dataset repo: {repo_id}")
    snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
        local_dir=PROJECT_ROOT,
        # Only the data folders are fetched; anything else in the dataset
        # repo is not downloaded into the project root.
        allow_patterns=[
            "data/raw/**",
            "data/processed/**",
            "data/evaluation/**",
        ],
    )
if __name__ == "__main__":
    # Build the local directory layout first, then pull the remote data.
    for startup_step in (ensure_data_directories, download_dataset):
        startup_step()