chatvns / hf_bootstrap.py
liamxdev's picture
Upload folder using huggingface_hub
34b531b verified
Raw
History Blame Contribute Delete
1.22 kB
from __future__ import annotations
import os
from pathlib import Path
from huggingface_hub import snapshot_download
# Absolute directory that contains this file; used as the download target.
PROJECT_ROOT = Path(__file__).resolve().parents[0]
# Root of the local data tree, located directly under the project root.
DATA_DIR = PROJECT_ROOT.joinpath("data")
def ensure_data_directories() -> None:
    """Create the expected sub-directories under ``DATA_DIR``.

    Each directory is created together with any missing parents; directories
    that already exist are left as they are.
    """
    required_subdirs = (
        "raw/html",
        "raw/text",
        "raw/csv",
        "raw/pdf",
        "raw/images",
        "raw/metadata",
        "processed/chunks",
        "evaluation/reports",
        "logs",
    )
    for subdir in required_subdirs:
        target = DATA_DIR / subdir
        target.mkdir(parents=True, exist_ok=True)
def download_dataset() -> None:
    """Sync ChatVNS data from a Hugging Face dataset repo, if one is configured.

    The repo id is read from the ``CHATVNS_DATASET_REPO`` environment
    variable. When it is unset or blank, a notice is printed and nothing is
    downloaded. Otherwise the files matching ``data/raw/**``,
    ``data/processed/**`` and ``data/evaluation/**`` are downloaded from that
    dataset repo into ``PROJECT_ROOT``.

    The access token is read from ``HF_DATASET_TOKEN``. Surrounding whitespace
    is removed, and an unset or blank value is passed on as ``None``.

    Raises:
        Any exception raised by ``snapshot_download`` propagates unchanged;
        nothing is caught here.
    """
    repo_id = os.getenv("CHATVNS_DATASET_REPO", "").strip()
    if not repo_id:
        print("CHATVNS_DATASET_REPO is not configured; starting with bundled data.")
        return

    # Strip the token the same way as the repo id: a secret pasted with a
    # trailing newline, or a whitespace-only value, would otherwise be sent
    # verbatim as the credential instead of being cleaned / treated as absent.
    token = os.getenv("HF_DATASET_TOKEN", "").strip() or None

    print(f"Syncing ChatVNS data from dataset repo: {repo_id}")
    snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
        local_dir=PROJECT_ROOT,
        # Only pull the data sub-trees; anything else in the repo is ignored.
        allow_patterns=[
            "data/raw/**",
            "data/processed/**",
            "data/evaluation/**",
        ],
    )
if __name__ == "__main__":
    # When run as a script, execute the bootstrap steps in sequence:
    # create the local data directories, then sync the dataset.
    for bootstrap_step in (ensure_data_directories, download_dataset):
        bootstrap_step()