File size: 1,220 Bytes
34b531b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from __future__ import annotations

import os
from pathlib import Path

from huggingface_hub import snapshot_download


# Directory that contains this file, as an absolute path with symlinks resolved.
PROJECT_ROOT = Path(__file__).resolve().parent
# Root of the local data tree; it sits directly under PROJECT_ROOT.
DATA_DIR = PROJECT_ROOT / "data"


def ensure_data_directories() -> None:
    """Create the expected folder layout underneath ``DATA_DIR``.

    Missing parent folders are created along the way, and folders that
    already exist are accepted silently, so repeated calls do not fail.
    """
    layout = (
        "raw/html",
        "raw/text",
        "raw/csv",
        "raw/pdf",
        "raw/images",
        "raw/metadata",
        "processed/chunks",
        "evaluation/reports",
        "logs",
    )
    for subdir in layout:
        target = DATA_DIR.joinpath(subdir)
        target.mkdir(parents=True, exist_ok=True)


def download_dataset() -> None:
    """Sync the ChatVNS data folders from a Hugging Face dataset repo.

    The repo id is read from the ``CHATVNS_DATASET_REPO`` environment
    variable. When it is unset or blank, a notice is printed and nothing is
    downloaded. Otherwise the ``data/raw``, ``data/processed`` and
    ``data/evaluation`` trees of that dataset repo are downloaded into
    ``PROJECT_ROOT``.

    ``HF_DATASET_TOKEN`` is optional. Surrounding whitespace is removed and a
    blank value is treated as "no token", in which case ``None`` is passed
    and ``snapshot_download`` applies its own default token handling.

    Errors raised by ``snapshot_download`` are not caught here.
    """
    repo_id = os.getenv("CHATVNS_DATASET_REPO", "").strip()
    if not repo_id:
        print("CHATVNS_DATASET_REPO is not configured; starting with bundled data.")
        return

    # Normalise the token the same way as the repo id: a value that is only
    # whitespace (or carries a stray trailing newline) must not be sent as a
    # credential.
    token = os.getenv("HF_DATASET_TOKEN", "").strip() or None
    print(f"Syncing ChatVNS data from dataset repo: {repo_id}")
    snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        token=token,
        local_dir=PROJECT_ROOT,
        # Only the data folders are fetched; everything else in the repo is skipped.
        allow_patterns=[
            "data/raw/**",
            "data/processed/**",
            "data/evaluation/**",
        ],
    )


if __name__ == "__main__":
    # Script entry point: create the local data folders, then sync the dataset.
    ensure_data_directories()
    download_dataset()