File size: 2,451 Bytes
a617acd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from __future__ import annotations

import sys
from pathlib import Path

import yaml

# Repository root: ``parents[0]`` of the resolved file path is the directory
# containing this file, so ``parents[1]`` is the directory one level above it.
ROOT = Path(__file__).resolve().parents[1]
# The ``src`` directory directly under ROOT.
SRC = ROOT / "src"
# Put ``src`` at the front of ``sys.path`` (skipped when already present) before
# the project import below runs -- presumably so ``auditenv`` can be imported
# without being installed; confirm against the project layout.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

from auditenv.datasets.public_loader import load_hf_financial_rows, load_kaggle_procurement_tables


def _ok(msg: str) -> str:
    """Return ``msg`` prefixed with the ``OK: `` status tag."""
    template = "OK: {}"
    return template.format(msg)


def _warn(msg: str) -> str:
    """Return ``msg`` prefixed with the ``WARN: `` status tag."""
    template = "WARN: {}"
    return template.format(msg)


def _config_section(cfg: object, name: str) -> dict:
    """Return the mapping stored under ``name`` in ``cfg``, or ``{}``.

    A section written as ``medium:`` with no body loads as ``None``, and a
    malformed file may not have a mapping at its root at all; both cases are
    treated as an empty section instead of raising ``AttributeError``.
    """
    if not isinstance(cfg, dict):
        return {}
    section = cfg.get(name)
    return section if isinstance(section, dict) else {}


def _report_kaggle_status(label: str, section: dict) -> None:
    """Print one status line for the Kaggle procurement data of ``section``.

    Args:
        label: Prefix used in the printed message (``"medium"`` or ``"hard"``).
        section: Config mapping that may hold ``kaggle_procurement_path``.
    """
    raw_path = section.get("kaggle_procurement_path")
    # An unset/empty value must be reported explicitly: ``Path("")`` is the
    # current directory, which always exists and would be mistaken for data.
    if raw_path is None or not str(raw_path).strip():
        print(_warn(f"{label}: kaggle_procurement_path not set in config"))
        return

    kaggle_path = Path(str(raw_path))
    if not kaggle_path.exists():
        print(_warn(f"{label}: path missing: {kaggle_path}"))
        return

    # Broad catch is deliberate: this is a diagnostic report, so any loader
    # failure (including a missing "invoices" table) becomes a WARN line
    # rather than aborting the remaining checks.
    try:
        tables = load_kaggle_procurement_tables(kaggle_path)
        invoices = tables["invoices"]
        print(_ok(f"{label}: Kaggle invoices loaded, rows={len(invoices)}"))
    except Exception as exc:
        print(_warn(f"{label}: Kaggle load failed: {exc}"))


def main() -> None:
    """Print the availability of the external data sources named in the config.

    Reads ``configs/datasets.yaml`` and, for the ``medium`` and ``hard``
    sections, reports whether external data is enabled and whether the Kaggle
    procurement tables can be loaded. For ``hard`` with external data enabled
    it additionally tries to load a few rows of the HF financial dataset
    unless ``use_hf_financial`` is false. Every outcome is printed as an
    ``OK:`` or ``WARN:`` line; nothing is returned or raised for load failures.
    """
    # NOTE: resolved against the current working directory, not ROOT.
    cfg_path = Path("configs/datasets.yaml")
    if not cfg_path.exists():
        print(_warn("configs/datasets.yaml not found"))
        return

    # An empty file loads as None; fall back to an empty config.
    cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}

    medium_cfg = _config_section(cfg, "medium")
    hard_cfg = _config_section(cfg, "hard")

    print("Data source status")
    print("-" * 40)

    # Medium
    if medium_cfg.get("use_external", False):
        _report_kaggle_status("medium", medium_cfg)
    else:
        print(_warn("medium: external disabled, using synthetic fallback"))

    # Hard
    if not hard_cfg.get("use_external", False):
        print(_warn("hard: external disabled, using synthetic fallback"))
        return

    _report_kaggle_status("hard", hard_cfg)

    # The HF check is on by default and only skipped when explicitly disabled.
    if not hard_cfg.get("use_hf_financial", True):
        print(_warn("hard: HF dataset disabled in config"))
        return

    # Broad catch for the same reason as the Kaggle check: report, don't abort.
    try:
        rows = load_hf_financial_rows(split="train", max_rows=5)
        print(_ok(f"hard: HF financial dataset reachable, sample_rows={len(rows)}"))
    except Exception as exc:
        print(_warn(f"hard: HF load failed: {exc}"))


# Run the status report only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()