Spaces:
Sleeping
Sleeping
File size: 2,451 Bytes
from __future__ import annotations
import sys
from pathlib import Path
import yaml
# Repository root: the directory one level above the folder holding this file.
ROOT = Path(__file__).resolve().parents[1]
# In-repo source tree; presumably where the ``auditenv`` package lives.
SRC = ROOT.joinpath("src")
# Put the source tree at the front of the import search path (only once) so
# the project import that follows can resolve.
if not sys.path.count(str(SRC)):
    sys.path.insert(0, str(SRC))
from auditenv.datasets.public_loader import load_hf_financial_rows, load_kaggle_procurement_tables
def _ok(msg: str) -> str:
return f"OK: {msg}"
def _warn(msg: str) -> str:
return f"WARN: {msg}"
def _section(cfg: object, name: str) -> dict:
    """Return the mapping stored under *name* in *cfg*, or ``{}`` if unusable."""
    # A key written with no body (``medium:``) loads as None, and a config
    # whose top level is not a mapping has no ``.get``; both mean "no settings".
    section = cfg.get(name) if isinstance(cfg, dict) else None
    return section if isinstance(section, dict) else {}


def _report_kaggle(tier: str, tier_cfg: dict) -> None:
    """Print one status line for the Kaggle procurement data of *tier*.

    Args:
        tier: Label used as the message prefix (``"medium"`` or ``"hard"``).
        tier_cfg: That tier's config mapping; ``kaggle_procurement_path`` is
            read from it.
    """
    raw_path = tier_cfg.get("kaggle_procurement_path")
    # An unset or blank value must not reach Path(): Path("") is the current
    # directory (which always exists) and str(None) would become "None".
    if raw_path is None or not str(raw_path).strip():
        print(_warn(f"{tier}: path missing: kaggle_procurement_path is not set"))
        return

    kp = Path(str(raw_path))
    if not kp.exists():
        print(_warn(f"{tier}: path missing: {kp}"))
        return

    try:
        tables = load_kaggle_procurement_tables(kp)
        inv = tables["invoices"]
        print(_ok(f"{tier}: Kaggle invoices loaded, rows={len(inv)}"))
    except Exception as exc:
        # Deliberately broad: this is a status report, so any loader failure
        # becomes a warning line rather than a traceback.
        print(_warn(f"{tier}: Kaggle load failed: {exc}"))


def main() -> None:
    """Print a status report for the data sources named in the dataset config.

    Reads ``configs/datasets.yaml`` relative to the current working directory
    and prints one ``OK:``/``WARN:`` line per check. Returns early with a
    warning when the config file does not exist.
    """
    cfg_path = Path("configs/datasets.yaml")
    if not cfg_path.exists():
        print(_warn("configs/datasets.yaml not found"))
        return

    # An empty YAML document loads as None, hence the ``or {}``.
    cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
    medium_cfg = _section(cfg, "medium")
    hard_cfg = _section(cfg, "hard")

    print("Data source status")
    print("-" * 40)

    # Medium
    if medium_cfg.get("use_external", False):
        _report_kaggle("medium", medium_cfg)
    else:
        print(_warn("medium: external disabled, using synthetic fallback"))

    # Hard
    if not hard_cfg.get("use_external", False):
        print(_warn("hard: external disabled, using synthetic fallback"))
        return
    _report_kaggle("hard", hard_cfg)

    # NOTE(review): the HF probe runs only when hard external data is enabled;
    # confirm this nesting against the upstream file.
    if hard_cfg.get("use_hf_financial", True):
        try:
            rows = load_hf_financial_rows(split="train", max_rows=5)
            print(_ok(f"hard: HF financial dataset reachable, sample_rows={len(rows)}"))
        except Exception as exc:
            # Broad on purpose, for the same reason as the Kaggle check.
            print(_warn(f"hard: HF load failed: {exc}"))
    else:
        print(_warn("hard: HF dataset disabled in config"))
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|