"""Print a status report for the external data sources named in configs/datasets.yaml.

For the ``medium`` and ``hard`` config sections the script reports whether
external data is enabled and whether the configured Kaggle procurement tables
load. For ``hard`` it also probes the HF financial dataset loader with a small
sample. Every outcome is printed as an ``OK:`` or ``WARN:`` line; nothing is
raised for a failed probe.
"""
from __future__ import annotations

import sys
from pathlib import Path
from typing import Any

import yaml

ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
# Put the sibling ``src`` directory on sys.path so the project import below resolves.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

from auditenv.datasets.public_loader import (  # noqa: E402  (must follow the sys.path tweak)
    load_hf_financial_rows,
    load_kaggle_procurement_tables,
)


def _ok(msg: str) -> str:
    """Return *msg* prefixed with ``OK: ``."""
    return f"OK: {msg}"


def _warn(msg: str) -> str:
    """Return *msg* prefixed with ``WARN: ``."""
    return f"WARN: {msg}"


def _section(cfg: Any, name: str) -> dict[str, Any]:
    """Return the mapping stored under *name* in *cfg*, or ``{}``.

    An empty dict is returned when *cfg* is not a mapping, when the key is
    absent, or when its value is not a mapping (for example a bare ``medium:``
    line, which YAML parses as ``None``).
    """
    if not isinstance(cfg, dict):
        return {}
    value = cfg.get(name)
    return value if isinstance(value, dict) else {}


def _check_kaggle(label: str, section_cfg: dict[str, Any]) -> None:
    """Print whether the Kaggle procurement tables configured for *label* load.

    Args:
        label: Section name used as the prefix of each status line.
        section_cfg: Config mapping that may hold ``kaggle_procurement_path``.
    """
    raw_path = section_cfg.get("kaggle_procurement_path")
    if raw_path is None or str(raw_path) == "":
        # Path("") is the current directory and always exists, so an unset
        # path has to be reported here instead of being handed to the loader.
        print(_warn(f"{label}: kaggle_procurement_path not set in config"))
        return

    kp = Path(str(raw_path))
    if not kp.exists():
        print(_warn(f"{label}: path missing: {kp}"))
        return

    try:
        tables = load_kaggle_procurement_tables(kp)
        row_count = len(tables["invoices"])
    except Exception as exc:  # status probe: report any loader failure, never crash
        print(_warn(f"{label}: Kaggle load failed: {exc}"))
    else:
        print(_ok(f"{label}: Kaggle invoices loaded, rows={row_count}"))


def _check_hf_financial() -> None:
    """Print whether a 5-row sample of the HF financial ``train`` split loads."""
    try:
        rows = load_hf_financial_rows(split="train", max_rows=5)
        sample_rows = len(rows)
    except Exception as exc:  # status probe: report any loader failure, never crash
        print(_warn(f"hard: HF load failed: {exc}"))
    else:
        print(_ok(f"hard: HF financial dataset reachable, sample_rows={sample_rows}"))


def main() -> None:
    """Read ``configs/datasets.yaml`` (relative to the working directory) and print the report."""
    cfg_path = Path("configs/datasets.yaml")
    if not cfg_path.exists():
        print(_warn("configs/datasets.yaml not found"))
        return

    try:
        # An empty YAML document parses to None; treat it as an empty config.
        cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
        # Keep the report format consistent: a broken config is a warning, not a traceback.
        print(_warn(f"configs/datasets.yaml could not be read: {exc}"))
        return

    medium_cfg = _section(cfg, "medium")
    hard_cfg = _section(cfg, "hard")

    print("Data source status")
    print("-" * 40)

    # Medium
    if not medium_cfg.get("use_external", False):
        print(_warn("medium: external disabled, using synthetic fallback"))
    else:
        _check_kaggle("medium", medium_cfg)

    # Hard
    if not hard_cfg.get("use_external", False):
        print(_warn("hard: external disabled, using synthetic fallback"))
    else:
        _check_kaggle("hard", hard_cfg)

        # NOTE(review): the original file's indentation was lost, so it is
        # assumed the HF probe only runs when external data is enabled for
        # "hard" -- confirm against the intended behavior.
        if hard_cfg.get("use_hf_financial", True):
            _check_hf_financial()
        else:
            print(_warn("hard: HF dataset disabled in config"))


if __name__ == "__main__":
    main()