Corp_AI / scripts /check_data_sources.py
Arpit Deep
feat: initial AuditEnv submission
a617acd
from __future__ import annotations
import sys
from pathlib import Path
import yaml
# Repository root is the directory one level above this script's own folder.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT.joinpath("src")

# Put the in-repo ``src`` tree at the front of the import path (once) so the
# ``auditenv`` import below resolves even when the package is not installed.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
from auditenv.datasets.public_loader import load_hf_financial_rows, load_kaggle_procurement_tables
def _ok(msg: str) -> str:
    """Return ``msg`` prefixed with the ``OK: `` status tag."""
    return "OK: {}".format(msg)
def _warn(msg: str) -> str:
    """Return ``msg`` prefixed with the ``WARN: `` status tag."""
    return "WARN: {}".format(msg)
def _section(cfg: dict, name: str) -> dict:
    """Return the mapping stored under ``name`` in ``cfg``, or ``{}``.

    A key that is absent, explicitly ``null`` or not a mapping yields an
    empty dict, so callers can always use ``.get`` on the result.
    """
    section = cfg.get(name)
    return section if isinstance(section, dict) else {}


def _check_kaggle(level: str, section: dict) -> None:
    """Print one status line for the Kaggle procurement tables of ``level``.

    Args:
        level: Label used as the message prefix (``"medium"`` or ``"hard"``).
        section: Config mapping for that level; ``kaggle_procurement_path``
            is read from it.
    """
    raw_path = section.get("kaggle_procurement_path")
    if not raw_path:
        # Path("") is the current directory, which always exists, so an
        # unset path has to be rejected before the existence check.
        print(_warn(f"{level}: kaggle_procurement_path not set"))
        return

    kp = Path(str(raw_path))
    if not kp.exists():
        print(_warn(f"{level}: path missing: {kp}"))
        return

    try:
        tables = load_kaggle_procurement_tables(kp)
        inv = tables["invoices"]
        print(_ok(f"{level}: Kaggle invoices loaded, rows={len(inv)}"))
    except Exception as exc:
        # Deliberately broad: this is a diagnostic boundary and any loader
        # failure (including a missing "invoices" table) is reported, not raised.
        print(_warn(f"{level}: Kaggle load failed: {exc}"))


def main() -> None:
    """Print the availability of each data source named in the dataset config.

    Reads ``configs/datasets.yaml`` relative to the current working directory
    and prints one ``OK:`` / ``WARN:`` line per check for the ``medium`` and
    ``hard`` sections. Problems are reported as warnings; nothing is raised
    for a missing config, unparsable YAML, a missing path or a loader failure.
    """
    cfg_path = Path("configs/datasets.yaml")
    if not cfg_path.exists():
        print(_warn("configs/datasets.yaml not found"))
        return

    try:
        cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
    except yaml.YAMLError as exc:
        print(_warn(f"configs/datasets.yaml could not be parsed: {exc}"))
        return
    if not isinstance(cfg, dict):
        print(_warn("configs/datasets.yaml must contain a mapping at the top level"))
        return

    medium_cfg = _section(cfg, "medium")
    hard_cfg = _section(cfg, "hard")

    print("Data source status")
    print("-" * 40)

    # Medium
    if not medium_cfg.get("use_external", False):
        print(_warn("medium: external disabled, using synthetic fallback"))
    else:
        _check_kaggle("medium", medium_cfg)

    # Hard
    if not hard_cfg.get("use_external", False):
        print(_warn("hard: external disabled, using synthetic fallback"))
        return

    _check_kaggle("hard", hard_cfg)

    # NOTE(review): the HF check is assumed to belong to the "external
    # enabled" branch; confirm against the original indentation.
    if hard_cfg.get("use_hf_financial", True):
        try:
            # Small sample request (max_rows=5) used only as a reachability probe.
            rows = load_hf_financial_rows(split="train", max_rows=5)
            print(_ok(f"hard: HF financial dataset reachable, sample_rows={len(rows)}"))
        except Exception as exc:
            print(_warn(f"hard: HF load failed: {exc}"))
    else:
        print(_warn("hard: HF dataset disabled in config"))
# Run the checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()