Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import sys | |
| from pathlib import Path | |
| import yaml | |
# Repository root: this file is expected to live one directory below it.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"

# Put <root>/src on sys.path so the `auditenv` import below resolves when the
# script is run directly; the membership check avoids inserting it twice.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
| from auditenv.datasets.public_loader import load_hf_financial_rows, load_kaggle_procurement_tables | |
def _ok(msg: str) -> str:
    """Return *msg* prefixed with ``"OK: "`` for a successful status line."""
    return f"OK: {msg}"
def _warn(msg: str) -> str:
    """Return *msg* prefixed with ``"WARN: "`` for a degraded status line."""
    return f"WARN: {msg}"
def _section(cfg: object, name: str) -> dict[str, object]:
    """Return the mapping stored under *name* in *cfg*, or an empty dict.

    Tolerates a non-mapping YAML root and a section that is present but
    empty (``medium:`` parses to ``None``), both of which would otherwise
    break the ``.get`` calls made on the section.
    """
    if not isinstance(cfg, dict):
        return {}
    value = cfg.get(name)
    return value if isinstance(value, dict) else {}


def _report_kaggle(label: str, section: dict[str, object]) -> None:
    """Print one status line for the Kaggle procurement source of a tier.

    Args:
        label: Tier name used as the message prefix (``"medium"``/``"hard"``).
        section: That tier's config mapping; reads ``use_external`` and
            ``kaggle_procurement_path``.
    """
    if not section.get("use_external", False):
        print(_warn(f"{label}: external disabled, using synthetic fallback"))
        return

    raw_path = section.get("kaggle_procurement_path")
    if not raw_path:
        # An empty path would become Path("") == Path("."), which always
        # exists, and a null value would become the literal path "None".
        print(_warn(f"{label}: kaggle_procurement_path not configured"))
        return

    kp = Path(str(raw_path))
    if not kp.exists():
        print(_warn(f"{label}: path missing: {kp}"))
        return

    try:
        row_count = len(load_kaggle_procurement_tables(kp)["invoices"])
    except Exception as exc:  # broad on purpose: a status probe must not crash
        print(_warn(f"{label}: Kaggle load failed: {exc}"))
    else:
        print(_ok(f"{label}: Kaggle invoices loaded, rows={row_count}"))


def main() -> None:
    """Print the availability of each configured external data source.

    Reads ``configs/datasets.yaml`` (resolved against the current working
    directory) and reports, one ``OK:``/``WARN:`` line per source, whether
    the Kaggle procurement tables for the ``medium`` and ``hard`` tiers load
    and whether the HF financial dataset can be sampled. Returns early with
    a warning when the config file is absent.
    """
    cfg_path = Path("configs/datasets.yaml")
    if not cfg_path.exists():
        print(_warn("configs/datasets.yaml not found"))
        return

    cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
    medium_cfg = _section(cfg, "medium")
    hard_cfg = _section(cfg, "hard")

    print("Data source status")
    print("-" * 40)

    # Medium
    _report_kaggle("medium", medium_cfg)

    # Hard
    _report_kaggle("hard", hard_cfg)

    # The HF probe is on by default and independent of `use_external`.
    if hard_cfg.get("use_hf_financial", True):
        try:
            rows = load_hf_financial_rows(split="train", max_rows=5)
        except Exception as exc:  # broad on purpose: report, do not crash
            print(_warn(f"hard: HF load failed: {exc}"))
        else:
            print(_ok(f"hard: HF financial dataset reachable, sample_rows={len(rows)}"))
    else:
        print(_warn("hard: HF dataset disabled in config"))
# Run the status report only when executed as a script, not on import.
if __name__ == "__main__":
    main()