Corp_AI / src /auditenv /datasets /public_loader.py
Arpit Deep
feat: initial AuditEnv submission
a617acd
from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, Optional
def load_kaggle_procurement_tables(base_path: str | Path) -> Dict[str, Any]:
    """Read the procurement tables stored in a local Kaggle dataset folder.

    The folder is searched for the tables ``invoices``, ``suppliers``,
    ``departments`` and ``labels``. For each table, ``<name>.parquet`` is
    used when it exists; otherwise ``<name>.csv`` is used when it exists.
    Tables with neither file are left out of the result.

    Args:
        base_path: Folder expected to contain the table files.

    Returns:
        A dict mapping each table name that was found to the object
        returned by ``pandas.read_parquet`` / ``pandas.read_csv``.

    Raises:
        RuntimeError: If pandas cannot be imported, or if no ``invoices``
            table was found in the folder.
        FileNotFoundError: If ``base_path`` does not exist.
    """
    # pandas is imported lazily so that merely importing this module does
    # not require it to be installed.
    try:
        import pandas as pd  # type: ignore[import-not-found]
    except ImportError as exc:
        raise RuntimeError("pandas is required for Kaggle dataset ingestion") from exc

    root = Path(base_path)
    if not root.exists():
        raise FileNotFoundError(f"Kaggle dataset path not found: {root}")

    # Readers in priority order: the parquet file wins when both exist.
    readers = ((".parquet", pd.read_parquet), (".csv", pd.read_csv))

    loaded: Dict[str, Any] = {}
    for name in ("invoices", "suppliers", "departments", "labels"):
        for suffix, read in readers:
            candidate = root / f"{name}{suffix}"
            if candidate.exists():
                loaded[name] = read(candidate)
                break

    # Only the invoices table is mandatory; the others are optional.
    if "invoices" in loaded:
        return loaded
    raise RuntimeError("Expected invoices table in Kaggle dataset folder")
def load_hf_financial_rows(split: str = "train", max_rows: Optional[int] = None) -> list[dict[str, Any]]:
    """Return rows of the ``amitkedia/Financial-Fraud-Dataset`` dataset as dicts.

    The dataset is obtained through ``datasets.load_dataset`` for the
    requested split. When ``max_rows`` is given, only the leading rows
    (at most ``max_rows``, and never more than the split contains) are kept.

    Args:
        split: Split name forwarded to ``load_dataset``.
        max_rows: Optional cap on the number of rows; ``None`` keeps every
            row of the split.

    Returns:
        A list with one plain ``dict`` per retained row.

    Raises:
        RuntimeError: If the ``datasets`` library cannot be imported.
    """
    # Imported lazily so the module stays importable without ``datasets``.
    try:
        from datasets import load_dataset
    except ImportError as exc:
        raise RuntimeError("datasets library is required for Hugging Face dataset ingestion") from exc

    dataset = load_dataset("amitkedia/Financial-Fraud-Dataset", split=split)

    if max_rows is None:
        selected = dataset
    else:
        # Clamp the cap to the split size so the index range stays valid.
        limit = min(max_rows, len(dataset))
        selected = dataset.select(range(limit))

    # Copy each record into an ordinary dict.
    rows: list[dict[str, Any]] = []
    for record in selected:
        rows.append(dict(record))
    return rows