Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from pathlib import Path | |
| from typing import Any, Dict, Optional | |
def load_kaggle_procurement_tables(
    base_path: str | Path,
    *,
    table_names: tuple[str, ...] = ("invoices", "suppliers", "departments", "labels"),
    required_tables: tuple[str, ...] = ("invoices",),
) -> Dict[str, Any]:
    """Load procurement tables from a local Kaggle dataset folder.

    For each name in ``table_names`` the folder is searched for
    ``<name>.parquet`` and then ``<name>.csv``; the first one found is read
    with pandas. Tables with neither file are silently left out of the result.

    Args:
        base_path: Folder that contains the table files.
        table_names: Table names to look for, in the order they should appear
            in the result. Defaults to the four tables this loader has always
            read.
        required_tables: Tables that must have been loaded. A required table
            is only loaded if it is also listed in ``table_names``.

    Returns:
        A dict mapping each table name that was found to the object returned
        by ``pandas.read_parquet`` / ``pandas.read_csv``.

    Raises:
        RuntimeError: If pandas cannot be imported, or if a required table was
            not loaded.
        FileNotFoundError: If ``base_path`` does not exist.
    """
    # pandas is imported lazily so the module can be imported without it.
    try:
        import pandas as pd  # type: ignore[import-not-found]
    except ImportError as exc:
        raise RuntimeError("pandas is required for Kaggle dataset ingestion") from exc

    base = Path(base_path)
    if not base.exists():
        raise FileNotFoundError(f"Kaggle dataset path not found: {base}")

    tables: Dict[str, Any] = {}
    for table_name in table_names:
        parquet_path = base / f"{table_name}.parquet"
        csv_path = base / f"{table_name}.csv"
        # Parquet takes precedence when both formats are present.
        if parquet_path.exists():
            tables[table_name] = pd.read_parquet(parquet_path)
        elif csv_path.exists():
            tables[table_name] = pd.read_csv(csv_path)

    # Report the first missing required table; with the defaults this is the
    # same "invoices" message the loader has always produced.
    for required in required_tables:
        if required not in tables:
            raise RuntimeError(f"Expected {required} table in Kaggle dataset folder")
    return tables
def load_hf_financial_rows(
    split: str = "train",
    max_rows: Optional[int] = None,
    *,
    dataset_name: str = "amitkedia/Financial-Fraud-Dataset",
) -> list[dict[str, Any]]:
    """Load rows of a Hugging Face dataset split as plain dictionaries.

    Args:
        split: Name of the split passed to ``datasets.load_dataset``.
        max_rows: If given, keep at most this many rows from the start of the
            split. ``None`` keeps every row.
        dataset_name: Hub identifier of the dataset to load. Defaults to the
            financial-fraud dataset this loader has always used.

    Returns:
        One ``dict`` per row, in dataset order.

    Raises:
        RuntimeError: If the ``datasets`` library cannot be imported.
    """
    # The datasets library is imported lazily so the module can be imported
    # without it.
    try:
        from datasets import load_dataset
    except ImportError as exc:
        raise RuntimeError("datasets library is required for Hugging Face dataset ingestion") from exc

    ds = load_dataset(dataset_name, split=split)
    if max_rows is not None:
        # Never ask for more rows than the split actually has.
        ds = ds.select(range(min(max_rows, len(ds))))
    return [dict(row) for row in ds]