| import json |
| import re |
| from pathlib import Path |
| from typing import Any |
|
|
|
|
# Column names published under the "columns" key of the combined batch output.
REQUIRED_COLUMNS = [
    "date",
    "description",
    "voucher_type",
    "amount",
    "closing",
]
|
|
|
|
def extract_json_object(text: str) -> dict[str, Any]:
    """Extract the JSON object carried by an AI response.

    A Markdown code fence at the very start (optionally tagged ``json``) and
    at the very end of the stripped text is removed, then the remainder is
    parsed as JSON. If that fails, the text is scanned for the first embedded
    JSON object whose ``success`` is ``True`` and whose ``columns`` and
    ``data`` entries are lists.

    Args:
        text: Raw response text.

    Returns:
        The parsed JSON object; its ``data`` entry is always a list.

    Raises:
        ValueError: If no acceptable JSON object is found, the parsed JSON is
            not an object, ``data`` is not a list, or ``data`` is exactly the
            single placeholder example row.
    """
    cleaned = text.strip()
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r"\s*```$", "", cleaned)

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # The text is not pure JSON (e.g. surrounded by prose): try every
        # opening brace as a potential start of the payload object.
        parsed = None
        decoder = json.JSONDecoder()
        for match in re.finditer(r"\{", cleaned):
            try:
                # Decode in place from the brace offset; slicing the string
                # here would copy the remaining text for every candidate.
                candidate, _ = decoder.raw_decode(cleaned, match.start())
            except json.JSONDecodeError:
                continue
            if (
                isinstance(candidate, dict)
                and candidate.get("success") is True
                and isinstance(candidate.get("columns"), list)
                and isinstance(candidate.get("data"), list)
            ):
                parsed = candidate
                break
        if parsed is None:
            raise ValueError("The AI response did not contain a complete JSON object.")

    if not isinstance(parsed, dict):
        raise ValueError("The AI response JSON must be an object.")
    if not isinstance(parsed.get("data"), list):
        raise ValueError("The AI response JSON must contain a data array.")

    # A response whose data is exactly this template row carries no real
    # transactions and is rejected.
    placeholder_rows = [
        {
            "date": "DD/MM/YYYY",
            "description": "Full transaction narration exactly as shown",
            "voucher_type": "Payment or Receipt",
            "amount": "0.00",
            "closing": "0.00",
        }
    ]
    if parsed["data"] == placeholder_rows:
        raise ValueError("The AI returned the placeholder example row instead of extracted transactions.")
    return parsed
|
|
|
|
def combine_batch_files(batch_files: list[Path], output_path: Path) -> dict[str, Any]:
    """Merge the ``data`` rows of several batch JSON files into one document.

    Each batch file is read as UTF-8 JSON and its ``data`` rows are appended,
    in the order the files are given. A file without a ``data`` key
    contributes no rows. The merged document is written to ``output_path``
    as indented UTF-8 JSON and also returned.

    Args:
        batch_files: Paths of the batch JSON files to merge, in order.
        output_path: Destination path for the combined JSON document.

    Returns:
        A dict with ``success`` set to ``True``, ``columns`` set to a copy of
        ``REQUIRED_COLUMNS`` and ``data`` holding all collected rows.

    Raises:
        ValueError: If a batch file's top-level JSON is not an object or its
            ``data`` entry is not a list.
        json.JSONDecodeError: If a batch file does not contain valid JSON.
    """
    combined: dict[str, Any] = {
        "success": True,
        # Copy so callers mutating the result cannot alter the module constant.
        "columns": list(REQUIRED_COLUMNS),
        "data": [],
    }

    for batch_file in batch_files:
        parsed = json.loads(batch_file.read_text(encoding="utf-8"))
        if not isinstance(parsed, dict):
            # Without this check a non-object payload fails with an opaque
            # AttributeError on ``.get`` below.
            raise ValueError(f"{batch_file.name} does not contain a JSON object.")
        rows = parsed.get("data", [])
        if not isinstance(rows, list):
            raise ValueError(f"{batch_file.name} does not contain a data array.")
        combined["data"].extend(rows)

    output_path.write_text(
        json.dumps(combined, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    return combined
|
|