|
|
""" |
|
|
FinEE Validator - JSON repair and validation. |
|
|
|
|
|
Handles: |
|
|
- Broken JSON repair (using json-repair) |
|
|
- Schema validation |
|
|
- Field type coercion |
|
|
""" |
|
|
|
|
|
import json |
|
|
from typing import Dict, Any, Optional, List |
|
|
import re |
|
|
|
|
|
try: |
|
|
from json_repair import repair_json |
|
|
HAS_JSON_REPAIR = True |
|
|
except ImportError: |
|
|
HAS_JSON_REPAIR = False |
|
|
|
|
|
|
|
|
from .schema import ExtractionResult, TransactionType, Category |
|
|
|
|
|
|
|
|
def repair_llm_json(raw_output: str) -> Optional[Dict[str, Any]]: |
|
|
""" |
|
|
Attempt to repair and parse LLM JSON output. |
|
|
|
|
|
Handles common issues: |
|
|
- Missing quotes |
|
|
- Trailing commas |
|
|
- Single quotes instead of double |
|
|
- Incomplete JSON |
|
|
|
|
|
Args: |
|
|
raw_output: Raw LLM output string |
|
|
|
|
|
Returns: |
|
|
Parsed dictionary or None if repair fails |
|
|
""" |
|
|
if not raw_output: |
|
|
return None |
|
|
|
|
|
|
|
|
json_str = extract_json_from_text(raw_output) |
|
|
|
|
|
if not json_str: |
|
|
return None |
|
|
|
|
|
|
|
|
try: |
|
|
return json.loads(json_str) |
|
|
except json.JSONDecodeError: |
|
|
pass |
|
|
|
|
|
|
|
|
if HAS_JSON_REPAIR: |
|
|
try: |
|
|
repaired = repair_json(json_str) |
|
|
return json.loads(repaired) |
|
|
except (json.JSONDecodeError, Exception): |
|
|
pass |
|
|
|
|
|
|
|
|
repaired = manual_json_repair(json_str) |
|
|
try: |
|
|
return json.loads(repaired) |
|
|
except json.JSONDecodeError: |
|
|
return None |
|
|
|
|
|
|
|
|
def extract_json_from_text(text: str) -> Optional[str]: |
|
|
""" |
|
|
Extract JSON object from text that may contain other content. |
|
|
|
|
|
Args: |
|
|
text: Text potentially containing JSON |
|
|
|
|
|
Returns: |
|
|
Extracted JSON string or None |
|
|
""" |
|
|
if not text: |
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
start = text.find('{') |
|
|
end = text.rfind('}') |
|
|
|
|
|
if start != -1 and end != -1 and end > start: |
|
|
return text[start:end + 1] |
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
def manual_json_repair(json_str: str) -> str: |
|
|
""" |
|
|
Manually repair common JSON issues. |
|
|
|
|
|
Args: |
|
|
json_str: Potentially broken JSON string |
|
|
|
|
|
Returns: |
|
|
Repaired JSON string |
|
|
""" |
|
|
if not json_str: |
|
|
return json_str |
|
|
|
|
|
repaired = json_str |
|
|
|
|
|
|
|
|
repaired = re.sub(r"'([^']*)':", r'"\1":', repaired) |
|
|
repaired = re.sub(r":\s*'([^']*)'", r': "\1"', repaired) |
|
|
|
|
|
|
|
|
repaired = re.sub(r',\s*}', '}', repaired) |
|
|
repaired = re.sub(r',\s*]', ']', repaired) |
|
|
|
|
|
|
|
|
repaired = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', repaired) |
|
|
|
|
|
|
|
|
repaired = repaired.replace(': None', ': null') |
|
|
repaired = repaired.replace(':None', ':null') |
|
|
repaired = repaired.replace(': True', ': true') |
|
|
repaired = repaired.replace(':True', ':true') |
|
|
repaired = repaired.replace(': False', ': false') |
|
|
repaired = repaired.replace(':False', ':false') |
|
|
|
|
|
return repaired |
|
|
|
|
|
|
|
|
def validate_extraction_result(data: Dict[str, Any]) -> ExtractionResult: |
|
|
""" |
|
|
Validate and coerce a dictionary into an ExtractionResult. |
|
|
|
|
|
Args: |
|
|
data: Dictionary from parsed JSON |
|
|
|
|
|
Returns: |
|
|
Validated ExtractionResult |
|
|
""" |
|
|
result = ExtractionResult() |
|
|
|
|
|
|
|
|
if 'amount' in data: |
|
|
amount = data['amount'] |
|
|
if isinstance(amount, (int, float)): |
|
|
result.amount = float(amount) |
|
|
elif isinstance(amount, str): |
|
|
try: |
|
|
|
|
|
cleaned = re.sub(r'[Rs\.₹,\s]', '', amount) |
|
|
result.amount = float(cleaned) |
|
|
except ValueError: |
|
|
pass |
|
|
|
|
|
|
|
|
if 'type' in data: |
|
|
type_val = str(data['type']).lower() |
|
|
if 'debit' in type_val: |
|
|
result.type = TransactionType.DEBIT |
|
|
elif 'credit' in type_val: |
|
|
result.type = TransactionType.CREDIT |
|
|
|
|
|
|
|
|
if 'date' in data: |
|
|
result.date = str(data['date']) |
|
|
|
|
|
|
|
|
for field in ['account', 'reference', 'vpa', 'merchant', 'payment_method', 'bank']: |
|
|
if field in data and data[field]: |
|
|
setattr(result, field, str(data[field])) |
|
|
|
|
|
|
|
|
if 'category' in data: |
|
|
cat_val = str(data['category']).lower() |
|
|
try: |
|
|
result.category = Category(cat_val) |
|
|
except ValueError: |
|
|
|
|
|
category_map = { |
|
|
'food': Category.FOOD, |
|
|
'dining': Category.FOOD, |
|
|
'restaurant': Category.FOOD, |
|
|
'grocery': Category.FOOD, |
|
|
'shop': Category.SHOPPING, |
|
|
'shopping': Category.SHOPPING, |
|
|
'retail': Category.SHOPPING, |
|
|
'travel': Category.TRANSPORT, |
|
|
'transport': Category.TRANSPORT, |
|
|
'cab': Category.TRANSPORT, |
|
|
'utility': Category.UTILITIES, |
|
|
'utilities': Category.UTILITIES, |
|
|
'bill': Category.UTILITIES, |
|
|
'entertainment': Category.ENTERTAINMENT, |
|
|
'movie': Category.ENTERTAINMENT, |
|
|
'transfer': Category.TRANSFER, |
|
|
'payment': Category.TRANSFER, |
|
|
} |
|
|
result.category = category_map.get(cat_val, Category.OTHER) |
|
|
|
|
|
return result |
|
|
|
|
|
|
|
|
def is_valid_amount(amount: Optional[float]) -> bool: |
|
|
"""Check if amount is valid.""" |
|
|
if amount is None: |
|
|
return False |
|
|
return isinstance(amount, (int, float)) and amount > 0 |
|
|
|
|
|
|
|
|
def is_valid_date(date_str: Optional[str]) -> bool: |
|
|
"""Check if date string is valid.""" |
|
|
if not date_str: |
|
|
return False |
|
|
|
|
|
|
|
|
pattern = r'^\d{1,2}[-/]\d{1,2}[-/]\d{2,4}$' |
|
|
return bool(re.match(pattern, date_str)) |
|
|
|
|
|
|
|
|
def is_valid_reference(ref: Optional[str]) -> bool: |
|
|
"""Check if reference number is valid.""" |
|
|
if not ref: |
|
|
return False |
|
|
|
|
|
|
|
|
cleaned = re.sub(r'\W', '', ref) |
|
|
return len(cleaned) >= 10 |
|
|
|
|
|
|
|
|
def is_valid_vpa(vpa: Optional[str]) -> bool: |
|
|
"""Check if VPA is valid.""" |
|
|
if not vpa: |
|
|
return False |
|
|
|
|
|
|
|
|
pattern = r'^[a-zA-Z0-9._-]+@[a-zA-Z0-9]+$' |
|
|
return bool(re.match(pattern, vpa.lower())) |
|
|
|