File size: 6,553 Bytes
dcc24f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
"""
FinEE Validator - JSON repair and validation.

Handles:
- Broken JSON repair (using json-repair)
- Schema validation
- Field type coercion
"""

import json
from typing import Dict, Any, Optional, List
import re

try:
    from json_repair import repair_json
    HAS_JSON_REPAIR = True
except ImportError:
    HAS_JSON_REPAIR = False


from .schema import ExtractionResult, TransactionType, Category


def repair_llm_json(raw_output: str) -> Optional[Dict[str, Any]]:
    """
    Attempt to repair and parse LLM JSON output.
    
    Parsing is attempted in three stages, stopping at the first stage that
    yields a JSON object: strict ``json.loads``, the optional ``json-repair``
    package (when installed), and finally ``manual_json_repair``.
    
    Handles common issues:
    - Missing quotes
    - Trailing commas
    - Single quotes instead of double
    - Incomplete JSON (an opening brace with no closing brace is still
      handed to the repair stages instead of being rejected up front)
    
    Args:
        raw_output: Raw LLM output string
        
    Returns:
        Parsed dictionary or None if repair fails. A repaired value that is
        not a JSON object (for example a list) is treated as a failure.
    """
    if not raw_output:
        return None
    
    # Try to extract JSON from the output
    json_str = extract_json_from_text(raw_output)
    
    if not json_str:
        # No complete {...} span was found. If there is at least an opening
        # brace, the output may simply be cut off, so let the repair stages
        # try the text from that brace onwards.
        brace_at = raw_output.find('{')
        if brace_at == -1:
            return None
        json_str = raw_output[brace_at:]
    
    # First, try direct parsing
    try:
        parsed = json.loads(json_str)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed
    
    # Use json-repair if available
    if HAS_JSON_REPAIR:
        try:
            parsed = json.loads(repair_json(json_str))
        except Exception:
            # Deliberately best-effort: json-repair is third-party code, so
            # any failure just falls through to the manual repair below.
            parsed = None
        if isinstance(parsed, dict):
            return parsed
    
    # Manual repair attempts
    try:
        parsed = json.loads(manual_json_repair(json_str))
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def extract_json_from_text(text: str) -> Optional[str]:
    """
    Extract JSON object from text that may contain other content.
    
    If a syntactically valid JSON object begins at the first ``{``, exactly
    that object is returned, so trailing prose that happens to contain
    braces is not swallowed. Otherwise the span from the first ``{`` to the
    last ``}`` is returned unchanged so that callers can attempt a repair.
    
    Args:
        text: Text potentially containing JSON
        
    Returns:
        Extracted JSON string or None
    """
    if not text:
        return None
    
    start = text.find('{')
    if start == -1:
        return None
    
    # Preferred path: let the JSON decoder find where the object that starts
    # at the first brace really ends.
    try:
        _, stop = json.JSONDecoder().raw_decode(text, start)
    except (ValueError, RecursionError):
        # Not strictly valid JSON (or too deeply nested to decode here);
        # fall back to the widest brace-delimited span below.
        pass
    else:
        return text[start:stop]
    
    # Fallback: first { to last }
    end = text.rfind('}')
    if end > start:
        return text[start:end + 1]
    
    return None


def manual_json_repair(json_str: str) -> str:
    """
    Manually repair common JSON issues.
    
    The repairs are plain regex substitutions applied to the whole string,
    in this order: single-quoted keys and values become double-quoted,
    trailing commas before ``}`` / ``]`` are dropped, bare identifier keys
    are quoted, and Python ``None`` / ``True`` / ``False`` values following
    a colon become ``null`` / ``true`` / ``false``.
    
    The substitutions are not aware of string boundaries, so text inside a
    string value that looks like one of these patterns is rewritten too.
    
    Args:
        json_str: Potentially broken JSON string
        
    Returns:
        Repaired JSON string (the input is returned as-is when empty)
    """
    if not json_str:
        return json_str
    
    python_to_json = {'None': 'null', 'True': 'true', 'False': 'false'}
    
    repaired = json_str
    
    # Replace single quotes with double quotes
    repaired = re.sub(r"'([^']*)':", r'"\1":', repaired)
    repaired = re.sub(r":\s*'([^']*)'", r': "\1"', repaired)
    
    # Remove trailing commas
    repaired = re.sub(r',\s*}', '}', repaired)
    repaired = re.sub(r',\s*]', ']', repaired)
    
    # Add missing quotes around unquoted keys
    repaired = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', repaired)
    
    # Handle Python-style None/True/False. Any whitespace after the colon is
    # accepted and preserved, and the trailing \b stops longer words that
    # merely start with a literal (e.g. "Nonesuch") from being mangled.
    repaired = re.sub(
        r'(:\s*)(None|True|False)\b',
        lambda m: m.group(1) + python_to_json[m.group(2)],
        repaired,
    )
    
    return repaired


def validate_extraction_result(data: Dict[str, Any]) -> ExtractionResult:
    """
    Validate and coerce a dictionary into an ExtractionResult.
    
    Each recognised key is coerced independently; keys that are missing or
    cannot be coerced leave the corresponding attribute of a freshly
    constructed ExtractionResult untouched.
    
    - ``amount``: numbers are converted to float; strings have currency
      markers ("Rs", "Rs.", "INR", "₹"), thousands separators and whitespace
      removed before conversion. Booleans are ignored.
    - ``type``: mapped to DEBIT / CREDIT when the text contains "debit" or
      "credit" (case-insensitive).
    - ``date``: stored as a string; a null value is ignored.
    - ``account``, ``reference``, ``vpa``, ``merchant``, ``payment_method``,
      ``bank``: stored as strings when present and truthy.
    - ``category``: matched against Category values, then against a table
      of common aliases, falling back to Category.OTHER.
    
    Args:
        data: Dictionary from parsed JSON
        
    Returns:
        Validated ExtractionResult (left at its defaults when ``data`` is
        not a dictionary)
    """
    result = ExtractionResult()
    
    # A failed parse upstream can hand us None or a non-object value; there
    # is nothing to coerce in that case.
    if not isinstance(data, dict):
        return result
    
    # Amount
    amount = data.get('amount')
    if isinstance(amount, (int, float)) and not isinstance(amount, bool):
        result.amount = float(amount)
    elif isinstance(amount, str):
        # Strip currency markers and grouping commas but keep the decimal
        # point, so "Rs. 1,234.50" becomes 1234.50 rather than 123450.
        cleaned = re.sub(r'(?i)rs\.?|inr|[₹,\s]', '', amount)
        try:
            result.amount = float(cleaned)
        except ValueError:
            # Not a parseable number; leave the amount unset.
            pass
    
    # Type
    if 'type' in data:
        type_val = str(data['type']).lower()
        if 'debit' in type_val:
            result.type = TransactionType.DEBIT
        elif 'credit' in type_val:
            result.type = TransactionType.CREDIT
    
    # Date (keep as string); skip null so it is not stored as "None"
    if data.get('date') is not None:
        result.date = str(data['date'])
    
    # Simple string fields
    for field in ['account', 'reference', 'vpa', 'merchant', 'payment_method', 'bank']:
        if field in data and data[field]:
            setattr(result, field, str(data[field]))
    
    # Category
    if 'category' in data:
        cat_val = str(data['category']).lower()
        try:
            result.category = Category(cat_val)
        except ValueError:
            # Map common variations
            category_map = {
                'food': Category.FOOD,
                'dining': Category.FOOD,
                'restaurant': Category.FOOD,
                'grocery': Category.FOOD,
                'shop': Category.SHOPPING,
                'shopping': Category.SHOPPING,
                'retail': Category.SHOPPING,
                'travel': Category.TRANSPORT,
                'transport': Category.TRANSPORT,
                'cab': Category.TRANSPORT,
                'utility': Category.UTILITIES,
                'utilities': Category.UTILITIES,
                'bill': Category.UTILITIES,
                'entertainment': Category.ENTERTAINMENT,
                'movie': Category.ENTERTAINMENT,
                'transfer': Category.TRANSFER,
                'payment': Category.TRANSFER,
            }
            result.category = category_map.get(cat_val, Category.OTHER)
    
    return result


def is_valid_amount(amount: Optional[float]) -> bool:
    """
    Check if amount is valid.
    
    A valid amount is an int or float that is strictly positive and finite.
    None, booleans, non-numeric values, zero, negatives, NaN and infinity
    are all rejected.
    
    Args:
        amount: Candidate amount
        
    Returns:
        True when the amount is a positive, finite number
    """
    # bool is a subclass of int, so it has to be excluded explicitly.
    if amount is None or isinstance(amount, bool):
        return False
    if not isinstance(amount, (int, float)):
        return False
    # The chained comparison is False for NaN and for +infinity.
    return 0 < amount < float('inf')


def is_valid_date(date_str: Optional[str]) -> bool:
    """
    Check if date string is valid.
    
    This is a shape check only: one or two digits, a ``-`` or ``/``, one or
    two digits, a ``-`` or ``/``, then two to four digits. The numeric
    values themselves are not range-checked.
    
    Args:
        date_str: Candidate date string
        
    Returns:
        True when the string matches the basic date shape
    """
    looks_like_date = False
    if date_str:
        # Basic format check (DD-MM-YYYY)
        matched = re.match(r'^\d{1,2}[-/]\d{1,2}[-/]\d{2,4}$', date_str)
        looks_like_date = matched is not None
    return looks_like_date


def is_valid_reference(ref: Optional[str]) -> bool:
    """
    Check if reference number is valid.
    
    Non-word characters (anything matched by ``\\W``) are discarded and the
    remainder must be at least 10 characters long.
    
    Args:
        ref: Candidate reference string
        
    Returns:
        True when at least 10 word characters remain
    """
    if ref:
        # Should be 10+ alphanumeric characters
        word_chars_only = re.sub(r'\W', '', ref)
        return not len(word_chars_only) < 10
    return False


def is_valid_vpa(vpa: Optional[str]) -> bool:
    """
    Check if VPA is valid.
    
    The lower-cased value must consist of a non-empty user part (letters,
    digits, ``.``, ``_`` or ``-``), an ``@``, and a non-empty handle made of
    letters and digits only.
    
    Args:
        vpa: Candidate VPA string
        
    Returns:
        True when the string has the basic ``user@bank`` shape
    """
    # Basic VPA format: user@bank
    return bool(vpa) and (
        re.match(r'^[a-zA-Z0-9._-]+@[a-zA-Z0-9]+$', vpa.lower()) is not None
    )