# File size: 10,529 Bytes
# dcc24f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
"""
Finance Entity Extractor - Professional Inference Module.

Provides structured API with JSON schema enforcement for
extracting financial entities from Indian banking emails.

Author: Ranjit Behera
License: MIT
Version: 0.8.0

Example:
    >>> from inference import FinanceExtractor
    >>> extractor = FinanceExtractor()
    >>> result = extractor.extract("Rs.2500.00 debited from account 3545...")
    >>> print(result.amount)  # "2500.00"
"""

import json
import re
from dataclasses import dataclass, asdict, field
from typing import Optional, Dict, Any, List
from enum import Enum


class TransactionType(str, Enum):
    """
    Transaction type enumeration.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value (e.g. ``TransactionType.DEBIT == "debit"``).

    NOTE(review): no other definition visible in this file references
    these members; validate_entity() compares against the plain strings
    'credit' and 'debit' and stores None for anything else.
    """
    # String value "credit".
    CREDIT = "credit"
    # String value "debit".
    DEBIT = "debit"
    # String value "unknown"; presumably a placeholder for unrecognised
    # types -- confirm against callers outside this file.
    UNKNOWN = "unknown"


class ExtractionFormat(str, Enum):
    """
    Supported input formats.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value. build_prompt() uses the member to choose the
    tag placed at the start of the prompt.
    """
    # Email body; build_prompt() adds no tag for this format.
    EMAIL = "email"
    # Tagged "[BANK_STATEMENT] " by build_prompt().
    BANK_STATEMENT = "bank_statement"
    # Tagged "[PHONEPE] " by build_prompt().
    PHONEPE = "phonepe"
    # Tagged "[GPAY] " by build_prompt().
    GPAY = "gpay"
    # Tagged "[PAYTM] " by build_prompt().
    PAYTM = "paytm"


@dataclass
class FinanceEntity:
    """
    Container for the financial fields pulled out of one piece of text.

    Every field is optional and defaults to None, which marks it as
    not extracted.
    """
    amount: Optional[str] = None
    type: Optional[str] = None
    date: Optional[str] = None
    account: Optional[str] = None
    reference: Optional[str] = None
    merchant: Optional[str] = None
    category: Optional[str] = None
    bank: Optional[str] = None
    # Raw model output; left out of repr() and of to_dict().
    raw_response: Optional[str] = field(default=None, repr=False)

    def to_dict(self) -> Dict[str, Any]:
        """Return the populated public fields as a plain dictionary.

        Fields that are None and the internal ``raw_response`` field are
        omitted. Keys keep the field declaration order.
        """
        return {
            name: value
            for name, value in asdict(self).items()
            if name != 'raw_response' and value is not None
        }

    def to_json(self) -> str:
        """Serialize the output of to_dict() as JSON indented by two spaces."""
        payload = self.to_dict()
        return json.dumps(payload, indent=2)

    def is_valid(self) -> bool:
        """Report whether both ``amount`` and ``type`` were extracted."""
        return not (self.amount is None or self.type is None)

    def __str__(self) -> str:
        # The string form is the JSON rendering.
        return self.to_json()


def build_prompt(text: str, format_type: ExtractionFormat = ExtractionFormat.EMAIL) -> str:
    """
    Assemble the model prompt for one piece of input text.

    The wording and layout reproduce the prompt format the model was
    trained on. Do not alter them - doing so degrades extraction quality.

    Args:
        text: Raw input to analyse (email body, statement row, etc.)
        format_type: Source format; selects the tag placed in front of
            the instructions. Emails, and any format without a
            registered tag, get no tag.

    Returns:
        The complete prompt, ending with "Output JSON:"
    """
    # Source tags exactly as they were used in training.
    source_tags = {
        ExtractionFormat.EMAIL: "",
        ExtractionFormat.BANK_STATEMENT: "[BANK_STATEMENT] ",
        ExtractionFormat.PHONEPE: "[PHONEPE] ",
        ExtractionFormat.GPAY: "[GPAY] ",
        ExtractionFormat.PAYTM: "[PAYTM] ",
    }
    tag = source_tags.get(format_type, "")

    # One entry per prompt line; the instruction says "email" for every format.
    prompt_lines = (
        f"{tag}Extract financial entities from this email:",
        "",
        f"{text}",
        "",
        "Extract: amount, type, date, account, reference, merchant, category",
        "Output JSON:",
    )
    return "\n".join(prompt_lines)


def parse_json_response(response: str) -> Dict[str, Any]:
    """
    Parse the first JSON object found in a model response.

    Handles various response formats:
    - Clean JSON: {"amount": "500"}
    - Markdown JSON: ```json {"amount": "500"} ```
    - Conversational: "Here is the data: {...} hope this helps"
    - Nested objects: the outermost decodable object is returned,
      not an inner one

    Args:
        response: Raw model output string

    Returns:
        Parsed dictionary, or an empty dict when the response is not a
        string or contains no decodable JSON object. The result is always
        a dict, even if the response is valid JSON of another type
        (number, string, list).
    """
    if not isinstance(response, str):
        return {}

    text = response.strip()

    # Fast path: the whole response is a single JSON document.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Fallback: try to decode an object starting at each "{" in turn.
    # raw_decode() ignores whatever follows the object, so surrounding
    # prose and code fences need no special handling, and trying opening
    # braces left to right yields the outermost valid object first.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find("{", start + 1)

    return {}


def validate_entity(data: Dict[str, Any]) -> FinanceEntity:
    """
    Validate and normalize extracted entity data.

    Normalization rules:
    - ``type`` is stripped and lowercased and kept only if it is
      'credit' or 'debit'; null, non-string or other values become None.
    - ``amount`` is converted to a string with commas and surrounding
      whitespace removed and kept only if it parses as a finite number;
      malformed values such as "1.2.3", "12-34" or "nan" become None.
    - ``account`` and ``reference`` are converted to strings; falsy
      values become None.
    - ``date``, ``merchant``, ``category`` and ``bank`` are passed
      through unchanged.

    Args:
        data: Raw parsed dictionary. Anything that is not a dict is
            treated as empty.

    Returns:
        Validated FinanceEntity object
    """
    import math  # local import: only needed for the finiteness check

    # A non-dict payload (e.g. a JSON list or number) has no usable fields.
    if not isinstance(data, dict):
        data = {}

    # Normalize type field; the model may emit null or a non-string here.
    raw_type = data.get('type')
    txn_type = raw_type.strip().lower() if isinstance(raw_type, str) else None
    if txn_type not in ('credit', 'debit'):
        txn_type = None

    # Normalize amount (remove commas, validate numeric)
    amount = None
    raw_amount = data.get('amount')
    if raw_amount:
        candidate = str(raw_amount).replace(',', '').strip()
        # Parse the whole string: deleting '.' and '-' before parsing
        # would let strings such as "1.2.3" through.
        try:
            numeric = float(candidate)
        except ValueError:
            numeric = None
        # float() also accepts "nan" and "inf", which are not amounts.
        if numeric is not None and math.isfinite(numeric):
            amount = candidate

    account = data.get('account')
    reference = data.get('reference')

    return FinanceEntity(
        amount=amount,
        type=txn_type,
        date=data.get('date'),
        account=str(account) if account else None,
        reference=str(reference) if reference else None,
        merchant=data.get('merchant'),
        category=data.get('category'),
        bank=data.get('bank'),
    )


class FinanceExtractor:
    """
    High-level API for financial entity extraction.

    Provides a clean, validated interface for extracting
    financial data from Indian banking emails and statements.
    The model is loaded lazily, on the first extraction call.

    Example:
        >>> extractor = FinanceExtractor()
        >>> result = extractor.extract(
        ...     "Rs.2500.00 debited from account 3545 to VPA swiggy@ybl"
        ... )
        >>> print(result.amount)  # "2500.00"
        >>> print(result.to_json())
    """

    # Hugging Face repository loaded when no model_path is supplied.
    DEFAULT_MODEL_ID = "Ranjit0034/finance-entity-extractor"

    def __init__(self, model_path: Optional[str] = None, adapter_path: Optional[str] = None):
        """
        Initialize the extractor.

        Nothing is loaded here; see _load_model().

        Args:
            model_path: Path to base model (default: from HuggingFace)
            adapter_path: Path to LoRA adapters (default: none passed to
                the loader)
        """
        self.model_path = model_path
        self.adapter_path = adapter_path
        self._model = None
        self._tokenizer = None

    def _load_model(self) -> None:
        """
        Lazy load model on first use.

        Does nothing when a model is already loaded. ``model_path`` and
        ``adapter_path`` are honoured independently: a missing model path
        falls back to DEFAULT_MODEL_ID, and the adapter path is forwarded
        only when one was given.

        Raises:
            ImportError: If mlx_lm is not installed.
        """
        if self._model is not None:
            return

        try:
            from mlx_lm import load
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "mlx_lm is required for MLX inference. "
                "Install with: pip install mlx-lm>=0.19.0"
            ) from err

        # Pass adapter_path only when set, so the default case remains a
        # plain load(<repo id>) call.
        load_kwargs = {}
        if self.adapter_path:
            load_kwargs["adapter_path"] = self.adapter_path

        self._model, self._tokenizer = load(
            self.model_path or self.DEFAULT_MODEL_ID,
            **load_kwargs
        )

    def extract(
        self,
        text: str,
        format_type: ExtractionFormat = ExtractionFormat.EMAIL,
        max_tokens: int = 200,
    ) -> FinanceEntity:
        """
        Extract financial entities from text.

        Args:
            text: Input text (email body, statement row, etc.)
            format_type: Type of input format
            max_tokens: Maximum tokens to generate

        Returns:
            FinanceEntity with extracted data. Its ``raw_response``
            attribute holds the unparsed model output.

        Raises:
            ImportError: If mlx_lm is not installed.
        """
        self._load_model()

        from mlx_lm import generate

        prompt = build_prompt(text, format_type)
        response = generate(
            self._model,
            self._tokenizer,
            prompt=prompt,
            max_tokens=max_tokens,
        )

        # Parse and validate; an unparseable response yields an entity
        # whose fields are all None.
        data = parse_json_response(response)
        entity = validate_entity(data)
        entity.raw_response = response

        return entity

    def extract_batch(
        self,
        texts: List[str],
        format_type: ExtractionFormat = ExtractionFormat.EMAIL,
        max_tokens: int = 200,
    ) -> List[FinanceEntity]:
        """
        Extract entities from multiple texts.

        Texts are processed one after another with extract().

        Args:
            texts: List of input texts
            format_type: Type of input format, applied to every text
            max_tokens: Maximum tokens to generate per text

        Returns:
            List of FinanceEntity objects, in the same order as ``texts``
        """
        return [self.extract(text, format_type, max_tokens) for text in texts]


# Convenience function for simple usage.
# One shared extractor, created on the first extract() call, so repeated
# calls do not reload the model each time.
_default_extractor = None


def extract(text: str, format_type: str = "email") -> Dict[str, Any]:
    """
    Simple extraction function.

    Reuses a single module-level FinanceExtractor across calls, so the
    model is loaded at most once per process.

    Args:
        text: Input text to extract from
        format_type: One of "email", "bank_statement", "phonepe", "gpay", "paytm"
            (case-insensitive). Any other value is treated as "email".

    Returns:
        Dictionary with extracted entities

    Example:
        >>> from inference import extract
        >>> result = extract("Rs.500 debited from A/c 1234")
        >>> print(result["amount"])  # "500"
    """
    global _default_extractor

    # ExtractionFormat values are exactly the accepted names, so look the
    # format up by value instead of keeping a second name-to-member map.
    try:
        fmt = ExtractionFormat(format_type.lower())
    except ValueError:
        fmt = ExtractionFormat.EMAIL

    if _default_extractor is None:
        _default_extractor = FinanceExtractor()

    entity = _default_extractor.extract(text, fmt)

    return entity.to_dict()


if __name__ == "__main__":
    # Offline demo: builds a prompt and parses a canned response, so it
    # runs without loading the model.
    sample_text = """
    HDFC BANK Dear Customer,
    Rs.2500.00 has been debited from account 3545 to VPA swiggy@ybl 
    SWIGGY INDIA on 28-12-25.
    Your UPI transaction reference number is 534567891234.
    """
    banner = "=" * 60

    print(banner)
    print("Finance Entity Extractor v0.8.0 - Demo")
    print(banner)
    print(f"\nInput:\n{sample_text.strip()}")
    print("\nBuilding prompt...")
    built_prompt = build_prompt(sample_text)
    # Show only the first 200 characters of the prompt.
    print(f"Prompt:\n{built_prompt[:200]}...")

    # Canned model output standing in for a real generation.
    canned_response = '{"amount": "2500.00", "type": "debit", "date": "28-12-25", "account": "3545", "reference": "534567891234", "merchant": "swiggy", "category": "food"}'

    print("\nParsing response...")
    parsed = parse_json_response(canned_response)
    result = validate_entity(parsed)

    print("\nExtracted Entity:")
    print(result.to_json())
    print(f"\nValid: {result.is_valid()}")