|
|
""" |
|
|
Tests for financial entity extraction functions. |
|
|
Run with: pytest tests/ -v |
|
|
""" |
|
|
|
|
|
import json
import re
from typing import Optional

import pytest
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Patterns used by extract_entities, compiled once at import time.

# Currency marker ("Rs", "Rs." or "₹") followed by a number that must start
# with a digit, so a stray comma after the marker is not taken as an amount.
_AMOUNT_RE = re.compile(r'(?:Rs\.?|₹)\s*(\d[\d,]*(?:\.\d{2})?)')

# "account" / "a/c", an optional "no"/"no."/"number" label, optional "*" mask
# characters, then a token that contains at least one digit.
_ACCOUNT_RE = re.compile(
    r'(?:account|a/c)[\s:]*(?:(?:number|no)\b\.?[\s:]*)?\**(\w*\d\w*)',
    re.IGNORECASE,
)

# DD-MM-YY or DD-MM-YYYY that is not embedded in a longer run of digits.
_DATE_RE = re.compile(r'(?<!\d)(\d{2}-\d{2}-(?:\d{4}|\d{2}))(?!\d)')

_REFERENCE_RE = re.compile(
    r'reference\s*(?:number|no\.?)?\s*(?:is)?\s*(\d+)',
    re.IGNORECASE,
)


def extract_entities(text: str) -> dict:
    """Extract financial entities from email text.

    Args:
        text: Raw text of a transaction notification.

    Returns:
        A dict holding only the keys that were found:

        * ``amount``    - digits after ``Rs``/``Rs.``/``₹`` with commas removed.
        * ``type``      - ``'debit'`` if the text contains "debited", otherwise
          ``'credit'`` if it contains "credited" (case-insensitive).
        * ``account``   - token following "account" or "a/c"; it must contain
          a digit, and leading ``*`` mask characters are dropped.
        * ``date``      - first ``DD-MM-YY`` or ``DD-MM-YYYY`` occurrence.
        * ``reference`` - digits following "reference [number|no.] [is]".
    """
    entities = {}

    amount_match = _AMOUNT_RE.search(text)
    if amount_match:
        entities['amount'] = amount_match.group(1).replace(',', '')

    # Lower-case once; "debited" takes precedence when both words appear.
    lowered = text.lower()
    if 'debited' in lowered:
        entities['type'] = 'debit'
    elif 'credited' in lowered:
        entities['type'] = 'credit'

    # Requiring a digit stops ordinary words after "account" (for example
    # "account on 28-12-25") from being reported as the account number.
    account_match = _ACCOUNT_RE.search(text)
    if account_match:
        entities['account'] = account_match.group(1)

    # The digit look-arounds keep ISO-style dates such as "2025-12-28" from
    # being misread as "25-12-28".
    date_match = _DATE_RE.search(text)
    if date_match:
        entities['date'] = date_match.group(1)

    ref_match = _REFERENCE_RE.search(text)
    if ref_match:
        entities['reference'] = ref_match.group(1)

    return entities
|
|
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """Remove noise from email text.

    Strips URLs, e-mail addresses and unbroken tokens of 80 or more
    non-whitespace characters, then collapses every whitespace run to a
    single space and trims both ends.

    Args:
        text: Raw email text.

    Returns:
        The cleaned, single-spaced text.
    """
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\S+@\S+\.\S+', '', text)
    text = re.sub(r'\S{80,}', '', text)

    # Collapse whitespace last so the gaps left by the removals above do not
    # survive as double spaces.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
|
|
|
|
|
|
|
|
def extract_json(response: str) -> Optional[dict]:
    """Extract JSON object from LLM response.

    Scans ``response`` for ``{`` characters and tries to decode a JSON
    object starting at each one, returning the first that parses. Decoding
    with the JSON parser (rather than a brace-matching regex) means nested
    objects and braces inside string values are handled.

    Args:
        response: Free-form text that may contain a JSON object.

    Returns:
        The first decodable JSON object as a dict, or ``None`` when the text
        contains no valid JSON object.
    """
    decoder = json.JSONDecoder()
    start = response.find('{')
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            # Not a valid object at this brace; try the next candidate.
            start = response.find('{', start + 1)
            continue
        if isinstance(parsed, dict):
            return parsed
        start = response.find('{', start + 1)
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestAmountExtraction:
    """Amount parsing for each currency-prefix and digit-grouping variant."""

    def test_amount_with_rupee_symbol(self):
        """An amount introduced by the ₹ sign keeps its decimal part."""
        entities = extract_entities("₹2500.00 has been debited")
        assert entities.get('amount') == '2500.00'

    def test_amount_with_rs_dot(self):
        """An amount introduced by 'Rs.' is recognised."""
        entities = extract_entities("Rs.1500.50 credited to your account")
        assert entities.get('amount') == '1500.50'

    def test_amount_with_rs_space(self):
        """An amount introduced by 'Rs' and a space (no dot) is recognised."""
        entities = extract_entities("Rs 3000 has been debited")
        assert entities.get('amount') == '3000'

    def test_amount_with_commas(self):
        """Thousands separators are removed from the extracted amount."""
        entities = extract_entities("Rs.50,000.00 credited")
        assert entities.get('amount') == '50000.00'

    def test_amount_large_number(self):
        """Indian-style digit grouping (1,25,000) is flattened to plain digits."""
        entities = extract_entities("₹1,25,000.00 transferred")
        assert entities.get('amount') == '125000.00'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestTransactionType:
    """Detection of the debit/credit transaction type."""

    def test_debit_detection(self):
        """Text containing 'debited' is classified as a debit."""
        entities = extract_entities("Rs.500 has been debited from your account")
        assert entities.get('type') == 'debit'

    def test_credit_detection(self):
        """Text containing 'credited' is classified as a credit."""
        entities = extract_entities("Rs.1000 credited to your account")
        assert entities.get('type') == 'credit'

    def test_debit_case_insensitive(self):
        """Upper-case 'DEBITED' is still classified as a debit."""
        entities = extract_entities("Rs.500 DEBITED from account")
        assert entities.get('type') == 'debit'

    def test_no_transaction_type(self):
        """Without 'debited' or 'credited' no 'type' key is produced."""
        entities = extract_entities("Rs.500 transferred to your account")
        assert 'type' not in entities
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestAccountExtraction:
    """Account number extraction for the supported label formats."""

    def test_account_with_word(self):
        """The 'account XXXX' form yields the number after the label."""
        entities = extract_entities("debited from account 3545")
        assert entities.get('account') == '3545'

    def test_account_with_ac(self):
        """The 'A/C XXXX' form yields the number after the label."""
        entities = extract_entities("credited to A/C 7890")
        assert entities.get('account') == '7890'

    def test_account_masked(self):
        """Masked account such as '**3545'.

        The assertion is deliberately lenient: it accepts either no account
        at all or a purely alphanumeric value, so it only guards against
        mask characters leaking into the extracted account.
        """
        entities = extract_entities("credited to your account **3545")
        account = entities.get('account')
        assert account is None or account.isalnum()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestDateExtraction:
    """Date extraction for two- and four-digit years."""

    def test_date_dd_mm_yy(self):
        """A DD-MM-YY date is returned verbatim."""
        entities = extract_entities("transaction on 28-12-25")
        assert entities.get('date') == '28-12-25'

    def test_date_dd_mm_yyyy(self):
        """A DD-MM-YYYY date is returned verbatim with the full year."""
        entities = extract_entities("transaction on 28-12-2025")
        assert entities.get('date') == '28-12-2025'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestReferenceExtraction:
    """Reference number extraction for the supported phrasings."""

    def test_reference_number(self):
        """'reference number is N' yields N."""
        entities = extract_entities(
            "Your UPI transaction reference number is 535899488403"
        )
        assert entities.get('reference') == '535899488403'

    def test_reference_no(self):
        """'Reference no. N' yields N."""
        entities = extract_entities("Reference no. 123456789")
        assert entities.get('reference') == '123456789'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestFullEmailExtraction:
    """End-to-end extraction from complete notification emails."""

    def test_hdfc_upi_debit(self):
        """HDFC UPI debit email: every entity present must be extracted."""
        text = """
        HDFC BANK Dear Customer, Rs.2500.00 has been debited from account 3545
        to VPA swiggy@ybl for Swiggy order on 28-12-25.
        Your UPI transaction reference number is 534567891234.
        """
        result = extract_entities(text)

        assert result.get('amount') == '2500.00'
        assert result.get('type') == 'debit'
        assert result.get('account') == '3545'
        assert result.get('date') == '28-12-25'
        assert result.get('reference') == '534567891234'

    def test_salary_credit(self):
        """Salary credit email: every entity present must be extracted."""
        text = """
        Dear Customer, Rs.45,000.00 has been credited to your account 7890
        on 27-12-25. Salary from ACME CORP. Reference number is 123456789.
        """
        result = extract_entities(text)

        assert result.get('amount') == '45000.00'
        assert result.get('type') == 'credit'
        # The email also carries an account and a reference number; check
        # them too so a regression in either extractor is caught here.
        assert result.get('account') == '7890'
        assert result.get('date') == '27-12-25'
        assert result.get('reference') == '123456789'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestTextCleaning:
    """Noise removal and whitespace handling in clean_text."""

    def test_remove_urls(self):
        """URLs are stripped while the surrounding words remain."""
        cleaned = clean_text("Click here https://example.com/link to verify")
        assert 'https://example.com' not in cleaned
        assert 'Click here' in cleaned

    def test_remove_email_addresses(self):
        """E-mail addresses are stripped from the text."""
        cleaned = clean_text("Contact us at support@bank.com for help")
        assert 'support@bank.com' not in cleaned

    def test_normalize_whitespace(self):
        """Newline runs collapse to a single space."""
        cleaned = clean_text("Hello World\n\nTest")
        assert cleaned == "Hello World Test"

    def test_empty_string(self):
        """An empty input yields an empty output."""
        assert clean_text("") == ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestJsonExtraction:
    """Pulling a JSON object out of free-form LLM output."""

    def test_extract_valid_json(self):
        """A JSON object embedded in surrounding prose is parsed."""
        payload = extract_json(
            'Some text {"category": "finance", "confidence": "high"} more text'
        )
        assert payload == {"category": "finance", "confidence": "high"}

    def test_extract_json_with_newlines(self):
        """A JSON object on its own line of a multi-line response is parsed."""
        response = '''
        Here is the result:
        {"amount": "500", "type": "debit"}
        '''
        payload = extract_json(response)
        assert payload['amount'] == '500'
        assert payload['type'] == 'debit'

    def test_no_json_in_response(self):
        """A response without any JSON object yields None."""
        payload = extract_json("I couldn't extract any entities from this email.")
        assert payload is None

    def test_invalid_json(self):
        """A malformed JSON object yields None instead of raising."""
        payload = extract_json('{"amount": 500, "type": }')
        assert payload is None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestEdgeCases:
    """Boundary conditions for extract_entities."""

    def test_empty_text(self):
        """Empty input produces an empty result dict."""
        assert extract_entities("") == {}

    def test_no_financial_content(self):
        """A non-financial message yields neither an amount nor a type."""
        entities = extract_entities("Hello, how are you? Let's meet tomorrow.")
        assert 'amount' not in entities
        assert 'type' not in entities

    def test_multiple_amounts(self):
        """With several amounts present, the first one is reported."""
        entities = extract_entities("Rs.500 debited, balance Rs.1000")
        assert entities.get('amount') == '500'

    def test_unicode_text(self):
        """Emoji elsewhere in the text do not disturb amount extraction."""
        entities = extract_entities("₹500.00 has been debited 📱💰")
        assert entities.get('amount') == '500.00'
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Run this file's tests verbosely and propagate pytest's exit code, so
    # that invoking the file directly reports failures through the process
    # exit status.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
|
|