# File: scam/tests/unit/test_extractor.py
# Repository page residue: uploaded by Gankit12 ("Upload 129 files"), commit 31f0e50 (verified).
"""
Unit Tests for Intelligence Extraction Module.
Tests Task 7.1 implementation with acceptance criteria:
- AC-3.1.1: UPI ID extraction precision >90%
- AC-3.1.2: Bank account precision >85%
- AC-3.1.3: IFSC code precision >95%
- AC-3.1.4: Phone number precision >90%
- AC-3.1.5: Phishing link precision >95%
- AC-3.3.1: Devanagari digit conversion 100% accurate
"""
import pytest
from app.models.extractor import (
IntelligenceExtractor,
extract_intelligence,
extract_from_messages,
get_extractor,
reset_extractor,
VALID_UPI_PROVIDERS,
)
# ============================================================================
# Setup and Fixtures
# ============================================================================
@pytest.fixture
def extractor():
    """Provide a new IntelligenceExtractor constructed with use_spacy=False."""
    instance = IntelligenceExtractor(use_spacy=False)
    return instance
@pytest.fixture(autouse=True)
def reset_singleton():
    """Call reset_extractor() on both sides of every test (autouse fixture)."""
    # Setup half: executed before the test body runs.
    reset_extractor()
    yield
    # Teardown half: executed after the test body has finished.
    reset_extractor()
# ============================================================================
# Basic Initialization Tests
# ============================================================================
class TestExtractorInitialization:
    """Checks performed on a freshly constructed IntelligenceExtractor."""

    def test_extractor_initialization(self, extractor):
        """The fixture object exists and exposes its patterns as a dict."""
        assert extractor is not None
        patterns = extractor.patterns
        assert isinstance(patterns, dict)

    def test_extractor_has_all_patterns(self, extractor):
        """Every required pattern name is a key of extractor.patterns."""
        expected_keys = (
            "upi_ids",
            "bank_accounts",
            "ifsc_codes",
            "phone_numbers",
            "phishing_links",
        )
        for key in expected_keys:
            assert key in extractor.patterns

    def test_extractor_has_devanagari_map(self, extractor):
        """The Devanagari digit map is present and holds ten entries."""
        digit_map = extractor.devanagari_map
        assert digit_map is not None
        assert len(digit_map) == 10

    def test_extract_returns_expected_format(self, extractor):
        """extract() returns (dict, float) and the dict has all five keys."""
        result, score = extractor.extract("Send money to scammer@paytm")
        assert isinstance(result, dict)
        assert isinstance(score, float)
        for key in (
            "upi_ids",
            "bank_accounts",
            "ifsc_codes",
            "phone_numbers",
            "phishing_links",
        ):
            assert key in result
# ============================================================================
# AC-3.1.1: UPI ID Extraction Tests (>90% precision)
# ============================================================================
class TestUPIExtraction:
    """UPI ID extraction checks (AC-3.1.1)."""

    def test_extract_common_upi_providers(self, extractor):
        """Each sample sentence yields the UPI ID embedded in it."""
        samples = {
            "Pay to user@paytm": "user@paytm",
            "Send to fraud@ybl": "fraud@ybl",
            "UPI: victim@okaxis": "victim@okaxis",
            "UPI ID: scammer@okhdfcbank": "scammer@okhdfcbank",
            "Pay user@oksbi immediately": "user@oksbi",
            "Transfer to target@icici": "target@icici",
        }
        for sentence, upi in samples.items():
            intel, _ = extractor.extract(sentence)
            assert upi in intel["upi_ids"], f"Failed for: {sentence}"

    def test_extract_multiple_upi_ids(self, extractor):
        """Three UPI IDs in one sentence are all reported."""
        intel, _ = extractor.extract("Pay to user@paytm or fraud@ybl or backup@okaxis")
        found = intel["upi_ids"]
        assert len(found) >= 3
        for upi in ("user@paytm", "fraud@ybl", "backup@okaxis"):
            assert upi in found

    def test_exclude_email_addresses(self, extractor):
        """Ordinary e-mail addresses must not show up as UPI IDs."""
        intel, _ = extractor.extract("Email me at user@gmail.com or contact@company.org")
        # None of the reported IDs may end in one of the e-mail domains.
        email_suffixes = ("@gmail.com", "@company.org")
        for candidate in intel["upi_ids"]:
            assert not candidate.endswith(email_suffixes)

    def test_upi_with_special_characters(self, extractor):
        """User parts containing a dot, underscore or hyphen are accepted."""
        sentence = "Pay to user.name@paytm or user_123@ybl or user-test@okaxis"
        intel, _ = extractor.extract(sentence)
        assert len(intel["upi_ids"]) == 3

    def test_exclude_short_user_names(self, extractor):
        """A one-character user part is not reported as a UPI ID."""
        intel, _ = extractor.extract("Invalid: a@paytm")
        assert "a@paytm" not in intel["upi_ids"]

    def test_valid_upi_providers_list(self):
        """VALID_UPI_PROVIDERS lists each of the major provider handles."""
        for handle in ("paytm", "ybl", "okaxis", "okhdfcbank", "oksbi", "icici"):
            assert handle in VALID_UPI_PROVIDERS
# ============================================================================
# AC-3.1.2: Bank Account Extraction Tests (>85% precision)
# ============================================================================
class TestBankAccountExtraction:
    """Bank account number extraction checks (AC-3.1.2)."""

    def test_extract_valid_bank_accounts(self, extractor):
        """Account numbers of 9, 12 and 14 digits are reported."""
        samples = {
            "Account: 123456789012": "123456789012",  # 12 digits
            "A/C No: 12345678901234": "12345678901234",  # 14 digits
            "Bank account 123456789": "123456789",  # 9 digits
        }
        for sentence, account in samples.items():
            intel, _ = extractor.extract(sentence)
            assert account in intel["bank_accounts"], f"Failed for: {sentence}"

    def test_exclude_phone_numbers_as_accounts(self, extractor):
        """No reported account number may be exactly ten digits long."""
        intel, _ = extractor.extract("Call 9876543210 for account details")
        for account in intel["bank_accounts"]:
            assert len(account) != 10

    def test_exclude_otp_codes(self, extractor):
        """A six-digit OTP-like number is not reported as an account."""
        intel, _ = extractor.extract("OTP: 123456 for account verification")
        assert "123456" not in intel["bank_accounts"]

    def test_exclude_repeated_digits(self, extractor):
        """A run of a single repeated digit is not reported as an account."""
        intel, _ = extractor.extract("Account: 111111111111")
        assert "111111111111" not in intel["bank_accounts"]

    def test_multiple_account_numbers(self, extractor):
        """Two account numbers in one sentence are both reported."""
        # 11- and 12-digit numbers chosen so they do not look like phone numbers.
        intel, _ = extractor.extract("Primary: 12345678901 Secondary: 234567890123")
        accounts = intel["bank_accounts"]
        assert len(accounts) >= 2
        for expected in ("12345678901", "234567890123"):
            assert expected in accounts

    def test_account_with_leading_zeros_excluded(self, extractor):
        """No reported account number starts with the digit 0."""
        intel, _ = extractor.extract("Account: 012345678901")
        # The shorter "12345678901" may be matched instead; that is acceptable.
        for account in intel["bank_accounts"]:
            assert not account.startswith("0")
# ============================================================================
# AC-3.1.3: IFSC Code Extraction Tests (>95% precision)
# ============================================================================
class TestIFSCExtraction:
    """IFSC code extraction checks (AC-3.1.3)."""

    def test_extract_valid_ifsc_codes(self, extractor):
        """Well-formed IFSC codes are reported for each sample sentence."""
        samples = {
            "IFSC: SBIN0001234": "SBIN0001234",  # SBI
            "Code: HDFC0123456": "HDFC0123456",  # HDFC
            "IFSC ICIC0000789": "ICIC0000789",  # ICICI
            "Bank AXIS0SAMPLE": "AXIS0SAMPLE",  # Axis
        }
        for sentence, code in samples.items():
            intel, _ = extractor.extract(sentence)
            assert code in intel["ifsc_codes"], f"Failed for: {sentence}"

    def test_ifsc_case_insensitive(self, extractor):
        """Lower-case input is matched and reported in upper case."""
        intel, _ = extractor.extract("ifsc: sbin0001234")
        assert "SBIN0001234" in intel["ifsc_codes"]

    def test_invalid_ifsc_format(self, extractor):
        """Malformed codes are never reported verbatim."""
        rejected = (
            "SBI0001234",  # Only 3 letters at start
            "SBIN1001234",  # 5th char not 0
            "SBINX001234",  # Invalid format
            "SBIN000123",  # Too short
        )
        for candidate in rejected:
            intel, _ = extractor.extract(f"IFSC: {candidate}")
            assert candidate not in intel["ifsc_codes"], f"Should exclude: {candidate}"

    def test_multiple_ifsc_codes(self, extractor):
        """Two codes in one sentence yield exactly two results."""
        intel, _ = extractor.extract("Primary SBIN0001234, Secondary HDFC0567890")
        codes = intel["ifsc_codes"]
        assert len(codes) == 2
        for expected in ("SBIN0001234", "HDFC0567890"):
            assert expected in codes
# ============================================================================
# AC-3.1.4: Phone Number Extraction Tests (>90% precision)
# ============================================================================
class TestPhoneNumberExtraction:
    """Tests for phone number extraction - AC-3.1.4."""

    def test_extract_indian_mobile_numbers(self, extractor):
        """Test extraction of Indian mobile numbers in several notations."""
        test_cases = [
            ("Call 9876543210", "+919876543210"),
            ("Phone: +919876543210", "+919876543210"),
            ("Mobile: +91-9876543210", "+919876543210"),
            ("Contact: 91 9876543210", "+919876543210"),
        ]
        for text, expected in test_cases:
            intel, _ = extractor.extract(text)
            assert expected in intel["phone_numbers"], f"Failed for: {text}"

    def test_normalize_phone_format(self, extractor):
        """Test that phone numbers are normalized to +91 format."""
        text = "Call 9876543210 or +91-8765432109 or 07654321098"
        intel, _ = extractor.extract(text)
        phones = intel["phone_numbers"]
        # Guard against a vacuous pass: without this, the loop below asserts
        # nothing at all when the extractor returns an empty list.
        assert phones, "Expected at least one phone number to be extracted"
        # All should be normalized to +91XXXXXXXXXX format
        for phone in phones:
            assert phone.startswith("+91")
            assert len(phone) == 13  # +91 + 10 digits

    def test_exclude_invalid_starting_digits(self, extractor):
        """Test that numbers not starting with 6-9 are excluded."""
        text = "Invalid: 0123456789 or 5123456789"
        intel, _ = extractor.extract(text)
        # An empty result is a valid outcome here, so no non-empty guard.
        for phone in intel["phone_numbers"]:
            # The 10 digits after +91 should start with 6-9
            assert phone[3] in "6789"

    def test_exclude_repeated_digits(self, extractor):
        """Test that repeated digit patterns are excluded."""
        text = "Phone: 9999999999"
        intel, _ = extractor.extract(text)
        assert "+919999999999" not in intel["phone_numbers"]

    def test_multiple_phone_numbers(self, extractor):
        """Test extraction of multiple phone numbers."""
        text = "Call 9876543210 or 8765432109 for details"
        intel, _ = extractor.extract(text)
        assert len(intel["phone_numbers"]) >= 2
# ============================================================================
# AC-3.1.5: Phishing Link Extraction Tests (>95% precision)
# ============================================================================
class TestPhishingLinkExtraction:
    """Phishing link extraction checks (AC-3.1.5)."""

    def test_extract_suspicious_links(self, extractor):
        """Each suspicious URL is reported verbatim."""
        urls = (
            "http://fake-bank.xyz/verify",
            "https://secure-banking.tk/login",
            "http://kyc-update.ml/verify",
        )
        for url in urls:
            intel, _ = extractor.extract(f"Click {url} now")
            assert url in intel["phishing_links"], f"Should extract: {url}"

    def test_exclude_legitimate_domains(self, extractor):
        """No reported link may contain one of the legitimate hosts."""
        trusted_hosts = ("google.com", "paytm.com", "sbi.co.in")
        urls = (
            "https://www.google.com",
            "https://www.paytm.com/",
            "https://www.sbi.co.in",
        )
        for url in urls:
            intel, _ = extractor.extract(f"Visit {url}")
            # Every reported link is checked against every trusted host.
            for reported in intel["phishing_links"]:
                for host in trusted_hosts:
                    assert host not in reported

    def test_flag_ip_based_urls(self, extractor):
        """A URL whose host is an IP address produces at least one link."""
        intel, _ = extractor.extract("Visit http://192.168.1.1/verify")
        assert len(intel["phishing_links"]) > 0

    def test_flag_url_shorteners(self, extractor):
        """Shortener URLs are reported verbatim."""
        for url in ("http://bit.ly/abc123", "http://tinyurl.com/xyz"):
            intel, _ = extractor.extract(f"Click {url}")
            assert url in intel["phishing_links"], f"Should flag: {url}"

    def test_flag_http_non_https(self, extractor):
        """A plain-HTTP link to an unknown domain produces at least one link."""
        intel, _ = extractor.extract("Visit http://unknown-bank.com/login")
        assert len(intel["phishing_links"]) > 0

    def test_multiple_phishing_links(self, extractor):
        """Two suspicious URLs in one sentence yield at least two links."""
        sentence = "Click http://fake1.xyz or http://fake2.tk for verification"
        intel, _ = extractor.extract(sentence)
        assert len(intel["phishing_links"]) >= 2
# ============================================================================
# AC-3.3.1: Devanagari Digit Conversion Tests (100% accurate)
# ============================================================================
class TestDevanagariConversion:
    """Devanagari digit conversion checks (AC-3.3.1)."""

    def test_convert_all_devanagari_digits(self, extractor):
        """All ten Devanagari digits convert to their ASCII counterparts."""
        ascii_text = extractor._convert_devanagari_digits("Account: ०१२३४५६७८९")
        assert "0123456789" in ascii_text

    def test_mixed_devanagari_and_ascii(self, extractor):
        """ASCII digits interleaved with Devanagari ones are left in place."""
        ascii_text = extractor._convert_devanagari_digits("Phone: ९८७६5४3210")
        assert "9876543210" in ascii_text

    def test_devanagari_in_upi_context(self, extractor):
        """A Devanagari amount inside a payment sentence is converted."""
        ascii_text = extractor._convert_devanagari_digits("Pay ₹५००० to scammer@paytm")
        assert "5000" in ascii_text

    def test_devanagari_phone_number(self, extractor):
        """A phone number written in Devanagari digits is extracted."""
        intel, _ = extractor.extract("Call ९८७६५४३२१०")
        assert "+919876543210" in intel["phone_numbers"]

    def test_devanagari_bank_account(self, extractor):
        """An account number written in Devanagari digits is extracted."""
        intel, _ = extractor.extract("Account: १२३४५६७८९०१२")
        assert "123456789012" in intel["bank_accounts"]

    def test_full_devanagari_text(self, extractor):
        """Digits in a fully Devanagari sentence surface in some result list."""
        # The sentence carries the twelve digits 987654321012.
        intel, _ = extractor.extract("खाता संख्या: ९८७६५४३२१०१२")
        # Accept the number as either a bank account or a phone number.
        candidates = intel["bank_accounts"] + intel["phone_numbers"]
        assert any("9876543210" in value for value in candidates)
# ============================================================================
# Confidence Score Tests
# ============================================================================
class TestConfidenceCalculation:
    """Tests for confidence score calculation.

    Non-zero expected scores are compared with ``pytest.approx`` rather than
    ``==``: a score assembled by adding float weights may differ from the
    decimal literal by a rounding error, and exact equality would then fail
    for reasons unrelated to the extractor's behaviour.
    """

    def test_empty_intel_zero_confidence(self, extractor):
        """Test that empty intel gives 0 confidence."""
        _, confidence = extractor.extract("")
        assert confidence == 0.0

    def test_upi_only_confidence(self, extractor):
        """Test confidence with only UPI ID."""
        _, confidence = extractor.extract("Pay to scammer@paytm")
        assert confidence == pytest.approx(0.3)  # UPI weight is 0.3

    def test_upi_and_phone_confidence(self, extractor):
        """Test confidence with UPI and phone."""
        _, confidence = extractor.extract("Pay scammer@paytm or call 9876543210")
        assert confidence == pytest.approx(0.4)  # UPI(0.3) + phone(0.1)

    def test_full_intel_confidence(self, extractor):
        """Test confidence with all entity types."""
        text = (
            "\n"
            "Pay to fraud@paytm account 12345678901234\n"
            "IFSC SBIN0001234 call 9876543210\n"
            "verify http://fake.xyz\n"
        )
        _, confidence = extractor.extract(text)
        assert confidence == pytest.approx(1.0)

    def test_confidence_capped_at_1(self, extractor):
        """Test that confidence is capped at 1.0."""
        text = (
            "\n"
            "Multiple UPIs: a1@paytm b2@ybl c3@okaxis\n"
            "Multiple accounts: 12345678901 98765432109\n"
            "Multiple phones: 9876543210 8765432109\n"
        )
        _, confidence = extractor.extract(text)
        # Deliberately exact: the cap is a hard upper bound, not a target value.
        assert confidence <= 1.0
# ============================================================================
# Convenience Function Tests
# ============================================================================
class TestExtractIntelligenceFunction:
    """Tests for the convenience extract_intelligence function and the singleton helpers."""

    def test_function_returns_tuple(self):
        """Test function returns a (dict, float) tuple."""
        text = "Send to scammer@paytm"
        intel, confidence = extract_intelligence(text)
        assert isinstance(intel, dict)
        assert isinstance(confidence, float)

    def test_function_with_empty_text(self):
        """Test function handles empty text."""
        intel, confidence = extract_intelligence("")
        assert len(intel["upi_ids"]) == 0
        assert confidence == 0.0

    def test_function_with_complex_text(self):
        """Test function with realistic scam message."""
        text = (
            "\n"
            "Congratulations! You've won ₹10,00,000!\n"
            "To claim, send ₹500 to winner@paytm or transfer to\n"
            "account 12345678901234 IFSC HDFC0123456.\n"
            "Call +919876543210 for verification.\n"
            "Click http://claim-prize.xyz/verify\n"
        )
        intel, confidence = extract_intelligence(text)
        assert "winner@paytm" in intel["upi_ids"]
        assert "12345678901234" in intel["bank_accounts"]
        assert "HDFC0123456" in intel["ifsc_codes"]
        assert "+919876543210" in intel["phone_numbers"]
        assert any("claim-prize.xyz" in link for link in intel["phishing_links"])
        # approx instead of ==: the score is aggregated from several float
        # contributions, so allow for rounding error in the total.
        assert confidence == pytest.approx(1.0)

    def test_singleton_pattern(self):
        """Test that get_extractor returns same instance."""
        ext1 = get_extractor()
        ext2 = get_extractor()
        assert ext1 is ext2

    def test_reset_singleton(self):
        """Test that reset_extractor makes get_extractor return a new instance."""
        ext1 = get_extractor()
        reset_extractor()
        ext2 = get_extractor()
        assert ext1 is not ext2
class TestExtractFromMessages:
    """Checks for extract_from_messages over a list of message dicts."""

    def test_extract_from_message_list(self):
        """Entities from several turns end up in one combined result."""
        conversation = [
            {"turn": 1, "sender": "scammer", "message": "Send to fraud@paytm"},
            {"turn": 2, "sender": "agent", "message": "What is your account?"},
            {"turn": 3, "sender": "scammer", "message": "Account 12345678901234"},
        ]
        result, _ = extract_from_messages(conversation)
        assert "fraud@paytm" in result["upi_ids"]
        assert "12345678901234" in result["bank_accounts"]

    def test_extract_from_empty_messages(self):
        """An empty message list gives a confidence of 0.0."""
        _, score = extract_from_messages([])
        assert score == 0.0

    def test_extract_handles_missing_message_key(self):
        """An entry lacking the 'message' key does not block extraction."""
        conversation = [
            {"turn": 1, "sender": "scammer"},  # No message key
            {"turn": 2, "sender": "agent", "message": "Pay to test@paytm"},
        ]
        # Must not raise; the well-formed second entry is still processed.
        result, _ = extract_from_messages(conversation)
        assert "test@paytm" in result["upi_ids"]
# ============================================================================
# Edge Cases and Error Handling
# ============================================================================
class TestEdgeCases:
    """Unusual inputs: long text, Unicode, markup, None, whitespace."""

    def test_very_long_text(self, extractor):
        """Entities on either side of 10000 filler characters are found."""
        filler = "x" * 10000
        sentence = f"Send to test@paytm {filler} account 12345678901"
        intel, _ = extractor.extract(sentence)
        assert "test@paytm" in intel["upi_ids"]
        assert "12345678901" in intel["bank_accounts"]

    def test_unicode_text(self, extractor):
        """A currency sign and an emoji do not disturb extraction."""
        intel, _ = extractor.extract("Pay ₹5000 to scammer@paytm 📱 call 9876543210")
        assert "scammer@paytm" in intel["upi_ids"]
        assert "+919876543210" in intel["phone_numbers"]

    def test_special_characters(self, extractor):
        """Trailing punctuation does not hide the UPI ID."""
        intel, _ = extractor.extract("Pay to user@paytm!!! Account: 12345678901###")
        assert "user@paytm" in intel["upi_ids"]

    def test_html_content(self, extractor):
        """A UPI ID wrapped in HTML-like tags is still found."""
        markup = "<p>Pay to scammer@paytm</p> <a href='http://fake.xyz'>Click</a>"
        intel, _ = extractor.extract(markup)
        assert "scammer@paytm" in intel["upi_ids"]

    def test_none_text(self, extractor):
        """None either raises TypeError or gives a confidence of 0.0."""
        try:
            _, score = extractor.extract(None)
        except TypeError:
            # Rejecting a non-string input is an accepted outcome.
            return
        assert score == 0.0

    def test_newlines_and_tabs(self, extractor):
        """Newline and tab separators do not prevent extraction."""
        intel, _ = extractor.extract("Pay to:\n\tscammer@paytm\n\tAccount:\t12345678901")
        assert "scammer@paytm" in intel["upi_ids"]
        assert "12345678901" in intel["bank_accounts"]
# ============================================================================
# Acceptance Criteria Verification Tests
# ============================================================================
class TestAcceptanceCriteria:
    """One check per Task 7.1 acceptance criterion."""

    def test_ac_3_1_1_upi_precision(self, extractor):
        """AC-3.1.1: known UPI IDs are found; e-mail addresses are not."""
        # Positive cases: each ID must be extracted.
        for upi in (
            "user@paytm",
            "fraud@ybl",
            "scam@okaxis",
            "target@okhdfcbank",
            "victim@oksbi",
        ):
            intel, _ = extractor.extract(f"Pay to {upi}")
            assert upi in intel["upi_ids"], f"Failed for: {upi}"

        # Negative cases: e-mail addresses must not be reported as UPI IDs.
        for email in ("user@gmail.com", "contact@company.org"):
            intel, _ = extractor.extract(f"Email: {email}")
            assert email not in intel["upi_ids"], f"False positive: {email}"

    def test_ac_3_1_2_bank_account_precision(self, extractor):
        """AC-3.1.2: valid accounts are found; a phone number is not."""
        accounts = (
            "123456789012",  # 12 digits
            "12345678901234",  # 14 digits
            "123456789",  # 9 digits
        )
        for account in accounts:
            intel, _ = extractor.extract(f"Account: {account}")
            assert account in intel["bank_accounts"], f"Failed for: {account}"

        # A ten-digit phone number must stay out of the account list.
        intel, _ = extractor.extract("Phone: 9876543210")
        assert "9876543210" not in intel["bank_accounts"]

    def test_ac_3_1_3_ifsc_precision(self, extractor):
        """AC-3.1.3: each well-formed IFSC code is found."""
        for code in ("SBIN0001234", "HDFC0567890", "ICIC0BRANCH"):
            intel, _ = extractor.extract(f"IFSC: {code}")
            assert code in intel["ifsc_codes"], f"Failed for: {code}"

    def test_ac_3_1_4_phone_precision(self, extractor):
        """AC-3.1.4: each notation normalizes to the same +91 number."""
        notations = {
            "9876543210": "+919876543210",
            "+919876543210": "+919876543210",
            "+91-9876543210": "+919876543210",
        }
        for raw, normalized in notations.items():
            intel, _ = extractor.extract(f"Call: {raw}")
            assert normalized in intel["phone_numbers"], f"Failed for: {raw}"

    def test_ac_3_1_5_phishing_precision(self, extractor):
        """AC-3.1.5: each suspicious link is reported verbatim."""
        for url in (
            "http://fake-bank.xyz/verify",
            "http://bit.ly/scam",
            "http://192.168.1.1/login",
        ):
            intel, _ = extractor.extract(f"Click: {url}")
            assert url in intel["phishing_links"], f"Failed for: {url}"

    def test_ac_3_3_1_devanagari_100_percent(self, extractor):
        """AC-3.3.1: the ten Devanagari digits convert exactly to 0-9."""
        source_digits = "०१२३४५६७८९"
        result = extractor._convert_devanagari_digits(source_digits)
        assert result == "0123456789"

    def test_verification_example_from_tasks(self, extractor):
        """The TASKS.md example yields a UPI ID, a phone and confidence > 0.3."""
        sentence = "Send ₹5000 to scammer@paytm or call +919876543210"
        intel, score = extractor.extract(sentence)
        assert "scammer@paytm" in intel["upi_ids"]
        assert "+919876543210" in intel["phone_numbers"]
        assert score > 0.3