# Ranjit Behera
# FinEE v1.0 - Finance Entity Extractor
# dcc24f8
"""
Email Classification Module.
This module provides production-grade email classification using both
rule-based and LLM-based approaches. It categorizes emails into predefined
categories with confidence scoring and transaction detection.
Categories:
- finance: Bank transactions, investments, payments
- shopping: E-commerce, orders, deliveries
- work: Job-related, meetings, projects
- newsletter: Digests, articles, subscriptions
- promotional: Marketing, offers, discounts
- social: Social networks, personal messages
- other: Uncategorized emails
Example:
>>> from src.data.classifier import EmailClassifier
>>> classifier = EmailClassifier()
>>> result = classifier.classify(
... subject="Transaction Alert",
... sender="HDFC Bank",
... body="Rs.500 debited from your account"
... )
>>> print(result.category)
finance
>>> print(result.is_transaction)
True
Author: Ranjit Behera
License: MIT
"""
from __future__ import annotations
import json
import logging
import re
from dataclasses import dataclass, asdict
from enum import Enum
from typing import (
Any,
ClassVar,
Dict,
List,
Optional,
Tuple,
Union,
)
# Module-level logger named after this module; every class and function in
# this file logs through it.
logger = logging.getLogger(__name__)
class EmailCategory(Enum):
    """
    Enumeration of email categories.

    Each category represents a distinct type of email with specific
    characteristics and handling requirements.

    Attributes:
        FINANCE: Bank and financial transaction emails.
        SHOPPING: E-commerce and order-related emails.
        WORK: Professional and job-related emails.
        NEWSLETTER: News, articles, and subscription content.
        PROMOTIONAL: Marketing and advertising emails.
        SOCIAL: Social network and personal communication.
        OTHER: Emails that don't fit other categories.
    """

    FINANCE = "finance"
    SHOPPING = "shopping"
    WORK = "work"
    NEWSLETTER = "newsletter"
    PROMOTIONAL = "promotional"
    SOCIAL = "social"
    OTHER = "other"

    @classmethod
    def from_string(cls, value: str) -> "EmailCategory":
        """
        Convert a string to an EmailCategory, falling back to OTHER.

        Matching is case-insensitive and ignores surrounding whitespace.
        This method never raises: an unknown name, or a value that is not
        a string at all (for example ``None``), is logged as a warning and
        mapped to ``EmailCategory.OTHER``. An existing ``EmailCategory``
        member is returned unchanged.

        Args:
            value: Category name as string.

        Returns:
            EmailCategory: Corresponding enum value, or ``OTHER`` when the
            value does not match any category.
        """
        if isinstance(value, cls):
            return value
        try:
            return cls(value.strip().lower())
        except (AttributeError, ValueError):
            # AttributeError covers non-string input that has no .strip().
            logger.warning(f"Unknown category '{value}', defaulting to OTHER")
            return cls.OTHER
@dataclass
class ClassificationResult:
    """
    Result of email classification.

    Contains the predicted category, confidence level, reasoning,
    and whether the email is a financial transaction.

    Attributes:
        category: Predicted email category.
        confidence: Confidence level ('high', 'medium', 'low'). The value is
            normalized (trimmed, lower-cased) on construction; anything that
            is not one of the valid levels is replaced with 'low'.
        reason: Human-readable explanation for classification.
        is_transaction: True if email is a financial transaction.
        scores: Optional dict of category scores for debugging.

    Example:
        >>> result = ClassificationResult(
        ...     category="finance",
        ...     confidence="high",
        ...     reason="Contains debit keywords and amount",
        ...     is_transaction=True
        ... )
        >>> result.to_dict()
        {'category': 'finance', 'confidence': 'high', ...}
    """

    category: str
    confidence: str
    reason: str
    is_transaction: bool = False
    scores: Optional[Dict[str, float]] = None

    # Allowed confidence labels; any other value is coerced to "low".
    VALID_CONFIDENCE_LEVELS: ClassVar[set] = {"high", "medium", "low"}

    def __post_init__(self) -> None:
        """Normalize and validate the confidence label."""
        # Only strings can be valid labels; checking the type first also
        # avoids a TypeError from testing an unhashable value against a set.
        if isinstance(self.confidence, str):
            normalized = self.confidence.strip().lower()
        else:
            normalized = None

        if normalized in self.VALID_CONFIDENCE_LEVELS:
            self.confidence = normalized
        else:
            logger.warning(f"Invalid confidence '{self.confidence}', setting to 'low'")
            self.confidence = "low"

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert result to dictionary.

        The 'scores' key is omitted when no scores were recorded.

        Returns:
            Dict[str, Any]: Classification result as dictionary.
        """
        result = asdict(self)
        if self.scores is None:
            del result['scores']
        return result

    def to_json(self) -> str:
        """
        Convert result to JSON string.

        Returns:
            str: JSON representation of to_dict(), indented by two spaces.
        """
        return json.dumps(self.to_dict(), indent=2)
class EmailClassifier:
    """
    Production-grade email classifier with rule-based and LLM support.

    This classifier uses a sophisticated pattern matching system to
    categorize emails and detect financial transactions. It can optionally
    use an LLM for more nuanced classification.

    Features:
        - Multi-pattern rule-based classification
        - Sender-based categorization
        - Transaction detection
        - Confidence scoring
        - Optional LLM integration

    Attributes:
        use_llm: Whether to use LLM for classification.
        debug: Whether debug logging and per-category scores are enabled.
        model: LLM model instance (if use_llm=True).
        tokenizer: LLM tokenizer (if use_llm=True).

    Example:
        >>> classifier = EmailClassifier()
        >>> result = classifier.classify(
        ...     subject="Your order has shipped",
        ...     sender="Amazon.in",
        ...     body="Your order #123 is on the way"
        ... )
        >>> print(result.category)
        shopping

    Note:
        For production use, the rule-based classifier is recommended
        due to its speed and consistency. LLM mode requires additional
        dependencies and model loading time.

        Rule matching is plain substring containment on lower-cased text,
        so a short keyword such as 'emi' also matches inside longer words
        (e.g. 'premium').
    """

    # Transaction detection keywords
    TRANSACTION_KEYWORDS: ClassVar[set] = {
        'debited', 'credited', 'transaction', 'transfer', 'payment',
        'withdrawn', 'deposited', 'paid', 'received', 'upi', 'neft', 'imps',
        'rtgs', 'mandate', 'autopay', 'emi', 'refund', 'cashback'
    }

    # Category patterns with senders and keywords
    CATEGORY_PATTERNS: ClassVar[Dict[EmailCategory, Dict[str, List[str]]]] = {
        EmailCategory.FINANCE: {
            'senders': [
                'hdfc', 'icici', 'sbi', 'axis', 'kotak', 'pnb', 'bob',
                'canara', 'union bank', 'idbi', 'yes bank', 'indusind',
                'bank', 'banking', 'credit card', 'mutual fund', 'zerodha',
                'groww', 'upstox', 'cred', 'slice', 'paytm', 'phonepe', 'gpay'
            ],
            'keywords': [
                'transaction', 'statement', 'balance', 'debited', 'credited',
                'payment', 'transfer', 'upi', 'neft', 'imps', 'account',
                'investment', 'dividend', 'interest', 'emi', 'loan', 'credit',
                'debit', 'mandate', 'autopay', 'sip', 'mutual fund'
            ],
        },
        EmailCategory.SHOPPING: {
            'senders': [
                'amazon', 'flipkart', 'myntra', 'ajio', 'nykaa', 'meesho',
                'bigbasket', 'zepto', 'blinkit', 'swiggy', 'zomato', 'dunzo',
                'decathlon', 'ikea', 'pepperfry', 'urban ladder'
            ],
            'keywords': [
                'order', 'shipped', 'delivered', 'delivery', 'tracking',
                'purchase', 'cart', 'checkout', 'invoice', 'receipt',
                'refund', 'return', 'exchange', 'dispatched', 'out for delivery'
            ],
        },
        EmailCategory.WORK: {
            'senders': [
                'linkedin', 'indeed', 'naukri', 'glassdoor', 'angel.co',
                'slack', 'zoom', 'teams', 'meet', 'jira', 'confluence',
                'github', 'gitlab', 'bitbucket', 'notion', 'asana', 'trello'
            ],
            'keywords': [
                'interview', 'meeting', 'agenda', 'project', 'deadline',
                'review', 'standup', 'sprint', 'task', 'report', 'submission',
                'application', 'resume', 'cv', 'job', 'position', 'hiring',
                'salary', 'offer letter', 'joining', 'onboarding'
            ],
        },
        EmailCategory.NEWSLETTER: {
            'senders': [
                'substack', 'medium', 'morning brew', 'digest', 'newsletter',
                'daily', 'weekly', 'update', 'news', 'times', 'hindu', 'express'
            ],
            'keywords': [
                'newsletter', 'digest', 'weekly', 'daily', 'update', 'news',
                'article', 'read', 'story', 'trending', 'top stories',
                'this week', 'this month', 'roundup', 'edition'
            ],
        },
        EmailCategory.PROMOTIONAL: {
            'senders': [
                'offer', 'deal', 'discount', 'sale', 'promo', 'marketing',
                'shopify', 'mailchimp', 'campaign'
            ],
            'keywords': [
                'offer', 'discount', 'sale', 'deal', 'coupon', 'promo',
                'limited time', 'exclusive', 'special', 'hurry', 'ends soon',
                'flash sale', 'clearance', 'save', '% off', 'free shipping',
                'buy now', 'shop now', 'don\'t miss'
            ],
        },
        EmailCategory.SOCIAL: {
            'senders': [
                'facebook', 'instagram', 'twitter', 'whatsapp', 'telegram',
                'snapchat', 'tiktok', 'youtube', 'reddit', 'discord', 'quora'
            ],
            'keywords': [
                'friend request', 'like', 'comment', 'share', 'mentioned',
                'tagged', 'message', 'follow', 'notification', 'birthday',
                'invitation', 'event', 'rsvp', 'group'
            ],
        },
    }

    # LLM prompt template
    CLASSIFICATION_PROMPT: ClassVar[str] = """You are an email classifier. Analyze this email and categorize it.
EMAIL:
Subject: {subject}
From: {sender}
Body: {body}
TASK:
Classify this email into exactly ONE category.
CATEGORIES:
- finance: Banks, payments, transactions, investments, credit cards, loans
- shopping: Orders, deliveries, purchases, e-commerce, food delivery
- work: Job-related, recruitment, office, meetings, projects
- newsletter: Digests, subscriptions, blogs, articles
- promotional: Marketing, offers, discounts, advertisements
- social: Social networks, personal messages, invitations
- other: Anything that doesn't fit above
OUTPUT FORMAT (JSON only, no other text):
{{"category": "<category>", "confidence": "<high/medium/low>", "reason": "<brief reason>"}}
"""

    def __init__(
        self,
        use_llm: bool = False,
        model_path: Optional[str] = None,
        debug: bool = False
    ) -> None:
        """
        Initialize the EmailClassifier.

        Args:
            use_llm: If True, use LLM for classification (slower but more accurate).
            model_path: Path to LLM model (required if use_llm=True).
            debug: If True, enable debug logging and include per-category
                scores in rule-based results.

        Example:
            >>> classifier = EmailClassifier()  # Rule-based
            >>> classifier = EmailClassifier(use_llm=True, model_path="path/to/model")

        Raises:
            ValueError: If use_llm=True but model_path not provided.
            ImportError: If use_llm=True and mlx_lm is not installed.
        """
        self.use_llm = use_llm
        self.debug = debug
        self.model = None
        self.tokenizer = None

        if debug:
            # Raises the level of the shared module logger, not just this instance.
            logger.setLevel(logging.DEBUG)

        if use_llm:
            if not model_path:
                raise ValueError("model_path required when use_llm=True")
            self._load_model(model_path)

        logger.info(f"EmailClassifier initialized (use_llm={use_llm})")

    def _load_model(self, model_path: str) -> None:
        """
        Load LLM model and tokenizer for classification.

        Args:
            model_path: Path handed to ``mlx_lm.load``.

        Raises:
            ImportError: If mlx_lm is not installed.
            Exception: Any error raised while loading is logged and re-raised.
        """
        try:
            # Imported lazily so rule-based use does not require mlx_lm.
            from mlx_lm import load
            self.model, self.tokenizer = load(model_path)
            logger.info(f"Loaded LLM from {model_path}")
        except ImportError:
            logger.error("mlx_lm not installed. Install with: pip install mlx-lm")
            raise
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise

    def classify(
        self,
        subject: str = "",
        sender: str = "",
        body: str = ""
    ) -> ClassificationResult:
        """
        Classify an email into a category.

        This method accepts the email components and returns a classification
        result with category, confidence, and reasoning. It does not raise:
        any error during classification is logged and reported as an 'other'
        result with low confidence.

        Args:
            subject: Email subject line.
            sender: Sender name or email address.
            body: Email body text.

        Returns:
            ClassificationResult: Classification with category and confidence.

        Example:
            >>> classifier = EmailClassifier()
            >>> result = classifier.classify(
            ...     subject="Transaction Alert",
            ...     sender="HDFC Bank",
            ...     body="Rs.500 debited from your account"
            ... )
            >>> print(result.category)
            finance
            >>> print(result.is_transaction)
            True

        Note:
            At least one of subject, sender, or body should be non-empty.
            Empty input returns 'other' category with low confidence.
            A field passed as None is treated as an empty string.
        """
        # Normalize missing fields so they are neither matched as the literal
        # text "None" nor break the string operations further down.
        subject = subject or ""
        sender = sender or ""
        body = body or ""

        # Validate input
        if not (subject or sender or body):
            logger.warning("Empty input provided")
            return ClassificationResult(
                category=EmailCategory.OTHER.value,
                confidence="low",
                reason="No content to classify",
                is_transaction=False
            )

        try:
            if self.use_llm and self.model is not None:
                return self._classify_llm(subject, sender, body)
            return self._classify_rules(subject, sender, body)
        except Exception as e:
            # Deliberate catch-all boundary: classification is best-effort.
            logger.error(f"Classification failed: {e}", exc_info=True)
            return ClassificationResult(
                category=EmailCategory.OTHER.value,
                confidence="low",
                reason=f"Classification error: {str(e)}",
                is_transaction=False
            )

    def _classify_rules(
        self,
        subject: str,
        sender: str,
        body: str
    ) -> ClassificationResult:
        """
        Classify using rule-based approach.

        Each category scores 3 points per sender pattern found in the sender
        and 1 point per keyword found anywhere in subject, sender or body.
        The highest score wins; on a tie the category listed first in
        CATEGORY_PATTERNS is chosen. Confidence is 'high' from 5 points,
        'medium' from 3, otherwise 'low'.

        Args:
            subject: Email subject line.
            sender: Sender name or email address.
            body: Email body text.

        Returns:
            ClassificationResult: Best category, or 'other' when nothing matched.
        """
        combined = f"{subject} {sender} {body}".lower()
        sender_lower = sender.lower()  # loop-invariant, computed once

        # Check for transaction first
        is_transaction = any(kw in combined for kw in self.TRANSACTION_KEYWORDS)

        # Score each category
        scores: Dict[EmailCategory, int] = {}
        reasons: Dict[EmailCategory, List[str]] = {}

        for category, patterns in self.CATEGORY_PATTERNS.items():
            score = 0
            matched: List[str] = []

            # Check sender patterns (strong signal)
            for pattern in patterns.get('senders', []):
                if pattern in sender_lower:
                    score += 3
                    matched.append(f"sender:{pattern}")

            # Check keyword patterns
            for keyword in patterns.get('keywords', []):
                if keyword in combined:
                    score += 1
                    if len(matched) < 3:  # Limit reasons
                        matched.append(keyword)

            if score > 0:
                scores[category] = score
                reasons[category] = matched

        # Handle no matches
        if not scores:
            return ClassificationResult(
                category=EmailCategory.OTHER.value,
                confidence="low",
                reason="No matching patterns found",
                is_transaction=is_transaction
            )

        # Get highest scoring category
        best_category = max(scores, key=lambda k: scores[k])
        best_score = scores[best_category]
        best_reasons = reasons.get(best_category, [])

        # Determine confidence based on score
        if best_score >= 5:
            confidence = "high"
        elif best_score >= 3:
            confidence = "medium"
        else:
            confidence = "low"

        reason = f"Matched: {', '.join(best_reasons[:3])}"

        # Lazy %-formatting: the scores dict is only rendered when DEBUG is on.
        logger.debug(
            "Classification: %s (%s), scores: %s",
            best_category.value, confidence, scores
        )

        return ClassificationResult(
            category=best_category.value,
            confidence=confidence,
            reason=reason,
            is_transaction=is_transaction,
            scores={k.value: v for k, v in scores.items()} if self.debug else None
        )

    def _classify_llm(
        self,
        subject: str,
        sender: str,
        body: str
    ) -> ClassificationResult:
        """
        Classify using LLM.

        Builds CLASSIFICATION_PROMPT from the email fields, generates a
        completion with the loaded model and parses its JSON answer.

        Args:
            subject: Email subject line.
            sender: Sender name or email address.
            body: Email body text; only the first 1000 characters are sent.

        Returns:
            ClassificationResult: Parsed LLM classification.
        """
        from mlx_lm import generate

        # Truncate body if too long (slicing is a no-op for shorter bodies)
        max_body_length = 1000
        body_truncated = body[:max_body_length]

        # Build prompt
        prompt = self.CLASSIFICATION_PROMPT.format(
            subject=subject,
            sender=sender,
            body=body_truncated
        )

        # Generate response
        response = generate(
            self.model,
            self.tokenizer,
            prompt=prompt,
            max_tokens=100,
            verbose=False
        )

        # Parse JSON response
        return self._parse_llm_response(response)

    @staticmethod
    def _first_json_object(text: str) -> Optional[Dict[str, Any]]:
        """Return the first non-empty JSON object embedded in text, or None."""
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                # raw_decode parses one JSON value starting at `start` and
                # ignores trailing text, so nested objects and braces inside
                # string values are handled correctly.
                candidate, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict) and candidate:
                return candidate
            start = text.find("{", start + 1)
        return None

    def _parse_llm_response(self, response: str) -> ClassificationResult:
        """
        Parse LLM JSON response into ClassificationResult.

        The first decodable JSON object in the response is used. Unknown or
        non-string categories become 'other'; confidence is lower-cased and
        non-string confidence becomes 'low'. is_transaction is set only when
        the category is 'finance'.

        Args:
            response: Raw text generated by the LLM.

        Returns:
            ClassificationResult: Parsed result, or an 'other'/'low' result
            when no usable JSON object is present.
        """
        data = self._first_json_object(response)

        if data is None:
            if "{" not in response:
                logger.warning(f"No JSON found in LLM response: {response[:100]}")
                failure_reason = "Failed to parse LLM response"
            else:
                logger.warning(
                    f"JSON parse error: no valid JSON object in LLM response: {response[:100]}"
                )
                failure_reason = "Invalid JSON in response"
            return ClassificationResult(
                category=EmailCategory.OTHER.value,
                confidence="low",
                reason=failure_reason,
                is_transaction=False
            )

        raw_category = data.get("category", "other")
        category = raw_category.strip().lower() if isinstance(raw_category, str) else "other"

        # Validate category
        if category not in {c.value for c in EmailCategory}:
            category = "other"

        # Models often capitalize the label ("High"); normalize before validation.
        raw_confidence = data.get("confidence", "medium")
        confidence = raw_confidence.strip().lower() if isinstance(raw_confidence, str) else "low"

        reason = data.get("reason", "LLM classification")
        if reason is None:
            reason = "LLM classification"
        elif not isinstance(reason, str):
            reason = str(reason)

        return ClassificationResult(
            category=category,
            confidence=confidence,
            reason=reason,
            is_transaction=category == "finance"
        )

    def is_financial_email(self, subject: str, sender: str, body: str) -> bool:
        """
        Check whether an email is finance-related.

        Convenience wrapper around classify(): it runs the full
        classification and reduces the result to a boolean.

        Args:
            subject: Email subject.
            sender: Email sender.
            body: Email body.

        Returns:
            bool: True if the category is 'finance' or the email was
            flagged as a transaction.
        """
        result = self.classify(subject, sender, body)
        return result.category == EmailCategory.FINANCE.value or result.is_transaction
# Module-level shortcut around EmailClassifier.classify
def classify_email(
    subject: str = "",
    sender: str = "",
    body: str = ""
) -> ClassificationResult:
    """
    Classify a single email with a freshly created default classifier.

    A new rule-based EmailClassifier is built on every call, so no
    instance has to be managed by the caller.

    Args:
        subject: Email subject.
        sender: Email sender.
        body: Email body.

    Returns:
        ClassificationResult: Classification result.

    Example:
        >>> from src.data.classifier import classify_email
        >>> result = classify_email(
        ...     subject="Order Shipped",
        ...     sender="Amazon",
        ...     body="Your order is on the way"
        ... )
        >>> print(result.category)
        shopping
    """
    one_shot_classifier = EmailClassifier()
    return one_shot_classifier.classify(subject=subject, sender=sender, body=body)
if __name__ == "__main__":
    # Self-test: run the rule-based classifier over a few sample emails
    # and print the outcome of each one.
    logging.basicConfig(level=logging.DEBUG)
    demo_classifier = EmailClassifier(debug=True)

    # Each sample is (subject, sender, body).
    samples = (
        ("Transaction Alert", "HDFC Bank", "Rs.500 debited from your account"),
        ("Your order has shipped", "Amazon.in", "Order #123 is on the way"),
        ("Interview Invitation", "LinkedIn", "You have an interview scheduled"),
        ("Weekly Digest", "Substack", "Top 10 articles this week"),
    )

    for sample in samples:
        subject, sender, _body = sample
        outcome = demo_classifier.classify(*sample)
        report_lines = (
            f"\n{subject} | {sender}",
            f" → {outcome.category} ({outcome.confidence})",
            f" → {outcome.reason}",
            f" → Transaction: {outcome.is_transaction}",
        )
        for line in report_lines:
            print(line)