|
|
""" |
|
|
Email Classification Module. |
|
|
|
|
|
This module provides production-grade email classification using both |
|
|
rule-based and LLM-based approaches. It categorizes emails into predefined |
|
|
categories with confidence scoring and transaction detection. |
|
|
|
|
|
Categories: |
|
|
- finance: Bank transactions, investments, payments |
|
|
- shopping: E-commerce, orders, deliveries |
|
|
- work: Job-related, meetings, projects |
|
|
- newsletter: Digests, articles, subscriptions |
|
|
- promotional: Marketing, offers, discounts |
|
|
- social: Social networks, personal messages |
|
|
- other: Uncategorized emails |
|
|
|
|
|
Example: |
|
|
>>> from src.data.classifier import EmailClassifier |
|
|
>>> classifier = EmailClassifier() |
|
|
>>> result = classifier.classify( |
|
|
... subject="Transaction Alert", |
|
|
... sender="HDFC Bank", |
|
|
... body="Rs.500 debited from your account" |
|
|
... ) |
|
|
>>> print(result.category) |
|
|
    finance
|
|
>>> print(result.is_transaction) |
|
|
True |
|
|
|
|
|
Author: Ranjit Behera |
|
|
License: MIT |
|
|
""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import json |
|
|
import logging |
|
|
import re |
|
|
from dataclasses import dataclass, asdict |
|
|
from enum import Enum |
|
|
from typing import ( |
|
|
Any, |
|
|
ClassVar, |
|
|
Dict, |
|
|
List, |
|
|
Optional, |
|
|
Tuple, |
|
|
Union, |
|
|
) |
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
class EmailCategory(Enum):
    """
    Enumeration of email categories.

    Each member's value is the lowercase category name that appears in
    classification results.

    Attributes:
        FINANCE: Bank and financial transaction emails.
        SHOPPING: E-commerce and order-related emails.
        WORK: Professional and job-related emails.
        NEWSLETTER: News, articles, and subscription content.
        PROMOTIONAL: Marketing and advertising emails.
        SOCIAL: Social network and personal communication.
        OTHER: Emails that don't fit other categories.
    """

    FINANCE = "finance"
    SHOPPING = "shopping"
    WORK = "work"
    NEWSLETTER = "newsletter"
    PROMOTIONAL = "promotional"
    SOCIAL = "social"
    OTHER = "other"

    @classmethod
    def from_string(cls, value: str) -> EmailCategory:
        """
        Convert a string to an EmailCategory member.

        Matching is case-insensitive and ignores surrounding whitespace.
        This method never raises: an unrecognized name, or a value that is
        not a string at all (e.g. ``None``), is logged at WARNING level and
        mapped to ``OTHER``.

        Args:
            value: Category name as string.

        Returns:
            EmailCategory: The matching member, or ``OTHER`` as a fallback.
        """
        # Non-strings become None, which is not a member value, so they take
        # the same fallback path as an unknown name.
        normalized = value.strip().lower() if isinstance(value, str) else None
        try:
            return cls(normalized)
        except ValueError:
            logger.warning(f"Unknown category '{value}', defaulting to OTHER")
            return cls.OTHER
|
|
|
|
|
|
|
|
@dataclass
class ClassificationResult:
    """
    Result of email classification.

    Contains the predicted category, confidence level, reasoning,
    and whether the email is a financial transaction.

    Attributes:
        category: Predicted email category.
        confidence: Confidence level ('high', 'medium', 'low'). Case and
            surrounding whitespace are normalized on construction; any
            other value is replaced by 'low' with a warning.
        reason: Human-readable explanation for classification.
        is_transaction: True if email is a financial transaction.
        scores: Optional dict of category scores for debugging.

    Example:
        >>> result = ClassificationResult(
        ...     category="finance",
        ...     confidence="high",
        ...     reason="Contains debit keywords and amount",
        ...     is_transaction=True
        ... )
        >>> result.to_dict()
        {'category': 'finance', 'confidence': 'high', ...}
    """

    category: str
    confidence: str
    reason: str
    is_transaction: bool = False
    scores: Optional[Dict[str, float]] = None

    # The only confidence labels a result may carry (a ClassVar, so it is
    # not a dataclass field and does not appear in to_dict()).
    VALID_CONFIDENCE_LEVELS: ClassVar[set] = {"high", "medium", "low"}

    def __post_init__(self) -> None:
        """Normalize and validate the confidence label."""
        # Accept "High" / " medium " style labels; a non-string value maps to
        # None so the membership test below cannot fail on unhashable input.
        normalized = (
            self.confidence.strip().lower()
            if isinstance(self.confidence, str)
            else None
        )
        if normalized in self.VALID_CONFIDENCE_LEVELS:
            self.confidence = normalized
            return

        logger.warning(f"Invalid confidence '{self.confidence}', setting to 'low'")
        self.confidence = "low"

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert result to dictionary.

        The 'scores' key is omitted when no scores were recorded.

        Returns:
            Dict[str, Any]: Classification result as dictionary.
        """
        result = asdict(self)
        if self.scores is None:
            del result['scores']
        return result

    def to_json(self) -> str:
        """
        Convert result to JSON string.

        Returns:
            str: JSON representation of to_dict(), indented by 2 spaces.
        """
        return json.dumps(self.to_dict(), indent=2)
|
|
|
|
|
|
|
|
class EmailClassifier:
    """
    Email classifier with a rule-based mode and an optional LLM mode.

    The rule-based mode scores each category by looking for known sender
    and keyword patterns in the email text, and flags transactions by
    keyword. The LLM mode sends a prompt to a model loaded through
    ``mlx_lm`` and parses the JSON reply.

    Features:
        - Multi-pattern rule-based classification
        - Sender-based categorization
        - Transaction detection
        - Confidence scoring
        - Optional LLM integration

    Attributes:
        use_llm: Whether to use LLM for classification.
        debug: Whether per-category scores are attached to results.
        model: LLM model instance (if use_llm=True), otherwise None.
        tokenizer: LLM tokenizer (if use_llm=True), otherwise None.

    Example:
        >>> classifier = EmailClassifier()
        >>> result = classifier.classify(
        ...     subject="Your order has shipped",
        ...     sender="Amazon.in",
        ...     body="Your order #123 is on the way"
        ... )
        >>> print(result.category)
        shopping

    Note:
        For production use, the rule-based classifier is recommended
        due to its speed and consistency. LLM mode requires additional
        dependencies and model loading time.
    """

    # Substrings that mark an email as a transaction in rule-based mode.
    TRANSACTION_KEYWORDS: ClassVar[set] = {
        'debited', 'credited', 'transaction', 'transfer', 'payment',
        'withdrawn', 'deposited', 'paid', 'received', 'upi', 'neft', 'imps',
        'rtgs', 'mandate', 'autopay', 'emi', 'refund', 'cashback'
    }

    # Per-category patterns. 'senders' are matched against the sender only
    # (3 points each); 'keywords' against subject + sender + body (1 point).
    CATEGORY_PATTERNS: ClassVar[Dict[EmailCategory, Dict[str, List[str]]]] = {
        EmailCategory.FINANCE: {
            'senders': [
                'hdfc', 'icici', 'sbi', 'axis', 'kotak', 'pnb', 'bob',
                'canara', 'union bank', 'idbi', 'yes bank', 'indusind',
                'bank', 'banking', 'credit card', 'mutual fund', 'zerodha',
                'groww', 'upstox', 'cred', 'slice', 'paytm', 'phonepe', 'gpay'
            ],
            'keywords': [
                'transaction', 'statement', 'balance', 'debited', 'credited',
                'payment', 'transfer', 'upi', 'neft', 'imps', 'account',
                'investment', 'dividend', 'interest', 'emi', 'loan', 'credit',
                'debit', 'mandate', 'autopay', 'sip', 'mutual fund'
            ],
        },
        EmailCategory.SHOPPING: {
            'senders': [
                'amazon', 'flipkart', 'myntra', 'ajio', 'nykaa', 'meesho',
                'bigbasket', 'zepto', 'blinkit', 'swiggy', 'zomato', 'dunzo',
                'decathlon', 'ikea', 'pepperfry', 'urban ladder'
            ],
            'keywords': [
                'order', 'shipped', 'delivered', 'delivery', 'tracking',
                'purchase', 'cart', 'checkout', 'invoice', 'receipt',
                'refund', 'return', 'exchange', 'dispatched', 'out for delivery'
            ],
        },
        EmailCategory.WORK: {
            'senders': [
                'linkedin', 'indeed', 'naukri', 'glassdoor', 'angel.co',
                'slack', 'zoom', 'teams', 'meet', 'jira', 'confluence',
                'github', 'gitlab', 'bitbucket', 'notion', 'asana', 'trello'
            ],
            'keywords': [
                'interview', 'meeting', 'agenda', 'project', 'deadline',
                'review', 'standup', 'sprint', 'task', 'report', 'submission',
                'application', 'resume', 'cv', 'job', 'position', 'hiring',
                'salary', 'offer letter', 'joining', 'onboarding'
            ],
        },
        EmailCategory.NEWSLETTER: {
            'senders': [
                'substack', 'medium', 'morning brew', 'digest', 'newsletter',
                'daily', 'weekly', 'update', 'news', 'times', 'hindu', 'express'
            ],
            'keywords': [
                'newsletter', 'digest', 'weekly', 'daily', 'update', 'news',
                'article', 'read', 'story', 'trending', 'top stories',
                'this week', 'this month', 'roundup', 'edition'
            ],
        },
        EmailCategory.PROMOTIONAL: {
            'senders': [
                'offer', 'deal', 'discount', 'sale', 'promo', 'marketing',
                'shopify', 'mailchimp', 'campaign'
            ],
            'keywords': [
                'offer', 'discount', 'sale', 'deal', 'coupon', 'promo',
                'limited time', 'exclusive', 'special', 'hurry', 'ends soon',
                'flash sale', 'clearance', 'save', '% off', 'free shipping',
                'buy now', 'shop now', 'don\'t miss'
            ],
        },
        EmailCategory.SOCIAL: {
            'senders': [
                'facebook', 'instagram', 'twitter', 'whatsapp', 'telegram',
                'snapchat', 'tiktok', 'youtube', 'reddit', 'discord', 'quora'
            ],
            'keywords': [
                'friend request', 'like', 'comment', 'share', 'mentioned',
                'tagged', 'message', 'follow', 'notification', 'birthday',
                'invitation', 'event', 'rsvp', 'group'
            ],
        },
    }

    # Prompt template for LLM mode; filled with str.format, hence the doubled
    # braces around the literal JSON example.
    CLASSIFICATION_PROMPT: ClassVar[str] = """You are an email classifier. Analyze this email and categorize it.

EMAIL:
Subject: {subject}
From: {sender}
Body: {body}

TASK:
Classify this email into exactly ONE category.

CATEGORIES:
- finance: Banks, payments, transactions, investments, credit cards, loans
- shopping: Orders, deliveries, purchases, e-commerce, food delivery
- work: Job-related, recruitment, office, meetings, projects
- newsletter: Digests, subscriptions, blogs, articles
- promotional: Marketing, offers, discounts, advertisements
- social: Social networks, personal messages, invitations
- other: Anything that doesn't fit above

OUTPUT FORMAT (JSON only, no other text):
{{"category": "<category>", "confidence": "<high/medium/low>", "reason": "<brief reason>"}}
"""

    def __init__(
        self,
        use_llm: bool = False,
        model_path: Optional[str] = None,
        debug: bool = False
    ) -> None:
        """
        Initialize the EmailClassifier.

        Args:
            use_llm: If True, load a model and use it for classification.
            model_path: Path to LLM model (required if use_llm=True).
            debug: If True, set the module logger to DEBUG and attach
                per-category scores to rule-based results.

        Example:
            >>> classifier = EmailClassifier()  # Rule-based
            >>> classifier = EmailClassifier(use_llm=True, model_path="path/to/model")

        Raises:
            ValueError: If use_llm=True but model_path not provided.
            ImportError: If use_llm=True and mlx_lm is not installed.
        """
        self.use_llm = use_llm
        self.debug = debug
        self.model = None
        self.tokenizer = None

        if debug:
            # Affects the shared module-level logger, not just this instance.
            logger.setLevel(logging.DEBUG)

        if use_llm:
            if not model_path:
                raise ValueError("model_path required when use_llm=True")
            self._load_model(model_path)

        logger.info(f"EmailClassifier initialized (use_llm={use_llm})")

    def _load_model(self, model_path: str) -> None:
        """
        Load the LLM model and tokenizer via ``mlx_lm.load``.

        Args:
            model_path: Path handed to ``mlx_lm.load``.

        Raises:
            ImportError: If mlx_lm is not installed.
            Exception: Any error from loading is logged and re-raised.
        """
        try:
            # Imported lazily so rule-based use does not require mlx_lm.
            from mlx_lm import load
            self.model, self.tokenizer = load(model_path)
            logger.info(f"Loaded LLM from {model_path}")
        except ImportError:
            logger.error("mlx_lm not installed. Install with: pip install mlx-lm")
            raise
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise

    def classify(
        self,
        subject: str = "",
        sender: str = "",
        body: str = ""
    ) -> ClassificationResult:
        """
        Classify an email into a category.

        Uses the LLM when ``use_llm`` is set and a model is loaded, otherwise
        the rule-based classifier. This method does not raise: any exception
        during classification is logged and reported as an 'other' result.

        Args:
            subject: Email subject line.
            sender: Sender name or email address.
            body: Email body text.

        Returns:
            ClassificationResult: Classification with category and confidence.

        Example:
            >>> classifier = EmailClassifier()
            >>> result = classifier.classify(
            ...     subject="Transaction Alert",
            ...     sender="HDFC Bank",
            ...     body="Rs.500 debited from your account"
            ... )
            >>> print(result.category)
            finance
            >>> print(result.is_transaction)
            True

        Note:
            At least one of subject, sender, or body should be non-empty.
            Empty input returns 'other' category with low confidence.
            A field passed as None is treated as an empty string.
        """
        # A missing field (None) must not break classification on the others.
        subject, sender, body = (
            "" if part is None else part for part in (subject, sender, body)
        )

        if not (subject or sender or body):
            logger.warning("Empty input provided")
            return ClassificationResult(
                category=EmailCategory.OTHER.value,
                confidence="low",
                reason="No content to classify",
                is_transaction=False
            )

        try:
            if self.use_llm and self.model is not None:
                return self._classify_llm(subject, sender, body)
            return self._classify_rules(subject, sender, body)
        except Exception as e:
            # Deliberate catch-all boundary: callers always get a result.
            logger.error(f"Classification failed: {e}", exc_info=True)
            return ClassificationResult(
                category=EmailCategory.OTHER.value,
                confidence="low",
                reason=f"Classification error: {str(e)}",
                is_transaction=False
            )

    def _classify_rules(
        self,
        subject: str,
        sender: str,
        body: str
    ) -> ClassificationResult:
        """
        Classify using the sender/keyword pattern tables.

        Each sender pattern found in the sender adds 3 points and each
        keyword found in subject + sender + body adds 1 point; the highest
        scoring category wins (first in table order on a tie). Confidence
        is 'high' from 5 points, 'medium' from 3, otherwise 'low'.

        NOTE(review): matching is plain substring containment, so short
        patterns such as 'emi' also match inside longer words ('premium').

        Args:
            subject: Email subject line.
            sender: Sender name or email address.
            body: Email body text.

        Returns:
            ClassificationResult: Result with up to three matched patterns in
            the reason, and per-category scores when debug is enabled.
        """
        combined = f"{subject} {sender} {body}".lower()
        sender_lower = sender.lower()  # loop-invariant, computed once

        is_transaction = any(kw in combined for kw in self.TRANSACTION_KEYWORDS)

        scores: Dict[EmailCategory, int] = {}
        reasons: Dict[EmailCategory, List[str]] = {}

        for category, patterns in self.CATEGORY_PATTERNS.items():
            score = 0
            matched: List[str] = []

            for pattern in patterns.get('senders', []):
                if pattern in sender_lower:
                    score += 3
                    matched.append(f"sender:{pattern}")

            for keyword in patterns.get('keywords', []):
                if keyword in combined:
                    score += 1
                    # Keywords only fill the explanation up to three entries.
                    if len(matched) < 3:
                        matched.append(keyword)

            if score > 0:
                scores[category] = score
                reasons[category] = matched

        if not scores:
            return ClassificationResult(
                category=EmailCategory.OTHER.value,
                confidence="low",
                reason="No matching patterns found",
                is_transaction=is_transaction
            )

        best_category = max(scores, key=lambda k: scores[k])
        best_score = scores[best_category]
        best_reasons = reasons.get(best_category, [])

        if best_score >= 5:
            confidence = "high"
        elif best_score >= 3:
            confidence = "medium"
        else:
            confidence = "low"

        reason = f"Matched: {', '.join(best_reasons[:3])}"

        logger.debug(f"Classification: {best_category.value} ({confidence}), scores: {scores}")

        return ClassificationResult(
            category=best_category.value,
            confidence=confidence,
            reason=reason,
            is_transaction=is_transaction,
            scores={k.value: v for k, v in scores.items()} if self.debug else None
        )

    def _classify_llm(
        self,
        subject: str,
        sender: str,
        body: str
    ) -> ClassificationResult:
        """
        Classify by prompting the loaded LLM.

        The body is cut to its first 1000 characters before being placed in
        the prompt, and generation is limited to 100 tokens.

        Args:
            subject: Email subject line.
            sender: Sender name or email address.
            body: Email body text.

        Returns:
            ClassificationResult: Parsed from the model's reply.
        """
        from mlx_lm import generate

        max_body_length = 1000
        body_truncated = body[:max_body_length]  # slicing is a no-op on short text

        prompt = self.CLASSIFICATION_PROMPT.format(
            subject=subject,
            sender=sender,
            body=body_truncated
        )

        response = generate(
            self.model,
            self.tokenizer,
            prompt=prompt,
            max_tokens=100,
            verbose=False
        )

        return self._parse_llm_response(response)

    def _parse_llm_response(self, response: str) -> ClassificationResult:
        """
        Parse the LLM's JSON reply into a ClassificationResult.

        The first brace-delimited span without a nested closing brace is
        taken as the JSON object. Unknown or non-string categories become
        'other'; confidence is lowercased and stripped before validation; a
        missing or empty reason becomes "LLM classification". In this mode
        is_transaction is True exactly when the category is 'finance'.

        Args:
            response: Raw text generated by the model.

        Returns:
            ClassificationResult: Parsed result, or an 'other'/low result
            when no JSON is found or it cannot be decoded.
        """
        json_match = re.search(r'\{[^}]+\}', response, re.DOTALL)

        if not json_match:
            logger.warning(f"No JSON found in LLM response: {response[:100]}")
            return ClassificationResult(
                category=EmailCategory.OTHER.value,
                confidence="low",
                reason="Failed to parse LLM response",
                is_transaction=False
            )

        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError as e:
            logger.warning(f"JSON parse error: {e}")
            return ClassificationResult(
                category=EmailCategory.OTHER.value,
                confidence="low",
                reason="Invalid JSON in response",
                is_transaction=False
            )

        # The model may emit null/numbers/odd casing; never call str methods
        # on a value that is not a string.
        raw_category = data.get("category", "other")
        category = raw_category.strip().lower() if isinstance(raw_category, str) else "other"
        if category not in [c.value for c in EmailCategory]:
            category = "other"

        # Stringify so validation downstream never sees an unhashable value.
        confidence = str(data.get("confidence", "medium")).strip().lower()
        reason = str(data.get("reason") or "LLM classification")

        return ClassificationResult(
            category=category,
            confidence=confidence,
            reason=reason,
            is_transaction=category == "finance"
        )

    def is_financial_email(self, subject: str, sender: str, body: str) -> bool:
        """
        Check whether an email is finance-related.

        Runs the full classify() pipeline and reduces its result to a
        boolean, so it costs the same as a normal classification.

        Args:
            subject: Email subject.
            sender: Email sender.
            body: Email body.

        Returns:
            bool: True if the category is 'finance' or the email was
            flagged as a transaction.
        """
        result = self.classify(subject, sender, body)
        return result.category == EmailCategory.FINANCE.value or result.is_transaction
|
|
|
|
|
|
|
|
|
|
|
def classify_email(
    subject: str = "",
    sender: str = "",
    body: str = ""
) -> ClassificationResult:
    """
    Classify a single email with a freshly created default classifier.

    A new EmailClassifier with default settings is built on every call.

    Args:
        subject: Email subject.
        sender: Email sender.
        body: Email body.

    Returns:
        ClassificationResult: Classification result.

    Example:
        >>> from src.data.classifier import classify_email
        >>> result = classify_email(
        ...     subject="Order Shipped",
        ...     sender="Amazon",
        ...     body="Your order is on the way"
        ... )
        >>> print(result.category)
        shopping
    """
    one_shot_classifier = EmailClassifier()
    return one_shot_classifier.classify(subject, sender, body)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo run: classify a few sample emails with debug logging enabled and
    # print a short report for each one.
    logging.basicConfig(level=logging.DEBUG)

    classifier = EmailClassifier(debug=True)

    # (subject, sender, body) triples.
    test_cases = [
        ("Transaction Alert", "HDFC Bank", "Rs.500 debited from your account"),
        ("Your order has shipped", "Amazon.in", "Order #123 is on the way"),
        ("Interview Invitation", "LinkedIn", "You have an interview scheduled"),
        ("Weekly Digest", "Substack", "Top 10 articles this week"),
    ]

    for subject, sender, body in test_cases:
        result = classifier.classify(subject, sender, body)
        report_lines = [
            f"\n{subject} | {sender}",
            f" → {result.category} ({result.confidence})",
            f" → {result.reason}",
            f" → Transaction: {result.is_transaction}",
        ]
        print("\n".join(report_lines))
|
|
|