""" Email Classification Module. This module provides production-grade email classification using both rule-based and LLM-based approaches. It categorizes emails into predefined categories with confidence scoring and transaction detection. Categories: - finance: Bank transactions, investments, payments - shopping: E-commerce, orders, deliveries - work: Job-related, meetings, projects - newsletter: Digests, articles, subscriptions - promotional: Marketing, offers, discounts - social: Social networks, personal messages - other: Uncategorized emails Example: >>> from src.data.classifier import EmailClassifier >>> classifier = EmailClassifier() >>> result = classifier.classify( ... subject="Transaction Alert", ... sender="HDFC Bank", ... body="Rs.500 debited from your account" ... ) >>> print(result.category) 'finance' >>> print(result.is_transaction) True Author: Ranjit Behera License: MIT """ from __future__ import annotations import json import logging import re from dataclasses import dataclass, asdict from enum import Enum from typing import ( Any, ClassVar, Dict, List, Optional, Tuple, Union, ) # Configure module logger logger = logging.getLogger(__name__) class EmailCategory(Enum): """ Enumeration of email categories. Each category represents a distinct type of email with specific characteristics and handling requirements. Attributes: FINANCE: Bank and financial transaction emails. SHOPPING: E-commerce and order-related emails. WORK: Professional and job-related emails. NEWSLETTER: News, articles, and subscription content. PROMOTIONAL: Marketing and advertising emails. SOCIAL: Social network and personal communication. OTHER: Emails that don't fit other categories. """ FINANCE = "finance" SHOPPING = "shopping" WORK = "work" NEWSLETTER = "newsletter" PROMOTIONAL = "promotional" SOCIAL = "social" OTHER = "other" @classmethod def from_string(cls, value: str) -> EmailCategory: """ Convert string to EmailCategory enum. Args: value: Category name as string. Returns: EmailCategory: Corresponding enum value. Raises: ValueError: If value doesn't match any category. """ try: return cls(value.lower()) except ValueError: logger.warning(f"Unknown category '{value}', defaulting to OTHER") return cls.OTHER @dataclass class ClassificationResult: """ Result of email classification. Contains the predicted category, confidence level, reasoning, and whether the email is a financial transaction. Attributes: category: Predicted email category. confidence: Confidence level ('high', 'medium', 'low'). reason: Human-readable explanation for classification. is_transaction: True if email is a financial transaction. scores: Optional dict of category scores for debugging. Example: >>> result = ClassificationResult( ... category="finance", ... confidence="high", ... reason="Contains debit keywords and amount", ... is_transaction=True ... ) >>> result.to_dict() {'category': 'finance', 'confidence': 'high', ...} """ category: str confidence: str reason: str is_transaction: bool = False scores: Optional[Dict[str, float]] = None # Validation VALID_CONFIDENCE_LEVELS: ClassVar[set] = {"high", "medium", "low"} def __post_init__(self) -> None: """Validate classification result.""" if self.confidence not in self.VALID_CONFIDENCE_LEVELS: logger.warning(f"Invalid confidence '{self.confidence}', setting to 'low'") self.confidence = "low" def to_dict(self) -> Dict[str, Any]: """ Convert result to dictionary. Returns: Dict[str, Any]: Classification result as dictionary. """ result = asdict(self) if self.scores is None: del result['scores'] return result def to_json(self) -> str: """ Convert result to JSON string. Returns: str: JSON representation. """ return json.dumps(self.to_dict(), indent=2) class EmailClassifier: """ Production-grade email classifier with rule-based and LLM support. This classifier uses a sophisticated pattern matching system to categorize emails and detect financial transactions. It can optionally use an LLM for more nuanced classification. Features: - Multi-pattern rule-based classification - Sender-based categorization - Transaction detection - Confidence scoring - Optional LLM integration Attributes: use_llm: Whether to use LLM for classification. model: LLM model instance (if use_llm=True). tokenizer: LLM tokenizer (if use_llm=True). Example: >>> classifier = EmailClassifier() >>> result = classifier.classify( ... subject="Your order has shipped", ... sender="Amazon.in", ... body="Your order #123 is on the way" ... ) >>> print(result.category) 'shopping' Note: For production use, the rule-based classifier is recommended due to its speed and consistency. LLM mode requires additional dependencies and model loading time. """ # Transaction detection keywords TRANSACTION_KEYWORDS: ClassVar[set] = { 'debited', 'credited', 'transaction', 'transfer', 'payment', 'withdrawn', 'deposited', 'paid', 'received', 'upi', 'neft', 'imps', 'rtgs', 'mandate', 'autopay', 'emi', 'refund', 'cashback' } # Category patterns with senders and keywords CATEGORY_PATTERNS: ClassVar[Dict[EmailCategory, Dict[str, List[str]]]] = { EmailCategory.FINANCE: { 'senders': [ 'hdfc', 'icici', 'sbi', 'axis', 'kotak', 'pnb', 'bob', 'canara', 'union bank', 'idbi', 'yes bank', 'indusind', 'bank', 'banking', 'credit card', 'mutual fund', 'zerodha', 'groww', 'upstox', 'cred', 'slice', 'paytm', 'phonepe', 'gpay' ], 'keywords': [ 'transaction', 'statement', 'balance', 'debited', 'credited', 'payment', 'transfer', 'upi', 'neft', 'imps', 'account', 'investment', 'dividend', 'interest', 'emi', 'loan', 'credit', 'debit', 'mandate', 'autopay', 'sip', 'mutual fund' ], }, EmailCategory.SHOPPING: { 'senders': [ 'amazon', 'flipkart', 'myntra', 'ajio', 'nykaa', 'meesho', 'bigbasket', 'zepto', 'blinkit', 'swiggy', 'zomato', 'dunzo', 'decathlon', 'ikea', 'pepperfry', 'urban ladder' ], 'keywords': [ 'order', 'shipped', 'delivered', 'delivery', 'tracking', 'purchase', 'cart', 'checkout', 'invoice', 'receipt', 'refund', 'return', 'exchange', 'dispatched', 'out for delivery' ], }, EmailCategory.WORK: { 'senders': [ 'linkedin', 'indeed', 'naukri', 'glassdoor', 'angel.co', 'slack', 'zoom', 'teams', 'meet', 'jira', 'confluence', 'github', 'gitlab', 'bitbucket', 'notion', 'asana', 'trello' ], 'keywords': [ 'interview', 'meeting', 'agenda', 'project', 'deadline', 'review', 'standup', 'sprint', 'task', 'report', 'submission', 'application', 'resume', 'cv', 'job', 'position', 'hiring', 'salary', 'offer letter', 'joining', 'onboarding' ], }, EmailCategory.NEWSLETTER: { 'senders': [ 'substack', 'medium', 'morning brew', 'digest', 'newsletter', 'daily', 'weekly', 'update', 'news', 'times', 'hindu', 'express' ], 'keywords': [ 'newsletter', 'digest', 'weekly', 'daily', 'update', 'news', 'article', 'read', 'story', 'trending', 'top stories', 'this week', 'this month', 'roundup', 'edition' ], }, EmailCategory.PROMOTIONAL: { 'senders': [ 'offer', 'deal', 'discount', 'sale', 'promo', 'marketing', 'shopify', 'mailchimp', 'campaign' ], 'keywords': [ 'offer', 'discount', 'sale', 'deal', 'coupon', 'promo', 'limited time', 'exclusive', 'special', 'hurry', 'ends soon', 'flash sale', 'clearance', 'save', '% off', 'free shipping', 'buy now', 'shop now', 'don\'t miss' ], }, EmailCategory.SOCIAL: { 'senders': [ 'facebook', 'instagram', 'twitter', 'whatsapp', 'telegram', 'snapchat', 'tiktok', 'youtube', 'reddit', 'discord', 'quora' ], 'keywords': [ 'friend request', 'like', 'comment', 'share', 'mentioned', 'tagged', 'message', 'follow', 'notification', 'birthday', 'invitation', 'event', 'rsvp', 'group' ], }, } # LLM prompt template CLASSIFICATION_PROMPT: ClassVar[str] = """You are an email classifier. Analyze this email and categorize it. EMAIL: Subject: {subject} From: {sender} Body: {body} TASK: Classify this email into exactly ONE category. CATEGORIES: - finance: Banks, payments, transactions, investments, credit cards, loans - shopping: Orders, deliveries, purchases, e-commerce, food delivery - work: Job-related, recruitment, office, meetings, projects - newsletter: Digests, subscriptions, blogs, articles - promotional: Marketing, offers, discounts, advertisements - social: Social networks, personal messages, invitations - other: Anything that doesn't fit above OUTPUT FORMAT (JSON only, no other text): {{"category": "", "confidence": "", "reason": ""}} """ def __init__( self, use_llm: bool = False, model_path: Optional[str] = None, debug: bool = False ) -> None: """ Initialize the EmailClassifier. Args: use_llm: If True, use LLM for classification (slower but more accurate). model_path: Path to LLM model (required if use_llm=True). debug: If True, enable debug logging. Example: >>> classifier = EmailClassifier() # Rule-based >>> classifier = EmailClassifier(use_llm=True, model_path="path/to/model") Raises: ValueError: If use_llm=True but model_path not provided. """ self.use_llm = use_llm self.debug = debug self.model = None self.tokenizer = None if debug: logger.setLevel(logging.DEBUG) if use_llm: if not model_path: raise ValueError("model_path required when use_llm=True") self._load_model(model_path) logger.info(f"EmailClassifier initialized (use_llm={use_llm})") def _load_model(self, model_path: str) -> None: """Load LLM model for classification.""" try: from mlx_lm import load self.model, self.tokenizer = load(model_path) logger.info(f"Loaded LLM from {model_path}") except ImportError: logger.error("mlx_lm not installed. Install with: pip install mlx-lm") raise except Exception as e: logger.error(f"Failed to load model: {e}") raise def classify( self, subject: str = "", sender: str = "", body: str = "" ) -> ClassificationResult: """ Classify an email into a category. This method accepts the email components and returns a classification result with category, confidence, and reasoning. Args: subject: Email subject line. sender: Sender name or email address. body: Email body text. Returns: ClassificationResult: Classification with category and confidence. Example: >>> classifier = EmailClassifier() >>> result = classifier.classify( ... subject="Transaction Alert", ... sender="HDFC Bank", ... body="Rs.500 debited from your account" ... ) >>> print(result.category) 'finance' >>> print(result.is_transaction) True Note: At least one of subject, sender, or body should be non-empty. Empty input returns 'other' category with low confidence. """ # Validate input if not any([subject, sender, body]): logger.warning("Empty input provided") return ClassificationResult( category=EmailCategory.OTHER.value, confidence="low", reason="No content to classify", is_transaction=False ) try: if self.use_llm and self.model is not None: return self._classify_llm(subject, sender, body) else: return self._classify_rules(subject, sender, body) except Exception as e: logger.error(f"Classification failed: {e}", exc_info=True) return ClassificationResult( category=EmailCategory.OTHER.value, confidence="low", reason=f"Classification error: {str(e)}", is_transaction=False ) def _classify_rules( self, subject: str, sender: str, body: str ) -> ClassificationResult: """Classify using rule-based approach.""" combined = f"{subject} {sender} {body}".lower() # Check for transaction first is_transaction = any(kw in combined for kw in self.TRANSACTION_KEYWORDS) # Score each category scores: Dict[EmailCategory, int] = {} reasons: Dict[EmailCategory, List[str]] = {} for category, patterns in self.CATEGORY_PATTERNS.items(): score = 0 matched = [] # Check sender patterns (strong signal) sender_lower = sender.lower() for pattern in patterns.get('senders', []): if pattern in sender_lower: score += 3 matched.append(f"sender:{pattern}") # Check keyword patterns for keyword in patterns.get('keywords', []): if keyword in combined: score += 1 if len(matched) < 3: # Limit reasons matched.append(keyword) if score > 0: scores[category] = score reasons[category] = matched # Handle no matches if not scores: return ClassificationResult( category=EmailCategory.OTHER.value, confidence="low", reason="No matching patterns found", is_transaction=is_transaction ) # Get highest scoring category best_category = max(scores, key=lambda k: scores[k]) best_score = scores[best_category] best_reasons = reasons.get(best_category, []) # Determine confidence based on score if best_score >= 5: confidence = "high" elif best_score >= 3: confidence = "medium" else: confidence = "low" reason = f"Matched: {', '.join(best_reasons[:3])}" logger.debug(f"Classification: {best_category.value} ({confidence}), scores: {scores}") return ClassificationResult( category=best_category.value, confidence=confidence, reason=reason, is_transaction=is_transaction, scores={k.value: v for k, v in scores.items()} if self.debug else None ) def _classify_llm( self, subject: str, sender: str, body: str ) -> ClassificationResult: """Classify using LLM.""" from mlx_lm import generate # Truncate body if too long max_body_length = 1000 body_truncated = body[:max_body_length] if len(body) > max_body_length else body # Build prompt prompt = self.CLASSIFICATION_PROMPT.format( subject=subject, sender=sender, body=body_truncated ) # Generate response response = generate( self.model, self.tokenizer, prompt=prompt, max_tokens=100, verbose=False ) # Parse JSON response return self._parse_llm_response(response) def _parse_llm_response(self, response: str) -> ClassificationResult: """Parse LLM JSON response into ClassificationResult.""" # Find JSON in response json_match = re.search(r'\{[^}]+\}', response, re.DOTALL) if not json_match: logger.warning(f"No JSON found in LLM response: {response[:100]}") return ClassificationResult( category=EmailCategory.OTHER.value, confidence="low", reason="Failed to parse LLM response", is_transaction=False ) try: data = json.loads(json_match.group()) category = data.get("category", "other").lower() # Validate category if category not in [c.value for c in EmailCategory]: category = "other" return ClassificationResult( category=category, confidence=data.get("confidence", "medium"), reason=data.get("reason", "LLM classification"), is_transaction=category == "finance" ) except json.JSONDecodeError as e: logger.warning(f"JSON parse error: {e}") return ClassificationResult( category=EmailCategory.OTHER.value, confidence="low", reason="Invalid JSON in response", is_transaction=False ) def is_financial_email(self, subject: str, sender: str, body: str) -> bool: """ Quick check if email is financial. Faster than full classification when you only need to know if the email is finance-related. Args: subject: Email subject. sender: Email sender. body: Email body. Returns: bool: True if email is finance-related. """ result = self.classify(subject, sender, body) return result.category == EmailCategory.FINANCE.value or result.is_transaction # Convenience function def classify_email( subject: str = "", sender: str = "", body: str = "" ) -> ClassificationResult: """ Convenience function to classify an email without instantiating class. Args: subject: Email subject. sender: Email sender. body: Email body. Returns: ClassificationResult: Classification result. Example: >>> from src.data.classifier import classify_email >>> result = classify_email( ... subject="Order Shipped", ... sender="Amazon", ... body="Your order is on the way" ... ) >>> print(result.category) 'shopping' """ return EmailClassifier().classify(subject, sender, body) if __name__ == "__main__": # Self-test logging.basicConfig(level=logging.DEBUG) classifier = EmailClassifier(debug=True) test_cases = [ ("Transaction Alert", "HDFC Bank", "Rs.500 debited from your account"), ("Your order has shipped", "Amazon.in", "Order #123 is on the way"), ("Interview Invitation", "LinkedIn", "You have an interview scheduled"), ("Weekly Digest", "Substack", "Top 10 articles this week"), ] for subject, sender, body in test_cases: result = classifier.classify(subject, sender, body) print(f"\n{subject} | {sender}") print(f" → {result.category} ({result.confidence})") print(f" → {result.reason}") print(f" → Transaction: {result.is_transaction}")