"""
Transaction Classifier for Tax Optimization
Classifies Mono API and manual transactions into tax-relevant categories
"""
from __future__ import annotations

import logging
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Any, Optional


@dataclass
class TaxClassification:
    """Result of classifying a transaction for tax purposes.

    Instances are produced by ``TransactionClassifier`` and their fields are
    copied onto the enriched transaction dictionary it returns.
    """

    # Category label, e.g. "employment_income", "rent_paid" or "uncategorized".
    tax_category: str
    # How the amount is treated: "taxable", "deductible",
    # "potentially_deductible" or "unknown".
    tax_treatment: str
    # True when the transaction can be claimed as a deduction.
    deductible: bool
    # Classifier confidence; the classifier emits values between 0.0 and 1.0.
    confidence: float
    # Identifiers of tax rules relevant to the category,
    # e.g. "pit.deduction.pension". May be empty.
    suggested_rule_ids: List[str]
    # Optional human-readable explanation of the classification.
    notes: Optional[str] = None


class TransactionClassifier:
    """
    Classifies bank transactions (from Mono API or manual entry) into tax categories.

    Classification is keyword driven: the transaction narration is matched
    against ``INCOME_PATTERNS`` for credits and ``DEDUCTION_PATTERNS`` for
    debits. Categories are tried in dictionary order and the first category
    with a matching pattern wins, so more specific categories must be listed
    before more generic ones.
    """

    # Narration keywords for money received, keyed by income category.
    INCOME_PATTERNS = {
        'employment_income': [
            r'\bSALARY\b', r'\bWAGES\b', r'\bPAYROLL\b', r'\bSTIPEND\b',
            r'\bEMPLOYMENT\b', r'\bMONTHLY PAY\b', r'\bNET PAY\b',
            r'\bGROSS PAY\b', r'\bEARNINGS\b', r'\bSALARY PAYMENT\b'
        ],
        'business_income': [
            r'\bSALES\b', r'\bREVENUE\b', r'\bINVOICE\b', r'\bPAYMENT RECEIVED\b',
            r'\bCUSTOMER\b', r'\bCLIENT\b', r'\bPROJECT\b', r'\bCONSULTING\b',
            r'\bFREELANCE\b', r'\bCONTRACT\b'
        ],
        'rental_income': [
            r'\bRENT RECEIVED\b', r'\bTENANT\b', r'\bLEASE PAYMENT\b',
            r'\bPROPERTY INCOME\b', r'\bRENTAL\b'
        ],
        'investment_income': [
            r'\bDIVIDEND\b', r'\bINTEREST\b', r'\bINVESTMENT\b',
            r'\bCOUPON\b', r'\bBOND\b', r'\bSTOCK\b', r'\bSHARE\b'
        ]
    }

    # Narration keywords for money paid out, keyed by deduction category.
    # 'health_insurance' is deliberately listed before 'life_insurance': the
    # generic INSURANCE PREMIUM pattern of the latter would otherwise claim
    # narrations such as "HEALTH INSURANCE PREMIUM".
    DEDUCTION_PATTERNS = {
        'pension_contribution': [
            r'\bPENSION\b', r'\bPFA\b', r'\bRSA\b', r'\bRETIREMENT\b',
            r'\bPENSION FUND\b', r'\bPENSION CONTRIBUTION\b'
        ],
        'nhf_contribution': [
            r'\bNHF\b', r'\bHOUSING FUND\b', r'\bNATIONAL HOUSING\b'
        ],
        'health_insurance': [
            r'\bHEALTH INSURANCE\b', r'\bHMO\b', r'\bMEDICAL INSURANCE\b',
            r'\bHEALTH PLAN\b'
        ],
        'life_insurance': [
            r'\bLIFE INSURANCE\b', r'\bLIFE ASSURANCE\b', r'\bINSURANCE PREMIUM\b',
            r'\bPOLICY PREMIUM\b'
        ],
        'rent_paid': [
            r'\bRENT\b', r'\bLANDLORD\b', r'\bLEASE\b', r'\bHOUSE RENT\b',
            r'\bAPARTMENT RENT\b'
        ],
        'union_dues': [
            r'\bUNION DUES\b', r'\bPROFESSIONAL FEES\b', r'\bASSOCIATION FEES\b',
            r'\bMEMBERSHIP DUES\b'
        ]
    }

    # Backend transaction types mapped onto the Mono credit/debit vocabulary.
    _TYPE_ALIASES = {'income': 'credit', 'expense': 'debit'}

    # Label used when no category could be determined.
    _UNCATEGORIZED = "uncategorized"

    # Upper bound applied to confidence values reported by the LLM.
    _MAX_LLM_CONFIDENCE = 0.85

    def __init__(self, rag_pipeline: Optional[Any] = None):
        """
        Initialize classifier

        Args:
            rag_pipeline: Optional RAG pipeline for LLM-based classification of
                ambiguous transactions. It is only used by ``_llm_classify``
                and must expose ``query(prompt, verbose=False)``.
        """
        self.rag = rag_pipeline

    # ------------------------------------------------------------------
    # Input normalisation helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_narration(transaction: Dict[str, Any]) -> str:
        """Return the free-text description ("narration" or "description"), or ""."""
        return str(transaction.get("narration") or transaction.get("description") or "")

    @staticmethod
    def _extract_amount(transaction: Dict[str, Any]) -> float:
        """
        Return the absolute transaction amount as a float.

        "amount" is used as-is when present; otherwise "amount_kobo" is
        divided by 100. Missing or ``None`` values yield 0.0. A value that
        cannot be converted to ``float`` raises ``ValueError``/``TypeError``.
        """
        raw_amount = transaction.get("amount")
        if raw_amount is not None:
            return abs(float(raw_amount))
        raw_kobo = transaction.get("amount_kobo")
        if raw_kobo is None:
            return 0.0
        return abs(float(raw_kobo) / 100.0)

    def _normalize_type(self, transaction: Dict[str, Any]) -> str:
        """Return the lower-cased transaction type with income/expense mapped to credit/debit."""
        raw_type = str(transaction.get("type") or "").strip().lower()
        return self._TYPE_ALIASES.get(raw_type, raw_type)

    @classmethod
    def _uncategorized(cls, notes: Optional[str] = None) -> TaxClassification:
        """Build the low-confidence result used when nothing could be determined."""
        return TaxClassification(
            tax_category=cls._UNCATEGORIZED,
            tax_treatment="unknown",
            deductible=False,
            confidence=0.3,
            suggested_rule_ids=[],
            notes=notes
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def classify_transaction(self, transaction: Dict[str, Any]) -> Dict[str, Any]:
        """
        Classify a transaction (from Mono API or manual entry)

        Accepts both formats:
        - Mono API: {"_id", "type": "credit/debit", "amount": 50000, "narration": "..."}
        - Backend: {"id", "type": "income/expense", "amount_kobo": 5000000, "description": "..."}

        Only pattern matching is used here; ``_llm_classify`` is not invoked
        by this method.

        Returns:
            A new dict containing every key of ``transaction`` plus
            "tax_category", "tax_treatment", "deductible", "confidence",
            "suggested_rule_ids" and "tax_notes". The input dict is not
            modified.
        """
        # Patterns are written in upper case, so normalise the narration too.
        narration = self._extract_narration(transaction).upper()
        amount = self._extract_amount(transaction)
        tx_type = self._normalize_type(transaction)

        classification = self._classify_by_patterns(narration, tx_type, amount)

        return {
            **transaction,
            "tax_category": classification.tax_category,
            "tax_treatment": classification.tax_treatment,
            "deductible": classification.deductible,
            "confidence": classification.confidence,
            "suggested_rule_ids": classification.suggested_rule_ids,
            "tax_notes": classification.notes
        }

    def classify_batch(self, transactions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Classify multiple transactions, preserving their order."""
        return [self.classify_transaction(tx) for tx in transactions]

    # ------------------------------------------------------------------
    # Pattern-based classification
    # ------------------------------------------------------------------

    def _classify_by_patterns(
        self,
        narration: str,
        tx_type: str,
        amount: float
    ) -> TaxClassification:
        """
        Pattern-based classification using regex

        Credits are matched against ``INCOME_PATTERNS`` and debits against
        ``DEDUCTION_PATTERNS``; any other ``tx_type`` is left uncategorized.
        Matching is case-insensitive.
        """
        if tx_type == "credit":
            pattern_table = self.INCOME_PATTERNS
            build = self._get_income_classification
        elif tx_type == "debit":
            pattern_table = self.DEDUCTION_PATTERNS
            build = self._get_deduction_classification
        else:
            pattern_table = {}
            build = None

        for category, patterns in pattern_table.items():
            if any(re.search(pattern, narration, re.IGNORECASE) for pattern in patterns):
                return build(category, amount)

        return self._uncategorized(
            notes="Could not automatically categorize. Manual review recommended."
        )

    def _get_income_classification(self, category: str, amount: float) -> TaxClassification:
        """
        Get classification for income categories

        ``amount`` is accepted for signature symmetry but does not currently
        influence the result. Unknown categories map to "other_income".
        A fresh object is built on every call so callers may mutate it freely.
        """
        classifications = {
            'employment_income': TaxClassification(
                tax_category="employment_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.95,
                suggested_rule_ids=["pit.base.gross_income"],
                notes="Employment income is fully taxable under PITA"
            ),
            'business_income': TaxClassification(
                tax_category="business_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.85,
                suggested_rule_ids=["cit.rate.small_2025", "cit.rate.medium_2025", "cit.rate.large_2025"],
                notes="Business income subject to CIT or PIT depending on structure"
            ),
            'rental_income': TaxClassification(
                tax_category="rental_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.90,
                suggested_rule_ids=["pit.base.gross_income"],
                notes="Rental income is taxable. Consider property expenses as deductions."
            ),
            'investment_income': TaxClassification(
                tax_category="investment_income",
                tax_treatment="taxable",
                deductible=False,
                confidence=0.85,
                suggested_rule_ids=[],
                notes="Investment income may be subject to withholding tax"
            )
        }

        if category in classifications:
            return classifications[category]
        return TaxClassification(
            tax_category="other_income",
            tax_treatment="taxable",
            deductible=False,
            confidence=0.5,
            suggested_rule_ids=[]
        )

    def _get_deduction_classification(self, category: str, amount: float) -> TaxClassification:
        """
        Get classification for deduction categories

        ``amount`` is accepted for signature symmetry but does not currently
        influence the result. Unknown categories map to "other_expense".
        A fresh object is built on every call so callers may mutate it freely.
        """
        classifications = {
            'pension_contribution': TaxClassification(
                tax_category="pension_contribution",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.95,
                suggested_rule_ids=["pit.deduction.pension"],
                notes="Pension contributions to PRA-approved schemes are tax deductible (PITA s.20(1)(g))"
            ),
            'nhf_contribution': TaxClassification(
                tax_category="nhf_contribution",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.95,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="NHF contributions are tax deductible (2.5% of basic salary)"
            ),
            'life_insurance': TaxClassification(
                tax_category="life_insurance",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.85,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="Life insurance premiums are tax deductible if policy is with licensed insurer"
            ),
            'health_insurance': TaxClassification(
                tax_category="health_insurance",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.80,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="Health insurance premiums may be tax deductible"
            ),
            'rent_paid': TaxClassification(
                tax_category="rent_paid",
                tax_treatment="potentially_deductible",
                deductible=False,
                confidence=0.85,
                suggested_rule_ids=["pit.relief.rent_2026"],
                notes="Rent paid: Not deductible in 2025. From 2026, 20% of rent (max ₦500K) under NTA 2025"
            ),
            'union_dues': TaxClassification(
                tax_category="union_dues",
                tax_treatment="deductible",
                deductible=True,
                confidence=0.80,
                suggested_rule_ids=["pit.base.taxable_income"],
                notes="Professional association fees and union dues are tax deductible"
            )
        }

        if category in classifications:
            return classifications[category]
        return TaxClassification(
            tax_category="other_expense",
            tax_treatment="unknown",
            deductible=False,
            confidence=0.4,
            suggested_rule_ids=[]
        )

    # ------------------------------------------------------------------
    # LLM-based classification
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_llm_response(response: Any) -> Optional[tuple]:
        """
        Parse an LLM reply of the form ``category, yes/no, confidence``.

        Surrounding quotes, labels in front of the category and trailing text
        after the number are tolerated.

        Returns:
            ``(category, deductible, confidence)`` or ``None`` when the reply
            does not contain three comma-separated fields with a category
            token and a number.
        """
        parts = str(response).lower().split(',')
        if len(parts) < 3:
            return None

        # Take the last identifier-like token so quotes or a leading
        # "category:" label do not end up in the category name.
        category_tokens = re.findall(r'[a-z_]+', parts[0])
        confidence_match = re.search(r'\d*\.?\d+', parts[2])
        if not category_tokens or confidence_match is None:
            return None

        return category_tokens[-1], 'yes' in parts[1], float(confidence_match.group())

    def _llm_classify(self, transaction: Dict[str, Any]) -> TaxClassification:
        """
        Use LLM/RAG to classify ambiguous transactions
        This is a fallback for transactions that don't match patterns

        The reply is only trusted when it names a category known to
        ``INCOME_PATTERNS`` or ``DEDUCTION_PATTERNS``; anything else (including
        a failed query or an unparseable reply) yields the uncategorized
        result. Reported confidence is capped at ``_MAX_LLM_CONFIDENCE``.

        Raises:
            ValueError, TypeError: if the transaction amount is not numeric.
        """
        if not self.rag:
            return self._uncategorized()

        narration = self._extract_narration(transaction)
        amount = self._extract_amount(transaction)
        tx_type = self._normalize_type(transaction)

        # NOTE: the narration is untrusted text embedded in the prompt; the
        # category whitelist and confidence cap below limit what a crafted
        # narration can make this method return.
        prompt = "\n".join([
            "",
            "Classify this Nigerian bank transaction for tax purposes:",
            "",
            "Transaction Details:",
            f"- Narration: {narration}",
            f"- Amount: ₦{amount:,.2f}",
            f"- Type: {tx_type}",
            "",
            "Classify into ONE of these categories:",
            "- employment_income (salary, wages, stipend)",
            "- business_income (sales, revenue, client payments)",
            "- rental_income (rent received from tenants)",
            "- investment_income (dividends, interest, bonds, shares)",
            "- pension_contribution (PFA, RSA contributions)",
            "- nhf_contribution (National Housing Fund)",
            "- life_insurance (life insurance/assurance premiums)",
            "- health_insurance (HMO, medical insurance premiums)",
            "- rent_paid (rent paid to landlord)",
            "- union_dues (professional fees, association dues)",
            "- uncategorized (if unclear)",
            "",
            "Also indicate:",
            "1. Is it tax deductible? (yes/no)",
            "2. Confidence level (0.0 to 1.0)",
            "",
            "Respond with just the category name, deductible status, and confidence.",
            'Example: "employment_income, no, 0.95"',
            "",
        ])

        try:
            response = self.rag.query(prompt, verbose=False)
        except Exception as exc:
            # The pipeline is an external dependency of unknown type; any
            # failure there must not break classification.
            logging.getLogger(__name__).warning("LLM classification failed: %s", exc)
            return self._uncategorized()

        parsed = self._parse_llm_response(response)
        if parsed is None:
            return self._uncategorized()

        category, deductible, confidence = parsed
        is_income = category in self.INCOME_PATTERNS
        if not is_income and category not in self.DEDUCTION_PATTERNS:
            # Covers an explicit "uncategorized" answer as well as invented labels.
            return self._uncategorized()

        if is_income:
            # Income is never a deduction, whatever the model answered.
            deductible = False
            tax_treatment = "taxable"
        elif deductible:
            tax_treatment = "deductible"
        else:
            tax_treatment = "unknown"

        return TaxClassification(
            tax_category=category,
            tax_treatment=tax_treatment,
            deductible=deductible,
            confidence=min(confidence, self._MAX_LLM_CONFIDENCE),
            suggested_rule_ids=[],
            notes="Classified using AI analysis"
        )

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------

    def get_classification_summary(self, classified_transactions: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Generate summary statistics of classified transactions

        Transactions without a "tax_category" count as uncategorized, and a
        missing "confidence" counts as 0. Amounts are read from "amount" or,
        failing that, "amount_kobo" / 100.

        Returns:
            A dict with the keys "total_transactions", "categorized",
            "uncategorized", "high_confidence" (confidence above 0.8),
            "categorization_rate", "transactions_by_category" and
            "amounts_by_category". The same keys are returned for an empty
            input, with zero counts and empty per-category mappings.
        """
        total = len(classified_transactions)
        categorized = 0
        high_confidence = 0
        by_category: Dict[str, int] = {}
        amounts_by_category: Dict[str, float] = {}

        for tx in classified_transactions:
            category = tx.get("tax_category") or self._UNCATEGORIZED
            if category != self._UNCATEGORIZED:
                categorized += 1
            if (tx.get("confidence") or 0) > 0.8:
                high_confidence += 1
            by_category[category] = by_category.get(category, 0) + 1
            amounts_by_category[category] = (
                amounts_by_category.get(category, 0.0) + self._extract_amount(tx)
            )

        return {
            "total_transactions": total,
            "categorized": categorized,
            "uncategorized": total - categorized,
            "high_confidence": high_confidence,
            "categorization_rate": categorized / total if total else 0.0,
            "transactions_by_category": by_category,
            "amounts_by_category": amounts_by_category
        }