|
|
""" |
Data Module - Email Processing and Entity Extraction.

This module provides production-grade tools for processing financial emails,
extracting structured data, and classifying email content.

Components:
    - EmailParser: Parse emails from MBOX files
    - EntityExtractor: Extract financial entities from text
    - EmailClassifier: Classify emails into categories
    - PDFExtractor: Extract transactions from bank statement PDFs
    - BankEmailGenerator: Generate synthetic training data

Optional components (PDFExtractor, Transaction, BankEmailGenerator,
LabelingPipeline, LabeledExample) are exported as ``None`` when the module
that defines them cannot be imported.

Example:
    >>> from src.data import EntityExtractor, EmailClassifier
    >>>
    >>> extractor = EntityExtractor()
    >>> result = extractor.extract("Rs.500 debited from account 1234")
    >>> print(result.to_dict())
    {'amount': '500', 'type': 'debit', 'account': '1234'}
    >>>
    >>> classifier = EmailClassifier()
    >>> result = classifier.classify(subject="Transaction Alert", ...)
    >>> print(result.category)
    'finance'

Author: Ranjit Behera
License: MIT
""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
|
|
|
from src.data.extractor import ( |
|
|
EntityExtractor, |
|
|
FinancialEntity, |
|
|
extract_entities, |
|
|
) |
|
|
|
|
|
from src.data.classifier import ( |
|
|
EmailClassifier, |
|
|
EmailCategory, |
|
|
ClassificationResult, |
|
|
classify_email, |
|
|
) |
|
|
|
|
|
from src.data.parser import EmailParser |
|
|
|
|
|
|
|
|
# Optional components. Each group lives in its own submodule; when importing
# that submodule raises ImportError, the names it would have provided are
# bound to None instead, so the remaining exports of this package stay usable.
try:
    from src.data.pdf_extractor import PDFExtractor, Transaction
except ImportError:
    PDFExtractor = Transaction = None

try:
    from src.data.bank_templates import BankEmailGenerator
except ImportError:
    BankEmailGenerator = None

try:
    from src.data.labeling import LabelingPipeline, LabeledExample
except ImportError:
    LabelingPipeline = LabeledExample = None
|
|
|
|
|
|
|
|
# Public API of this package: the names bound by ``from src.data import *``.
__all__ = [
    # Always-available names imported from the extractor, classifier and
    # parser submodules.
    "EntityExtractor",
    "FinancialEntity",
    "EmailClassifier",
    "EmailCategory",
    "ClassificationResult",
    "EmailParser",
    # Module-level helpers: the first two are imported from the extractor and
    # classifier submodules, get_version is defined in this file.
    "extract_entities",
    "classify_email",
    "get_version",
    # Optional names: each is bound to None when the submodule that defines
    # it fails to import, so the name always exists.
    "PDFExtractor",
    "Transaction",
    "BankEmailGenerator",
    "LabelingPipeline",
    "LabeledExample",
]
|
|
|
|
|
|
|
|
def get_version() -> str:
    """Return the version of the top-level ``src`` package.

    The lookup happens at call time, not when this module is imported:
    ``__version__`` is imported from ``src`` inside the function body.

    Returns:
        The value of ``src.__version__``.

    Raises:
        ImportError: If ``src`` cannot be imported or does not provide
            ``__version__``.
    """
    from src import __version__ as package_version

    return package_version
|
|
|