File size: 2,337 Bytes
dcc24f8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
"""
Data Module - Email Processing and Entity Extraction.
This module provides production-grade tools for processing financial emails,
extracting structured data, and classifying email content.
Components:
- EmailParser: Parse emails from MBOX files
- EntityExtractor: Extract financial entities from text
- EmailClassifier: Classify emails into categories
- PDFExtractor: Extract transactions from bank statement PDFs
- BankEmailGenerator: Generate synthetic training data
Example:
>>> from src.data import EntityExtractor, EmailClassifier
>>>
>>> extractor = EntityExtractor()
>>> result = extractor.extract("Rs.500 debited from account 1234")
>>> print(result.to_dict())
{'amount': '500', 'type': 'debit', 'account': '1234'}
>>>
>>> classifier = EmailClassifier()
>>> result = classifier.classify(subject="Transaction Alert", ...)
>>> print(result.category)
'finance'
Author: Ranjit Behera
License: MIT
"""
from __future__ import annotations
# Core exports
from src.data.extractor import (
EntityExtractor,
FinancialEntity,
extract_entities,
)
from src.data.classifier import (
EmailClassifier,
EmailCategory,
ClassificationResult,
classify_email,
)
from src.data.parser import EmailParser
# Optional exports: each name falls back to None when its module cannot be imported
try:
from src.data.pdf_extractor import PDFExtractor, Transaction
except ImportError:
PDFExtractor = None # type: ignore
Transaction = None # type: ignore
try:
from src.data.bank_templates import BankEmailGenerator
except ImportError:
BankEmailGenerator = None # type: ignore
try:
from src.data.labeling import LabelingPipeline, LabeledExample
except ImportError:
LabelingPipeline = None # type: ignore
LabeledExample = None # type: ignore
# Public API: the names this package exposes, including via a star-import.
__all__ = [
# Core classes (imported unconditionally above)
"EntityExtractor",
"FinancialEntity",
"EmailClassifier",
"EmailCategory",
"ClassificationResult",
"EmailParser",
# Convenience functions
"extract_entities",
"classify_email",
# Optional classes: these names are bound to None when their module
# raised ImportError above, so callers should check for None before use.
"PDFExtractor",
"Transaction",
"BankEmailGenerator",
"LabelingPipeline",
"LabeledExample",
]
def get_version() -> str:
    """Return the version exposed as ``__version__`` by the ``src`` package.

    The import is performed at call time, inside the function, rather than
    when this module is loaded.

    Returns:
        The value of ``src.__version__``.

    Raises:
        ImportError: If ``src`` cannot be imported or does not provide
            ``__version__``.
    """
    # Deferred import: resolved only when the version is actually requested.
    from src import __version__ as package_version

    return package_version
|