Ranjit Behera
FinEE v1.0 - Finance Entity Extractor
dcc24f8
"""
Data Module - Email Processing and Entity Extraction.
This module provides production-grade tools for processing financial emails,
extracting structured data, and classifying email content.
Components:
- EmailParser: Parse emails from MBOX files
- EntityExtractor: Extract financial entities from text
- EmailClassifier: Classify emails into categories
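- PDFExtractor: Extract transactions from bank statement PDFs (optional)
- BankEmailGenerator: Generate synthetic training data (optional)
- LabelingPipeline, LabeledExample: Exports of src.data.labeling (optional)
Optional components (including Transaction) are set to None when their
module cannot be imported.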
Example:
>>> from src.data import EntityExtractor, EmailClassifier
>>>
>>> extractor = EntityExtractor()
>>> result = extractor.extract("Rs.500 debited from account 1234")
>>> print(result.to_dict())
{'amount': '500', 'type': 'debit', 'account': '1234'}
>>>
>>> classifier = EmailClassifier()
>>> result = classifier.classify(subject="Transaction Alert", ...)
>>> print(result.category)
'finance'
Author: Ranjit Behera
License: MIT
"""
from __future__ import annotations
# Core exports
from src.data.extractor import (
EntityExtractor,
FinancialEntity,
extract_entities,
)
from src.data.classifier import (
EmailClassifier,
EmailCategory,
ClassificationResult,
classify_email,
)
from src.data.parser import EmailParser
# Optional exports (may not be installed)
try:
from src.data.pdf_extractor import PDFExtractor, Transaction
except ImportError:
PDFExtractor = None # type: ignore
Transaction = None # type: ignore
try:
from src.data.bank_templates import BankEmailGenerator
except ImportError:
BankEmailGenerator = None # type: ignore
try:
from src.data.labeling import LabelingPipeline, LabeledExample
except ImportError:
LabelingPipeline = None # type: ignore
LabeledExample = None # type: ignore
# Public API: the names exported by a star-import of this module.
__all__ = [
# Core classes (always imported at module load)
"EntityExtractor",
"FinancialEntity",
"EmailClassifier",
"EmailCategory",
"ClassificationResult",
"EmailParser",
# Convenience functions
"extract_entities",
"classify_email",
# Optional classes: each of these is bound to None by the
# try/except ImportError blocks above when its defining module cannot
# be imported, so callers should check for None before using them.
"PDFExtractor",
"Transaction",
"BankEmailGenerator",
"LabelingPipeline",
"LabeledExample",
]
def get_version() -> str:
    """Return the version declared by the top-level ``src`` package.

    The lookup happens at call time rather than at module import time:
    ``src.__version__`` is imported inside the function body on every call.
    NOTE(review): presumably this avoids a circular import between ``src``
    and this subpackage -- confirm.

    Returns:
        The value of ``src.__version__``.

    Raises:
        ImportError: If ``src`` cannot be imported or does not expose
            ``__version__``.
    """
    from src import __version__ as package_version

    return package_version