File size: 2,337 Bytes
dcc24f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""
Data Module - Email Processing and Entity Extraction.

This module provides production-grade tools for processing financial emails,
extracting structured data, and classifying email content.

Components:
    - EmailParser: Parse emails from MBOX files
    - EntityExtractor: Extract financial entities from text
    - EmailClassifier: Classify emails into categories
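    - PDFExtractor: Extract transactions from bank statement PDFs
      (optional; None if its module cannot be imported)
    - BankEmailGenerator: Generate synthetic training data
      (optional; None if its module cannot be imported)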

Example:
    >>> from src.data import EntityExtractor, EmailClassifier
    >>> 
    >>> extractor = EntityExtractor()
    >>> result = extractor.extract("Rs.500 debited from account 1234")
    >>> print(result.to_dict())
    {'amount': '500', 'type': 'debit', 'account': '1234'}
    >>> 
    >>> classifier = EmailClassifier()
    >>> result = classifier.classify(subject="Transaction Alert", ...)
    >>> print(result.category)
    'finance'

Author: Ranjit Behera
License: MIT
"""

from __future__ import annotations

# Core exports
from src.data.extractor import (
    EntityExtractor,
    FinancialEntity,
    extract_entities,
)

from src.data.classifier import (
    EmailClassifier,
    EmailCategory,
    ClassificationResult,
    classify_email,
)

from src.data.parser import EmailParser

# Optional exports: each name is bound to None when importing its module
# raises ImportError, so callers should check for None before use.
try:
    from src.data.pdf_extractor import PDFExtractor, Transaction
except ImportError:
    PDFExtractor = None  # type: ignore
    Transaction = None  # type: ignore

try:
    from src.data.bank_templates import BankEmailGenerator
except ImportError:
    BankEmailGenerator = None  # type: ignore

try:
    from src.data.labeling import LabelingPipeline, LabeledExample
except ImportError:
    LabelingPipeline = None  # type: ignore
    LabeledExample = None  # type: ignore

# Public API: the names exported by `from src.data import *`.
# Kept as a plain literal list.
__all__ = [
    # Core classes
    "EntityExtractor",
    "FinancialEntity",
    "EmailClassifier",
    "EmailCategory",
    "ClassificationResult",
    "EmailParser",
    
    # Convenience functions
    "extract_entities",
    "classify_email",
    
    # Optional classes: always exported, but bound to None when the
    # module that defines them raised ImportError during import.
    "PDFExtractor",
    "Transaction",
    "BankEmailGenerator",
    "LabelingPipeline",
    "LabeledExample",
]


def get_version() -> str:
    """Return the version string of the top-level ``src`` package.

    Returns:
        The value of ``src.__version__``.

    Note:
        ``src`` is imported when this function is called, not when this
        module is loaded.
    """
    from src import __version__ as package_version

    return package_version