Ranjit0034
/

finance-entity-extractor

+"""
+PDF Parser for Bank Statements
+==============================
+Extract transactions from Indian bank statement PDFs.
+Supports:
+- HDFC Bank statements
+- ICICI Bank statements
+- SBI Bank statements
+- Axis Bank statements
+- Other banks, through a generic date/description/amount fallback pattern
+Author: Ranjit Behera
+"""
+import re
+from pathlib import Path
+from typing import List, Dict, Optional, Tuple
+from dataclasses import dataclass
+from datetime import datetime
+import io
@dataclass
class PDFTransaction:
    """
    One transaction row recovered from statement text.

    Attributes:
        date: Transaction date, kept as the string matched in the statement.
        description: Narration text of the row, stripped of outer whitespace.
        amount: Transaction amount as a float.
        type: Direction of the transaction, "debit" or "credit".
        balance: Balance column of the row when one was captured, else None.
        reference: Reference number found in the description, else None.
    """

    date: str
    description: str
    amount: float
    type: str  # "debit" or "credit"
    balance: Optional[float] = None
    reference: Optional[str] = None
class BankStatementParser:
    """
    Parse bank statement PDFs and extract transactions.

    Uses pdfplumber for text extraction and regex for parsing. pdfplumber is
    imported lazily in ``_check_dependencies``, so the class can be created
    without it and ``parse_text`` can still be used on text obtained
    elsewhere; only ``parse_file`` / ``parse_bytes`` require it.
    """

    # Bank-specific patterns. "header" identifies the bank; "transaction"
    # matches one statement row. Transaction patterns use named groups:
    # ``date``, ``description`` and ``amount`` are required, while ``marker``
    # (Dr/Cr) and ``balance`` are optional columns that not every layout has.
    # Naming the groups keeps a layout without a marker column (SBI, Axis)
    # from having its balance column mistaken for the marker.
    BANK_PATTERNS = {
        "hdfc": {
            "header": r"HDFC\s+BANK",
            "date": r"(\d{2}/\d{2}/\d{2,4})",
            "transaction": (
                r"(?P<date>\d{2}/\d{2}/\d{2,4})\s+(?P<description>.+?)\s+"
                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<marker>[DC]r)?\s*"
                r"(?P<balance>[\d,]+\.\d{2})?"
            ),
        },
        "icici": {
            "header": r"ICICI\s+BANK",
            "date": r"(\d{2}-\w{3}-\d{2,4})",
            "transaction": (
                r"(?P<date>\d{2}-\w{3}-\d{2,4})\s+(?P<description>.+?)\s+"
                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<marker>Dr|Cr)?\s*"
                r"(?P<balance>[\d,]+\.\d{2})?"
            ),
        },
        "sbi": {
            "header": r"State\s+Bank\s+of\s+India",
            "date": r"(\d{2}\s+\w{3}\s+\d{2,4})",
            "transaction": (
                r"(?P<date>\d{2}\s+\w{3}\s+\d{4})\s+(?P<description>.+?)\s+"
                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<balance>[\d,]+\.\d{2})?"
            ),
        },
        "axis": {
            "header": r"AXIS\s+BANK",
            "date": r"(\d{2}-\d{2}-\d{2,4})",
            "transaction": (
                r"(?P<date>\d{2}-\d{2}-\d{2,4})\s+(?P<description>.+?)\s+"
                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<balance>[\d,]+\.\d{2})?"
            ),
        },
    }

    # Fallback row pattern for unrecognised banks: date, description, amount.
    _GENERIC_TRANSACTION = (
        r"(?P<date>\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\s+(?P<description>.+?)\s+"
        r"(?P<amount>[\d,]+\.\d{2})"
    )

    # Tried in order; the first pattern that matches supplies the reference.
    _REFERENCE_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r"[Rr]ef[.:# ]*(\d{10,18})",
            r"UTR[.:# ]*(\w{12,22})",
            r"IMPS[.:# ]*(\d{12})",
            r"NEFT[.:# ]*(\w{10,16})",
        )
    )

    _CREDIT_KEYWORDS = ("salary", "credited", "received", "refund", "cashback", "interest")

    def __init__(self):
        self.pdfplumber = None
        self._check_dependencies()

    def _check_dependencies(self):
        """Import pdfplumber if available; leave ``self.pdfplumber`` as None otherwise."""
        try:
            import pdfplumber
            self.pdfplumber = pdfplumber
        except ImportError:
            self.pdfplumber = None

    @staticmethod
    def _join_page_texts(page_texts) -> str:
        """
        Join per-page text into one string, one page per line group.

        Pages are separated by a newline so the last line of one page is not
        fused with the first line of the next. Pages with no text (None) are
        treated as empty.
        """
        return "\n".join(page_text or "" for page_text in page_texts)

    def _extract_text(self, source) -> str:
        """
        Extract the text of every page of a PDF.

        Args:
            source: Anything ``pdfplumber.open`` accepts (a path or a
                binary file-like object).
        Returns:
            Text of all pages, newline-separated.
        Raises:
            ImportError: If pdfplumber is not installed.
        """
        if self.pdfplumber is None:
            raise ImportError("pdfplumber is required. Install with: pip install pdfplumber")
        with self.pdfplumber.open(source) as pdf:
            return self._join_page_texts(page.extract_text() for page in pdf.pages)

    def parse_file(self, file_path: Path) -> List[PDFTransaction]:
        """
        Parse a PDF file and extract transactions.

        Args:
            file_path: Path to PDF file
        Returns:
            List of extracted transactions
        Raises:
            ImportError: If pdfplumber is not installed.
        """
        return self.parse_text(self._extract_text(file_path))

    def parse_bytes(self, pdf_bytes: bytes) -> List[PDFTransaction]:
        """
        Parse PDF from bytes.

        Args:
            pdf_bytes: PDF file content as bytes
        Returns:
            List of extracted transactions
        Raises:
            ImportError: If pdfplumber is not installed.
        """
        return self.parse_text(self._extract_text(io.BytesIO(pdf_bytes)))

    def parse_text(self, text: str) -> List[PDFTransaction]:
        """
        Parse extracted text and identify transactions.

        The bank is detected from the text; a bank-specific row pattern is
        used when one is recognised, otherwise the generic pattern.

        Args:
            text: Extracted text from PDF
        Returns:
            List of transactions
        """
        bank = self._detect_bank(text)
        if bank:
            return self._parse_with_pattern(text, bank)
        return self._parse_generic(text)

    def _detect_bank(self, text: str) -> Optional[str]:
        """Return the first BANK_PATTERNS key whose header matches, or None."""
        for bank, patterns in self.BANK_PATTERNS.items():
            if re.search(patterns["header"], text, re.IGNORECASE):
                return bank
        return None

    @staticmethod
    def _to_amount(text: str) -> float:
        """Convert an amount such as "1,23,456.78" to a float."""
        return float(text.replace(',', ''))

    def _fields_from_match(self, match) -> Dict:
        """
        Turn one transaction-pattern match into PDFTransaction field values.

        Args:
            match: Match object from a pattern with the named groups
                described on BANK_PATTERNS.
        Returns:
            Dict keyed by PDFTransaction field name.
        Raises:
            IndexError: If a required named group is missing from the pattern.
            ValueError: If an amount cannot be converted to a float.
        """
        optional = match.groupdict()
        description = match.group("description").strip()

        marker = optional.get("marker")
        if marker:
            txn_type = "credit" if marker.upper() in ("CR", "C") else "debit"
        else:
            # No Dr/Cr column in this row: fall back to description keywords.
            txn_type = self._infer_type(description)

        balance_text = optional.get("balance")
        return {
            "date": match.group("date"),
            "description": description,
            "amount": self._to_amount(match.group("amount")),
            "type": txn_type,
            "balance": self._to_amount(balance_text) if balance_text else None,
            "reference": self._extract_reference(description),
        }

    def _extract_fields(self, pattern: str, text: str) -> List[Dict]:
        """Return field dicts for every row of ``text`` matched by ``pattern``."""
        rows = []
        for match in re.finditer(pattern, text, re.MULTILINE):
            try:
                rows.append(self._fields_from_match(match))
            except (ValueError, IndexError):
                # Skip rows that cannot be converted rather than abort the parse.
                continue
        return rows

    def _parse_with_pattern(self, text: str, bank: str) -> List[PDFTransaction]:
        """Parse using the transaction pattern registered for ``bank``."""
        pattern = self.BANK_PATTERNS[bank]["transaction"]
        return [PDFTransaction(**fields) for fields in self._extract_fields(pattern, text)]

    def _parse_generic(self, text: str) -> List[PDFTransaction]:
        """Generic parsing for unknown bank formats (no balance is captured)."""
        return [
            PDFTransaction(**fields)
            for fields in self._extract_fields(self._GENERIC_TRANSACTION, text)
        ]

    def _extract_reference(self, description: str) -> Optional[str]:
        """Extract a Ref/UTR/IMPS/NEFT reference number from a description, or None."""
        for pattern in self._REFERENCE_PATTERNS:
            match = pattern.search(description)
            if match:
                return match.group(1)
        return None

    def _infer_type(self, description: str) -> str:
        """
        Infer transaction type from description.

        Returns "credit" when a credit keyword is present and "debit"
        otherwise; debit is the default, so debit keywords need no check.
        """
        lowered = description.lower()
        if any(keyword in lowered for keyword in self._CREDIT_KEYWORDS):
            return "credit"
        return "debit"

    def to_dict_list(self, transactions: List[PDFTransaction]) -> List[Dict]:
        """Convert transactions to list of dictionaries."""
        return [
            {
                "date": t.date,
                "description": t.description,
                "amount": t.amount,
                "type": t.type,
                "balance": t.balance,
                "reference": t.reference,
            }
            for t in transactions
        ]
class ImageOCRParser:
    """
    Parse transaction screenshots using OCR.

    Uses EasyOCR or pytesseract for text extraction. The backend libraries
    (and PIL / numpy where needed) are imported lazily inside the methods
    that use them.
    """

    def __init__(self, backend: str = "auto"):
        """
        Initialize OCR parser.

        Args:
            backend: "easyocr", "tesseract", or "auto". With "auto",
                EasyOCR is tried first and pytesseract second; afterwards
                ``self.backend`` holds the backend actually selected.
        Raises:
            ValueError: If ``backend`` is not one of the accepted names.
            ImportError: If the requested backend (or, for "auto", both
                backends) cannot be imported.
        """
        self.backend = backend
        self.reader = None
        self._init_backend()

    def _init_backend(self):
        """Import the selected backend and, for EasyOCR, build the reader."""
        if self.backend == "auto":
            try:
                import easyocr
                self.reader = easyocr.Reader(['en', 'hi'])
                self.backend = "easyocr"
            except ImportError:
                try:
                    import pytesseract  # noqa: F401 -- availability check only
                    self.backend = "tesseract"
                except ImportError:
                    raise ImportError("No OCR backend available. Install easyocr or pytesseract")
        elif self.backend == "easyocr":
            import easyocr
            self.reader = easyocr.Reader(['en', 'hi'])
        elif self.backend == "tesseract":
            import pytesseract  # noqa: F401 -- fail at construction, not at first use
        else:
            # An unrecognised name would otherwise make every extraction
            # silently return an empty string.
            raise ValueError(
                f"Unknown OCR backend {self.backend!r}; expected 'easyocr', 'tesseract', or 'auto'"
            )

    @staticmethod
    def _join_easyocr_results(results) -> str:
        """Join the text element (index 1) of each EasyOCR result, one per line."""
        return "\n".join(result[1] for result in results)

    def extract_text(self, image_path: Path) -> str:
        """
        Extract text from image.

        Args:
            image_path: Path to image file
        Returns:
            Extracted text
        """
        if self.backend == "easyocr":
            return self._join_easyocr_results(self.reader.readtext(str(image_path)))
        if self.backend == "tesseract":
            import pytesseract
            from PIL import Image
            # Context manager closes the underlying file once OCR is done.
            with Image.open(image_path) as image:
                return pytesseract.image_to_string(image)
        return ""

    def extract_text_from_bytes(self, image_bytes: bytes) -> str:
        """
        Extract text from image bytes.

        Args:
            image_bytes: Image content as bytes
        Returns:
            Extracted text
        """
        if self.backend == "easyocr":
            import numpy as np
            from PIL import Image
            with Image.open(io.BytesIO(image_bytes)) as image:
                # Normalise to RGB so palette, bilevel or CMYK images are not
                # handed to the reader as raw index/4-channel arrays.
                image_array = np.array(image.convert("RGB"))
            return self._join_easyocr_results(self.reader.readtext(image_array))
        if self.backend == "tesseract":
            import pytesseract
            from PIL import Image
            with Image.open(io.BytesIO(image_bytes)) as image:
                return pytesseract.image_to_string(image)
        return ""
+# ============================================================================
+# UTILITY FUNCTIONS
+# ============================================================================
def parse_pdf(file_path: str) -> List[Dict]:
    """
    Parse a statement PDF in one call.

    Builds a BankStatementParser, parses the file and converts the
    resulting transactions to plain dictionaries.

    Args:
        file_path: Path to PDF file
    Returns:
        List of transaction dictionaries
    """
    statement_parser = BankStatementParser()
    parsed = statement_parser.parse_file(Path(file_path))
    return statement_parser.to_dict_list(parsed)
def parse_image(file_path: str) -> str:
    """
    Run OCR on an image file in one call.

    Builds an ImageOCRParser with its default backend selection and
    returns the text it extracts.

    Args:
        file_path: Path to image file
    Returns:
        Extracted text
    """
    image_path = Path(file_path)
    return ImageOCRParser().extract_text(image_path)
+# ============================================================================
+# MAIN
+# ============================================================================
if __name__ == "__main__":
    # Command-line entry point: a .pdf argument is parsed as a statement,
    # anything else is treated as an image and run through OCR.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python pdf_parser.py <file.pdf>")
        sys.exit(1)

    file_path = sys.argv[1]
    try:
        # Compare the extension case-insensitively so "STATEMENT.PDF" is
        # not sent to the OCR path.
        if file_path.lower().endswith('.pdf'):
            transactions = parse_pdf(file_path)
            print(f"Found {len(transactions)} transactions:")
            # Only the first ten rows are shown, descriptions cut to 40 chars.
            for t in transactions[:10]:
                print(f"  {t['date']}: {t['type']} ₹{t['amount']:,.2f} - {t['description'][:40]}")
        else:
            text = parse_image(file_path)
            print("Extracted text:")
            print(text)
    except ImportError as e:
        # A missing optional dependency is reported and signalled through
        # the exit status instead of exiting 0.
        print(f"Error: {e}")
        sys.exit(1)