Ranjit0034 committed on
Commit
f60e9c2
·
verified ·
1 Parent(s): e8fe888

Upload src/finee/pdf_parser.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/finee/pdf_parser.py +413 -0
src/finee/pdf_parser.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF Parser for Bank Statements
3
+ ==============================
4
+
5
+ Extract transactions from Indian bank statement PDFs.
6
+
7
+ Supports:
8
+ - HDFC Bank statements
9
+ - ICICI Bank statements
10
+ - SBI Bank statements
11
+ - Axis Bank statements
12
+ - And more...
13
+
14
+ Author: Ranjit Behera
15
+ """
16
+
17
+ import re
18
+ from pathlib import Path
19
+ from typing import List, Dict, Optional, Tuple
20
+ from dataclasses import dataclass
21
+ from datetime import datetime
22
+ import io
23
+
24
+
25
@dataclass
class PDFTransaction:
    """A single transaction row parsed from a bank statement.

    ``date`` holds the date text exactly as matched in the statement; it is
    not normalised or converted to a ``datetime``.
    """

    date: str
    description: str
    amount: float
    type: str  # "debit" or "credit"
    balance: Optional[float] = None  # balance column of the row, when matched
    reference: Optional[str] = None  # Ref/UTR/IMPS/NEFT id found in description


class BankStatementParser:
    """
    Parse bank statement PDFs and extract transactions.

    Uses pdfplumber for text extraction and regex for parsing. pdfplumber is
    imported lazily, so ``parse_text`` works without it; ``parse_file`` and
    ``parse_bytes`` raise ``ImportError`` when it is missing.
    """

    # Bank-specific patterns.
    #
    # Transaction patterns use *named* groups (date, description, amount,
    # optional marker, optional balance). Group positions are unchanged from
    # the earlier unnamed form, but naming them lets the parser tell a Dr/Cr
    # marker apart from a balance column for banks that print no marker.
    BANK_PATTERNS = {
        "hdfc": {
            "header": r"HDFC\s+BANK",
            "date": r"(\d{2}/\d{2}/\d{2,4})",
            "transaction": (
                r"(?P<date>\d{2}/\d{2}/\d{2,4})\s+(?P<description>.+?)\s+"
                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<marker>[DC]r)?\s*"
                r"(?P<balance>[\d,]+\.\d{2})?"
            ),
        },
        "icici": {
            "header": r"ICICI\s+BANK",
            "date": r"(\d{2}-\w{3}-\d{2,4})",
            "transaction": (
                r"(?P<date>\d{2}-\w{3}-\d{2,4})\s+(?P<description>.+?)\s+"
                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<marker>Dr|Cr)?\s*"
                r"(?P<balance>[\d,]+\.\d{2})?"
            ),
        },
        "sbi": {
            "header": r"State\s+Bank\s+of\s+India",
            "date": r"(\d{2}\s+\w{3}\s+\d{2,4})",
            # Year width matches the "date" pattern above (2 to 4 digits).
            "transaction": (
                r"(?P<date>\d{2}\s+\w{3}\s+\d{2,4})\s+(?P<description>.+?)\s+"
                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<balance>[\d,]+\.\d{2})?"
            ),
        },
        "axis": {
            "header": r"AXIS\s+BANK",
            "date": r"(\d{2}-\d{2}-\d{2,4})",
            "transaction": (
                r"(?P<date>\d{2}-\d{2}-\d{2,4})\s+(?P<description>.+?)\s+"
                r"(?P<amount>[\d,]+\.\d{2})\s*(?P<balance>[\d,]+\.\d{2})?"
            ),
        },
    }

    # Fallback for statements whose bank header is not recognised:
    # date, description, amount.
    _GENERIC_PATTERN = r"(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})\s+(.+?)\s+([\d,]+\.\d{2})"

    # Reference-number patterns, tried in order. Matching is case-insensitive
    # because statement text is frequently upper-case ("REF:", "Utr").
    # The alphanumeric ones must contain a digit so that an ordinary word
    # following the keyword (e.g. "NEFT TRANSFERRED") is not taken as an id.
    _REFERENCE_PATTERNS = tuple(
        re.compile(expr, re.IGNORECASE)
        for expr in (
            r"ref[.:# ]*(\d{10,18})",
            r"UTR[.:# ]*(?=\w*\d)(\w{12,22})",
            r"IMPS[.:# ]*(\d{12})",
            r"NEFT[.:# ]*(?=\w*\d)(\w{10,16})",
        )
    )

    _CREDIT_KEYWORDS = ("salary", "credited", "received", "refund", "cashback", "interest")

    def __init__(self):
        self.pdfplumber = None
        self._check_dependencies()

    def _check_dependencies(self):
        """Import pdfplumber if available; leave ``self.pdfplumber`` as None otherwise."""
        try:
            import pdfplumber
            self.pdfplumber = pdfplumber
        except ImportError:
            self.pdfplumber = None

    def _extract_pdf_text(self, source) -> str:
        """
        Return the text of every page of a PDF, one page per line group.

        Args:
            source: Anything ``pdfplumber.open`` accepts (a path or a
                binary file-like object).

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        if self.pdfplumber is None:
            raise ImportError("pdfplumber is required. Install with: pip install pdfplumber")

        with self.pdfplumber.open(source) as pdf:
            # Join with a newline: gluing pages together directly would fuse
            # the last row of one page with the first row of the next.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    def parse_file(self, file_path: Path) -> List[PDFTransaction]:
        """
        Parse a PDF file and extract transactions.

        Args:
            file_path: Path to PDF file

        Returns:
            List of extracted transactions

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        return self.parse_text(self._extract_pdf_text(file_path))

    def parse_bytes(self, pdf_bytes: bytes) -> List[PDFTransaction]:
        """
        Parse PDF from bytes.

        Args:
            pdf_bytes: PDF file content as bytes

        Returns:
            List of extracted transactions

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        return self.parse_text(self._extract_pdf_text(io.BytesIO(pdf_bytes)))

    def parse_text(self, text: str) -> List[PDFTransaction]:
        """
        Parse extracted text and identify transactions.

        The bank is detected from its header; a bank-specific pattern is used
        when one is recognised, otherwise a generic date/description/amount
        pattern.

        Args:
            text: Extracted text from PDF

        Returns:
            List of transactions (empty for empty input)
        """
        if not text:
            return []

        bank = self._detect_bank(text)
        if bank is None:
            return self._parse_generic(text)
        return self._parse_with_pattern(text, bank)

    def _detect_bank(self, text: str) -> Optional[str]:
        """Return the ``BANK_PATTERNS`` key whose header occurs in *text*, or None."""
        for bank, patterns in self.BANK_PATTERNS.items():
            if re.search(patterns["header"], text, re.IGNORECASE):
                return bank
        return None

    @staticmethod
    def _to_amount(raw: str) -> float:
        """Convert an amount such as ``"1,23,456.78"`` to a float."""
        return float(raw.replace(',', ''))

    @staticmethod
    def _match_fields(match) -> Dict:
        """
        Map a transaction match to a dict of named fields.

        Patterns with named groups are used as-is. A pattern without named
        groups is read positionally in the order
        date, description, amount, marker, balance.
        """
        named = match.groupdict()
        if named:
            return named
        keys = ("date", "description", "amount", "marker", "balance")
        return dict(zip(keys, match.groups()))

    def _parse_with_pattern(self, text: str, bank: str) -> List[PDFTransaction]:
        """
        Parse using the bank-specific transaction pattern.

        Args:
            text: Statement text.
            bank: Key into ``BANK_PATTERNS``.

        Returns:
            Transactions in order of appearance. Rows whose numbers cannot be
            converted are skipped.
        """
        pattern = self.BANK_PATTERNS[bank]["transaction"]
        transactions = []

        for match in re.finditer(pattern, text):
            fields = self._match_fields(match)
            try:
                description = fields["description"].strip()
                amount = self._to_amount(fields["amount"])
                raw_balance = fields.get("balance")
                balance = self._to_amount(raw_balance) if raw_balance else None
            except (KeyError, ValueError):
                # Malformed row (or a custom pattern missing a required
                # group): skip it rather than abort the whole statement.
                continue

            marker = (fields.get("marker") or "").upper()
            if marker in ("CR", "C"):
                txn_type = "credit"
            elif marker in ("DR", "D"):
                txn_type = "debit"
            else:
                # No explicit Dr/Cr marker on this row: fall back to the
                # description keywords (which default to "debit").
                txn_type = self._infer_type(description)

            transactions.append(PDFTransaction(
                date=fields["date"],
                description=description,
                amount=amount,
                type=txn_type,
                balance=balance,
                reference=self._extract_reference(description),
            ))

        return transactions

    def _parse_generic(self, text: str) -> List[PDFTransaction]:
        """
        Generic parsing for unknown bank formats.

        Matches ``date description amount`` rows; the type is inferred from
        the description and no balance is extracted.
        """
        transactions = []

        for match in re.finditer(self._GENERIC_PATTERN, text):
            description = match.group(2).strip()
            try:
                amount = self._to_amount(match.group(3))
            except ValueError:
                continue

            transactions.append(PDFTransaction(
                date=match.group(1),
                description=description,
                amount=amount,
                type=self._infer_type(description),
                reference=self._extract_reference(description),
            ))

        return transactions

    def _extract_reference(self, description: str) -> Optional[str]:
        """
        Extract a reference number from a description.

        Looks, in order, for a Ref, UTR, IMPS or NEFT id (case-insensitive)
        and returns the first one found, or None.
        """
        for pattern in self._REFERENCE_PATTERNS:
            match = pattern.search(description)
            if match:
                return match.group(1)
        return None

    def _infer_type(self, description: str) -> str:
        """
        Infer transaction type from description keywords.

        Returns "credit" when any credit keyword occurs in the description
        (case-insensitive substring match); everything else is "debit".
        """
        lowered = description.lower()
        if any(keyword in lowered for keyword in self._CREDIT_KEYWORDS):
            return "credit"
        return "debit"  # Default to debit

    def to_dict_list(self, transactions: List[PDFTransaction]) -> List[Dict]:
        """Convert transactions to a list of plain dictionaries."""
        return [
            {
                "date": t.date,
                "description": t.description,
                "amount": t.amount,
                "type": t.type,
                "balance": t.balance,
                "reference": t.reference,
            }
            for t in transactions
        ]
260
+
261
+
262
class ImageOCRParser:
    """
    Parse transaction screenshots using OCR.

    Uses EasyOCR or pytesseract for text extraction. Both are imported
    lazily, so only the selected backend needs to be installed.
    """

    _BACKENDS = ("auto", "easyocr", "tesseract")
    _EASYOCR_LANGUAGES = ['en', 'hi']

    def __init__(self, backend: str = "auto"):
        """
        Initialize OCR parser.

        Args:
            backend: "easyocr", "tesseract", or "auto". With "auto", EasyOCR
                is tried first and pytesseract is the fallback.

        Raises:
            ValueError: If *backend* is not one of the supported names.
            ImportError: If the requested backend (or, for "auto", every
                backend) cannot be imported.
        """
        # Reject typos up front; otherwise the parser would be constructed
        # successfully and every extraction would silently return "".
        if backend not in self._BACKENDS:
            raise ValueError(
                f"Unknown OCR backend {backend!r}; expected one of {self._BACKENDS}"
            )

        self.backend = backend
        self.reader = None
        self._init_backend()

    def _create_easyocr_reader(self):
        """Import easyocr and build the reader used by the easyocr backend."""
        import easyocr
        return easyocr.Reader(self._EASYOCR_LANGUAGES)

    def _init_backend(self):
        """Initialize OCR backend, resolving "auto" to a concrete backend."""
        if self.backend == "auto":
            try:
                self.reader = self._create_easyocr_reader()
                self.backend = "easyocr"
            except ImportError:
                try:
                    import pytesseract  # noqa: F401  (availability check only)
                except ImportError as exc:
                    raise ImportError(
                        "No OCR backend available. Install easyocr or pytesseract"
                    ) from exc
                self.backend = "tesseract"

        elif self.backend == "easyocr":
            self.reader = self._create_easyocr_reader()

        elif self.backend == "tesseract":
            import pytesseract  # noqa: F401  (fail early if it is missing)

    @staticmethod
    def _join_easyocr_results(results) -> str:
        """Join the text element (index 1) of each easyocr result with newlines."""
        return "\n".join(item[1] for item in results)

    def extract_text(self, image_path: Path) -> str:
        """
        Extract text from image.

        Args:
            image_path: Path to image file

        Returns:
            Extracted text
        """
        if self.backend == "easyocr":
            return self._join_easyocr_results(self.reader.readtext(str(image_path)))

        if self.backend == "tesseract":
            import pytesseract
            from PIL import Image

            # Context manager so the underlying file handle is closed.
            with Image.open(image_path) as image:
                return pytesseract.image_to_string(image)

        return ""

    def extract_text_from_bytes(self, image_bytes: bytes) -> str:
        """
        Extract text from image bytes.

        Args:
            image_bytes: Image content as bytes

        Returns:
            Extracted text
        """
        if self.backend == "easyocr":
            import numpy as np
            from PIL import Image

            with Image.open(io.BytesIO(image_bytes)) as image:
                # Convert to RGB first so palette, CMYK or 1-bit images
                # become a plain height x width x 3 pixel array instead of
                # an array of palette indices or booleans.
                image_array = np.array(image.convert("RGB"))
            return self._join_easyocr_results(self.reader.readtext(image_array))

        if self.backend == "tesseract":
            import pytesseract
            from PIL import Image

            with Image.open(io.BytesIO(image_bytes)) as image:
                return pytesseract.image_to_string(image)

        return ""
351
+
352
+
353
+ # ============================================================================
354
+ # UTILITY FUNCTIONS
355
+ # ============================================================================
356
+
357
def parse_pdf(file_path: str) -> List[Dict]:
    """
    Parse a statement PDF in one call.

    Builds a ``BankStatementParser``, parses the file at *file_path* and
    converts the resulting transactions to dictionaries.

    Args:
        file_path: Path to PDF file

    Returns:
        List of transaction dictionaries
    """
    statement_parser = BankStatementParser()
    pdf_path = Path(file_path)
    return statement_parser.to_dict_list(statement_parser.parse_file(pdf_path))
370
+
371
+
372
def parse_image(file_path: str) -> str:
    """
    Run OCR on an image file in one call.

    Builds an ``ImageOCRParser`` with its default backend and extracts the
    text of the image at *file_path*.

    Args:
        file_path: Path to image file

    Returns:
        Extracted text
    """
    image_path = Path(file_path)
    ocr = ImageOCRParser()
    return ocr.extract_text(image_path)
384
+
385
+
386
+ # ============================================================================
387
+ # MAIN
388
+ # ============================================================================
389
+
390
def main(argv: List[str]) -> int:
    """
    Command-line entry point.

    A ``.pdf`` argument (any letter case) is parsed as a bank statement and
    up to ten transactions are printed; any other file is run through OCR
    and its text is printed.

    Args:
        argv: Argument vector in ``sys.argv`` form (program name first).

    Returns:
        Process exit status: 0 on success, 1 on a usage error, a missing
        dependency, or a file that cannot be read.
    """
    import sys

    if len(argv) < 2:
        print("Usage: python pdf_parser.py <file.pdf>")
        return 1

    file_path = argv[1]

    try:
        # Compare case-insensitively so "STATEMENT.PDF" is not sent to OCR.
        if file_path.lower().endswith('.pdf'):
            transactions = parse_pdf(file_path)
            print(f"Found {len(transactions)} transactions:")
            for t in transactions[:10]:
                print(f"  {t['date']}: {t['type']} ₹{t['amount']:,.2f} - {t['description'][:40]}")
        else:
            text = parse_image(file_path)
            print("Extracted text:")
            print(text)
    except (ImportError, OSError) as e:
        # Missing OCR/PDF dependency or unreadable file: report on stderr
        # and signal failure through the exit status.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    return 0


if __name__ == "__main__":
    import sys

    sys.exit(main(sys.argv))