GSoumyajit2005 committed on
Commit
b99270c
·
1 Parent(s): 5d04abb

Updated Extraction Logic for more Robustness

Browse files
Files changed (1) hide show
  1. src/extraction.py +47 -18
src/extraction.py CHANGED
@@ -2,18 +2,33 @@
2
 
3
  import re
4
  from typing import List, Dict, Optional, Any
 
5
 
6
  def extract_dates(text: str) -> List[str]:
 
 
 
 
7
  if not text: return []
8
- dates = []
9
- # DD/MM/YYYY or DD-MM-YYYY
10
- pattern1 = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'
11
- # YYYY-MM-DD
12
- pattern2 = r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b'
13
 
14
- dates.extend(re.findall(pattern1, text))
15
- dates.extend(re.findall(pattern2, text))
16
- return list(dict.fromkeys(dates))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def extract_amounts(text: str) -> List[float]:
19
  if not text: return []
@@ -32,11 +47,11 @@ def extract_amounts(text: str) -> List[float]:
32
 
33
  def extract_total(text: str) -> Optional[float]:
34
  """
35
- Robust total extraction looking for keywords + largest number context.
36
  """
37
  if not text: return None
38
 
39
- # 1. Try specific "Total" keywords first
40
  # Looks for "Total: 123.45" or "Total Amount $123.45"
41
  pattern = r'(?:TOTAL|AMOUNT DUE|GRAND TOTAL|BALANCE|PAYABLE)[\w\s]*[:$]?\s*([\d,]+\.\d{2})'
42
  matches = re.findall(pattern, text, re.IGNORECASE)
@@ -48,11 +63,25 @@ def extract_total(text: str) -> Optional[float]:
48
  except ValueError:
49
  pass
50
 
51
- # 2. Fallback: Find the largest monetary value in the bottom half of text
52
- # (Risky, but better than None)
53
- amounts = extract_amounts(text)
54
- if amounts:
55
- return max(amounts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  return None
58
 
@@ -74,13 +103,13 @@ def extract_vendor(text: str) -> Optional[str]:
74
 
75
  def extract_invoice_number(text: str) -> Optional[str]:
76
  """
77
- Improved regex that handles alphanumeric AND numeric IDs.
78
  """
79
  if not text: return None
80
 
81
  # Strategy 1: Look for "Invoice No: XXXXX" pattern
82
- # Matches: "Invoice No: 12345", "Inv #: AB-123", "Bill No. 999"
83
- keyword_pattern = r'(?:INVOICE|BILL|RECEIPT)\s*(?:NO|NUMBER|#|NUM)?[\s\.:-]*([A-Z0-9\-/]{3,})'
84
  match = re.search(keyword_pattern, text, re.IGNORECASE)
85
  if match:
86
  return match.group(1)
 
2
 
3
  import re
4
  from typing import List, Dict, Optional, Any
5
+ from datetime import datetime
6
 
7
  def extract_dates(text: str) -> List[str]:
8
+ """
9
+ Robust date extraction that handles noisy OCR separators (spaces, pipes, dots)
10
+ and validates using datetime to ensure semantic correctness.
11
+ """
12
  if not text: return []
 
 
 
 
 
13
 
14
+ # Matches DD/MM/YYYY, DD-MM-YYYY, DD.MM.YYYY, DD MM YYYY
15
+ # Also handles OCR noise like pipes (|) instead of slashes
16
+ pattern = r'\b(\d{1,2})[\s/|.-](\d{1,2})[\s/|.-](\d{2,4})\b'
17
+ matches = re.findall(pattern, text)
18
+
19
+ valid_dates = []
20
+ for d, m, y in matches:
21
+ try:
22
+ # Try to parse it to check if it's a real date
23
+ # This filters out "99/99/2000" or random phone numbers like 12 34 5678
24
+ # Assuming Day-Month-Year format which is common in SROIE/International
25
+ # For US format, you might swap d and m
26
+ dt = datetime(int(y), int(m), int(d))
27
+ valid_dates.append(dt.strftime("%d/%m/%Y"))
28
+ except ValueError:
29
+ continue # Invalid date logic (e.g. Month 13 or Day 32)
30
+
31
+ return list(dict.fromkeys(valid_dates)) # Deduplicate
32
 
33
  def extract_amounts(text: str) -> List[float]:
34
  if not text: return []
 
47
 
48
  def extract_total(text: str) -> Optional[float]:
49
  """
50
+ Robust total extraction using keyword confidence + Footer Search.
51
  """
52
  if not text: return None
53
 
54
+ # 1. Try specific "Total" keywords first (Highest Confidence)
55
  # Looks for "Total: 123.45" or "Total Amount $123.45"
56
  pattern = r'(?:TOTAL|AMOUNT DUE|GRAND TOTAL|BALANCE|PAYABLE)[\w\s]*[:$]?\s*([\d,]+\.\d{2})'
57
  matches = re.findall(pattern, text, re.IGNORECASE)
 
63
  except ValueError:
64
  pass
65
 
66
+ # 2. Fallback: Context-Aware Footer Search (Medium Confidence)
67
+ # Instead of taking max() of the whole doc (risky), we only look at the bottom 30%
68
+ lines = text.split('\n')
69
+ if not lines: return None
70
+
71
+ # Focus on the footer where totals usually live
72
+ footer_lines = lines[-int(len(lines)*0.3):]
73
+
74
+ candidates = []
75
+ for line in footer_lines:
76
+ line_amounts = extract_amounts(line)
77
+ for amt in line_amounts:
78
+ # Simple heuristic: Totals are rarely 'years' like 2024 or 2025
79
+ if 2000 <= amt <= 2030 and float(amt).is_integer():
80
+ continue
81
+ candidates.append(amt)
82
+
83
+ if candidates:
84
+ return max(candidates)
85
 
86
  return None
87
 
 
103
 
104
  def extract_invoice_number(text: str) -> Optional[str]:
105
  """
106
+ Improved regex that handles alphanumeric AND numeric IDs, plus variations like "Tax Inv".
107
  """
108
  if not text: return None
109
 
110
  # Strategy 1: Look for "Invoice No: XXXXX" pattern
111
+ # UPDATED: Handles "Tax Invoice", "Inv No", and standard variations
112
+ keyword_pattern = r'(?:TAX\s*)?(?:INVOICE|INV|BILL|RECEIPT)\s*(?:NO|NUMBER|#|NUM)?[\s\.:-]*([A-Z0-9\-/]{3,})'
113
  match = re.search(keyword_pattern, text, re.IGNORECASE)
114
  if match:
115
  return match.group(1)