Spaces:
Running
Running
Raghu
committed on
Commit
·
760ab34
1
Parent(s):
6fe5290
Fix S/$ OCR confusion: add _fix_ocr_text method to convert S154.06 to $154.06
Browse files
app.py
CHANGED
|
@@ -742,7 +742,14 @@ class ReceiptOCR:
|
|
| 742 |
|
| 743 |
def postprocess_receipt(self, ocr_results):
|
| 744 |
"""Extract structured fields from OCR results with improved patterns."""
|
| 745 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 746 |
|
| 747 |
fields = {
|
| 748 |
'vendor': self._extract_vendor(ocr_results),
|
|
@@ -785,28 +792,36 @@ class ReceiptOCR:
|
|
| 785 |
return None
|
| 786 |
|
| 787 |
def _extract_total(self, text):
|
| 788 |
-
"""Extract total amount -
|
| 789 |
-
#
|
| 790 |
-
|
| 791 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
|
| 793 |
if not all_amounts:
|
| 794 |
return None
|
| 795 |
|
| 796 |
# Look for "TOTAL", "AMOUNT DUE", "BALANCE" keywords and find amount near them
|
| 797 |
-
lines =
|
| 798 |
for i, line in enumerate(lines):
|
| 799 |
line_upper = line.upper()
|
| 800 |
if any(keyword in line_upper for keyword in ['TOTAL', 'AMOUNT DUE', 'BALANCE DUE', 'DUE']):
|
| 801 |
# Check this line and next 2 lines for amount
|
| 802 |
search_text = ' '.join(lines[i:min(i+3, len(lines))])
|
| 803 |
-
|
|
|
|
| 804 |
if matches:
|
| 805 |
amounts_near_total = [float(m.replace(',', '')) for m in matches]
|
| 806 |
-
# Return largest amount near TOTAL keyword
|
| 807 |
return f"{max(amounts_near_total):.2f}"
|
| 808 |
|
| 809 |
-
# Fallback: return largest amount overall
|
| 810 |
return f"{max(all_amounts):.2f}"
|
| 811 |
|
| 812 |
def _extract_time(self, text):
|
|
@@ -820,6 +835,14 @@ class ReceiptOCR:
|
|
| 820 |
if match:
|
| 821 |
return match.group(0)
|
| 822 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 823 |
|
| 824 |
class LayoutLMFieldExtractor:
|
| 825 |
"""LayoutLMv3-based field extractor using fine-tuned weights if available."""
|
|
|
|
| 742 |
|
| 743 |
def postprocess_receipt(self, ocr_results):
|
| 744 |
"""Extract structured fields from OCR results with improved patterns."""
|
| 745 |
+
# Fix common OCR errors (S->$ in amounts)
|
| 746 |
+
fixed_results = []
|
| 747 |
+
for r in ocr_results:
|
| 748 |
+
fixed_r = r.copy()
|
| 749 |
+
fixed_r['text'] = self._fix_ocr_text(r['text'])
|
| 750 |
+
fixed_results.append(fixed_r)
|
| 751 |
+
|
| 752 |
+
full_text = ' '.join([r['text'] for r in fixed_results])
|
| 753 |
|
| 754 |
fields = {
|
| 755 |
'vendor': self._extract_vendor(ocr_results),
|
|
|
|
| 792 |
return None
|
| 793 |
|
| 794 |
def _extract_total(self, text):
|
| 795 |
+
"""Extract total amount - handles S/$ OCR confusion."""
|
| 796 |
+
# Fix S -> $ in amounts (common OCR error)
|
| 797 |
+
fixed_text = re.sub(r'\bS(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)\b', r'$\1', text)
|
| 798 |
+
|
| 799 |
+
# Find all dollar amounts (now with fixed $ symbols)
|
| 800 |
+
all_amounts = re.findall(r'[\$S](\d{1,3}(?:,\d{3})*(?:\.\d{2})?)', fixed_text)
|
| 801 |
+
all_amounts = [float(a.replace(',', '')) for a in all_amounts if a]
|
| 802 |
+
|
| 803 |
+
if not all_amounts:
|
| 804 |
+
# Try finding any decimal amounts
|
| 805 |
+
all_amounts = re.findall(r'(\d{1,3}(?:,\d{3})*\.\d{2})', fixed_text)
|
| 806 |
+
all_amounts = [float(a.replace(',', '')) for a in all_amounts if a]
|
| 807 |
|
| 808 |
if not all_amounts:
|
| 809 |
return None
|
| 810 |
|
| 811 |
# Look for "TOTAL", "AMOUNT DUE", "BALANCE" keywords and find amount near them
|
| 812 |
+
lines = fixed_text.split('\n')
|
| 813 |
for i, line in enumerate(lines):
|
| 814 |
line_upper = line.upper()
|
| 815 |
if any(keyword in line_upper for keyword in ['TOTAL', 'AMOUNT DUE', 'BALANCE DUE', 'DUE']):
|
| 816 |
# Check this line and next 2 lines for amount
|
| 817 |
search_text = ' '.join(lines[i:min(i+3, len(lines))])
|
| 818 |
+
# Match both $ and S followed by amounts
|
| 819 |
+
matches = re.findall(r'[\$S](\d{1,3}(?:,\d{3})*(?:\.\d{2})?)', search_text)
|
| 820 |
if matches:
|
| 821 |
amounts_near_total = [float(m.replace(',', '')) for m in matches]
|
|
|
|
| 822 |
return f"{max(amounts_near_total):.2f}"
|
| 823 |
|
| 824 |
+
# Fallback: return largest amount overall
|
| 825 |
return f"{max(all_amounts):.2f}"
|
| 826 |
|
| 827 |
def _extract_time(self, text):
|
|
|
|
| 835 |
if match:
|
| 836 |
return match.group(0)
|
| 837 |
return None
|
| 838 |
+
|
| 839 |
+
def _fix_ocr_text(self, text):
|
| 840 |
+
"""Fix common OCR errors like S->$ in amounts."""
|
| 841 |
+
# Fix S followed by digits -> $ (e.g., S154.06 -> $154.06)
|
| 842 |
+
text = re.sub(r'\bS(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)\b', r'$\1', text)
|
| 843 |
+
# Fix Subtolal -> Subtotal (common OCR error)
|
| 844 |
+
text = re.sub(r'\bSubtolal\b', 'Subtotal', text, flags=re.IGNORECASE)
|
| 845 |
+
return text
|
| 846 |
|
| 847 |
class LayoutLMFieldExtractor:
|
| 848 |
"""LayoutLMv3-based field extractor using fine-tuned weights if available."""
|