Raghu committed on
Commit
760ab34
·
1 Parent(s): 6fe5290

Fix S/$ OCR confusion: add _fix_ocr_text method to convert S154.06 to $154.06

Browse files
Files changed (1) hide show
  1. app.py +32 -9
app.py CHANGED
@@ -742,7 +742,14 @@ class ReceiptOCR:
742
 
743
  def postprocess_receipt(self, ocr_results):
744
  """Extract structured fields from OCR results with improved patterns."""
745
- full_text = ' '.join([r['text'] for r in ocr_results])
 
 
 
 
 
 
 
746
 
747
  fields = {
748
  'vendor': self._extract_vendor(ocr_results),
@@ -785,28 +792,36 @@ class ReceiptOCR:
785
  return None
786
 
787
  def _extract_total(self, text):
788
- """Extract total amount - improved to find largest amount near TOTAL keyword."""
789
- # First, find all dollar amounts in the text
790
- all_amounts = re.findall(r'\$(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)', text)
791
- all_amounts = [float(a.replace(',', '')) for a in all_amounts]
 
 
 
 
 
 
 
 
792
 
793
  if not all_amounts:
794
  return None
795
 
796
  # Look for "TOTAL", "AMOUNT DUE", "BALANCE" keywords and find amount near them
797
- lines = text.split('\n')
798
  for i, line in enumerate(lines):
799
  line_upper = line.upper()
800
  if any(keyword in line_upper for keyword in ['TOTAL', 'AMOUNT DUE', 'BALANCE DUE', 'DUE']):
801
  # Check this line and next 2 lines for amount
802
  search_text = ' '.join(lines[i:min(i+3, len(lines))])
803
- matches = re.findall(r'\$(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)', search_text)
 
804
  if matches:
805
  amounts_near_total = [float(m.replace(',', '')) for m in matches]
806
- # Return largest amount near TOTAL keyword
807
  return f"{max(amounts_near_total):.2f}"
808
 
809
- # Fallback: return largest amount overall (usually the total)
810
  return f"{max(all_amounts):.2f}"
811
 
812
  def _extract_time(self, text):
@@ -820,6 +835,14 @@ class ReceiptOCR:
820
  if match:
821
  return match.group(0)
822
  return None
 
 
 
 
 
 
 
 
823
 
824
  class LayoutLMFieldExtractor:
825
  """LayoutLMv3-based field extractor using fine-tuned weights if available."""
 
742
 
743
  def postprocess_receipt(self, ocr_results):
744
  """Extract structured fields from OCR results with improved patterns."""
745
+ # Fix common OCR errors (S->$ in amounts)
746
+ fixed_results = []
747
+ for r in ocr_results:
748
+ fixed_r = r.copy()
749
+ fixed_r['text'] = self._fix_ocr_text(r['text'])
750
+ fixed_results.append(fixed_r)
751
+
752
+ full_text = ' '.join([r['text'] for r in fixed_results])
753
 
754
  fields = {
755
  'vendor': self._extract_vendor(ocr_results),
 
792
  return None
793
 
794
  def _extract_total(self, text):
795
+ """Extract total amount - handles S/$ OCR confusion."""
796
+ # Fix S -> $ in amounts (common OCR error)
797
+ fixed_text = re.sub(r'\bS(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)\b', r'$\1', text)
798
+
799
+ # Find all dollar amounts (now with fixed $ symbols)
800
+ all_amounts = re.findall(r'[\$S](\d{1,3}(?:,\d{3})*(?:\.\d{2})?)', fixed_text)
801
+ all_amounts = [float(a.replace(',', '')) for a in all_amounts if a]
802
+
803
+ if not all_amounts:
804
+ # Try finding any decimal amounts
805
+ all_amounts = re.findall(r'(\d{1,3}(?:,\d{3})*\.\d{2})', fixed_text)
806
+ all_amounts = [float(a.replace(',', '')) for a in all_amounts if a]
807
 
808
  if not all_amounts:
809
  return None
810
 
811
  # Look for "TOTAL", "AMOUNT DUE", "BALANCE" keywords and find amount near them
812
+ lines = fixed_text.split('\n')
813
  for i, line in enumerate(lines):
814
  line_upper = line.upper()
815
  if any(keyword in line_upper for keyword in ['TOTAL', 'AMOUNT DUE', 'BALANCE DUE', 'DUE']):
816
  # Check this line and next 2 lines for amount
817
  search_text = ' '.join(lines[i:min(i+3, len(lines))])
818
+ # Match both $ and S followed by amounts
819
+ matches = re.findall(r'[\$S](\d{1,3}(?:,\d{3})*(?:\.\d{2})?)', search_text)
820
  if matches:
821
  amounts_near_total = [float(m.replace(',', '')) for m in matches]
 
822
  return f"{max(amounts_near_total):.2f}"
823
 
824
+ # Fallback: return largest amount overall
825
  return f"{max(all_amounts):.2f}"
826
 
827
  def _extract_time(self, text):
 
835
  if match:
836
  return match.group(0)
837
  return None
838
+
839
+ def _fix_ocr_text(self, text):
840
+ """Fix common OCR errors like S->$ in amounts."""
841
+ # Fix S followed by digits -> $ (e.g., S154.06 -> $154.06)
842
+ text = re.sub(r'\bS(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)\b', r'$\1', text)
843
+ # Fix Subtolal -> Subtotal (common OCR error)
844
+ text = re.sub(r'\bSubtolal\b', 'Subtotal', text, flags=re.IGNORECASE)
845
+ return text
846
 
847
  class LayoutLMFieldExtractor:
848
  """LayoutLMv3-based field extractor using fine-tuned weights if available."""