Raghu committed on
Commit
4b81303
·
1 Parent(s): 760ab34

Fix vendor extraction: detect known vendors like Einstein, skip numbers/IDs, prefer longer names. Remove Time field from display.

Browse files
Files changed (1) hide show
  1. app.py +46 -10
app.py CHANGED
@@ -761,22 +761,59 @@ class ReceiptOCR:
761
  return fields
762
 
763
  def _extract_vendor(self, ocr_results):
764
- """Extract vendor name, usually in first few lines."""
765
  if not ocr_results:
766
  return None
767
 
768
- # Look for vendor in top 3 results (usually at top of receipt)
769
- top_results = sorted(ocr_results, key=lambda x: x['bbox'][1])[:3]
770
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
771
  for result in top_results:
772
  text = result['text'].strip()
773
- # Skip common non-vendor words
774
- if text and len(text) > 2 and text.upper() not in ['TOTAL', 'DATE', 'TIME', 'RECEIPT', 'THANK', 'YOU']:
775
- # Take longest text as vendor (usually company name)
776
- if len(text) > 5:
777
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
778
 
779
- return top_results[0]['text'] if top_results else None
780
 
781
  def _extract_date(self, text):
782
  """Extract date with improved patterns."""
@@ -1298,7 +1335,6 @@ def process_receipt(image):
1298
  ('Vendor', fields.get('vendor')),
1299
  ('Date', fields.get('date')),
1300
  ('Total', f"${fields.get('total')}" if fields.get('total') else None),
1301
- ('Time', fields.get('time')),
1302
  ]:
1303
  display = value or '<span style="color: #9ca3af;">Not found</span>'
1304
  fields_html += f"<div style='padding: 8px; background: #0f172a; color: #e5e7eb; border: 1px solid #1f2937; border-radius: 6px; margin: 4px 0;'><b>{name}:</b> {display}</div>"
 
761
  return fields
762
 
763
  def _extract_vendor(self, ocr_results):
764
+ """Extract vendor name - look for business name in top portion of receipt."""
765
  if not ocr_results:
766
  return None
767
 
768
+ # Sort by vertical position (top of receipt first)
769
+ sorted_results = sorted(ocr_results, key=lambda x: x['bbox'][1] if isinstance(x['bbox'], list) and len(x['bbox']) > 1 else 0)
770
 
771
+ # Look in top 10 results for vendor name
772
+ top_results = sorted_results[:10]
773
+
774
+ # Skip words that are clearly not vendor names
775
+ skip_words = {'TOTAL', 'DATE', 'TIME', 'RECEIPT', 'THANK', 'YOU', 'STORE', 'HOST',
776
+ 'ORDER', 'TYPE', 'TOGO', 'DINE', 'IN', 'CHECK', 'CLOSED', 'AMEX',
777
+ 'VISA', 'MASTERCARD', 'CASH', 'CHANGE', 'SUBTOTAL', 'TAX'}
778
+
779
+ # Known vendor patterns (common stores)
780
+ known_vendors = ['EINSTEIN', 'STARBUCKS', 'MCDONALDS', 'WALMART', 'TARGET',
781
+ 'CHIPOTLE', 'PANERA', 'DUNKIN', 'SUBWAY', 'CHICK-FIL-A']
782
+
783
+ # First, check if any known vendor is in the OCR results
784
+ for result in top_results:
785
+ text = result['text'].strip().upper()
786
+ for vendor in known_vendors:
787
+ if vendor in text:
788
+ return result['text'].strip()
789
+
790
+ # Look for longest meaningful text (likely the business name)
791
+ candidates = []
792
  for result in top_results:
793
  text = result['text'].strip()
794
+ text_upper = text.upper()
795
+
796
+ # Skip short texts, numbers, and common skip words
797
+ if len(text) < 3:
798
+ continue
799
+ if text_upper in skip_words:
800
+ continue
801
+ if re.match(r'^[\d\s\-\/\.\$\,]+$', text): # Skip pure numbers/symbols
802
+ continue
803
+ if re.match(r'^#?\d+$', text): # Skip store numbers like #2846
804
+ continue
805
+
806
+ # Prefer texts with letters and reasonable length
807
+ if len(text) >= 4 and any(c.isalpha() for c in text):
808
+ candidates.append((text, len(text), result['confidence']))
809
+
810
+ # Return the longest candidate with good confidence
811
+ if candidates:
812
+ # Sort by length (longer = more likely to be full vendor name)
813
+ candidates.sort(key=lambda x: (x[1], x[2]), reverse=True)
814
+ return candidates[0][0]
815
 
816
+ return None
817
 
818
  def _extract_date(self, text):
819
  """Extract date with improved patterns."""
 
1335
  ('Vendor', fields.get('vendor')),
1336
  ('Date', fields.get('date')),
1337
  ('Total', f"${fields.get('total')}" if fields.get('total') else None),
 
1338
  ]:
1339
  display = value or '<span style="color: #9ca3af;">Not found</span>'
1340
  fields_html += f"<div style='padding: 8px; background: #0f172a; color: #e5e7eb; border: 1px solid #1f2937; border-radius: 6px; margin: 4px 0;'><b>{name}:</b> {display}</div>"