Spaces:
Sleeping
Sleeping
Raghu committed on
Commit ·
4b81303
1
Parent(s): 760ab34
Fix vendor extraction: detect known vendors like Einstein, skip numbers/IDs, prefer longer names. Remove Time field from display.
Browse files
app.py
CHANGED
|
@@ -761,22 +761,59 @@ class ReceiptOCR:
|
|
| 761 |
return fields
|
| 762 |
|
| 763 |
def _extract_vendor(self, ocr_results):
|
| 764 |
-
"""Extract vendor name
|
| 765 |
if not ocr_results:
|
| 766 |
return None
|
| 767 |
|
| 768 |
-
#
|
| 769 |
-
|
| 770 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 771 |
for result in top_results:
|
| 772 |
text = result['text'].strip()
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 778 |
|
| 779 |
-
return
|
| 780 |
|
| 781 |
def _extract_date(self, text):
|
| 782 |
"""Extract date with improved patterns."""
|
|
@@ -1298,7 +1335,6 @@ def process_receipt(image):
|
|
| 1298 |
('Vendor', fields.get('vendor')),
|
| 1299 |
('Date', fields.get('date')),
|
| 1300 |
('Total', f"${fields.get('total')}" if fields.get('total') else None),
|
| 1301 |
-
('Time', fields.get('time')),
|
| 1302 |
]:
|
| 1303 |
display = value or '<span style="color: #9ca3af;">Not found</span>'
|
| 1304 |
fields_html += f"<div style='padding: 8px; background: #0f172a; color: #e5e7eb; border: 1px solid #1f2937; border-radius: 6px; margin: 4px 0;'><b>{name}:</b> {display}</div>"
|
|
|
|
| 761 |
return fields
|
| 762 |
|
| 763 |
def _extract_vendor(self, ocr_results):
|
| 764 |
+
"""Extract vendor name - look for business name in top portion of receipt."""
|
| 765 |
if not ocr_results:
|
| 766 |
return None
|
| 767 |
|
| 768 |
+
# Sort by vertical position (top of receipt first)
|
| 769 |
+
sorted_results = sorted(ocr_results, key=lambda x: x['bbox'][1] if isinstance(x['bbox'], list) and len(x['bbox']) > 1 else 0)
|
| 770 |
|
| 771 |
+
# Look in top 10 results for vendor name
|
| 772 |
+
top_results = sorted_results[:10]
|
| 773 |
+
|
| 774 |
+
# Skip words that are clearly not vendor names
|
| 775 |
+
skip_words = {'TOTAL', 'DATE', 'TIME', 'RECEIPT', 'THANK', 'YOU', 'STORE', 'HOST',
|
| 776 |
+
'ORDER', 'TYPE', 'TOGO', 'DINE', 'IN', 'CHECK', 'CLOSED', 'AMEX',
|
| 777 |
+
'VISA', 'MASTERCARD', 'CASH', 'CHANGE', 'SUBTOTAL', 'TAX'}
|
| 778 |
+
|
| 779 |
+
# Known vendor patterns (common stores)
|
| 780 |
+
known_vendors = ['EINSTEIN', 'STARBUCKS', 'MCDONALDS', 'WALMART', 'TARGET',
|
| 781 |
+
'CHIPOTLE', 'PANERA', 'DUNKIN', 'SUBWAY', 'CHICK-FIL-A']
|
| 782 |
+
|
| 783 |
+
# First, check if any known vendor is in the OCR results
|
| 784 |
+
for result in top_results:
|
| 785 |
+
text = result['text'].strip().upper()
|
| 786 |
+
for vendor in known_vendors:
|
| 787 |
+
if vendor in text:
|
| 788 |
+
return result['text'].strip()
|
| 789 |
+
|
| 790 |
+
# Look for longest meaningful text (likely the business name)
|
| 791 |
+
candidates = []
|
| 792 |
for result in top_results:
|
| 793 |
text = result['text'].strip()
|
| 794 |
+
text_upper = text.upper()
|
| 795 |
+
|
| 796 |
+
# Skip short texts, numbers, and common skip words
|
| 797 |
+
if len(text) < 3:
|
| 798 |
+
continue
|
| 799 |
+
if text_upper in skip_words:
|
| 800 |
+
continue
|
| 801 |
+
if re.match(r'^[\d\s\-\/\.\$\,]+$', text): # Skip pure numbers/symbols
|
| 802 |
+
continue
|
| 803 |
+
if re.match(r'^#?\d+$', text): # Skip store numbers like #2846
|
| 804 |
+
continue
|
| 805 |
+
|
| 806 |
+
# Prefer texts with letters and reasonable length
|
| 807 |
+
if len(text) >= 4 and any(c.isalpha() for c in text):
|
| 808 |
+
candidates.append((text, len(text), result['confidence']))
|
| 809 |
+
|
| 810 |
+
# Return the longest candidate with good confidence
|
| 811 |
+
if candidates:
|
| 812 |
+
# Sort by length (longer = more likely to be full vendor name)
|
| 813 |
+
candidates.sort(key=lambda x: (x[1], x[2]), reverse=True)
|
| 814 |
+
return candidates[0][0]
|
| 815 |
|
| 816 |
+
return None
|
| 817 |
|
| 818 |
def _extract_date(self, text):
|
| 819 |
"""Extract date with improved patterns."""
|
|
|
|
| 1335 |
('Vendor', fields.get('vendor')),
|
| 1336 |
('Date', fields.get('date')),
|
| 1337 |
('Total', f"${fields.get('total')}" if fields.get('total') else None),
|
|
|
|
| 1338 |
]:
|
| 1339 |
display = value or '<span style="color: #9ca3af;">Not found</span>'
|
| 1340 |
fields_html += f"<div style='padding: 8px; background: #0f172a; color: #e5e7eb; border: 1px solid #1f2937; border-radius: 6px; margin: 4px 0;'><b>{name}:</b> {display}</div>"
|