Update app.py
Browse files
app.py
CHANGED
|
@@ -768,21 +768,40 @@ def extract_text_from_receipt(file_path):
|
|
| 768 |
return None, []
|
| 769 |
|
| 770 |
try:
|
| 771 |
-
#
|
| 772 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 773 |
|
| 774 |
text = ""
|
| 775 |
|
| 776 |
if file_path.name.lower().endswith('.pdf'):
|
| 777 |
-
#
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
|
|
|
| 782 |
try:
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 786 |
|
| 787 |
for img in images:
|
| 788 |
text += pytesseract.image_to_string(img) + "\n"
|
|
|
|
| 768 |
return None, []
|
| 769 |
|
| 770 |
try:
|
| 771 |
+
# Try to find tesseract automatically
|
| 772 |
+
import shutil
|
| 773 |
+
tesseract_path = shutil.which('tesseract')
|
| 774 |
+
if tesseract_path:
|
| 775 |
+
pytesseract.pytesseract.tesseract_cmd = tesseract_path
|
| 776 |
+
else:
|
| 777 |
+
# Try common paths
|
| 778 |
+
for path in ['/usr/bin/tesseract', '/usr/local/bin/tesseract', '/bin/tesseract']:
|
| 779 |
+
if os.path.exists(path):
|
| 780 |
+
pytesseract.pytesseract.tesseract_cmd = path
|
| 781 |
+
break
|
| 782 |
|
| 783 |
text = ""
|
| 784 |
|
| 785 |
if file_path.name.lower().endswith('.pdf'):
|
| 786 |
+
# Try multiple possible poppler paths for Hugging Face
|
| 787 |
+
poppler_paths = [None, '/usr/bin', '/usr/local/bin', '/bin']
|
| 788 |
+
images = None
|
| 789 |
+
last_error = None
|
| 790 |
+
|
| 791 |
+
for poppler_path in poppler_paths:
|
| 792 |
try:
|
| 793 |
+
if poppler_path:
|
| 794 |
+
images = convert_from_path(file_path.name, poppler_path=poppler_path)
|
| 795 |
+
else:
|
| 796 |
+
images = convert_from_path(file_path.name)
|
| 797 |
+
break # Success! Exit loop
|
| 798 |
+
except Exception as e:
|
| 799 |
+
last_error = str(e)
|
| 800 |
+
continue
|
| 801 |
+
|
| 802 |
+
if images is None:
|
| 803 |
+
# If all paths failed, return helpful error
|
| 804 |
+
return [[True, f"PDF processing unavailable. Please upload images (PNG/JPG) instead. Error: {last_error}", "", ""]], []
|
| 805 |
|
| 806 |
for img in images:
|
| 807 |
text += pytesseract.image_to_string(img) + "\n"
|