Spaces:

rohitashva
/

soil_report_analysis

Sleeping

rohitashva committed on Feb 7, 2025

Commit

366a546

verified ·

1 Parent(s): 83ebead

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -6,20 +6,22 @@ import os
 from pdf2image import convert_from_bytes
-# Set the correct paths
-pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
-POPPLER_PATH = "/usr/bin"  # Explicitly define Poppler path
 def extract_text_from_pdf(uploaded_file):
-    # Convert PDF to images using the correct Poppler path
-    images = convert_from_bytes(uploaded_file.read(), poppler_path=POPPLER_PATH)
-    extracted_text = ""
-    for img in images:
-        extracted_text += pytesseract.image_to_string(img) + "\n"
-    return extracted_text
 # Configure Google Gemini API
 GEMINI_API_KEY = os.getenv('GEMINI')
 genai.configure(api_key=GEMINI_API_KEY)

 from pdf2image import convert_from_bytes
+pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"  # Or the correct path if different
+# Set Poppler path (Hugging Face specific - environment variable is best)
+# os.environ["POPPLER_PATH"] = "/usr/bin"  # Not recommended, use environment variable
+POPPLER_PATH = os.environ.get("POPPLER_PATH", "/usr/bin") # Get from env, default to /usr/bin
 def extract_text_from_pdf(uploaded_file):
+    try:
+        images = convert_from_bytes(uploaded_file.read(), poppler_path=POPPLER_PATH)
+        extracted_text = ""
+        for img in images:
+            extracted_text += pytesseract.image_to_string(img) + "\n"
+        return extracted_text
+    except Exception as e:  # Catch potential errors during PDF conversion
+        st.error(f"Error extracting text from PDF: {e}") # Show error in Streamlit
+        return None  # Indicate failure
 # Configure Google Gemini API
 GEMINI_API_KEY = os.getenv('GEMINI')
 genai.configure(api_key=GEMINI_API_KEY)