rohitashva committed on
Commit
366a546
·
verified ·
1 Parent(s): 83ebead

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -12
app.py CHANGED
@@ -6,20 +6,22 @@ import os
6
 
7
  from pdf2image import convert_from_bytes
8
 
9
- # Set the correct paths
10
- pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
11
- POPPLER_PATH = "/usr/bin" # Explicitly define Poppler path
 
 
12
 
13
  def extract_text_from_pdf(uploaded_file):
14
- # Convert PDF to images using the correct Poppler path
15
- images = convert_from_bytes(uploaded_file.read(), poppler_path=POPPLER_PATH)
16
-
17
- extracted_text = ""
18
- for img in images:
19
- extracted_text += pytesseract.image_to_string(img) + "\n"
20
-
21
- return extracted_text
22
-
23
  # Configure Google Gemini API
24
  GEMINI_API_KEY = os.getenv('GEMINI')
25
  genai.configure(api_key=GEMINI_API_KEY)
 
6
 
7
  from pdf2image import convert_from_bytes
8
 
9
+ pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract" # Or the correct path if different
10
+
11
+ # Set Poppler path (Hugging Face specific - environment variable is best)
12
+ # os.environ["POPPLER_PATH"] = "/usr/bin" # Not recommended, use environment variable
13
+ POPPLER_PATH = os.environ.get("POPPLER_PATH", "/usr/bin") # Get from env, default to /usr/bin
14
 
15
  def extract_text_from_pdf(uploaded_file):
16
+ try:
17
+ images = convert_from_bytes(uploaded_file.read(), poppler_path=POPPLER_PATH)
18
+ extracted_text = ""
19
+ for img in images:
20
+ extracted_text += pytesseract.image_to_string(img) + "\n"
21
+ return extracted_text
22
+ except Exception as e: # Catch potential errors during PDF conversion
23
+ st.error(f"Error extracting text from PDF: {e}") # Show error in Streamlit
24
+ return None # Indicate failure
25
  # Configure Google Gemini API
26
  GEMINI_API_KEY = os.getenv('GEMINI')
27
  genai.configure(api_key=GEMINI_API_KEY)