AzizWazir committed on
Commit
b1cf141
·
verified ·
1 Parent(s): 83f9467

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -7
app.py CHANGED
@@ -1,14 +1,20 @@
1
  import os
2
  import streamlit as st
3
  from pdf2image import convert_from_path
4
- import pytesseract
5
- from PIL import Image
6
- import pandas as pd
7
- from docx import Document
8
 
9
- # Set paths for poppler and tesseract (for local testing or adjust as per your environment)
10
- POPPLER_PATH = "/usr/bin"
11
- pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Function to extract text from an image-based PDF
14
  def extract_text_from_image_pdf(pdf_path):
 
1
  import os
2
  import streamlit as st
3
  from pdf2image import convert_from_path
 
 
 
 
4
 
5
+ # Path to your PDF file
6
+ pdf_path = "path_to_your_pdf.pdf"
7
+
8
+ # Path to Poppler binary (optional if already in PATH)
9
+ poppler_path = r"C:\path\to\poppler\bin" # Update this path as needed
10
+
11
+ try:
12
+ # Convert PDF to images
13
+ images = convert_from_path(pdf_path, poppler_path=poppler_path)
14
+ print(f"Converted {len(images)} pages to images successfully!")
15
+ except Exception as e:
16
+ print(f"An error occurred: {e}")
17
+
18
 
19
  # Function to extract text from an image-based PDF
20
  def extract_text_from_image_pdf(pdf_path):