Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -50,14 +50,25 @@ def answer_question_from_pdf(pdf_text, question):
|
|
| 50 |
|
| 51 |
# Function to extract text from PDF
|
| 52 |
def extract_text_from_pdf(pdf_file):
|
| 53 |
-
|
| 54 |
-
|
|
|
|
| 55 |
pdf_arr = []
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
pdf_arr.append(pdf_text)
|
|
|
|
| 59 |
return pdf_arr
|
| 60 |
-
|
| 61 |
# Streamlit app
|
| 62 |
st.title("PDF Explorer")
|
| 63 |
|
|
|
|
| 50 |
|
| 51 |
# Function to extract text from PDF
|
| 52 |
def extract_text_from_pdf(pdf_file):
    """Run OCR over every page of a PDF and return the text page by page.

    Each page is rendered to a pixmap, loaded as a PIL image, and passed to
    ``pytesseract.image_to_string``.

    Args:
        pdf_file: A filesystem path to the PDF, or a binary file-like object
            exposing ``read()``. A file-like object is read from its current
            position to the end.
            NOTE(review): callers presumably pass either a path or an
            uploaded-file object -- confirm against the call site.

    Returns:
        A list of OCR text strings, one per page, in page order. The list is
        empty when the document has no pages.
    """
    if hasattr(pdf_file, "read"):
        # fitz.open() accepts in-memory PDF data via ``stream``, not as the
        # positional filename argument.
        pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        pdf_document = fitz.open(pdf_file)

    pdf_arr = []

    # The context manager closes the document even if rendering or OCR raises,
    # so the underlying file handle is not leaked.
    with pdf_document:
        for page in pdf_document:
            # Render the page to an image; tobytes() emits PNG by default,
            # which PIL can decode directly from memory.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes()))

            # Perform OCR on the rendered page image.
            pdf_arr.append(pytesseract.image_to_string(img))

    return pdf_arr
|
|
|
|
| 72 |
# Streamlit app
|
| 73 |
st.title("PDF Explorer")
|
| 74 |
|