fifth_try_CAG / document_utils.py
kouki321's picture
Update document_utils.py
0d5d472 verified
raw
history blame contribute delete
840 Bytes
import PyPDF2
import io
import streamlit as st
def extract_text_from_pdf(pdf_file):
    """
    Extract the text of every page of a PDF.

    Args:
        pdf_file: The PDF source, passed unchanged to ``PyPDF2.PdfReader``
            (``get_document_text`` in this file passes an in-memory
            ``io.BytesIO``).

    Returns:
        The concatenated text of all pages, each page followed by two
        newlines, or ``None`` if reading or extraction raised. In the
        failure case the error is shown to the user with ``st.error``.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Defensive: if a page yields no text object (None), treat it as
        # empty text so a single such page does not raise TypeError and
        # discard the text of the whole document.
        page_texts = [
            (page.extract_text() or "") + "\n\n"
            for page in pdf_reader.pages
        ]
        return "".join(page_texts)
    except Exception as e:
        # UI boundary: report the failure in the Streamlit app instead of
        # propagating; callers receive None.
        st.error(f"Error extracting text from PDF: {e}")
        return None
def get_document_text(uploaded_file, file_type):
    """
    Get text from an uploaded document (PDF or TXT).

    Args:
        uploaded_file: Upload object exposing ``getvalue()``, which must
            return the file's raw bytes.
        file_type: File-type label. Exactly ``"Text (.txt)"`` selects
            plain-text handling; any other value is treated as PDF.

    Returns:
        The document text, or ``None`` if a text file could not be decoded
        as UTF-8 (reported with ``st.error``) or PDF extraction failed.
    """
    raw_bytes = uploaded_file.getvalue()

    if file_type == "Text (.txt)":
        try:
            # "utf-8-sig" decodes ordinary UTF-8 and also drops a leading
            # byte-order mark, which would otherwise leak into the text as
            # a stray U+FEFF character.
            return raw_bytes.decode("utf-8-sig")
        except UnicodeDecodeError as e:
            # Mirror the PDF path: report in the UI and return None rather
            # than crashing the app on a non-UTF-8 upload.
            st.error(f"Error decoding text file: {e}")
            return None

    # Anything else is handled as a PDF; wrap the bytes in a seekable
    # in-memory stream for the PDF reader.
    return extract_text_from_pdf(io.BytesIO(raw_bytes))