# Sum-it-up/src/document_processor.py
# Uploaded by garvitcpp in commit ce9b735 ("Upload 6 files"); file size 1.15 kB.
import PyPDF2
import docx
def process_document(uploaded_file):
    """Extract the text content of an uploaded document.

    The document kind is selected by ``uploaded_file.type`` (a MIME type
    string). Plain text, PDF and Word (.docx) documents are supported.

    Args:
        uploaded_file: Streamlit uploaded file object. It must expose a
            ``type`` attribute; plain-text uploads must also provide
            ``getvalue()`` returning the raw bytes, while PDF and Word
            uploads are handed directly to ``PyPDF2.PdfReader`` and
            ``docx.Document``.

    Returns:
        str: Extracted text from the document. For PDFs the per-page text
        is joined with single spaces; for Word documents the paragraph
        texts are joined with single spaces.

    Raises:
        ValueError: If the file type is unsupported, or if reading the PDF
            or Word document fails (the original exception is attached as
            ``__cause__``). A text file that is not valid UTF-8 raises
            ``UnicodeDecodeError``, which is itself a ``ValueError``.
    """
    mime_type = uploaded_file.type

    # Text file
    if mime_type == 'text/plain':
        # "utf-8-sig" decodes exactly like UTF-8 but drops a leading
        # byte-order mark, so a BOM written by some editors does not end
        # up as a stray "\ufeff" at the start of the extracted text.
        return uploaded_file.getvalue().decode("utf-8-sig")

    # PDF file
    if mime_type == 'application/pdf':
        try:
            pdf_reader = PyPDF2.PdfReader(uploaded_file)
            # Guard against a page yielding None instead of a string
            # (e.g. a page without extractable text); str.join would
            # otherwise fail on the first such page.
            return " ".join(
                page.extract_text() or "" for page in pdf_reader.pages
            )
        except Exception as e:
            # Chain the cause so the underlying parser error stays visible.
            raise ValueError(f"Error processing PDF: {e}") from e

    # Word document
    if mime_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        try:
            doc = docx.Document(uploaded_file)
            return " ".join(para.text for para in doc.paragraphs)
        except Exception as e:
            raise ValueError(f"Error processing Word document: {e}") from e

    raise ValueError("Unsupported file type")