File size: 840 Bytes
0d5d472
 
 
50a05c2
0d5d472
 
 
 
 
 
 
 
 
 
 
 
 
 
a918fa5
0d5d472
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import PyPDF2
import io
import streamlit as st

def extract_text_from_pdf(pdf_file):
    """
    Extract the text of every page of a PDF.

    Args:
        pdf_file: The PDF source; it is handed unchanged to
            ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages in order, each page followed by two newline
        characters, or None if reading or extraction fails (the error is
        shown in the Streamlit UI).
    """
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Join once instead of growing a string with "+=" per page.
        # The "or ''" guard keeps a page for which extract_text() yields
        # nothing from aborting extraction of the whole document.
        return "".join(
            (page.extract_text() or "") + "\n\n" for page in pdf_reader.pages
        )
    except Exception as e:
        # Deliberately broad: any failure while parsing the PDF is reported
        # to the user and signalled to the caller with None.
        st.error(f"Error extracting text from PDF: {e}")
        return None

def get_document_text(uploaded_file, file_type):
    """
    Get text from an uploaded document (PDF or TXT)

    Args:
        uploaded_file: Upload object exposing ``getvalue()``, which returns
            the raw file content as bytes.
        file_type: Selected type label. "Text (.txt)" selects plain-text
            handling; any other value is treated as PDF.

    Returns:
        The decoded text for a text file. For a PDF, whatever
        ``extract_text_from_pdf`` returns (the extracted text, or None when
        extraction fails).

    Raises:
        UnicodeDecodeError: If a text upload is not valid UTF-8.
    """
    raw_bytes = uploaded_file.getvalue()
    if file_type == "Text (.txt)":
        # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" but strips
        # a leading byte-order mark, so the text does not start with "\ufeff".
        return raw_bytes.decode("utf-8-sig")
    # Extract text from PDF
    return extract_text_from_pdf(io.BytesIO(raw_bytes))