File size: 1,149 Bytes
ce9b735
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import PyPDF2
import docx

def process_document(uploaded_file) -> str:
    """Extract the text content of an uploaded document.

    The handler is chosen from ``uploaded_file.type`` (a MIME type string):
    plain text is decoded as UTF-8, PDFs are read with ``PyPDF2`` and Word
    ``.docx`` files are read with the ``docx`` module.

    Args:
        uploaded_file: Streamlit uploaded file object. It must expose a
            ``type`` attribute; ``getvalue()`` (returning bytes) is used for
            plain text, and the object itself is handed to the PDF / Word
            readers as a binary file-like object.

    Returns:
        str: Extracted text. PDF pages and Word paragraphs are joined with
        single spaces.

    Raises:
        ValueError: If the MIME type is not supported, or if the document
            cannot be decoded/parsed. For parse failures the underlying
            exception is chained as ``__cause__``.
    """
    mime_type = uploaded_file.type

    # Text file
    if mime_type == 'text/plain':
        try:
            # "utf-8-sig" decodes plain UTF-8 and additionally drops a leading
            # byte-order mark, so the BOM never leaks into the returned text.
            return uploaded_file.getvalue().decode("utf-8-sig")
        except UnicodeDecodeError as e:
            # UnicodeDecodeError is itself a ValueError; re-raise with the
            # same message style as the other handlers.
            raise ValueError(f"Error processing text file: {e}") from e

    # PDF file
    if mime_type == 'application/pdf':
        try:
            pdf_reader = PyPDF2.PdfReader(uploaded_file)
            # Treat a page that yields no text (None or "") as empty instead
            # of letting str.join fail on a non-string item.
            return " ".join(
                (page.extract_text() or "") for page in pdf_reader.pages
            )
        except Exception as e:
            raise ValueError(f"Error processing PDF: {e}") from e

    # Word document
    if mime_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        try:
            doc = docx.Document(uploaded_file)
            return " ".join(para.text for para in doc.paragraphs)
        except Exception as e:
            raise ValueError(f"Error processing Word document: {e}") from e

    raise ValueError("Unsupported file type")