File size: 837 Bytes
5348e91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# modules/doc_processor.py

import fitz  # PyMuPDF for PDFs
import docx  # python-docx for DOCX files
import os

def extract_text_from_document(file_path):
    """
    Extract the plain text of a PDF or DOCX document.

    The format is selected from the file extension, compared
    case-insensitively.

    Args:
        file_path: Path to the document to read.

    Returns:
        On success, the extracted text as a single string:
          * ``.pdf``  - the ``get_text()`` output of every page, concatenated
            in page order.
          * ``.docx`` - the text of every paragraph, each followed by a
            newline.
        If the extension is neither ``.pdf`` nor ``.docx``, a fixed
        "Unsupported file format" message is returned. If anything raises
        while the path is inspected or the document is read, a message
        starting with "Error reading document: " is returned.

    Note:
        This function reports problems through its return value instead of
        raising, so callers receive a message string in the same position as
        real document text.
    """
    try:
        # Normalise the extension once rather than in every branch.
        extension = os.path.splitext(file_path)[1].lower()

        if extension == '.pdf':
            with fitz.open(file_path) as pdf:
                # join() is fully evaluated while the document is still open
                # and avoids repeated string concatenation across pages.
                return "".join(page.get_text() for page in pdf)

        if extension == '.docx':
            word_doc = docx.Document(file_path)
            return "".join(para.text + "\n" for para in word_doc.paragraphs)

        return "Unsupported file format. Please upload a .pdf or .docx file."

    except Exception as e:
        # Deliberate catch-all: any failure is turned into a message string
        # rather than propagated to the caller.
        return f"Error reading document: {e}"