File size: 1,151 Bytes
17c7e0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import logging

import docx
import fitz  # PyMuPDF

def process_docx(file_path):
    """Extract the paragraph text from a .docx file.

    Only the entries of ``Document.paragraphs`` are read; their texts are
    joined with newlines.

    Args:
        file_path: Location of the document, passed unchanged to
            ``docx.Document``.

    Returns:
        ``{'text': ...}`` with leading/trailing whitespace stripped on
        success, or ``{'error': ...}`` holding the exception message if
        anything fails.
    """
    try:
        doc = docx.Document(file_path)
        paragraphs = [para.text for para in doc.paragraphs]
        text = '\n'.join(paragraphs)

        # Diagnostics go to the logging system at DEBUG level instead of
        # stdout, and record only sizes so document content is never echoed.
        logging.getLogger(__name__).debug(
            "Extracted %d paragraphs (%d characters) from DOCX",
            len(paragraphs),
            len(text),
        )

        return {'text': text.strip()}
    except Exception as e:  # boundary: callers receive an error dict, not a raise
        return {'error': str(e)}


def process_pdf(file_path):
    """Extract the text of every page in a .pdf file.

    Args:
        file_path: Location of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        ``{'text': ...}`` with the page texts concatenated in page order and
        leading/trailing whitespace stripped, or ``{'error': ...}`` holding
        the exception message if anything fails.
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract; previously close() was only reached on the success path.
        with fitz.open(file_path) as pdf:
            text = "".join(page.get_text() for page in pdf)
        return {'text': text.strip()}
    except Exception as e:  # boundary: callers receive an error dict, not a raise
        return {'error': str(e)}


def process_txt(file_path):
    """Read a UTF-8 text file and return its contents.

    Args:
        file_path: Location of the text file, passed unchanged to ``open``.

    Returns:
        ``{'text': ...}`` with leading/trailing whitespace stripped on
        success, or ``{'error': ...}`` holding the exception message if the
        file cannot be opened or decoded.
    """
    try:
        # 'utf-8-sig' decodes BOM-less UTF-8 exactly like 'utf-8' but drops a
        # leading byte-order mark, which str.strip() would otherwise leave in
        # the returned text (U+FEFF is not whitespace).
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            text = f.read()
        return {'text': text.strip()}
    except Exception as e:  # boundary: callers receive an error dict, not a raise
        return {'error': str(e)}