File size: 3,796 Bytes
4ede186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import PyPDF2
from docx import Document
import docx2txt
import io
from typing import Union

class DocumentProcessor:
    """Extract plain text from PDF, DOCX and legacy DOC documents.

    The class is a namespace of static methods. Extraction is best-effort:
    failures are reported on stdout and signalled through the return value
    (an empty string or a short note) instead of being raised.
    """

    # Maps the MIME type exposed by an upload object's ``type`` attribute to
    # the short type name used for dispatch in ``extract_text``.
    _MIME_TO_TYPE = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/msword': 'doc',
    }

    @staticmethod
    def _rewind(file) -> None:
        """Move *file* back to its start when it supports seeking.

        Objects without ``seek`` (for example plain path strings) are left
        untouched.
        """
        if hasattr(file, 'seek'):
            file.seek(0)

    @staticmethod
    def _detect_file_type(file) -> str:
        """Guess the short type name ('pdf', 'docx', ...) for *file*.

        The extension of the path or of ``file.name`` is preferred; when
        there is none, the MIME type in ``file.type`` is consulted; when
        neither is usable the type defaults to 'pdf'.
        """
        import os

        if isinstance(file, (str, os.PathLike)):
            name = os.fspath(file)
        else:
            name = getattr(file, 'name', None)

        # ``name`` may be missing or not a string (e.g. an integer file
        # descriptor), in which case it carries no extension information.
        if isinstance(name, str):
            extension = os.path.splitext(name)[1].lstrip('.').lower()
            if extension:
                return extension

        mime_type = getattr(file, 'type', None)
        if mime_type is not None:
            return DocumentProcessor._MIME_TO_TYPE.get(mime_type, 'pdf')

        return 'pdf'  # Default to PDF

    @staticmethod
    def extract_text_from_pdf(file) -> str:
        """Extract text from a PDF file.

        Args:
            file: Anything accepted by ``PyPDF2.PdfReader``.

        Returns:
            The text of all pages joined by newlines and stripped, or an
            empty string when extraction fails.
        """
        try:
            # Ensure we're at the beginning of the file
            DocumentProcessor._rewind(file)

            reader = PyPDF2.PdfReader(file)
            page_texts = [page.extract_text() for page in reader.pages]
            # Pages that yield no text are skipped entirely.
            result = "\n".join(text for text in page_texts if text).strip()
            print(f"[DEBUG] Extracted {len(result)} characters from PDF")
            return result
        except Exception as e:
            print(f"Error extracting text from PDF: {str(e)}")
            import traceback
            traceback.print_exc()
            return ""

    @staticmethod
    def extract_text_from_docx(file) -> str:
        """Extract text from a DOCX file.

        python-docx is tried first; when it fails or finds no paragraph
        text, docx2txt is used as a fallback.

        Args:
            file: Anything accepted by ``docx.Document`` and
                ``docx2txt.process``.

        Returns:
            The stripped text, or an empty string when both extractors fail.
        """
        try:
            # Try using python-docx first
            try:
                DocumentProcessor._rewind(file)
                doc = Document(file)
                text = "\n".join(
                    paragraph.text for paragraph in doc.paragraphs
                ).strip()
                if text:
                    return text
            except Exception as e:
                # Best-effort: report why python-docx failed, then fall back.
                print(f"[DEBUG] python-docx extraction failed, falling back to docx2txt: {str(e)}")

            # Fallback to docx2txt (the first attempt may have consumed the stream)
            DocumentProcessor._rewind(file)
            return docx2txt.process(file).strip()
        except Exception as e:
            print(f"Error extracting text from DOCX: {str(e)}")
            return ""

    @staticmethod
    def extract_text_from_doc(file) -> str:
        """Extract text from a DOC file (legacy Word format).

        Args:
            file: Anything accepted by ``docx2txt.process``.

        Returns:
            The stripped text, or a fixed note suggesting conversion to
            .docx when extraction fails.
        """
        try:
            DocumentProcessor._rewind(file)
            # NOTE(review): docx2txt appears to target the zip-based .docx
            # format, so a genuine binary .doc will most likely end up in the
            # except branch below -- confirm against the docx2txt docs.
            return docx2txt.process(file).strip()
        except Exception as e:
            print(f"Error extracting text from DOC: {str(e)}")
            # If docx2txt fails, return a message
            return "Note: Legacy .doc format may require conversion to .docx for better text extraction."

    @staticmethod
    def extract_text(file, file_type: Union[str, None] = None) -> str:
        """
        Extract text from any supported document format

        Args:
            file: File object, file-like object or path
            file_type: File extension (e.g., '.pdf', '.docx', '.doc'), with
                or without the leading dot, in any letter case. When omitted
                or empty, the type is detected from ``file`` (extension of
                its name, then its MIME ``type``, then 'pdf').

        Returns:
            Extracted text as string, or "Unsupported file type: <type>"
            when the type is not one of pdf, docx or doc.
        """
        # Determine file type if not provided
        if not file_type:
            file_type = DocumentProcessor._detect_file_type(file)

        # Normalize: drop surrounding whitespace and leading dots, ignore case
        file_type = file_type.strip().lstrip('.').lower()

        # Reset file pointer to beginning
        DocumentProcessor._rewind(file)

        # Extract text based on file type
        if file_type == 'pdf':
            return DocumentProcessor.extract_text_from_pdf(file)
        elif file_type == 'docx':
            return DocumentProcessor.extract_text_from_docx(file)
        elif file_type == 'doc':
            return DocumentProcessor.extract_text_from_doc(file)
        else:
            return f"Unsupported file type: {file_type}"