import io
import traceback
from typing import Optional, Union

import docx2txt
import PyPDF2
from docx import Document


class DocumentProcessor:
    """Process different document formats (PDF, DOCX, DOC) and extract text.

    Every method is a ``@staticmethod``; the class only serves as a
    namespace. The per-format extractors never raise ``Exception``
    subclasses: on failure they print a diagnostic and return a fallback
    string instead (an empty string for PDF/DOCX, an explanatory note for
    DOC).
    """

    # MIME type (read from a file object's ``type`` attribute) -> short
    # extension name used for dispatch in ``extract_text``.
    _MIME_TO_EXTENSION = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/msword': 'doc',
    }

    # Type assumed when nothing on the file object identifies its format.
    _DEFAULT_FILE_TYPE = 'pdf'

    @staticmethod
    def _rewind(file) -> None:
        """Move *file* back to offset 0 if it supports seeking.

        Objects without a ``seek`` attribute (e.g. a path string) are left
        untouched.
        """
        if hasattr(file, 'seek'):
            file.seek(0)

    @staticmethod
    def _detect_file_type(file) -> str:
        """Infer the document type of *file* from its attributes.

        Order of precedence:

        1. ``file.name`` when it is a string: the text after the last ``.``,
           lower-cased.
        2. ``file.type`` looked up as a MIME type; unknown MIME types map to
           the default.
        3. The default type (``'pdf'``).
        """
        name = getattr(file, 'name', None)
        # ``name`` is not always a string (for instance a file opened from a
        # descriptor reports an int), so only parse it when it is one.
        if isinstance(name, str):
            return name.split('.')[-1].lower()

        if hasattr(file, 'type'):
            return DocumentProcessor._MIME_TO_EXTENSION.get(
                file.type, DocumentProcessor._DEFAULT_FILE_TYPE
            )

        return DocumentProcessor._DEFAULT_FILE_TYPE

    @staticmethod
    def extract_text_from_pdf(file) -> str:
        """Extract text from a PDF file.

        Args:
            file: Anything ``PyPDF2.PdfReader`` accepts. If it has a ``seek``
                method it is rewound first.

        Returns:
            The text of all pages that yielded text, joined with newlines
            and stripped of surrounding whitespace. Returns ``""`` if any
            exception is raised while reading; the error and traceback are
            printed.
        """
        try:
            # Ensure we're at the beginning of the file
            DocumentProcessor._rewind(file)

            reader = PyPDF2.PdfReader(file)
            page_texts = (page.extract_text() for page in reader.pages)
            # Pages for which extract_text() returns nothing are skipped.
            result = "\n".join(text for text in page_texts if text).strip()

            print(f"[DEBUG] Extracted {len(result)} characters from PDF")
            return result
        except Exception as e:
            print(f"Error extracting text from PDF: {str(e)}")
            traceback.print_exc()
            return ""

    @staticmethod
    def extract_text_from_docx(file) -> str:
        """Extract text from a DOCX file.

        python-docx is tried first (paragraph text only). If it raises or
        yields only whitespace, docx2txt is used as a fallback.

        Args:
            file: Anything ``docx.Document`` / ``docx2txt.process`` accept.
                If it has a ``seek`` method it is rewound before each
                attempt.

        Returns:
            The stripped text, or ``""`` if the fallback also fails (the
            error is printed).
        """
        try:
            # Try using python-docx first
            try:
                DocumentProcessor._rewind(file)
                doc = Document(file)
                text = "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()
                if text:
                    return text
            except Exception as e:
                # Best effort: report why the primary parser failed, then
                # fall through to docx2txt instead of giving up.
                print(f"python-docx could not read DOCX, falling back to docx2txt: {str(e)}")

            # Fallback to docx2txt; the first attempt may have consumed the
            # stream, so rewind (only if the object is seekable).
            DocumentProcessor._rewind(file)
            text = docx2txt.process(file)
            return text.strip()
        except Exception as e:
            print(f"Error extracting text from DOCX: {str(e)}")
            return ""

    @staticmethod
    def extract_text_from_doc(file) -> str:
        """Extract text from a DOC file (legacy Word format).

        The file is handed to ``docx2txt.process``.
        NOTE(review): docx2txt is primarily a DOCX parser; confirm how much
        genuine legacy ``.doc`` input it can actually handle.

        Args:
            file: Anything ``docx2txt.process`` accepts. If it has a ``seek``
                method it is rewound first.

        Returns:
            The stripped text, or a fixed explanatory note (not an empty
            string) if extraction raises; the error is printed.
        """
        try:
            DocumentProcessor._rewind(file)
            text = docx2txt.process(file)
            return text.strip()
        except Exception as e:
            print(f"Error extracting text from DOC: {str(e)}")
            # If docx2txt fails, return a message
            return "Note: Legacy .doc format may require conversion to .docx for better text extraction."

    @staticmethod
    def extract_text(file, file_type: Optional[str] = None) -> str:
        """Extract text from any supported document format.

        Args:
            file: File object or file-like object.
            file_type: File extension, with or without the leading dot and
                in any letter case (e.g. ``'.pdf'``, ``'DOCX'``, ``'doc'``).
                When omitted, the type is inferred from ``file.name``, then
                from ``file.type`` (MIME type), and finally defaults to PDF.

        Returns:
            Extracted text as a string. For a type other than pdf/docx/doc
            the string ``"Unsupported file type: <type>"`` is returned.
        """
        # Determine file type if not provided
        if file_type is None:
            file_type = DocumentProcessor._detect_file_type(file)

        # Normalise: drop leading dot(s) and ignore letter case so that an
        # explicit '.PDF' is treated the same as an inferred 'pdf'.
        file_type = file_type.lstrip('.').lower()

        extractors = {
            'pdf': DocumentProcessor.extract_text_from_pdf,
            'docx': DocumentProcessor.extract_text_from_docx,
            'doc': DocumentProcessor.extract_text_from_doc,
        }
        extractor = extractors.get(file_type)
        if extractor is None:
            return f"Unsupported file type: {file_type}"

        # Reset file pointer to beginning
        DocumentProcessor._rewind(file)

        return extractor(file)