# filename: document_processor.py
"""
Module for extracting text from various document formats.

Supported formats are plain text (``.txt``), PDF (``.pdf``) and Word
(``.docx``). The public entry point is :func:`extract_text_from_document`,
which dispatches on the file extension to one of the format-specific helpers.
"""
import io
import os

import docx
from PyPDF2 import PdfReader

from log_config import get_logger

logger = get_logger('DocumentProcessor')


def _get_file_extension(file_path: str) -> str:
    """
    Returns the lower-cased extension of a path, without the leading dot.

    Only the final path component is inspected, so a dot inside a directory
    name is never mistaken for an extension, and a file name that contains no
    dot at all yields an empty string instead of the file name itself.

    Args:
        file_path (str): The path to the file.

    Returns:
        str: The extension in lower case, or '' if the file name has no dot.
    """
    file_name = os.path.basename(file_path)
    # rpartition returns an empty separator when no dot is present; without
    # that check a file literally named "pdf" would be treated as a PDF.
    _, dot, extension = file_name.rpartition(".")
    return extension.lower() if dot else ""


def extract_text_from_document(file_path: str) -> str:
    """
    Extracts text from a document based on its file extension.

    The file is opened in binary mode and handed to the extractor matching
    its extension (case-insensitive): ``txt``, ``pdf`` or ``docx``.

    Args:
        file_path (str): The path to the file.

    Returns:
        str: The extracted text from the document.

    Raises:
        ValueError: If the file format is not supported.
        Exception: Any error raised while opening or parsing the file is
            logged and then re-raised unchanged.
    """
    file_extension = _get_file_extension(file_path)
    try:
        with open(file_path, 'rb') as file_obj:
            if file_extension == "txt":
                return extract_text_from_txt(file_obj)
            if file_extension == "pdf":
                return extract_text_from_pdf(file_obj)
            if file_extension == "docx":
                return extract_text_from_docx(file_obj)
            raise ValueError(f"Unsupported file format: {file_extension}")
    except Exception as e:
        # Log with the path for context, then let the caller decide what to do.
        logger.error(f"Failed to extract text from {file_path}: {e}")
        raise


def extract_text_from_txt(file_obj: io.BufferedReader) -> str:
    """
    Extracts text from a text file.

    The whole stream is read and decoded as UTF-8. A leading UTF-8 byte order
    mark, if present, is dropped so it does not end up in the returned text.

    Args:
        file_obj (io.BufferedReader): The file object opened in binary mode.

    Returns:
        str: The decoded text.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    try:
        content = file_obj.read()
        # 'utf-8-sig' decodes exactly like 'utf-8' but skips an optional BOM.
        return content.decode('utf-8-sig')
    except UnicodeDecodeError as e:
        logger.error(f"Unicode decode error: {e}")
        raise


def extract_text_from_pdf(file_obj: io.BufferedReader) -> str:
    """
    Extracts text from a PDF file.

    Args:
        file_obj (io.BufferedReader): The file object opened in binary mode.

    Returns:
        str: The concatenated text from all pages, with leading and trailing
            whitespace removed. Pages are joined without a separator.
    """
    try:
        reader = PdfReader(file_obj)
        # extract_text() may yield nothing for a page; treat that as ''.
        text = ''.join(page.extract_text() or '' for page in reader.pages)
        return text.strip()
    except Exception as e:
        logger.error(f"Failed to extract text from PDF: {e}")
        raise


def extract_text_from_docx(file_obj: io.BufferedReader) -> str:
    """
    Extracts text from a DOCX file.

    Only the paragraphs exposed by ``Document.paragraphs`` are read; empty
    paragraphs are skipped.

    Args:
        file_obj (io.BufferedReader): The file object opened in binary mode.

    Returns:
        str: The text of all non-empty paragraphs joined with newlines, with
            leading and trailing whitespace removed.
    """
    try:
        # Load the bytes into memory so the parser works on an in-memory,
        # seekable buffer rather than on the caller's file handle.
        doc = docx.Document(io.BytesIO(file_obj.read()))
        text = '\n'.join(
            paragraph.text for paragraph in doc.paragraphs if paragraph.text
        )
        return text.strip()
    except Exception as e:
        logger.error(f"Failed to extract text from DOCX: {e}")
        raise
# file: document_processor.py (end)