Spaces:
Paused
Paused
| # filename: document_processor.py | |
| """ | |
| Module for extracting text from various document formats. | |
| """ | |
| import io | |
| import docx | |
| from PyPDF2 import PdfReader | |
| from log_config import get_logger | |
| logger = get_logger('DocumentProcessor') | |
def extract_text_from_document(file_path: str) -> str:
    """
    Extracts text from a document based on its file extension.

    The extension is taken from the final path component only, compared
    case-insensitively, and used to pick one of the format-specific
    extractors (txt, pdf, docx). The file is opened in binary mode and
    handed to that extractor.

    Args:
        file_path (str): The path to the file.

    Returns:
        str: The extracted text from the document.

    Raises:
        ValueError: If the file format is not supported (including paths
            with no extension at all).
        Exception: Any error raised while opening or parsing the file is
            logged and re-raised unchanged.
    """
    import os  # function-scope import keeps this block self-contained

    # Look only at the last path component so that dots in directory names
    # (e.g. "/data/v1.2/notes") are never mistaken for an extension.
    base_name = os.path.basename(file_path)
    _, dot, suffix = base_name.rpartition(".")
    # No dot in the name means there is no extension at all.
    file_extension = suffix.lower() if dot else ""

    try:
        # Built at call time so the extractor names are resolved lazily.
        extractors = {
            "txt": extract_text_from_txt,
            "pdf": extract_text_from_pdf,
            "docx": extract_text_from_docx,
        }
        extractor = extractors.get(file_extension)
        if extractor is None:
            # Reject unsupported formats before touching the filesystem.
            raise ValueError(f"Unsupported file format: {file_extension}")

        with open(file_path, 'rb') as file_obj:
            return extractor(file_obj)
    except Exception as e:
        logger.error(f"Failed to extract text from {file_path}: {str(e)}")
        raise
def extract_text_from_txt(file_obj: io.BufferedReader) -> str:
    """
    Extracts text from a text file.

    The whole stream is read and decoded as UTF-8. A leading UTF-8 byte
    order mark, if present, is dropped so it does not leak into the
    returned text as a stray U+FEFF character.

    Args:
        file_obj (io.BufferedReader): The file object opened in binary mode.

    Returns:
        str: The decoded text.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8. The error is
            logged and re-raised.
    """
    try:
        content = file_obj.read()
        # 'utf-8-sig' behaves exactly like 'utf-8' except that it strips an
        # optional leading BOM (common in files saved by Windows editors).
        return content.decode('utf-8-sig')
    except UnicodeDecodeError as e:
        logger.error(f"Unicode decode error: {str(e)}")
        raise
def extract_text_from_pdf(file_obj: io.BufferedReader) -> str:
    """
    Extracts text from a PDF file.

    Each page's text is extracted in order. Pages that yield no text are
    skipped, and the remaining page texts are joined with a newline so the
    last word of one page is not fused with the first word of the next.

    Args:
        file_obj (io.BufferedReader): The file object opened in binary mode.

    Returns:
        str: The text of all pages, newline-separated, with leading and
            trailing whitespace removed.

    Raises:
        Exception: Any error raised while reading or parsing the PDF is
            logged and re-raised unchanged.
    """
    try:
        reader = PdfReader(file_obj)
        # extract_text() can come back empty/None for a page (presumably
        # pages without a text layer, e.g. scans); those are dropped.
        page_texts = (page.extract_text() for page in reader.pages)
        text = '\n'.join(page_text for page_text in page_texts if page_text)
        return text.strip()
    except Exception as e:
        logger.error(f"Failed to extract text from PDF: {str(e)}")
        raise
def extract_text_from_docx(file_obj: io.BufferedReader) -> str:
    """
    Extracts text from a DOCX file.

    The stream is read fully into an in-memory buffer, parsed as a DOCX
    document, and the text of every non-empty paragraph is collected in
    document order.

    Args:
        file_obj (io.BufferedReader): The file object opened in binary mode.

    Returns:
        str: The non-empty paragraph texts joined by newlines, with leading
            and trailing whitespace removed.

    Raises:
        Exception: Any error raised while reading or parsing the document
            is logged and re-raised unchanged.
    """
    try:
        # Parse from an in-memory copy of the stream's bytes.
        buffer = io.BytesIO(file_obj.read())
        document = docx.Document(buffer)

        non_empty_lines = []
        for para in document.paragraphs:
            # Skip paragraphs with no text so the output has no blank lines.
            if para.text:
                non_empty_lines.append(para.text)

        joined = '\n'.join(non_empty_lines)
        return joined.strip()
    except Exception as e:
        logger.error(f"Failed to extract text from DOCX: {str(e)}")
        raise
| # file: document_processor.py (end) | |