import os

try:
    import docx  # type: ignore
except ImportError:
    # python-docx is only needed for .docx files; keep the plain-text reader
    # usable when it is not installed.
    docx = None


def read_text_from_docx(file_path: str) -> str:
    """Return the text of a .docx file as a single newline-joined string.

    Text is taken from the document's ``paragraphs`` collection, one line per
    paragraph.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The paragraph texts joined with ``"\\n"``, or ``""`` when the file is
        missing, python-docx is unavailable, or reading fails. Failures are
        reported with ``print`` rather than raised.
    """
    if docx is None:
        print("An error occurred while reading the docx file: "
              "python-docx is not installed.")
        return ""

    try:
        # python-docx signals a missing path with its own exception type, so
        # check explicitly to make the "not found" message below reachable.
        # Non-path inputs (e.g. file-like objects) are passed straight through.
        if isinstance(file_path, (str, os.PathLike)) and not os.path.exists(file_path):
            raise FileNotFoundError(file_path)

        document = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return ""
    except Exception as e:
        print(f"An error occurred while reading the docx file: {e}")
        return ""


def read_text_file(file_path: str) -> str:
    """Return the full content of a UTF-8 encoded plain text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content, or ``""`` when the file is missing or cannot be
        read. Failures are reported with ``print`` rather than raised.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            return handle.read()
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return ""
    except Exception as e:
        print(f"An error occurred while reading the text file: {e}")
        return ""


def read_any_document(file_path: str) -> str:
    """Read text from a file, choosing the reader by file extension.

    Supported extensions (case-insensitive) are ``.docx`` (requires
    python-docx) and ``.txt``. Other types, including ``.pdf``, are not
    handled by this module and yield an "unsupported" marker string.

    Args:
        file_path: Path of the document to read.

    Returns:
        - ``""`` when ``file_path`` is empty, or when the selected reader
          fails (the readers print their own error and return ``""``).
        - The extracted text for supported file types.
        - ``"[Unsupported file type: <ext>]"`` for any other extension.
        - ``"[Error reading file <name>: <error>]"`` if an unexpected
          exception escapes the reader.
    """
    if not file_path:
        return ""

    _, extension = os.path.splitext(file_path)
    suffix = extension.lower()

    try:
        if suffix == ".docx":
            return read_text_from_docx(file_path)
        if suffix == ".txt":
            return read_text_file(file_path)
        # The marker reports the extension exactly as it appears in the path.
        return f"[Unsupported file type: {extension}]"
    except Exception as e:
        return f"[Error reading file {os.path.basename(file_path)}: {e}]"