Spaces:
Sleeping
Sleeping
| import docx # type: ignore | |
| import os | |
def read_text_from_docx(file_path: str) -> str:
    """
    Returns the paragraph text of a .docx file as one newline-joined string.

    Only the entries of the document's ``paragraphs`` collection are read;
    anything the library does not expose through that collection is not
    part of the result.

    Args:
        file_path: Location of the .docx file, passed to ``docx.Document``.

    Returns:
        The text of every paragraph joined with '\\n'. If anything goes
        wrong, a message is printed and an empty string is returned
        instead of raising.
    """
    extracted = ""
    try:
        document = docx.Document(file_path)
        extracted = '\n'.join(
            [paragraph.text for paragraph in document.paragraphs]
        )
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as err:
        # Best-effort reader: report the problem and fall through to "".
        print(f"An error occurred while reading the docx file: {err}")
    return extracted
def read_text_file(file_path: str) -> str:
    """
    Loads a UTF-8 encoded text file and hands back everything in it.

    Args:
        file_path: Location of the file to open for reading.

    Returns:
        The complete file content. If the file is missing or any other
        error occurs, a message is printed and an empty string is
        returned instead of raising.
    """
    content = ""
    try:
        with open(file_path, mode='r', encoding='utf-8') as source:
            content = source.read()
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as err:
        # Covers decode errors, permission problems, directories, etc.
        print(f"An error occurred while reading the text file: {err}")
    return content
def read_any_document(file_path: str) -> str:
    """
    Reads text from a file, choosing a reader by the file's extension.

    Supported extensions (matched case-insensitively) are .docx and .txt.
    Every other extension, including .pdf, has no reader here and is
    reported as unsupported.

    Args:
        file_path: Path to the document. A falsy value (empty string or
            None) short-circuits to an empty string.

    Returns:
        The text of the document for a supported extension. Otherwise one
        of two bracketed marker strings is returned instead of raising:
        "[Unsupported file type: <extension>]" when no reader matches, or
        "[Error reading file <basename>: <error>]" when reading raised.
    """
    if not file_path:
        return ""
    _, extension = os.path.splitext(file_path)
    # Normalise once so ".DOCX" and ".docx" select the same reader.
    extension_key = extension.lower()
    try:
        if extension_key == '.docx':
            return read_text_from_docx(file_path)
        if extension_key == '.txt':
            # Read inline so that failures reach the handler below and are
            # reported in this function's bracketed form.
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        # Echo the extension exactly as it appears in the path.
        return f"[Unsupported file type: {extension}]"
    except Exception as e:
        return f"[Error reading file {os.path.basename(file_path)}: {e}]"