Spaces:
Sleeping
Sleeping
File size: 2,023 Bytes
6122580 a52a8bb 6122580 a52a8bb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
import docx # type: ignore
import os
def read_text_from_docx(file_path: str) -> str:
    """
    Extract the paragraph text of a .docx file as a single string.

    Only the paragraphs exposed by ``doc.paragraphs`` are read; their text
    is joined with newline characters.

    Args:
        file_path: Path to the .docx file.

    Returns:
        The joined paragraph text, or an empty string if the file does not
        exist or cannot be read. In both failure cases a message is printed
        instead of an exception being raised.
    """
    # python-docx signals a missing path with its own PackageNotFoundError
    # rather than FileNotFoundError, so the dedicated "not found" message
    # below would otherwise never be shown. Check path-like inputs up front;
    # anything else is passed through to python-docx unchanged.
    if isinstance(file_path, (str, os.PathLike)) and not os.path.exists(file_path):
        print(f"Error: The file at {file_path} was not found.")
        return ""

    try:
        doc = docx.Document(file_path)
        return '\n'.join(para.text for para in doc.paragraphs)
    except FileNotFoundError:
        # Still reachable if the file disappears between the check and the open.
        print(f"Error: The file at {file_path} was not found.")
        return ""
    except Exception as e:
        # Best-effort reader: report the problem and return an empty result.
        print(f"An error occurred while reading the docx file: {e}")
        return ""
def read_text_file(file_path: str) -> str:
    """
    Return the full contents of a UTF-8 encoded text file.

    On any failure a message is printed and an empty string is returned;
    a missing file gets its own, more specific message.
    """
    content = ""
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            content = handle.read()
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as err:
        # Any other failure (decoding, permissions, ...) is reported, not raised.
        print(f"An error occurred while reading the text file: {err}")
    return content
def read_any_document(file_path: str) -> str:
    """
    Read text from a file, choosing the reader from the file extension.

    Supported extensions (matched case-insensitively):
        .docx -> read_text_from_docx
        .txt  -> read_text_file

    PDF is not handled here: no PDF reader is defined alongside these
    functions, so '.pdf' is reported as unsupported like any other type.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text; "" when ``file_path`` is empty or None;
        "[Unsupported file type: <ext>]" for any other extension; or
        "[Error reading file <name>: <error>]" if the reader raises.
    """
    if not file_path:
        return ""

    _, extension = os.path.splitext(file_path)
    suffix = extension.lower()
    try:
        if suffix == '.docx':
            return read_text_from_docx(file_path)
        if suffix == '.txt':
            return read_text_file(file_path)
        # TODO: add a '.pdf' branch once a PDF reader function is available.
        # The marker reports the extension exactly as it appeared in the path.
        return f"[Unsupported file type: {extension}]"
    except Exception as e:
        return f"[Error reading file {os.path.basename(file_path)}: {e}]"
|