File size: 2,023 Bytes
6122580
a52a8bb
6122580
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a52a8bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import docx # type: ignore
import os

def read_text_from_docx(file_path: str) -> str:
    """
    Returns the text of every paragraph in a .docx file as one string.

    The document is opened with docx.Document and the text of each entry in
    its ``paragraphs`` collection is joined with newline characters.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        The newline-joined paragraph text, or "" if any exception is raised
        while opening or reading the document (a message is printed first).
    """
    try:
        document = docx.Document(file_path)
        paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
        extracted = '\n'.join(paragraph_texts)
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        extracted = ""
    except Exception as exc:
        # Best-effort reader: report the problem and fall back to empty text.
        print(f"An error occurred while reading the docx file: {exc}")
        extracted = ""
    return extracted

def read_text_file(file_path: str) -> str:
    """
    Returns the full content of a plain text file decoded as UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content, or "" if the file is missing or any other
        exception is raised while reading (a message is printed first).
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            contents = handle.read()
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        contents = ""
    except Exception as exc:
        # Covers e.g. decoding and permission errors; report and return empty.
        print(f"An error occurred while reading the text file: {exc}")
        contents = ""
    return contents
   
def read_any_document(file_path: str) -> str:
    """
    Reads text from a file, choosing a reader by file extension.

    Supported extensions (matched case-insensitively):
      * .docx - delegated to read_text_from_docx
      * .txt  - read directly as UTF-8 text

    Any other extension, including .pdf (no PDF reader is visible in this
    file), is reported as unsupported.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text on success; "" when file_path is empty;
        "[Unsupported file type: <ext>]" for an unhandled extension; or
        "[Error reading file <name>: <error>]" when reading raises.
    """
    if not file_path:
        return ""
    _, extension = os.path.splitext(file_path)
    # Normalise once so ".DOCX" / ".Txt" are recognised too; the original
    # spelling is kept for the unsupported-type message.
    suffix = extension.lower()
    try:
        if suffix == '.docx':
            return read_text_from_docx(file_path)
        if suffix == '.txt':
            # Read inline so I/O and decoding failures reach the handler
            # below and come back as this function's error marker.
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        return f"[Unsupported file type: {extension}]"
    except Exception as e:
        return f"[Error reading file {os.path.basename(file_path)}: {e}]"