| from .IDataReader import IDataReader | |
| from PyPDF2 import PdfReader | |
| from docx import Document | |
class DataReader(IDataReader):
    """
    Reads PDF, DOCX and TXT files and returns their content as plain text.

    All readers are best-effort: if anything goes wrong while opening or
    parsing a file, the error is printed and an empty string is returned
    instead of an exception being raised.
    """

    def read_pdf(self, file_path: str) -> str:
        """
        Reads a PDF file and returns its text content.

        Text is extracted page by page. Every page that yields text
        contributes that text followed by a newline; pages that yield no
        text are skipped.

        :param file_path: Path to the PDF file.
        :return: Text content of the PDF file, or "" if reading fails.
        """
        try:
            with open(file_path, "rb") as f:
                reader = PdfReader(f)
                extracted = (page.extract_text() for page in reader.pages)
                # Single join instead of repeated "+=" on a growing string.
                return "".join(
                    page_text + "\n" for page_text in extracted if page_text
                )
        except Exception as e:
            # Deliberate best-effort: report the problem and fall back to "".
            print(f"Error reading PDF file: {e}")
            return ""

    def read_docx(self, file_path: str) -> str:
        """
        Reads a DOCX file and returns its text content.

        Only the document's paragraphs are read; their texts are joined with
        newlines in document order.

        :param file_path: Path to the DOCX file.
        :return: Text content of the DOCX file, or "" if reading fails.
        """
        try:
            doc = Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
        except Exception as e:
            # Deliberate best-effort: report the problem and fall back to "".
            print(f"Error reading DOCX file: {e}")
            return ""

    def read_txt(self, file_path: str) -> str:
        """
        Reads a TXT file and returns its text content.

        The file is decoded as UTF-8. A leading byte-order mark, if present,
        is dropped so it does not leak into the returned text.

        :param file_path: Path to the TXT file.
        :return: Text content of the TXT file, or "" if reading fails.
        """
        try:
            # "utf-8-sig" decodes plain UTF-8 unchanged and additionally
            # strips a leading BOM, which plain "utf-8" would return as
            # a "\ufeff" character at the start of the text.
            with open(file_path, "r", encoding="utf-8-sig") as f:
                return f.read()
        except Exception as e:
            # Deliberate best-effort: report the problem and fall back to "".
            print(f"Error reading TXT file: {e}")
            return ""