| import PyPDF2 | |
| import docx | |
def load_txt(file_path):
    """Return the full contents of a text file decoded as UTF-8.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's entire text as a single string.

    Errors raised by ``open`` or by decoding are not caught here and
    propagate to the caller.
    """
    with open(file_path, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return contents
def load_pdf(file_path):
    """Extract the text of every page of a PDF and concatenate it.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text of all pages, in page order, joined with no
        separator. Pages whose extraction yields nothing (empty string or
        ``None``) contribute nothing, so the result may be ``""``.
    """
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Call extract_text() exactly once per page: the previous version
        # called it twice (once to test, once to append), repeating the
        # extraction work. Extraction happens inside the ``with`` because
        # the reader still needs the open file handle.
        page_texts = [page.extract_text() for page in reader.pages]
    # Drop pages with no extractable text and join once rather than
    # growing a string with ``+=``.
    return "".join(page_text for page_text in page_texts if page_text)
def load_docx(file_path):
    """Return the paragraph text of a .docx file as one string.

    Args:
        file_path: Path of the document, passed straight to
            ``docx.Document``.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``,
        joined with newline characters. Only ``paragraphs`` is read here.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)