Spaces:
Sleeping
Sleeping
| import PyPDF2 | |
| import docx | |
| from pathlib import Path | |
def load_text(path: str) -> str:
    """Load the text content of a TXT, PDF, or DOCX file.

    The format is selected from the file extension, compared
    case-insensitively.

    Args:
        path: Location of the file to read.

    Returns:
        The extracted text as a single string. For PDFs, the text of every
        page that yields any text, each followed by a newline. For DOCX
        files, the paragraph texts joined with newlines.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the extension is not ``.txt``, ``.pdf`` or ``.docx``.
    """
    path_obj = Path(path)
    if not path_obj.exists():
        raise FileNotFoundError(f"{path} does not exist.")

    # Normalise once so ".PDF" / ".Txt" are dispatched like their lowercase forms.
    suffix = path_obj.suffix.lower()

    if suffix == ".txt":
        # "utf-8-sig" decodes plain UTF-8 identically but drops a leading
        # byte-order mark, so it cannot leak into the text as "\ufeff".
        return path_obj.read_text(encoding="utf-8-sig")

    if suffix == ".pdf":
        with open(path_obj, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # Extraction must finish while the file is still open. Pages with
            # no extractable text (None or "") are skipped.
            page_texts = [page.extract_text() for page in reader.pages]
        return "".join(page_text + "\n" for page_text in page_texts if page_text)

    if suffix == ".docx":
        # NOTE(review): python-docx documents a str path or a file-like object
        # as input, so the Path is converted explicitly - confirm against the
        # installed version if Path objects are meant to be passed directly.
        doc = docx.Document(str(path_obj))
        return "\n".join(p.text for p in doc.paragraphs)

    # The message reports the suffix exactly as given (original casing).
    raise ValueError(f"Unsupported file type: {path_obj.suffix}")