Spaces:
Sleeping
Sleeping
| import fitz | |
| from docx import Document | |
| import os | |
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, or an empty
        string if the file could not be opened or read. In that case the
        error is printed to stdout instead of being raised.
    """
    try:
        # The context manager closes the document even when reading a page
        # fails, so the underlying file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Deliberate best effort: one unreadable file must not abort a
        # batch load, so report the problem and fall back to empty text.
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
def extract_text_from_docx(docx_path):
    """Extract the paragraph text of a DOCX file using python-docx.

    Args:
        docx_path: Path of the DOCX file to read.

    Returns:
        The text of each paragraph joined with newlines, or an empty
        string if the file could not be read. In that case the error is
        printed to stdout instead of being raised.
    """
    try:
        document = Document(docx_path)
        paragraph_texts = []
        for paragraph in document.paragraphs:
            paragraph_texts.append(paragraph.text)
        return "\n".join(paragraph_texts)
    except Exception as e:
        # Best effort: report the failure and hand back empty text.
        print(f"Error reading DOCX {docx_path}: {e}")
        return ""
def load_documents(folder="data"):
    """Load the text of every supported document found directly in a folder.

    Entries are dispatched on their extension, compared case-insensitively:
    ``.pdf`` goes to ``extract_text_from_pdf`` and ``.docx`` goes to
    ``extract_text_from_docx``. Any other entry is reported on stdout and
    skipped. Entries are visited in sorted name order so the result is
    reproducible across runs and platforms.

    Args:
        folder: Directory to scan (not recursive). Defaults to ``"data"``.

    Returns:
        A list with one extracted-text string per supported file, in
        sorted file-name order.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder`` cannot be
            listed (for example, when it does not exist).
    """
    texts = []
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        # Compare lower-cased so "REPORT.PDF" or "Notes.Docx" are not skipped.
        lowered = name.lower()
        if lowered.endswith(".pdf"):
            texts.append(extract_text_from_pdf(path))
        elif lowered.endswith(".docx"):
            texts.append(extract_text_from_docx(path))
        else:
            print(f"⚠ Skipped unsupported file: {name}")
    return texts