"""Helpers for extracting plain text from PDF and DOCX files in a folder."""

import os

import fitz  # PyMuPDF
from docx import Document


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of every page concatenated in page order (no separator is
        inserted between pages), or an empty string if the file could not
        be read.
    """
    # Best-effort by design: any failure is reported on stdout and turned
    # into an empty result so one bad file does not abort a batch load.
    try:
        # The context manager closes the document (and its file handle)
        # even when a page fails to extract.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""


def extract_text_from_docx(docx_path: str) -> str:
    """Extract text from a DOCX file using python-docx.

    Args:
        docx_path: Path of the DOCX file to read.

    Returns:
        The text of ``doc.paragraphs`` joined with newlines, or an empty
        string if the file could not be read.
    """
    # Same best-effort policy as the PDF reader: report and return "".
    try:
        doc = Document(docx_path)
        return "\n".join(para.text for para in doc.paragraphs)
    except Exception as e:
        print(f"Error reading DOCX {docx_path}: {e}")
        return ""


def load_documents(folder: str = "data") -> list:
    """Load the text of all supported documents found directly in a folder.

    Files ending in ``.pdf`` or ``.docx`` (matched case-insensitively) are
    read; every other directory entry is reported on stdout and skipped.
    Sub-folders are not descended into.

    Args:
        folder: Directory to scan. Defaults to ``"data"``.

    Returns:
        A list with one string per supported file, ordered by file name.
        A file that failed to parse contributes an empty string.

    Raises:
        OSError: If ``folder`` cannot be listed (e.g. it does not exist).
    """
    texts = []
    # os.listdir() returns entries in arbitrary order; sort so that the
    # result is deterministic across runs and platforms.
    for file in sorted(os.listdir(folder)):
        path = os.path.join(folder, file)
        # Compare on the lowercased name so "REPORT.PDF" is not skipped.
        lowered = file.lower()
        if lowered.endswith(".pdf"):
            texts.append(extract_text_from_pdf(path))
        elif lowered.endswith(".docx"):
            texts.append(extract_text_from_docx(path))
        else:
            print(f"⚠ Skipped unsupported file: {file}")
    return texts