import os

import PyPDF2
from docx import Document

# Returned (not raised) when the file extension is not one of the handled formats.
SUPPORTED_ERROR = "❌ Supported formats: .txt, .pdf, .docx"


def extract_text(file_path: str) -> str:
    """Return the text content of a .txt, .pdf or .docx file.

    The format is chosen from the file extension, compared
    case-insensitively. Failures are reported through the return value
    rather than by raising, so the caller always gets a string back.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text with leading and trailing whitespace removed;
        ``SUPPORTED_ERROR`` if the extension is not handled; or a string
        starting with ``"❌ Error reading file: "`` followed by the
        exception message if reading the file fails.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext not in (".txt", ".pdf", ".docx"):
        return SUPPORTED_ERROR

    try:
        if ext == ".txt":
            # utf-8-sig reads plain UTF-8 as well, but also drops a leading
            # byte-order mark, which strip() would otherwise leave in place.
            with open(file_path, "r", encoding="utf-8-sig") as f:
                return f.read().strip()

        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # Extract while the file is still open; a page that yields
                # a falsy result contributes an empty string.
                pages = [page.extract_text() or "" for page in reader.pages]
            return "\n".join(pages).strip()

        # Only .docx remains: join the paragraphs that are not blank.
        doc = Document(file_path)
        return "\n".join(
            p.text for p in doc.paragraphs if p.text.strip()
        ).strip()
    except Exception as e:
        # Deliberately broad: any read/parse failure is reported to the
        # caller as an error string instead of propagating.
        return f"❌ Error reading file: {e}"