"""File text extractor — supports .docx, .pdf, .txt. Reusable: copy this file to any Flask project's app/core/ directory. Dependencies: pypdf>=4.0 (for PDF support — add to requirements.txt) DOCX and TXT use Python built-ins only (no extra packages needed). """ import io import zipfile import xml.etree.ElementTree as ET from pathlib import Path ALLOWED_EXTENSIONS = {".pdf", ".docx", ".txt"} MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB _WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" def extract_text(file_storage) -> str: """Extract plain text from a Werkzeug FileStorage object. Supports .pdf, .docx, .txt files up to 10 MB. Returns extracted text as a string. Raises ValueError for unsupported types, oversized files, or parse errors. """ filename = file_storage.filename or "" ext = Path(filename).suffix.lower() if ext not in ALLOWED_EXTENSIONS: raise ValueError( f"Unsupported file type '{ext or '(none)'}'. Allowed: PDF, DOCX, TXT" ) data = file_storage.read() if len(data) > MAX_FILE_SIZE: raise ValueError("File too large (max 10 MB)") if not data: raise ValueError("File is empty") if ext == ".txt": return data.decode("utf-8", errors="replace").strip() if ext == ".docx": return _read_docx(io.BytesIO(data)) if ext == ".pdf": return _read_pdf(io.BytesIO(data)) raise ValueError(f"Unhandled extension: {ext}") def _read_docx(stream: io.BytesIO) -> str: """Extract text from a .docx file using built-in zipfile + xml.etree (no deps).""" try: with zipfile.ZipFile(stream) as z: with z.open("word/document.xml") as f: tree = ET.parse(f) except (zipfile.BadZipFile, KeyError) as exc: raise ValueError(f"Could not read Word document: {exc}") root = tree.getroot() paragraphs = [] for para in root.iter(f"{{{_WORD_NS}}}p"): # Collect all text runs, preserving spaces parts = [] for node in para.iter(): if node.tag == f"{{{_WORD_NS}}}t" and node.text: parts.append(node.text) elif node.tag == f"{{{_WORD_NS}}}br": parts.append("\n") text = "".join(parts).strip() if text: paragraphs.append(text) text = "\n\n".join(paragraphs) if not text.strip(): raise ValueError("No readable text found in the Word document") return text def _read_pdf(stream: io.BytesIO) -> str: """Extract text from a PDF using pypdf.""" try: from pypdf import PdfReader except ImportError: raise ValueError("pypdf not installed — run: pip install pypdf") try: reader = PdfReader(stream) except Exception as exc: raise ValueError(f"Could not read PDF: {exc}") pages = [] for page in reader.pages: text = page.extract_text() or "" if text.strip(): pages.append(text.strip()) text = "\n\n".join(pages) if not text.strip(): raise ValueError("No readable text found in the PDF (may be image-based)") return text