"""
pdf_parser.py — Extract plain text from PDF, DOCX, and TXT files.
"""

import os
import logging
from io import BytesIO

logger = logging.getLogger(__name__)

# Upper bound on the number of PDF pages read per document; overridable via
# the PDF_MAX_PAGES environment variable (parsed once, at import time).
PDF_MAX_PAGES = int(os.getenv("PDF_MAX_PAGES", "15"))


def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract text from a PDF byte stream (up to PDF_MAX_PAGES pages).

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        The text of the leading pages joined by newlines and stripped of
        surrounding whitespace, or "" if extraction fails for any reason
        (the failure is logged, never raised).
    """
    try:
        # Imported lazily so the module can still be imported (and TXT
        # extraction still works) when PyPDF2 is not installed.
        import PyPDF2

        reader = PyPDF2.PdfReader(BytesIO(file_bytes))
        pages = reader.pages[:PDF_MAX_PAGES]
        # extract_text() may yield None for a page; treat that as empty.
        text = "\n".join(page.extract_text() or "" for page in pages)
        return text.strip()
    except Exception as exc:
        # Best-effort by design: callers receive "" instead of an exception.
        logger.error("[PDF] Extraction failed: %s", exc)
        return ""


def extract_text_from_docx(file_bytes: bytes) -> str:
    """Extract text from a DOCX byte stream.

    Args:
        file_bytes: Raw contents of the DOCX file.

    Returns:
        The text of every paragraph joined by newlines and stripped of
        surrounding whitespace, or "" if extraction fails for any reason
        (the failure is logged, never raised).
    """
    try:
        # Imported lazily so the module can still be imported when
        # python-docx is not installed.
        import docx

        doc = docx.Document(BytesIO(file_bytes))
        return "\n".join(para.text for para in doc.paragraphs).strip()
    except Exception as exc:
        # Best-effort by design: callers receive "" instead of an exception.
        logger.error("[DOCX] Extraction failed: %s", exc)
        return ""


def extract_text(file_bytes: bytes, filename: str) -> str:
    """Dispatch extraction based on file extension.

    Args:
        file_bytes: Raw contents of the uploaded file.
        filename: Name of the file; only its extension is inspected
            (case-insensitively).

    Returns:
        The extracted text with surrounding whitespace stripped. PDF and
        DOCX extraction return "" when parsing fails.

    Raises:
        ValueError: If the extension is not .pdf, .docx, .doc or .txt.
    """
    ext = os.path.splitext(filename.lower())[1]
    if ext == ".pdf":
        return extract_text_from_pdf(file_bytes)
    if ext in (".docx", ".doc"):
        # NOTE: ".doc" is routed to the DOCX extractor for backward
        # compatibility. A file the DOCX parser cannot open (presumably any
        # genuine legacy binary .doc) ends up as "" via the extractor's
        # error handling.
        return extract_text_from_docx(file_bytes)
    if ext == ".txt":
        # "utf-8-sig" drops a leading byte-order mark, which plain "utf-8"
        # would leave in the text as U+FEFF (str.strip() does not remove it).
        # Undecodable bytes are still silently skipped.
        return file_bytes.decode("utf-8-sig", errors="ignore").strip()
    raise ValueError(f"Unsupported file type: {ext}. Allowed: PDF, DOCX, TXT.")