| """ |
| pdf_parser.py — Extract plain text from PDF, DOCX, and TXT files. |
| """ |
|
|
| import os |
| import logging |
| from io import BytesIO |
|
|
# Module-scoped logger; every extractor in this file reports failures through it.
logger = logging.getLogger(__name__)


# Maximum number of PDF pages to read, taken from the PDF_MAX_PAGES
# environment variable and falling back to 15 when the variable is unset.
PDF_MAX_PAGES = int(os.environ.get("PDF_MAX_PAGES", "15"))
|
|
|
|
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Return the text of the leading pages of an in-memory PDF.

    The slice ``reader.pages[:PDF_MAX_PAGES]`` selects the pages that are
    read. Each page's text is joined with a newline and the combined result
    is stripped of surrounding whitespace. A page whose ``extract_text()``
    returns a falsy value contributes an empty string.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The extracted text, or ``""`` when anything inside the ``try`` block
        raises (including a failed ``import PyPDF2``); the failure is logged
        at error level.
    """
    try:
        # Imported lazily, so a missing PyPDF2 is handled like any other failure.
        import PyPDF2

        stream = BytesIO(file_bytes)
        leading_pages = PyPDF2.PdfReader(stream).pages[:PDF_MAX_PAGES]

        chunks = []
        for pdf_page in leading_pages:
            page_text = pdf_page.extract_text()
            chunks.append(page_text if page_text else "")

        return "\n".join(chunks).strip()
    except Exception as exc:
        logger.error(f"[PDF] Extraction failed: {exc}")
        return ""
|
|
|
|
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Return the paragraph text of an in-memory DOCX document.

    The text of every entry in ``doc.paragraphs`` is joined with a newline
    and the combined result is stripped of surrounding whitespace.

    Args:
        file_bytes: Raw bytes of the DOCX document.

    Returns:
        The extracted text, or ``""`` when anything inside the ``try`` block
        raises (including a failed ``import docx``); the failure is logged
        at error level.
    """
    try:
        # Imported lazily, so a missing python-docx is handled like any other failure.
        import docx

        document = docx.Document(BytesIO(file_bytes))
        paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
        joined = "\n".join(paragraph_texts)
        return joined.strip()
    except Exception as exc:
        logger.error(f"[DOCX] Extraction failed: {exc}")
        return ""
|
|
|
|
def extract_text(file_bytes: bytes, filename: str) -> str:
    """Extract plain text from a file, choosing the parser by file extension.

    The extension is taken from ``filename`` case-insensitively.

    Args:
        file_bytes: Raw content of the file.
        filename: Name of the file; only its extension is inspected.

    Returns:
        For ``.pdf`` the result of ``extract_text_from_pdf``; for ``.docx``
        and ``.doc`` the result of ``extract_text_from_docx``; for ``.txt``
        the content decoded as UTF-8 (a leading byte-order mark is dropped,
        undecodable bytes are ignored) with surrounding whitespace stripped.

    Raises:
        ValueError: If the extension is none of ``.pdf``, ``.docx``,
            ``.doc`` or ``.txt``.
    """
    ext = os.path.splitext(filename.lower())[1]

    if ext == ".pdf":
        return extract_text_from_pdf(file_bytes)

    if ext in (".docx", ".doc"):
        # NOTE(review): legacy binary .doc files are sent to the DOCX parser;
        # python-docx presumably cannot read them, in which case the result is
        # "" plus an error log. Confirm whether .doc should be rejected instead.
        return extract_text_from_docx(file_bytes)

    if ext == ".txt":
        # "utf-8-sig" removes a leading UTF-8 BOM. Plain "utf-8" keeps it as
        # "\ufeff", which str.strip() does not treat as whitespace, so the
        # marker would leak into the returned text.
        return file_bytes.decode("utf-8-sig", errors="ignore").strip()

    raise ValueError(f"Unsupported file type: {ext}. Allowed: PDF, DOCX, TXT.")
|
|