Spaces:
Sleeping
Sleeping
| """ | |
| pdf_parser.py — Extract plain text from PDF, DOCX, and TXT files. | |
| """ | |
| import os | |
| import logging | |
| from io import BytesIO | |
logger = logging.getLogger(__name__)

# Page cap used when PDF_MAX_PAGES is unset, empty, or not a positive integer.
_DEFAULT_PDF_MAX_PAGES = 15


def _read_pdf_max_pages() -> int:
    """Return the PDF page cap configured through the PDF_MAX_PAGES env var.

    Returns:
        The integer value of PDF_MAX_PAGES when it is a positive integer,
        otherwise the default of 15. An invalid value is reported with a
        warning instead of raising, so a misconfigured environment does not
        break importing this module.
    """
    raw = os.getenv("PDF_MAX_PAGES", "").strip()
    if not raw:
        # Unset, or set to an empty string (int("") would raise).
        return _DEFAULT_PDF_MAX_PAGES
    try:
        value = int(raw)
    except ValueError:
        logger.warning(
            "[PDF] Ignoring non-integer PDF_MAX_PAGES=%r; using %d",
            raw,
            _DEFAULT_PDF_MAX_PAGES,
        )
        return _DEFAULT_PDF_MAX_PAGES
    if value < 1:
        # A negative cap used as a slice bound would drop trailing pages
        # rather than limit the page count.
        logger.warning(
            "[PDF] Ignoring non-positive PDF_MAX_PAGES=%r; using %d",
            raw,
            _DEFAULT_PDF_MAX_PAGES,
        )
        return _DEFAULT_PDF_MAX_PAGES
    return value


# Maximum number of leading pages read from a PDF.
PDF_MAX_PAGES = _read_pdf_max_pages()
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract text from a PDF byte stream (up to PDF_MAX_PAGES pages).

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        The text of the leading pages joined with newlines and stripped.
        Returns an empty string when the document cannot be opened at all
        (including when PyPDF2 cannot be imported); the failure is logged
        rather than raised. A page whose extraction fails contributes an
        empty string instead of discarding the text of the other pages.
    """
    try:
        # Imported inside the try so a missing PyPDF2 yields "" plus a log
        # entry instead of an ImportError for the caller.
        import PyPDF2

        reader = PyPDF2.PdfReader(BytesIO(file_bytes))
        pages = reader.pages[:PDF_MAX_PAGES]
    except Exception as exc:
        logger.exception("[PDF] Extraction failed: %s", exc)
        return ""

    chunks = []
    for number, page in enumerate(pages, start=1):
        try:
            # extract_text() may yield nothing for a page; normalise to "".
            chunks.append(page.extract_text() or "")
        except Exception as exc:
            # Keep going: one unreadable page should not lose the rest.
            logger.exception("[PDF] Extraction failed on page %d: %s", number, exc)
            chunks.append("")
    return "\n".join(chunks).strip()
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Extract text from a DOCX byte stream.

    Args:
        file_bytes: Raw contents of the DOCX file.

    Returns:
        The text of the document's paragraphs joined with newlines and
        stripped. Returns an empty string when extraction fails for any
        reason (including when the ``docx`` package cannot be imported);
        the failure is logged rather than raised.
    """
    try:
        # Imported inside the try so a missing python-docx yields "" plus a
        # log entry instead of an ImportError for the caller.
        import docx

        document = docx.Document(BytesIO(file_bytes))
        # NOTE(review): only `Document.paragraphs` is read; text held in
        # tables is presumably not included - confirm whether it is needed.
        return "\n".join(
            paragraph.text for paragraph in document.paragraphs
        ).strip()
    except Exception as exc:
        logger.exception("[DOCX] Extraction failed: %s", exc)
        return ""
def extract_text(file_bytes: bytes, filename: str) -> str:
    """Dispatch extraction based on file extension.

    Args:
        file_bytes: Raw contents of the uploaded file.
        filename: Original file name; only its extension is used, compared
            case-insensitively.

    Returns:
        The extracted plain text, stripped of surrounding whitespace.

    Raises:
        ValueError: If the extension is not .pdf, .docx, .doc, or .txt
            (including a missing or empty filename).
    """
    # Treat a missing filename as "no extension" so it reaches the
    # ValueError below instead of failing on None.lower().
    ext = os.path.splitext((filename or "").lower())[1]

    if ext == ".pdf":
        return extract_text_from_pdf(file_bytes)
    if ext in (".docx", ".doc"):
        # NOTE(review): ".doc" is routed to the DOCX extractor; a legacy
        # binary .doc file is presumably not readable by it - confirm.
        return extract_text_from_docx(file_bytes)
    if ext == ".txt":
        # "utf-8-sig" decodes like "utf-8" but also drops a leading BOM,
        # which str.strip() would otherwise leave in place as "\ufeff".
        return file_bytes.decode("utf-8-sig", errors="ignore").strip()
    raise ValueError(f"Unsupported file type: {ext}. Allowed: PDF, DOCX, TXT.")