Spaces:
Sleeping
Sleeping
"""
PDF text extraction for user-uploaded datasheets.
Uses PyMuPDF (fitz) for robust PDF parsing.
"""
| from __future__ import annotations | |
| import logging | |
| from pathlib import Path | |
| logger = logging.getLogger(__name__) | |
def extract_text_from_pdf(file_path: str | Path) -> str:
    """
    Extract all text from a PDF file on disk.

    Pages whose extracted text is empty or whitespace-only are skipped. Every
    remaining page is prefixed with a ``--- Page N ---`` header (1-based page
    number) and the pages are joined with newlines.

    Args:
        file_path: Location of the PDF; converted with ``str()`` before being
            handed to PyMuPDF.

    Returns:
        The concatenated page text, or ``""`` when PyMuPDF is not installed
        or when opening/reading the document raises (the failure is logged,
        never propagated).
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        logger.error("PyMuPDF not installed. Run: pip install pymupdf")
        return ""

    pages_text = []
    try:
        # The context manager closes the document even when a page fails
        # mid-extraction; a trailing doc.close() would be skipped in that case.
        with fitz.open(str(file_path)) as doc:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text()
                if text.strip():
                    pages_text.append(f"--- Page {page_num} ---\n{text}")
    except Exception as exc:
        # Best-effort API: callers receive an empty string instead of an error.
        logger.error("PDF extraction failed for %s: %s", file_path, exc)
        return ""

    full_text = "\n".join(pages_text)
    # The page count logged here only includes pages that produced text.
    logger.info(
        "Extracted %d chars from %d pages of %s",
        len(full_text), len(pages_text), file_path,
    )
    return full_text
def extract_text_from_bytes(file_bytes: bytes, filename: str = "upload.pdf") -> str:
    """
    Extract text from PDF bytes (for Gradio file upload handling).

    Pages whose extracted text is empty or whitespace-only are skipped. Every
    remaining page is prefixed with a ``--- Page N ---`` header (1-based page
    number) and the pages are joined with newlines.

    Args:
        file_bytes: Raw content of the PDF, opened by PyMuPDF as an in-memory
            stream with ``filetype="pdf"``.
        filename: Label used only in log messages; it is never opened.

    Returns:
        The concatenated page text, or ``""`` when PyMuPDF is not installed
        or when opening/reading the document raises (the failure is logged,
        never propagated).
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        logger.error("PyMuPDF not installed.")
        return ""

    pages_text = []
    try:
        # The context manager closes the document even when a page fails
        # mid-extraction; a trailing doc.close() would be skipped in that case.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text()
                if text.strip():
                    pages_text.append(f"--- Page {page_num} ---\n{text}")
    except Exception as exc:
        # Name the upload so a failure can be traced back to a specific file.
        logger.error("PDF bytes extraction failed for %s: %s", filename, exc)
        return ""

    full_text = "\n".join(pages_text)
    # The page count logged here only includes pages that produced text.
    logger.info(
        "Extracted %d chars from %d pages of %s",
        len(full_text), len(pages_text), filename,
    )
    return full_text