Spaces:
Sleeping
Sleeping
| """ | |
| Text extraction utilities for PDF and images. | |
| Supports both digital PDFs and scanned documents (OCR). | |
| """ | |
| import pdfplumber | |
| import fitz # PyMuPDF | |
| import pytesseract | |
| from PIL import Image | |
| from pathlib import Path | |
| from typing import Dict, Tuple | |
| import logging | |
| logger = logging.getLogger(__name__) | |
def extract_text_from_pdf(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from a PDF using pdfplumber (for digital PDFs).

    Pages for which pdfplumber yields no text are skipped; the remaining
    page texts are joined with a blank line between them. If the document
    yields no non-whitespace text at all, it is treated as a possibly
    scanned PDF and handed to ``extract_text_from_pdf_ocr``.

    Args:
        file_path: Path to the PDF document.

    Returns:
        (raw_text, metadata), where metadata holds ``page_count``,
        ``extraction_method`` and a heuristic ``confidence_score``
        (1.0 when more than 50 characters were extracted, else 0.5).
        When the OCR fallback is taken, the OCR extractor's result is
        returned unchanged.

    Raises:
        Exception: Any error raised while opening or reading the PDF is
            logged and re-raised unchanged.
    """
    try:
        with pdfplumber.open(str(file_path)) as pdf:
            page_count = len(pdf.pages)
            # Extract inside the ``with`` block: pages are only readable
            # while the underlying file is open.
            extracted = [page.extract_text() for page in pdf.pages]
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception("PDF extraction failed: %s", e)
        raise

    raw_text = "\n\n".join(text for text in extracted if text)

    # If no text extracted, it might be a scanned PDF. The fallback runs
    # outside the ``try`` above so that an OCR failure is not additionally
    # reported as a pdfplumber failure.
    if not raw_text.strip():
        logger.info("No text found with pdfplumber, trying OCR...")
        return extract_text_from_pdf_ocr(file_path)

    metadata = {
        "page_count": page_count,
        "extraction_method": "pdfplumber",
        "confidence_score": 1.0 if len(raw_text) > 50 else 0.5,
    }
    return raw_text, metadata
def extract_text_from_pdf_ocr(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from a scanned PDF using OCR (PyMuPDF + Tesseract).

    Each page is rendered to a 300 DPI bitmap, converted to a PIL image and
    passed to Tesseract. Page texts are joined with a blank line between
    them; pages that OCR to an empty string are still included.

    Args:
        file_path: Path to the PDF document.

    Returns:
        (raw_text, metadata), where metadata holds ``page_count``,
        ``extraction_method`` ("tesseract_ocr") and a fixed
        ``confidence_score`` of 0.7 (a constant, not derived from the
        OCR output).

    Raises:
        Exception: Any error raised while opening, rendering or running
            OCR is logged and re-raised unchanged.
    """
    try:
        text_pages = []
        # The context manager closes the document even when rendering or
        # OCR raises part-way through, which a trailing doc.close() would
        # not do.
        with fitz.open(str(file_path)) as doc:
            page_count = len(doc)
            for page in doc:
                # Convert page to image
                pix = page.get_pixmap(dpi=300)
                # NOTE(review): mode "RGB" assumes a 3-channel pixmap
                # without alpha (believed to be get_pixmap's default) -
                # confirm against the installed PyMuPDF version.
                img = Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )
                # OCR
                text_pages.append(pytesseract.image_to_string(img))

        raw_text = "\n\n".join(text_pages)
        metadata = {
            "page_count": page_count,
            "extraction_method": "tesseract_ocr",
            "confidence_score": 0.7,  # OCR typically less confident
        }
        return raw_text, metadata
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception("OCR extraction failed: %s", e)
        raise
def extract_text_from_image(file_path: Path) -> Tuple[str, Dict]:
    """
    Extract text from an image using OCR (Tesseract).

    Args:
        file_path: Path to the image file.

    Returns:
        (raw_text, metadata), where metadata holds ``page_count`` (always
        1), ``extraction_method`` ("tesseract_ocr") and a fixed
        ``confidence_score`` of 0.7 (a constant, not derived from the
        OCR output).

    Raises:
        Exception: Any error raised while opening the image or running
            OCR is logged and re-raised unchanged.
    """
    try:
        # Open via a context manager so the file handle is released as soon
        # as OCR finishes, including when OCR raises.
        with Image.open(str(file_path)) as img:
            raw_text = pytesseract.image_to_string(img)
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception("Image OCR failed: %s", e)
        raise

    metadata = {
        "page_count": 1,
        "extraction_method": "tesseract_ocr",
        "confidence_score": 0.7,
    }
    return raw_text, metadata
def extract_text(file_path: Path, mime_type: str) -> Tuple[str, Dict]:
    """
    Main entry point for text extraction.

    Dispatches to the PDF or image extractor according to the exact
    MIME type string supplied by the caller.

    Args:
        file_path: Path to document
        mime_type: MIME type of document; "application/pdf",
            "image/png", "image/jpeg" and "image/jpg" are recognised.

    Returns:
        (raw_text, metadata_dict)

    Raises:
        ValueError: If ``mime_type`` is not one of the recognised values.
    """
    # PDFs go through the pdfplumber-based extractor.
    if mime_type == "application/pdf":
        return extract_text_from_pdf(file_path)

    # Raster images go straight to Tesseract OCR.
    image_mime_types = ("image/png", "image/jpeg", "image/jpg")
    if mime_type in image_mime_types:
        return extract_text_from_image(file_path)

    raise ValueError(f"Unsupported file type: {mime_type}")