Spaces:
Sleeping
Sleeping
| import sys | |
| import os | |
| from typing import List, Dict, Optional | |
| sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) | |
| try: | |
| from PyPDF2 import PdfReader | |
| except ImportError: | |
| try: | |
| from pypdf import PdfReader | |
| except ImportError: | |
| print("Error: PDF reading library not found. Please install PyPDF2 or pypdf.") | |
| PdfReader = None | |
| from src.utils.chunking import chunk_pdf_text, clean_text | |
| from config.settings import Config | |
class PDFProcessor:
    """Process PDFs into cleaned text chunks.

    Text is extracted page by page with ``PdfReader``, passed through
    ``clean_text`` and split with ``chunk_pdf_text``. Read failures are
    reported with ``print`` and converted into empty results rather than
    being raised to the caller.
    """

    # process_pdf discards chunks whose stripped length does not exceed this.
    MIN_CHUNK_CHARS = 50

    def __init__(self, chunk_size: Optional[int] = None, overlap: Optional[int] = None) -> None:
        """
        Initialize processor with chunk parameters.

        Args:
            chunk_size: Characters per chunk. ``None`` (or 0) falls back to
                ``Config.CHUNK_SIZE``.
            overlap: Overlap between chunks. ``None`` falls back to
                ``Config.CHUNK_OVERLAP``; an explicit ``0`` is honoured.
        """
        self.chunk_size = chunk_size or Config.CHUNK_SIZE
        # Compare against None instead of using `or`: 0 is a legitimate
        # "no overlap" request and must not be replaced by the default.
        self.overlap = Config.CHUNK_OVERLAP if overlap is None else overlap

    def process_pdf(self, file_path: str) -> List[str]:
        """
        Read PDF, extract text, clean, and chunk.

        Args:
            file_path: Path to PDF.

        Returns:
            List of chunk strings; empty when no text could be extracted.
            Chunks whose stripped length is ``MIN_CHUNK_CHARS`` or less are
            dropped.
        """
        raw = self._extract_text(file_path)
        if not raw.strip():
            return []
        cleaned = clean_text(raw)
        chunks = chunk_pdf_text(cleaned, self.chunk_size, self.overlap)
        return [c for c in chunks if len(c.strip()) > self.MIN_CHUNK_CHARS]

    def get_pdf_info(self, file_path: str) -> Dict:
        """
        Retrieve simple info (pages, metadata, encryption).

        Args:
            file_path: Path to PDF.

        Returns:
            Dict with keys ``num_pages``, ``metadata`` and ``encrypted``, or
            an empty dict when the file cannot be read.
        """
        try:
            reader = self._open_reader(file_path)
            return {
                "num_pages": len(reader.pages),
                "metadata": reader.metadata,
                "encrypted": reader.is_encrypted,
            }
        except Exception as e:
            # Best-effort API: report the problem and return an empty dict.
            print(f"[PDFProcessor] Info error: {e}")
            return {}

    @staticmethod
    def _open_reader(file_path: str):
        """
        Build a ``PdfReader`` for ``file_path``.

        Raises:
            RuntimeError: If no PDF library could be imported (the module
                sets ``PdfReader`` to ``None`` in that case).
        """
        if PdfReader is None:
            # Without this guard the failure surfaces as the opaque
            # "'NoneType' object is not callable".
            raise RuntimeError(
                "PDF reading library not available. Please install PyPDF2 or pypdf."
            )
        return PdfReader(file_path)

    def _extract_text(self, file_path: str) -> str:
        """
        Extract text from all pages.

        Args:
            file_path: Path to PDF.

        Returns:
            Concatenated text with page separators; pages without text are
            skipped. Empty string when the file cannot be read.
        """
        try:
            reader = self._open_reader(file_path)
            out: List[str] = []
            for idx, page in enumerate(reader.pages):
                try:
                    text = page.extract_text() or ""
                    if text.strip():
                        out.append(f"\n--- Page {idx+1} ---\n{text}")
                except Exception as pe:
                    # A single bad page must not discard the other pages.
                    print(f"[PDFProcessor] Page {idx+1} extraction failed: {pe}")
            return "".join(out)
        except Exception as e:
            print(f"[PDFProcessor] Read error: {e}")
            return ""