Spaces:
Sleeping
Sleeping
File size: 2,868 Bytes
edac567 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import sys
import os
from typing import List, Dict, Optional

# Put the directory two levels above this file's directory on sys.path so the
# absolute `src.*` / `config.*` imports below resolve. This must run before
# those imports.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))

# Prefer PyPDF2 and fall back to pypdf. If neither is installed, PdfReader is
# bound to None so this module still imports.
try:
    from PyPDF2 import PdfReader
except ImportError:
    try:
        from pypdf import PdfReader
    except ImportError:
        print("Error: PDF reading library not found. Please install PyPDF2 or pypdf.")
        PdfReader = None

from src.utils.chunking import chunk_pdf_text, clean_text
from config.settings import Config
class PDFProcessor:
    """Process PDFs into cleaned text chunks.

    Attributes:
        chunk_size: Chunk size forwarded to ``chunk_pdf_text``.
        overlap: Chunk overlap forwarded to ``chunk_pdf_text``.
    """

    # Chunks whose stripped length does not exceed this many characters are
    # dropped by process_pdf().
    MIN_CHUNK_CHARS = 50

    def __init__(self, chunk_size: Optional[int] = None, overlap: Optional[int] = None) -> None:
        """
        Initialize processor with chunk parameters.

        Args:
            chunk_size: Characters per chunk. ``None`` (or ``0``) falls back
                to ``Config.CHUNK_SIZE``.
            overlap: Overlap between chunks. ``None`` falls back to
                ``Config.CHUNK_OVERLAP``; an explicit ``0`` is honored.
        """
        self.chunk_size = chunk_size or Config.CHUNK_SIZE
        # Compare against None rather than relying on truthiness: 0 is a
        # legitimate "no overlap" request and must not be replaced by the
        # configured default.
        self.overlap = overlap if overlap is not None else Config.CHUNK_OVERLAP

    def process_pdf(self, file_path: str) -> List[str]:
        """
        Read PDF, extract text, clean, and chunk.

        Args:
            file_path: Path to PDF.

        Returns:
            List of chunk strings. Empty when no text could be extracted
            (including when the file could not be read).
        """
        raw = self._extract_text(file_path)
        if not raw.strip():
            return []
        cleaned = clean_text(raw)
        chunks = chunk_pdf_text(cleaned, self.chunk_size, self.overlap)
        # Discard chunks too short to be useful.
        return [c for c in chunks if len(c.strip()) > self.MIN_CHUNK_CHARS]

    def get_pdf_info(self, file_path: str) -> Dict:
        """
        Retrieve simple info (pages, metadata, encryption).

        Args:
            file_path: Path to PDF.

        Returns:
            Dict with keys ``"num_pages"``, ``"metadata"`` and
            ``"encrypted"``; an empty dict if the PDF could not be read.
        """
        try:
            reader = self._open_reader(file_path)
            return {
                "num_pages": len(reader.pages),
                "metadata": reader.metadata,
                "encrypted": reader.is_encrypted,
            }
        except Exception as e:
            # Best-effort: report the problem and return an empty result
            # instead of propagating.
            print(f"[PDFProcessor] Info error: {e}")
            return {}

    @staticmethod
    def _open_reader(file_path: str):
        """
        Open ``file_path`` with the available PDF reader class.

        Args:
            file_path: Path to PDF.

        Returns:
            A ``PdfReader`` instance for the file.

        Raises:
            RuntimeError: If no PDF library was importable (``PdfReader`` is
                ``None``). Other errors come from the reader itself.
        """
        if PdfReader is None:
            # Give a clear message instead of "'NoneType' object is not callable".
            raise RuntimeError(
                "PDF reading library not found. Please install PyPDF2 or pypdf."
            )
        return PdfReader(file_path)

    def _extract_text(self, file_path: str) -> str:
        """
        Extract text from all pages.

        Args:
            file_path: Path to PDF.

        Returns:
            Concatenated text with page separators; ``""`` if the file could
            not be read. Pages with no text, or whose extraction fails, are
            skipped.
        """
        try:
            reader = self._open_reader(file_path)
            out: List[str] = []
            for idx, page in enumerate(reader.pages):
                # A failure on one page must not discard the other pages.
                try:
                    text = page.extract_text() or ""
                    if text.strip():
                        out.append(f"\n--- Page {idx+1} ---\n{text}")
                except Exception as pe:
                    print(f"[PDFProcessor] Page {idx+1} extraction failed: {pe}")
            return "".join(out)
        except Exception as e:
            print(f"[PDFProcessor] Read error: {e}")
            return ""