""" PDF Processing utilities for extracting and chunking text from PDF files """ import os from typing import List, Dict import PyPDF2 import pdfplumber try: from langchain_text_splitters import RecursiveCharacterTextSplitter except ImportError: from langchain.text_splitter import RecursiveCharacterTextSplitter from config.model_config import config class PDFProcessor: """Handle PDF text extraction and processing""" def __init__(self): self.text_splitter = RecursiveCharacterTextSplitter( chunk_size=config.CHUNK_SIZE, chunk_overlap=config.CHUNK_OVERLAP, length_function=len, separators=["\n\n", "\n", " ", ""] ) def extract_text_from_pdf(self, pdf_path: str, method: str = "pdfplumber") -> str: """ Extract text from PDF file Args: pdf_path: Path to PDF file method: Extraction method ('pypdf2' or 'pdfplumber') Returns: Extracted text as string """ text = "" try: if method == "pdfplumber": text = self._extract_with_pdfplumber(pdf_path) else: text = self._extract_with_pypdf2(pdf_path) except Exception as e: print(f"Error extracting text from {pdf_path}: {e}") # Fallback to alternative method if method == "pdfplumber": text = self._extract_with_pypdf2(pdf_path) else: text = self._extract_with_pdfplumber(pdf_path) return text def _extract_with_pypdf2(self, pdf_path: str) -> str: """Extract text using PyPDF2""" text = "" with open(pdf_path, 'rb') as file: pdf_reader = PyPDF2.PdfReader(file) for page in pdf_reader.pages: text += page.extract_text() + "\n" return text def _extract_with_pdfplumber(self, pdf_path: str) -> str: """Extract text using pdfplumber (better for complex PDFs)""" text = "" with pdfplumber.open(pdf_path) as pdf: for page in pdf.pages: page_text = page.extract_text() if page_text: text += page_text + "\n" return text def chunk_text(self, text: str) -> List[str]: """ Split text into chunks Args: text: Input text to chunk Returns: List of text chunks """ chunks = self.text_splitter.split_text(text) return chunks def extract_with_structure(self, pdf_path: str) -> Dict: """ Extract text with page and paragraph structure Args: pdf_path: Path to PDF file Returns: Dictionary with structured content including pages and paragraphs """ structured_content = { "pages": [], "paragraphs": [], "full_text": "" } try: with pdfplumber.open(pdf_path) as pdf: paragraph_id = 0 for page_num, page in enumerate(pdf.pages, start=1): page_text = page.extract_text() if not page_text: continue # Split into paragraphs (double newline or significant whitespace) raw_paragraphs = page_text.split('\n\n') page_paragraphs = [] for para_text in raw_paragraphs: para_text = para_text.strip() if len(para_text) > 20: # Ignore very short fragments paragraph_id += 1 paragraph_data = { "id": f"para_{paragraph_id}", "page": page_num, "text": para_text, "char_start": len(structured_content["full_text"]), "char_end": len(structured_content["full_text"]) + len(para_text) } page_paragraphs.append(paragraph_data) structured_content["paragraphs"].append(paragraph_data) structured_content["full_text"] += para_text + "\n\n" structured_content["pages"].append({ "page_num": page_num, "text": page_text, "paragraphs": page_paragraphs }) except Exception as e: print(f"Error extracting structured content: {e}") # Fallback to simple extraction text = self.extract_text_from_pdf(pdf_path) structured_content["full_text"] = text structured_content["paragraphs"] = [{ "id": "para_1", "page": 1, "text": text, "char_start": 0, "char_end": len(text) }] return structured_content def generate_html_preview(self, structured_content: Dict, filename: str) -> str: """ Generate HTML representation of PDF for viewer Args: structured_content: Structured content from extract_with_structure filename: Name of the PDF file Returns: HTML string """ html = f"""

📄 {filename}

{len(structured_content['pages'])} halaman • {len(structured_content['paragraphs'])} paragraf

""" for page in structured_content["pages"]: html += f"""
Halaman {page['page_num']}
""" for para in page["paragraphs"]: html += f"""

{para['text']}

""" html += "
" html += "
" return html def chunk_text_with_metadata(self, structured_content: Dict) -> List[Dict]: """ Split text into chunks with metadata about source location Args: structured_content: Structured content from extract_with_structure Returns: List of dictionaries with chunk text and metadata """ # Get chunks from the splitter text_chunks = self.text_splitter.split_text(structured_content["full_text"]) chunks_with_metadata = [] for i, chunk_text in enumerate(text_chunks): # Find which paragraphs this chunk overlaps with chunk_start = structured_content["full_text"].find(chunk_text) chunk_end = chunk_start + len(chunk_text) # Find overlapping paragraphs related_paragraphs = [] related_pages = set() for para in structured_content["paragraphs"]: # Check if chunk overlaps with paragraph if not (chunk_end < para["char_start"] or chunk_start > para["char_end"]): related_paragraphs.append(para["id"]) related_pages.add(para["page"]) chunks_with_metadata.append({ "text": chunk_text, "chunk_index": i, "paragraph_ids": related_paragraphs, "pages": sorted(list(related_pages)), "char_start": chunk_start, "char_end": chunk_end }) return chunks_with_metadata def process_pdf(self, pdf_path: str) -> Dict: """ Complete processing pipeline: extract and chunk PDF with structure Args: pdf_path: Path to PDF file Returns: Dictionary with filename, text, chunks, and structured content """ filename = os.path.basename(pdf_path) # Extract structured content structured_content = self.extract_with_structure(pdf_path) if not structured_content["full_text"].strip(): raise ValueError(f"No text extracted from {filename}") # Generate HTML preview html_preview = self.generate_html_preview(structured_content, filename) # Chunk text with metadata chunks_with_metadata = self.chunk_text_with_metadata(structured_content) # Extract just the text for backward compatibility chunks = [c["text"] for c in chunks_with_metadata] return { "filename": filename, "full_text": structured_content["full_text"], "chunks": chunks, "chunks_metadata": chunks_with_metadata, "structured_content": structured_content, "html_preview": html_preview, "num_chunks": len(chunks), "total_chars": len(structured_content["full_text"]), "num_pages": len(structured_content["pages"]), "num_paragraphs": len(structured_content["paragraphs"]) } def get_pdf_info(self, pdf_path: str) -> Dict: """ Get metadata about PDF file Args: pdf_path: Path to PDF file Returns: Dictionary with PDF metadata """ info = { "filename": os.path.basename(pdf_path), "file_size": os.path.getsize(pdf_path), "num_pages": 0 } try: with open(pdf_path, 'rb') as file: pdf_reader = PyPDF2.PdfReader(file) info["num_pages"] = len(pdf_reader.pages) except Exception as e: print(f"Error getting PDF info: {e}") return info