feat: Implement an interactive document viewer with citation highlighting and structured PDF text extraction.
a86d063
| """ | |
| PDF Processing utilities for extracting and chunking text from PDF files | |
| """ | |
| import os | |
| from typing import List, Dict | |
| import PyPDF2 | |
| import pdfplumber | |
| try: | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| except ImportError: | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from config.model_config import config | |
class PDFProcessor:
    """Extract text from PDF files, split it into chunks and render previews.

    Text is read with pdfplumber or PyPDF2, can be organised into page and
    paragraph records carrying character offsets, and is split with a
    ``RecursiveCharacterTextSplitter`` sized from ``config.CHUNK_SIZE`` and
    ``config.CHUNK_OVERLAP``.
    """

    # Appended after every paragraph when ``full_text`` is assembled; paragraph
    # offsets are computed with this separator's length taken into account.
    _PARAGRAPH_SEPARATOR = "\n\n"

    # Stripped paragraph fragments of this many characters or fewer are dropped.
    _MIN_PARAGRAPH_CHARS = 20

    def __init__(self):
        """Create the text splitter used by the chunking methods."""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.CHUNK_SIZE,
            chunk_overlap=config.CHUNK_OVERLAP,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )

    def extract_text_from_pdf(self, pdf_path: str, method: str = "pdfplumber") -> str:
        """
        Extract text from PDF file

        Args:
            pdf_path: Path to PDF file
            method: Extraction method ('pypdf2' or 'pdfplumber'); any value
                other than 'pdfplumber' selects PyPDF2

        Returns:
            Extracted text as string

        Raises:
            Exception: Whatever the fallback extractor raises when the
                preferred extractor has already failed.
        """
        if method == "pdfplumber":
            primary = self._extract_with_pdfplumber
            fallback = self._extract_with_pypdf2
        else:
            primary = self._extract_with_pypdf2
            fallback = self._extract_with_pdfplumber

        try:
            return primary(pdf_path)
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {e}")
            # Best effort: retry once with the other library. A failure here
            # is deliberately not caught so the caller sees it.
            return fallback(pdf_path)

    def _extract_with_pypdf2(self, pdf_path: str) -> str:
        """Extract text using PyPDF2, emitting one newline-terminated entry per page."""
        with open(pdf_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # ``or ""`` guards against a page whose extraction yields nothing,
            # which would otherwise make the concatenation fail.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )

    def _extract_with_pdfplumber(self, pdf_path: str) -> str:
        """Extract text using pdfplumber, skipping pages that yield no text."""
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            return "".join(text + "\n" for text in page_texts if text)

    def chunk_text(self, text: str) -> List[str]:
        """
        Split text into chunks

        Args:
            text: Input text to chunk

        Returns:
            List of text chunks
        """
        return self.text_splitter.split_text(text)

    def extract_with_structure(self, pdf_path: str) -> Dict:
        """
        Extract text with page and paragraph structure

        Paragraphs are the blank-line separated ("\\n\\n") pieces of each
        page's text, stripped, with fragments of 20 characters or fewer
        discarded. ``full_text`` is every kept paragraph followed by "\\n\\n";
        each paragraph's ``char_start``/``char_end`` is its half-open span
        inside ``full_text``.

        If structured extraction fails, the plain text from
        ``extract_text_from_pdf`` is returned as a single paragraph on a
        single synthetic page so that pages, paragraphs and full text stay
        consistent with each other.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Dictionary with keys "pages", "paragraphs" and "full_text"
        """
        separator = self._PARAGRAPH_SEPARATOR
        pages = []
        paragraphs = []
        offset = 0  # length of full_text assembled so far

        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, start=1):
                    page_text = page.extract_text()
                    if not page_text:
                        continue

                    page_paragraphs = []
                    for raw_paragraph in page_text.split("\n\n"):
                        para_text = raw_paragraph.strip()
                        if len(para_text) <= self._MIN_PARAGRAPH_CHARS:
                            continue  # ignore very short fragments

                        paragraph_data = {
                            "id": f"para_{len(paragraphs) + 1}",
                            "page": page_num,
                            "text": para_text,
                            "char_start": offset,
                            "char_end": offset + len(para_text),
                        }
                        page_paragraphs.append(paragraph_data)
                        paragraphs.append(paragraph_data)
                        offset += len(para_text) + len(separator)

                    # A page with text but no kept paragraph is still listed.
                    pages.append({
                        "page_num": page_num,
                        "text": page_text,
                        "paragraphs": page_paragraphs,
                    })
        except Exception as e:
            print(f"Error extracting structured content: {e}")
            # Fallback to simple extraction. Anything collected before the
            # failure is discarded: its offsets would not match the new text.
            text = self.extract_text_from_pdf(pdf_path)
            fallback_paragraph = {
                "id": "para_1",
                "page": 1,
                "text": text,
                "char_start": 0,
                "char_end": len(text),
            }
            return {
                "pages": [{
                    "page_num": 1,
                    "text": text,
                    "paragraphs": [fallback_paragraph],
                }],
                "paragraphs": [fallback_paragraph],
                "full_text": text,
            }

        return {
            "pages": pages,
            "paragraphs": paragraphs,
            "full_text": "".join(para["text"] + separator for para in paragraphs),
        }

    def generate_html_preview(self, structured_content: Dict, filename: str) -> str:
        """
        Generate HTML representation of PDF for viewer

        The file name and all paragraph text come from the uploaded document,
        so every interpolated value is HTML-escaped before it is placed in
        the markup.

        Args:
            structured_content: Structured content from extract_with_structure
            filename: Name of the PDF file

        Returns:
            HTML string
        """
        # Imported locally so this block does not depend on a module-level
        # import; the name is kept distinct from any local called ``html``.
        from html import escape

        pages = structured_content["pages"]
        paragraphs = structured_content["paragraphs"]
        safe_name = escape(str(filename), quote=True)

        parts = [
            f'<div class="document-content" data-filename="{safe_name}">',
            '<div class="document-header">',
            f"<h3>📄 {safe_name}</h3>",
            f'<p class="doc-meta">{len(pages)} halaman • '
            f"{len(paragraphs)} paragraf</p>",
            "</div>",
        ]

        for page in pages:
            page_num = escape(str(page["page_num"]), quote=True)
            parts.append(f'<div class="pdf-page" data-page="{page_num}">')
            parts.append(f'<div class="page-number">Halaman {page_num}</div>')
            for para in page["paragraphs"]:
                para_id = escape(str(para["id"]), quote=True)
                para_page = escape(str(para["page"]), quote=True)
                parts.append(
                    f'<p class="paragraph" id="{para_id}" data-page="{para_page}">'
                )
                parts.append(escape(str(para["text"]), quote=False))
                parts.append("</p>")
            parts.append("</div>")

        parts.append("</div>")
        return "\n".join(parts)

    def chunk_text_with_metadata(self, structured_content: Dict) -> List[Dict]:
        """
        Split text into chunks with metadata about source location

        Each chunk is located in ``full_text`` by searching forward from just
        after the previous chunk's start, so repeated passages resolve to
        successive occurrences instead of always the first one. A chunk that
        cannot be located at all is reported with ``char_start`` and
        ``char_end`` of -1 and no related paragraphs or pages.

        Args:
            structured_content: Structured content from extract_with_structure

        Returns:
            List of dictionaries with keys "text", "chunk_index",
            "paragraph_ids", "pages", "char_start" and "char_end"
        """
        full_text = structured_content["full_text"]
        paragraphs = structured_content["paragraphs"]

        chunks_with_metadata = []
        search_from = 0
        for index, chunk in enumerate(self.text_splitter.split_text(full_text)):
            start = full_text.find(chunk, search_from)
            if start != -1:
                # Overlapping chunks may begin inside the previous one, so
                # only step one character past this chunk's start.
                search_from = start + 1
            else:
                # Not found ahead of the cursor: accept any occurrence
                # rather than losing the location entirely.
                start = full_text.find(chunk)

            if start == -1:
                end = -1
                related = []
            else:
                end = start + len(chunk)
                # Spans are half-open [start, end): touching is not overlap.
                related = [
                    para for para in paragraphs
                    if start < para["char_end"] and end > para["char_start"]
                ]

            chunks_with_metadata.append({
                "text": chunk,
                "chunk_index": index,
                "paragraph_ids": [para["id"] for para in related],
                "pages": sorted({para["page"] for para in related}),
                "char_start": start,
                "char_end": end,
            })

        return chunks_with_metadata

    def process_pdf(self, pdf_path: str) -> Dict:
        """
        Complete processing pipeline: extract and chunk PDF with structure

        Args:
            pdf_path: Path to PDF file

        Returns:
            Dictionary with filename, text, chunks, and structured content

        Raises:
            ValueError: If no non-whitespace text could be extracted.
        """
        filename = os.path.basename(pdf_path)

        structured_content = self.extract_with_structure(pdf_path)
        full_text = structured_content["full_text"]
        if not full_text.strip():
            raise ValueError(f"No text extracted from {filename}")

        html_preview = self.generate_html_preview(structured_content, filename)
        chunks_with_metadata = self.chunk_text_with_metadata(structured_content)
        # Plain chunk strings are kept alongside the metadata for callers
        # that only consume the text (backward compatibility).
        chunks = [item["text"] for item in chunks_with_metadata]

        return {
            "filename": filename,
            "full_text": full_text,
            "chunks": chunks,
            "chunks_metadata": chunks_with_metadata,
            "structured_content": structured_content,
            "html_preview": html_preview,
            "num_chunks": len(chunks),
            "total_chars": len(full_text),
            "num_pages": len(structured_content["pages"]),
            "num_paragraphs": len(structured_content["paragraphs"]),
        }

    def get_pdf_info(self, pdf_path: str) -> Dict:
        """
        Get metadata about PDF file

        The page count is best effort: if the file cannot be parsed, the
        error is printed and ``num_pages`` stays 0. The file-size lookup is
        not guarded, so a missing file raises from ``os.path.getsize``.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Dictionary with keys "filename", "file_size" and "num_pages"
        """
        info = {
            "filename": os.path.basename(pdf_path),
            "file_size": os.path.getsize(pdf_path),
            "num_pages": 0,
        }

        try:
            with open(pdf_path, "rb") as file:
                info["num_pages"] = len(PyPDF2.PdfReader(file).pages)
        except Exception as e:
            print(f"Error getting PDF info: {e}")

        return info