feat: Implement an interactive document viewer with citation highlighting and structured PDF text extraction.
a86d063
| """ | |
| Document viewer component for displaying PDFs with citation highlighting | |
| """ | |
import html
import json
from typing import List, Dict, Optional
class DocumentViewer:
    """Build the HTML for a document viewer with citation highlighting.

    The class produces plain HTML strings: the viewer itself (controls,
    document body and a small client-side script), clickable citation cards
    that call the script's ``scrollToParagraph`` function, and a document
    drop-down selector.

    Values that originate from documents or the retrieval pipeline
    (filenames, snippets, paragraph IDs, page numbers) are HTML-escaped
    before being interpolated. ``html_content`` passed to
    :meth:`render_document` is treated as ready-made markup and is inserted
    verbatim.
    """

    # Maximum number of snippet characters shown on a citation card.
    _SNIPPET_LIMIT = 150

    # Client-side behaviour appended after every rendered document. Kept as a
    # plain (non f-string) constant so the JavaScript braces need no doubling.
    _VIEWER_SCRIPT = """
    <script>
    // Auto-scroll to first highlighted paragraph
    document.addEventListener('DOMContentLoaded', function() {
        const firstHighlight = document.querySelector('.highlighted-citation');
        if (firstHighlight) {
            setTimeout(function() {
                firstHighlight.scrollIntoView({ behavior: 'smooth', block: 'center' });
            }, 300);
        }
    });
    // Smooth scroll when clicking citation links
    function scrollToParagraph(paraId) {
        const element = document.getElementById(paraId);
        if (element) {
            // Remove previous highlights
            document.querySelectorAll('.highlighted-citation').forEach(el => {
                el.classList.remove('highlighted-citation');
            });
            // Add highlight to clicked citation
            element.classList.add('highlighted-citation');
            // Scroll to element
            element.scrollIntoView({ behavior: 'smooth', block: 'center' });
            // Flash effect
            element.style.animation = 'none';
            setTimeout(() => {
                element.style.animation = 'highlight-flash 1s ease';
            }, 10);
        }
    }
    </script>
    """

    def __init__(self):
        """Initialize the viewer with no document and no highlights."""
        self.current_document = None
        self.current_highlights = []

    @staticmethod
    def _js_single_quoted(value: object) -> str:
        """Return *value* as a single-quoted JS string literal for an HTML attribute.

        Args:
            value: Value to embed; it is converted with ``str()`` first.

        Returns:
            Text such as ``'p1'`` that is safe to place inside a
            double-quoted HTML event-handler attribute.
        """
        # json.dumps escapes backslashes, double quotes and control characters
        # with sequences that are also valid inside a JavaScript string; the
        # surrounding double quotes it adds are stripped again.
        body = json.dumps(str(value))[1:-1].replace("'", "\\'")
        # The browser HTML-decodes the attribute before running it as JS, so
        # the JS-escaped text must additionally be HTML-escaped.
        return "'" + html.escape(body, quote=True) + "'"

    @staticmethod
    def _parse_json_list(value: object) -> list:
        """Coerce a metadata field to a list.

        Args:
            value: Either a list or a JSON-encoded string.

        Returns:
            The list itself, the decoded list, or ``[]`` when the value is
            missing, malformed, or not a list after decoding.
        """
        if isinstance(value, str):
            try:
                value = json.loads(value)
            except (ValueError, RecursionError):
                # Best effort: unreadable metadata simply yields no entries.
                return []
        return value if isinstance(value, list) else []

    def render_document(
        self,
        html_content: str,
        filename: str,
        highlight_paragraphs: Optional[List[str]] = None,
    ) -> str:
        """Render a document with optional paragraph highlighting.

        Args:
            html_content: HTML content of the document (inserted verbatim).
            filename: Name of the document, shown in the control bar.
            highlight_paragraphs: Paragraph IDs to highlight. Only tags of
                the exact form ``<p class="paragraph" id="...">`` are matched.

        Returns:
            Viewer HTML (controls, content, script), or the empty-state HTML
            when ``html_content`` is empty.
        """
        if not html_content:
            return self._render_empty_state()

        for para_id in highlight_paragraphs or []:
            # The closing quote is part of the pattern, so "p1" never
            # matches the opening tag of "p10".
            html_content = html_content.replace(
                f'<p class="paragraph" id="{para_id}"',
                f'<p class="paragraph highlighted-citation" id="{para_id}"',
            )

        controls = self._create_viewer_controls(filename)
        return (
            '\n<div class="document-viewer-wrapper">\n'
            f"{controls}\n"
            '<div class="document-viewer-content" id="doc-viewer-content">\n'
            f"{html_content}\n"
            "</div>\n"
            "</div>\n"
            f"{self._VIEWER_SCRIPT}"
        )

    def _create_viewer_controls(self, filename: str) -> str:
        """Create the viewer control bar (title plus three zoom buttons).

        Args:
            filename: Current document filename; HTML-escaped before display.

        Returns:
            HTML for the controls.
        """
        safe_name = html.escape(str(filename), quote=True)
        return f"""
        <div class="viewer-controls">
            <div class="viewer-title">
                <span class="doc-icon">📄</span>
                <span class="doc-name">{safe_name}</span>
            </div>
            <div class="viewer-actions">
                <button class="viewer-btn" onclick="document.getElementById('doc-viewer-content').style.fontSize='0.9em';" title="Zoom Out">
                    🔍−
                </button>
                <button class="viewer-btn" onclick="document.getElementById('doc-viewer-content').style.fontSize='1em';" title="Reset Zoom">
                    🔍
                </button>
                <button class="viewer-btn" onclick="document.getElementById('doc-viewer-content').style.fontSize='1.1em';" title="Zoom In">
                    🔍+
                </button>
            </div>
        </div>
        """

    def _render_empty_state(self) -> str:
        """Render the placeholder shown when no document is selected.

        Returns:
            HTML for the empty state.
        """
        return """
        <div class="document-viewer-empty">
            <div class="empty-state-icon">📄</div>
            <h3>Tidak Ada Dokumen</h3>
            <p>Upload dokumen PDF untuk melihat konten dan sitasi di sini.</p>
        </div>
        """

    def create_citation_link(
        self,
        filename: str,
        paragraph_ids: List[str],
        snippet: str,
        page: Optional[int] = None,
    ) -> str:
        """Create a clickable citation card for the chat.

        Clicking the card calls ``scrollToParagraph`` with the first
        paragraph ID, or with ``"unknown"`` when no IDs are given.

        Args:
            filename: Source document filename.
            paragraph_ids: Paragraph IDs this citation refers to.
            snippet: Text snippet to show; at most 150 characters are
                displayed, followed by ``...`` only if text was cut off.
            page: Page number (optional). A falsy value (``None`` or ``0``)
                omits the page label.

        Returns:
            HTML for the citation card.
        """
        para_id = paragraph_ids[0] if paragraph_ids else "unknown"
        page_info = f" (Hal. {html.escape(str(page), quote=True)})" if page else ""

        text = "" if snippet is None else str(snippet)
        shown = text[: self._SNIPPET_LIMIT]
        if len(text) > self._SNIPPET_LIMIT:
            shown += "..."

        safe_name = html.escape(str(filename), quote=True)
        safe_snippet = html.escape(shown, quote=True)
        js_para_id = self._js_single_quoted(para_id)

        return f"""
        <div class="citation-card" onclick="scrollToParagraph({js_para_id})">
            <div class="citation-header">
                <strong>📄 {safe_name}{page_info}</strong>
            </div>
            <div class="citation-snippet">
                "{safe_snippet}"
            </div>
        </div>
        """

    def format_sources_with_links(self, sources: List[Dict]) -> tuple[str, List[str]]:
        """Format sources as interactive citation cards.

        Each source is read through the keys ``filename``, ``chunk_text``,
        ``paragraph_ids`` and ``pages``; the last two may be lists or
        JSON-encoded lists.

        Args:
            sources: List of source metadata from the RAG pipeline.

        Returns:
            Tuple of (HTML string, unique paragraph IDs to highlight in
            first-seen order). ``("", [])`` when ``sources`` is empty.
        """
        if not sources:
            return "", []

        all_paragraph_ids = []
        parts = [
            "<div class='sources-container'>",
            "<h4 class='sources-title'>📚 Sumber Referensi:</h4>",
        ]

        for source in sources:
            paragraph_ids = self._parse_json_list(source.get('paragraph_ids', '[]'))
            pages = self._parse_json_list(source.get('pages', '[]'))

            all_paragraph_ids.extend(paragraph_ids)
            parts.append(
                self.create_citation_link(
                    filename=source.get('filename', 'Unknown'),
                    paragraph_ids=paragraph_ids,
                    snippet=source.get('chunk_text', ''),
                    page=pages[0] if pages else None,
                )
            )

        parts.append("</div>")
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return "".join(parts), list(dict.fromkeys(all_paragraph_ids))

    def create_document_selector(
        self,
        documents: List[Dict],
        current_doc: Optional[str] = None,
    ) -> str:
        """Create a drop-down selector for documents.

        Args:
            documents: List of document metadata; each entry is read through
                the keys ``filename`` and ``num_pages``.
            current_doc: Currently selected document filename.

        Returns:
            HTML for the selector, or a short message when ``documents`` is
            empty.
        """
        if not documents:
            return "<p class='no-docs-message'>Belum ada dokumen yang tersedia</p>"

        parts = [
            """
        <div class="document-selector">
            <label for="doc-select">Pilih Dokumen:</label>
            <select id="doc-select" class="doc-select-dropdown">
        """
        ]

        for doc in documents:
            filename = doc.get('filename', 'Unknown')
            selected = 'selected' if filename == current_doc else ''
            safe_name = html.escape(str(filename), quote=True)
            safe_pages = html.escape(str(doc.get('num_pages', 0)), quote=True)
            parts.append(
                f"""
                <option value="{safe_name}" {selected}>
                    {safe_name} ({safe_pages} hal.)
                </option>
        """
            )

        parts.append(
            """
            </select>
        </div>
        """
        )
        return "".join(parts)