Spaces:
Sleeping
Sleeping
| import os | |
| import sys | |
| import uuid | |
| import gradio as gr | |
| # Import local backend and utils modules | |
| # Make sure your backend.PDF class and utils.PDFProcessor, AnthropicCitationsAPI | |
| # can handle or be extended for chunk-based processing. | |
| from backend import PDF | |
| from utils import AnthropicCitationsAPI, PDFProcessor | |
# Resolve the Anthropic credential once, at import time.
api_key = os.getenv("ANTHROPIC_API_KEY")
if not api_key:
    # Startup continues without a key; the notices below go to stdout.
    for notice in (
        "Warning: ANTHROPIC_API_KEY not found in environment variables.",
        "This app requires an API key to function properly.",
    ):
        print(notice)
| # ------------------------------------------------------------------ | |
| # 1) Example of a more robust PDF Processor with chunk-based extraction | |
| # ------------------------------------------------------------------ | |
class ChunkedPDFProcessor:
    """
    Demonstrates a chunk-based approach to extracting text from a PDF
    and splitting it into manageable segments. This helps avoid timeouts
    or extremely large single requests to the API.
    """

    def __init__(self, pdf_path, chunk_size=1000, overlap=100):
        """
        Extract the text of the PDF and pre-compute its chunks.

        :param pdf_path: Path to the PDF file
        :param chunk_size: Number of characters per chunk
        :param overlap: Number of characters shared by consecutive chunks
        :raises ValueError: If ``chunk_size`` is not positive
        """
        self.pdf_path = pdf_path
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.text = self._extract_text_from_pdf()
        self.chunks = self._split_into_chunks(self.text, self.chunk_size, self.overlap)

    def _extract_text_from_pdf(self):
        """
        Return the full PDF text as produced by the local ``PDFProcessor``.

        The whole document is extracted in one call; splitting happens
        afterwards in ``_split_into_chunks``.
        """
        processor = PDFProcessor(self.pdf_path)
        return processor.extract_text()

    def _split_into_chunks(self, text, chunk_size, overlap):
        """
        Split ``text`` into chunks of at most ``chunk_size`` characters.

        Consecutive chunks share ``overlap`` characters so the model keeps
        some context across chunk boundaries.

        :param text: Text to split; ``None`` or ``""`` yields an empty list
        :param chunk_size: Maximum number of characters per chunk (> 0)
        :param overlap: Characters repeated at the start of the next chunk.
            Values outside ``0 <= overlap < chunk_size`` cannot produce a
            forward step, so they fall back to non-overlapping chunks.
        :return: List of chunk strings, in document order
        :raises ValueError: If ``chunk_size`` is not positive
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not text:
            return []

        # Distance between the starts of two consecutive chunks.
        step = chunk_size - overlap
        if not 0 < step <= chunk_size:
            step = chunk_size

        chunks = []
        start = 0
        text_length = len(text)
        while start < text_length:
            end = start + chunk_size
            chunks.append(text[start:end])
            # Stop once the tail is covered; otherwise the next chunk would
            # contain nothing but text already present in this one.
            if end >= text_length:
                break
            start += step
        return chunks

    def get_chunks(self):
        """Return the list of chunks computed at construction time."""
        return self.chunks
| # ------------------------------------------------------------------ | |
| # 2) Updated Citation Demo for chunk-based processing | |
| # ------------------------------------------------------------------ | |
class CitationDemo:
    """
    A demonstration of the Citation Interpreter functionality, updated with:
    - Chunk-based PDF text extraction to avoid timeouts on large PDFs
    - Side-by-side PDF preview and enhanced JS-based text highlighting
    """

    def __init__(self):
        """
        Create the Anthropic citations client when an API key is configured.

        ``self.api`` stays ``None`` when the module-level ``api_key`` is empty
        or the client constructor raises; ``analyze_pdf`` then returns a
        notice instead of calling the API.
        """
        self.api = None
        self.pdf_processor = None
        if api_key:
            try:
                self.api = AnthropicCitationsAPI(api_key)
            except Exception as e:
                print(f"Error initializing Anthropic API: {e}")
        else:
            print("No API key found. PDF analysis might not work properly.")

    def _api_key_configured_html(self) -> str:
        """Return the notice shown when no API client is available."""
        return """
        <div style="color: red; padding: 20px; border: 1px solid red; border-radius: 5px;">
        <h3>API Key Not Configured</h3>
        <p>This app requires an Anthropic API key to function properly.
        Please set the ANTHROPIC_API_KEY environment variable.</p>
        </div>
        """

    def _no_pdf_uploaded_html(self) -> str:
        """Return the notice shown when analysis is requested without a PDF."""
        return """
        <div style="color: orange; padding: 20px; border: 1px solid orange; border-radius: 5px;">
        <h3>No PDF Uploaded</h3>
        <p>Please upload a PDF document to analyze.</p>
        </div>
        """

    @staticmethod
    def _reindex_citations(html_chunk, sources, next_id):
        """
        Renumber one chunk's citation ids into the document-wide sequence.

        The same old-id -> new-id mapping is applied to the
        ``data-citation-id`` attributes in the HTML and to the keys of
        ``sources``, so a citation and its source always share one id.

        :param html_chunk: HTML produced for a single chunk
        :param sources: Mapping of chunk-local citation id -> source text
        :param next_id: First unused document-wide citation id
        :return: ``(rewritten_html, {new_id: source}, next_unused_id)``
        """
        import re  # function-scope import: only this helper needs it

        id_map = {}

        def _new_id(old_id):
            nonlocal next_id
            if old_id not in id_map:
                id_map[old_id] = next_id
                next_id += 1
            return id_map[old_id]

        def _swap(match):
            return f'{match.group(1)}{_new_id(match.group(2))}"'

        # Only well-formed attributes (with a closing quote) are rewritten;
        # anything else is left untouched instead of being re-scanned.
        rewritten = re.sub(r'(data-citation-id=")([^"]*)"', _swap, html_chunk or "")

        renumbered = {}
        for old_key, source in (sources or {}).items():
            # Source keys may be ints or strings; HTML ids are always strings.
            renumbered[_new_id(str(old_key))] = source
        return rewritten, renumbered, next_id

    def _citation_script_html(self) -> str:
        """
        Return the <script> block that makes citations clickable.

        NOTE(review): depending on the Gradio version, <script> tags injected
        through gr.HTML may not be executed by the browser -- confirm in the
        target environment.
        """
        return """
        <script>
        (function() {
            function setupCitationInteractions() {
                document.querySelectorAll('.citation').forEach(citation => {
                    // Bind each citation only once; the observer below re-runs
                    // this function on every DOM change.
                    if (citation.dataset.citationBound) { return; }
                    citation.dataset.citationBound = 'true';
                    citation.addEventListener('click', function() {
                        const citationId = this.getAttribute('data-citation-id');
                        const sourceElement = document.getElementById(`citation-${citationId}`);
                        // Remove existing highlights
                        document.querySelectorAll('.citation').forEach(c => {
                            c.classList.remove('selected-citation');
                        });
                        // Highlight the clicked citation
                        this.classList.add('selected-citation');
                        if (sourceElement) {
                            sourceElement.style.backgroundColor = '#ffff99';
                            sourceElement.scrollIntoView({ behavior: 'smooth', block: 'center' });
                            setTimeout(() => {
                                sourceElement.style.backgroundColor = '';
                            }, 2000);
                        }
                    });
                });
            }
            // Observe DOM changes to keep citations interactive
            const observer = new MutationObserver(function() {
                setupCitationInteractions();
            });
            observer.observe(document.body, { childList: true, subtree: true });
            // Initial setup
            setupCitationInteractions();
        })();
        </script>
        """

    def _render_results(self, combined_html, combined_sources) -> str:
        """
        Assemble the analysis HTML, the numbered sources list and the script.

        :param combined_html: Concatenated per-chunk HTML
        :param combined_sources: Mapping of document-wide citation id -> source
        """
        # NOTE(review): source text is inserted as-is because it may already
        # be HTML produced by the citations helper -- confirm, and escape it
        # here if it turns out to be plain text taken from the PDF.
        # ``value`` pins the visible list number to the citation id even when
        # some ids have no source entry.
        items = "".join(
            f"<li id='citation-{key}' value='{key}'>{combined_sources[key]}</li>"
            for key in sorted(combined_sources)
        )
        return (
            f"<div>{combined_html}</div>"
            f"<div class='citation-sources'><h3>Sources</h3><ol>{items}</ol></div>"
            + self._citation_script_html()
        )

    def analyze_pdf(self, pdf_path, prompt="Analyze this document and provide key insights with citations."):
        """
        1) Loads the PDF in chunks.
        2) For each chunk, calls the API to process that text.
        3) Aggregates results & citations into a combined HTML output.
        4) Returns the final HTML with clickable citations and highlights.

        :param pdf_path: Path of the uploaded PDF (falsy when nothing is uploaded)
        :param prompt: Instruction prepended to every chunk request
        :return: HTML string: the analysis, or a notice/error box
        """
        from html import escape  # function-scope import for the error path

        # Check if API is configured
        if not self.api:
            return self._api_key_configured_html()
        # Check if PDF was uploaded
        if not pdf_path:
            return self._no_pdf_uploaded_html()
        try:
            # -------------- Chunked PDF Processing --------------
            chunked_processor = ChunkedPDFProcessor(pdf_path, chunk_size=1500, overlap=200)
            all_chunks = chunked_processor.get_chunks()
            if not all_chunks:
                return "<p>No text could be extracted from the PDF.</p>"

            html_parts = []
            combined_sources = {}
            citation_counter = 1
            # -------------- Process Each Chunk --------------
            for i, chunk_text in enumerate(all_chunks):
                chunk_prompt = (
                    f"{prompt}\n\n"
                    f"Below is chunk {i+1} of the document text:\n"
                    f"---\n{chunk_text}\n---\n"
                    "Please analyze and provide any important citations and references to this chunk."
                )
                response = self.api.process_text_with_citations(chunk_prompt)
                processed = self.api.extract_citations(response)
                # Citation ids restart in every chunk response, so map them
                # (and their sources) onto one document-wide numbering.
                new_html, chunk_sources, citation_counter = self._reindex_citations(
                    processed.get("html", ""),
                    processed.get("sources", {}),
                    citation_counter,
                )
                combined_sources.update(chunk_sources)
                html_parts.append(new_html + "<br><br>")

            # -------------- Build the Final Output --------------
            return self._render_results("".join(html_parts), combined_sources)
        except Exception as e:
            # UI boundary: show the failure instead of crashing the app.
            # The message is escaped because it can echo file or API content.
            return f"""
            <div style="color: red; padding: 20px; border: 1px solid red; border-radius: 5px;">
            <h3>Error During Analysis</h3>
            <p>{escape(str(e))}</p>
            </div>
            """

    def embed_pdf_preview(self, pdf_file):
        """
        Generate an <iframe> or HTML embed for the uploaded PDF side by side.
        Depending on your environment and security settings, you might
        need a different approach (e.g., hosting the file via a small server).

        :param pdf_file: Path string / path-like object, a dict with a
            ``'name'`` entry, or an object exposing ``.name``
        :return: HTML string with the iframe, or a placeholder when empty
        """
        from html import escape  # function-scope import

        if not pdf_file:
            return "<p>No PDF selected yet.</p>"

        # The upload component's value reaches analyze_pdf as a plain path,
        # so accept that form here too, alongside the dict/file-object forms.
        if isinstance(pdf_file, dict):
            file_path = pdf_file["name"]
        elif isinstance(pdf_file, (str, os.PathLike)):
            file_path = os.fspath(pdf_file)
        else:
            file_path = getattr(pdf_file, "name", pdf_file)

        iframe_id = f"pdfview-{uuid.uuid4().hex}"
        # Escape the path so quotes in a file name cannot break the attribute.
        safe_src = escape(str(file_path), quote=True)
        # Attempt an embed (local files may be blocked by certain browsers)
        # If blank, consider hosting the file or using a data URI approach.
        return f"""
        <iframe id="{iframe_id}" src="{safe_src}" width="100%" height="600"
                style="border: 1px solid #ccc;">
        </iframe>
        """
| # ------------------------------------------------------------------ | |
| # 3) Custom CSS | |
| # ------------------------------------------------------------------ | |
# Stylesheet handed to gr.Blocks(css=...) when the UI is built.
# It targets the class names used by CitationDemo.analyze_pdf's output:
#   .citation           - elements the injected script attaches click handlers to
#   .citation sup       - superscript markers inside a citation (if present)
#   .citation-sources   - the "Sources" wrapper and its ordered list
#   .selected-citation  - class the injected script adds to the clicked citation
custom_css = """
.citation {
    background-color: rgba(255, 255, 0, 0.2);
    border-bottom: 1px dotted #888;
    cursor: pointer;
    position: relative;
}
.citation:hover {
    background-color: rgba(255, 255, 0, 0.4);
}
.citation sup {
    color: #0066cc;
    font-weight: bold;
}
.citation-sources {
    margin-top: 20px;
    padding: 10px;
    background-color: #f8f8f8;
    border-radius: 5px;
    border: 1px solid #ddd;
}
.citation-sources h3 {
    margin-top: 0;
}
.citation-sources ol {
    padding-left: 20px;
}
.citation-sources li {
    margin-bottom: 8px;
}
.selected-citation {
    background-color: #ffff99 !important;
    box-shadow: 0 0 5px rgba(0,0,0,0.3);
}
"""
| # ------------------------------------------------------------------ | |
| # 4) Build the Gradio UI | |
| # ------------------------------------------------------------------ | |
# Build the page: upload controls on the left, PDF preview and analysis
# results on the right. Components render where they are *created*, so each
# one is instantiated inside the layout container it belongs to, and event
# handlers are wired afterwards, once every component exists.
with gr.Blocks(title="Citation Interpreter (Enhanced)", css=custom_css) as demo:
    gr.Markdown("# Enhanced Citation Interpreter")
    gr.Markdown("""
**Features**:
1. **Chunk-Based PDF Extraction** for large or complex PDFs (reduces risk of timeouts).
2. **Side-by-Side PDF Preview** with an embedded viewer.
3. **Interactive Citations** that highlight source references on click.
""")

    # Instantiate the demo class
    citation_demo = CitationDemo()

    # Function that returns <iframe> HTML for PDF preview
    def update_pdf_preview(pdf_file):
        return citation_demo.embed_pdf_preview(pdf_file)

    with gr.Row():
        # Left column: upload + options
        with gr.Column(scale=1):
            # Upload PDF widget
            pdf_input = PDF(label="Upload PDF", height=150)
            # Optional advanced settings
            with gr.Accordion("Advanced Options", open=False):
                prompt_input = gr.Textbox(
                    label="Analysis Prompt",
                    placeholder="Analyze this document and provide key insights with citations.",
                    value="Analyze this document and provide key insights with citations.",
                )
            analyze_btn = gr.Button("Analyze Document", variant="primary")

        # Right column: PDF preview + analysis
        with gr.Column(scale=1):
            gr.Markdown("### PDF Preview & Analysis Results")
            with gr.Group():
                # Created here (not in the left column) so the preview is
                # actually rendered next to the results; merely referencing
                # an existing component does not move it.
                pdf_preview_html = gr.HTML(label="PDF Preview")
                results_html = gr.HTML(label="Analysis Output")

    # Show an immediate preview upon file upload
    pdf_input.change(
        fn=update_pdf_preview,
        inputs=pdf_input,
        outputs=pdf_preview_html,
    )

    # Wire the "Analyze" button to the chunk-based PDF analysis
    analyze_btn.click(
        fn=citation_demo.analyze_pdf,
        inputs=[pdf_input, prompt_input],
        outputs=[results_html],
    )

    gr.Markdown("""
### Additional Notes
- **Chunk-Based Approach**: Each PDF is split into overlapping segments of text;
  we pass each chunk to the Anthropic API to reduce the chance of timeouts on large documents.
- **Side-by-Side Preview**: The embedded PDF viewer may not work for local files
  on all browsers due to security restrictions.
- **Citation Highlighting**: Click on any citation to scroll to the source reference
  in the "Sources" section, briefly highlighted in yellow.
""")

if __name__ == "__main__":
    demo.launch()