# Source: app.py, revision 3beb450 ("Update app.py"), uploaded by mmrech
import html
import os
import sys
import uuid

import gradio as gr
# Import local backend and utils modules
# Make sure your backend.PDF class and utils.PDFProcessor, AnthropicCitationsAPI
# can handle or be extended for chunk-based processing.
from backend import PDF
from utils import AnthropicCitationsAPI, PDFProcessor
# Read the Anthropic API key once at import time. A missing key is not fatal
# here: the module keeps loading and only prints a two-line warning.
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
    print(
        "Warning: ANTHROPIC_API_KEY not found in environment variables.",
        "This app requires an API key to function properly.",
        sep="\n",
    )
# ------------------------------------------------------------------
# 1) Example of a more robust PDF Processor with chunk-based extraction
# ------------------------------------------------------------------
class ChunkedPDFProcessor:
    """
    Extract the text of a PDF and split it into overlapping character chunks.

    Working on chunks keeps each API request small, which helps avoid
    timeouts or extremely large single requests on big documents.
    """

    def __init__(self, pdf_path, chunk_size=1000, overlap=100):
        """
        Extract the PDF text and pre-compute its chunks.

        :param pdf_path: Path to the PDF file
        :param chunk_size: Number of characters per chunk (must be positive)
        :param overlap: Number of characters shared by consecutive chunks
        :raises ValueError: if ``chunk_size`` is not positive or ``overlap``
            is negative
        """
        self.pdf_path = pdf_path
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.text = self._extract_text_from_pdf()
        self.chunks = self._split_into_chunks(self.text, self.chunk_size, self.overlap)

    def _extract_text_from_pdf(self):
        """
        Return the full text of the PDF via the project's ``PDFProcessor``.

        A falsy result (e.g. ``None`` or ``""``) is normalised to ``""`` so
        that chunking yields an empty list instead of failing in ``len()``.
        """
        # NOTE(review): assumes PDFProcessor.extract_text() returns the whole
        # document as a single string -- confirm against utils.PDFProcessor.
        processor = PDFProcessor(self.pdf_path)
        return processor.extract_text() or ""

    def _split_into_chunks(self, text, chunk_size, overlap):
        """
        Split ``text`` into chunks of at most ``chunk_size`` characters.

        Each chunk starts ``chunk_size - overlap`` characters after the
        previous one, so consecutive chunks share ``overlap`` characters.
        This helps the model keep context across chunk boundaries.

        :param text: Text to split; empty text yields an empty list
        :param chunk_size: Maximum number of characters per chunk
        :param overlap: Characters repeated at the start of the next chunk
        :return: List of chunk strings in document order
        :raises ValueError: if ``chunk_size`` is not positive or ``overlap``
            is negative
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap < 0:
            raise ValueError("overlap must not be negative")
        if not text:
            return []

        # An overlap that swallows the whole chunk would never advance;
        # fall back to back-to-back chunks without overlap in that case.
        step = chunk_size - overlap
        if step <= 0:
            step = chunk_size

        total = len(text)
        chunks = []
        start = 0
        while start < total:
            end = min(start + chunk_size, total)
            chunks.append(text[start:end])
            if end == total:
                # The tail is covered; another step would only emit a
                # fragment already contained in this chunk.
                break
            start += step
        return chunks

    def get_chunks(self):
        """Return the list of chunks computed at construction time."""
        return self.chunks
# ------------------------------------------------------------------
# 2) Updated Citation Demo for chunk-based processing
# ------------------------------------------------------------------
class CitationDemo:
    """
    A demonstration of the Citation Interpreter functionality, with:
    - Chunk-based PDF text extraction to avoid timeouts on large PDFs
    - Side-by-side PDF preview and JS-based citation highlighting
    """

    # Attribute prefix that marks a citation id in the HTML produced by
    # ``self.api.extract_citations``.
    _CITATION_ID_ATTR = 'data-citation-id="'

    # Script appended to every analysis result: clicking a ``.citation``
    # element highlights it and scrolls to the matching ``citation-<id>``
    # list item. Each element is flagged once bound so that the
    # MutationObserver does not stack duplicate click handlers on it.
    # NOTE(review): scripts injected through gr.HTML may not be executed by
    # the browser -- confirm in the deployed Gradio version.
    _CITATION_SCRIPT = """
<script>
(function() {
  function setupCitationInteractions() {
    document.querySelectorAll('.citation').forEach(citation => {
      // Bind each citation only once, however often the DOM changes
      if (citation.dataset.citationBound) { return; }
      citation.dataset.citationBound = 'true';
      citation.addEventListener('click', function() {
        const citationId = this.getAttribute('data-citation-id');
        const sourceElement = document.getElementById(`citation-${citationId}`);
        // Remove existing highlights
        document.querySelectorAll('.citation').forEach(c => {
          c.classList.remove('selected-citation');
        });
        // Highlight the clicked citation
        this.classList.add('selected-citation');
        if (sourceElement) {
          sourceElement.style.backgroundColor = '#ffff99';
          sourceElement.scrollIntoView({ behavior: 'smooth', block: 'center' });
          setTimeout(() => {
            sourceElement.style.backgroundColor = '';
          }, 2000);
        }
      });
    });
  }
  // Observe DOM changes to keep citations interactive
  const observer = new MutationObserver(function() {
    setupCitationInteractions();
  });
  observer.observe(document.body, { childList: true, subtree: true });
  // Initial setup
  setupCitationInteractions();
})();
</script>
"""

    def __init__(self):
        """Create the API client if an API key is available; otherwise warn."""
        self.api = None
        self.pdf_processor = None
        if api_key:
            try:
                self.api = AnthropicCitationsAPI(api_key)
            except Exception as e:
                # Keep the UI usable; analyze_pdf reports the missing client.
                print(f"Error initializing Anthropic API: {e}")
        else:
            print("No API key found. PDF analysis might not work properly.")

    def _api_key_configured_html(self) -> str:
        """Return the HTML notice shown when no API client is configured."""
        return (
            "\n"
            '<div style="color: red; padding: 20px; border: 1px solid red; border-radius: 5px;">\n'
            "<h3>API Key Not Configured</h3>\n"
            "<p>This app requires an Anthropic API key to function properly.\n"
            "Please set the ANTHROPIC_API_KEY environment variable.</p>\n"
            "</div>\n"
        )

    def _no_pdf_uploaded_html(self) -> str:
        """Return the HTML notice shown when no PDF has been provided."""
        return (
            "\n"
            '<div style="color: orange; padding: 20px; border: 1px solid orange; border-radius: 5px;">\n'
            "<h3>No PDF Uploaded</h3>\n"
            "<p>Please upload a PDF document to analyze.</p>\n"
            "</div>\n"
        )

    @staticmethod
    def _resolve_pdf_path(pdf_file):
        """
        Normalise the value delivered by the upload component to a path.

        Accepts a plain path (``str`` / ``os.PathLike``), a dict carrying the
        path under ``'name'`` (or ``'path'``), or an object exposing a
        ``name`` attribute. Anything else is returned unchanged.
        """
        if isinstance(pdf_file, (str, os.PathLike)):
            return os.fspath(pdf_file)
        if isinstance(pdf_file, dict):
            # NOTE(review): 'path' is accepted as a fallback key on the
            # assumption that newer upload payloads use it -- confirm.
            return pdf_file.get("name") or pdf_file.get("path")
        return getattr(pdf_file, "name", pdf_file)

    @staticmethod
    def _reindex_citations(html_chunk, sources, next_id):
        """
        Renumber one chunk's citation ids into the document-wide sequence.

        Every distinct chunk-local id receives one global id, and the same
        mapping is applied to the ``data-citation-id`` attributes and to the
        keys of ``sources``, so a citation and its source share an id.

        :param html_chunk: HTML for one chunk
        :param sources: Mapping of chunk-local citation id -> source text
        :param next_id: First unused global citation id
        :return: ``(rewritten_html, {global_id: source}, next_unused_id)``
        """
        marker = CitationDemo._CITATION_ID_ATTR
        id_map = {}

        def global_id(local_id):
            nonlocal next_id
            # Ids are compared as strings so that an int source key matches
            # the textual id found in the HTML attribute.
            # NOTE(review): assumes source keys and HTML ids denote the same
            # value -- verify against AnthropicCitationsAPI.extract_citations.
            key = str(local_id)
            if key not in id_map:
                id_map[key] = next_id
                next_id += 1
            return id_map[key]

        head, *tails = html_chunk.split(marker)
        pieces = [head]
        for tail in tails:
            local_id, quote, rest = tail.partition('"')
            if quote:
                pieces.append(f'{marker}{global_id(local_id)}"{rest}')
            else:
                # Unterminated attribute: keep the text untouched rather than
                # guessing where the id ends.
                pieces.append(marker + tail)

        # Sources that are never cited in the HTML still get a fresh id.
        new_sources = {global_id(key): value for key, value in sources.items()}
        return "".join(pieces), new_sources, next_id

    def analyze_pdf(self, pdf_path, prompt="Analyze this document and provide key insights with citations."):
        """
        Analyze a PDF chunk by chunk and return the combined HTML report.

        1) Loads the PDF in chunks.
        2) For each chunk, calls the API to process that text.
        3) Aggregates results and citations, renumbering citation ids so they
           are unique across the whole document.
        4) Returns the final HTML with the sources list and citation script.

        :param pdf_path: Uploaded PDF (path, or upload payload holding a path)
        :param prompt: Instruction prepended to every chunk prompt
        :return: HTML string; failures are reported as an HTML error box
            instead of being raised
        """
        # Check if API is configured
        if not self.api:
            return self._api_key_configured_html()
        # Check if PDF was uploaded
        if not pdf_path:
            return self._no_pdf_uploaded_html()
        resolved_path = self._resolve_pdf_path(pdf_path)
        if not resolved_path:
            return self._no_pdf_uploaded_html()

        try:
            chunked_processor = ChunkedPDFProcessor(resolved_path, chunk_size=1500, overlap=200)
            all_chunks = chunked_processor.get_chunks()
            if not all_chunks:
                return "<p>No text could be extracted from the PDF.</p>"

            html_parts = []
            combined_sources = {}
            next_id = 1
            for chunk_number, chunk_text in enumerate(all_chunks, start=1):
                chunk_prompt = (
                    f"{prompt}\n\n"
                    f"Below is chunk {chunk_number} of the document text:\n"
                    f"---\n{chunk_text}\n---\n"
                    "Please analyze and provide any important citations and references to this chunk."
                )
                response = self.api.process_text_with_citations(chunk_prompt)
                processed = self.api.extract_citations(response)
                chunk_html, chunk_sources, next_id = self._reindex_citations(
                    processed.get("html") or "",
                    processed.get("sources") or {},
                    next_id,
                )
                combined_sources.update(chunk_sources)
                html_parts.append(chunk_html + "<br><br>")

            body_html = "".join(html_parts)
            # Sorted by global id so the <ol> numbering matches the ids.
            # NOTE(review): source text is inserted as-is because it may
            # already contain markup produced by the API helper -- confirm
            # whether it needs escaping.
            source_items = "".join(
                f"<li id='citation-{key}'>{source_text}</li>"
                for key, source_text in sorted(combined_sources.items())
            )
            return (
                f"<div>{body_html}</div>"
                "<div class='citation-sources'><h3>Sources</h3><ol>"
                f"{source_items}"
                "</ol></div>"
                f"{self._CITATION_SCRIPT}"
            )
        except Exception as e:
            # UI boundary: show the failure to the user instead of raising.
            # The message is escaped so it cannot inject markup.
            return (
                "\n"
                '<div style="color: red; padding: 20px; border: 1px solid red; border-radius: 5px;">\n'
                "<h3>Error During Analysis</h3>\n"
                f"<p>{html.escape(str(e))}</p>\n"
                "</div>\n"
            )

    def embed_pdf_preview(self, pdf_file):
        """
        Generate an <iframe> embed for the uploaded PDF.

        Depending on your environment and security settings, you might
        need a different approach (e.g., hosting the file via a small server).

        :param pdf_file: Uploaded PDF (path, or upload payload holding a path)
        :return: HTML string with the iframe, or a placeholder paragraph when
            nothing has been selected
        """
        if not pdf_file:
            return "<p>No PDF selected yet.</p>"
        file_path = self._resolve_pdf_path(pdf_file)
        if not file_path:
            return "<p>No PDF selected yet.</p>"

        iframe_id = f"pdfview-{uuid.uuid4().hex}"
        # Escape the path so quotes in a file name cannot break out of src="".
        src = html.escape(str(file_path), quote=True)
        # Attempt an embed (local files may be blocked by certain browsers).
        # If blank, consider hosting the file or using a data URI approach.
        return (
            "\n"
            f'<iframe id="{iframe_id}" src="{src}" width="100%" height="600"\n'
            'style="border: 1px solid #ccc;">\n'
            "</iframe>\n"
        )
# ------------------------------------------------------------------
# 3) Custom CSS
# ------------------------------------------------------------------
# Stylesheet handed to gr.Blocks(css=...). It styles the `.citation` spans,
# the `.citation-sources` list and the `.selected-citation` class that the
# injected click-handler script adds to the citation the user clicked.
custom_css = """
.citation {
background-color: rgba(255, 255, 0, 0.2);
border-bottom: 1px dotted #888;
cursor: pointer;
position: relative;
}
.citation:hover {
background-color: rgba(255, 255, 0, 0.4);
}
.citation sup {
color: #0066cc;
font-weight: bold;
}
.citation-sources {
margin-top: 20px;
padding: 10px;
background-color: #f8f8f8;
border-radius: 5px;
border: 1px solid #ddd;
}
.citation-sources h3 {
margin-top: 0;
}
.citation-sources ol {
padding-left: 20px;
}
.citation-sources li {
margin-bottom: 8px;
}
.selected-citation {
background-color: #ffff99 !important;
box-shadow: 0 0 5px rgba(0,0,0,0.3);
}
"""
# ------------------------------------------------------------------
# 4) Build the Gradio UI
# ------------------------------------------------------------------
# Build the two-column UI: upload and options on the left, PDF preview and
# analysis output on the right. Event wiring happens after the layout so that
# every component exists before it is referenced.
with gr.Blocks(title="Citation Interpreter (Enhanced)", css=custom_css) as demo:
    gr.Markdown("# Enhanced Citation Interpreter")
    gr.Markdown("""
**Features**:
1. **Chunk-Based PDF Extraction** for large or complex PDFs (reduces risk of timeouts).
2. **Side-by-Side PDF Preview** with an embedded viewer.
3. **Interactive Citations** that highlight source references on click.
""")

    # Instantiate the demo class
    citation_demo = CitationDemo()

    def update_pdf_preview(pdf_file):
        """Return the <iframe> HTML previewing the uploaded PDF."""
        return citation_demo.embed_pdf_preview(pdf_file)

    with gr.Row():
        # Left column: upload, options and the trigger button
        with gr.Column(scale=1):
            pdf_input = PDF(label="Upload PDF", height=150)
            # Optional advanced settings
            with gr.Accordion("Advanced Options", open=False):
                prompt_input = gr.Textbox(
                    label="Analysis Prompt",
                    placeholder="Analyze this document and provide key insights with citations.",
                    value="Analyze this document and provide key insights with citations.",
                )
            analyze_btn = gr.Button("Analyze Document", variant="primary")

        # Right column: PDF preview + analysis
        with gr.Column(scale=1):
            gr.Markdown("### PDF Preview & Analysis Results")
            with gr.Group():
                # The preview is constructed here so that it is laid out in
                # this column; merely naming an existing component in a
                # layout context does not place it there.
                pdf_preview_html = gr.HTML(label="PDF Preview")
                results_html = gr.HTML(label="Analysis Output")

    # Show an immediate preview upon file upload
    pdf_input.change(
        fn=update_pdf_preview,
        inputs=pdf_input,
        outputs=pdf_preview_html,
    )
    # Wire the "Analyze" button to the chunk-based PDF analysis
    analyze_btn.click(
        fn=citation_demo.analyze_pdf,
        inputs=[pdf_input, prompt_input],
        outputs=[results_html],
    )

    gr.Markdown("""
### Additional Notes
- **Chunk-Based Approach**: Each PDF is split into overlapping segments of text;
we pass each chunk to the Anthropic API to reduce the chance of timeouts on large documents.
- **Side-by-Side Preview**: The embedded PDF viewer may not work for local files
on all browsers due to security restrictions.
- **Citation Highlighting**: Click on any citation to scroll to the source reference
in the "Sources" section, briefly highlighted in yellow.
""")

if __name__ == "__main__":
    demo.launch()