Spaces:

mmrech
/

citation-interpreter

Sleeping

App Files Files Community

citation-interpreter / app.py

mmrech

Update app.py

3beb450 verified about 1 year ago

raw

history blame contribute delete

14.5 kB

	import html
	import os
	import sys
	import uuid

	import gradio as gr

	# Import local backend and utils modules
	# Make sure your backend.PDF class and utils.PDFProcessor, AnthropicCitationsAPI
	# can handle or be extended for chunk-based processing.
	from backend import PDF
	from utils import AnthropicCitationsAPI, PDFProcessor

	# Read the Anthropic key once at import time; warn (but keep going) when it
	# is missing so the UI can still start and show a helpful message.
	api_key = os.getenv("ANTHROPIC_API_KEY")
	if not api_key:
	    for _warning_line in (
	        "Warning: ANTHROPIC_API_KEY not found in environment variables.",
	        "This app requires an API key to function properly.",
	    ):
	        print(_warning_line)

	# ------------------------------------------------------------------
	# 1) Example of a more robust PDF Processor with chunk-based extraction
	# ------------------------------------------------------------------
	class ChunkedPDFProcessor:
	    """
	    Extract the text of a PDF and split it into overlapping character chunks.

	    Splitting keeps each request to the API small, which helps avoid
	    timeouts or extremely large single requests on big documents.
	    """

	    def __init__(self, pdf_path, chunk_size=1000, overlap=100):
	        """
	        Extract the PDF text and pre-compute its chunks.

	        :param pdf_path: Path to the PDF file
	        :param chunk_size: Maximum number of characters per chunk
	        :param overlap: Number of characters shared by consecutive chunks
	        :raises ValueError: If ``chunk_size`` is not positive
	        """
	        self.pdf_path = pdf_path
	        self.chunk_size = chunk_size
	        self.overlap = overlap
	        self.text = self._extract_text_from_pdf()
	        self.chunks = self._split_into_chunks(self.text, self.chunk_size, self.overlap)

	    def _extract_text_from_pdf(self):
	        """
	        Return whatever ``PDFProcessor(self.pdf_path).extract_text()`` yields.

	        NOTE(review): this assumes ``extract_text`` returns the whole document
	        as one string -- confirm against ``utils.PDFProcessor``.
	        """
	        processor = PDFProcessor(self.pdf_path)
	        return processor.extract_text()

	    def _split_into_chunks(self, text, chunk_size, overlap):
	        """
	        Split ``text`` into chunks of at most ``chunk_size`` characters.

	        Consecutive chunks share ``overlap`` characters so context survives
	        chunk boundaries. The final chunk ends exactly at the end of the text;
	        no extra chunk consisting only of the overlapping tail is produced.

	        :param text: Text to split; ``None`` or ``""`` yields an empty list
	        :param chunk_size: Maximum number of characters per chunk
	        :param overlap: Characters shared by consecutive chunks. A negative
	            value, or one that is not smaller than ``chunk_size``, cannot be
	            honoured and is treated as no overlap.
	        :return: List of chunk strings in document order
	        :raises ValueError: If ``chunk_size`` is not positive
	        """
	        if chunk_size <= 0:
	            raise ValueError("chunk_size must be a positive number of characters")
	        if not text:
	            return []

	        # Distance between the starts of consecutive chunks. It must be
	        # positive or the scan would never advance through the text.
	        step = chunk_size - overlap
	        if overlap < 0 or step <= 0:
	            step = chunk_size

	        text_length = len(text)
	        chunks = []
	        for start in range(0, text_length, step):
	            end = start + chunk_size
	            chunks.append(text[start:end])
	            if end >= text_length:
	                # This chunk already reaches the end of the text.
	                break
	        return chunks

	    def get_chunks(self):
	        """Return the list of chunks computed at construction time."""
	        return self.chunks


	# ------------------------------------------------------------------
	# 2) Updated Citation Demo for chunk-based processing
	# ------------------------------------------------------------------
	class CitationDemo:
	    """
	    A demonstration of the Citation Interpreter functionality:
	    - Chunk-based PDF text extraction to avoid timeouts on large PDFs
	    - Side-by-side PDF preview and JS-based citation highlighting
	    """

	    # Opening of the HTML attribute whose value is renumbered per document.
	    _CITATION_ID_ATTR = 'data-citation-id="'

	    # Click handling for citations. Each citation element is bound only once
	    # (tracked through a data attribute) because the MutationObserver re-runs
	    # the setup on every DOM change and would otherwise stack listeners.
	    # NOTE(review): browsers generally do not execute <script> tags inserted
	    # through innerHTML -- confirm this script actually runs inside the
	    # deployed Gradio HTML component.
	    _CITATION_SCRIPT = """
	    <script>
	    (function() {
	        function setupCitationInteractions() {
	            document.querySelectorAll('.citation').forEach(citation => {
	                if (citation.dataset.citationBound) {
	                    return;
	                }
	                citation.dataset.citationBound = 'true';

	                citation.addEventListener('click', function() {
	                    const citationId = this.getAttribute('data-citation-id');
	                    const sourceElement = document.getElementById(`citation-${citationId}`);

	                    // Remove existing highlights
	                    document.querySelectorAll('.citation').forEach(c => {
	                        c.classList.remove('selected-citation');
	                    });

	                    // Highlight the clicked citation
	                    this.classList.add('selected-citation');

	                    if (sourceElement) {
	                        sourceElement.style.backgroundColor = '#ffff99';
	                        sourceElement.scrollIntoView({ behavior: 'smooth', block: 'center' });

	                        setTimeout(() => {
	                            sourceElement.style.backgroundColor = '';
	                        }, 2000);
	                    }
	                });
	            });
	        }

	        // Observe DOM changes to keep citations interactive
	        const observer = new MutationObserver(function() {
	            setupCitationInteractions();
	        });
	        observer.observe(document.body, { childList: true, subtree: true });

	        // Initial setup
	        setupCitationInteractions();
	    })();
	    </script>
	    """

	    def __init__(self):
	        """
	        Create the API client from the module-level ``api_key`` when present.

	        ``self.api`` stays ``None`` when no key is configured or when the
	        client constructor raises; ``analyze_pdf`` checks for that.
	        """
	        self.api = None
	        self.pdf_processor = None

	        if api_key:
	            try:
	                self.api = AnthropicCitationsAPI(api_key)
	            except Exception as e:
	                print(f"Error initializing Anthropic API: {e}")
	        else:
	            print("No API key found. PDF analysis might not work properly.")

	    def _api_key_configured_html(self) -> str:
	        """Return the HTML notice shown when no API client is available."""
	        return """
	        <div style="color: red; padding: 20px; border: 1px solid red; border-radius: 5px;">
	            <h3>API Key Not Configured</h3>
	            <p>This app requires an Anthropic API key to function properly.
	            Please set the ANTHROPIC_API_KEY environment variable.</p>
	        </div>
	        """

	    def _no_pdf_uploaded_html(self) -> str:
	        """Return the HTML notice shown when no PDF has been provided."""
	        return """
	        <div style="color: orange; padding: 20px; border: 1px solid orange; border-radius: 5px;">
	            <h3>No PDF Uploaded</h3>
	            <p>Please upload a PDF document to analyze.</p>
	        </div>
	        """

	    @staticmethod
	    def _renumber_citations(html_chunk, sources, next_id):
	        """
	        Give one chunk's citations document-wide ids.

	        Every ``data-citation-id="..."`` value in ``html_chunk`` is replaced
	        by a global id, and each source is stored under the same global id
	        as the citation that refers to it, so a click on a citation can find
	        its ``citation-<id>`` list item.

	        NOTE(review): sources are matched to citations by comparing
	        ``str(source_key)`` with the original attribute value -- confirm this
	        is the scheme ``extract_citations`` uses. Sources with no matching
	        citation still receive a fresh id.

	        :param html_chunk: HTML produced for one chunk
	        :param sources: Mapping of chunk-local citation key to source text
	        :param next_id: First global id that is still unused
	        :return: ``(renumbered_html, {global_id: source}, next_unused_id)``
	        """
	        attr = CitationDemo._CITATION_ID_ATTR
	        id_map = {}
	        pieces = []
	        cursor = 0

	        while True:
	            attr_at = html_chunk.find(attr, cursor)
	            if attr_at == -1:
	                break
	            value_at = attr_at + len(attr)
	            quote_at = html_chunk.find('"', value_at)
	            if quote_at == -1:
	                # Unterminated attribute: keep the remainder untouched
	                # rather than rescanning from the start forever.
	                break

	            local_id = html_chunk[value_at:quote_at]
	            if local_id not in id_map:
	                id_map[local_id] = next_id
	                next_id += 1

	            pieces.append(html_chunk[cursor:value_at])
	            pieces.append(f'{id_map[local_id]}"')
	            cursor = quote_at + 1

	        pieces.append(html_chunk[cursor:])

	        renumbered_sources = {}
	        for source_key, source_text in sources.items():
	            local_id = str(source_key)
	            if local_id not in id_map:
	                id_map[local_id] = next_id
	                next_id += 1
	            renumbered_sources[id_map[local_id]] = source_text

	        return "".join(pieces), renumbered_sources, next_id

	    def _render_results(self, combined_html, combined_sources):
	        """
	        Assemble the analysis HTML, the numbered sources list and the script.

	        ``value`` is set on every list item so the number displayed by the
	        ordered list equals the citation id even when ids have gaps.
	        NOTE(review): source text is inserted as-is, as before; escape it if
	        it is plain text rather than trusted markup.
	        """
	        parts = [
	            f"<div>{combined_html}</div>",
	            "<div class='citation-sources'><h3>Sources</h3><ol>",
	        ]
	        for key in sorted(combined_sources):
	            parts.append(
	                f"<li id='citation-{key}' value='{key}'>{combined_sources[key]}</li>"
	            )
	        parts.append("</ol></div>")
	        parts.append(self._CITATION_SCRIPT)
	        return "".join(parts)

	    def analyze_pdf(self, pdf_path, prompt="Analyze this document and provide key insights with citations."):
	        """
	        Analyze a PDF chunk by chunk and return the combined HTML report.

	        1) Loads the PDF in chunks.
	        2) For each chunk, calls the API to process that text.
	        3) Aggregates results & citations into a combined HTML output.
	        4) Returns the final HTML with clickable citations and highlights.

	        :param pdf_path: Value passed to ``ChunkedPDFProcessor`` as the PDF
	            path; a falsy value yields the "No PDF Uploaded" notice
	        :param prompt: Instruction prepended to every chunk prompt
	        :return: HTML string; failures are reported as an HTML error box
	            instead of being raised
	        """
	        # Check if API is configured
	        if not self.api:
	            return self._api_key_configured_html()

	        # Check if PDF was uploaded
	        if not pdf_path:
	            return self._no_pdf_uploaded_html()

	        try:
	            # -------------- Chunked PDF Processing --------------
	            chunked_processor = ChunkedPDFProcessor(pdf_path, chunk_size=1500, overlap=200)
	            all_chunks = chunked_processor.get_chunks()

	            if not all_chunks:
	                return "<p>No text could be extracted from the PDF.</p>"

	            html_parts = []
	            combined_sources = {}
	            next_citation_id = 1

	            # -------------- Process Each Chunk --------------
	            for i, chunk_text in enumerate(all_chunks):
	                chunk_prompt = (
	                    f"{prompt}\n\n"
	                    f"Below is chunk {i+1} of the document text:\n"
	                    f"---\n{chunk_text}\n---\n"
	                    "Please analyze and provide any important citations and references to this chunk."
	                )

	                response = self.api.process_text_with_citations(chunk_prompt)
	                processed = self.api.extract_citations(response)

	                chunk_html, chunk_sources, next_citation_id = self._renumber_citations(
	                    processed.get("html") or "",
	                    processed.get("sources") or {},
	                    next_citation_id,
	                )
	                combined_sources.update(chunk_sources)
	                html_parts.append(chunk_html + "<br><br>")

	            # -------------- Build the Final Output --------------
	            return self._render_results("".join(html_parts), combined_sources)

	        except Exception as e:
	            # UI boundary: show the failure to the user instead of crashing
	            # the callback. The message is escaped because it is arbitrary
	            # text placed inside HTML.
	            return f"""
	            <div style="color: red; padding: 20px; border: 1px solid red; border-radius: 5px;">
	                <h3>Error During Analysis</h3>
	                <p>{html.escape(str(e))}</p>
	            </div>
	            """

	    def embed_pdf_preview(self, pdf_file):
	        """
	        Return an ``<iframe>`` that points at the uploaded PDF.

	        Depending on your environment and security settings, you might
	        need a different approach (e.g., hosting the file via a small server).

	        :param pdf_file: A path (``str`` / ``os.PathLike``), a dict carrying
	            the path under ``'name'`` (or ``'path'``), or an object with a
	            ``name`` attribute
	        :return: HTML for the preview, or a placeholder paragraph when no
	            file path can be determined
	        """
	        if not pdf_file:
	            return "<p>No PDF selected yet.</p>"

	        # The same upload value is handed to analyze_pdf as a plain path, so
	        # accept a path here as well as the dict form.
	        if isinstance(pdf_file, dict):
	            file_path = pdf_file.get("name") or pdf_file.get("path")
	        elif isinstance(pdf_file, (str, os.PathLike)):
	            file_path = os.fspath(pdf_file)
	        else:
	            file_path = getattr(pdf_file, "name", None)

	        if not file_path:
	            return "<p>No PDF selected yet.</p>"

	        iframe_id = f"pdfview-{uuid.uuid4().hex}"
	        # Escape the path so quotes or angle brackets cannot break out of
	        # the src attribute.
	        safe_src = html.escape(str(file_path), quote=True)

	        # Attempt an embed (local files may be blocked by certain browsers)
	        # If blank, consider hosting the file or using a data URI approach.
	        return f"""
	        <iframe id="{iframe_id}" src="{safe_src}" width="100%" height="600"
	            style="border: 1px solid #ccc;">
	        </iframe>
	        """


	# ------------------------------------------------------------------
	# 3) Custom CSS
	# ------------------------------------------------------------------
	# Stylesheet handed to gr.Blocks(css=...) below.
	# - .citation / .citation:hover / .citation sup: inline citation markers.
	# - .citation-sources (and its h3 / ol / li): the "Sources" list container
	#   that analyze_pdf emits with class 'citation-sources'.
	# - .selected-citation: class the injected click handler adds to the
	#   citation that was clicked.
	custom_css = """
	.citation {
	background-color: rgba(255, 255, 0, 0.2);
	border-bottom: 1px dotted #888;
	cursor: pointer;
	position: relative;
	}
	.citation:hover {
	background-color: rgba(255, 255, 0, 0.4);
	}
	.citation sup {
	color: #0066cc;
	font-weight: bold;
	}
	.citation-sources {
	margin-top: 20px;
	padding: 10px;
	background-color: #f8f8f8;
	border-radius: 5px;
	border: 1px solid #ddd;
	}
	.citation-sources h3 {
	margin-top: 0;
	}
	.citation-sources ol {
	padding-left: 20px;
	}
	.citation-sources li {
	margin-bottom: 8px;
	}
	.selected-citation {
	background-color: #ffff99 !important;
	box-shadow: 0 0 5px rgba(0,0,0,0.3);
	}
	"""

	# ------------------------------------------------------------------
	# 4) Build the Gradio UI
	# ------------------------------------------------------------------
	with gr.Blocks(title="Citation Interpreter (Enhanced)", css=custom_css) as demo:
	    gr.Markdown("# Enhanced Citation Interpreter")
	    gr.Markdown("""
	    Features:
	    1. Chunk-Based PDF Extraction for large or complex PDFs (reduces risk of timeouts).
	    2. Side-by-Side PDF Preview with an embedded viewer.
	    3. Interactive Citations that highlight source references on click.
	    """)

	    # Instantiate the demo class
	    citation_demo = CitationDemo()

	    with gr.Row():
	        # Left column: upload, options and the analyze button
	        with gr.Column(scale=1):
	            # Upload PDF widget
	            pdf_input = PDF(label="Upload PDF", height=150)

	            # Optional advanced settings
	            with gr.Accordion("Advanced Options", open=False):
	                prompt_input = gr.Textbox(
	                    label="Analysis Prompt",
	                    placeholder="Analyze this document and provide key insights with citations.",
	                    value="Analyze this document and provide key insights with citations."
	                )

	            analyze_btn = gr.Button("Analyze Document", variant="primary")

	        # Right column: PDF preview + analysis
	        with gr.Column(scale=1):
	            gr.Markdown("### PDF Preview & Analysis Results")

	            with gr.Group():
	                # A component is rendered in the layout context where it is
	                # created, so the preview has to be instantiated here; a
	                # bare reference to a component made elsewhere places nothing.
	                pdf_preview_html = gr.HTML(label="PDF Preview")

	            results_html = gr.HTML(label="Analysis Output")

	    # Function that returns <iframe> HTML for PDF preview
	    def update_pdf_preview(pdf_file):
	        return citation_demo.embed_pdf_preview(pdf_file)

	    # Show an immediate preview upon file upload. Wired after the layout so
	    # that both the input and the output component already exist.
	    pdf_input.change(
	        fn=update_pdf_preview,
	        inputs=pdf_input,
	        outputs=pdf_preview_html
	    )

	    # Wire the "Analyze" button to the chunk-based PDF analysis
	    analyze_btn.click(
	        fn=citation_demo.analyze_pdf,
	        inputs=[pdf_input, prompt_input],
	        outputs=[results_html]
	    )

	    gr.Markdown("""
	    ### Additional Notes
	    - Chunk-Based Approach: Each PDF is split into overlapping segments of text;
	    we pass each chunk to the Anthropic API to reduce the chance of timeouts on large documents.
	    - Side-by-Side Preview: The embedded PDF viewer may not work for local files
	    on all browsers due to security restrictions.
	    - Citation Highlighting: Click on any citation to scroll to the source reference
	    in the "Sources" section, briefly highlighted in yellow.
	    """)

	if __name__ == "__main__":
	    demo.launch()