Spaces:

mmrech
/

citations_app

Runtime error

App Files Files Community

citations_app / app.py

mmrech

Update app.py

f083ad2 verified 7 months ago

raw

history blame contribute delete

8.69 kB

	# Install dependencies before running (shell command, not Python): pip install "gradio[mcp]"
	import base64
	import json
	import os
	import tempfile

	import anthropic
	import fitz # PyMuPDF
	import gradio as gr


	# It's recommended to load the API key from secrets when deploying.
	# For Hugging Face Spaces, set ANTHROPIC_API_KEY as a secret in the Space settings;
	# it is then read from the process environment below.
	#
	# NOTE(review): `userdata` is not defined or imported anywhere in this file
	# (presumably a leftover from a Colab notebook's secrets helper -- TODO confirm).
	# When it is missing, the lookup raises NameError and we fall back to the
	# environment variable. `except Exception` (instead of a bare `except:`) keeps
	# that fallback while no longer swallowing KeyboardInterrupt / SystemExit.
	try:
	    ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY')  # noqa: F821
	except Exception:
	    ANTHROPIC_API_KEY = os.environ.get('ANTHROPIC_API_KEY')

	# Shared API client used by get_anthropic_response().
	client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

	# Helper Functions from the notebook
	def visualize_raw_response(response):
	    """Return the text blocks of *response* as a pretty-printed JSON string.

	    Only content blocks whose ``type`` is ``"text"`` are included. Each one
	    becomes ``{"type": "text", "text": ...}``; when the block carries a
	    non-empty ``citations`` attribute, every citation is added as the
	    dictionary produced by ``vars()``.
	    """
	    text_blocks = []
	    for item in response.content:
	        if item.type != "text":
	            continue
	        entry = {"type": "text", "text": item.text}
	        cited = getattr(item, 'citations', None)
	        if cited:
	            entry["citations"] = [vars(c) for c in cited]
	        text_blocks.append(entry)
	    return json.dumps({"content": text_blocks}, indent=2)

	def format_citations(response):
	    """Render a response as text with numbered citation markers.

	    Each text block is emitted followed by `` [n]`` for every citation it
	    carries (citations ordered by their start position). Citations are
	    numbered in order of first appearance and de-duplicated on the pair
	    (document title, whitespace-normalised cited text). The result is the
	    concatenated text, a blank line, and one reference line per distinct
	    citation. A falsy *response* yields an empty string.
	    """
	    if not response:
	        return ""

	    def _position(cit):
	        # First truthy start offset among the location attributes, else 0.
	        return getattr(cit, 'start_char_index', 0) or getattr(cit, 'start_page_number', 0) or getattr(cit, 'start_block_index', 0)

	    numbering = {}   # citation key -> assigned reference number
	    references = []  # rendered reference lines, in numbering order
	    pieces = []      # rendered text of each text block
	    for block in response.content:
	        if block.type != "text":
	            continue
	        piece = block.text
	        block_citations = getattr(block, 'citations', None)
	        if block_citations:
	            for cit in sorted(block_citations, key=_position):
	                title = cit.document_title
	                # str.split() with no argument collapses every whitespace run,
	                # including newlines and carriage returns.
	                quote = ' '.join(cit.cited_text.split())
	                key = f"{title}:{quote}"
	                if key not in numbering:
	                    number = len(numbering) + 1
	                    numbering[key] = number
	                    references.append(f"[{number}] \"{quote}\" found in \"{title}\"")
	                piece += f" [{numbering[key]}]"
	        pieces.append(piece)
	    return "".join(pieces) + "\n\n" + "\n".join(references)

	def process_documents(doc_type, file_paths):
	    """Build citation-enabled document blocks from files on disk.

	    Every file is read as bytes and wrapped according to *doc_type*:
	    ``'Plain Text'`` -> UTF-8 decoded text source, ``'PDF'`` -> base64 source,
	    ``'Custom Content'`` -> a content source holding one text block. Any other
	    *doc_type* contributes nothing. The document title is the file's base name.
	    Returns a list of dictionaries (empty when *file_paths* is falsy).
	    """
	    if not file_paths:
	        return []

	    built = []
	    for path in file_paths:
	        with open(path, 'rb') as handle:
	            raw = handle.read()

	        if doc_type == 'Plain Text':
	            source = {"type": "text", "media_type": "text/plain", "data": raw.decode('utf-8')}
	        elif doc_type == 'PDF':
	            source = {"type": "base64", "media_type": "application/pdf", "data": base64.b64encode(raw).decode('utf-8')}
	        elif doc_type == 'Custom Content':
	            source = {"type": "content", "content": [{"type": "text", "text": raw.decode('utf-8')}]}
	        else:
	            continue

	        built.append({
	            "type": "document",
	            "source": source,
	            "title": os.path.basename(path),
	            "citations": {"enabled": True},
	        })
	    return built

	def get_anthropic_response(documents, question):
	    """Send the documents plus the question to the API in one user message.

	    Returns the object produced by ``client.messages.create`` or ``None`` when
	    either argument is empty or when any exception is raised while building
	    or sending the request (the error is printed).
	    """
	    if not (documents and question):
	        return None
	    try:
	        user_turn = {
	            "role": "user",
	            "content": documents + [{"type": "text", "text": question}],
	        }
	        return client.messages.create(
	            model="claude-3-5-sonnet-latest",
	            temperature=0.0,
	            max_tokens=1024,
	            messages=[user_turn],
	        )
	    except Exception as e:
	        print(f"An error occurred: {e}")
	        return None

	def highlight_pdf(response, pdf_path):
	    """Highlight every ``page_location`` citation of *response* in a PDF copy.

	    Args:
	        response: API response whose content blocks may carry ``citations``.
	        pdf_path: Path of the PDF the citations refer to.

	    Returns:
	        Path of a newly written, highlighted PDF, or ``None`` when *response*
	        is falsy or contains no ``page_location`` citations.
	    """
	    if not response:
	        return None
	    pdf_citations = [
	        c
	        for content in response.content
	        if getattr(content, 'citations', None)
	        for c in content.citations
	        if c.type == "page_location"
	    ]
	    if not pdf_citations:
	        return None

	    doc = fitz.open(pdf_path)
	    try:
	        page_count = len(doc)
	        for citation in pdf_citations:
	            text_to_find = citation.cited_text.replace('\u0002', '').strip()
	            if not text_to_find:
	                # Nothing searchable is left after stripping.
	                continue
	            # Citation page numbers are 1-indexed with an exclusive end page,
	            # so the 0-based page range is [start - 1, end - 1). Always look at
	            # the start page even if the reported end does not exceed it.
	            first_page = max(citation.start_page_number - 1, 0)
	            stop_page = max(citation.end_page_number - 1, first_page + 1)
	            for page_num in range(first_page, min(stop_page, page_count)):
	                page = doc[page_num]
	                for inst in page.search_for(text_to_find):
	                    highlight = page.add_highlight_annot(inst)
	                    highlight.set_colors({"stroke": (1, 1, 0)})
	                    highlight.update()

	        # A unique temp file per call: a fixed name in the working directory
	        # would be overwritten by concurrent requests.
	        fd, output_pdf_path = tempfile.mkstemp(prefix="highlighted_", suffix=".pdf")
	        os.close(fd)
	        doc.save(output_pdf_path)
	    finally:
	        # Release the document even when searching or saving fails.
	        doc.close()
	    return output_pdf_path

	def annotate_pdf(pdf_path, annotation_text, page_number):
	    """Write *annotation_text* in red onto one page and save an annotated copy.

	    Args:
	        pdf_path: Path of the PDF to annotate.
	        annotation_text: Text inserted into a fixed box near the page origin.
	        page_number: 1-based page number to annotate.

	    Returns:
	        Path of the annotated copy (``<name>_annotated.pdf`` next to the
	        input), or ``None`` when the input file does not exist or the page
	        number is out of range.
	    """
	    if not pdf_path or not os.path.exists(pdf_path):
	        return None

	    # Derive the output name from the extension only. A plain
	    # str.replace(".pdf", ...) would also rewrite ".pdf" inside directory
	    # names and would return the input path unchanged for e.g. "X.PDF".
	    root, _ext = os.path.splitext(pdf_path)
	    output_pdf_path = root + "_annotated.pdf"

	    doc = fitz.open(pdf_path)
	    try:
	        page_index = page_number - 1
	        if not 0 <= page_index < len(doc):
	            return None
	        page = doc[page_index]
	        rect = fitz.Rect(50, 50, 400, 100)
	        page.insert_textbox(rect, annotation_text, fontsize=12, color=(1, 0, 0))
	        doc.save(output_pdf_path)
	    finally:
	        # Close the document on every path, including errors.
	        doc.close()
	    return output_pdf_path

	def process_and_display(doc_type, question, files, load_samples, annotation_text, annotation_page):
	    """Run the full pipeline for one request and return the eight UI outputs.

	    Args:
	        doc_type: One of 'Plain Text', 'PDF' or 'Custom Content'.
	        question: Question to ask about the documents.
	        files: Uploaded files; each entry is either an object with a ``name``
	            attribute holding its path, or the path itself.
	        load_samples: When true, sample data is requested instead of uploads
	            (no sample files are configured yet, see below).
	        annotation_text: Optional text to stamp onto the PDF.
	        annotation_page: Optional 1-based page number for the annotation.

	    Returns:
	        An 8-tuple: (formatted response text, raw response as a dict,
	        highlighted PDF path, original PDF path, path of a text file with the
	        formatted response, path of a JSON file with the raw response,
	        final PDF path, final PDF path). The final PDF is the annotated PDF if
	        one was produced, otherwise the highlighted one. On failure the first
	        element is an error message, the second ``{}`` and the rest ``None``.
	    """
	    original_pdf_path = None
	    file_names = []
	    if load_samples:
	        # This part needs to be adapted for a deployed environment
	        # as it relies on a local 'data' directory structure.
	        # For deployment, you'd package these files with your app.
	        question = "Sample question"
	        file_names = []  # Add paths to sample files here
	    elif files:
	        # Accept both file wrappers exposing `.name` and plain path strings.
	        file_names = [getattr(f, "name", f) for f in files]

	    if not file_names:
	        return "Please upload documents or load sample data.", {}, None, None, None, None, None, None

	    if doc_type == 'PDF':
	        original_pdf_path = file_names[0]

	    documents = process_documents(doc_type, file_names)
	    response = get_anthropic_response(documents, question)
	    if not response:
	        return "Failed to get response from API.", {}, None, None, None, None, None, None

	    formatted_response = format_citations(response)
	    raw_response_json_str = visualize_raw_response(response)
	    raw_response_json = json.loads(raw_response_json_str)

	    highlighted_pdf_path = None
	    annotated_pdf_path = None
	    if doc_type == 'PDF':
	        highlighted_pdf_path = highlight_pdf(response, original_pdf_path)
	        if annotation_text and annotation_page:
	            # Annotate on top of the highlights when there are any.
	            pdf_to_annotate = highlighted_pdf_path if highlighted_pdf_path else original_pdf_path
	            if pdf_to_annotate:
	                annotated_pdf_path = annotate_pdf(pdf_to_annotate, annotation_text, int(annotation_page))

	    # Downloadable copies of both textual outputs.
	    formatted_response_path = _write_temp_text(formatted_response, ".txt")
	    raw_response_path = _write_temp_text(raw_response_json_str, ".json")

	    final_pdf_path = annotated_pdf_path if annotated_pdf_path else highlighted_pdf_path

	    return (
	        formatted_response,
	        raw_response_json,
	        highlighted_pdf_path,
	        original_pdf_path,
	        formatted_response_path,
	        raw_response_path,
	        final_pdf_path,
	        final_pdf_path,
	    )


	def _write_temp_text(text, suffix):
	    """Write *text* to a new UTF-8 temp file kept on disk and return its path."""
	    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, mode="w", encoding='utf-8') as f:
	        f.write(text)
	        return f.name


	# Gradio Interface
	# Input widgets, in the positional order of process_and_display's parameters.
	_INPUT_COMPONENTS = [
	    gr.Radio(['Plain Text', 'PDF', 'Custom Content'], label="Document Type"),
	    gr.Textbox(lines=2, placeholder="Enter your question here...", label="Question"),
	    gr.File(file_count="multiple", label="Upload Documents"),
	    gr.Checkbox(label="Load Sample Data (requires data folder)"),
	    gr.Textbox(lines=2, placeholder="Enter annotation text...", label="Annotation Text"),
	    gr.Number(label="Annotation Page Number", precision=0),
	]

	# Output widgets, in the order of the tuple returned by process_and_display.
	_OUTPUT_COMPONENTS = [
	    gr.Textbox(label="Formatted Response"),
	    gr.JSON(label="Raw API Response"),
	    gr.File(label="Highlighted PDF"),
	    gr.File(label="Original PDF"),
	    gr.File(label="Download Formatted Response"),
	    gr.File(label="Download Raw Response"),
	    gr.File(label="Download Highlighted PDF"),
	    gr.File(label="Final Annotated PDF"),
	]

	iface = gr.Interface(
	    fn=process_and_display,
	    inputs=_INPUT_COMPONENTS,
	    outputs=_OUTPUT_COMPONENTS,
	    title="Anthropic Citations API Explorer",
	    description="Explore Anthropic's citation capabilities. Upload documents, ask questions, see cited responses, and add your own annotations.",
	)

	# Start the web UI only when the file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()