Spaces:

mmrech
/

citation-interpreter

Running

App Files Files Community

citation-interpreter / utils /anthropic_api.py

mmrech

Upload folder using huggingface_hub

9c6c358 verified about 1 year ago

raw

history blame contribute delete

16.6 kB

	import os
	import json
	import requests
	import base64
	from typing import Dict, List, Union, Optional, Any
	from anthropic import Anthropic

	class AnthropicCitationsAPI:
	    """
	    Client that asks Anthropic's Claude models for answers with citation markers.

	    The request methods send plain text (or text extracted from a PDF)
	    through ``client.messages.create`` and instruct the model, via the
	    prompt, to mark referenced material with ``[citation]`` / ``[Page X]``
	    markers. ``extract_citations`` turns a response into plain text, HTML
	    and Markdown together with the citations and sources it could identify.
	    """

	    # Model used when neither the constructor nor an individual call names one.
	    DEFAULT_MODEL = "claude-3-opus-20240229"

	    def __init__(self, api_key: Optional[str] = None, default_model: Optional[str] = None):
	        """
	        Initialize the Anthropic Citations API client.

	        Args:
	            api_key: Anthropic API key. If not provided, the ANTHROPIC_API_KEY
	                environment variable is used.
	            default_model: Model used when a call does not pass ``model``.
	                Falls back to ``DEFAULT_MODEL`` when omitted.

	        Raises:
	            ValueError: If no API key is provided or found in the environment.
	        """
	        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
	        if not self.api_key:
	            raise ValueError(
	                "No API key provided. Please provide an API key or set the ANTHROPIC_API_KEY environment variable."
	            )
	        self.client = Anthropic(api_key=self.api_key)
	        self.default_model = default_model or self.DEFAULT_MODEL

	    def generate_with_citations(
	        self,
	        text_content: str,
	        prompt: str = "Please analyze this content and provide information with citations.",
	        model: Optional[str] = None,
	        max_tokens: int = 1024
	    ) -> Any:
	        """
	        Ask the model to analyze ``text_content`` and mark what it cites.

	        The source text and the prompt are sent as a single user text block;
	        citations are requested through the prompt and system text only.

	        Args:
	            text_content: The text content to use as a citation source.
	            prompt: The prompt to send to the model.
	            model: The model to use (defaults to ``self.default_model``).
	            max_tokens: Maximum number of tokens to generate.

	        Returns:
	            Whatever ``self.client.messages.create`` returns.

	        Raises:
	            Exception: Any error raised by the API client propagates.
	        """
	        model = model or self.default_model

	        # Use standard message format without citations features
	        return self.client.messages.create(
	            model=model,
	            max_tokens=max_tokens,
	            system="Analyze the provided content. Whenever you reference information from the input text, indicate the source with [citation] markers.",
	            messages=[
	                {
	                    "role": "user",
	                    "content": [
	                        {
	                            "type": "text",
	                            "text": text_content + "\n\n" + prompt + " Please cite the source when you reference information from the text."
	                        }
	                    ]
	                }
	            ]
	        )

	    def process_pdf_with_citations(
	        self,
	        pdf_path: str,
	        prompt: str = "Please analyze this document and provide information with citations.",
	        model: Optional[str] = None,
	        max_tokens: int = 1024,
	        system_prompt: Optional[str] = None
	    ) -> Any:
	        """
	        Extract the text of a PDF and ask the model to analyze it with page citations.

	        Args:
	            pdf_path: Path to the PDF file.
	            prompt: The prompt to send to the model.
	            model: The model to use (defaults to ``self.default_model``).
	            max_tokens: Maximum number of tokens to generate.
	            system_prompt: Optional system prompt; a default asking for
	                ``[Page X]`` markers is used when empty or omitted.

	        Returns:
	            Whatever ``self.client.messages.create`` returns.

	        Raises:
	            ImportError: If neither PyMuPDF nor PyPDF2 is installed.
	            Exception: Errors from reading the PDF or from the API client propagate.
	        """
	        model = model or self.default_model
	        pdf_text = self._extract_pdf_text(pdf_path)

	        system_text = system_prompt or (
	            "Analyze the provided PDF document text. Whenever you reference information from the document, indicate the source with [Page X] markers."
	        )

	        # Use standard message format with the extracted text
	        return self.client.messages.create(
	            model=model,
	            max_tokens=max_tokens,
	            system=system_text,
	            messages=[
	                {
	                    "role": "user",
	                    "content": [
	                        {
	                            "type": "text",
	                            "text": pdf_text
	                        },
	                        {
	                            "type": "text",
	                            "text": prompt + " Please cite the source with page numbers when you reference information from the document, using the format [Page X]."
	                        }
	                    ]
	                }
	            ]
	        )

	    @staticmethod
	    def _extract_pdf_text(pdf_path: str) -> str:
	        """Return the PDF's text with a ``--- Page N ---`` header before each page."""
	        # The import must live inside the try: otherwise a missing PyMuPDF
	        # raises before the PyPDF2 fallback can ever be attempted.
	        try:
	            import fitz  # PyMuPDF
	        except ImportError:
	            fitz = None

	        if fitz is not None:
	            doc = fitz.open(pdf_path)
	            try:
	                parts = []
	                for page_num in range(len(doc)):
	                    page = doc.load_page(page_num)
	                    parts.append(f"\n\n--- Page {page_num + 1} ---\n\n")
	                    parts.append(page.get_text())
	                return "".join(parts)
	            finally:
	                # Close the document even when a page fails to load.
	                doc.close()

	        try:
	            from PyPDF2 import PdfReader
	        except ImportError as err:
	            raise ImportError(
	                "Neither PyMuPDF nor PyPDF2 is installed. Please install one of these packages to process PDFs."
	            ) from err

	        reader = PdfReader(pdf_path)
	        parts = []
	        for page_num, page in enumerate(reader.pages):
	            parts.append(f"\n\n--- Page {page_num + 1} ---\n\n")
	            parts.append(page.extract_text() or "No text content found on this page.")
	        return "".join(parts)

	    @staticmethod
	    def _has_field(obj: Any, name: str) -> bool:
	        """Return True if ``obj`` has ``name`` as a dict key or as an attribute."""
	        if isinstance(obj, dict):
	            return name in obj
	        return hasattr(obj, name)

	    @staticmethod
	    def _field(obj: Any, name: str, default: Any = None) -> Any:
	        """Read ``name`` from ``obj`` whether it is a dict or an attribute-style object."""
	        if isinstance(obj, dict):
	            return obj.get(name, default)
	        return getattr(obj, name, default)

	    def _find_citation_sources(self, response: Any, content: Any) -> List[Any]:
	        """Return the raw source list from ``citations`` or a ``citations`` tool output."""
	        if self._has_field(response, "citations"):
	            return list(self._field(response, "citations") or [])

	        for holder in (response, content):
	            if not self._has_field(holder, "tool_outputs"):
	                continue
	            found = []
	            for tool_output in self._field(holder, "tool_outputs") or []:
	                if self._field(tool_output, "name") == "citations":
	                    found = self._field(tool_output, "citations", [])
	            return list(found or [])
	        return []

	    @staticmethod
	    def _parse_citation_markers(text: str) -> tuple:
	        """Parse ``[...]`` markers in ``text`` into (citations, sources, html, footnotes)."""
	        from html import escape
	        import re

	        citations = []
	        sources = []
	        html_parts = []
	        footnotes = []
	        cursor = 0

	        # Look for common citation patterns: [citation], [1], [Page X], etc.
	        for citation_id, match in enumerate(re.finditer(r'\[([^\]]+)\]', text), start=1):
	            marker = match.group(0)  # The full [citation] text
	            inner = match.group(1)  # Just the content inside brackets

	            citations.append({
	                "text": marker,
	                "citation_index": citation_id,
	                "start": match.start(),
	                "end": match.end(),
	                "content": inner
	            })

	            # Build the HTML piecewise so the text between markers can be
	            # escaped without disturbing the match offsets.
	            html_parts.append(escape(text[cursor:match.start()]))
	            html_parts.append(
	                f'<span class="citation" data-citation-id="{citation_id}" title="Citation {citation_id}">{escape(marker)}</span>'
	            )
	            cursor = match.end()

	            if "page" in inner.lower():
	                # Page citations: the first run of digits is the page number.
	                digits = re.search(r'\d+', inner)
	                footnotes.append(f"\n\n[^{citation_id}]: {inner}")
	                sources.append({
	                    "index": citation_id,
	                    "text": inner,
	                    "page_number": digits.group(0) if digits else None
	                })
	            else:
	                footnotes.append(f"\n\n[^{citation_id}]: Citation {citation_id}: {inner}")
	                sources.append({
	                    "index": citation_id,
	                    "text": inner
	                })

	        html_parts.append(escape(text[cursor:]))
	        return citations, sources, "".join(html_parts), "".join(footnotes)

	    def extract_citations(self, response: Any) -> Dict[str, Any]:
	        """
	        Extract citation information from a response.

	        Accepts attribute-style objects and dicts, both for the response
	        itself and for its content blocks and sources. ``content`` may be a
	        list of blocks, a single block, or a plain string. Blocks of type
	        ``"text"`` and ``"citation"`` are used; anything else is skipped.
	        When no ``"citation"`` block is present, bracketed markers such as
	        ``[Page 3]`` in the text are parsed as citations instead.

	        Text placed into the HTML output is HTML-escaped.

	        Args:
	            response: The response to parse.

	        Returns:
	            A dictionary containing:
	            - text: The full text content
	            - citations: List of citation dicts with text and metadata
	            - sources: List of source references
	            - html: HTML-formatted text with citation spans
	            - markdown: Markdown-formatted text with footnotes
	            All five keys are present even for an empty or unusable response.
	        """
	        from html import escape

	        if not response or not self._has_field(response, "content"):
	            return {"text": "", "citations": [], "sources": [], "html": "", "markdown": ""}

	        content = self._field(response, "content")

	        text_parts = []
	        html_parts = []
	        markdown_parts = []
	        citations = []

	        if isinstance(content, str):
	            # A bare string has no blocks; treat it as one run of text.
	            blocks = []
	            text_parts.append(content)
	            html_parts.append(escape(content))
	            markdown_parts.append(content)
	        elif isinstance(content, list):
	            blocks = content
	        else:
	            # Single content case
	            blocks = [content]

	        try:
	            raw_sources = self._find_citation_sources(response, content)
	        except (AttributeError, TypeError) as e:
	            print(f"Warning: Error parsing response format: {e}")
	            raw_sources = []

	        # Process content blocks
	        for block in blocks:
	            block_type = self._field(block, "type")

	            if block_type == "text":
	                text = self._field(block, "text", "") or ""
	                text_parts.append(text)
	                html_parts.append(escape(text))
	                markdown_parts.append(text)

	            elif block_type == "citation":
	                cited = self._field(block, "text", "") or ""

	                # Prefer citation_index, then index, then running position.
	                citation_id = self._field(block, "citation_index")
	                if citation_id is None:
	                    citation_id = self._field(block, "index")
	                if citation_id is None:
	                    citation_id = len(citations) + 1
	                safe_id = escape(str(citation_id))

	                text_parts.append(cited)
	                html_parts.append(
	                    f'<span class="citation" data-citation-id="{safe_id}" title="Citation {safe_id}">{escape(cited)}<sup>[{safe_id}]</sup></span>'
	                )
	                markdown_parts.append(f'{cited}[^{citation_id}]')

	                citations.append({
	                    "text": cited,
	                    "citation_index": citation_id,
	                    "start": self._field(block, "start"),
	                    "end": self._field(block, "end")
	                })

	        # Process citation sources
	        processed_sources = []
	        source_footnotes = []
	        for source in raw_sources:
	            if not (isinstance(source, dict) or hasattr(source, "index")):
	                continue

	            source_info = {
	                "index": self._field(source, "index"),
	                "text": self._field(source, "text", ""),
	                "document_id": self._field(source, "document_id"),
	                "page_number": self._field(source, "page_number")
	            }
	            processed_sources.append(source_info)

	            # Add footnotes to markdown
	            if source_info["index"] is not None:
	                page_info = f"Page {source_info['page_number']}: " if source_info["page_number"] else ""
	                source_footnotes.append(f"\n\n[^{source_info['index']}]: {page_info}{source_info['text']}")

	        # Source references section for the HTML output
	        html_sources = ""
	        if processed_sources:
	            items = []
	            for source in processed_sources:
	                page_info = f"Page {source['page_number']}: " if source["page_number"] else ""
	                items.append(
	                    f"<li id='citation-{escape(str(source['index']))}'>{escape(page_info)}{escape(str(source['text']))}</li>\n"
	                )
	            html_sources = (
	                "\n<div class='citation-sources'>\n<h3>Sources</h3>\n<ol>\n"
	                + "".join(items)
	                + "</ol>\n</div>"
	            )

	        full_text = "".join(text_parts)
	        html_body = "".join(html_parts)
	        markdown_body = "".join(markdown_parts)
	        marker_footnotes = ""

	        # If no citation blocks were found but we have text, fall back to
	        # the bracketed markers that the request prompts ask the model for.
	        if not citations and full_text:
	            citations, marker_sources, html_body, marker_footnotes = self._parse_citation_markers(full_text)
	            processed_sources.extend(marker_sources)

	        return {
	            "text": full_text,
	            "citations": citations,
	            "sources": processed_sources,
	            "html": html_body + html_sources,
	            "markdown": markdown_body + "".join(source_footnotes) + marker_footnotes
	        }