# citation-interpreter/utils/anthropic_api.py
# mmrech's picture
# Upload folder using huggingface_hub
# 9c6c358 verified
import base64
import html
import json
import os
import re
from typing import Any, Dict, List, Optional, Union

import requests
from anthropic import Anthropic
class AnthropicCitationsAPI:
    """
    Client for Anthropic's Claude models that asks for, and then parses, citations.

    The request methods send plain text (or text extracted from a PDF) to the
    Messages API and instruct the model, through the prompt, to mark cited
    passages with bracketed markers such as ``[citation]`` or ``[Page X]``.
    ``extract_citations`` turns a response into plain text, HTML and Markdown
    with the citations and their sources listed separately.
    """

    # Sentinel that distinguishes "field absent" from "field present but None".
    _MISSING = object()

    # Any bracketed marker: [citation], [1], [Page 3], ...
    _MARKER_PATTERN = re.compile(r'\[([^\]]+)\]')

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the Anthropic Citations API client.

        Args:
            api_key: Anthropic API key. If not provided, the ANTHROPIC_API_KEY
                environment variable is used.

        Raises:
            ValueError: If no API key is provided or found in the environment.
        """
        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not self.api_key:
            raise ValueError(
                "No API key provided. Please provide an API key or set the ANTHROPIC_API_KEY environment variable."
            )
        self.client = Anthropic(api_key=self.api_key)
        self.default_model = "claude-3-opus-20240229"

    def generate_with_citations(
        self,
        text_content: str,
        prompt: str = "Please analyze this content and provide information with citations.",
        model: Optional[str] = None,
        max_tokens: int = 1024
    ) -> Any:
        """
        Generate text with citations using the provided text as the source.

        The source text and the prompt are sent as one user message; citations
        are requested through the prompt wording (``[citation]`` markers), not
        through a dedicated API feature.

        Args:
            text_content: The text content to use as a citation source.
            prompt: The prompt to send to the model.
            model: The model to use (defaults to ``self.default_model``).
            max_tokens: Maximum number of tokens to generate.

        Returns:
            The response object returned by ``client.messages.create``.

        Raises:
            Exception: Whatever the underlying client raises if the request fails.
        """
        user_text = (
            text_content
            + "\n\n"
            + prompt
            + " Please cite the source when you reference information from the text."
        )
        return self.client.messages.create(
            model=model or self.default_model,
            max_tokens=max_tokens,
            system="Analyze the provided content. Whenever you reference information from the input text, indicate the source with [citation] markers.",
            messages=[
                {
                    "role": "user",
                    "content": [{"type": "text", "text": user_text}],
                }
            ],
        )

    def process_pdf_with_citations(
        self,
        pdf_path: str,
        prompt: str = "Please analyze this document and provide information with citations.",
        model: Optional[str] = None,
        max_tokens: int = 1024,
        system_prompt: Optional[str] = None
    ) -> Any:
        """
        Generate text with citations using a PDF document as the source.

        The PDF is converted to text locally (each page preceded by a
        ``--- Page N ---`` header) and sent as a text block, followed by the
        prompt asking for ``[Page X]`` citations.

        Args:
            pdf_path: Path to the PDF file.
            prompt: The prompt to send to the model.
            model: The model to use (defaults to ``self.default_model``).
            max_tokens: Maximum number of tokens to generate.
            system_prompt: Optional system prompt; a default instruction asking
                for ``[Page X]`` markers is used when omitted or empty.

        Returns:
            The response object returned by ``client.messages.create``.

        Raises:
            ImportError: If neither PyMuPDF nor PyPDF2 is installed.
            Exception: If the PDF cannot be read or the API request fails.
        """
        pdf_text = self._extract_pdf_text(pdf_path)
        default_system = "Analyze the provided PDF document text. Whenever you reference information from the document, indicate the source with [Page X] markers."
        message_params = {
            "model": model or self.default_model,
            "max_tokens": max_tokens,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": pdf_text},
                        {
                            "type": "text",
                            "text": prompt + " Please cite the source with page numbers when you reference information from the document, using the format [Page X]."
                        },
                    ],
                }
            ],
            "system": system_prompt or default_system,
        }
        return self.client.messages.create(**message_params)

    @staticmethod
    def _extract_pdf_text(pdf_path: str) -> str:
        """Return the PDF's text with a ``--- Page N ---`` header before each page."""
        # The import must live inside the try: otherwise a missing PyMuPDF
        # raises before the PyPDF2 fallback below can ever be reached.
        try:
            import fitz  # PyMuPDF
        except ImportError:
            fitz = None

        chunks = []
        if fitz is not None:
            doc = fitz.open(pdf_path)
            try:
                for page_number, page in enumerate(doc, start=1):
                    chunks.append(f"\n\n--- Page {page_number} ---\n\n")
                    chunks.append(page.get_text())
            finally:
                # Release the file handle even when a page fails to extract.
                doc.close()
            return "".join(chunks)

        try:
            from PyPDF2 import PdfReader
        except ImportError as err:
            raise ImportError(
                "Neither PyMuPDF nor PyPDF2 is installed. Please install one of these packages to process PDFs."
            ) from err

        reader = PdfReader(pdf_path)
        for page_number, page in enumerate(reader.pages, start=1):
            chunks.append(f"\n\n--- Page {page_number} ---\n\n")
            chunks.append(page.extract_text() or "No text content found on this page.")
        return "".join(chunks)

    @staticmethod
    def _field(obj: Any, name: str, default: Any = None) -> Any:
        """Read ``name`` from a dict key or an object attribute, whichever ``obj`` offers."""
        if isinstance(obj, dict):
            return obj.get(name, default)
        return getattr(obj, name, default)

    def _find_citation_sources(self, response: Any, content: Any) -> List[Any]:
        """Locate the raw source list: ``citations`` first, then a ``citations`` tool output."""
        direct = self._field(response, "citations", self._MISSING)
        if direct is not self._MISSING:
            # A present-but-None field means "no sources", not an error.
            return list(direct or [])
        for holder in (response, content):
            tool_outputs = self._field(holder, "tool_outputs", self._MISSING)
            if tool_outputs is self._MISSING:
                continue
            found = []
            for tool_output in tool_outputs or []:
                if self._field(tool_output, "name") == "citations":
                    found = self._field(tool_output, "citations", []) or []
            return list(found)
        return []

    def _normalise_sources(self, raw_sources: List[Any]) -> List[Dict[str, Any]]:
        """Convert dict- or object-style sources to plain dicts; skip anything else."""
        normalised = []
        for source in raw_sources:
            if not (isinstance(source, dict) or hasattr(source, "index")):
                continue
            normalised.append({
                "index": self._field(source, "index"),
                "text": self._field(source, "text", ""),
                "document_id": self._field(source, "document_id"),
                "page_number": self._field(source, "page_number"),
            })
        return normalised

    def _annotate_markers(self, text: str):
        """
        Treat every bracketed marker in ``text`` as a citation.

        Returns a tuple ``(citations, html_body, sources, footnotes)`` where
        ``html_body`` is the escaped text with each marker wrapped in a
        ``<span class="citation">`` and ``footnotes`` are Markdown footnote
        definitions, one per marker.
        """
        citations = []
        sources = []
        footnotes = []
        html_parts = []
        cursor = 0
        for citation_id, match in enumerate(self._MARKER_PATTERN.finditer(text), start=1):
            marker = match.group(0)  # full "[...]" text
            inner = match.group(1)   # text between the brackets
            citations.append({
                "text": marker,
                "citation_index": citation_id,
                "start": match.start(),
                "end": match.end(),
                "content": inner,
            })
            # Rebuild the HTML piecewise so the surrounding text can be escaped
            # without disturbing the match offsets.
            html_parts.append(html.escape(text[cursor:match.start()], quote=False))
            html_parts.append(
                f'<span class="citation" data-citation-id="{citation_id}" title="Citation {citation_id}">'
                f'{html.escape(marker, quote=False)}</span>'
            )
            cursor = match.end()

            if "page" in inner.lower():
                page_match = re.search(r'\d+', inner)
                footnotes.append(f"\n\n[^{citation_id}]: {inner}")
                sources.append({
                    "index": citation_id,
                    "text": inner,
                    "page_number": page_match.group(0) if page_match else None,
                })
            else:
                footnotes.append(f"\n\n[^{citation_id}]: Citation {citation_id}: {inner}")
                sources.append({"index": citation_id, "text": inner})
        html_parts.append(html.escape(text[cursor:], quote=False))
        return citations, "".join(html_parts), sources, footnotes

    def extract_citations(self, response: Any) -> Dict[str, Any]:
        """
        Extract citation information from a response.

        Content blocks of type ``text`` and ``citation`` are read from
        ``response.content`` (or ``response["content"]``); blocks and sources
        may be attribute-style objects or dicts. When no ``citation`` blocks
        are present, bracketed markers in the text (``[citation]``, ``[1]``,
        ``[Page X]``) are parsed as citations instead. Text placed in the HTML
        output is escaped.

        Args:
            response: The response from the API, as an object or a dict.

        Returns:
            A dictionary containing:
                - text: The full text content
                - citations: List of citation dicts (``text``, ``citation_index``,
                  ``start``, ``end``; marker-derived ones also carry ``content``)
                - sources: List of source references
                - html: HTML-formatted text with citation spans and, for
                  structured sources, a trailing sources list
                - markdown: Markdown-formatted text with footnotes
        """
        content = self._field(response, "content", self._MISSING)
        if not response or content is self._MISSING:
            return {"text": "", "citations": [], "sources": [], "html": "", "markdown": ""}

        if isinstance(content, str):
            blocks = []
        elif isinstance(content, list):
            blocks = content
        else:
            blocks = [content]

        # Source lookup is best-effort: an unexpected response shape should
        # degrade to "no sources" rather than abort the whole extraction.
        try:
            raw_sources = self._find_citation_sources(response, content)
        except Exception as e:
            print(f"Warning: Error parsing response format: {e}")
            raw_sources = []

        text_parts = []
        html_parts = []
        markdown_parts = []
        citations = []
        for block in blocks:
            block_type = self._field(block, "type")
            if block_type == "text":
                block_text = self._field(block, "text", "") or ""
                text_parts.append(block_text)
                html_parts.append(html.escape(block_text, quote=False))
                markdown_parts.append(block_text)
            elif block_type == "citation":
                cited_text = self._field(block, "text", "") or ""
                citation_id = self._field(block, "citation_index")
                if citation_id is None:
                    # Fall back to the block's own index, then to its position.
                    citation_id = self._field(block, "index", len(citations) + 1)
                safe_id = html.escape(str(citation_id))
                text_parts.append(cited_text)
                html_parts.append(
                    f'<span class="citation" data-citation-id="{safe_id}" title="Citation {safe_id}">'
                    f'{html.escape(cited_text, quote=False)}<sup>[{safe_id}]</sup></span>'
                )
                markdown_parts.append(f'{cited_text}[^{citation_id}]')
                citations.append({
                    "text": cited_text,
                    "citation_index": citation_id,
                    "start": self._field(block, "start"),
                    "end": self._field(block, "end"),
                })

        if isinstance(content, str):
            full_text = content
            html_body = html.escape(content, quote=False)
            markdown_body = content
        else:
            full_text = "".join(text_parts)
            html_body = "".join(html_parts)
            markdown_body = "".join(markdown_parts)

        # Structured sources become Markdown footnotes and an HTML list.
        structured_sources = self._normalise_sources(raw_sources)
        structured_footnotes = []
        source_items = []
        for source in structured_sources:
            page_number = source["page_number"]
            page_info = f"Page {page_number}: " if page_number else ""
            if source["index"] is not None:
                structured_footnotes.append(
                    f"\n\n[^{source['index']}]: {page_info}{source['text']}"
                )
            source_items.append(
                f"<li id='citation-{html.escape(str(source['index']))}'>"
                f"{html.escape(f'{page_info}{source_text}', quote=False)}</li>\n"
                if (source_text := None) else
                "<li id='citation-{0}'>{1}</li>\n".format(
                    html.escape(str(source["index"])),
                    html.escape(page_info + str(source["text"]), quote=False),
                )
            ) if False else source_items.append(
                "<li id='citation-{0}'>{1}</li>\n".format(
                    html.escape(str(source["index"])),
                    html.escape(page_info + str(source["text"]), quote=False),
                )
            )
        source_items = [item for item in source_items if item is not None]
        sources_html = ""
        if structured_sources:
            sources_html = (
                "\n<div class='citation-sources'>\n<h3>Sources</h3>\n<ol>\n"
                + "".join(source_items)
                + "</ol>\n</div>"
            )

        # No citation blocks: the model was asked to use bracketed markers, so
        # parse those out of whatever text was collected.
        marker_sources = []
        marker_footnotes = []
        if not citations and full_text:
            citations, html_body, marker_sources, marker_footnotes = self._annotate_markers(full_text)
            markdown_body = full_text

        return {
            "text": full_text,
            "citations": citations,
            "sources": structured_sources + marker_sources,
            "html": html_body + sources_html,
            "markdown": markdown_body + "".join(structured_footnotes) + "".join(marker_footnotes),
        }