Spaces:
Running
Running
| import os | |
| import json | |
| import requests | |
| import base64 | |
| from typing import Dict, List, Union, Optional, Any | |
| from anthropic import Anthropic | |
class AnthropicCitationsAPI:
    """
    Prompt-based citation helper built on Anthropic's Messages API.

    The request methods send source text (plain text or text extracted from
    a PDF) to Claude together with instructions to mark references with
    bracketed markers such as ``[citation]`` or ``[Page 3]``.
    ``extract_citations`` then turns a response into plain text, HTML and
    Markdown plus a list of citations and sources.

    NOTE: the requests use the standard message format; they do not enable
    a dedicated citations feature on the request.
    """

    # Model used when neither the constructor nor the call specifies one.
    DEFAULT_MODEL = "claude-3-opus-20240229"

    # Any bracketed run of characters: [citation], [1], [Page 3], ...
    # This also matches unrelated bracketed text (e.g. Markdown link labels).
    _MARKER_PATTERN = r"\[([^\]]+)\]"

    def __init__(self, api_key: Optional[str] = None, default_model: Optional[str] = None):
        """
        Initialize the client.

        Args:
            api_key: Anthropic API key. If not provided, the
                ANTHROPIC_API_KEY environment variable is used.
            default_model: Model used when a call does not pass ``model``.
                Defaults to ``DEFAULT_MODEL``.

        Raises:
            ValueError: If no API key is provided or found in the environment.
        """
        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not self.api_key:
            raise ValueError(
                "No API key provided. Please provide an API key or set the ANTHROPIC_API_KEY environment variable."
            )
        self.client = Anthropic(api_key=self.api_key)
        self.default_model = default_model or self.DEFAULT_MODEL

    def generate_with_citations(
        self,
        text_content: str,
        prompt: str = "Please analyze this content and provide information with citations.",
        model: Optional[str] = None,
        max_tokens: int = 1024
    ) -> Any:
        """
        Ask the model to analyze ``text_content`` and cite it with markers.

        Args:
            text_content: The text to use as the citation source.
            prompt: The instruction appended after the source text.
            model: The model to use (defaults to ``self.default_model``).
            max_tokens: Maximum number of tokens to generate.

        Returns:
            Whatever ``self.client.messages.create`` returns, unmodified.

        Raises:
            Exception: Any error raised by the API client propagates.
        """
        user_text = (
            text_content
            + "\n\n"
            + prompt
            + " Please cite the source when you reference information from the text."
        )
        # Standard message format; citations are requested through the prompt.
        return self.client.messages.create(
            model=model or self.default_model,
            max_tokens=max_tokens,
            system="Analyze the provided content. Whenever you reference information from the input text, indicate the source with [citation] markers.",
            messages=[
                {
                    "role": "user",
                    "content": [{"type": "text", "text": user_text}],
                }
            ],
        )

    def process_pdf_with_citations(
        self,
        pdf_path: str,
        prompt: str = "Please analyze this document and provide information with citations.",
        model: Optional[str] = None,
        max_tokens: int = 1024,
        system_prompt: Optional[str] = None
    ) -> Any:
        """
        Extract the text of a PDF and ask the model to cite it by page.

        Args:
            pdf_path: Path to the PDF file.
            prompt: The instruction sent after the extracted text.
            model: The model to use (defaults to ``self.default_model``).
            max_tokens: Maximum number of tokens to generate.
            system_prompt: Optional system prompt; a page-citation default
                is used when it is empty or omitted.

        Returns:
            Whatever ``self.client.messages.create`` returns, unmodified.

        Raises:
            ImportError: If neither PyMuPDF nor PyPDF2 is installed.
            Exception: Errors from reading the PDF or from the API client
                propagate.
        """
        pdf_text = self._extract_pdf_text(pdf_path)
        system_text = system_prompt or (
            "Analyze the provided PDF document text. Whenever you reference information from the document, indicate the source with [Page X] markers."
        )
        return self.client.messages.create(
            model=model or self.default_model,
            max_tokens=max_tokens,
            system=system_text,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": pdf_text},
                        {
                            "type": "text",
                            "text": prompt + " Please cite the source with page numbers when you reference information from the document, using the format [Page X].",
                        },
                    ],
                }
            ],
        )

    @staticmethod
    def _extract_pdf_text(pdf_path: str) -> str:
        """Return the PDF's text with a ``--- Page N ---`` header before each page."""
        # The import must live inside the try: otherwise a missing PyMuPDF
        # raises before the PyPDF2 fallback can be attempted.
        try:
            import fitz  # PyMuPDF
        except ImportError:
            fitz = None

        parts = []
        if fitz is not None:
            doc = fitz.open(pdf_path)
            try:
                for page_num, page in enumerate(doc, start=1):
                    parts.append(f"\n\n--- Page {page_num} ---\n\n")
                    parts.append(page.get_text())
            finally:
                # Release the file handle even if a page fails to extract.
                doc.close()
            return "".join(parts)

        try:
            from PyPDF2 import PdfReader
        except ImportError as err:
            raise ImportError("Neither PyMuPDF nor PyPDF2 is installed. Please install one of these packages to process PDFs.") from err

        reader = PdfReader(pdf_path)
        for page_num, page in enumerate(reader.pages, start=1):
            parts.append(f"\n\n--- Page {page_num} ---\n\n")
            parts.append(page.extract_text() or "No text content found on this page.")
        return "".join(parts)

    @staticmethod
    def _field(obj: Any, name: str, default: Any = None) -> Any:
        """Read ``name`` from a dict key or an object attribute."""
        if isinstance(obj, dict):
            return obj.get(name, default)
        return getattr(obj, name, default)

    @staticmethod
    def _has_field(obj: Any, name: str) -> bool:
        """Tell whether ``obj`` carries ``name`` as a dict key or attribute."""
        if isinstance(obj, dict):
            return name in obj
        return hasattr(obj, name)

    @classmethod
    def _find_citation_sources(cls, response: Any, content: Any) -> List[Any]:
        """Locate the raw source list on the response, trying the known layouts in order."""
        if cls._has_field(response, "citations"):
            return list(cls._field(response, "citations") or [])
        for holder in (response, content):
            if not cls._has_field(holder, "tool_outputs"):
                continue
            found = []
            for tool_output in cls._field(holder, "tool_outputs") or []:
                if cls._field(tool_output, "name") == "citations":
                    # The last matching tool output wins.
                    found = cls._field(tool_output, "citations", []) or []
            return list(found)
        return []

    @classmethod
    def _scan_citation_markers(cls, text: str) -> Dict[str, Any]:
        """Derive citations, sources, HTML and footnotes from bracketed markers in ``text``."""
        import html
        import re

        citations = []
        sources = []
        footnotes = []
        html_parts = []
        cursor = 0
        for citation_id, match in enumerate(re.finditer(cls._MARKER_PATTERN, text), start=1):
            marker = match.group(0)  # full "[...]" text
            label = match.group(1)   # text inside the brackets
            citations.append({
                "text": marker,
                "citation_index": citation_id,
                "start": match.start(),
                "end": match.end(),
                "content": label,
            })
            # Rebuild the HTML piecewise so plain text is escaped while the
            # generated <span> markup is not.
            html_parts.append(html.escape(text[cursor:match.start()]))
            html_parts.append(
                f'<span class="citation" data-citation-id="{citation_id}" title="Citation {citation_id}">{html.escape(marker)}</span>'
            )
            cursor = match.end()

            page_number = None
            if "page" in label.lower():
                digits = re.search(r"\d+", label)
                page_number = digits.group(0) if digits else None
                footnotes.append(f"\n\n[^{citation_id}]: {label}")
            else:
                footnotes.append(f"\n\n[^{citation_id}]: Citation {citation_id}: {label}")
            sources.append({
                "index": citation_id,
                "text": label,
                "document_id": None,
                "page_number": page_number,
            })
        html_parts.append(html.escape(text[cursor:]))
        return {
            "citations": citations,
            "sources": sources,
            "html": "".join(html_parts),
            "footnotes": "".join(footnotes),
        }

    def extract_citations(self, response: Any) -> Dict[str, Any]:
        """
        Extract citation information from a response.

        Accepts an object with attributes or a plain dict. ``content`` may
        be a string or a list of blocks (objects or dicts) whose ``type``
        is ``"text"`` or ``"citation"``. When no citation blocks and no
        structured sources are found, bracketed markers in the text
        (``[citation]``, ``[1]``, ``[Page X]``) are parsed instead.

        Args:
            response: The response to parse.

        Returns:
            A dictionary containing:
            - text: The full text content
            - citations: List of citation dicts with text and position data
            - sources: List of source dicts (index, text, document_id,
              page_number)
            - html: HTML-escaped text with citation ``<span>`` markup and,
              for structured sources, a trailing sources list
            - markdown: The text followed by footnote definitions
        """
        import html

        if not response or not self._has_field(response, "content"):
            return {"text": "", "citations": [], "sources": [], "html": "", "markdown": ""}

        content = self._field(response, "content")
        if isinstance(content, str):
            # Treat a bare string as a single text block.
            content_blocks = [{"type": "text", "text": content}]
        elif isinstance(content, list):
            content_blocks = content
        else:
            content_blocks = [content]

        # Best effort: an unexpected response layout yields no sources
        # instead of aborting the whole extraction.
        try:
            citation_sources = self._find_citation_sources(response, content)
        except (AttributeError, TypeError) as e:
            print(f"Warning: Error parsing response format: {e}")
            citation_sources = []

        text_parts = []
        html_parts = []
        markdown_parts = []
        citations = []
        for block in content_blocks:
            block_type = self._field(block, "type")
            if block_type == "text":
                text = self._field(block, "text", "") or ""
                text_parts.append(text)
                html_parts.append(html.escape(text))
                markdown_parts.append(text)
            elif block_type == "citation":
                cited = self._field(block, "text", "") or ""
                citation_id = self._field(block, "citation_index")
                if citation_id is None:
                    citation_id = self._field(block, "index", len(citations) + 1)
                safe_id = html.escape(str(citation_id))
                text_parts.append(cited)
                html_parts.append(
                    f'<span class="citation" data-citation-id="{safe_id}" title="Citation {safe_id}">{html.escape(cited)}<sup>[{safe_id}]</sup></span>'
                )
                markdown_parts.append(f"{cited}[^{citation_id}]")
                citations.append({
                    "text": cited,
                    "citation_index": citation_id,
                    "start": self._field(block, "start"),
                    "end": self._field(block, "end"),
                })

        full_text = "".join(text_parts)

        # Normalize structured sources and collect their Markdown footnotes.
        processed_sources = []
        footnotes = []
        for source in citation_sources:
            if not isinstance(source, dict) and not hasattr(source, "index"):
                continue
            source_info = {
                "index": self._field(source, "index"),
                "text": self._field(source, "text", ""),
                "document_id": self._field(source, "document_id"),
                "page_number": self._field(source, "page_number"),
            }
            processed_sources.append(source_info)
            if source_info["index"] is not None:
                page_info = f"Page {source_info['page_number']}: " if source_info["page_number"] else ""
                footnotes.append(f"\n\n[^{source_info['index']}]: {page_info}{source_info['text']}")

        if processed_sources:
            html_parts.append("\n<div class='citation-sources'>\n<h3>Sources</h3>\n<ol>\n")
            for source_info in processed_sources:
                page_number = source_info["page_number"]
                page_info = f"Page {html.escape(str(page_number))}: " if page_number else ""
                safe_index = html.escape(str(source_info["index"]))
                safe_text = html.escape(str(source_info["text"]))
                html_parts.append(f"<li id='citation-{safe_index}'>{page_info}{safe_text}</li>\n")
            html_parts.append("</ol>\n</div>")

        html_text = "".join(html_parts)
        markdown_text = "".join(markdown_parts) + "".join(footnotes)

        # Last resort: with no structured citation data at all, parse the
        # bracketed markers the prompts ask the model to emit. Skipped when
        # structured sources exist so footnote indices cannot collide.
        if not citations and not processed_sources and full_text:
            scanned = self._scan_citation_markers(full_text)
            citations = scanned["citations"]
            processed_sources = scanned["sources"]
            html_text = scanned["html"]
            markdown_text = full_text + scanned["footnotes"]

        return {
            "text": full_text,
            "citations": citations,
            "sources": processed_sources,
            "html": html_text,
            "markdown": markdown_text
        }