Spaces:

mmrech
/

citations_app

Paused

App Files Files Community

mmrech committed on Jul 27, 2025

Commit

b67f906

verified ·

1 Parent(s): bbc2ddf

files

Browse files

Files changed (2) hide show

app.py +195 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,195 @@

+import gradio as gr
+import anthropic
+import os
+import base64
+import fitz  # PyMuPDF
+import json
+import tempfile
+# The API key is read from Colab's secret store when the app runs in a Colab
+# notebook, and from the ANTHROPIC_API_KEY environment variable otherwise.
+# For Hugging Face Spaces, set it as a secret in your Space settings.
+try:
+    # google.colab is only importable inside Colab, so the import has to live
+    # inside the try block; at module level it would abort startup elsewhere
+    # before the environment-variable fallback could run.
+    from google.colab import userdata
+    ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY')
+except Exception:
+    ANTHROPIC_API_KEY = os.environ.get('ANTHROPIC_API_KEY')
+client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
+# Helper Functions from the notebook
+def visualize_raw_response(response):
+    """Serialize the text blocks of an API response as pretty-printed JSON.
+
+    Content blocks whose type is not "text" are skipped. A "citations" list
+    (each citation converted with vars()) is attached only to blocks that
+    actually carry citations.
+
+    Returns:
+        A JSON string of the form {"content": [...]} indented by two spaces.
+    """
+    def _as_block(item):
+        entry = {"type": "text", "text": item.text}
+        cites = getattr(item, 'citations', None)
+        if cites:
+            entry["citations"] = [vars(cite) for cite in cites]
+        return entry
+    text_blocks = [_as_block(item) for item in response.content if item.type == "text"]
+    return json.dumps({"content": text_blocks}, indent=2)
+def format_citations(response):
+    """Render a response as text with numbered citation markers plus a reference list.
+
+    Each text block is followed by a " [n]" marker for every citation it
+    carries. Citations are numbered in order of first appearance, and a
+    citation with the same document title and whitespace-normalized quote
+    reuses its number.
+
+    Returns:
+        "" for a falsy response; otherwise the marked-up text, a blank line,
+        and one '[n] "quote" found in "title"' line per distinct citation.
+    """
+    if not response:
+        return ""
+    def _position(cite):
+        # First truthy location attribute wins: char index, then page, then block.
+        return (getattr(cite, 'start_char_index', 0)
+                or getattr(cite, 'start_page_number', 0)
+                or getattr(cite, 'start_block_index', 0))
+    numbering = {}
+    references = []
+    pieces = []
+    for block in response.content:
+        if block.type != "text":
+            continue
+        piece = block.text
+        for cite in sorted(getattr(block, 'citations', None) or [], key=_position):
+            title = cite.document_title
+            # str.split() with no argument already treats \n and \r as whitespace.
+            quote = ' '.join(cite.cited_text.split())
+            key = f"{title}:{quote}"
+            if key not in numbering:
+                numbering[key] = len(numbering) + 1
+                references.append(f"[{numbering[key]}] \"{quote}\" found in \"{title}\"")
+            piece += f" [{numbering[key]}]"
+        pieces.append(piece)
+    return "".join(pieces) + "\n\n" + "\n".join(references)
+def process_documents(doc_type, file_paths):
+    """Read each file and wrap it as a citation-enabled document block.
+
+    Args:
+        doc_type: 'Plain Text', 'PDF' or 'Custom Content'. Files are still
+            read for any other value, but no block is produced for them.
+        file_paths: Paths of the files to read; a falsy value yields [].
+
+    Returns:
+        A list of document dicts titled with each file's base name.
+    """
+    def _source_for(raw):
+        if doc_type == 'Plain Text':
+            return {"type": "text", "media_type": "text/plain", "data": raw.decode('utf-8')}
+        if doc_type == 'PDF':
+            return {"type": "base64", "media_type": "application/pdf", "data": base64.b64encode(raw).decode('utf-8')}
+        if doc_type == 'Custom Content':
+            return {"type": "content", "content": [{"type": "text", "text": raw.decode('utf-8')}]}
+        return None
+    blocks = []
+    for path in file_paths or []:
+        with open(path, 'rb') as handle:
+            raw = handle.read()
+        source = _source_for(raw)
+        if source is not None:
+            blocks.append({"type": "document", "source": source, "title": os.path.basename(path), "citations": {"enabled": True}})
+    return blocks
+def get_anthropic_response(documents, question):
+    """Send the documents followed by the question as one user message.
+
+    Returns:
+        The value returned by client.messages.create, or None when either
+        argument is falsy or when building or sending the request raises
+        (the error is printed, not re-raised).
+    """
+    if not (documents and question):
+        return None
+    try:
+        user_turn = {"role": "user", "content": documents + [{"type": "text", "text": question}]}
+        return client.messages.create(
+            model="claude-3-5-sonnet-latest",
+            temperature=0.0,
+            max_tokens=1024,
+            messages=[user_turn],
+        )
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        return None
+def highlight_pdf(response, pdf_path):
+    """Highlight the passages cited in *response* inside a copy of the PDF.
+
+    Only citations whose type is "page_location" are used. Each cited text is
+    searched on the cited pages and every hit gets a yellow highlight
+    annotation.
+
+    Args:
+        response: API response whose content blocks may carry citations.
+        pdf_path: Path of the PDF the citations refer to.
+
+    Returns:
+        Path of a newly written highlighted PDF, or None when there is no
+        response, no PDF path, or no page_location citation.
+    """
+    if not response or not pdf_path:
+        return None
+    pdf_citations = [
+        c
+        for content in response.content
+        for c in (getattr(content, 'citations', None) or [])
+        if c.type == "page_location"
+    ]
+    if not pdf_citations:
+        return None
+    doc = fitz.open(pdf_path)
+    try:
+        for citation in pdf_citations:
+            text_to_find = citation.cited_text.replace('\u0002', '').strip()
+            if not text_to_find:
+                continue  # nothing searchable is left after cleaning
+            # Page numbers are 1-indexed. NOTE(review): the Anthropic citations
+            # docs describe end_page_number as exclusive - confirm. The bounds
+            # below always include the start page and stop before the page
+            # named by end_page_number, so the page after the citation is not
+            # scanned for stray matches.
+            first_page = max(citation.start_page_number - 1, 0)
+            stop_page = min(max(citation.end_page_number - 1, first_page + 1), len(doc))
+            for page_num in range(first_page, stop_page):
+                page = doc[page_num]
+                for inst in page.search_for(text_to_find):
+                    highlight = page.add_highlight_annot(inst)
+                    highlight.set_colors({"stroke": (1, 1, 0)})
+                    highlight.update()
+        # A unique temp file per call: a fixed name in the working directory
+        # would be overwritten whenever two requests run at the same time.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", prefix="highlighted_") as tmp:
+            output_pdf_path = tmp.name
+        doc.save(output_pdf_path)
+    finally:
+        # Close the document even when searching or saving fails.
+        doc.close()
+    return output_pdf_path
+def annotate_pdf(pdf_path, annotation_text, page_number):
+    """Write *annotation_text* in red near the top of a page and save a copy.
+
+    Args:
+        pdf_path: Path of the PDF to annotate.
+        annotation_text: Text inserted into a fixed box (50, 50)-(400, 100).
+        page_number: 1-indexed page to annotate.
+
+    Returns:
+        Path of the annotated copy ("<name>_annotated.pdf" next to the input),
+        or None when the path is missing, does not exist, or the page number
+        is out of range.
+    """
+    if not pdf_path or not os.path.exists(pdf_path):
+        return None
+    # splitext replaces only the real extension. str.replace(".pdf", ...) would
+    # also rewrite ".pdf" inside directory names and leave the path unchanged
+    # for extensions such as ".PDF", so the save would target the input file.
+    stem, _ = os.path.splitext(pdf_path)
+    output_pdf_path = f"{stem}_annotated.pdf"
+    doc = fitz.open(pdf_path)
+    try:
+        page_index = page_number - 1
+        if not 0 <= page_index < len(doc):
+            return None
+        rect = fitz.Rect(50, 50, 400, 100)
+        doc[page_index].insert_textbox(rect, annotation_text, fontsize=12, color=(1, 0, 0))
+        doc.save(output_pdf_path)
+    finally:
+        # Close the document on every path, including errors.
+        doc.close()
+    return output_pdf_path
+def process_and_display(doc_type, question, files, load_samples, annotation_text, annotation_page):
+    """Run the full pipeline for one request and build the eight UI outputs.
+
+    Args:
+        doc_type: 'Plain Text', 'PDF' or 'Custom Content'.
+        question: Question to ask about the documents.
+        files: Uploaded files (objects exposing .name, or plain path strings).
+        load_samples: When truthy, uploads are ignored in favour of sample
+            data; no sample paths are configured yet, so this currently ends
+            in the "please upload" message.
+        annotation_text: Optional text to stamp onto the PDF.
+        annotation_page: Optional 1-indexed page for the annotation.
+
+    Returns:
+        A tuple of (formatted response text, raw response as a dict,
+        highlighted PDF path, original PDF path, formatted-response file path,
+        raw-response file path, final PDF path, final PDF path). On failure
+        the first item is a message and the remaining items are {} and None.
+    """
+    empty_outputs = ({}, None, None, None, None, None, None)
+    file_names = []
+    if load_samples:
+        # This part needs to be adapted for a deployed environment
+        # as it relies on a local 'data' directory structure.
+        # For deployment, you'd package these files with your app.
+        question = "Sample question"
+        file_names = []  # Add paths to sample files here
+    elif files:
+        # Accept both objects with a .name attribute and plain path strings.
+        file_names = [getattr(f, "name", f) for f in files]
+    if not file_names:
+        return ("Please upload documents or load sample data.", *empty_outputs)
+    # Reject incomplete input here; otherwise it falls through to the
+    # misleading "Failed to get response from API." message.
+    if not doc_type:
+        return ("Please select a document type.", *empty_outputs)
+    if not question or not question.strip():
+        return ("Please enter a question.", *empty_outputs)
+    original_pdf_path = file_names[0] if doc_type == 'PDF' else None
+    documents = process_documents(doc_type, file_names)
+    response = get_anthropic_response(documents, question)
+    if not response:
+        return ("Failed to get response from API.", *empty_outputs)
+    formatted_response = format_citations(response)
+    raw_response_json_str = visualize_raw_response(response)
+    raw_response_json = json.loads(raw_response_json_str)
+    highlighted_pdf_path = None
+    annotated_pdf_path = None
+    if doc_type == 'PDF':
+        highlighted_pdf_path = highlight_pdf(response, original_pdf_path)
+        if annotation_text and annotation_page:
+            # Annotate the highlighted copy when there is one, else the upload.
+            pdf_to_annotate = highlighted_pdf_path or original_pdf_path
+            if pdf_to_annotate:
+                annotated_pdf_path = annotate_pdf(pdf_to_annotate, annotation_text, int(annotation_page))
+    # delete=False keeps the files on disk so the UI can serve them as downloads.
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding='utf-8') as f:
+        f.write(formatted_response)
+        formatted_response_path = f.name
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w", encoding='utf-8') as f:
+        f.write(raw_response_json_str)
+        raw_response_path = f.name
+    final_pdf_path = annotated_pdf_path or highlighted_pdf_path
+    return (
+        formatted_response,
+        raw_response_json,
+        highlighted_pdf_path,
+        original_pdf_path,
+        formatted_response_path,
+        raw_response_path,
+        final_pdf_path,
+        final_pdf_path,
+    )
+# Gradio Interface
+def _build_interface():
+    """Assemble the Gradio UI around process_and_display.
+
+    The input components are listed in the order of process_and_display's
+    parameters, and the output components in the order of its return tuple.
+    """
+    input_components = [
+        gr.Radio(['Plain Text', 'PDF', 'Custom Content'], label="Document Type"),
+        gr.Textbox(lines=2, placeholder="Enter your question here...", label="Question"),
+        gr.File(file_count="multiple", label="Upload Documents"),
+        gr.Checkbox(label="Load Sample Data (requires data folder)"),
+        gr.Textbox(lines=2, placeholder="Enter annotation text...", label="Annotation Text"),
+        gr.Number(label="Annotation Page Number", precision=0),
+    ]
+    output_components = [
+        gr.Textbox(label="Formatted Response"),
+        gr.JSON(label="Raw API Response"),
+        gr.File(label="Highlighted PDF"),
+        gr.File(label="Original PDF"),
+        gr.File(label="Download Formatted Response"),
+        gr.File(label="Download Raw Response"),
+        gr.File(label="Download Highlighted PDF"),
+        gr.File(label="Final Annotated PDF"),
+    ]
+    return gr.Interface(
+        fn=process_and_display,
+        inputs=input_components,
+        outputs=output_components,
+        title="Anthropic Citations API Explorer",
+        description="Explore Anthropic's citation capabilities. Upload documents, ask questions, see cited responses, and add your own annotations.",
+    )
+iface = _build_interface()
+if __name__ == "__main__":
+    iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio
+anthropic
+PyMuPDF