mmrech committed on
Commit
b67f906
·
verified ·
1 Parent(s): bbc2ddf
Files changed (2) hide show
  1. app.py +195 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import anthropic
4
+ import os
5
+ import base64
6
+ import fitz # PyMuPDF
7
+ import json
8
+ import tempfile
9
# It's recommended to load the API key from secrets when deploying.
# For Hugging Face Spaces, you would set this as a secret in your Space settings.
try:
    # The Colab secrets helper is optional: guard the import so the module can
    # still be loaded in environments where the google.colab package is absent.
    from google.colab import userdata
except ImportError:
    userdata = None

ANTHROPIC_API_KEY = None
if userdata is not None:
    try:
        ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY')
    except Exception:
        # Best-effort lookup: any failure reading the Colab secret (missing
        # secret, access not granted, ...) falls through to the environment.
        ANTHROPIC_API_KEY = None

# Fall back to the process environment when no Colab secret was obtained.
if not ANTHROPIC_API_KEY:
    ANTHROPIC_API_KEY = os.environ.get('ANTHROPIC_API_KEY')

client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
19
+
20
+ # Helper Functions from the notebook
21
def visualize_raw_response(response):
    """Serialize the text blocks of an API response to pretty-printed JSON.

    Only blocks whose ``type`` is ``"text"`` are kept. Each becomes a dict with
    ``"type"`` and ``"text"`` keys, plus a ``"citations"`` list (one
    ``vars()`` dump per citation object) when the block carries a non-empty
    ``citations`` attribute.

    Args:
        response: Object exposing an iterable ``content`` attribute.

    Returns:
        A JSON string, indented by two spaces, of the form
        ``{"content": [...]}``.
    """

    def _as_dict(item):
        # Plain-dict view of one text block.
        entry = {"type": "text", "text": item.text}
        cites = getattr(item, 'citations', None)
        if cites:
            entry["citations"] = [vars(cite) for cite in cites]
        return entry

    text_blocks = [
        _as_dict(item) for item in response.content if item.type == "text"
    ]
    return json.dumps({"content": text_blocks}, indent=2)
30
+
31
def format_citations(response):
    """Render a response as text with numbered citation markers and a key.

    Each text block's text is followed by a `` [n]`` marker for every citation
    attached to that block. A given (document title, normalized cited text)
    pair always maps to the same number. The numbered reference list is
    appended after a blank line.

    Args:
        response: Object exposing an iterable ``content`` attribute, or a
            falsy value.

    Returns:
        ``""`` for a falsy ``response``; otherwise the annotated text,
        ``"\\n\\n"``, and one reference line per distinct citation.
    """
    if not response:
        return ""

    def _position(cite):
        # Sort key: the first truthy start offset among the char / page /
        # block variants.
        return (
            getattr(cite, 'start_char_index', 0)
            or getattr(cite, 'start_page_number', 0)
            or getattr(cite, 'start_block_index', 0)
        )

    numbering = {}    # citation key -> assigned reference number
    references = []   # human-readable reference lines, in assignment order
    pieces = []       # annotated text of each text block, in order

    for block in response.content:
        if block.type != "text":
            continue
        segment = block.text
        block_citations = getattr(block, 'citations', None)
        if block_citations:
            for cite in sorted(block_citations, key=_position):
                title = cite.document_title
                # str.split() with no argument already treats newlines and
                # carriage returns as separators, so this collapses every
                # whitespace run to a single space.
                quoted = ' '.join(cite.cited_text.split())
                key = f"{title}:{quoted}"
                if key not in numbering:
                    number = len(numbering) + 1
                    numbering[key] = number
                    references.append(
                        f"[{number}] \"{quoted}\" found in \"{title}\""
                    )
                segment += f" [{numbering[key]}]"
        pieces.append(segment)

    return "".join(pieces) + "\n\n" + "\n".join(references)
55
+
56
def process_documents(doc_type, file_paths):
    """Read files and wrap each one as a citation-enabled document block.

    Args:
        doc_type: ``'Plain Text'``, ``'PDF'`` or ``'Custom Content'``. Any
            other value yields no documents (the files are still read).
        file_paths: Iterable of paths to read; a falsy value yields ``[]``.

    Returns:
        A list of dicts with ``"type"``, ``"source"``, ``"title"`` (the file's
        base name) and ``"citations"`` keys, one per file, in input order.
    """
    documents = []
    for path in file_paths or ():
        with open(path, 'rb') as handle:
            raw = handle.read()

        # Only the "source" payload differs between document types.
        if doc_type == 'Plain Text':
            source = {
                "type": "text",
                "media_type": "text/plain",
                "data": raw.decode('utf-8'),
            }
        elif doc_type == 'PDF':
            source = {
                "type": "base64",
                "media_type": "application/pdf",
                "data": base64.b64encode(raw).decode('utf-8'),
            }
        elif doc_type == 'Custom Content':
            source = {
                "type": "content",
                "content": [{"type": "text", "text": raw.decode('utf-8')}],
            }
        else:
            continue

        documents.append({
            "type": "document",
            "source": source,
            "title": os.path.basename(path),
            "citations": {"enabled": True},
        })
    return documents
70
+
71
def get_anthropic_response(documents, question):
    """Send the documents plus the question to the model in one user turn.

    Args:
        documents: List of document content blocks; placed before the question.
        question: The user's question text.

    Returns:
        The object returned by ``client.messages.create``, or ``None`` when
        either argument is falsy or the call raises (the error is printed).
    """
    if not (documents and question):
        return None

    question_block = {"type": "text", "text": question}
    try:
        return client.messages.create(
            model="claude-3-5-sonnet-latest",
            temperature=0.0,
            max_tokens=1024,
            messages=[
                {"role": "user", "content": documents + [question_block]},
            ],
        )
    except Exception as exc:
        print(f"An error occurred: {exc}")
        return None
81
+
82
def highlight_pdf(response, pdf_path):
    """Highlight every PDF-located citation of ``response`` in a copy of the PDF.

    Changes from the previous version:
      * the document is closed even when searching or saving raises;
      * the result is written to a unique temporary ``.pdf`` file instead of a
        fixed ``highlighted_output.pdf`` in the working directory, so
        overlapping requests no longer overwrite each other's output;
      * citations whose text is empty after clean-up are skipped;
      * a missing ``pdf_path`` returns ``None`` instead of opening an empty
        document;
      * ``end_page_number`` is treated as exclusive, which is how Anthropic's
        citations documentation describes page locations; the start page is
        always searched regardless.

    Args:
        response: Object exposing an iterable ``content`` attribute whose
            items may carry a ``citations`` list, or a falsy value.
        pdf_path: Path of the PDF the citations refer to.

    Returns:
        Path of the highlighted copy, or ``None`` when there is no response,
        no ``pdf_path``, or no citation of type ``"page_location"``.
    """
    if not response or not pdf_path:
        return None

    pdf_citations = [
        citation
        for content in response.content
        for citation in (getattr(content, 'citations', None) or ())
        if citation.type == "page_location"
    ]
    if not pdf_citations:
        return None

    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
        for citation in pdf_citations:
            # Strip the U+0002 control characters that can appear in the
            # cited text before searching for it on the page.
            text_to_find = citation.cited_text.replace('\u0002', '').strip()
            if not text_to_find:
                continue

            # Page numbers are 1-indexed; convert to 0-indexed and clamp to
            # the document. The end is exclusive, but always look at the
            # start page at least.
            first_page = max(citation.start_page_number - 1, 0)
            stop_page = max(citation.end_page_number - 1, first_page + 1)
            for page_num in range(first_page, min(stop_page, page_count)):
                page = doc[page_num]
                for inst in page.search_for(text_to_find):
                    highlight = page.add_highlight_annot(inst)
                    highlight.set_colors({"stroke": (1, 1, 0)})  # yellow
                    highlight.update()

        # Reserve a unique output name; the handle is closed before saving so
        # the path can be reopened for writing on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            output_pdf_path = tmp.name
        doc.save(output_pdf_path)
    finally:
        doc.close()
    return output_pdf_path
105
+
106
def annotate_pdf(pdf_path, annotation_text, page_number):
    """Write ``annotation_text`` in red onto one page of a copy of the PDF.

    The output path is built with ``os.path.splitext`` by inserting
    ``_annotated`` before the extension. The previous
    ``str.replace(".pdf", ...)`` rewrote every ``.pdf`` occurrence in the path
    and left the path unchanged for names such as ``report.PDF``, so the save
    targeted the file that was still open. For ordinary ``name.pdf`` inputs
    the result is the same as before. The document is now closed even if
    drawing or saving raises.

    Args:
        pdf_path: Path of the PDF to annotate.
        annotation_text: Text placed in a fixed box near the top-left of the
            page.
        page_number: 1-indexed page to annotate.

    Returns:
        Path of the annotated copy, or ``None`` when ``pdf_path`` is falsy,
        does not exist, or ``page_number`` is outside the document.
    """
    if not pdf_path or not os.path.exists(pdf_path):
        return None

    root, ext = os.path.splitext(pdf_path)
    output_pdf_path = f"{root}_annotated{ext or '.pdf'}"

    doc = fitz.open(pdf_path)
    try:
        page_index = page_number - 1
        if not 0 <= page_index < len(doc):
            return None
        page = doc[page_index]
        rect = fitz.Rect(50, 50, 400, 100)
        page.insert_textbox(rect, annotation_text, fontsize=12, color=(1, 0, 0))
        doc.save(output_pdf_path)
    finally:
        doc.close()
    return output_pdf_path
118
+
119
def process_and_display(doc_type, question, files, load_samples, annotation_text, annotation_page):
    """Gradio callback: query the model about the uploads and build all outputs.

    Args:
        doc_type: Document type selected in the UI.
        question: The user's question.
        files: Uploaded file objects exposing a ``name`` path attribute, or a
            falsy value.
        load_samples: When truthy, the sample-data branch is taken. It is
            currently a stub with no sample files configured.
        annotation_text: Optional text to stamp onto the PDF.
        annotation_page: Optional 1-indexed page number for the annotation.

    Returns:
        An 8-tuple matching the interface outputs: formatted response text,
        raw response as a dict, highlighted PDF path, original PDF path, path
        of the saved formatted response, path of the saved raw response, and
        the final PDF path twice. On failure the first item is an error
        message, the second is ``{}`` and the rest are ``None``.
    """

    def _failure(message):
        # Same tuple shape as a successful run, with every file slot empty.
        return message, {}, None, None, None, None, None, None

    if load_samples:
        # This part needs to be adapted for a deployed environment
        # as it relies on a local 'data' directory structure.
        # For deployment, you'd package these files with your app.
        question = "Sample question"
        file_names = []  # Add paths to sample files here
    elif files:
        file_names = [uploaded.name for uploaded in files]
    else:
        file_names = []

    if not file_names:
        return _failure("Please upload documents or load sample data.")

    is_pdf = doc_type == 'PDF'
    # Only the first uploaded PDF is used for highlighting and annotation.
    original_pdf_path = file_names[0] if is_pdf else None

    documents = process_documents(doc_type, file_names)
    response = get_anthropic_response(documents, question)
    if not response:
        return _failure("Failed to get response from API.")

    formatted_response = format_citations(response)
    raw_response_json_str = visualize_raw_response(response)
    raw_response_json = json.loads(raw_response_json_str)

    highlighted_pdf_path = None
    annotated_pdf_path = None
    if is_pdf:
        highlighted_pdf_path = highlight_pdf(response, original_pdf_path)
        if annotation_text and annotation_page:
            # Annotate the highlighted copy when there is one.
            pdf_to_annotate = highlighted_pdf_path or original_pdf_path
            if pdf_to_annotate:
                annotated_pdf_path = annotate_pdf(
                    pdf_to_annotate, annotation_text, int(annotation_page)
                )

    # Persist both renderings so they can be offered as downloads.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding='utf-8') as text_file:
        text_file.write(formatted_response)
        formatted_response_path = text_file.name
    with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w", encoding='utf-8') as json_file:
        json_file.write(raw_response_json_str)
        raw_response_path = json_file.name

    final_pdf_path = annotated_pdf_path or highlighted_pdf_path

    return (
        formatted_response,
        raw_response_json,
        highlighted_pdf_path,
        original_pdf_path,
        formatted_response_path,
        raw_response_path,
        final_pdf_path,
        final_pdf_path,
    )
167
+
168
+
169
+ # Gradio Interface
170
# Input widgets, in the positional order of process_and_display's parameters.
_INPUT_COMPONENTS = [
    gr.Radio(['Plain Text', 'PDF', 'Custom Content'], label="Document Type"),
    gr.Textbox(lines=2, placeholder="Enter your question here...", label="Question"),
    gr.File(file_count="multiple", label="Upload Documents"),
    gr.Checkbox(label="Load Sample Data (requires data folder)"),
    gr.Textbox(lines=2, placeholder="Enter annotation text...", label="Annotation Text"),
    gr.Number(label="Annotation Page Number", precision=0),
]

# Output widgets, in the order of the tuple returned by process_and_display.
_OUTPUT_COMPONENTS = [
    gr.Textbox(label="Formatted Response"),
    gr.JSON(label="Raw API Response"),
    gr.File(label="Highlighted PDF"),
    gr.File(label="Original PDF"),
    gr.File(label="Download Formatted Response"),
    gr.File(label="Download Raw Response"),
    gr.File(label="Download Highlighted PDF"),
    gr.File(label="Final Annotated PDF"),
]

iface = gr.Interface(
    fn=process_and_display,
    inputs=_INPUT_COMPONENTS,
    outputs=_OUTPUT_COMPONENTS,
    title="Anthropic Citations API Explorer",
    description="Explore Anthropic's citation capabilities. Upload documents, ask questions, see cited responses, and add your own annotations.",
)

# Start the web UI only when the file is run as a script.
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ gradio
3
+ anthropic
4
+ PyMuPDF