Crackershoot committed on
Commit
8533b85
·
verified ·
1 Parent(s): 1a0361e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -344
app.py CHANGED
@@ -1,347 +1,102 @@
1
- import logging
2
- import sys
3
- import os
4
- from agno.agent import Agent
5
- from agno.models.openai import OpenAIChat
6
- from agno.knowledge.embedder.openai import OpenAIEmbedder
7
- from agno.tools.duckduckgo import DuckDuckGoTools
8
- from agno.knowledge.knowledge import Knowledge
9
- from agno.vectordb.lancedb import LanceDb, SearchType
10
- import gradio as gr
11
- import fitz # PyMuPDF
12
- from PIL import Image
13
- import io
14
- import requests
15
- import re
16
- import time
17
-
18
- # --- Logging ---
19
- logging.basicConfig(stream=sys.stdout, level=logging.INFO)
20
- logger = logging.getLogger(__name__)
21
-
22
- # --- Secrets ---
23
- OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
24
-
25
- if not OPENAI_API_KEY:
26
- raise ValueError("Missing OPENAI_API_KEY")
27
-
28
- # Create a knowledge base with PDF documents
29
- knowledge = Knowledge(
30
- vector_db=LanceDb(
31
- uri="tmp/lancedb",
32
- table_name="pdf_documents",
33
- search_type=SearchType.vector,
34
- embedder=OpenAIEmbedder(id="text-embedding-3-small"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  )
36
- )
37
-
38
- # Download and add PDFs to knowledge base
39
- pdf_urls = [
40
- "https://media.datacamp.com/cms/working-with-hugging-face.pdf",
41
- "https://media.datacamp.com/cms/ai-agents-cheat-sheet.pdf",
42
- "https://media.datacamp.com/cms/introduction-to-sql-with-ai-1.pdf",
43
- "https://media.datacamp.com/legacy/image/upload/v1719844709/Marketing/Blog/Azure_CLI_Cheat_Sheet.pdf"
44
- ]
45
-
46
- def download_if_needed(url, filename):
47
- if not os.path.exists(filename):
48
- logger.info(f"Downloading {url}...")
49
- response = requests.get(url)
50
- with open(filename, "wb") as f:
51
- f.write(response.content)
52
- logger.info(f"Downloaded {filename} ({len(response.content)} bytes)")
53
-
54
- # Create a directory for PDFs if it doesn't exist
55
- os.makedirs("pdf_cache", exist_ok=True)
56
-
57
- def add_pdfs_to_knowledge():
58
- """Add PDFs to knowledge base using the correct method for the installed agno version"""
59
- contents_to_add = []
60
-
61
- for i, url in enumerate(pdf_urls):
62
- filename = f"pdf_cache/file_{i}.pdf"
63
- try:
64
- download_if_needed(url, filename)
65
- contents_to_add.append({
66
- "path": filename,
67
- "metadata": {"source": url}
68
- })
69
- logger.info(f"Prepared PDF {i+1}: {url}")
70
- except Exception as e:
71
- logger.error(f"Failed to prepare PDF {i+1}: {str(e)}")
72
-
73
- if contents_to_add:
74
- try:
75
- if hasattr(knowledge, 'add_contents'):
76
- knowledge.add_contents(contents_to_add)
77
- logger.info(f"✅ Successfully added {len(contents_to_add)} PDFs using add_contents")
78
- elif hasattr(knowledge, 'add_content'):
79
- for item in contents_to_add:
80
- knowledge.add_content(**item)
81
- logger.info(f"✅ Successfully added {len(contents_to_add)} PDFs using add_content")
82
- else:
83
- from agno.document.reader.pdf_reader import PDFReader
84
- reader = PDFReader()
85
- all_docs = []
86
- for item in contents_to_add:
87
- docs = reader.read(item["path"])
88
- for doc in docs:
89
- doc.metadata = item["metadata"]
90
- all_docs.append(doc)
91
- knowledge.vector_db.insert(documents=all_docs)
92
- logger.info(f"✅ Successfully added {len(all_docs)} document chunks from {len(contents_to_add)} PDFs")
93
-
94
- time.sleep(2)
95
- except Exception as e:
96
- logger.error(f"Failed to add PDFs: {str(e)}")
97
- raise
98
- else:
99
- logger.warning("No PDFs were prepared to add")
100
-
101
- # Add PDFs to knowledge base
102
- add_pdfs_to_knowledge()
103
-
104
- # Define the agent
105
- agent = Agent(
106
- model=OpenAIChat(id="gpt-4.1-mini", temperature=0.2),
107
- description="You are Dox a data expert!",
108
- instructions="""
109
- You are a data professional's assistant named Dox.
110
-
111
- Your primary goal is to answer questions about data, programming, cloud computing, AI/ML, and technology topics.
112
-
113
- Here are your operating procedures:
114
-
115
- 1. **Information Gathering Strategy**:
116
- * **Prioritize Knowledge Base**: First, search your internal knowledge base for the answer.
117
- * **Supplement with Web Search**: If the knowledge base information is outdated, insufficient, or the question is better suited for current web information, use the DuckDuckGo tool to perform web searches to fill in gaps or find the most up-to-date data.
118
- * For general technology questions not in your knowledge base, use web search to provide accurate answers.
119
- * If the question is NOT data-related, you MUST respond with: "Please ask relevant data questions only." and terminate.
120
-
121
- 2. **Response Length Guidelines**:
122
- * For basic questions, keep your answer to a maximum of 300 words.
123
- * For complex questions, extend your answer to a maximum of 500 words.
124
-
125
- 3. **Citation Rules (CRITICAL)**:
126
- * **Knowledge Base Citation**: For any information sourced from your internal knowledge base, you MUST include a citation on a NEW LINE after the answer, starting with "Source: ", followed by the metadata field 'source' to get the hyperlink.
127
- * **Web Search Citation**: For any information obtained from the web using the DuckDuckGo tool, you MUST include a citation on a NEW LINE after the answer, starting with "Online Source: ", followed by the full hyperlink.
128
- * **Final Rule for Citations**: Always end your answers with the appropriate citations, ensuring they are on separate lines as specified. Do NOT mix or combine citation types on a single line.
129
- * ALWAYS cite with links NOT text like "from internal knowledge base"
130
-
131
- 4. **Accuracy and Non-Hallucination**:
132
- * Provide factual and relevant answers based ONLY on the information found in your knowledge base or through web searches.
133
- * NEVER invent or hallucinate information. If an answer cannot be found, state that directly.
134
-
135
- Make sure to follow these instructions precisely.
136
- """,
137
- knowledge=knowledge,
138
- add_datetime_to_context=True,
139
- add_location_to_context=True,
140
- search_knowledge=True,
141
- tools=[DuckDuckGoTools()],
142
- markdown=True
143
- )
144
-
145
- logger.info("Agent initialized successfully")
146
-
147
- # -----------------------------
148
- # Helper Functions
149
- # -----------------------------
150
- def extract_pdf_url_from_text(text):
151
- """Extract PDF URL from text content"""
152
- patterns = [
153
- r'Source:\s*(https?://[^\s]+\.pdf)',
154
- r'Online Source:\s*(https?://[^\s]+\.pdf)',
155
- r'(https?://[^\s]+\.pdf)'
156
- ]
157
-
158
- for pattern in patterns:
159
- match = re.search(pattern, text, re.IGNORECASE)
160
- if match:
161
- return match.group(1)
162
- return None
163
-
164
- def download_pdf_from_url(url):
165
- """Download PDF from URL"""
166
- response = requests.get(url, timeout=30)
167
- response.raise_for_status()
168
- return response.content
169
-
170
- def display_pdf(pdf_url):
171
- """Convert first page of PDF to image for display"""
172
- if not pdf_url:
173
- return None
174
-
175
- try:
176
- logger.info(f"Displaying PDF from: {pdf_url}")
177
- pdf_bytes = download_pdf_from_url(pdf_url)
178
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
179
- page = doc[0]
180
-
181
- zoom = 1.5
182
- mat = fitz.Matrix(zoom, zoom)
183
- pix = page.get_pixmap(matrix=mat)
184
-
185
- img_data = pix.tobytes("png")
186
- img = Image.open(io.BytesIO(img_data))
187
- doc.close()
188
-
189
- return img
190
- except Exception as e:
191
- logger.error(f"Error displaying PDF: {str(e)}")
192
- return None
193
-
194
- # -----------------------------
195
- # Main Chat Function
196
- # -----------------------------
197
- def respond(message, history):
198
- """Process user message and return response with automatic PDF display"""
199
- logger.info(f"Question asked: {message[:100]}...")
200
-
201
- # Get response from agent
202
- response = agent.run(message, use_knowledge=True)
203
- full_content = response.get_content_as_string()
204
-
205
- # Extract PDF URL from the response
206
- pdf_url = extract_pdf_url_from_text(full_content)
207
-
208
- # Update history - Gradio 6.0 expects a list of lists/tuples
209
- history = history or []
210
- history.append([message, full_content])
211
-
212
- return history, pdf_url
213
-
214
- # -----------------------------
215
- # Example Questions
216
- # -----------------------------
217
- examples = [
218
- "What is a transformer model in NLP?",
219
- "How do AI agents work?",
220
- "Explain SQL joins with examples",
221
- "What are the key Azure CLI commands?",
222
- "How to use Hugging Face for text classification?",
223
- "What's the difference between supervised and unsupervised learning?"
224
- ]
225
-
226
- # -----------------------------
227
- # Create Gradio Interface (Gradio 6.0 compatible)
228
- # -----------------------------
229
- with gr.Blocks(title="Dox - Data Expert Assistant") as demo:
230
-
231
- gr.Markdown("""
232
- # 🤖 Dox - Your Data Expert Assistant
233
-
234
- Ask me anything about data science, programming, AI/ML, cloud computing, and technology topics!
235
- I'll search my knowledge base and the web to provide accurate answers with citations.
236
- """)
237
-
238
- with gr.Row():
239
- # Main chat column
240
- with gr.Column(scale=2):
241
- # Chatbot with minimal parameters for Gradio 6.0
242
- chatbot = gr.Chatbot(label="Conversation", height=500)
243
-
244
- with gr.Row():
245
- msg = gr.Textbox(
246
- label="Ask your question",
247
- placeholder="e.g., What is a transformer model in NLP?",
248
- lines=2,
249
- scale=9
250
- )
251
- send_btn = gr.Button("Send 📤", scale=1, variant="primary")
252
-
253
- # Examples section
254
- gr.Markdown("### 💡 Example Questions")
255
- gr.Examples(
256
- examples=examples,
257
- inputs=msg,
258
- label="Click any example to ask"
259
- )
260
-
261
- # PDF preview column
262
- with gr.Column(scale=1):
263
- gr.Markdown("### 📄 Referenced Document")
264
- pdf_display = gr.Image(
265
- label="PDF Preview",
266
- type="pil",
267
- height=450,
268
- visible=False,
269
- show_label=True,
270
- interactive=False
271
- )
272
-
273
- gr.Markdown("""
274
- ---
275
- ### 📌 Tips:
276
- - **PDFs automatically appear** when I cite them
277
- - I prioritize my knowledge base then search the web
278
- - Citations include direct links to sources
279
- - Keep questions data/tech related
280
- """)
281
-
282
- # Hidden state to track current PDF URL
283
- current_pdf_url = gr.State(value=None)
284
-
285
- # Function to handle sending message and updating UI
286
- def send_message(message, history):
287
- if not message or not message.strip():
288
- return history, gr.update(), None
289
-
290
- # Get response and PDF URL
291
- new_history, pdf_url = respond(message, history)
292
-
293
- # Update PDF display
294
- if pdf_url:
295
- pdf_image = display_pdf(pdf_url)
296
- if pdf_image:
297
- return new_history, gr.update(value=pdf_image, visible=True), pdf_url
298
- else:
299
- return new_history, gr.update(visible=False), None
300
- else:
301
- return new_history, gr.update(visible=False), None
302
-
303
- # Function to clear chat
304
- def clear_chat():
305
- return [], gr.update(visible=False), None
306
-
307
- # Wire up events
308
- send_btn.click(
309
- send_message,
310
- inputs=[msg, chatbot],
311
- outputs=[chatbot, pdf_display, current_pdf_url]
312
- ).then(
313
- lambda: "", # Clear input
314
- None,
315
- msg
316
  )
317
-
318
- msg.submit(
319
- send_message,
320
- inputs=[msg, chatbot],
321
- outputs=[chatbot, pdf_display, current_pdf_url]
322
- ).then(
323
- lambda: "",
324
- None,
325
- msg
326
  )
327
-
328
- clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
329
- clear_btn.click(clear_chat, None, [chatbot, pdf_display, current_pdf_url])
330
-
331
- # Add footer
332
- gr.Markdown("""
333
- ---
334
- *Powered by Agno Framework, OpenAI, and LanceDB | Responses include citations from knowledge base and web*
335
- """)
336
-
337
- # -----------------------------
338
- # Run the app
339
- # -----------------------------
340
- if __name__ == "__main__":
341
- logger.info("Starting Gradio interface...")
342
- demo.launch(
343
- share=False,
344
- server_port=7860,
345
- server_name="0.0.0.0",
346
- theme=gr.themes.Soft()
347
- )
 
 
 
 
 
 
 
 
1
+ ===== Application Startup at 2026-04-20 10:40:26 =====
2
+
3
+ INFO Creating table: pdf_documents
4
+ [2026-04-20T10:41:03Z WARN lance::dataset::write::insert] No existing dataset at /app/tmp/lancedb/pdf_documents.lance, it will be created
5
+ INFO:__main__:Downloading https://media.datacamp.com/cms/working-with-hugging-face.pdf...
6
+ INFO:__main__:Downloaded pdf_cache/file_0.pdf (1445572 bytes)
7
+ INFO:__main__:Prepared PDF 1: https://media.datacamp.com/cms/working-with-hugging-face.pdf
8
+ INFO:__main__:Downloading https://media.datacamp.com/cms/ai-agents-cheat-sheet.pdf...
9
+ INFO:__main__:Downloaded pdf_cache/file_1.pdf (2837543 bytes)
10
+ INFO:__main__:Prepared PDF 2: https://media.datacamp.com/cms/ai-agents-cheat-sheet.pdf
11
+ INFO:__main__:Downloading https://media.datacamp.com/cms/introduction-to-sql-with-ai-1.pdf...
12
+ INFO:__main__:Downloaded pdf_cache/file_2.pdf (2641274 bytes)
13
+ INFO:__main__:Prepared PDF 3: https://media.datacamp.com/cms/introduction-to-sql-with-ai-1.pdf
14
+ INFO:__main__:Downloading https://media.datacamp.com/legacy/image/upload/v1719844709/Marketing/Blog/Azure_CLI_Cheat_Sheet.pdf...
15
+ INFO:__main__:Downloaded pdf_cache/file_3.pdf (3420887 bytes)
16
+ INFO:__main__:Prepared PDF 4: https://media.datacamp.com/legacy/image/upload/v1719844709/Marketing/Blog/Azure_CLI_Cheat_Sheet.pdf
17
+ INFO Adding content from path, 8cd3ebb7-9be0-58ca-a79b-f679d9a51ee8, None,
18
+ pdf_cache/file_0.pdf, None
19
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
20
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
21
+ INFO Adding content from path, 4eba8a35-09bf-5fb9-b323-0de8d692bd46, None,
22
+ pdf_cache/file_1.pdf, None
23
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
24
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
25
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
26
+ INFO Adding content from path, f134cc6f-57ae-5d08-8ad3-e32df428c7c2, None,
27
+ pdf_cache/file_2.pdf, None
28
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
29
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
30
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
31
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
32
+ INFO Adding content from path, 57aa3a67-8e67-5cd0-9a48-6d347d69435e, None,
33
+ pdf_cache/file_3.pdf, None
34
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
35
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
36
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
37
+ INFO:__main__:✅ Successfully added 4 PDFs using add_content
38
+ INFO:__main__:Agent initialized successfully
39
+ INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/telemetry/https%3A/api.gradio.app/gradio-initiated-analytics "HTTP/1.1 200 OK"
40
+ INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
41
+ INFO:__main__:Starting Gradio interface...
42
+ * Running on local URL: http://0.0.0.0:7860, with SSR ⚡ (experimental, to disable set `ssr_mode=False` in `launch()`)
43
+ INFO:httpx:HTTP Request: GET http://localhost:7860/gradio_api/startup-events "HTTP/1.1 200 OK"
44
+ INFO:httpx:HTTP Request: HEAD http://0.0.0.0:7861/ "HTTP/1.1 200 OK"
45
+ INFO:httpx:HTTP Request: HEAD http://localhost:7860/ "HTTP/1.1 200 OK"
46
+ * To create a public link, set `share=True` in `launch()`.
47
+ INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/telemetry/https%3A/api.gradio.app/gradio-launched-telemetry "HTTP/1.1 200 OK"
48
+ INFO:httpx:HTTP Request: GET http://0.0.0.0:7861/?__theme=system "HTTP/1.1 200 OK"
49
+ INFO:__main__:Question asked: How do AI agents work?...
50
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
51
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
52
+ INFO Found 10 documents
53
+ INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
54
+ INFO:httpx:HTTP Request: POST https://os-api.agno.com/telemetry/runs "HTTP/2 201 Created"
55
+ INFO:__main__:Displaying PDF from: https://media.datacamp.com/cms/ai-agents-cheat-sheet.pdf
56
+ Traceback (most recent call last):
57
+ File "/usr/local/lib/python3.13/site-packages/gradio/queueing.py", line 856, in process_events
58
+ response = await route_utils.call_process_api(
59
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
60
+ ...<5 lines>...
61
  )
62
+ ^
63
+ File "/usr/local/lib/python3.13/site-packages/gradio/route_utils.py", line 358, in call_process_api
64
+ output = await app.get_blocks().process_api(
65
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
66
+ ...<11 lines>...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  )
68
+ ^
69
+ File "/usr/local/lib/python3.13/site-packages/gradio/blocks.py", line 2192, in process_api
70
+ data = await self.postprocess_data(
71
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
72
+ block_fn, result["prediction"], state
73
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 
 
74
  )
75
+ ^
76
+ File "/usr/local/lib/python3.13/site-packages/gradio/blocks.py", line 1955, in postprocess_data
77
+ prediction_value = await anyio.to_thread.run_sync(
78
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
79
+ block.postprocess, prediction_value, limiter=self.limiter
80
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
81
+ )
82
+ ^
83
+ File "/usr/local/lib/python3.13/site-packages/anyio/to_thread.py", line 63, in run_sync
84
+ return await get_async_backend().run_sync_in_worker_thread(
85
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
86
+ func, args, abandon_on_cancel=abandon_on_cancel, limiter=limiter
87
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
88
+ )
89
+ ^
90
+ File "/usr/local/lib/python3.13/site-packages/anyio/_backends/_asyncio.py", line 2518, in run_sync_in_worker_thread
91
+ return await future
92
+ ^^^^^^^^^^^^
93
+ File "/usr/local/lib/python3.13/site-packages/anyio/_backends/_asyncio.py", line 1002, in run
94
+ result = context.run(func, *args)
95
+ File "/usr/local/lib/python3.13/site-packages/gradio/components/chatbot.py", line 704, in postprocess
96
+ self._check_format(value)
97
+ ~~~~~~~~~~~~~~~~~~^^^^^^^
98
+ File "/usr/local/lib/python3.13/site-packages/gradio/components/chatbot.py", line 402, in _check_format
99
+ raise Error(
100
+ "Data incompatible with messages format. Each message should be a dictionary with 'role' and 'content' keys or a ChatMessage object."
101
+ )
102
+ gradio.exceptions.Error: "Data incompatible with messages format. Each message should be a dictionary with 'role' and 'content' keys or a ChatMessage object."