Crackershoot committed on
Commit
868bb60
·
verified ·
1 Parent(s): a467e76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +214 -138
app.py CHANGED
@@ -1,199 +1,275 @@
1
- '''
2
- Dox - The Data Professional's Advisor
3
- '''
4
  import logging
5
  import sys
6
  import os
7
- import re
8
- import io
9
- import requests
10
- import gradio as gr
11
- import fitz # PyMuPDF
12
- from PIL import Image
13
-
14
  from agno.agent import Agent
15
  from agno.models.openai import OpenAIChat
16
  from agno.knowledge.embedder.openai import OpenAIEmbedder
17
  from agno.tools.duckduckgo import DuckDuckGoTools
18
  from agno.knowledge.knowledge import Knowledge
19
  from agno.vectordb.lancedb import LanceDb, SearchType
 
 
 
 
 
 
20
 
21
- # Import from our new config file
22
- import config
23
-
24
- # --- Setup ---
25
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
26
  logger = logging.getLogger(__name__)
27
 
28
- if not config.OPENAI_API_KEY:
29
- raise ValueError("Missing OPENAI_API_KEY. Please set it as an environment variable.")
30
 
31
- # --- Knowledge Base Initialization ---
32
- def initialize_knowledge_base():
33
- """Initializes and returns the knowledge base, populating it if necessary."""
34
- os.makedirs(config.PDF_CACHE_DIR, exist_ok=True)
35
-
36
- vector_db = LanceDb(
37
- uri=config.LANCEDB_URI,
38
- table_name=config.LANCEDB_TABLE,
39
- embedder=OpenAIEmbedder(id=config.EMBEDDER_ID),
 
40
  )
41
- knowledge = Knowledge(vector_db=vector_db)
42
 
43
- # Optimization: Check if the knowledge base is already populated
44
- if vector_db.table and len(vector_db.table) > 0:
45
- logger.info(f"✅ Knowledge base '{config.LANCEDB_TABLE}' is already populated with {len(vector_db.table)} entries.")
46
- return knowledge
 
 
 
47
 
48
- logger.info("Knowledge base is empty. Populating from sources...")
49
-
 
 
 
 
 
 
 
 
 
 
 
 
50
  contents_to_add = []
51
- for name, url in config.PDF_SOURCES.items():
52
- filename = os.path.join(config.PDF_CACHE_DIR, f"{name.replace(' ', '_')}.pdf")
 
53
  try:
54
- if not os.path.exists(filename):
55
- logger.info(f"Downloading {url}...")
56
- response = requests.get(url, timeout=30)
57
- response.raise_for_status()
58
- with open(filename, "wb") as f:
59
- f.write(response.content)
60
-
61
  contents_to_add.append({
62
  "path": filename,
63
- "metadata": {"source": url, "document_name": name}
64
  })
65
- except requests.RequestException as e:
66
- logger.error(f"Failed to download {url}: {e}")
67
-
 
68
  if contents_to_add:
69
  try:
70
- # Using add_contents which is recommended for batch processing
71
- knowledge.add_content(contents_to_add)
72
- logger.info(f"✅ Successfully added {len(contents_to_add)} PDFs to the knowledge base.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  except Exception as e:
74
- logger.error(f"Failed to add PDFs to knowledge base: {e}")
75
  raise
76
-
77
- return knowledge
78
 
79
- # --- Agent Definition ---
80
- def create_agent(knowledge):
81
- """Creates and returns the Dox agent."""
82
- instructions = """
 
 
 
 
83
  You are a data professional's assistant named Dox.
 
84
  Your primary goal is to answer questions about data, programming, cloud computing, AI/ML, and technology topics.
 
85
  Here are your operating procedures:
 
86
  1. **Information Gathering Strategy**:
87
- * **Prioritize Knowledge Base**: First, search your internal knowledge base for the answer. Your knowledge base contains cheat sheets on Hugging Face, AI Agents, SQL, and Azure CLI.
88
- * **Supplement with Web Search**: If the knowledge base is insufficient or the question requires current web information, use the DuckDuckGo tool.
89
- 2. **Response Guidelines**:
90
- * Keep answers concise and to the point (max 300 words for simple, 500 for complex).
 
 
 
 
 
91
  3. **Citation Rules (CRITICAL)**:
92
- * For information from your knowledge base, cite the source hyperlink on a NEW LINE: `Source: [URL]`
93
- * For information from the web, cite the source on a NEW LINE: `Online Source: [URL]`
94
- * Always end your answers with the appropriate citations.
95
- 4. **Accuracy**:
96
- * Provide factual answers based ONLY on retrieved information. If an answer cannot be found, state that directly.
97
- """
98
- return Agent(
99
- model=OpenAIChat(id=config.AGENT_MODEL, temperature=config.AGENT_TEMP),
100
- description="You are Dox, a data expert!",
101
- instructions=instructions,
102
- knowledge=knowledge,
103
- add_datetime_to_context=True,
104
- search_knowledge=True,
105
- tools=[DuckDuckGoTools()],
106
- markdown=True
107
- )
 
 
108
 
109
- # --- Main Application Logic ---
110
- knowledge_base = initialize_knowledge_base()
111
- agent = create_agent(knowledge_base)
112
 
113
- def ask_agent_and_display(question):
114
- """Runs the agent and processes the response for the UI."""
 
 
115
  logger.info(f"Question asked: {question[:100]}...")
116
  response = agent.run(question, use_knowledge=True)
117
  full_content = response.get_content_as_string()
118
-
119
- # Improved regex to find any URL for citation
120
- match = re.search(r'https?://[^\s)]+', full_content)
121
  link = match.group(0) if match else None
122
 
123
  if link:
124
- logger.info(f"Link found in response: {link}")
125
-
126
- # UI updates
127
- return (
128
- full_content,
129
- link,
130
- gr.update(visible=link is not None),
131
- gr.update(value=None, visible=False)
132
- )
 
 
 
 
133
 
 
 
 
134
  def display_pdf(pdf_url):
135
- """Downloads and renders the first page of a PDF from a URL."""
136
  if not pdf_url:
137
  return gr.update(visible=False)
138
 
139
  try:
140
  logger.info(f"Displaying PDF from: {pdf_url}")
141
- pdf_bytes = requests.get(pdf_url, timeout=30).content
142
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
143
  page = doc[0]
144
 
145
- # Lower zoom for faster rendering
146
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
 
147
 
148
  img_data = pix.tobytes("png")
149
  img = Image.open(io.BytesIO(img_data))
150
  doc.close()
151
 
 
152
  return gr.update(value=img, visible=True)
153
  except Exception as e:
154
- logger.error(f"Error displaying PDF: {e}")
155
  return gr.update(value=None, visible=False)
156
 
157
- # --- Gradio UI ---
158
- def build_ui():
159
- """Builds and returns the Gradio UI."""
160
- with gr.Blocks(title="Dox the Data Advisor", theme=gr.themes.Ocean()) as demo:
161
- gr.Markdown("# 🌊 Dox the Data Professional's Advisor 🤖")
162
- gr.Markdown("🧠 Dox has DataCamp cheat sheets on Hugging Face, AI Agents, SQL with AI, and the Azure CLI.")
163
-
164
- with gr.Row():
165
- with gr.Column(scale=2):
166
- question_box = gr.Textbox(label="Ask Dox a question:", lines=2, placeholder="e.g., What are the 'core four' Hugging Face libraries?")
167
- gr.Examples(
168
- examples=[
169
- "How do you log into Azure using device code authentication?",
170
- "What are the three main components of an AI agent?",
171
- "What are the 'core four' Hugging Face libraries?",
172
- "What SQL clause is used to filter data after grouping?"
173
- ],
174
- inputs=question_box
175
- )
176
- submit_btn = gr.Button("Submit", variant="primary")
177
- answer_box = gr.Markdown(label="Answer:", container=True)
178
-
179
- with gr.Column(scale=2):
180
- pdf_link_state = gr.State()
181
- show_pdf_btn = gr.Button("Show Source PDF", visible=False, variant="secondary")
182
- pdf_preview = gr.Image(label="PDF Preview (Page 1)", visible=False)
183
-
184
- submit_btn.click(
185
- ask_agent_and_display,
186
- inputs=question_box,
187
- outputs=[answer_box, pdf_link_state, show_pdf_btn, pdf_preview]
188
- )
189
-
190
- show_pdf_btn.click(
191
- display_pdf,
192
- inputs=pdf_link_state,
193
- outputs=pdf_preview
194
- )
195
- return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  if __name__ == "__main__":
198
- ui = build_ui()
199
- ui.launch()
 
 
 
 
1
  import logging
2
  import sys
3
  import os
 
 
 
 
 
 
 
4
  from agno.agent import Agent
5
  from agno.models.openai import OpenAIChat
6
  from agno.knowledge.embedder.openai import OpenAIEmbedder
7
  from agno.tools.duckduckgo import DuckDuckGoTools
8
  from agno.knowledge.knowledge import Knowledge
9
  from agno.vectordb.lancedb import LanceDb, SearchType
10
+ import gradio as gr
11
+ import fitz # PyMuPDF
12
+ from PIL import Image
13
+ import io
14
+ import requests
15
+ import re
16
 
17
+ # --- Logging ---
 
 
 
18
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
 
21
+ # --- Secrets ---
22
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
23
 
24
+ if not OPENAI_API_KEY:
25
+ raise ValueError("Missing OPENAI_API_KEY")
26
+
27
+ # Create a knowledge base with PDF documents
28
+ knowledge = Knowledge(
29
+ vector_db=LanceDb(
30
+ uri="tmp/lancedb",
31
+ table_name="pdf_documents",
32
+ search_type=SearchType.vector, # Changed to vector to avoid tantivy dependency
33
+ embedder=OpenAIEmbedder(id="text-embedding-3-small"),
34
  )
35
+ )
36
 
37
+ # Download and add PDFs to knowledge base
38
+ pdf_urls = [
39
+ "https://media.datacamp.com/cms/working-with-hugging-face.pdf",
40
+ "https://media.datacamp.com/cms/ai-agents-cheat-sheet.pdf",
41
+ "https://media.datacamp.com/cms/introduction-to-sql-with-ai-1.pdf",
42
+ "https://media.datacamp.com/legacy/image/upload/v1719844709/Marketing/Blog/Azure_CLI_Cheat_Sheet.pdf"
43
+ ]
44
 
45
def download_if_needed(url, filename, timeout=30):
    """Download ``url`` to ``filename`` unless that file already exists.

    Args:
        url: HTTP(S) address of the file to fetch.
        filename: Local path the response body is written to.
        timeout: Seconds to wait on the server before giving up. Optional;
            existing two-argument callers are unaffected.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        OSError: If the cache file cannot be written.
    """
    if os.path.exists(filename):
        return

    logger.info(f"Downloading {url}...")
    response = requests.get(url, timeout=timeout)
    # Without this check an HTTP error page would be saved under the cache
    # name and, because the file then exists, never be downloaded again.
    response.raise_for_status()

    # Write to a sibling file and rename it into place so an interrupted
    # write cannot leave a truncated file that passes the exists() check.
    partial = f"{filename}.part"
    with open(partial, "wb") as f:
        f.write(response.content)
    os.replace(partial, filename)
    logger.info(f"Downloaded {filename} ({len(response.content)} bytes)")
52
+
53
+ # Create a directory for PDFs if it doesn't exist
54
+ os.makedirs("pdf_cache", exist_ok=True)
55
+
56
+ # Method 1: Try using knowledge.add_content (newer agno versions)
57
def add_pdfs_to_knowledge():
    """Cache every URL in ``pdf_urls`` locally and load the files into ``knowledge``.

    Each URL is downloaded (if not already cached) to
    ``pdf_cache/file_<index>.pdf``; a failure for one URL is logged and that
    URL is skipped. The prepared files are then handed to the knowledge base
    using whichever entry point the ``knowledge`` object exposes, checked in
    this order: ``add_contents``, ``add_content``, and finally a direct
    ``vector_db.insert`` of documents read by ``PDFReader``.

    Raises:
        Exception: Whatever the ingestion step raises; it is logged and
            re-raised unchanged.
    """
    contents_to_add = []

    for i, url in enumerate(pdf_urls):
        filename = f"pdf_cache/file_{i}.pdf"
        try:
            download_if_needed(url, filename)
            entry = {"path": filename, "metadata": {"source": url}}
            contents_to_add.append(entry)
            logger.info(f"Prepared PDF {i+1}: {url}")
        except Exception as e:
            # Best effort: one bad URL must not block the remaining PDFs.
            logger.error(f"Failed to prepare PDF {i+1}: {str(e)}")

    # Nothing could be prepared, so there is nothing to ingest.
    if not contents_to_add:
        logger.warning("No PDFs were prepared to add")
        return

    try:
        if hasattr(knowledge, 'add_contents'):
            # Batch entry point, preferred when the object offers it.
            knowledge.add_contents(contents_to_add)
            logger.info(f"✅ Successfully added {len(contents_to_add)} PDFs using add_contents")
        elif hasattr(knowledge, 'add_content'):
            # Single-item entry point: one call per prepared file.
            for entry in contents_to_add:
                knowledge.add_content(**entry)
            logger.info(f"✅ Successfully added {len(contents_to_add)} PDFs using add_content")
        else:
            # Fallback: read the PDFs ourselves and insert into the vector DB.
            # NOTE(review): this import path presumably targets an older agno
            # layout - confirm it exists in the installed version.
            from agno.document.reader.pdf_reader import PDFReader
            pdf_reader = PDFReader()
            all_docs = []
            for entry in contents_to_add:
                for chunk in pdf_reader.read(entry["path"]):
                    chunk.metadata = entry["metadata"]
                    all_docs.append(chunk)
            knowledge.vector_db.insert(documents=all_docs)
            logger.info(f"✅ Successfully added {len(all_docs)} document chunks from {len(contents_to_add)} PDFs")
    except Exception as e:
        logger.error(f"Failed to add PDFs: {str(e)}")
        raise
100
 
101
+ # Add PDFs to knowledge base
102
+ add_pdfs_to_knowledge()
103
+
104
+ # Define the agent
105
+ agent = Agent(
106
+ model=OpenAIChat(id="gpt-4.1-mini", temperature=0.2),
107
+ description="You are Dox a data expert!",
108
+ instructions="""
109
  You are a data professional's assistant named Dox.
110
+
111
  Your primary goal is to answer questions about data, programming, cloud computing, AI/ML, and technology topics.
112
+
113
  Here are your operating procedures:
114
+
115
  1. **Information Gathering Strategy**:
116
+ * **Prioritize Knowledge Base**: First, search your internal knowledge base for the answer.
117
+ * **Supplement with Web Search**: If the knowledge base information is outdated, insufficient, or the question is better suited for current web information, use the DuckDuckGo tool to perform web searches to fill in gaps or find the most up-to-date data.
118
+ * For general technology questions not in your knowledge base, use web search to provide accurate answers.
119
+ * If the question is NOT data-related, you MUST respond with: "Please ask relevant data questions only." and terminate.
120
+
121
+ 2. **Response Length Guidelines**:
122
+ * For basic questions, keep your answer to a maximum of 300 words.
123
+ * For complex questions, extend your answer to a maximum of 500 words.
124
+
125
  3. **Citation Rules (CRITICAL)**:
126
+ * **Knowledge Base Citation**: For any information sourced from your internal knowledge base, you MUST include a citation on a NEW LINE after the answer, starting with "Source: ", followed by the metadata field 'source' to get the hyperlink.
127
+ * **Web Search Citation**: For any information obtained from the web using the DuckDuckGo tool, you MUST include a citation on a NEW LINE after the answer, starting with "Online Source: ", followed by the full hyperlink.
128
+ * **Final Rule for Citations**: Always end your answers with the appropriate citations, ensuring they are on separate lines as specified. Do NOT mix or combine citation types on a single line.
129
+ * ALWAYS cite with links NOT text like "from internal knowledge base"
130
+
131
+ 4. **Accuracy and Non-Hallucination**:
132
+ * Provide factual and relevant answers based ONLY on the information found in your knowledge base or through web searches.
133
+ * NEVER invent or hallucinate information. If an answer cannot be found, state that directly.
134
+
135
+ Make sure to follow these instructions precisely.
136
+ """,
137
+ knowledge=knowledge,
138
+ add_datetime_to_context=True,
139
+ add_location_to_context=True,
140
+ search_knowledge=True,
141
+ tools=[DuckDuckGoTools()],
142
+ markdown=True
143
+ )
144
 
145
+ logger.info("Agent initialized successfully")
 
 
146
 
147
+ # -----------------------------
148
+ # Your agent function
149
+ # -----------------------------
150
def extract_pdf_link(text):
    """Return the first ``http(s)://...pdf`` URL in ``text``, or ``None``.

    Brackets, parentheses, angle brackets, quotes and backticks end the URL,
    so a link wrapped in Markdown (``[url](url)``, ``(url)``, `` `url` ``)
    comes back without the surrounding markup.
    """
    found = re.search(r"https?://[^\s<>()\[\]\"'`]+\.pdf", text or "", re.IGNORECASE)
    return found.group(0) if found else None


def ask_agent(question):
    """Run the agent on ``question`` and pull a PDF citation out of the answer.

    Args:
        question: The user's question text.

    Returns:
        A ``(full_content, link)`` tuple: the agent's answer as a string and
        the first PDF URL mentioned in it, or ``None`` when there is none.
    """
    logger.info(f"Question asked: {question[:100]}...")
    response = agent.run(question, use_knowledge=True)
    full_content = response.get_content_as_string()

    # The previous greedy pattern ran across Markdown link syntax and
    # returned "url.pdf](url.pdf" for "[url.pdf](url.pdf)".
    link = extract_pdf_link(full_content)

    if link:
        logger.info(f"PDF link found: {link}")
    else:
        logger.info("No PDF link found in response")

    return full_content, link
165
+
166
+ # -----------------------------
167
+ # Download PDF
168
+ # -----------------------------
169
def download_pdf_from_url(url):
    """Fetch ``url`` and return the raw response body as bytes.

    Waits at most 30 seconds for the server.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    reply = requests.get(url, timeout=30)
    # Surface HTTP errors here instead of handing an error page to the caller.
    reply.raise_for_status()
    payload = reply.content
    return payload
173
 
174
+ # -----------------------------
175
+ # Display PDF
176
+ # -----------------------------
177
def display_pdf(pdf_url):
    """Render page 1 of the PDF at ``pdf_url`` as a Gradio image update.

    Args:
        pdf_url: URL of the PDF to preview; a falsy value hides the preview.

    Returns:
        A ``gr.update`` that shows the rendered first page, or one that hides
        the preview when no URL is given or anything fails along the way.
    """
    if not pdf_url:
        return gr.update(visible=False)

    try:
        logger.info(f"Displaying PDF from: {pdf_url}")
        pdf_bytes = download_pdf_from_url(pdf_url)
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            page = doc[0]

            # Scale factor applied to both axes when rasterising the page.
            zoom = 5
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)

            img_data = pix.tobytes("png")
        finally:
            # Close the document on failure too; previously close() was only
            # reached when rendering succeeded.
            doc.close()

        img = Image.open(io.BytesIO(img_data))

        logger.info("PDF displayed successfully")
        return gr.update(value=img, visible=True)
    except Exception as e:
        # UI boundary: log and hide the preview rather than crash the handler.
        logger.error(f"Error displaying PDF: {str(e)}")
        return gr.update(value=None, visible=False)
200
 
201
+ # -----------------------------
202
+ # UI Wrapper for Blocks
203
+ # -----------------------------
204
def ask_agent_ui(question):
    """Adapt ``ask_agent`` to the four outputs wired to the submit button.

    Returns:
        A 4-tuple: the answer text, the PDF link (or ``None``), an update
        that shows the button only when a link exists, and an update that
        clears and hides the PDF preview.
    """
    answer_text, pdf_link = ask_agent(question)

    has_link = pdf_link is not None
    button_update = gr.update(visible=has_link)
    # Always reset the preview so a stale page from an earlier answer is not shown.
    preview_reset = gr.update(value=None, visible=False)

    return answer_text, pdf_link, button_update, preview_reset
213
+
214
+ # -----------------------------
215
+ # Combined Gradio UI with Blocks and Interface
216
+ # -----------------------------
217
# Page title is plain text (shown in the browser tab), so it carries no
# Markdown "# " heading marker; the heading itself is rendered below.
with gr.Blocks(title="🌊 Dox the Data Professional's Advisor 🤖", theme=gr.themes.Ocean()) as demo:
    gr.Markdown("# 🌊 Dox the Data Professional's Advisor 🤖")
    gr.Markdown("🧠 Dox has 4 DataCamp cheat sheets in its database that you could ask about (1️⃣ Hugging Face | 2️⃣ AI Agents | 3️⃣ SQL with AI | 4️⃣ Azure CLI):")

    # Two equal-width columns: question/answer on the left, PDF preview on the right.
    with gr.Row():
        with gr.Column(scale=2):
            question = gr.Textbox(
                label="Ask Dox a question:",
                lines=2,
                placeholder="Type your question here...",
            )

            gr.Examples(
                examples=[
                    "How do you log into Azure using device code authentication?",
                    "What are the three main components of an AI agent?",
                    "What are the \"core four\" Hugging Face libraries?",
                    "What SQL clause is used to filter data after grouping?",
                ],
                inputs=question,
                label="Example Questions",
            )

            ask_btn = gr.Button("Submit", variant="primary")

            answer = gr.Markdown(
                label="Answer: ",
                render=True,
                container=True,
                elem_id="answer_markdown",
            )

        with gr.Column(scale=2):
            # Holds the PDF URL extracted from the latest answer.
            link_state = gr.State()
            show_btn = gr.Button("Show PDF", visible=False, variant="secondary")
            # display_pdf returns a rasterised page, so the component must
            # use an image format; "pdf" is not one a browser can show here.
            output_image = gr.Image(label="PDF Preview (Page 1)", visible=False, format="png")

    # Submit: run the agent, store the link, toggle the button, reset the preview.
    ask_btn.click(
        ask_agent_ui,
        inputs=question,
        outputs=[answer, link_state, show_btn, output_image],
    )

    # Show PDF: display_pdf already returns the correct visibility (visible on
    # success, hidden on failure). The former follow-up step that forced
    # visible=True un-hid an empty preview after a failed render.
    show_btn.click(
        display_pdf,
        inputs=link_state,
        outputs=output_image,
    )
273
 
274
  if __name__ == "__main__":
275
+ demo.launch()