Update app.py
Browse files
app.py
CHANGED
|
@@ -1,199 +1,275 @@
|
|
| 1 |
-
'''
|
| 2 |
-
Dox - The Data Professional's Advisor
|
| 3 |
-
'''
|
| 4 |
import logging
|
| 5 |
import sys
|
| 6 |
import os
|
| 7 |
-
import re
|
| 8 |
-
import io
|
| 9 |
-
import requests
|
| 10 |
-
import gradio as gr
|
| 11 |
-
import fitz # PyMuPDF
|
| 12 |
-
from PIL import Image
|
| 13 |
-
|
| 14 |
from agno.agent import Agent
|
| 15 |
from agno.models.openai import OpenAIChat
|
| 16 |
from agno.knowledge.embedder.openai import OpenAIEmbedder
|
| 17 |
from agno.tools.duckduckgo import DuckDuckGoTools
|
| 18 |
from agno.knowledge.knowledge import Knowledge
|
| 19 |
from agno.vectordb.lancedb import LanceDb, SearchType
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
#
|
| 22 |
-
import config
|
| 23 |
-
|
| 24 |
-
# --- Setup ---
|
| 25 |
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
vector_db
|
| 37 |
-
uri=
|
| 38 |
-
table_name=
|
| 39 |
-
|
|
|
|
| 40 |
)
|
| 41 |
-
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
contents_to_add = []
|
| 51 |
-
|
| 52 |
-
|
|
|
|
| 53 |
try:
|
| 54 |
-
|
| 55 |
-
logger.info(f"Downloading {url}...")
|
| 56 |
-
response = requests.get(url, timeout=30)
|
| 57 |
-
response.raise_for_status()
|
| 58 |
-
with open(filename, "wb") as f:
|
| 59 |
-
f.write(response.content)
|
| 60 |
-
|
| 61 |
contents_to_add.append({
|
| 62 |
"path": filename,
|
| 63 |
-
"metadata": {"source": url
|
| 64 |
})
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
| 68 |
if contents_to_add:
|
| 69 |
try:
|
| 70 |
-
#
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
except Exception as e:
|
| 74 |
-
logger.error(f"Failed to add PDFs
|
| 75 |
raise
|
| 76 |
-
|
| 77 |
-
|
| 78 |
|
| 79 |
-
#
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
You are a data professional's assistant named Dox.
|
|
|
|
| 84 |
Your primary goal is to answer questions about data, programming, cloud computing, AI/ML, and technology topics.
|
|
|
|
| 85 |
Here are your operating procedures:
|
|
|
|
| 86 |
1. **Information Gathering Strategy**:
|
| 87 |
-
* **Prioritize Knowledge Base**: First, search your internal knowledge base for the answer.
|
| 88 |
-
* **Supplement with Web Search**: If the knowledge base is insufficient or the question
|
| 89 |
-
|
| 90 |
-
*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
3. **Citation Rules (CRITICAL)**:
|
| 92 |
-
* For information from your knowledge base,
|
| 93 |
-
* For information from the web
|
| 94 |
-
* Always end your answers with the appropriate citations.
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
)
|
|
|
|
|
|
|
| 108 |
|
| 109 |
-
|
| 110 |
-
knowledge_base = initialize_knowledge_base()
|
| 111 |
-
agent = create_agent(knowledge_base)
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
| 115 |
logger.info(f"Question asked: {question[:100]}...")
|
| 116 |
response = agent.run(question, use_knowledge=True)
|
| 117 |
full_content = response.get_content_as_string()
|
| 118 |
-
|
| 119 |
-
#
|
| 120 |
-
match = re.search(r'https?://[^\s
|
| 121 |
link = match.group(0) if match else None
|
| 122 |
|
| 123 |
if link:
|
| 124 |
-
logger.info(f"
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
|
|
|
|
|
|
|
|
|
| 134 |
def display_pdf(pdf_url):
|
| 135 |
-
"""Downloads and renders the first page of a PDF from a URL."""
|
| 136 |
if not pdf_url:
|
| 137 |
return gr.update(visible=False)
|
| 138 |
|
| 139 |
try:
|
| 140 |
logger.info(f"Displaying PDF from: {pdf_url}")
|
| 141 |
-
pdf_bytes =
|
| 142 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 143 |
page = doc[0]
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
|
|
|
| 147 |
|
| 148 |
img_data = pix.tobytes("png")
|
| 149 |
img = Image.open(io.BytesIO(img_data))
|
| 150 |
doc.close()
|
| 151 |
|
|
|
|
| 152 |
return gr.update(value=img, visible=True)
|
| 153 |
except Exception as e:
|
| 154 |
-
logger.error(f"Error displaying PDF: {e}")
|
| 155 |
return gr.update(value=None, visible=False)
|
| 156 |
|
| 157 |
-
# ---
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
if __name__ == "__main__":
|
| 198 |
-
|
| 199 |
-
ui.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import logging
|
| 2 |
import sys
|
| 3 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from agno.agent import Agent
|
| 5 |
from agno.models.openai import OpenAIChat
|
| 6 |
from agno.knowledge.embedder.openai import OpenAIEmbedder
|
| 7 |
from agno.tools.duckduckgo import DuckDuckGoTools
|
| 8 |
from agno.knowledge.knowledge import Knowledge
|
| 9 |
from agno.vectordb.lancedb import LanceDb, SearchType
|
| 10 |
+
import gradio as gr
|
| 11 |
+
import fitz # PyMuPDF
|
| 12 |
+
from PIL import Image
|
| 13 |
+
import io
|
| 14 |
+
import requests
|
| 15 |
+
import re
|
| 16 |
|
| 17 |
+
# --- Logging ---
# Send INFO-and-above records to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger(__name__)

# --- Secrets ---
# Abort at import time when the key is absent or empty.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("Missing OPENAI_API_KEY")

# --- Knowledge base ---
# Plain vector search is used to avoid the tantivy dependency.
_embedder = OpenAIEmbedder(id="text-embedding-3-small")
_vector_db = LanceDb(
    uri="tmp/lancedb",
    table_name="pdf_documents",
    search_type=SearchType.vector,
    embedder=_embedder,
)
knowledge = Knowledge(vector_db=_vector_db)

# --- Source documents ---
# PDFs that are downloaded and added to the knowledge base.
pdf_urls = [
    "https://media.datacamp.com/cms/working-with-hugging-face.pdf",
    "https://media.datacamp.com/cms/ai-agents-cheat-sheet.pdf",
    "https://media.datacamp.com/cms/introduction-to-sql-with-ai-1.pdf",
    "https://media.datacamp.com/legacy/image/upload/v1719844709/Marketing/Blog/Azure_CLI_Cheat_Sheet.pdf",
]
|
| 44 |
|
| 45 |
+
def download_if_needed(url, filename):
    """Download ``url`` to ``filename`` unless that file already exists.

    Args:
        url: HTTP(S) address of the file to fetch.
        filename: Local path the response body is written to.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        OSError: If the file cannot be written.
    """
    if os.path.exists(filename):
        return

    logger.info(f"Downloading {url}...")
    # Bound the wait and reject HTTP error responses; otherwise a 404/500
    # body would be written to disk and then treated as a cached PDF on
    # every later run.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Write to a sibling temp file and rename it into place so that an
    # interrupted write never leaves a truncated file under the final name.
    partial_path = f"{filename}.part"
    with open(partial_path, "wb") as f:
        f.write(response.content)
    os.replace(partial_path, filename)

    logger.info(f"Downloaded {filename} ({len(response.content)} bytes)")
|
| 52 |
+
|
| 53 |
+
# Create a directory for PDFs if it doesn't exist
os.makedirs("pdf_cache", exist_ok=True)


def _insert_pdfs_directly(contents):
    """Read each PDF with PDFReader and insert the documents into the vector DB."""
    from agno.document.reader.pdf_reader import PDFReader

    reader = PDFReader()
    all_docs = []
    for item in contents:
        for doc in reader.read(item["path"]):
            # Merge rather than overwrite: keep whatever metadata the reader
            # attached, and give every document its own dict so they do not
            # all alias the same object.
            existing = getattr(doc, "metadata", None) or {}
            doc.metadata = {**existing, **item["metadata"]}
            all_docs.append(doc)
    knowledge.vector_db.insert(documents=all_docs)
    logger.info(f"✅ Successfully added {len(all_docs)} document chunks from {len(contents)} PDFs")


def add_pdfs_to_knowledge():
    """Download the configured PDFs and add them to the knowledge base.

    Each URL in ``pdf_urls`` is downloaded to ``pdf_cache/file_<i>.pdf``
    (skipped if already cached). A URL that fails to download is logged and
    skipped. The prepared files are then added using whichever API the
    ``knowledge`` object exposes, tried in this order: ``add_contents``,
    ``add_content``, and finally direct insertion into the vector DB.

    Raises:
        Exception: Any error raised while adding the prepared PDFs is logged
            and re-raised.
    """
    contents_to_add = []

    for i, url in enumerate(pdf_urls):
        filename = f"pdf_cache/file_{i}.pdf"
        try:
            download_if_needed(url, filename)
        except Exception as e:
            # Best effort: one bad URL must not prevent the others loading.
            logger.exception(f"Failed to prepare PDF {i+1}: {str(e)}")
            continue
        contents_to_add.append({
            "path": filename,
            "metadata": {"source": url},
        })
        logger.info(f"Prepared PDF {i+1}: {url}")

    if not contents_to_add:
        logger.warning("No PDFs were prepared to add")
        return

    try:
        # Try the new method first
        if hasattr(knowledge, 'add_contents'):
            knowledge.add_contents(contents_to_add)
            logger.info(f"✅ Successfully added {len(contents_to_add)} PDFs using add_contents")
        elif hasattr(knowledge, 'add_content'):
            for item in contents_to_add:
                knowledge.add_content(**item)
            logger.info(f"✅ Successfully added {len(contents_to_add)} PDFs using add_content")
        else:
            # Fallback: Direct vector DB insertion
            _insert_pdfs_directly(contents_to_add)
    except Exception as e:
        logger.exception(f"Failed to add PDFs: {str(e)}")
        raise
|
| 100 |
|
| 101 |
+
# Populate the knowledge base before the agent that uses it is built.
add_pdfs_to_knowledge()

# System prompt for Dox; passed to the agent unchanged.
_DOX_INSTRUCTIONS = """
You are a data professional's assistant named Dox.

Your primary goal is to answer questions about data, programming, cloud computing, AI/ML, and technology topics.

Here are your operating procedures:

1. **Information Gathering Strategy**:
* **Prioritize Knowledge Base**: First, search your internal knowledge base for the answer.
* **Supplement with Web Search**: If the knowledge base information is outdated, insufficient, or the question is better suited for current web information, use the DuckDuckGo tool to perform web searches to fill in gaps or find the most up-to-date data.
* For general technology questions not in your knowledge base, use web search to provide accurate answers.
* If the question is NOT data-related, you MUST respond with: "Please ask relevant data questions only." and terminate.

2. **Response Length Guidelines**:
* For basic questions, keep your answer to a maximum of 300 words.
* For complex questions, extend your answer to a maximum of 500 words.

3. **Citation Rules (CRITICAL)**:
* **Knowledge Base Citation**: For any information sourced from your internal knowledge base, you MUST include a citation on a NEW LINE after the answer, starting with "Source: ", followed by the metadata field 'source' to get the hyperlink.
* **Web Search Citation**: For any information obtained from the web using the DuckDuckGo tool, you MUST include a citation on a NEW LINE after the answer, starting with "Online Source: ", followed by the full hyperlink.
* **Final Rule for Citations**: Always end your answers with the appropriate citations, ensuring they are on separate lines as specified. Do NOT mix or combine citation types on a single line.
* ALWAYS cite with links NOT text like "from internal knowledge base"

4. **Accuracy and Non-Hallucination**:
* Provide factual and relevant answers based ONLY on the information found in your knowledge base or through web searches.
* NEVER invent or hallucinate information. If an answer cannot be found, state that directly.

Make sure to follow these instructions precisely.
"""

# Define the agent: chat model, knowledge base search and DuckDuckGo tool.
_chat_model = OpenAIChat(id="gpt-4.1-mini", temperature=0.2)
agent = Agent(
    model=_chat_model,
    description="You are Dox a data expert!",
    instructions=_DOX_INSTRUCTIONS,
    knowledge=knowledge,
    search_knowledge=True,
    tools=[DuckDuckGoTools()],
    add_datetime_to_context=True,
    add_location_to_context=True,
    markdown=True,
)

logger.info("Agent initialized successfully")
|
|
|
|
|
|
|
| 146 |
|
| 147 |
+
# -----------------------------
|
| 148 |
+
# Your agent function
|
| 149 |
+
# -----------------------------
|
| 150 |
+
def ask_agent(question):
    """Run the agent on ``question`` and pull the first PDF link from its answer.

    Args:
        question: The user's question text.

    Returns:
        A ``(full_content, link)`` tuple: the agent's answer as a string, and
        the first ``http(s)://...pdf`` URL found in it, or ``None`` if the
        answer contains no such URL.
    """
    logger.info(f"Question asked: {question[:100]}...")
    response = agent.run(question, use_knowledge=True)
    full_content = response.get_content_as_string()

    # Extract PDF URL from response.
    # Brackets, parentheses, angle brackets and quotes are excluded from the
    # URL body so the match cannot run across markdown link syntax: with the
    # previous `[^\s]+`, "[a.pdf](a.pdf)" produced the bogus link
    # "a.pdf](a.pdf".
    match = re.search(
        r'https?://[^\s()\[\]<>"\']+\.pdf',
        full_content,
        re.IGNORECASE,
    )
    link = match.group(0) if match else None

    if link:
        logger.info(f"PDF link found: {link}")
    else:
        logger.info("No PDF link found in response")

    return full_content, link
|
| 165 |
+
|
| 166 |
+
# -----------------------------
|
| 167 |
+
# Download PDF
|
| 168 |
+
# -----------------------------
|
| 169 |
+
def download_pdf_from_url(url):
    """Fetch ``url`` and return the raw response body as bytes.

    The request uses a 30 second timeout, and a non-success HTTP status
    raises via ``raise_for_status``.
    """
    http_response = requests.get(url, timeout=30)
    http_response.raise_for_status()
    body = http_response.content
    return body
|
| 173 |
|
| 174 |
+
# -----------------------------
|
| 175 |
+
# Display PDF
|
| 176 |
+
# -----------------------------
|
| 177 |
def display_pdf(pdf_url):
    """Download a PDF and render its first page for the image preview.

    Args:
        pdf_url: URL of the PDF to show; a falsy value hides the preview.

    Returns:
        A ``gr.update`` for the preview image: visible with the rendered
        page on success, hidden (and cleared) on any failure.
    """
    if not pdf_url:
        return gr.update(visible=False)

    try:
        logger.info(f"Displaying PDF from: {pdf_url}")
        pdf_bytes = download_pdf_from_url(pdf_url)

        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            page = doc[0]

            # Render at 5x zoom in both directions.
            zoom = 5
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)

            img_data = pix.tobytes("png")
        finally:
            # Close the document even when rendering fails (e.g. a PDF with
            # no pages); previously the handle leaked on every error path.
            doc.close()

        img = Image.open(io.BytesIO(img_data))

        logger.info("PDF displayed successfully")
        return gr.update(value=img, visible=True)
    except Exception as e:
        logger.exception(f"Error displaying PDF: {str(e)}")
        return gr.update(value=None, visible=False)
|
| 200 |
|
| 201 |
+
# -----------------------------
|
| 202 |
+
# UI Wrapper for Blocks
|
| 203 |
+
# -----------------------------
|
| 204 |
+
def ask_agent_ui(question):
    """Adapt ``ask_agent`` to the four outputs wired to the Submit button.

    Returns, in order: the answer text, the PDF link (or ``None``), an
    update that shows the "Show PDF" button only when a link exists, and an
    update that clears and hides the PDF preview.
    """
    answer_text, pdf_link = ask_agent(question)

    has_link = pdf_link is not None
    button_update = gr.update(visible=has_link)  # button visibility
    preview_reset = gr.update(value=None, visible=False)  # RESET PDF preview

    return answer_text, pdf_link, button_update, preview_reset
|
| 213 |
+
|
| 214 |
+
# -----------------------------
|
| 215 |
+
# Combined Gradio UI with Blocks and Interface
|
| 216 |
+
# -----------------------------
|
| 217 |
+
# The title is plain text for the browser tab, so it carries no markdown "# ".
with gr.Blocks(title="🌊 Dox the Data Professional's Advisor 🤖", theme=gr.themes.Ocean()) as demo:
    gr.Markdown("# 🌊 Dox the Data Professional's Advisor 🤖")
    gr.Markdown("🧠 Dox has 4 DataCamp cheat sheets in its database that you could ask about (1️⃣ Hugging Face | 2️⃣ AI Agents | 3️⃣ SQL with AI | 4️⃣ Azure CLI):")

    # Create two columns for better layout
    with gr.Row():
        with gr.Column(scale=2):
            question = gr.Textbox(
                label="Ask Dox a question:",
                lines=2,
                placeholder="Type your question here..."
            )

            # Add examples
            gr.Examples(
                examples=[
                    "How do you log into Azure using device code authentication?",
                    "What are the three main components of an AI agent?",
                    "What are the \"core four\" Hugging Face libraries?",
                    "What SQL clause is used to filter data after grouping?"
                ],
                inputs=question,
                label="Example Questions"
            )

            ask_btn = gr.Button("Submit", variant="primary")

            answer = gr.Markdown(
                label="Answer: ",
                render=True,
                container=True,
                elem_id="answer_markdown"
            )

        with gr.Column(scale=2):
            # Holds the PDF link extracted from the latest answer.
            link_state = gr.State()
            show_btn = gr.Button("Show PDF", visible=False, variant="secondary")
            # display_pdf supplies a PNG-decoded image, so the component
            # format is "png" ("pdf" is not an image format).
            output_image = gr.Image(label="PDF Preview (Page 1)", visible=False, format="png")

    # Ask agent functionality
    ask_btn.click(
        ask_agent_ui,
        inputs=question,
        outputs=[answer, link_state, show_btn, output_image]
    )

    # Show PDF functionality.
    # display_pdf already sets the preview's visibility (shown on success,
    # hidden on failure). The former chained step that forced visible=True
    # is removed because it re-showed an empty preview after a failure.
    show_btn.click(
        display_pdf,
        inputs=link_state,
        outputs=output_image
    )
|
| 273 |
|
| 274 |
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|
|