| |
| import logging |
| import sys |
| import os |
|
|
| |
| from agno.agent import Agent |
| from agno.models.openai import OpenAIChat |
| from agno.knowledge.embedder.openai import OpenAIEmbedder |
| from agno.tools.duckduckgo import DuckDuckGoTools |
| from agno.knowledge.knowledge import Knowledge |
| from agno.vectordb.lancedb import LanceDb, SearchType |
|
|
| |
| import gradio as gr |
|
|
| |
| import fitz |
| from PIL import Image |
| import io |
| import requests |
| import re |
|
|
| |
# Route INFO-and-above log records to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger(__name__)

# Fail fast at import time when the OpenAI credential is unset or empty.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise ValueError("Missing OPENAI_API_KEY")
|
|
| |
# Knowledge base backed by a LanceDB table at the relative path tmp/lancedb,
# searched by vector similarity over OpenAI embeddings.
knowledge = Knowledge(
    vector_db=LanceDb(
        table_name="pdf_documents",
        uri="tmp/lancedb",
        embedder=OpenAIEmbedder(id="text-embedding-3-small"),
        search_type=SearchType.vector,
    ),
)
|
|
| |
# Cheat-sheet PDFs that are downloaded and indexed into the knowledge base.
pdf_urls = [
    "https://media.datacamp.com/cms/working-with-hugging-face.pdf",
    "https://media.datacamp.com/cms/ai-agents-cheat-sheet.pdf",
    "https://media.datacamp.com/cms/introduction-to-sql-with-ai-1.pdf",
    "https://media.datacamp.com/legacy/image/upload/v1719844709/Marketing/Blog/Azure_CLI_Cheat_Sheet.pdf",
    "https://s3.amazonaws.com/assets.datacamp.com/email/other/Power+BI_Cheat+Sheet.pdf",
    "https://media.datacamp.com/cms/python-basics-cheat-sheet-v4.pdf",
]

# Sample prompts offered as clickable examples in the UI.
example_questions = [
    "How do you log into Azure using device code authentication?",
    "What are the three main components of an AI agent?",
    'What are the "core four" Hugging Face libraries?',
    "What SQL clause is used to filter data after grouping?",
    "What is the latest GPT model?",
]

# Greeting shown in the chatbot before the user has asked anything.
initial_chat = [
    dict(
        role="assistant",
        content="Hello, I am Dox, the Data Professional's Guide. Ask me a question about data.",
    ),
]
|
|
|
|
| |
def download_if_needed(url, filename):
    """Download ``url`` to ``filename`` unless that file already exists.

    Args:
        url: HTTP(S) address of the file to fetch.
        filename: Local path used as the cache entry for ``url``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        OSError: If the cache file cannot be written.
    """
    if os.path.exists(filename):
        return

    logger.info(f"Downloading {url}...")
    # Bound the wait (same 30 s budget as download_pdf_from_url) and reject
    # error responses, so an HTML error page is never cached as a "PDF".
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Write to a sibling temp file and rename it into place: an interrupted
    # write must not leave a truncated file that later runs treat as cached.
    partial = f"{filename}.part"
    with open(partial, "wb") as f:
        f.write(response.content)
    os.replace(partial, filename)
    logger.info(f"Downloaded {filename} ({len(response.content)} bytes)")
|
|
| |
| |
# Make sure the directory that receives the cached PDF files exists.
os.makedirs(name="pdf_cache", exist_ok=True)
|
|
| |
def add_pdfs_to_knowledge():
    """Cache each PDF from ``pdf_urls`` on disk and load it into ``knowledge``.

    A PDF that cannot be prepared is logged and skipped. Loading then uses the
    first ingestion route the ``knowledge`` object exposes: ``add_contents``,
    else ``add_content``, else a manual read-and-insert fallback.

    Raises:
        Exception: Whatever the ingestion step raised; it is logged and
            re-raised unchanged.
    """
    prepared = []
    for index, source_url in enumerate(pdf_urls):
        ordinal = index + 1
        local_path = f"pdf_cache/file_{index}.pdf"
        try:
            download_if_needed(source_url, local_path)
            prepared.append({"path": local_path, "metadata": {"source": source_url}})
            logger.info(f"Prepared PDF {ordinal}: {source_url}")
        except Exception as e:
            # Best effort: one failing PDF must not block the remaining ones.
            logger.error(f"Failed to prepare PDF {ordinal}: {str(e)}")

    if not prepared:
        logger.warning("No PDFs were prepared to add")
        return

    try:
        if hasattr(knowledge, "add_contents"):
            # Batch route: hand over the whole list in one call.
            knowledge.add_contents(prepared)
            logger.info(f"✅ Successfully added {len(prepared)} PDFs using add_contents")
        elif hasattr(knowledge, "add_content"):
            # Single-item route: each dict is expanded into keyword arguments.
            for entry in prepared:
                knowledge.add_content(**entry)
            logger.info(f"✅ Successfully added {len(prepared)} PDFs using add_content")
        else:
            # Manual route: read every PDF and insert the documents directly.
            # NOTE(review): this import path and the `metadata` attribute are
            # assumed to match the agno release this branch targets — confirm.
            from agno.document.reader.pdf_reader import PDFReader

            pdf_reader = PDFReader()
            chunks = []
            for entry in prepared:
                for chunk in pdf_reader.read(entry["path"]):
                    chunk.metadata = entry["metadata"]
                    chunks.append(chunk)
            knowledge.vector_db.insert(documents=chunks)
            logger.info(f"✅ Successfully added {len(chunks)} document chunks from {len(prepared)} PDFs")
    except Exception as e:
        logger.error(f"Failed to add PDFs: {str(e)}")
        raise
|
|
| |
# Populate the knowledge base before the agent that queries it is created.
add_pdfs_to_knowledge()

# System instructions handed to the agent verbatim.
_DOX_INSTRUCTIONS = """
    You are a data professional's assistant named Dox.
    Your primary goal is to answer questions about data, programming, cloud computing, AI/ML, and technology topics.
    Here are your operating procedures:
    1. **Information Gathering Strategy**:
        * **Prioritize Knowledge Base**: First, search your internal knowledge base for the answer.
        * **Supplement with Web Search**: If the knowledge base information is outdated, insufficient, or the question is better suited for current web information, use the DuckDuckGo tool to perform web searches to fill in gaps or find the most up-to-date data.
        * For general technology questions not in your knowledge base, use the DuckDuckGo tool to perform web search to provide accurate answers.
        * If the question is asking for the "latest" or "most recent" of a data-related topic, ALWAYS use the DuckDuckGo tool to perform the latest web search and datetime to context.
        * If the question is NOT data-related, you MUST respond with: "Please ask relevant data questions only." and terminate.
    2. **Response Length Guidelines**:
        * For basic questions, keep your answer to a maximum of 300 words.
        * For complex questions, extend your answer to a maximum of 500 words.
    3. **Citation Rules (CRITICAL)**:
        * **Knowledge Base Citation**: For any information sourced from your internal knowledge base, you MUST ALWAYS include a citation on a NEW LINE after the answer, starting with "Source: ", followed by the metadata field 'source' to get the hyperlink.
        * **Web Search Citation**: For any information obtained from the web using the DuckDuckGo tool, you MUST ALWAYS include a citation on a NEW LINE after the answer, starting with "Online Source: ", followed by the full hyperlink.
        * **Final Rule for Citations**: ALWAYS end your answers with the appropriate citations, ensuring they are on separate lines as specified. Do NOT mix or combine citation types on a single line.
        * ALWAYS cite with links NOT text like "from internal knowledge base"
    4. **Accuracy and Non-Hallucination**:
        * Provide factual and relevant answers based ONLY on the information found in your knowledge base or through the DuckDuckGo tool to perform web searches.
        * NEVER invent or hallucinate information. If an answer cannot be found, state that directly.
    Make sure to follow these instructions precisely.
    """

# The Dox agent: an OpenAI chat model with the PDF knowledge base attached and
# DuckDuckGo available as a tool.
agent = Agent(
    model=OpenAIChat(id="gpt-4.1-mini", temperature=0.2),
    description="You are Dox a data expert!",
    instructions=_DOX_INSTRUCTIONS,
    knowledge=knowledge,
    tools=[DuckDuckGoTools()],
    add_datetime_to_context=True,
    add_location_to_context=True,
    markdown=True,
)

logger.info("🟢 Agent initialized successfully!")
|
|
| |
def ask_agent(question):
    """Run ``question`` through the agent and pull a PDF link out of the reply.

    Args:
        question: The user's question text.

    Returns:
        A ``(text, link)`` tuple. ``text`` is the agent's reply with a feedback
        footer appended, or a generic error message if the agent call failed.
        ``link`` is the first ``http(s)://….pdf`` URL found in the reply, or
        ``None`` when there is none or the call failed.
    """
    logger.info(f"Question asked: {question[:100]}...")
    try:
        response = agent.run(question, use_knowledge=True)
        full_content = response.get_content_as_string()
    except Exception:
        # UI boundary: the caller always needs a reply to display, so report
        # a generic message and keep the full traceback in the log.
        logger.exception("Agent run failed")
        return "❌ Something went wrong. Please try again.", None

    # The first PDF URL in the reply drives the preview pane.
    match = re.search(r'https?://[^\s]+\.pdf', full_content, re.IGNORECASE)
    link = match.group(0) if match else None

    if link:
        logger.info(f"PDF link found: {link}")
    else:
        logger.info("🔴 No PDF link found in response")

    full_content += "\n\n---\n**📋 Dox would appreciate your feedback! ⬇️**"
    return full_content, link
|
|
| |
def download_pdf_from_url(url):
    """Fetch ``url`` and return the response body as bytes.

    The request is given 30 seconds; a non-success HTTP status raises via
    ``raise_for_status``.
    """
    reply = requests.get(url, timeout=30)
    reply.raise_for_status()
    payload = reply.content
    return payload
|
|
| |
def prepare_pdf_loading(link):
    """Return a visible status update: "loading" if ``link`` is set, else "no PDF"."""
    message = "📄 Loading PDF preview..." if link else "❌ No PDF for preview"
    return gr.update(value=message, visible=True)
|
|
| |
def display_pdf(pdf_url):
    """Render the first page of the PDF at ``pdf_url`` for the preview pane.

    Args:
        pdf_url: URL of the PDF to preview; falsy means there is nothing to show.

    Returns:
        A ``(image_update, status_update)`` pair of ``gr.update`` values:
        the rendered page with the status hidden on success, both hidden when
        no URL was given, or a hidden image plus a visible failure message
        when downloading or rendering raised.
    """
    if not pdf_url:
        return (
            gr.update(value=None, visible=False),
            gr.update(value="", visible=False)
        )
    try:
        pdf_bytes = download_pdf_from_url(pdf_url)
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            # Scale the page 1.5x in both directions when rasterising it.
            zoom = 1.5
            mat = fitz.Matrix(zoom, zoom)
            pix = doc[0].get_pixmap(matrix=mat)
            png_bytes = pix.tobytes("png")
        finally:
            # Release the document even when the page lookup or render fails.
            doc.close()
        img = Image.open(io.BytesIO(png_bytes))
    except Exception as e:
        logger.error(f"PDF error: {e}")
        return (
            gr.update(value=None, visible=False),
            gr.update(value="❌ Failed to load PDF", visible=True)
        )
    return (
        gr.update(value=img, visible=True),
        gr.update(value="", visible=False)
    )
|
|
# Ocean theme with custom UI and monospace font stacks.
# NOTE(review): the launch call at the end of the file builds its own Ocean
# theme, so this object looks unused — confirm before relying on it.
theme = gr.themes.Ocean(
    font_mono=[gr.themes.GoogleFont("Fira Code"), "monospace"],
    font=[gr.themes.GoogleFont("Inter"), "Segoe UI", "sans-serif"],
)

# Logo image embedded in the page header.
DOX_LOGO = "https://raw.githubusercontent.com/AzzamAlnatsheh/DDS_BuildingAIChallenge/main/Dox Transparent Image.png"
# Stylesheet for the Dox UI. Rules are keyed to the elem_classes / elem_id
# values assigned to the Gradio components.
custom_css = """
.main-container {
    width: 100%;
    max-width: 100%;
    margin: 0;
    padding: 0 10px;
}

/* HEADER */
.header-card {
    background: linear-gradient(135deg, #0B1F3A, #102A4C);
    border-radius: 24px;
    padding: 0px;
    box-shadow: 0 16px 40px rgba(2, 6, 23, 0.4);
    border: 1px solid #1F3B5C;
    margin: 0px;
}

/* SIDEBAR */
.sidebar-card {
    background: rgba(11, 31, 58, 0.95);
    border-radius: 24px;
    padding: 24px;
    box-shadow: 0 16px 40px rgba(2, 6, 23, 0.35);
    border: 1px solid #1F3B5C;
    height: 100%;
}

/* CHAT */
.chat-card {
    background: rgba(16, 42, 76, 0.95);
    border-radius: 24px;
    padding: 22px;
    box-shadow: 0 16px 40px rgba(2, 6, 23, 0.35);
    border: 1px solid #1F3B5C;
}

/* LOGO */
.logo-img {
    width: 200px;
    height: 170px;
    margin: 0px;
    padding: 0px;
}

/* TITLE */
.title-text {
    font-size: 32px;
    font-weight: 850;
    color: #E6F0FA;
    margin-bottom: 8px;
    letter-spacing: -0.03em;
}

/* SUBTITLE */
.subtitle-text {
    font-size: 16px;
    color: #AFCBE8;
    line-height: 1.65;
    max-width: none;
}

/* BADGES */
.badge {
    display: inline-block;
    background: rgba(30, 167, 255, 0.15);
    color: #4FD1FF;
    padding: 7px 13px;
    border-radius: 999px;
    font-size: 13px;
    font-weight: 650;
    margin-right: 7px;
    margin-bottom: 8px;
    border: 1px solid rgba(30, 167, 255, 0.3);
}

/* STATUS BOX */
.status-box {
    background: rgba(16, 42, 76, 0.85);
    border: 1px solid #1F3B5C;
    padding: 14px;
    border-radius: 16px;
    font-size: 14px;
    color: #CFE6FF;
    line-height: 1.6;
}

/* NOTES */
.small-note {
    font-size: 13px;
    color: #9FB9D9;
    line-height: 1.55;
}

.footer-note {
    font-size: 13px;
    color: #9FB9D9;
    text-align: center;
    margin-top: 18px;
}

/* CHATBOT BOX */
#chatbot {
    min-height: 540px;
    border-radius: 18px;
    border: 1px solid #1F3B5C;
    background: rgba(11, 31, 58, 0.9);
}

/* INPUT (matched by id or by class, whichever the textbox carries) */
#question_box textarea,
.question_box textarea {
    border-radius: 16px !important;
    background: #0B1F3A;
    color: #E6F0FA;
    border: 1px solid #1F3B5C;
}

/* EXAMPLES */
.example-button {
    margin-bottom: 8px !important;
    border-radius: 14px !important;
    white-space: normal !important;
    text-align: left !important;
    background: rgba(30, 167, 255, 0.08);
    color: #CFE6FF;
    border: 1px solid rgba(30, 167, 255, 0.2);
}

/* PRIMARY BUTTON */
.primary-action {
    border-radius: 14px !important;
    background: linear-gradient(135deg, #1EA7FF, #4FD1FF) !important;
    color: #0B1F3A !important;
    border: none !important;
    font-weight: 700;
}

/* CLEAR BUTTON */
.clear-action {
    border-radius: 14px !important;
    background: rgba(255, 255, 255, 0.05) !important;
    color: #E6F0FA !important;
    border: 1px solid #1F3B5C !important;
}
"""
|
|
| |
# Page layout and event wiring for the Dox chat UI. Handlers are defined inside
# the Blocks context because they are registered on components created here.
with gr.Blocks(
    title="🤖 Dox the Data Professional's Guide 🤖",
    fill_width=True,
) as demo:
    with gr.Column(elem_classes=["main-container"]):

        # ---- Header: logo on the left, title / description / badges on the right.
        with gr.Row(elem_classes=["header-card"]):
            with gr.Column(scale=1):
                gr.HTML(
                    f"""
                    <img src="{DOX_LOGO}" class="logo-img" alt="DOX Logo">
                    """
                )

            with gr.Column(scale=5):
                gr.HTML(
                    """
                    <div class="title-text">🤖 Dox the Data Professional's Advisor 🤖</div>
                    <div class="subtitle-text">
                    A professional data chatbot that aims in reminding and helping data experts in certain concepts in a simplified way
                    while also having access to download DataCamp's public cheatsheets on many data-related topics.
                    </div>
                    <br>
                    <span class="badge">Agno</span>
                    <span class="badge">LanceDB</span>
                    <span class="badge">OpenAI</span>
                    <span class="badge">Gradio</span>
                    """
                )

        with gr.Row():

            # ---- Left column: conversation, input, examples, feedback.
            with gr.Column(scale=3, elem_classes=["chat-card"]):
                gr.Markdown("### 🧠 Dox is an expert in the following topics: \n1️⃣ Hugging Face | 2️⃣ AI Agents | 3️⃣ SQL with AI | 4️⃣ Azure CLI | 5️⃣ Power BI | 6️⃣ Python")

                chatbot = gr.Chatbot(
                    label="💬 Conversation",
                    elem_id="chatbot",
                    value=initial_chat.copy(),
                    height=450
                )

                # Carries "question_box" both as id and as class so the input
                # styling applies whichever selector the stylesheet uses.
                question = gr.Textbox(
                    label="🙋 Ask Dox a question:",
                    placeholder="🤔 Type your question here...",
                    lines=1,
                    elem_id="question_box",
                    elem_classes="question_box"
                )

                with gr.Row():
                    ask_btn = gr.Button("Submit 📤", variant="primary", elem_classes="primary-action")
                    clear_btn = gr.Button("🧹 Clear Chat", elem_classes="clear-action")

                gr.Markdown("### 💡 Example Questions", elem_classes="example-button")
                # Examples only populate the textbox; the answer pipeline is
                # attached to the load event further down, so no fn/outputs here.
                examples = gr.Examples(
                    examples=example_questions,
                    inputs=question,
                    cache_examples=False
                )

                with gr.Row():
                    thumbs_up = gr.Button("👍 Helpful", elem_classes="clear-action")
                    thumbs_down = gr.Button("👎 Not Helpful", elem_classes="clear-action")

                # Hidden until the user clicks "Not Helpful".
                feedback_box = gr.Textbox(
                    placeholder="💬 Optional: tell us what went wrong...",
                    visible=False
                )
                submit_feedback_btn = gr.Button("📝 Submit Feedback", visible=False, elem_classes="clear-action")
                feedback_status = gr.Markdown("", elem_classes="clear-action")

            # ---- Right column: preview of the PDF cited in the latest answer.
            with gr.Column(scale=3):
                gr.Markdown("### 📄 Referenced PDF Document (🌐 Empty for Web Results)", elem_classes="clear-action")

                # Holds the PDF URL extracted from the latest answer (or None).
                link_state = gr.State()
                pdf_status = gr.Markdown(visible=False, elem_classes="clear-action")
                output_image = gr.Image(
                    label="⬇️ Cheat Sheet Preview",
                    visible=False
                )
                pdf_link_btn = gr.Markdown("")

    def chat_ui(user_message, chat_history):
        """Stream one chat turn: a "thinking" placeholder, then the answer.

        Yields ``(chat_history, link, image_update, question_text)`` tuples
        matching the outputs ``[chatbot, link_state, output_image, question]``.
        A blank message resets the preview and input without calling the agent.
        """
        if chat_history is None:
            chat_history = []

        if not user_message or not user_message.strip():
            yield chat_history, None, gr.update(value=None, visible=False), ""
            return

        chat_history.append({"role": "user", "content": user_message})
        chat_history.append({"role": "assistant", "content": "🤔 Dox is thinking..."})

        # First update: show the question and placeholder, hide the old preview.
        yield chat_history, None, gr.update(value=None, visible=False), ""

        response_text, link = ask_agent(user_message)

        # Second update: swap the placeholder for the real answer.
        chat_history[-1] = {"role": "assistant", "content": response_text}
        yield chat_history, link, gr.update(value=None, visible=False), ""

    def submit_chain():
        """Return the ``(fn, inputs, outputs)`` triple for the chat step."""
        return (
            chat_ui,
            [question, chatbot],
            [chatbot, link_state, output_image, question]
        )

    def show_pdf_link(link):
        """Return a markdown link to the full PDF, or "" when there is none."""
        if link:
            return f"[📥 Open Full PDF]({link})"
        return ""

    def clear_chat():
        """Reset the conversation, the PDF preview pane and the feedback widgets."""
        return (
            [],                                     # chatbot
            None,                                   # link_state
            gr.update(value=None, visible=False),   # output_image
            gr.update(value="", visible=False),     # feedback_box
            # Only hide the button: passing value=None here would overwrite
            # its label for the next time it is shown.
            gr.update(visible=False),               # submit_feedback_btn
            gr.update(value="", visible=False),     # pdf_status
            "",                                     # pdf_link_btn
            "",                                     # feedback_status
        )

    def show_feedback_box():
        """Reveal the free-text feedback box and its submit button."""
        return gr.update(visible=True), gr.update(visible=True)

    def show_appreciation():
        """Log a positive rating and return the confirmation text."""
        logger.info("It was helpful!")
        return "✅ Feedback submitted. Thank you!"

    def handle_feedback(text):
        """Log the user's written feedback and return the confirmation text."""
        logger.info(f"User feedback: {text}")
        return "✅ Feedback submitted. Thank you!"

    def _attach_answer_pipeline(register):
        """Register the chat step via ``register``, then the PDF preview steps."""
        return (
            register(*submit_chain())
            .then(prepare_pdf_loading, inputs=link_state, outputs=pdf_status)
            .then(display_pdf, inputs=link_state, outputs=[output_image, pdf_status])
            .then(show_pdf_link, inputs=link_state, outputs=pdf_link_btn)
        )

    clear_btn.click(
        clear_chat,
        outputs=[
            chatbot, link_state, output_image, feedback_box,
            submit_feedback_btn, pdf_status, pdf_link_btn, feedback_status,
        ]
    )

    thumbs_down.click(
        show_feedback_box,
        outputs=[feedback_box, submit_feedback_btn]
    )

    thumbs_up.click(
        show_appreciation,
        outputs=feedback_status
    )

    submit_feedback_btn.click(
        handle_feedback,
        inputs=feedback_box,
        outputs=feedback_status
    )

    # Run the pipeline only AFTER the clicked example has been copied into the
    # textbox; a parallel dataset.click listener can read the previous text.
    # Falls back to the dataset click if this Gradio build lacks the event.
    example_loaded = getattr(examples, "load_input_event", None)
    example_trigger = example_loaded.then if example_loaded is not None else examples.dataset.click

    # The same pipeline serves example clicks, the submit button and Enter.
    for register in (example_trigger, ask_btn.click, question.submit):
        _attach_answer_pipeline(register)
|
|
| |
if __name__ == "__main__":
    # Theme and stylesheet are passed to launch() rather than to gr.Blocks().
    launch_options = {
        "theme": gr.themes.Ocean(primary_hue="indigo", neutral_hue="slate"),
        "css": custom_css,
    }
    demo.launch(**launch_options)