gnokit committed on
Commit
43bc4f7
·
1 Parent(s): a850eda

initial version

Browse files
Files changed (6) hide show
  1. .gitignore +140 -0
  2. app.py +197 -0
  3. app_config.py +132 -0
  4. app_tools.py +288 -0
  5. open-data-json-list-english-v3.json +0 -0
  6. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+
53
+ # Translations
54
+ *.mo
55
+ *.pot
56
+
57
+ # Django stuff:
58
+ *.log
59
+ local_settings.py
60
+ db.sqlite3
61
+ db.sqlite3-journal
62
+
63
+ # Flask stuff:
64
+ instance/
65
+ .webassets-cache
66
+
67
+ # Scrapy stuff:
68
+ .scrapy
69
+
70
+ # Sphinx documentation
71
+ docs/_build/
72
+ build/
73
+ tmp/
74
+
75
+ # PyBuilder
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ .python-version
87
+
88
+ # pipenv
89
+ Pipfile.lock
90
+
91
+ # poetry
92
+ poetry.lock
93
+
94
+ # PEP 582; used by e.g. pyflow and pdm
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ .gradio
108
+ env/
109
+ venv/
110
+ ENV/
111
+ env.bak/
112
+ venv.bak/
113
+ cache/
114
+
115
+ # Spyder project settings
116
+ .spyderproject
117
+ .spyderworkspace
118
+
119
+ # Rope project settings
120
+ .ropeproject
121
+
122
+ # mkdocs documentation
123
+ /site
124
+
125
+ # mypy
126
+ .mypy_cache/
127
+ .dmypy.json
128
+ dmypy.json
129
+
130
+ # Pyre type checker
131
+ .pyre/
132
+
133
+ # pytype static type analyzer
134
+ .pytype/
135
+
136
+ # Cython debug symbols
137
+ cython_debug/
138
+
139
+ # VS Code settings
140
+ .vscode/
app.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import logging
3
+ from app_config import create_agent, create_task, get_app_info
4
+ from smolagents import MultiStepAgent
5
+ from smolagents.gradio_ui import GradioUI, stream_to_gradio
6
+
7
+ # Configure logging
8
+ logging.basicConfig(
9
+ level=logging.INFO,
10
+ format='%(asctime)s - %(levelname)s - %(message)s',
11
+ datefmt='%Y-%m-%d %H:%M:%S'
12
+ )
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class EggTartAgentUI(GradioUI):
    """Gradio UI for the Egg Tart agent.

    Extends smolagents' GradioUI with a sidebar (description, question box,
    example questions), a streaming chat panel, and a hidden button that
    exposes the agent as an MCP tool endpoint.
    """

    def __init__(self, agent, app_name, app_description, example_questions):
        """
        Args:
            agent: The smolagents agent instance used to answer questions.
            app_name (str): Title shown in the sidebar and browser tab.
            app_description (str): Markdown blurb shown in the sidebar.
            example_questions (list[str]): One-click sample questions.
        """
        super().__init__(agent)
        self.name = app_name
        self.description = app_description
        self.example_questions = example_questions

    def interact_with_agent(self, prompt, messages, session_state):
        """
        Stream the agent's answer for *prompt* into the chat history.

        Yields the growing ``messages`` list after each streamed chunk so
        Gradio re-renders incrementally.
        """
        # Attach the shared agent to this session on first use.
        if "agent" not in session_state:
            session_state["agent"] = self.agent

        try:
            messages.append(gr.ChatMessage(role="user", content=prompt, metadata={"status": "done"}))
            yield messages

            task = create_task(question=prompt)
            for msg in stream_to_gradio(session_state["agent"], task=task, reset_agent_memory=False):
                if isinstance(msg, gr.ChatMessage):
                    # A complete message closes out any pending streamed text.
                    messages[-1].metadata["status"] = "done"
                    messages.append(msg)
                elif isinstance(msg, str):  # Then it's only a completion delta
                    msg = msg.replace("<", r"\<").replace(">", r"\>")  # HTML tags seem to break Gradio Chatbot
                    if messages[-1].metadata["status"] == "pending":
                        # Replace the in-progress assistant message text.
                        messages[-1].content = msg
                    else:
                        messages.append(gr.ChatMessage(role="assistant", content=msg, metadata={"status": "pending"}))
                yield messages

            yield messages
        except Exception as e:
            yield messages
            raise gr.Error(f"Error in interaction: {str(e)}")

    def ask_egg_tart_agent(self, question: str) -> str:
        """
        Search for information about Hong Kong based on the user's question.

        Args:
            question (str): The user's question about Hong Kong.

        Returns:
            str: A concise answer based on the datasets available, formatted in markdown.
        """
        logger.info(f"New task received: {question}")
        # Fixed: use self.agent rather than the module-level global `agent`,
        # which only existed when the script was run as __main__.
        response = self.agent.run(
            task=create_task(question=question),
            max_steps=15
        )

        return response.strip()

    def create_app(self):
        """
        Build and return the Gradio Blocks app: sidebar inputs plus the
        streaming chat panel, with submit/click handlers wired up.
        """
        with gr.Blocks(theme=gr.themes.Ocean(), fill_height=True, title=self.name) as demo:
            # Session-scoped state (per browser tab).
            session_state = gr.State({})
            stored_messages = gr.State([])
            file_uploads_log = gr.State([])

            with gr.Sidebar():
                gr.Markdown(
                    f"# {self.name}\n\n{self.description}"
                )

                with gr.Group():
                    gr.Markdown("**Your Question**", container=True)
                    text_input = gr.Textbox(
                        lines=3,
                        label="Chat Message",
                        container=False,
                        placeholder="Enter your question here and press Shift+Enter or press the button",
                    )
                    submit_btn = gr.Button("Submit", variant="primary")

                # If an upload folder is provided, enable the upload feature
                if self.file_upload_folder is not None:
                    upload_file = gr.File(label="Upload a file")
                    upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
                    upload_file.change(
                        self.upload_file,
                        [upload_file, file_uploads_log],
                        [upload_status, file_uploads_log],
                    )

                gr.Examples(self.example_questions, text_input)

                # Hidden button/output pair: invisible in the UI but exported
                # via show_api=True so the agent is callable as an MCP tool.
                mcp_btn = gr.Button(
                    "MCP Server Trigger",
                    variant="stop",
                    visible=False
                )

                mcp_result = gr.Markdown(
                    visible=False,
                )

                gr.HTML(
                    "<br><br><h4><center>Powered by <a target='_blank' href='https://github.com/huggingface/smolagents'><b>smolagents</b></a></center></h4>"
                )

            # Main chat interface
            chatbot = gr.Chatbot(
                label="Running Log",
                type="messages",
                avatar_images=(
                    None,
                    # NOTE(review): this icon is not among the files added by
                    # this commit - confirm it exists at runtime.
                    "./egg_tart_icon.png",
                ),
                resizeable=True,
                scale=1,
                latex_delimiters=[
                    {"left": r"$$", "right": r"$$", "display": True},
                    {"left": r"$", "right": r"$", "display": False},
                    {"left": r"\[", "right": r"\]", "display": True},
                    {"left": r"\(", "right": r"\)", "display": False},
                ],
            )

            # Keyboard submit: log message, stream the answer, re-enable inputs.
            text_input.submit(
                self.log_user_message,
                [text_input, file_uploads_log],
                [stored_messages, text_input, submit_btn],
                show_api=False,
            ).then(
                self.interact_with_agent,
                [stored_messages, chatbot, session_state],
                [chatbot],
                show_api=False,
            ).then(
                lambda: (
                    gr.Textbox(
                        interactive=True, placeholder="Enter your question here and press Shift+Enter or the button"
                    ),
                    gr.Button(interactive=True),
                ),
                None,
                [text_input, submit_btn],
                show_api=False,
            )

            # API-only entry point (MCP): one-shot question -> markdown answer.
            mcp_btn.click(
                self.ask_egg_tart_agent,
                [text_input],
                [mcp_result],
                show_api=True
            )

            # Button submit: identical pipeline to text_input.submit above.
            submit_btn.click(
                self.log_user_message,
                [text_input, file_uploads_log],
                [stored_messages, text_input, submit_btn],
                show_api=False,
            ).then(
                self.interact_with_agent,
                [stored_messages, chatbot, session_state],
                [chatbot],
                show_api=False,
            ).then(
                lambda: (
                    gr.Textbox(
                        interactive=True, placeholder="Enter your question here and press Shift+Enter or the button"
                    ),
                    gr.Button(interactive=True),
                ),
                None,
                [text_input, submit_btn],
                show_api=False,
            )

        return demo
190
+
191
+ if __name__ == "__main__":
192
+ logger.info("Starting Egg Tart Agent app initialization")
193
+ agent = create_agent()
194
+ app_name, app_description, example_questions = get_app_info()
195
+ logger.info(f"Egg Tart Agent initialized with app name: {app_name}")
196
+ ui = EggTartAgentUI(agent, app_name, app_description, example_questions)
197
+ ui.launch(share=False, mcp_server=True)
app_config.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from datetime import datetime
4
+ from zoneinfo import ZoneInfo
5
+ from dotenv import load_dotenv
6
+ from app_tools import DatasetSearchTool, ExploreDatasetTool, FetchContentTool, ReadDataDictionaryTool
7
+ from smolagents import CodeAgent, OpenAIServerModel
8
+
9
+ # Configure logging
10
+ logging.basicConfig(
11
+ level=logging.INFO,
12
+ format='%(asctime)s - %(levelname)s - %(message)s',
13
+ datefmt='%Y-%m-%d %H:%M:%S'
14
+ )
15
+ logger = logging.getLogger(__name__)
16
+
17
+ load_dotenv()
18
+
19
+ model_id = "Qwen/Qwen2.5-Coder-32B-Instruct"
20
+ app_name = "Egg Tart - 蛋撻仔 🍮"
21
+ app_description = f"""
22
+ Got a question about Hong Kong? Egg Tart - 蛋撻仔 🍮 is here to serve up answers with a smile—and a sprinkle of fun!
23
+ Powered by {model_id}, Explore 1,800 datasets from [data.gov.hk](https://data.gov.hk/en/).
24
+
25
+ ### 🍮 What I Can Help With:
26
+ - 🌦️ Weather & Air Quality
27
+ - 🚇 Transportation & Traffic
28
+ - 📊 Population & Demographics
29
+ - 📅 Public Holidays & Events
30
+ - 💰 Economic Data & Statistics
31
+ """
32
+
33
+ prompt = """
34
+ You are Egg Tart - 蛋撻仔 🍮, a friendly and knowledgeable assistant specializing in answering questions about Hong Kong using datasets from data.gov.hk.
35
+ Assume all user questions relate to Hong Kong and the current year. Today is {date}.
36
+
37
+ **Instructions:**
38
+
39
+ 1. Read the user's question carefully to understand their intent, keywords, and any relevant context or dates.
40
+ 2. Use `dataset_search_tool` to find datasets related to the user's question.
41
+ 3. For each promising dataset, use its 'Dataset ID' with `explore_dataset_tool` to get detailed metadata.
42
+ 4. Find the dataset's download URL in the metadata.
43
+ 5. If a download URL exists, use `fetch_content_tool` to get the dataset content.
44
+ 6. If fetching fails, use the data_dictionary URL from `explore_dataset_tool` and call `read_data_dictionary_tool` to learn how to construct or use the download URL.
45
+ 7. Use the data dictionary to compose the correct download URL if needed.
46
+ 8. Examine the dataset content and extract the most relevant, up-to-date information to answer the user's question.
47
+ 9. Write a concise, friendly markdown response in English (about 100 words) that includes:
48
+ - A heading restating the user's question in a welcoming tone.
49
+ - A clear, accurate answer with emojis where appropriate.
50
+ - Dates in the format: Weekday, Month Day, Year (e.g., Monday, June 10, 2024).
51
+ - A source citation with the dataset name, download URL, and the dataset's information date.
52
+ 10. If no suitable dataset is found, politely inform the user and suggest exploring data.gov.hk for more information.
53
+ 11. Briefly explain your reasoning step-by-step if relevant.
54
+
55
+ **Formatting Example:**
56
+
57
+ ### How many people live in Hong Kong?
58
+ 🏙️ The estimated population is around 7.5 million people, including both permanent and non-permanent residents. The number has remained stable in recent years, with minor changes due to migration and demographics. For more details by age or district, you can explore additional datasets from the government portal.
59
+
60
+ Source: [Hong Kong Population Estimates](https://data.gov.hk/dataset/population-estimates.csv) - Monday, June 10, 2024
61
+
62
+ **User Query:**
63
+ {query}
64
+ """
65
+
66
+ # Add these example questions near the top of your file
67
+ example_questions = [
68
+ "Should I bring an umbrella tomorrow in Hong Kong?",
69
+ "What is the route of Kowloon Motor Bus route 45? Please provide the starting point and the destination.",
70
+ "I want to plan a long vacation in 2025 by combining consecutive public holidays, Saturdays, and Sundays (since I don’t work weekends). Can you suggest the best 3 periods to maximize my time off? ",
71
+ "What is the latest unemployment rate in Hong Kong?",
72
+ "When is the next MTR train at Mong Kok on the Tsuen Wan line?"
73
+ ]
74
+
75
def get_app_info():
    """Return the static UI metadata as a (name, description, examples) tuple."""
    return (app_name, app_description, example_questions)
77
+
78
def create_task(question):
    """
    Generates a formatted task instruction for a given question, including the current date.

    Args:
        question (str): The question or query to be included in the task.

    Returns:
        str: The formatted task instruction string with the question and today's date.
    """
    logger.info(f"Creating new task: {question}")
    # Anchor "today" to Hong Kong local time, since every question is about HK.
    hk_now = datetime.now(ZoneInfo("Asia/Hong_Kong"))
    stamped = hk_now.strftime("%A, %B %d, %Y, %I:%M %p %Z")
    return prompt.format(query=question, date=stamped)
92
+
93
def create_agent(*, temperature=0.2, max_tokens=32000):
    """
    Creates and configures a CodeAgent instance with a specified language model and a set of tools.

    The agent is initialized with:
    - An OpenAIServerModel using the Nebius API, with parameters such as model_id, API key, temperature, and max_tokens.
    - A list of tools for dataset search, exploration, content fetching, and reading data dictionaries.
    - Additional authorized imports (e.g. "json").
    - Base tools are not added by default.

    Args:
        temperature (float): Sampling temperature for the model. Defaults to 0.2.
        max_tokens (int): Maximum tokens per model response. Defaults to 32000.

    Returns:
        CodeAgent: An instance of CodeAgent configured with the specified model and tools.
    """
    logger.info(f"Initializing {app_name} with model: {model_id}")

    api_key = os.getenv("NEBIUS_API_KEY")
    if not api_key:
        # Surface a clear diagnostic now instead of an opaque auth error on
        # the first model call.
        logger.warning("NEBIUS_API_KEY is not set; model calls will fail")

    # Create model instance
    model = OpenAIServerModel(
        model_id=model_id,
        api_base="https://api.studio.nebius.com/v1/",
        api_key=api_key,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    # Create tools list
    tools = [
        DatasetSearchTool(),
        ExploreDatasetTool(),
        FetchContentTool(),
        ReadDataDictionaryTool(),
    ]

    agent = CodeAgent(
        tools=tools,
        additional_authorized_imports=["json"],
        model=model,
        add_base_tools=False,
    )

    return agent
app_tools.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import Tool
2
+ from qdrant_client import QdrantClient, models
3
+ from fastembed import TextEmbedding
4
+ import json
5
+ from typing import List, Dict, Any
6
+ import requests
7
+ from docling.document_converter import DocumentConverter
8
+ import os
9
+
10
class ReadDataDictionaryTool(Tool):
    """Fetches a dataset's data-dictionary document (often a PDF), converts it
    to markdown with docling, and caches the result on disk."""

    name = "read_data_dictionary_tool"
    description = """Reads the data dictionary url from a dataset and returns its content in markdown format."""
    inputs = {
        "data_dictionary_url": {
            "type": "string",
            "description": "The URL of the data dictionary to read.",
        }
    }
    output_type = "string"

    def __init__(self):
        """
        Initializes the ReadDataDictionaryTool.
        This tool fetches the content of a data dictionary from a given URL.
        """
        super().__init__()
        self.converter = DocumentConverter()
        # create ./cache directory if not exists (conversion is expensive)
        os.makedirs('./cache', exist_ok=True)

    def _cache_path(self, data_dictionary_url: str) -> str:
        """Map a URL to its cache file ('/' and ':' are not filename-safe)."""
        return f'./cache/{data_dictionary_url.replace("/", "_").replace(":", "_")}.md'

    def get_cache(self, data_dictionary_url: str) -> str:
        """
        Get cached markdown content of data_dictionary_url, or None on a miss.
        """
        cache_file = self._cache_path(data_dictionary_url)
        if os.path.exists(cache_file):
            # Explicit UTF-8: the markdown may contain non-ASCII text and the
            # platform default encoding (e.g. cp1252 on Windows) would fail.
            with open(cache_file, 'r', encoding='utf-8') as f:
                return f.read()
        return None

    def set_cache(self, data_dictionary_url: str, content: str):
        """
        Set cached markdown content of data_dictionary_url.
        """
        with open(self._cache_path(data_dictionary_url), 'w', encoding='utf-8') as f:
            f.write(content)

    def forward(self, data_dictionary_url: str) -> str:
        """
        Reads the data dictionary from a given URL.

        Args:
            data_dictionary_url (str): The URL of the data dictionary to read.

        Returns:
            str: The content of the data dictionary as markdown, or an error
            message if the fetch or conversion fails.
        """
        if not data_dictionary_url or not isinstance(data_dictionary_url, str):
            raise ValueError("Data dictionary URL must be a non-empty string")

        try:
            cached = self.get_cache(data_dictionary_url)
            if cached is None:
                result = self.converter.convert(data_dictionary_url)
                cached = result.document.export_to_markdown()
                self.set_cache(data_dictionary_url, cached)

            return cached
        except Exception as e:
            # DocumentConverter failures are not requests.RequestException, so
            # the original narrow clause let them escape; report all failures
            # as a tool-level error string instead, as the docstring promises.
            return f"Error fetching data dictionary: {str(e)}"
72
+
73
+ class DatasetSearchTool(Tool):
74
+ name = "dataset_search_tool"
75
+ description = """Searches the dataset matching the given query."""
76
+ MODEL_NAME = "BAAI/bge-small-en-v1.5"
77
+ COLLECTION_NAME = "open_data_dataset"
78
+ inputs = {
79
+ "query": {
80
+ "type": "string",
81
+ "description": "The search query string.",
82
+ },
83
+ "limit": {
84
+ "type": "integer",
85
+ "default": 5,
86
+ "description": "The maximum number of results to return. Defaults to 5. Must be between 1 and 10.",
87
+ "nullable": True,
88
+ }
89
+ }
90
+ output_type = "object"
91
+
92
+
93
+ def __init__(self):
94
+ """
95
+ Initializes the dataset search engine by setting up the Qdrant vector database, loading the embedding model,
96
+ and uploading dataset documents with their vector embeddings.
97
+
98
+ Steps performed:
99
+ - Initializes a QdrantClient with in-memory storage.
100
+ - Sets collection and model names.
101
+ - Loads a text embedding model.
102
+ - Creates a Qdrant collection with vector parameters based on the embedding model's output size.
103
+ - Loads dataset documents from a JSON file.
104
+ - Generates vector embeddings for each document's 'Resource Name'.
105
+ - Uploads the vectors and their corresponding payloads (documents) to the Qdrant collection.
106
+ """
107
+ super().__init__()
108
+ self.client = QdrantClient(":memory:")
109
+ self.embedding_model = TextEmbedding(self.MODEL_NAME)
110
+ self.client.create_collection(
111
+ collection_name=self.COLLECTION_NAME,
112
+ vectors_config=models.VectorParams(
113
+ size=self.client.get_embedding_size(self.MODEL_NAME),
114
+ distance=models.Distance.COSINE,
115
+ ),
116
+ )
117
+ data_path = './open-data-json-list-english-v3.json'
118
+ with open(data_path, 'r', encoding='utf-8') as file:
119
+ documents = json.load(file)
120
+ vectors = self.embedding_model.embed([self.__document_to_description(doc) for doc in documents])
121
+ payloads = documents
122
+ self.client.upload_collection(
123
+ collection_name=self.COLLECTION_NAME,
124
+ vectors=vectors,
125
+ payload=payloads
126
+ )
127
+
128
+ def __document_to_description(self, document: Dict[str, Any]) -> str:
129
+ """
130
+ Converts a document dictionary to a description string.
131
+
132
+ Args:
133
+ document (dict): The document dictionary containing dataset information.
134
+
135
+ Returns:
136
+ str: A formatted string describing the dataset.
137
+ """
138
+ description = document["Resource Name"] if document['Dataset Name'].lower().strip() in document['Resource Name'].lower().strip() else f'{document["Dataset Name"]} - {document["Resource Name"]}'
139
+ data_provider = document["Data Provider"]
140
+ data_category = document["Category"]
141
+ return f"{description}, provided by {data_provider}, categorized under \"{data_category}\""
142
+
143
+ def forward(self, query: str, limit: int = 5) -> List[Dict[str, Any]]:
144
+ """
145
+ Searches the dataset matching the given query.
146
+
147
+ Args:
148
+ query (str): The search query string.
149
+ limit (int, optional): The maximum number of results to return. Defaults to 5.
150
+
151
+ Returns:
152
+ dict: A list of dictionaries, each containing:
153
+ - "dataset": The details of the matched dataset.
154
+ - "score": The relevance score of the matched dataset.
155
+ """
156
+ if not query or not isinstance(query, str):
157
+ raise ValueError("Query must be a non-empty string")
158
+ if limit < 1 or limit > 10:
159
+ raise ValueError("Limit must be between 1 and 10")
160
+
161
+ search_result = self.client.query_points(
162
+ collection_name=self.COLLECTION_NAME,
163
+ query=models.Document(
164
+ text=query,
165
+ model=self.MODEL_NAME
166
+ ),
167
+ limit=limit
168
+ ).points
169
+
170
+ return [ { "dataset": point.payload, "score": point.score } for point in search_result ]
171
+
172
+ def __del__(self):
173
+ """Cleanup resources when the object is destroyed."""
174
+ if hasattr(self, 'client'):
175
+ self.client.close()
176
+
177
+
178
class ExploreDatasetTool(Tool):
    """Fetches dataset metadata from the data.gov.hk CKAN API and condenses it
    to the fields most useful for the agent."""

    name = "explore_dataset_tool"
    description = """Fetches detailed metadata for a dataset from data.gov.hk using its dataset ID."""
    inputs = {
        "dataset_id": {
            "type": "string",
            "description": "The unique identifier of the dataset.",
        }
    }
    output_type = "object"

    def get_valuables(self, dataset_json):
        """
        Extracts the most valuable properties from a dataset JSON object.
        Returns a dictionary with key information for LLM execution.
        """
        # The CKAN API nests everything under 'result'; tolerate its absence
        # (e.g. an error payload) instead of raising KeyError.
        result_json = dataset_json.get('result') or {}
        result = {}

        # Basic dataset info
        result['id'] = result_json.get('id')
        result['name'] = result_json.get('name')
        result['notes'] = result_json.get('notes')
        result['title'] = result_json.get('title')
        # 'description' intentionally duplicates 'notes' so callers can use
        # either key (kept for output-shape compatibility).
        result['description'] = result_json.get('notes')
        result['update_frequency'] = result_json.get('update_frequency')
        result['data_dictionary'] = result_json.get('data_dictionary')

        # Resources: only keep key fields for each resource, and filter for format 'JSON' only
        valuable_resource_fields = ['name', 'description', 'format', 'inLanguage', 'url']
        result['resources'] = []
        for res in result_json.get('resources', []):
            if res.get('format', '').upper() == 'JSON':
                filtered_res = {k: res.get(k) for k in valuable_resource_fields if k in res}
                result['resources'].append(filtered_res)

        return result


    def forward(self, dataset_id: str) -> dict:
        """
        Fetch detailed metadata for a dataset from data.gov.hk using its dataset ID.

        Args:
            dataset_id (str): The unique identifier of the dataset.

        Returns:
            dict: The dataset metadata as returned by the API, or an error message if the fetch fails.

        Example:
            >>> tool = ExploreDatasetTool()
            >>> info = tool(dataset_id='hk-hko-rss-9-day-weather-forecast')
            >>> print(info)
        """
        if not dataset_id or not isinstance(dataset_id, str):
            raise ValueError("Dataset ID must be a non-empty string")

        url = f"https://data.gov.hk/en-data/api/3/action/package_show?id={dataset_id}"
        try:
            response = requests.get(url, timeout=10)  # Added timeout
            response.raise_for_status()  # Raises an HTTPError for bad responses
            # ValueError also covers a non-JSON body from response.json(),
            # which previously escaped the handler.
            return self.get_valuables(response.json())
        except (requests.RequestException, ValueError) as e:
            return {
                "error": f"Error fetching dataset: {str(e)}"
            }
243
+
244
+
245
class FetchContentTool(Tool):
    """Downloads a URL and returns its body, accepting JSON responses only."""

    name = "fetch_content_tool"
    description = """Fetches the content from a given URL."""
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL to fetch content from.",
        }
    }
    output_type = "string"

    def forward(self, url: str) -> str:
        """
        Fetch the content of a URL.

        Args:
            url (str): The URL to fetch content from.

        Returns:
            str: The content of the URL as text, or an error message if the fetch fails.
            If the response is JSON, returns the JSON as a string.

        Example:
            >>> tool = FetchContentTool()
            >>> content = tool(url='https://example.com')
            >>> print(content)
        """
        if not url or not isinstance(url, str):
            raise ValueError("URL must be a non-empty string")

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            return f"Error fetching content: {str(e)}"

        # Only JSON bodies are passed through; anything else is reported as
        # an error string so the agent can fall back to the data dictionary.
        content_type = response.headers.get("Content-Type", "")
        if "application/json" not in content_type:
            return f"Error fetching content: Expected JSON, but got {content_type}"
        return response.text
284
+
285
+ # Update the main section
286
+ if __name__ == "__main__":
287
+ read_data_dictionary_tool = ReadDataDictionaryTool()
288
+ print(read_data_dictionary_tool("https://opendata.mtr.com.hk/doc/Next_Train_DataDictionary_v1.7.pdf"))
open-data-json-list-english-v3.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ smolagents[openai,mcp]
2
+ qdrant-client[fastembed]
3
+ python-dotenv
4
+ docling
5
+ gradio