gnokit committed on
Commit
43bc4f7
·
1 Parent(s): a850eda

initial version

Browse files
Files changed (6) hide show
  1. .gitignore +140 -0
  2. app.py +197 -0
  3. app_config.py +132 -0
  4. app_tools.py +288 -0
  5. open-data-json-list-english-v3.json +0 -0
  6. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+
53
+ # Translations
54
+ *.mo
55
+ *.pot
56
+
57
+ # Django stuff:
58
+ *.log
59
+ local_settings.py
60
+ db.sqlite3
61
+ db.sqlite3-journal
62
+
63
+ # Flask stuff:
64
+ instance/
65
+ .webassets-cache
66
+
67
+ # Scrapy stuff:
68
+ .scrapy
69
+
70
+ # Sphinx documentation
71
+ docs/_build/
72
+ build/
73
+ tmp/
74
+
75
+ # PyBuilder
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ .python-version
87
+
88
+ # pipenv
89
+ Pipfile.lock
90
+
91
+ # poetry
92
+ poetry.lock
93
+
94
+ # PEP 582; used by e.g. pyflow and pdm
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ .gradio
108
+ env/
109
+ venv/
110
+ ENV/
111
+ env.bak/
112
+ venv.bak/
113
+ cache/
114
+
115
+ # Spyder project settings
116
+ .spyderproject
117
+ .spyderworkspace
118
+
119
+ # Rope project settings
120
+ .ropeproject
121
+
122
+ # mkdocs documentation
123
+ /site
124
+
125
+ # mypy
126
+ .mypy_cache/
127
+ .dmypy.json
128
+ dmypy.json
129
+
130
+ # Pyre type checker
131
+ .pyre/
132
+
133
+ # pytype static type analyzer
134
+ .pytype/
135
+
136
+ # Cython debug symbols
137
+ cython_debug/
138
+
139
+ # VS Code settings
140
+ .vscode/
app.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import logging
3
+ from app_config import create_agent, create_task, get_app_info
4
+ from smolagents import MultiStepAgent
5
+ from smolagents.gradio_ui import GradioUI, stream_to_gradio
6
+
7
+ # Configure logging
8
+ logging.basicConfig(
9
+ level=logging.INFO,
10
+ format='%(asctime)s - %(levelname)s - %(message)s',
11
+ datefmt='%Y-%m-%d %H:%M:%S'
12
+ )
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class EggTartAgentUI(GradioUI):
    """Gradio UI for the Egg Tart agent.

    Extends smolagents' GradioUI with a sidebar (description, question box,
    example questions), a streaming chat panel, and a hidden button that
    exposes the agent as an MCP tool endpoint.
    """

    def __init__(self, agent, app_name, app_description, example_questions):
        """
        Args:
            agent: The smolagents agent instance used to answer questions.
            app_name (str): Title shown in the sidebar and browser tab.
            app_description (str): Markdown blurb shown in the sidebar.
            example_questions (list[str]): One-click sample questions.
        """
        super().__init__(agent)
        self.name = app_name
        self.description = app_description
        self.example_questions = example_questions

    def interact_with_agent(self, prompt, messages, session_state):
        """
        Stream the agent's answer for *prompt* into the chat history.

        Yields the growing ``messages`` list after each streamed chunk so
        Gradio re-renders incrementally.
        """
        # Attach the shared agent to this session on first use.
        if "agent" not in session_state:
            session_state["agent"] = self.agent

        try:
            messages.append(gr.ChatMessage(role="user", content=prompt, metadata={"status": "done"}))
            yield messages

            task = create_task(question=prompt)
            for msg in stream_to_gradio(session_state["agent"], task=task, reset_agent_memory=False):
                if isinstance(msg, gr.ChatMessage):
                    # A complete message closes out any pending streamed text.
                    messages[-1].metadata["status"] = "done"
                    messages.append(msg)
                elif isinstance(msg, str):  # Then it's only a completion delta
                    msg = msg.replace("<", r"\<").replace(">", r"\>")  # HTML tags seem to break Gradio Chatbot
                    if messages[-1].metadata["status"] == "pending":
                        # Replace the in-progress assistant message text.
                        messages[-1].content = msg
                    else:
                        messages.append(gr.ChatMessage(role="assistant", content=msg, metadata={"status": "pending"}))
                yield messages

            yield messages
        except Exception as e:
            yield messages
            raise gr.Error(f"Error in interaction: {str(e)}")

    def ask_egg_tart_agent(self, question: str) -> str:
        """
        Search for information about Hong Kong based on the user's question.

        Args:
            question (str): The user's question about Hong Kong.

        Returns:
            str: A concise answer based on the datasets available, formatted in markdown.
        """
        logger.info(f"New task received: {question}")
        # Fixed: use self.agent rather than the module-level global `agent`,
        # which only existed when the script was run as __main__.
        response = self.agent.run(
            task=create_task(question=question),
            max_steps=15
        )

        return response.strip()

    def create_app(self):
        """
        Build and return the Gradio Blocks app: sidebar inputs plus the
        streaming chat panel, with submit/click handlers wired up.
        """
        with gr.Blocks(theme=gr.themes.Ocean(), fill_height=True, title=self.name) as demo:
            # Session-scoped state (per browser tab).
            session_state = gr.State({})
            stored_messages = gr.State([])
            file_uploads_log = gr.State([])

            with gr.Sidebar():
                gr.Markdown(
                    f"# {self.name}\n\n{self.description}"
                )

                with gr.Group():
                    gr.Markdown("**Your Question**", container=True)
                    text_input = gr.Textbox(
                        lines=3,
                        label="Chat Message",
                        container=False,
                        placeholder="Enter your question here and press Shift+Enter or press the button",
                    )
                    submit_btn = gr.Button("Submit", variant="primary")

                # If an upload folder is provided, enable the upload feature
                if self.file_upload_folder is not None:
                    upload_file = gr.File(label="Upload a file")
                    upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
                    upload_file.change(
                        self.upload_file,
                        [upload_file, file_uploads_log],
                        [upload_status, file_uploads_log],
                    )

                gr.Examples(self.example_questions, text_input)

                # Hidden button/output pair: invisible in the UI but exported
                # via show_api=True so the agent is callable as an MCP tool.
                mcp_btn = gr.Button(
                    "MCP Server Trigger",
                    variant="stop",
                    visible=False
                )

                mcp_result = gr.Markdown(
                    visible=False,
                )

                gr.HTML(
                    "<br><br><h4><center>Powered by <a target='_blank' href='https://github.com/huggingface/smolagents'><b>smolagents</b></a></center></h4>"
                )

            # Main chat interface
            chatbot = gr.Chatbot(
                label="Running Log",
                type="messages",
                avatar_images=(
                    None,
                    # NOTE(review): this icon is not among the files added by
                    # this commit - confirm it exists at runtime.
                    "./egg_tart_icon.png",
                ),
                resizeable=True,
                scale=1,
                latex_delimiters=[
                    {"left": r"$$", "right": r"$$", "display": True},
                    {"left": r"$", "right": r"$", "display": False},
                    {"left": r"\[", "right": r"\]", "display": True},
                    {"left": r"\(", "right": r"\)", "display": False},
                ],
            )

            # Keyboard submit: log message, stream the answer, re-enable inputs.
            text_input.submit(
                self.log_user_message,
                [text_input, file_uploads_log],
                [stored_messages, text_input, submit_btn],
                show_api=False,
            ).then(
                self.interact_with_agent,
                [stored_messages, chatbot, session_state],
                [chatbot],
                show_api=False,
            ).then(
                lambda: (
                    gr.Textbox(
                        interactive=True, placeholder="Enter your question here and press Shift+Enter or the button"
                    ),
                    gr.Button(interactive=True),
                ),
                None,
                [text_input, submit_btn],
                show_api=False,
            )

            # API-only entry point (MCP): one-shot question -> markdown answer.
            mcp_btn.click(
                self.ask_egg_tart_agent,
                [text_input],
                [mcp_result],
                show_api=True
            )

            # Button submit: identical pipeline to text_input.submit above.
            submit_btn.click(
                self.log_user_message,
                [text_input, file_uploads_log],
                [stored_messages, text_input, submit_btn],
                show_api=False,
            ).then(
                self.interact_with_agent,
                [stored_messages, chatbot, session_state],
                [chatbot],
                show_api=False,
            ).then(
                lambda: (
                    gr.Textbox(
                        interactive=True, placeholder="Enter your question here and press Shift+Enter or the button"
                    ),
                    gr.Button(interactive=True),
                ),
                None,
                [text_input, submit_btn],
                show_api=False,
            )

        return demo
190
+
191
+ if __name__ == "__main__":
192
+ logger.info("Starting Egg Tart Agent app initialization")
193
+ agent = create_agent()
194
+ app_name, app_description, example_questions = get_app_info()
195
+ logger.info(f"Egg Tart Agent initialized with app name: {app_name}")
196
+ ui = EggTartAgentUI(agent, app_name, app_description, example_questions)
197
+ ui.launch(share=False, mcp_server=True)
app_config.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from datetime import datetime
4
+ from zoneinfo import ZoneInfo
5
+ from dotenv import load_dotenv
6
+ from app_tools import DatasetSearchTool, ExploreDatasetTool, FetchContentTool, ReadDataDictionaryTool
7
+ from smolagents import CodeAgent, OpenAIServerModel
8
+
9
+ # Configure logging
10
+ logging.basicConfig(
11
+ level=logging.INFO,
12
+ format='%(asctime)s - %(levelname)s - %(message)s',
13
+ datefmt='%Y-%m-%d %H:%M:%S'
14
+ )
15
+ logger = logging.getLogger(__name__)
16
+
17
+ load_dotenv()
18
+
19
+ model_id = "Qwen/Qwen2.5-Coder-32B-Instruct"
20
+ app_name = "Egg Tart - 蛋撻仔 🍮"
21
+ app_description = f"""
22
+ Got a question about Hong Kong? Egg Tart - 蛋撻仔 🍮 is here to serve up answers with a smile—and a sprinkle of fun!
23
+ Powered by {model_id}, Explore 1,800 datasets from [data.gov.hk](https://data.gov.hk/en/).
24
+
25
+ ### 🍮 What I Can Help With:
26
+ - 🌦️ Weather & Air Quality
27
+ - 🚇 Transportation & Traffic
28
+ - 📊 Population & Demographics
29
+ - 📅 Public Holidays & Events
30
+ - 💰 Economic Data & Statistics
31
+ """
32
+
33
+ prompt = """
34
+ You are Egg Tart - 蛋撻仔 🍮, a friendly and knowledgeable assistant specializing in answering questions about Hong Kong using datasets from data.gov.hk.
35
+ Assume all user questions relate to Hong Kong and the current year. Today is {date}.
36
+
37
+ **Instructions:**
38
+
39
+ 1. Read the user's question carefully to understand their intent, keywords, and any relevant context or dates.
40
+ 2. Use `dataset_search_tool` to find datasets related to the user's question.
41
+ 3. For each promising dataset, use its 'Dataset ID' with `explore_dataset_tool` to get detailed metadata.
42
+ 4. Find the dataset's download URL in the metadata.
43
+ 5. If a download URL exists, use `fetch_content_tool` to get the dataset content.
44
+ 6. If fetching fails, use the data_dictionary URL from `explore_dataset_tool` and call `read_data_dictionary_tool` to learn how to construct or use the download URL.
45
+ 7. Use the data dictionary to compose the correct download URL if needed.
46
+ 8. Examine the dataset content and extract the most relevant, up-to-date information to answer the user's question.
47
+ 9. Write a concise, friendly markdown response in English (about 100 words) that includes:
48
+ - A heading restating the user's question in a welcoming tone.
49
+ - A clear, accurate answer with emojis where appropriate.
50
+ - Dates in the format: Weekday, Month Day, Year (e.g., Monday, June 10, 2024).
51
+ - A source citation with the dataset name, download URL, and the dataset's information date.
52
+ 10. If no suitable dataset is found, politely inform the user and suggest exploring data.gov.hk for more information.
53
+ 11. Briefly explain your reasoning step-by-step if relevant.
54
+
55
+ **Formatting Example:**
56
+
57
+ ### How many people live in Hong Kong?
58
+ 🏙️ The estimated population is around 7.5 million people, including both permanent and non-permanent residents. The number has remained stable in recent years, with minor changes due to migration and demographics. For more details by age or district, you can explore additional datasets from the government portal.
59
+
60
+ Source: [Hong Kong Population Estimates](https://data.gov.hk/dataset/population-estimates.csv) - Monday, June 10, 2024
61
+
62
+ **User Query:**
63
+ {query}
64
+ """
65
+
66
+ # Add these example questions near the top of your file
67
+ example_questions = [
68
+ "Should I bring an umbrella tomorrow in Hong Kong?",
69
+ "What is the route of Kowloon Motor Bus route 45? Please provide the starting point and the destination.",
70
+ "I want to plan a long vacation in 2025 by combining consecutive public holidays, Saturdays, and Sundays (since I don’t work weekends). Can you suggest the best 3 periods to maximize my time off? ",
71
+ "What is the latest unemployment rate in Hong Kong?",
72
+ "When is the next MTR train at Mong Kok on the Tsuen Wan line?"
73
+ ]
74
+
75
def get_app_info():
    """Return the static UI metadata as a (name, description, examples) tuple."""
    return (app_name, app_description, example_questions)
77
+
78
def create_task(question):
    """
    Generates a formatted task instruction for a given question, including the current date.

    Args:
        question (str): The question or query to be included in the task.

    Returns:
        str: The formatted task instruction string with the question and today's date.
    """
    logger.info(f"Creating new task: {question}")
    # Anchor "today" to Hong Kong local time, since every question is about HK.
    hk_now = datetime.now(ZoneInfo("Asia/Hong_Kong"))
    stamped = hk_now.strftime("%A, %B %d, %Y, %I:%M %p %Z")
    return prompt.format(query=question, date=stamped)
92
+
93
def create_agent(*, temperature=0.2, max_tokens=32000):
    """
    Creates and configures a CodeAgent instance with a specified language model and a set of tools.

    The agent is initialized with:
    - An OpenAIServerModel using the Nebius API, with parameters such as model_id, API key, temperature, and max_tokens.
    - A list of tools for dataset search, exploration, content fetching, and reading data dictionaries.
    - Additional authorized imports (e.g. "json").
    - Base tools are not added by default.

    Args:
        temperature (float): Sampling temperature for the model. Defaults to 0.2.
        max_tokens (int): Maximum tokens per model response. Defaults to 32000.

    Returns:
        CodeAgent: An instance of CodeAgent configured with the specified model and tools.
    """
    logger.info(f"Initializing {app_name} with model: {model_id}")

    api_key = os.getenv("NEBIUS_API_KEY")
    if not api_key:
        # Surface a clear diagnostic now instead of an opaque auth error on
        # the first model call.
        logger.warning("NEBIUS_API_KEY is not set; model calls will fail")

    # Create model instance
    model = OpenAIServerModel(
        model_id=model_id,
        api_base="https://api.studio.nebius.com/v1/",
        api_key=api_key,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    # Create tools list
    tools = [
        DatasetSearchTool(),
        ExploreDatasetTool(),
        FetchContentTool(),
        ReadDataDictionaryTool(),
    ]

    agent = CodeAgent(
        tools=tools,
        additional_authorized_imports=["json"],
        model=model,
        add_base_tools=False,
    )

    return agent
app_tools.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import Tool
2
+ from qdrant_client import QdrantClient, models
3
+ from fastembed import TextEmbedding
4
+ import json
5
+ from typing import List, Dict, Any
6
+ import requests
7
+ from docling.document_converter import DocumentConverter
8
+ import os
9
+
10
class ReadDataDictionaryTool(Tool):
    """Fetches a dataset's data-dictionary document (often a PDF), converts it
    to markdown with docling, and caches the result on disk."""

    name = "read_data_dictionary_tool"
    description = """Reads the data dictionary url from a dataset and returns its content in markdown format."""
    inputs = {
        "data_dictionary_url": {
            "type": "string",
            "description": "The URL of the data dictionary to read.",
        }
    }
    output_type = "string"

    def __init__(self):
        """
        Initializes the ReadDataDictionaryTool.
        This tool fetches the content of a data dictionary from a given URL.
        """
        super().__init__()
        self.converter = DocumentConverter()
        # create ./cache directory if not exists (conversion is expensive)
        os.makedirs('./cache', exist_ok=True)

    def _cache_path(self, data_dictionary_url: str) -> str:
        """Map a URL to its cache file ('/' and ':' are not filename-safe)."""
        return f'./cache/{data_dictionary_url.replace("/", "_").replace(":", "_")}.md'

    def get_cache(self, data_dictionary_url: str) -> str:
        """
        Get cached markdown content of data_dictionary_url, or None on a miss.
        """
        cache_file = self._cache_path(data_dictionary_url)
        if os.path.exists(cache_file):
            # Explicit UTF-8: the markdown may contain non-ASCII text and the
            # platform default encoding (e.g. cp1252 on Windows) would fail.
            with open(cache_file, 'r', encoding='utf-8') as f:
                return f.read()
        return None

    def set_cache(self, data_dictionary_url: str, content: str):
        """
        Set cached markdown content of data_dictionary_url.
        """
        with open(self._cache_path(data_dictionary_url), 'w', encoding='utf-8') as f:
            f.write(content)

    def forward(self, data_dictionary_url: str) -> str:
        """
        Reads the data dictionary from a given URL.

        Args:
            data_dictionary_url (str): The URL of the data dictionary to read.

        Returns:
            str: The content of the data dictionary as markdown, or an error
            message if the fetch or conversion fails.
        """
        if not data_dictionary_url or not isinstance(data_dictionary_url, str):
            raise ValueError("Data dictionary URL must be a non-empty string")

        try:
            cached = self.get_cache(data_dictionary_url)
            if cached is None:
                result = self.converter.convert(data_dictionary_url)
                cached = result.document.export_to_markdown()
                self.set_cache(data_dictionary_url, cached)

            return cached
        except Exception as e:
            # DocumentConverter failures are not requests.RequestException, so
            # the original narrow clause let them escape; report all failures
            # as a tool-level error string instead, as the docstring promises.
            return f"Error fetching data dictionary: {str(e)}"
72
+
73
+ class DatasetSearchTool(Tool):
74
+ name = "dataset_search_tool"
75
+ description = """Searches the dataset matching the given query."""
76
+ MODEL_NAME = "BAAI/bge-small-en-v1.5"
77
+ COLLECTION_NAME = "open_data_dataset"
78
+ inputs = {
79
+ "query": {
80
+ "type": "string",
81
+ "description": "The search query string.",
82
+ },
83
+ "limit": {
84
+ "type": "integer",
85
+ "default": 5,
86
+ "description": "The maximum number of results to return. Defaults to 5. Must be between 1 and 10.",
87
+ "nullable": True,
88
+ }
89
+ }
90
+ output_type = "object"
91
+
92
+
93
+ def __init__(self):
94
+ """
95
+ Initializes the dataset search engine by setting up the Qdrant vector database, loading the embedding model,
96
+ and uploading dataset documents with their vector embeddings.
97
+
98
+ Steps performed:
99
+ - Initializes a QdrantClient with in-memory storage.
100
+ - Sets collection and model names.
101
+ - Loads a text embedding model.
102
+ - Creates a Qdrant collection with vector parameters based on the embedding model's output size.
103
+ - Loads dataset documents from a JSON file.
104
+ - Generates vector embeddings for each document's 'Resource Name'.
105
+ - Uploads the vectors and their corresponding payloads (documents) to the Qdrant collection.
106
+ """
107
+ super().__init__()
108
+ self.client = QdrantClient(":memory:")
109
+ self.embedding_model = TextEmbedding(self.MODEL_NAME)
110
+ self.client.create_collection(
111
+ collection_name=self.COLLECTION_NAME,
112
+ vectors_config=models.VectorParams(
113
+ size=self.client.get_embedding_size(self.MODEL_NAME),
114
+ distance=models.Distance.COSINE,
115
+ ),
116
+ )
117
+ data_path = './open-data-json-list-english-v3.json'
118
+ with open(data_path, 'r', encoding='utf-8') as file:
119
+ documents = json.load(file)
120
+ vectors = self.embedding_model.embed([self.__document_to_description(doc) for doc in documents])
121
+ payloads = documents
122
+ self.client.upload_collection(
123
+ collection_name=self.COLLECTION_NAME,
124
+ vectors=vectors,
125
+ payload=payloads
126
+ )
127
+
128
+ def __document_to_description(self, document: Dict[str, Any]) -> str:
129
+ """
130
+ Converts a document dictionary to a description string.
131
+
132
+ Args:
133
+ document (dict): The document dictionary containing dataset information.
134
+
135
+ Returns:
136
+ str: A formatted string describing the dataset.
137
+ """
138
+ description = document["Resource Name"] if document['Dataset Name'].lower().strip() in document['Resource Name'].lower().strip() else f'{document["Dataset Name"]} - {document["Resource Name"]}'
139
+ data_provider = document["Data Provider"]
140
+ data_category = document["Category"]
141
+ return f"{description}, provided by {data_provider}, categorized under \"{data_category}\""
142
+
143
+ def forward(self, query: str, limit: int = 5) -> List[Dict[str, Any]]:
144
+ """
145
+ Searches the dataset matching the given query.
146
+
147
+ Args:
148
+ query (str): The search query string.
149
+ limit (int, optional): The maximum number of results to return. Defaults to 5.
150
+
151
+ Returns:
152
+ dict: A list of dictionaries, each containing:
153
+ - "dataset": The details of the matched dataset.
154
+ - "score": The relevance score of the matched dataset.
155
+ """
156
+ if not query or not isinstance(query, str):
157
+ raise ValueError("Query must be a non-empty string")
158
+ if limit < 1 or limit > 10:
159
+ raise ValueError("Limit must be between 1 and 10")
160
+
161
+ search_result = self.client.query_points(
162
+ collection_name=self.COLLECTION_NAME,
163
+ query=models.Document(
164
+ text=query,
165
+ model=self.MODEL_NAME
166
+ ),
167
+ limit=limit
168
+ ).points
169
+
170
+ return [ { "dataset": point.payload, "score": point.score } for point in search_result ]
171
+
172
+ def __del__(self):
173
+ """Cleanup resources when the object is destroyed."""
174
+ if hasattr(self, 'client'):
175
+ self.client.close()
176
+
177
+
178
class ExploreDatasetTool(Tool):
    """Fetches dataset metadata from the data.gov.hk CKAN API and condenses it
    to the fields most useful for the agent."""

    name = "explore_dataset_tool"
    description = """Fetches detailed metadata for a dataset from data.gov.hk using its dataset ID."""
    inputs = {
        "dataset_id": {
            "type": "string",
            "description": "The unique identifier of the dataset.",
        }
    }
    output_type = "object"

    def get_valuables(self, dataset_json):
        """
        Extracts the most valuable properties from a dataset JSON object.
        Returns a dictionary with key information for LLM execution.
        """
        # The CKAN API nests everything under 'result'; tolerate its absence
        # (e.g. an error payload) instead of raising KeyError.
        result_json = dataset_json.get('result') or {}
        result = {}

        # Basic dataset info
        result['id'] = result_json.get('id')
        result['name'] = result_json.get('name')
        result['notes'] = result_json.get('notes')
        result['title'] = result_json.get('title')
        # 'description' intentionally duplicates 'notes' so callers can use
        # either key (kept for output-shape compatibility).
        result['description'] = result_json.get('notes')
        result['update_frequency'] = result_json.get('update_frequency')
        result['data_dictionary'] = result_json.get('data_dictionary')

        # Resources: only keep key fields for each resource, and filter for format 'JSON' only
        valuable_resource_fields = ['name', 'description', 'format', 'inLanguage', 'url']
        result['resources'] = []
        for res in result_json.get('resources', []):
            if res.get('format', '').upper() == 'JSON':
                filtered_res = {k: res.get(k) for k in valuable_resource_fields if k in res}
                result['resources'].append(filtered_res)

        return result


    def forward(self, dataset_id: str) -> dict:
        """
        Fetch detailed metadata for a dataset from data.gov.hk using its dataset ID.

        Args:
            dataset_id (str): The unique identifier of the dataset.

        Returns:
            dict: The dataset metadata as returned by the API, or an error message if the fetch fails.

        Example:
            >>> tool = ExploreDatasetTool()
            >>> info = tool(dataset_id='hk-hko-rss-9-day-weather-forecast')
            >>> print(info)
        """
        if not dataset_id or not isinstance(dataset_id, str):
            raise ValueError("Dataset ID must be a non-empty string")

        url = f"https://data.gov.hk/en-data/api/3/action/package_show?id={dataset_id}"
        try:
            response = requests.get(url, timeout=10)  # Added timeout
            response.raise_for_status()  # Raises an HTTPError for bad responses
            # ValueError also covers a non-JSON body from response.json(),
            # which previously escaped the handler.
            return self.get_valuables(response.json())
        except (requests.RequestException, ValueError) as e:
            return {
                "error": f"Error fetching dataset: {str(e)}"
            }
243
+
244
+
245
class FetchContentTool(Tool):
    """Downloads a URL and returns its body, accepting JSON responses only."""

    name = "fetch_content_tool"
    description = """Fetches the content from a given URL."""
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL to fetch content from.",
        }
    }
    output_type = "string"

    def forward(self, url: str) -> str:
        """
        Fetch the content of a URL.

        Args:
            url (str): The URL to fetch content from.

        Returns:
            str: The content of the URL as text, or an error message if the fetch fails.
            If the response is JSON, returns the JSON as a string.

        Example:
            >>> tool = FetchContentTool()
            >>> content = tool(url='https://example.com')
            >>> print(content)
        """
        if not url or not isinstance(url, str):
            raise ValueError("URL must be a non-empty string")

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            return f"Error fetching content: {str(e)}"

        # Only JSON bodies are passed through; anything else is reported as
        # an error string so the agent can fall back to the data dictionary.
        content_type = response.headers.get("Content-Type", "")
        if "application/json" not in content_type:
            return f"Error fetching content: Expected JSON, but got {content_type}"
        return response.text
284
+
285
+ # Update the main section
286
+ if __name__ == "__main__":
287
+ read_data_dictionary_tool = ReadDataDictionaryTool()
288
+ print(read_data_dictionary_tool("https://opendata.mtr.com.hk/doc/Next_Train_DataDictionary_v1.7.pdf"))
open-data-json-list-english-v3.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ smolagents[openai,mcp]
2
+ qdrant-client[fastembed]
3
+ python-dotenv
4
+ docling
5
+ gradio