alisamak committed (verified)

Commit 25b44c6 · 1 Parent(s): d19b1bb

Update tools.py

Files changed (1): tools.py (+115 -237)
tools.py CHANGED
@@ -1,42 +1,84 @@
-from langchain_core.tools import tool
-from urllib.parse import urlparse
 from duckduckgo_search import DDGS
 import wikipedia
-import requests
 import chess
 import chess.engine
 import sympy
-import fitz  # PyMuPDF
+import fitz
 import pandas as pd
 from imdb import IMDb
 from youtube_transcript_api import YouTubeTranscriptApi
 import yt_dlp
 import whisper
-from bs4 import BeautifulSoup
-import re
-import time
-from typing import Optional, List, Dict, Any
-import re
-from datetime import datetime, timedelta
-from langchain_core.tools import tool
-import logging
-from tavily import TavilyClient
-import os
 
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 
 client = TavilyClient(api_key=TAVILY_API_KEY)
 
-# Dictionary of known GAIA-style entities → canonical Wikipedia URLs
-WIKIPEDIA_PAGES = {
-    "mercedes sosa": "https://en.wikipedia.org/wiki/Mercedes_Sosa",
-    "summer olympics": "https://en.wikipedia.org/wiki/Summer_Olympic_Games",
-    "united nations": "https://en.wikipedia.org/wiki/United_Nations",
-    "pink floyd": "https://en.wikipedia.org/wiki/Pink_Floyd",
-    "chess": "https://en.wikipedia.org/wiki/Chess",
-    "dinosaur": "https://en.wikipedia.org/wiki/Dinosaur",
-    # ➕ add more GAIA topics here
-}
+@tool
+def handle_question(question: str) -> str:
+    """
+    Simple router for question types. Uses web_lookup as the default.
+    """
+    if "table" in question and "*" in question:
+        return detect_non_commutative_subset.run(question)
+    if "reverse" in question or "backwards" in question:
+        return reverse_sentence.run(question)
+    if "vegetables" in question:
+        return ", ".join(filter_vegetables.run(question.split(", ")))
+    return web_lookup.run(question)
+
+
+@tool
+def web_lookup(query: str) -> str:
+    """
+    Unified web search tool that:
+    - Uses the Tavily API to retrieve relevant snippets.
+    - Extracts the most relevant numeric or short factual answer.
+    - Falls back to Wikipedia if Tavily fails.
+
+    Args:
+        query (str): The user query or question.
+
+    Returns:
+        str: A concise factual answer extracted from Tavily or Wikipedia.
+    """
+    try:
+        # Step 1: Tavily search
+        response = client.search(query=query, search_depth="advanced", max_results=5)
+        snippets = [r["content"] for r in response.get("results", [])]
+
+        for s in snippets:
+            # Try to extract a meaningful answer (year, name, short fact)
+            match = re.search(r"\b(18|19|20)\d{2}\b", s)
+            if match:
+                return match.group()
+            elif len(s.split()) <= 12:
+                return s.strip()
+
+        # Step 2: Wikipedia fallback
+        # Guess the page title from the query
+        wiki_title = query.split(" ")[-1].capitalize()
+        wiki_url = f"https://en.wikipedia.org/wiki/{wiki_title}"
+        res = requests.get(wiki_url, timeout=10)
+        if res.status_code != 200:
+            return "❌ Wikipedia page not found."
+
+        soup = BeautifulSoup(res.text, "html.parser")
+        text = soup.get_text()
+        match = re.search(r"\b(18|19|20)\d{2}\b", text)
+        if match:
+            return match.group()
+
+        # Fall back to the first non-empty paragraph
+        paras = soup.find_all("p")
+        if paras:
+            for p in paras:
+                if p.get_text(strip=True):
+                    return p.get_text(strip=True)
+
+        return "❌ No relevant data found."
+    except Exception as e:
+        return f"❌ Error during web lookup: {str(e)}"
 
 @tool
 def extract_number_from_snippets(snippets: list[str]) -> Optional[int]:
@@ -64,115 +106,6 @@ def extract_number_from_snippets(snippets: list[str]) -> Optional[int]:
     return None
 
 
-@tool
-def tavily_search(query: str, k: int = 5) -> list[str]:
-    """
-    Perform a web search using the Tavily API and return up to k relevant snippets.
-    """
-    try:
-        response = client.search(query=query, search_depth="advanced", max_results=k)
-        return [r["content"] for r in response.get("results", [])]
-    except Exception as e:
-        return [f"❌ Error during Tavily search: {str(e)}"]
-
-@tool
-def get_article_nominator_from_fac_page(title: str) -> str:
-    """
-    Get the nominator of a Featured Article by scanning the main FAC page (not just archives).
-    """
-    base = "https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates"
-    url = f"{base}/{title}"
-    res = requests.get(url)
-    if res.status_code != 200:
-        return "Nominator not found"
-
-    soup = BeautifulSoup(res.text, "html.parser")
-    text = soup.get_text()
-
-    # Try direct pattern first
-    match = re.search(r"nominated by \[\[User:(.*?)\]\]", text, re.IGNORECASE)
-    if match:
-        return match.group(1).strip()
-
-    # Fallback: try to find first signed comment (e.g. --[[User:XYZ]])
-
-
-@tool
-def count_sosa_studio_albums_2000s() -> int:
-    """
-    Returns the number of studio albums by Mercedes Sosa released between 2000 and 2009 (inclusive).
-    Scrapes the 'Studio albums' section of her Wikipedia page.
-    """
-    import requests
-    from bs4 import BeautifulSoup
-    import re
-
-    url = "https://en.wikipedia.org/wiki/Mercedes_Sosa"
-    res = requests.get(url)
-    soup = BeautifulSoup(res.text, "html.parser")
-
-    albums = []
-    start_header = None
-
-    # Find the "Studio albums" header
-    for tag in soup.find_all(["h2", "h3"]):
-        if 'Studio albums' in tag.get_text():
-            start_header = tag
-            break
-
-    if not start_header:
-        return 0
-
-    # Loop over the siblings until we hit the next major section
-    for sibling in start_header.find_next_siblings():
-        if sibling.name in ["h2", "h3"]:
-            break  # stop at next section
-
-        if sibling.name == "ul":
-            for li in sibling.find_all("li"):
-                text = li.get_text()
-                match = re.search(r"\b(19|20)\d{2}\b", text)
-                if match:
-                    year = int(match.group())
-                    if 2000 <= year <= 2009:
-                        albums.append(text.strip())
-
-    return len(set(albums))
-
-
-@tool
-def count_albums_by_year_range(title: str, start_year: int, end_year: int) -> int:
-    """
-    Count how many studio albums listed on the Wikipedia page were released between start_year and end_year.
-    This function targets the "Studio albums" section.
-    """
-    url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text, "html.parser")
-
-    studio_section = None
-    for header in soup.find_all(["h2", "h3"]):
-        if "Studio albums" in header.get_text():
-            studio_section = header
-            break
-
-    if not studio_section:
-        return 0
-
-    albums = []
-    for elem in studio_section.find_next_siblings():
-        if elem.name in ["h2", "h3"]:  # next section begins
-            break
-        for li in elem.find_all("li"):
-            text = li.get_text()
-            year_match = re.search(r"(19|20)\d{2}", text)
-            if year_match:
-                year = int(year_match.group())
-                if start_year <= year <= end_year:
-                    albums.append(text)
-
-    return len(albums)
-
 @tool
 def get_article_nominator_from_fac_page(title: str) -> str:
     """
@@ -199,57 +132,8 @@ def get_article_nominator_from_fac_page(title: str) -> str:
 
     return "Nominator not found"
 
-@tool
-def handle_question(question: str) -> str:
-    """
-    Dynamically handle a question by routing it to the appropriate tools and combining results.
-    """
-    strategy = route_question.run(question)
-
-    if strategy == "extract_structured_facts_from_url":
-        wiki_url = resolve_wikipedia_url.run(question)
-        if not wiki_url:
-            return "❌ Could not find Wikipedia page."
-        return extract_structured_facts_from_url.run(wiki_url)
-
-    if strategy == "search_featured_articles_by_date_range":
-        return search_featured_articles_by_date_range.run("2016-11-01", "2016-11-30")
-
-    return "🤔 I will use internal reasoning."
-
-
-@tool
-def resolve_wikipedia_url(question: str) -> Optional[str]:
-    """
-    Returns a known Wikipedia URL if the question contains a known entity.
-    """
-    q = question.lower()
-    for key, url in WIKIPEDIA_PAGES.items():
-        if key in q:
-            logging.info(f"[Router] Matched '{key}' → {url}")
-            return url
-    logging.info(f"[Router] No match for: {question}")
-    return None
-
-@tool
-def route_question(question: str) -> str:
-    """
-    Determines the best tool to answer a given question.
-    Returns one of: 'tavily_search', 'extract_structured_facts_from_url',
-    'search_featured_articles_by_date_range', or 'use_internal_logic'.
-    """
-    q = question.lower()
-
-    if "who" in q or "what" in q or "how many" in q or "when" in q:
-        return "tavily_search"
-
-    if "wikipedia" in q and any(k in q for k in ["how many", "list", "albums", "awards", "release"]):
-        return "extract_structured_facts_from_url"
-
-    if "featured article" in q and any(k in q for k in ["promoted", "in", "nominated"]):
-        return "search_featured_articles_by_date_range"
-
-    # Default to internal logic (math, logic puzzles, wordplay)
-    return "use_internal_logic"
 
 @tool
 def extract_structured_facts_from_url(url: str, selector: Optional[str] = None) -> str:
@@ -337,53 +221,53 @@ def categorize_grocery_items(items: list[str]) -> dict:
     return result
 
 
-@tool
-def search_featured_articles_by_date_range(start_date: str, end_date: str) -> list[str]:
-    """
-    Searches the English Wikipedia featured article archive and returns article titles
-    promoted between start_date and end_date.
-    Args:
-        start_date (str): Start date in YYYY-MM-DD format (e.g. '2016-11-01')
-        end_date (str): End date in YYYY-MM-DD format (e.g. '2016-11-30')
-    Returns:
-        list[str]: A list of article titles promoted as Featured Articles during that period.
-    """
-    print(f"🛠️ search_featured_articles_by_date_range called with: {start_date} , {end_date}")
-    try:
-        base_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles"
-        archive_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_by_year"
-
-        start = datetime.strptime(start_date, "%Y-%m-%d")
-        end = datetime.strptime(end_date, "%Y-%m-%d")
-
-        # We'll collect year-specific pages
-        result_titles = []
-
-        for year in range(start.year, end.year + 1):
-            url = f"https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_{year}"
-            response = requests.get(url)
-            if response.status_code != 200:
-                continue
-
-            soup = BeautifulSoup(response.text, "html.parser")
-            for li in soup.select("li"):
-                text = li.get_text()
-                date_matches = re.findall(r"\b(?:19|20)\d{2}-\d{2}-\d{2}\b", text)
-                print("🔍 Date matches:", date_matches)
-
-                for match in date_matches:
-                    try:
-                        d = datetime.strptime(match, "%Y-%m-%d")
-                        if start <= d <= end:
-                            a_tag = li.find("a")
-                            if a_tag:
-                                result_titles.append(a_tag.get_text(strip=True))
-                    except ValueError:
-                        continue
-
-        return sorted(set(result_titles))
-    except Exception as e:
-        return [f"Error: {str(e)}"]
+# @tool
+# def search_featured_articles_by_date_range(start_date: str, end_date: str) -> list[str]:
+#     """
+#     Searches the English Wikipedia featured article archive and returns article titles
+#     promoted between start_date and end_date.
+#     Args:
+#         start_date (str): Start date in YYYY-MM-DD format (e.g. '2016-11-01')
+#         end_date (str): End date in YYYY-MM-DD format (e.g. '2016-11-30')
+#     Returns:
+#         list[str]: A list of article titles promoted as Featured Articles during that period.
+#     """
+#     print(f"🛠️ search_featured_articles_by_date_range called with: {start_date} , {end_date}")
+#     try:
+#         base_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles"
+#         archive_url = "https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_by_year"
+
+#         start = datetime.strptime(start_date, "%Y-%m-%d")
+#         end = datetime.strptime(end_date, "%Y-%m-%d")
+
+#         # We'll collect year-specific pages
+#         result_titles = []
+
+#         for year in range(start.year, end.year + 1):
+#             url = f"https://en.wikipedia.org/wiki/Wikipedia:Featured_articles_{year}"
+#             response = requests.get(url)
+#             if response.status_code != 200:
+#                 continue
+
+#             soup = BeautifulSoup(response.text, "html.parser")
+#             for li in soup.select("li"):
+#                 text = li.get_text()
+#                 date_matches = re.findall(r"\b(?:19|20)\d{2}-\d{2}-\d{2}\b", text)
+#                 print("🔍 Date matches:", date_matches)
+
+#                 for match in date_matches:
+#                     try:
+#                         d = datetime.strptime(match, "%Y-%m-%d")
+#                         if start <= d <= end:
+#                             a_tag = li.find("a")
+#                             if a_tag:
+#                                 result_titles.append(a_tag.get_text(strip=True))
+#                     except ValueError:
+#                         continue
+
+#         return sorted(set(result_titles))
+#     except Exception as e:
+#         return [f"Error: {str(e)}"]
 
 @tool
 def detect_non_commutative_subset(table_text: str) -> str:
@@ -463,19 +347,13 @@ def filter_vegetables(items: list[str]) -> list[str]:
 
 # List of all tools
 all_tools = [
+    web_lookup,
    extract_number_from_snippets,
-    tavily_search,
-    route_question,
-    resolve_wikipedia_url,
-    handle_question,
-    search_featured_articles_by_date_range,
-    get_article_nominator_from_fac_page,
-    count_sosa_studio_albums_2000s,
-    count_albums_by_year_range,
-    extract_structured_facts_from_url,
    detect_non_commutative_subset,
    reverse_sentence,
    filter_vegetables,
    categorize_grocery_items,
+    get_article_nominator_from_fac_page,
+    # Optional: handle_question (for fallback routing)
 ]
 
 
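Note on the resulting file: the hunks above delete the imports for tool (langchain_core.tools), requests, BeautifulSoup, re, Optional (typing), os, and TavilyClient (tavily), yet the surviving code — the @tool decorators, web_lookup, extract_number_from_snippets, get_article_nominator_from_fac_page, and the module-level TAVILY_API_KEY / TavilyClient setup — still references all of those names, so the module would likely fail with a NameError as soon as it is imported. A minimal import block the new tools.py would still seem to need (a sketch, not part of this commit):

import os
import re
from typing import Optional

import requests
from bs4 import BeautifulSoup
from langchain_core.tools import tool
from tavily import TavilyClient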
 
 
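For reference, LangChain tools created with @tool can be exercised directly before being wired into an agent. A quick smoke test of the new web_lookup entry point might look like the sketch below (the query string is hypothetical; TAVILY_API_KEY must be set in the environment, and the missing imports noted above are assumed restored):

from tools import web_lookup, all_tools

# A @tool-decorated function becomes a StructuredTool; .invoke takes a dict
# keyed by the tool's argument names.
print(web_lookup.invoke({"query": "year the first Summer Olympics were held"}))

# To hand the whole toolbox to a tool-calling chat model (model choice is an assumption):
# from langchain_openai import ChatOpenAI
# llm_with_tools = ChatOpenAI(model="gpt-4o-mini").bind_tools(all_tools)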
359