marcos-banik committed on
Commit
1c6a590
·
1 Parent(s): 020b298

🚧 More tools

Browse files
Files changed (2) hide show
  1. app.py +7 -1
  2. tools.py +123 -0
app.py CHANGED
@@ -12,7 +12,11 @@ from smolagents import (
12
  PythonInterpreterTool,
13
  )
14
 
15
- from tools import list_wikipedia_sections
 
 
 
 
16
 
17
  # (Keep Constants as is)
18
  # --- Constants ---
@@ -66,6 +70,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
66
  wiki_search,
67
  py_run,
68
  list_wikipedia_sections,
 
 
69
  ],
70
  verbosity_level=2,
71
  additional_authorized_imports=authorized_imports,
 
12
  PythonInterpreterTool,
13
  )
14
 
15
+ from tools import (
16
+ list_wikipedia_sections,
17
+ count_tables_in_wikipedia_section,
18
+ extract_nth_table_in_wikipedia_section,
19
+ )
20
 
21
  # (Keep Constants as is)
22
  # --- Constants ---
 
70
  wiki_search,
71
  py_run,
72
  list_wikipedia_sections,
73
+ count_tables_in_wikipedia_section,
74
+ extract_nth_table_in_wikipedia_section,
75
  ],
76
  verbosity_level=2,
77
  additional_authorized_imports=authorized_imports,
tools.py CHANGED
@@ -1,8 +1,10 @@
1
  from bs4 import BeautifulSoup
2
  import requests
3
  import re
 
4
 
5
 
 
6
  def list_wikipedia_sections(page_title: str) -> list[str]:
7
  """
8
  Return an ordered list of section headings from a Wikipedia article.
@@ -34,3 +36,124 @@ def list_wikipedia_sections(page_title: str) -> list[str]:
34
  sections.append(text)
35
 
36
  return sections
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from bs4 import BeautifulSoup
2
  import requests
3
  import re
4
+ from smolagents import tool
5
 
6
 
7
+ @tool
8
  def list_wikipedia_sections(page_title: str) -> list[str]:
9
  """
10
  Return an ordered list of section headings from a Wikipedia article.
 
36
  sections.append(text)
37
 
38
  return sections
39
+
40
+
41
@tool
def count_tables_in_wikipedia_section(
    page_title: str, section_title: str
) -> int:
    """
    Return the number of HTML <table> elements found within a specified section of a Wikipedia article.

    Args:
        page_title (str): Title of the Wikipedia article (spaces will be replaced
            with underscores), e.g., "Python (programming language)".
        section_title (str): Visible heading of the section to inspect, matched
            case-insensitively, e.g., "Discography".

    Returns:
        int: The count of <table> tags under the given section heading, stopping
            at the next section. Returns 0 if the article or section is not found
            or if no tables are present.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    # The docstring promises 0 (not an exception) for a missing article, so
    # treat any HTTP/network failure as "not found" instead of raising.
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        return 0

    soup = BeautifulSoup(resp.text, "html.parser")

    # Locate the heading <div> whose visible text matches section_title.
    # BUG FIX: the original reused the loop variable `div`, so when no heading
    # matched, `div` was left pointing at the *last* heading on the page and
    # tables after it were counted instead of returning 0.
    heading = None
    for candidate in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        # Strip the trailing "[edit]" link text MediaWiki appends to headings.
        title = re.sub(r"\[edit\]$", "", candidate.text.strip()).strip()
        if title and title.lower() == section_title.lower():
            heading = candidate
            break

    if heading is None:
        return 0

    # Walk forward through siblings, counting tables until the next
    # section heading (another mw-heading* div) ends the section.
    count = 0
    for sibling in heading.find_next_siblings():
        if (
            sibling.name == "div"
            and sibling.get("class")
            and any(c.startswith("mw-heading") for c in sibling["class"])
        ):
            break
        if sibling.name == "table":
            count += 1

    return count
90
+
91
+
92
@tool
def extract_nth_table_in_wikipedia_section(
    page_title: str, section_title: str, n: int
) -> str:
    """
    Extract the nth table within a specified section of a Wikipedia article,
    rendered as tab-separated values (one line per table row).

    Args:
        page_title (str):
            Title of the Wikipedia article (e.g., "Queen (band)" or
            "Python (programming language)"). Spaces are automatically
            replaced with underscores.
        section_title (str):
            Visible title of the section to search, matched case-insensitively
            (e.g., "Discography").
        n (int):
            1-based index specifying which table to extract (1 for the first
            table, 2 for the second, etc.).

    Returns:
        str: The table contents as TSV text: one line per <tr>, with the text
            of each <th>/<td> cell joined by tabs (literal tabs inside cells
            are replaced with spaces).
            Returns an empty string if:
            - The article cannot be found.
            - The section does not exist.
            - The section contains fewer than n tables.
    """
    url = f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}"
    # The docstring promises "" (not an exception) for a missing article.
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        return ""
    soup = BeautifulSoup(resp.text, "html.parser")

    # Locate the heading <div> whose visible text matches section_title.
    # BUG FIX: the original reused the loop variable `div`, so when no heading
    # matched, `div` still referenced the last heading on the page and the
    # "section not found" check never fired.
    heading = None
    for candidate in soup.find_all(
        "div", class_=lambda cls: cls and cls.startswith("mw-heading")
    ):
        # Strip the trailing "[edit]" link text MediaWiki appends to headings.
        title = re.sub(r"\[edit\]$", "", candidate.text.strip()).strip()
        if title and title.lower() == section_title.lower():
            heading = candidate
            break
    if heading is None:
        return ""

    # Iterate siblings until the next section header, looking for table n.
    count = 0
    tbl = None
    for sib in heading.find_next_siblings():
        if (
            sib.name == "div"
            and sib.get("class")
            and any(c.startswith("mw-heading") for c in sib["class"])
        ):
            break
        if sib.name == "table":
            count += 1
            if count == n:
                tbl = sib
                break

    # BUG FIX: the original dereferenced `tbl` unconditionally, raising
    # AttributeError when the section held fewer than n tables instead of
    # returning "" as documented.
    if tbl is None:
        return ""

    # Convert the table to TSV: tabs inside cell text would corrupt the
    # column separation, so they are flattened to spaces.
    rows = []
    for tr in tbl.find_all("tr"):
        cells = tr.find_all(["th", "td"])
        texts = [
            cell.get_text(separator=" ", strip=True).replace("\t", " ")
            for cell in cells
        ]
        if texts:
            rows.append("\t".join(texts))

    return "\n".join(rows)