gabriel-melki committed on
Commit 68f7ba1 · 1 Parent(s): d47e192

to be squashed
__pycache__/prompt.cpython-313.pyc ADDED
Binary file (3.8 kB)

__pycache__/wiki_extractor.cpython-313.pyc ADDED
Binary file (15.6 kB)

__pycache__/wikipedia_tools.cpython-313.pyc ADDED
Binary file (11.1 kB)
app.py CHANGED
@@ -11,13 +11,14 @@ from prompt import get_prompt
 from huggingface_hub import login
 from smolagents import (
     InferenceClientModel,
-    FinalAnswerTool, WikipediaSearchTool, VisitWebpageTool, DuckDuckGoSearchTool,
+    FinalAnswerTool,
     CodeAgent
 )
-from smolagents.tools import tool
-from smolagents.agents import PromptTemplates
 
-login(token=os.environ["HF_TOKEN"])
+from smolagents.tools import tool, Tool
+from smolagents.agents import PromptTemplates
+from wikipedia_tools import wikipedia_summary, read_wikipedia_page
+#login(token=os.environ["HF_TOKEN"])
 
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -26,23 +27,13 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # --- model initialization ---
 model = InferenceClientModel(
     provider="auto",
-    model_id="Qwen/Qwen2.5-Coder-7B-Instruct",  # Changed to larger model
+    model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct",  # Changed to larger model
     #timeout=30
 )
 
-# --- tools ---
-# # Initialize the web search tool
-# web_search = DuckDuckGoSearchTool()  # Included in base tools
-
-wikipedia_search = WikipediaSearchTool(
-    user_agent=f"HF_Agents_Final_Assignment ({os.getenv('USER_EMAIL')})",
-    language="en",
-    content_type="text",
-    extract_format="WIKI",
-)
-# visit_webpage = VisitWebpageTool()  # Included in base tools
 final_answer = FinalAnswerTool()
 
+
 def _download_file(file_name: str) -> None:
     if not os.path.exists(file_name):
         url = f"{DEFAULT_API_URL}/files/{file_name.split('.')[-2]}"
@@ -109,10 +100,10 @@ class QuestionAnsweringAgent(CodeAgent):
 agent = QuestionAnsweringAgent(
     name="question_answering_expert",
     model=model,
-    tools=[final_answer, wikipedia_search, read_file_as_text],
-    add_base_tools=True,  # Add any additional base tools
+    tools=[final_answer, wikipedia_summary, read_wikipedia_page, read_file_as_text],
+    add_base_tools=False,  # Disable auto base tools to avoid overriding custom visit_webpage
     planning_interval=None,  # Disable planning to ensure immediate stop after final_answer
-    #additional_authorized_imports=["pandas", "numpy", "datetime", "json", "re", "math", "os", "io", "requests", "csv", "urllib"],
+    additional_authorized_imports=["bs4"],
     max_steps=5,
     verbosity_level=2,  # Add for better debugging
     #use_structured_outputs_internally=True  # Enable structured output
@@ -182,7 +173,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            submitted_answer = run_agent_with_debug(agent, question_text, file_name)
+            submitted_answer = agent(question_text, file_name)
 
             # Log the execution timeline
             print(f"Task {task_id} execution steps: {len(agent.memory.steps)}")
@@ -258,39 +249,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
     results_df = pd.DataFrame(results_log)
     return status_message, results_df
 
-
-def run_agent_with_debug(agent_fn, *args, **kwargs):
-    """
-    Wrap an agent generator (or function returning a generator)
-    to log each step and catch StopIteration cleanly.
-    """
-    print("=== Starting agent run ===")
-    try:
-        gen = agent_fn(*args, **kwargs)
-
-        # If it's a generator, iterate step by step
-        if hasattr(gen, "__iter__") and not isinstance(gen, (str, bytes)):
-            for step in gen:
-                print(">>> Agent step:", step)
-            print("=== Agent run completed without explicit final_answer ===")
-
-        else:
-            # If it's just a normal function return
-            print(">>> Agent returned:", gen)
-
-    except StopIteration as e:
-        # Explicitly capture StopIteration so it doesn't look like a silent crash
-        print("!!! Agent terminated early (StopIteration). Details:", e)
-        raise RuntimeError(
-            "Agent exited without producing a final_answer. "
-            "Check control flow and ensure `final_answer(...)` is always returned."
-        ) from e
-
-    except Exception as e:
-        print("!!! Agent crashed with error:", type(e).__name__, str(e))
-        raise
-
-
+
 # --- Build Gradio Interface using Blocks ---
 with gr.Blocks() as demo:
     gr.Markdown("# Basic Agent Evaluation Runner")
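
Note: `wikipedia_summary` and `read_wikipedia_page` wired in above are custom tools this commit adds in wikipedia_tools.py. Based on the wikipediaapi exploration in test.ipynb further down, a minimal sketch of the summary tool might look like this (a sketch under those assumptions, not the committed implementation):

    import os
    import wikipediaapi
    from smolagents.tools import tool

    @tool
    def wikipedia_summary(query: str) -> dict:
        """Return the summary and URL of the Wikipedia page for an entity.

        Args:
            query: The exact entity name (person/place/event) to look up.
        """
        wiki = wikipediaapi.Wikipedia(
            user_agent=f"My research agent ({os.getenv('USER_EMAIL')})",
        )
        page = wiki.page(query)
        if not page.exists():
            # Same failure message the agent sees in the logs below
            raise ValueError(f"No Wikipedia page found for '{query}'. Try a different query.")
        return {
            "summary": page.summary,
            "url": f"https://en.wikipedia.org/wiki/{page.title.replace(' ', '_')}",
        }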
logs CHANGED
@@ -1,12 +1,22 @@
+
+Launching Gradio Interface for Basic Agent Evaluation...
+* Running on local URL: http://0.0.0.0:7860, with SSR ⚡ (experimental, to disable set `ssr_mode=False` in `launch()`)
+
+To create a public link, set `share=True` in `launch()`.
+User logged in: gabzer
+https://huggingface.co/spaces/gabzer/GAIA_benchmark_agent/tree/main
+Fetching questions from: https://agents-course-unit4-scoring.hf.space/questions
+Fetched 20 questions.
+Running agent on 20 questions...
 === Starting agent run ===
 ╭──────────────────── New run - question_answering_expert ─────────────────────╮
 │ │
 │ You are a highly precise question-answering agent. │
 │ When given a question: │
-│ - If necessary or suggested, perform a wikipedia search using the │
+│ - If necessary, perform a wikipedia search using the │
 │ `wikipedia_search` tool to find possible sources of information. For the │
 │ `query` parameter of the `wikipedia_search` tool, enter only the name of the │
-│ person, place, or event you want to search. Not something too long. │
+│ person, the place, or the event you want to search. Not something too long. │
 │ - If necessary, perform a web search using the `web_search` tool to │
 │ find possible sources of information. │
 │ - If the web search only returns titles and short snippets, you MUST │
@@ -56,256 +66,156 @@
 │ file_name: │
 │ │
 ╰─ InferenceClientModel - Qwen/Qwen2.5-Coder-7B-Instruct ──────────────────────╯
-───────────────────────────────── Initial plan ─────────────────────────────────
-Here are the facts I know and the plan of action that I will follow to solve the
-task:
-```
-## 1. Facts survey
-
-### 1.1. Facts given in the task
-- None given.
-
-### 1.2. Facts to look up
-1. **Mercedes Sosa** - Identify who she is and what her nationality is.
-   - Wikipedia Search: "Mercedes Sosa"
-
-2. **Studio Albums by Mercedes Sosa** - List all studio albums by her.
-   - Wikipedia Search: "List of studio albums by Mercedes Sosa"
-   OR
-   - Music Databases (e.g., Discogs, Billboard Charts)
-
-3. **Release Dates of Mercedes Sosa's Studio Albums** - Specifically focus on
-albums released between 2000 and 2009 inclusive.
-   - Music Databases (e.g., Discogs)
-
-4. **Counting Relevant Albums** - Verify how many of these albums were indeed
-released within the specified period.
-   - Manual Counting using derived data
-
-### 1.3. Facts to derive
-- Determine whether a Wikipedia article exists on "Mercedes Sosa".
-- Extract a list of studio albums.
-- Filter the list to include only those albums released between 2000 and 2009.
-- Count the filtered list to get the number of relevant studio albums.
-
----
-
-## 2. Plan
-
-1. **Query Wikipedia to confirm existence of "Mercedes Sosa" article**.
-   - `wikipedia_search("Mercedes Sosa")`
-
-2. **If the article exists, derive information about her studio albums**.
-   - Visit the "List of studio albums by Mercedes Sosa" page or related music
-database pages.
-
-3. **Filter out albums that were released between 2000 and 2009 (inclusive)**.
-
-4. **Count the number of filtered album entries**.
-
-5. **Submit the count as the final answer via `final_answer`**.
-
-
-```
-━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 1 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-Output message of the LLM: ─────────────────────────────────────────────────────
-Thought: To answer the question accurately about how many studio albums Mercedes
-Sosa published between 2000 and 2009, I first need to check if there is any
-relevant Wikipedia article for Mercedes Sosa. If a page exists, it would likely
-contain detailed information about the artist, including her studio albums.
-
-I will use the `wikipedia_search` tool to search for "Mercedes Sosa" to see if a
-Wikipedia page exists about her.
-
-<code>
-mercedes_sosa_article = wikipedia_search("Mercedes Sosa")
-print(f"Wikipedia article found: {mercedes_sosa_article}")
-
-─ Executing parsed code: ─────────────────────────────────────────────────────
-mercedes_sosa_article = wikipedia_search("Mercedes Sosa")
-print(f"Wikipedia article found: {mercedes_sosa_article}")
-──────────────────────────────────────────────────────────────────────────────
-Execution logs:
-Wikipedia article found: **Wikipedia Page:** Mercedes Sosa
-
-**Content:** Haydée Mercedes Sosa (9 July 1935 – 4 October 2009) was an
-Argentine singer who was popular throughout Latin America and many countries
-outside the region. With her roots in Argentine folk music, Sosa became one of
-the preeminent exponents of El nuevo cancionero. She gave voice to songs written
-by many Latin American songwriters. Her music made people hail her as the "voice
-of the voiceless ones". She was often called "the conscience of Latin America".
-Sosa performed in venues such as the Lincoln Center in New York City, the
-Théâtre Mogador in Paris, the Sistine Chapel in Vatican City, as well as
-sold-out shows in New York's Carnegie Hall and the Roman Colosseum during her
-final decade of life. Her career spanned four decades and she was the recipient
-of six Latin Grammy awards (2000, 2003, 2004, 2006, 2009, 2011), including a
-Latin Grammy Lifetime Achievement Award in 2004 and two posthumous Latin Grammy
-Award for Best Folk Album in 2009 and 2011. She won the Premio Gardel in 2000,
-the main musical award in Argentina. She served as an ambassador for UNICEF.
-
-Life
-Sosa was born on 9 July 1935, in San Miguel de Tucumán, in the northwestern
-Argentine province of Tucumán, of mestizo ancestry. She was of French, Spanish
-and Diaguita descent. Her nickname "la negra", which is a common nickname in
-Argentina for people with darker complexion, is a reference to her indigenous
-heritage. Her parents, a day laborer and a washerwoman, were Peronists, although
-they never registered in the party, and she started her career as a singer for
-the Peronist Party in Tucuman under the name Gladys Osorio. In 1950, at age
-fifteen, she won a singing competition organized by a local radio station and
-was given a contract to perform for two months. She recorded her first album, La
-Voz de la Zafra, in 1959. A performance at the 1965 Cosquín National Folklore
-Festival—where she was introduced and brought to the stage while sitting in the
-audience by fellow folk singer Jorge Cafrune— brought her to the attention of
-the Argentine public. Sosa and her first husband, Manuel Oscar Matus, with whom
-she had one son, were key players in the mid-60s nueva canción movement (which
-was called nuevo cancionero in Argentina). Her second record was Canciones con
-Fundamento, a collection of Argentine folk songs.
-
-Sosa "spent the late 1960s building her audience in Europe and among the
-cosmopolitan middle class in Buenos Aires, becoming in the process a much bigger
-star" than her contemporaries. In 1967, Sosa toured the United States and Europe
-with great success. In later years, she performed and recorded extensively,
-broadening her repertoire to include material from throughout Latin America.
-In the early 1970s, Sosa released two concept albums in collaboration with
-composer Ariel Ramírez and lyricist Félix Luna: Cantata Sudamericana and Mujeres
-Argentinas (Argentine Women). She also recorded a tribute to Chilean musician
-Violeta Parra in 1971, including what was to become one of Sosa's signature
-songs, Gracias a la vida. She further popularized of songs written by Milton
-Nascimento of Brazil and Pablo Milanés and Silvio Rodríguez both from Cuba.
-Throughout the decade, she released albums such as Hasta la Victoria in 1972 and
-Traigo un Pueblo en mi Voz in 1973. They featured songs like "Cuando tenga la
-tierra", written by Ariel Petrocelli and Daniel Toro, which tackles political
-and social issues like wealth and land inequality. During the 1970s she was a
-part of two films by the director Leopoldo Torre Nilsson: El Santo de la Espada
-in 1970 and Güemes, la tierra en armas in 1971, in which she portrayed Juana
-Azurduy de Padilla, the guerrilla military leader who fought for Argentine
-independence.
-
-After the military junta of Jorge Videla came to power in 1976, the atmosphere
-in Argentina grew increasingly oppressive. Sosa faced death threats against both
-her and her family, but refused for many years to leave the country. At a
-concert in La Plata in 1979, Sosa was searched and arrested on stage, along with
-all those attending the concert. Their release came about through international
-intervention. Despite attempts to hold more concerts, she was officially barred
-from performing by the military regime. Banned in her own country, she moved to
-Paris and then to Madrid. She has spoken publicly about her artistic and
-emotional struggles during this period of her life. While in exile, she released
-the album A Quien Doy in 1981. The album included a recording of the song
-"Cuando Me Acuerdo de Mi Pais" which was originally written by the prolific
-Chilean singer/songwriter, Patricio Manns. The song, which he wrote while also
-in political exile, expresses the sorrow he felt from being separated from his
-homeland. She related to this feeling and struggled to continue recording and
-performing. In an interview with the New York Times, she said, "It was a mental
-problem, a problem of morale...It wasn't my throat, or anything physical".
-Sosa returned to Argentina from her exile in Europe in February 1982, several
-months before the military regime collapsed as a result of the Falklands War,
-and gave a series of concerts at the Teatro Ópera in Buenos Aires, where she
-invited many of her younger colleagues to share the stage. A double album of
-recordings, Mercedes Sosa en Argentina, from these performances became an
-instant best seller. She then traveled to perform in her home province of
-Tucuman. However, these performances were largely ignored by mainstream media in
-the country. In subsequent years, Sosa continued to tour both in Argentina and
-abroad, performing in such venues as the Lincoln Center in New York City and the
-Théâtre Mogador in Paris. In poor health for much of the 1990s, she performed a
-comeback show in Argentina in 1998. In 1994, she played in the Sistine Chapel in
-Vatican City. In 2002, she sold out both Carnegie Hall in New York and the
-Colosseum in Rome in the same year.
-
-A supporter of Perón, she favored leftist causes throughout her life. She
-supported President Raul Alfonsin in the election of 1983 which marked the
-return of democracy in Argentina following the dictatorship. She referred to
-this election as "Argentina's Spring" She opposed President Carlos Menem, who
-was in office from 1989 to 1999, and supported the election of Néstor Kirchner,
-who became president in 2003.
-Sosa was a UNESCO Goodwill Ambassador for Latin America and the Caribbean.
-Sosa disliked being identified as a protest singer. While she was outright in
-her political stances, Sosa said the following on the position of the artist:
-
-"An artist isn't political in the party political sense – they have a
-constituency, which is their public – it is the poetry that matters most of
-all."
-In a career spanning four decades, she worked with performers across several
-genres and generations, folk, opera, pop, rock, including Martha Argerich,
-Andrea Bocelli, David Broza, Franco Battiato, Jaime Roos, Joan Baez, Francis
-Cabrel, Gal Costa, Luz Casal, Lila Downs, Lucio Dalla, Maria Farantouri,
-Lucecita Benitez, Nilda Fernández, Charly Garcia, León Gieco, Gian Marco, Nana
-Mouskouri, Pablo Milanés, Holly Near, Milton Nascimento, Pata Negra, Fito Páez,
-Franco De Vita, Lourdes Pérez, Luciano Pavarotti, Silvio Rodríguez, Ismael
-Serrano, Shakira, Sting, Caetano Veloso, Julieta Venegas, Gustavo Cerati and
-Konstantin Wecker
-Sosa participated in a 1999 production of Ariel Ramírez's Misa Criolla. Her song
-Balderrama is featured in the 2008 movie Che, starring Benicio del Toro as the
-Argentine Marxist revolutionary Che Guevara.
-Sosa was the co-chair of the Earth Charter International Commission.
-
-Awards
-Sosa won the Latin Grammy Award for Best Folk Album in 2000 (Misa Criolla), 2003
-(Acústico), 2006 (Corazón Libre), 2009 (Cantora 1, which also won Best Recording
-Package and was nominated for Album of the Year), and 2011 (Deja La Vida Volar),
-as well as several international awards.
-In 1995, Konex Foundation from Argentina granted her the Diamond Konex Award,
-one of the most prestigious awards in Argentina, as the most important
-personality in the popular music of her country in the last decade.
-
-Death
-Suffering from recurrent endocrine and respiratory problems in later years, the
-74-year-old Sosa was hospitalized in Buenos Aires on 18 September 2009. She died
-from multiple organ failure on 4 October 2009, at 5:15 am. She was survived by
-one son, Fabián Matus (d. 15 March 2019), born of her first marriage. He said:
-"She lived her 74 years to the fullest. She had done practically everything she
-wanted, she didn't have any type of barrier or any type of fear that limited
-her". The hospital expressed its sympathies to her relatives. Her website
-featured the following: "Her undisputed talent, her honesty and her profound
-convictions leave a great legacy to future generations".
-Her body was placed on display at the National Congress building in Buenos Aires
-for the public to pay their respects, and President Fernández de Kirchner
-ordered three days of national mourning. Thousands had queued by the end of the
-day.
-Sosa's obituary in The Daily Telegraph said she was "an unrivalled interpreter
-of works by her compatriot, the Argentine Atahualpa Yupanqui, and Chile's
-Violeta Parra". Helen Popper of Reuters reported her death by saying she "fought
-South America's dictators with her voice and became a giant of contemporary
-Latin American music". Sosa received three Latin Grammy nominations for her
-album, in 2009 . She went on to win Best Folk Album about a month after her
-death.
-
-Tributes
-In 2019, Sosa was celebrated by a Google Doodle. The doodle was showcased in
-Argentina, Chile, Uruguay, Paraguay, Bolivia, Peru, Ecuador, Cuba, Iceland,
-Sweden, Serbia, Greece, Israel and Vietnam.
-In 2023, Rolling Stone ranked Sosa at number 160 on its list of the 200 Greatest
-Singers of All Time.
-
-Discography
-Sosa recorded forty albums.
-
-Studio albums
-EPs
-Live albums
-Compilation albums
-Filmography
-Güemes, la tierra en armas (1971)
-Argentinísima (1972)
-Esta es mi Argentina (1974)
-Mercedes Sosa, como un pájaro libre (1983)
-Será possible el sur: Mercedes Sosa (1985)
-Historias de Argentina en vivo (2001)
-
-References
-Further reading
-External links
-
-Tribute to Mercedes Sosa (in Brazilian Portuguese)
-Mercedes Sosa's website at the Wayback Machine (archived 16 October 2009) (in
-Spanish)
-Mercedes Sosa's News (in Spanish)
-Mercedes Sosa at IMDb
-Mercedes Sosa discography at Discogs
-
-🔗 **Read more:** https://en.wikipedia.org/wiki/Mercedes_Sosa
-
-Out: None
-[Step 1: Duration 3.00 seconds| Input tokens: 3,166 | Output tokens: 129]
-!!! Agent crashed with error: ZeroDivisionError integer division or modulo by zero
-Error running agent on task 8e867cd7-cff9-4e6c-867a-ff5ddc2550be: integer division or modulo by zero
-Agent did not produce any answers to submit.
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 1 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+Output message of the LLM: ─────────────────────────────────────────────────────
+Thought: To find how many studio albums Mercedes Sosa published between 2000 and
+2009, I need to look up her discography on Wikipedia. This will give me the
+information I require.
+<code>
+sosa_discography = wikipedia_search("Mercedes Sosa discography")
+print(sosa_discography)
+
+─ Executing parsed code: ─────────────────────────────────────────────────────
+sosa_discography = wikipedia_search("Mercedes Sosa discography")
+print(sosa_discography)
+──────────────────────────────────────────────────────────────────────────────
+Execution logs:
+No Wikipedia page found for 'Mercedes Sosa discography'. Try a different query.
+
+Out: None
+[Step 1: Duration 2.08 seconds| Input tokens: 2,734 | Output tokens: 72]
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 2 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+Output message of the LLM: ─────────────────────────────────────────────────────
+Thought: The previous query didn't yield any results. It could be due to the
+specificity or the lack of a recent entry. I'll broaden the query to search for
+"Mercedes Sosa career".
+Using broadened query for better results.
+<code>
+sosa_career = wikipedia_search("Mercedes Sosa career")
+print(sosa_career)
+
+─ Executing parsed code: ─────────────────────────────────────────────────────
+sosa_career = wikipedia_search("Mercedes Sosa career")
+print(sosa_career)
+──────────────────────────────────────────────────────────────────────────────
+Execution logs:
+No Wikipedia page found for 'Mercedes Sosa career'. Try a different query.
+
+Out: None
+[Step 2: Duration 1.96 seconds| Input tokens: 5,639 | Output tokens: 149]
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 3 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+Output message of the LLM: ─────────────────────────────────────────────────────
+Thought: Searching for "Mercedes Sosa's album releases" may be more accurate
+since I am specifically looking for studio albums.
+<code>
+sosa_album_releases = wikipedia_search("Mercedes Sosa's album releases")
+print(sosa_album_releases)
+
+─ Executing parsed code: ─────────────────────────────────────────────────────
+sosa_album_releases = wikipedia_search("Mercedes Sosa's album releases")
+print(sosa_album_releases)
+──────────────────────────────────────────────────────────────────────────────
+Execution logs:
+No Wikipedia page found for 'Mercedes Sosa's album releases'. Try a different
+query.
+
+Out: None
+[Step 3: Duration 1.82 seconds| Input tokens: 8,720 | Output tokens: 205]
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 4 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+Output message of the LLM: ─────────────────────────────────────────────────────
+Thought: Given that previous searches were unsuccessful, let me perform a
+general web search for "Mercedes Sosa albums between 2000 and 2009" and examine
+the results for relevant information.
+<code>
+album_info = web_search("Mercedes Sosa albums between 2000 and 2009")
+print(album_info)
+
+─ Executing parsed code: ─────────────────────────────────────────────────────
+album_info = web_search("Mercedes Sosa albums between 2000 and 2009")
+print(album_info)
+──────────────────────────────────────────────────────────────────────────────
+Execution logs:
+## Search Results
+
+[Category: Mercedes Sosa albums -
+Wikipedia](https://en.wikipedia.org/wiki/Category:Mercedes_Sosa_albums)
+This is a set category. It should only contain pages that are Mercedes Sosa
+albums or lists of Mercedes Sosa albums, as well as subcategories containing
+those things (themselves set categories).
+
+[Mercedes Sosa Albums and
+Discography](https://genius.com/artists/Mercedes-sosa/albums)
+All Albums by Mercedes Sosa. Mercedes Sosa discography includes 45 albums.
+
+[Mercedes Sosa | Discografia |
+Discogs](https://www.discogs.com/it/artist/333361-Mercedes-Sosa)
+Mercedes Sosa, known as La Negra, (born July 9, 1935 in San Miguel de Tucuman,
+Argentina – Death October 4, 2009 in Buenos Aires) was an Argentine singer who
+was and remains immensely popular throughout Latin America and internationally.
+
+[Mercedes Sosa - Apple
+Music](https://music.apple.com/tc/artist/mercedes-sosa/122968)
+Mercedes Sosa. Latest Release. 24 NOV 2024. Mercedes Sosa en New York, 1974 ·
+2024. Palabras de Mercedes IX (En Vivo NY).
+
+[Mercedes Sosa : albums, chansons, concerts |
+Deezer](https://www.deezer.com/fr/artist/10606)
+Mercedes Sosa sur Deezer : discographie de l'artiste, top albums et chansons,
+concerts et featurings.
+
+[Mercedes Sosa on TIDAL](https://tidal.com/browse/artist/3501549)
+Albums. Mercedes Sosa en New York, 1974. Mercedes Sosa. 2009. Zona de
+Promesas (with Gustavo Cerati).
+
+[Mercedes Sosa : Best songs, Albums and Concerts -
+Mozaart](https://mozaart.com/en/a/mercedes-sosa)
+Haydée Mercedes Sosa (Spanish pronunciation: [meɾˈseðes ˈsosa]; 9 July 1935 – 4
+October 2009), sometimes known as La Negra (literally: The Black One), was an
+Argentine singer who was popular throughout Latin America and many countries
+outsid.
+
+[Mercedes Sosa - Albums](https://muzikum.eu/en/mercedes-sosa/discography)
+Mercedes Sosa. 23 Albums found. Reyes Records (2003). Mercedes Sosa - A Arte De
+(bradypus Series).
+
+[Cantora 1 - Mercedes Sosa | Album |
+AllMusic](https://www.allmusic.com/album/cantora-1-mw0000827785)
+Cantora 1 by Mercedes Sosa released in 2009. Find album reviews, track lists,
+credits, awards and more at AllMusic.
+
+[Mercedes Sosa Albums, Songs - Discography - Album of The
+Year](https://www.albumoftheyear.org/artist/33792-mercedes-sosa/?type=live)
+Information on Mercedes Sosa. Complete discography, ratings, reviews and
+more. 1982. Mercedes Sosa En Argentina. Live.
+
+Out: None
+[Step 4: Duration 3.07 seconds| Input tokens: 11,961 | Output tokens: 283]
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 5 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+Output message of the LLM: ─────────────────────────────────────────────────────
+Thought: After reviewing the web search results, it is apparent that there is
+limited information available about Mercedes Sosa releasing albums specifically
+between 2000 and 2009. The most relevant result indicates a live album released
+in 1982 and a song release in 2009, but not studio albums in this range.
+
+As no specific information about the number of studio albums released by Mercedes
+Sosa between 2000 and 2009 was found, I cannot determine the exact count.
+Therefore, I will finalize my answer as "unknown."
+<code>
+final_answer("unknown")
+
+─ Executing parsed code: ─────────────────────────────────────────────────────
+final_answer("unknown")
+──────────────────────────────────────────────────────────────────────────────
+Final answer: unknown
+[Step 5: Duration 2.95 seconds| Input tokens: 16,092 | Output tokens: 414]
+>>> Agent returned: unknown
+Task 8e867cd7-cff9-4e6c-867a-ff5ddc2550be execution steps: 6
+Step 1: TaskStep
+Step 2: ActionStep
+Step 3: ActionStep
+Step 4: ActionStep
+Step 5: ActionStep
+Step 6: ActionStep
+Agent finished. Submitting 1 answers for user 'gabzer'...
+Submitting 1 answers to: https://agents-course-unit4-scoring.hf.space/submit
+Submission Failed: Server responded with status 422. Detail: [{'type': 'string_type', 'loc': ['body', 'answers', 0, 'submitted_answer', 'str'], 'msg': 'Input should be a valid string', 'input': None}, {'type': 'int_type', 'loc': ['body', 'answers', 0, 'submitted_answer', 'int'], 'msg': 'Input should be a valid integer', 'input': None}, {'type': 'float_type', 'loc': ['body', 'answers', 0, 'submitted_answer', 'float'], 'msg': 'Input should be a valid number', 'input': None}]
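
The 422 above is the scoring API rejecting a `submitted_answer` of None. A minimal guard (a sketch, not part of this commit; `answers_payload` is a hypothetical name for the answer list built in run_and_submit_all) would coerce every answer to a valid string before submission:

    # Sketch: ensure each answer is a string so the /submit endpoint accepts it.
    for answer in answers_payload:
        value = answer.get("submitted_answer")
        answer["submitted_answer"] = "unknown" if value is None else str(value)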
prompt.py CHANGED
@@ -2,9 +2,17 @@ def get_prompt(question_text, file_name):
     PROMPT = f"""
 You are a highly precise question-answering agent.
 When given a question:
-- If necessary, perform a wikipedia search using the `wikipedia_search` tool to find possible sources of information. For the `query` parameter of the `wikipedia_search` tool, enter only the name of the person, the place, or the event you want to search. Not something too long.
+- If necessary, start by performing a wikipedia search using the `wikipedia_summary` tool to find possible sources of information.
+For the `query` parameter of the `wikipedia_summary` tool, you MUST think about the entity being searched for and ALWAYS pass exactly the entity name (person/place/event) with no qualifiers.
+NEVER include words like: career, albums, list, biography, years, numbers, prepositions, or date ranges.
+Examples:
+- "Lionel Messi career" → use: wikipedia_summary("Lionel Messi")
+- "Mercedes Sosa discography" → use: wikipedia_summary("Mercedes Sosa")
+- "Battle of Hastings timeline" → use: wikipedia_summary("Battle of Hastings")
+- "Population of Paris in 2010" → use: wikipedia_summary("Paris")
+- If necessary, visit the wikipedia page listed in the wikipedia summary tool to read the full content. You will find the page url in the output of the wikipedia summary tool at the end after the **Read more:** section. Use the `read_wikipedia_page` tool to visit the page.
 - If necessary, perform a web search using the `web_search` tool to find possible sources of information.
-- If the web search only returns titles and short snippets, you MUST visit the actual webpage using the `visit_webpage` tool to read the full content before answering.
+- If the web search only returns titles and short snippets, you MUST visit the actual webpage using the `read_wikipedia_page` tool to read the full content before answering.
 - If the task requires reading, listening, or analyzing a file, you must use the file specified after the question, NOT the file name mentioned casually inside the question text.
 - Comma separated lists MUST contain a single space after each comma.
 - If you are asked for a number, don't use comma to write your number, nor use units such as $$ or percent sign unless specified otherwise.
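
For reference, this template is consumed via `get_prompt(question_text, file_name)` (per the hunk header above); a hypothetical call might look like:

    # Sketch: build the task prompt for one benchmark item (values illustrative).
    from prompt import get_prompt

    task_prompt = get_prompt(
        question_text="How many studio albums did Mercedes Sosa publish between 2000 and 2009?",
        file_name="",  # empty when the task has no attached file
    )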
requirements.txt CHANGED
@@ -1,10 +1,14 @@
-gradio==5.44.1
+gradio[oauth]==5.44.1
 requests==2.32.5
-smolagents[transformers]==1.21.3
+smolagents==1.21.3
 duckduckgo-search==8.1.1
 ddgs==9.5.5
 requests==2.32.5
 markdownify==0.11.0
 openpyxl==3.1.5
 wikipedia-api==0.8.1
-whisper==1.1.10
+whisper==1.1.10
+beautifulsoup4==4.12.3
+langchain_community==0.3.2
+wikipedia==1.4.0
+tabulate==0.9.0
test.ipynb ADDED
@@ -0,0 +1,262 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "289bbe12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from smolagents import WikipediaSearchTool \n",
+    "wikipedia_search = WikipediaSearchTool(\n",
+    "    user_agent=f\"My research agent (gabriel.melki@gmail.com)\",\n",
+    "    language=\"en\",\n",
+    "    content_type=\"text\",\n",
+    "    extract_format=\"HTML\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0e69fd46",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'summary': '\\n The sections inside the page are Life, Awards, Death, Tributes, Discography, Filmography, References, Further reading, External links and the summary of the page is Haydée Mercedes Sosa (9 July 1935 – 4 October 2009) was an Argentine singer who was popular throughout Latin America and many countries outside the region. With her roots in Argentine folk music, Sosa became one of the preeminent exponents of El nuevo cancionero. She gave voice to songs written by many Latin American songwriters. Her music made people hail her as the \"voice of the voiceless ones\". She was often called \"the conscience of Latin America\".\\nSosa performed in venues such as the Lincoln Center in New York City, the Théâtre Mogador in Paris, the Sistine Chapel in Vatican City, as well as sold-out shows in New York\\'s Carnegie Hall and the Roman Colosseum during her final decade of life. Her career spanned four decades and she was the recipient of six Latin Grammy awards (2000, 2003, 2004, 2006, 2009, 2011), including a Latin Grammy Lifetime Achievement Award in 2004 and two posthumous Latin Grammy Award for Best Folk Album in 2009 and 2011. She won the Premio Gardel in 2000, the main musical award in Argentina. She served as an ambassador for UNICEF.\\n ',\n",
+       " 'url': 'https://en.wikipedia.org/wiki/Mercedes_Sosa'}"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from wikipedia_tools import wikipedia_summary, read_wikipedia_page\n",
+    "url = \"https://en.wikipedia.org/wiki/Mercedes_Sosa\"\n",
+    "query = \"Mercedes Sosa\"\n",
+    "wikipedia_summary(query=query)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "034ffd47",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'summary': 'Haydée Mercedes Sosa (9 July 1935 – 4 October 2009) was an Argentine singer who was popular throughout Latin America and many countries outside the region. With her roots in Argentine folk music, Sosa became one of the preeminent exponents of El nuevo cancionero. She gave voice to songs written by many Latin American songwriters. Her music made people hail her as the \"voice of the voiceless ones\". She was often called \"the conscience of Latin America\".\\nSosa performed in venues such as the Lincoln Center in New York City, the Théâtre Mogador in Paris, the Sistine Chapel in Vatican City, as well as sold-out shows in New York\\'s Carnegie Hall and the Roman Colosseum during her final decade of life. Her career spanned four decades and she was the recipient of six Latin Grammy awards (2000, 2003, 2004, 2006, 2009, 2011), including a Latin Grammy Lifetime Achievement Award in 2004 and two posthumous Latin Grammy Award for Best Folk Album in 2009 and 2011. She won the Premio Gardel in 2000, the main musical award in Argentina. She served as an ambassador for UNICEF.',\n",
+       " 'url': 'https://en.wikipedia.org/wiki/Mercedes Sosa'}"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import wikipediaapi\n",
+    "import os\n",
+    "query=\"Mercedes Sosa\"\n",
+    "\"\"\"\n",
+    "Search Wikipedia for a query and return a dictionary with the summary of the page and the url of the page.\n",
+    "Args:\n",
+    "    query: The query to search for.\n",
+    "Returns:\n",
+    "    A dictionary with the summary of the page and the url of the page.\n",
+    "\"\"\"\n",
+    "summary_tool = wikipediaapi.Wikipedia(\n",
+    "    user_agent=f\"My research agent ({os.getenv('USER_EMAIL')})\",\n",
+    "    extra_api_params={\"include\": \"url\"}\n",
+    ")\n",
+    "page = summary_tool.page(query)\n",
+    "if not page.exists():\n",
+    "    raise ValueError(f\"No Wikipedia page found for '{query}'. Try a different query.\")\n",
+    "{\n",
+    "    \"summary\": page.summary,\n",
+    "    \"url\": f\"https://en.wikipedia.org/wiki/{page.title}\"\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "9992b1ec",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Mercedes Sosa (lang: en, variant: None, id: 476992, ns: 0)"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "page"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "11d4a9f8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'https://en.wikipedia.org/wiki/Mercedes_Sosa'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import wikipedia\n",
+    "ny = wikipedia.page(pageid=476992)\n",
+    "ny.url\n",
+    "# u'http://en.wikipedia.org/wiki/New_York'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "577562bd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'pageid': '476992',\n",
+       " 'title': 'Mercedes Sosa',\n",
+       " 'url': 'https://en.wikipedia.org/wiki/Mercedes_Sosa',\n",
+       " '_sections': [],\n",
+       " '_categories': ['1935 births',\n",
+       "  '2009 deaths',\n",
+       "  '20th-century Argentine women singers',\n",
+       "  '20th-century drummers',\n",
+       "  'All articles with dead external links',\n",
+       "  'All articles with unsourced statements',\n",
+       "  'Argentine activists',\n",
+       "  'Argentine people of Diaguita descent',\n",
+       "  'Argentine people of French descent',\n",
+       "  'Argentine people of Quechua descent',\n",
+       "  'Argentine women activists',\n",
+       "  'Articles with Brazilian Portuguese-language sources (pt-br)',\n",
+       "  'Articles with German-language sources (de)',\n",
+       "  'Articles with Spanish-language sources (es)',\n",
+       "  'Articles with dead external links from June 2024',\n",
+       "  'Articles with hCards',\n",
+       "  'Articles with short description',\n",
+       "  'Articles with unsourced statements from December 2023',\n",
+       "  'Bombo legüero players',\n",
+       "  'CS1 Spanish-language sources (es)',\n",
+       "  'Commons category link is on Wikidata',\n",
+       "  'Deaths from kidney failure in Argentina',\n",
+       "  'Latin Grammy Award winners',\n",
+       "  'Latin Grammy Lifetime Achievement Award winners',\n",
+       "  'Nueva canción musicians',\n",
+       "  'People from San Miguel de Tucumán',\n",
+       "  'Recipients of the Order of Cultural Merit (Brazil)',\n",
+       "  'Short description is different from Wikidata',\n",
+       "  'Use dmy dates from July 2025',\n",
+       "  'Webarchive template wayback links',\n",
+       "  'Wikipedia indefinitely move-protected pages',\n",
+       "  'Women in Latin music']}"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ny.__dict__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "9fa24efc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import wikipediaapi\n",
+    "import os\n",
+    "query=\"Mercedes Sosa\"\n",
+    "summary_tool = wikipediaapi.Wikipedia(\n",
+    "    user_agent=f\"My research agent ({os.getenv('USER_EMAIL')})\",\n",
+    ")\n",
+    "page = summary_tool.page(query)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "8589fd42",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Life',\n",
+       " 'Awards',\n",
+       " 'Death',\n",
+       " 'Tributes',\n",
+       " 'Discography',\n",
+       " 'Filmography',\n",
+       " 'References',\n",
+       " 'Further reading',\n",
+       " 'External links']"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "[section._title for section in page.sections]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "38a8b06c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "agents_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
test_executed.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
wiki_extractor.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import sys
3
+ import json
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import requests
7
+ from bs4 import BeautifulSoup, Tag
8
+ from markdownify import markdownify as md
9
+
10
+
11
+ USER_AGENT = (
12
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
13
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
14
+ "Chrome/126.0.0.0 Safari/537.36"
15
+ )
16
+
17
+
18
+ def _clean_text(text: str) -> str:
19
+ if not text:
20
+ return ""
21
+ # Remove reference markers like [1], [a], [note 1]
22
+ text = re.sub(r"\s*\[[^\]]+\]", "", text)
23
+ # Collapse whitespace
24
+ text = re.sub(r"\s+", " ", text).strip()
25
+ return text
26
+
27
+
28
+ def _previous_heading(element: Tag) -> Optional[str]:
29
+ node = element
30
+ while node is not None:
31
+ node = node.previous_sibling
32
+ if isinstance(node, Tag) and node.name in {"h2", "h3", "h4", "h5", "h6"}:
33
+ return _clean_text(node.get_text(" ", strip=True))
34
+ # Fallback: walk up to find any earlier heading in parent
35
+ parent = element.parent
36
+ while parent is not None and isinstance(parent, Tag):
37
+ sib = parent.previous_sibling
38
+ while sib is not None:
39
+ if isinstance(sib, Tag) and sib.name in {"h2", "h3", "h4", "h5", "h6"}:
40
+ return _clean_text(sib.get_text(" ", strip=True))
41
+ sib = sib.previous_sibling
42
+ parent = parent.parent
43
+ return None
44
+
45
+
46
+ def _parse_table(table: Tag) -> Dict[str, Any]:
47
+ # Determine title/caption
48
+ title = None
49
+ caption = table.find("caption")
50
+ if caption:
51
+ title = _clean_text(caption.get_text(" ", strip=True))
52
+ if not title:
53
+ title = _previous_heading(table)
54
+
55
+ # Skip navboxes or non-content tables
56
+ classes = set(table.get("class", []))
57
+ if any(c in {"navbox", "vertical-navbox", "metadata", "mbox"} for c in classes):
58
+ return {
59
+ "name": title or "",
60
+ "headers": [],
61
+ "rows": [],
62
+ "skipped": True,
63
+ }
64
+
65
+ # Identify header cells
66
+ headers: List[str] = []
67
+ header_row = None
68
+ thead = table.find("thead")
69
+ if thead:
70
+ header_row = thead.find("tr")
71
+ if header_row is None:
72
+ # Find first row that contains any <th>
73
+ for tr in table.find_all("tr", recursive=True):
74
+ if tr.find("th"):
75
+ header_row = tr
76
+ break
77
+ if header_row is not None:
78
+ for th in header_row.find_all(["th", "td"], recursive=False):
79
+ header_text = _clean_text(th.get_text(" ", strip=True))
80
+ if header_text:
81
+ headers.append(header_text)
82
+
83
+ # Collect rows
84
+ rows: List[Any] = []
85
+ for tr in table.find_all("tr", recursive=True):
86
+ if tr is header_row:
87
+ continue
88
+ cells = tr.find_all(["td", "th"], recursive=False)
89
+ if not cells:
90
+ continue
91
+ values = [_clean_text(c.get_text(" ", strip=True)) for c in cells]
92
+ # If headers exist and lengths match, map to dict; else keep as list
93
+ if headers and len(values) == len(headers):
94
+ rows.append({headers[i]: values[i] for i in range(len(headers))})
95
+ else:
96
+ rows.append(values)
97
+
98
+ return {
99
+ "name": title or "",
100
+ "headers": headers,
101
+ "rows": rows,
102
+ }
103
+
104
+
105
+ def extract_wikipedia_content(url: str) -> Dict[str, Any]:
106
+ resp = requests.get(
107
+ url,
108
+ headers={"User-Agent": USER_AGENT, "Accept-Language": "en-US,en;q=0.9"},
109
+ timeout=30,
110
+ )
111
+ resp.raise_for_status()
112
+
113
+ soup = BeautifulSoup(resp.text, "html.parser")
114
+ title_tag = soup.find("h1", id="firstHeading")
115
+ title = _clean_text(title_tag.get_text(" ", strip=True)) if title_tag else ""
116
+
117
+ # Main content
118
+ content = soup.select_one("#mw-content-text .mw-parser-output")
119
+ if content is None:
120
+ content = soup.find("div", class_="mw-parser-output") or soup
121
+
122
+ # Remove non-content elements
123
+ for selector in [
124
+ "table.navbox",
125
+ "table.vertical-navbox",
126
+ "div.reflist",
127
+ "ol.references",
128
+ "span.mw-editsection",
129
+ "script",
130
+ "style",
131
+ "div.mw-authority-control",
132
+ "div.navbox",
133
+ "table.metadata",
134
+ "table.toccolours",
135
+ "div.mw-references-wrap",
136
+ "sup.reference",
137
+ ]:
138
+ for node in content.select(selector):
139
+ node.decompose()
140
+
141
+ # Extract tables (keep real tables) by walking all descendants to capture nearest heading context
142
+ tables: List[Dict[str, Any]] = []
143
+ current_heading: Optional[str] = None
144
+ for node in content.descendants:
145
+ if not isinstance(node, Tag):
146
+ continue
147
+ if node.name in {"h2", "h3", "h4", "h5", "h6"}:
148
+ headline = node.find("span", class_="mw-headline")
149
+ heading_text = headline.get_text(" ", strip=True) if headline else node.get_text(" ", strip=True)
150
+ current_heading = _clean_text(heading_text)
151
+ continue
152
+ if node.name == "table":
153
+ classes = set(node.get("class", []))
154
+ if not classes or any(c in {"wikitable", "infobox", "sortable", "vevent"} for c in classes):
155
+ parsed = _parse_table(node)
156
+ if parsed.get("rows"):
157
+ if not parsed.get("name") and current_heading:
158
+ parsed["name"] = current_heading
159
+ tables.append({k: v for k, v in parsed.items() if k != "skipped"})
160
+
161
+ # Extract text markdown excluding tables
162
+ # Clone by stringifying and re-parsing only the content, then drop tables
163
+ content_clone = BeautifulSoup(str(content), "html.parser")
164
+ for tbl in content_clone.find_all("table"):
165
+ tbl.decompose()
166
+ text_markdown = md(str(content_clone), strip=['img'])
167
+ text_markdown = _clean_text(text_markdown)
168
+
169
+ return {
170
+ "title": title,
171
+ "url": url,
172
+ "text_markdown": text_markdown,
173
+ "tables": tables,
174
+ }
175
+
176
+
177
+ def _escape_markdown_cell(value: Any) -> str:
178
+ """Escape characters that break Markdown tables and normalize whitespace."""
179
+ if value is None:
180
+ return ""
181
+ text = str(value)
182
+ text = text.replace("|", "\\|")
183
+ text = re.sub(r"\s+", " ", text).strip()
184
+ return text
185
+
186
+
187
+ def format_tables_as_markdown(
188
+ tables: List[Dict[str, Any]],
189
+ max_tables: Optional[int] = None,
190
+ max_rows_per_table: int = 25,
191
+ ) -> str:
192
+ """
193
+ Convert extracted tables into compact Markdown tables.
194
+
195
+ Args:
196
+ tables: List of table dicts as returned by extract_wikipedia_content.
197
+ max_tables: If set, include at most this many tables (in order).
198
+ max_rows_per_table: Maximum number of data rows to include per table.
199
+
200
+ Returns:
201
+ A Markdown string representing the tables.
202
+ """
203
+ if not tables:
204
+ return ""
205
+
206
+ rendered_sections: List[str] = []
207
+ selected = tables if max_tables is None else tables[: max_tables]
208
+
209
+ for table_idx, table in enumerate(selected):
210
+ name = table.get("name") or f"Table {table_idx + 1}"
211
+ headers: List[str] = table.get("headers", [])
212
+ rows: List[Any] = table.get("rows", [])
213
+
214
+ if not rows:
215
+ continue
216
+
217
+ section_lines: List[str] = []
218
+ section_lines.append(f"### Table: {name}")
219
+
220
+ # If we have headers and row dicts/lists, render a markdown table
221
+ if headers:
222
+ # Header row
223
+ escaped_headers = [_escape_markdown_cell(h) for h in headers]
224
+ section_lines.append("| " + " | ".join(escaped_headers) + " |")
225
+ section_lines.append("| " + " | ".join(["---"] * len(headers)) + " |")
226
+
227
+ # Data rows
228
+ for r_idx, row in enumerate(rows[: max_rows_per_table]):
229
+ if isinstance(row, dict):
230
+ values = [_escape_markdown_cell(row.get(h, "")) for h in headers]
231
+ else:
232
+ # row is a list; align to headers length
233
+ values = [_escape_markdown_cell(row[i] if i < len(row) else "") for i in range(len(headers))]
234
+ section_lines.append("| " + " | ".join(values) + " |")
235
+ else:
236
+ # No headers: render as bullet list with row previews
237
+ for r_idx, row in enumerate(rows[: max_rows_per_table]):
238
+ if isinstance(row, dict):
239
+ preview = ", ".join(f"{_escape_markdown_cell(k)}: {_escape_markdown_cell(v)}" for k, v in row.items())
240
+ else:
241
+ preview = ", ".join(_escape_markdown_cell(v) for v in row)
242
+ section_lines.append(f"- {preview}")
243
+
244
+ # Indicate truncation if applicable
245
+ if len(rows) > max_rows_per_table:
246
+ section_lines.append(f"… ({len(rows) - max_rows_per_table} more rows omitted)")
247
+
248
+ rendered_sections.append("\n".join(section_lines))
249
+
250
+ return "\n\n".join(rendered_sections)
251
+
252
+
+ def format_extracted_content(
+     data: Dict[str, Any],
+     include_url: bool = True,
+     max_tables: Optional[int] = None,
+     max_rows_per_table: int = 25,
+ ) -> str:
+     """
+     Combine `text_markdown` and `tables` from extract_wikipedia_content into an LLM-friendly Markdown string.
+
+     Args:
+         data: Dict returned by extract_wikipedia_content.
+         include_url: Whether to include the source URL at the top.
+         max_tables: If set, include at most this many tables.
+         max_rows_per_table: Maximum number of data rows per table.
+
+     Returns:
+         A Markdown string ready to feed into an LLM.
+     """
+     if not data:
+         return ""
+
+     title = data.get("title") or ""
+     url = data.get("url") or ""
+     text_md = data.get("text_markdown") or ""
+     tables = data.get("tables") or []
+
+     parts: List[str] = []
+     if title:
+         parts.append(f"# {title}")
+     if include_url and url:
+         parts.append(f"Source: {url}")
+
+     if text_md:
+         parts.append("## Article")
+         parts.append(text_md)
+
+     tables_md = format_tables_as_markdown(tables, max_tables=max_tables, max_rows_per_table=max_rows_per_table)
+     if tables_md:
+         parts.append("## Tables")
+         parts.append(tables_md)
+
+     return "\n\n".join(p for p in parts if p)
+
+
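+ # A minimal usage sketch (hypothetical URL; extract_wikipedia_content is
+ # defined earlier in this module and its output keys follow the docstrings above):
+ #
+ #     data = extract_wikipedia_content("https://en.wikipedia.org/wiki/Python_(programming_language)")
+ #     prompt_context = format_extracted_content(data, max_tables=5, max_rows_per_table=10)
+ #     # -> "# <title>", "Source: <url>", an "## Article" section, then "## Tables".
+
+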
+ def main() -> None:
+     if len(sys.argv) < 2:
+         print("Usage: python wiki_extractor.py <wikipedia_url>")
+         sys.exit(1)
+     url = sys.argv[1]
+     data = extract_wikipedia_content(url)
+
+     print(json.dumps({
+         "title": data["title"],
+         "url": data["url"],
+         "num_tables": len(data["tables"]),
+         "table_names": [t.get("name", "") for t in data["tables"]][:20],
+     }, ensure_ascii=False, indent=2))
+
+     # Try to locate a "Studio albums" table and print its first 3 rows
+     studio_tables = [
+         t for t in data["tables"]
+         if "studio albums" in t.get("name", "").lower()
+     ]
+     if studio_tables:
+         t0 = studio_tables[0]
+         print("\nFound 'Studio albums' table. Headers:")
+         print(t0.get("headers", []))
+         print("First 3 rows:")
+         for row in t0.get("rows", [])[:3]:
+             print(row)
+     else:
+         # Heuristic fallback: first table whose name mentions 'albums'
+         albums_like = [
+             t for t in data["tables"] if "albums" in t.get("name", "").lower()
+         ]
+         if albums_like:
+             t0 = albums_like[0]
+             print("\nFound albums-related table. Headers:")
+             print(t0.get("headers", []))
+             print("First 3 rows:")
+             for row in t0.get("rows", [])[:3]:
+                 print(row)
+
+
+ if __name__ == "__main__":
+     main()
+
+
wikipedia_tools.py ADDED
@@ -0,0 +1,230 @@
+ import os
+ import requests
+ from io import StringIO
+ import pandas as pd
+ from bs4 import BeautifulSoup
+ from smolagents.tools import tool
+ import wikipediaapi
+
+
+ def fetch_wikipedia_page(url: str) -> str:
+     """Fetch the raw HTML of a Wikipedia page."""
+     headers = {
+         "User-Agent": "GAIA_benchmark_agent/1.0 (contact: gabriel.melki@gmail.com)",
+         "Accept-Language": "en-US,en;q=0.9",
+     }
+     resp = requests.get(url, headers=headers, timeout=50)
+     resp.raise_for_status()
+     return resp.text
+
+
+ def _normalize_title(value: str) -> str:
+     """Lowercase and collapse whitespace for robust title comparisons."""
+     return " ".join(value.lower().split()) if isinstance(value, str) else ""
+
+
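+ # e.g. _normalize_title("  Studio   Albums ") == "studio albums", and any
+ # non-string input normalizes to "".
+
+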
+ def _remove_sections_by_titles(soup: BeautifulSoup, titles: list[str]) -> None:
+     """Remove sections (header + content until the next header of the same or
+     higher level) whose header text matches any of `titles` (case-insensitive).
+     Mutates `soup` in place.
+     """
+     if not titles:
+         return
+     excluded = {_normalize_title(t) for t in titles}
+     header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
+
+     for header in list(soup.find_all(header_tags)):
+         title_text = _normalize_title(header.get_text(" ", strip=True))
+         if title_text in excluded:
+             level = int(header.name[1])
+             nodes_to_remove = [header]
+             for sibling in list(header.next_siblings):
+                 sib_name = getattr(sibling, "name", None)
+                 if sib_name in header_tags:
+                     next_level = int(sibling.name[1])
+                     if next_level <= level:
+                         break
+                 nodes_to_remove.append(sibling)
+             for node in nodes_to_remove:
+                 # decompose() only exists on Tag; NavigableStrings fall back to extract()
+                 try:
+                     node.decompose()
+                 except Exception:
+                     try:
+                         node.extract()
+                     except Exception:
+                         pass
+
+
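+ # Usage sketch: given some page URL `url`, prune boilerplate sections in place
+ # before extracting anything.
+ #
+ #     soup = BeautifulSoup(fetch_wikipedia_page(url), "html.parser")
+ #     _remove_sections_by_titles(soup, ["References", "External links"])
+ #     # Each matching header and everything up to the next header of the
+ #     # same (or higher) level is removed from `soup`.
+
+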
+ def _cleanup_non_content(root: BeautifulSoup) -> None:
+     """Remove Wikipedia UI/maintenance blocks from the main content area."""
+     selectors = [
+         "div#toc",
+         "div.toc",
+         "div.hatnote",
+         "div.shortdescription",
+         "div.reflist",
+         "ol.references",
+         "div.navbox",
+         "table.navbox",
+         "table.vertical-navbox",
+         "table.sidebar",
+         "table.ambox",
+         "table.metadata",
+         "div#catlinks",
+         "div.mw-authority-control",
+         "div.printfooter",
+         "div.portal",
+         "table.infobox",  # avoid dumping the infobox into the text
+     ]
+     for sel in selectors:
+         for el in root.select(sel):
+             try:
+                 el.decompose()
+             except Exception:
+                 try:
+                     el.extract()
+                 except Exception:
+                     pass
+
+
+ def extract_text(soup: BeautifulSoup) -> str:
+     """Extract the main text (headers + paragraphs + lists) from the article body only."""
+     content_root = soup.select_one("div.mw-parser-output") or soup
+
+     for elem in content_root(["script", "style", "sup", "aside", "nav"]):
+         elem.decompose()
+     _cleanup_non_content(content_root)
+
+     headers = [h.get_text(" ", strip=True) for h in content_root.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])]
+     paragraphs = [p.get_text(" ", strip=True) for p in content_root.find_all("p")]
+     lists = [li.get_text(" ", strip=True) for li in content_root.find_all("li")]
+
+     all_text = headers + paragraphs + lists
+     # Drop short fragments (stray captions, footnote stubs, one-word list items)
+     clean = [t for t in all_text if len(t) > 30]
+     return "\n\n".join(clean)
+
+
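+ # Usage sketch (continuing the `soup` from the example above):
+ #
+ #     text = extract_text(soup)
+ #     # Headers, paragraphs and list items longer than 30 characters, joined by
+ #     # blank lines; scripts, footnote markers and navigation chrome are
+ #     # stripped first.
+
+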
+ def extract_tables(soup: BeautifulSoup) -> list[dict]:
+     """Extract all HTML content tables as dicts: {name, df}."""
+     content_root = soup.select_one("div.mw-parser-output") or soup
+
+     tables = []
+     for table_idx, table in enumerate(content_root.find_all("table")):
+         # Skip non-content tables (navboxes, sidebars, etc.)
+         classes = table.get("class", [])
+         if isinstance(classes, list) and any(
+             c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
+             for c in classes
+         ):
+             continue
+
+         # Prefer an explicit <caption>
+         caption_el = table.find("caption")
+         name = caption_el.get_text(" ", strip=True) if caption_el else None
+
+         # Fallback: nearest previous section header
+         if not name:
+             prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
+             if prev_header:
+                 name = prev_header.get_text(" ", strip=True)
+
+         # Fallback: class-based hints (e.g., infobox)
+         if not name and isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
+             name = "Infobox"
+
+         # Final fallback
+         if not name:
+             name = f"Table {table_idx + 1}"
+
+         try:
+             dfs = pd.read_html(StringIO(str(table)))
+             if len(dfs) == 1:
+                 tables.append({"name": name, "df": dfs[0]})
+             else:
+                 for part_idx, df in enumerate(dfs, start=1):
+                     tables.append({"name": f"{name} (part {part_idx})", "df": df})
+         except ValueError:
+             # pd.read_html raises ValueError when it finds no parsable table
+             continue
+     return tables
+
+
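+ # Usage sketch: each entry pairs a best-effort name with a pandas DataFrame.
+ #
+ #     tables = extract_tables(soup)
+ #     for t in tables[:3]:
+ #         print(t["name"], t["df"].shape)
+
+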
+ def format_for_llm(text: str, tables: list[dict], spec: dict) -> str:
+     """Combine text + tables into a single string for LLM input."""
+     output = []
+     output.append("=== ARTICLE TEXT ===\n")
+     output.append(text)
+
+     sections_to_exclude = spec.get("sections_to_exclude", [])
+     excluded = {_normalize_title(s) for s in sections_to_exclude}
+     filtered_tables = [
+         t for t in tables if _normalize_title(t.get("name", "")) not in excluded
+     ]
+
+     for i, t in enumerate(filtered_tables, start=1):
+         tname = t.get("name") or f"Table {i}"
+         df = t["df"]
+         output.append(f"\n\n=== TABLE {i}: {tname} ===\n")
+         output.append(df.to_markdown(index=False))
+
+     return "\n".join(output)
+
+
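+ # Usage sketch (this spec mirrors the default used by read_wikipedia_page below):
+ #
+ #     spec = {"sections_to_exclude": ["References", "External links"]}
+ #     llm_input = format_for_llm(text, tables, spec)
+ #     # "=== ARTICLE TEXT ===" followed by one "=== TABLE i: name ===" block per
+ #     # table whose name is not excluded. Note that DataFrame.to_markdown
+ #     # requires the optional `tabulate` dependency.
+
+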
+ @tool
+ def wikipedia_summary(entity: str) -> dict:
+     """
+     Look up the Wikipedia page for an entity and return its summary and URL.
+
+     Args:
+         entity: The entity to look up. ALWAYS pass exactly the entity name
+             (person/place/event/concept) with no extra qualifiers.
+     Returns:
+         A dictionary with the summary of the page (including its section
+         titles) and the URL of the page.
+     """
+     wiki = wikipediaapi.Wikipedia(
+         user_agent=f"My research agent ({os.getenv('USER_EMAIL')})",
+         language="en",
+     )
+     page = wiki.page(entity)
+     if not page.exists():
+         raise ValueError(f"No Wikipedia page found for '{entity}'. Try a different query.")
+     sections = [section.title for section in page.sections]
+     return {
+         "summary": (
+             f"The sections inside the page are {', '.join(sections)} "
+             f"and the summary of the page is {page.summary}"
+         ),
+         "url": page.fullurl,
+     }
+
+
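+ # Usage sketch (assumes USER_EMAIL is set in the environment; the entity is
+ # just an example):
+ #
+ #     info = wikipedia_summary("Mercedes Sosa")
+ #     info["summary"]  # section titles + page summary
+ #     info["url"]      # e.g. "https://en.wikipedia.org/wiki/Mercedes_Sosa"
+
+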
+ DEFAULT_SPEC = {
+     "sections_to_exclude": [
+         "External links",
+         "References",
+         "Further reading",
+         "See also",
+         "Notes",
+     ]
+ }
+
+
+ @tool
+ def read_wikipedia_page(url: str, spec: dict | None = None) -> str:
+     """
+     Read a Wikipedia page and return its text and tables as one LLM-ready string.
+
+     Args:
+         url: The URL of the English-Wikipedia page to read.
+         spec: A dictionary specifying sections to exclude from the page;
+             defaults to skipping "External links", "References",
+             "Further reading", "See also" and "Notes".
+     Returns:
+         A single string with the article text followed by its tables.
+     """
+     # Avoid a mutable default argument; fall back to the module-level spec
+     if spec is None:
+         spec = DEFAULT_SPEC
+     if not url.startswith("https://en.wikipedia.org/wiki/"):
+         raise ValueError("`url` must be an English Wikipedia article URL (https://en.wikipedia.org/wiki/...).")
+     # Fetch and parse the page
+     html = fetch_wikipedia_page(url)
+     soup = BeautifulSoup(html, "html.parser")
+     # Remove unwanted sections before extraction
+     _remove_sections_by_titles(soup, spec.get("sections_to_exclude", []))
+
+     text = extract_text(soup)
+     tables = extract_tables(soup)
+
+     # Combine text and tables into a single LLM-ready string
+     return format_for_llm(text, tables, spec)
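+
+
+ # Usage sketch (example article; any English Wikipedia URL works):
+ #
+ #     page_text = read_wikipedia_page("https://en.wikipedia.org/wiki/Mercedes_Sosa")
+ #     # Article text plus every content table rendered as Markdown, with
+ #     # "References", "External links", etc. already pruned.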