Thanh Vinh Vo committed on
Commit
abda1eb
·
1 Parent(s): b34523d
Files changed (2) hide show
  1. app.py +80 -2
  2. requirements.txt +1 -0
app.py CHANGED
@@ -24,6 +24,80 @@ import whisper
24
  # --- Constants ---
25
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  @tool
28
  def audio_to_text(file_path: str) -> str:
29
  """
@@ -83,6 +157,7 @@ def audio_to_text(file_path: str) -> str:
83
 
84
  return json.dumps({"transcript": transcript_data})
85
 
 
86
 
87
 
88
  @tool
@@ -160,6 +235,7 @@ class BasicAgent:
160
  "numpy",
161
  "json",
162
  "whisper",
 
163
  ],
164
  name="multimodal_agent",
165
  description="""
@@ -169,7 +245,7 @@ class BasicAgent:
169
  )
170
 
171
  self.code_agent = CodeAgent(
172
- tools=[VisitWebpageTool(), DuckDuckGoSearchTool(), get_file, audio_to_text],
173
  model=InferenceClientModel(
174
  model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
175
  ),
@@ -191,6 +267,7 @@ class BasicAgent:
191
  "chess.engine",
192
  "json",
193
  "whisper",
 
194
  ],
195
  name="code_agent",
196
  description="""
@@ -213,7 +290,7 @@ class BasicAgent:
213
  model=InferenceClientModel(
214
  "Qwen/Qwen2.5-32B-Instruct"
215
  ),
216
- tools=[get_file, audio_to_text],
217
  managed_agents=[
218
  self.multimodal_agent,
219
  self.code_agent],
@@ -235,6 +312,7 @@ class BasicAgent:
235
  "chess.engine",
236
  "whisper",
237
  "json",
 
238
  ],
239
  planning_interval=5,
240
  max_steps=15,
 
24
  # --- Constants ---
25
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
26
 
27
+
28
@tool
def extract_table_from_html(html: str, match: str | None = None) -> list:
    """Extract HTML tables from HTML content and return them as pandas DataFrames.

    Uses ``pandas.read_html()`` to parse ``<table>`` elements from the provided
    content. Accepts either a raw HTML string or a URL (pandas supports both).
    Optionally filters tables by a text pattern.

    Args:
        html (str): Raw HTML string content, or a URL to a webpage, containing
            the tables to extract.
        match (str | None, optional): A string or regular-expression pattern
            matched against table text. Only tables containing the pattern are
            returned. If None, all tables are extracted. Defaults to None.
            Note: matching is case-sensitive.

    Returns:
        list: A list of pandas DataFrames, one per table found. Returns an
        empty list (instead of raising) when no tables are found.

    Raises:
        ValueError: If the HTML content is invalid or cannot be parsed.
        Exception: If parsing fails for any other reason.

    Example:
        >>> html_content = '''
        ... <table>
        ...   <tr><th>Name</th><th>Age</th></tr>
        ...   <tr><td>John</td><td>25</td></tr>
        ... </table>
        ... '''
        >>> tables = extract_table_from_html(html_content)
        >>> tables[0].shape
        (1, 2)

    Note:
        - Requires lxml, html5lib, or BeautifulSoup4 (read_html backends).
        - Tables must be well-formed HTML ``<table>`` elements.
    """
    # Local imports keep the tool self-contained for the agent sandbox.
    import pandas as pd
    from io import StringIO

    # pandas >= 2.1 deprecates passing a literal HTML string to read_html;
    # raw markup must be wrapped in a StringIO. URLs/paths pass through
    # unchanged so the documented URL support is preserved.
    source = StringIO(html) if html.lstrip().startswith("<") else html

    try:
        if match is not None:
            tables = pd.read_html(source, match=match)
        else:
            tables = pd.read_html(source)
        # read_html never returns an empty list (it raises ValueError when no
        # tables exist), so the result can be returned directly.
        return tables
    except ValueError as e:
        if "No tables found" in str(e):
            # Honor the documented contract: empty list instead of an error.
            return []
        # Chain the cause so the original parse failure stays in the traceback.
        raise ValueError(f"Error extracting tables from HTML content: {e}") from e
    except Exception as e:
        raise Exception(f"Failed to extract tables from HTML content: {e}") from e
100
+
101
  @tool
102
  def audio_to_text(file_path: str) -> str:
103
  """
 
157
 
158
  return json.dumps({"transcript": transcript_data})
159
 
160
+ @tool
161
 
162
 
163
  @tool
 
235
  "numpy",
236
  "json",
237
  "whisper",
238
+ "openpyxl"
239
  ],
240
  name="multimodal_agent",
241
  description="""
 
245
  )
246
 
247
  self.code_agent = CodeAgent(
248
+ tools=[VisitWebpageTool(), DuckDuckGoSearchTool(), get_file, audio_to_text, extract_table_from_html],
249
  model=InferenceClientModel(
250
  model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
251
  ),
 
267
  "chess.engine",
268
  "json",
269
  "whisper",
270
+ "openpyxl"
271
  ],
272
  name="code_agent",
273
  description="""
 
290
  model=InferenceClientModel(
291
  "Qwen/Qwen2.5-32B-Instruct"
292
  ),
293
+ tools=[get_file, audio_to_text, extract_table_from_html],
294
  managed_agents=[
295
  self.multimodal_agent,
296
  self.code_agent],
 
312
  "chess.engine",
313
  "whisper",
314
  "json",
315
+ "openpyxl"
316
  ],
317
  planning_interval=5,
318
  max_steps=15,
requirements.txt CHANGED
@@ -14,3 +14,4 @@ opencv-python
14
  numpy
15
  html5lib
16
  openai-whisper
 
 
14
  numpy
15
  html5lib
16
  openai-whisper
17
+ openpyxl