Thanh Vinh Vo
committed on
Commit
·
abda1eb
1
Parent(s):
b34523d
update
Browse files- app.py +80 -2
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -24,6 +24,80 @@ import whisper
|
|
| 24 |
# --- Constants ---
|
| 25 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
@tool
|
| 28 |
def audio_to_text(file_path: str) -> str:
|
| 29 |
"""
|
|
@@ -83,6 +157,7 @@ def audio_to_text(file_path: str) -> str:
|
|
| 83 |
|
| 84 |
return json.dumps({"transcript": transcript_data})
|
| 85 |
|
|
|
|
| 86 |
|
| 87 |
|
| 88 |
@tool
|
|
@@ -160,6 +235,7 @@ class BasicAgent:
|
|
| 160 |
"numpy",
|
| 161 |
"json",
|
| 162 |
"whisper",
|
|
|
|
| 163 |
],
|
| 164 |
name="multimodal_agent",
|
| 165 |
description="""
|
|
@@ -169,7 +245,7 @@ class BasicAgent:
|
|
| 169 |
)
|
| 170 |
|
| 171 |
self.code_agent = CodeAgent(
|
| 172 |
-
tools=[VisitWebpageTool(), DuckDuckGoSearchTool(), get_file, audio_to_text],
|
| 173 |
model=InferenceClientModel(
|
| 174 |
model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
|
| 175 |
),
|
|
@@ -191,6 +267,7 @@ class BasicAgent:
|
|
| 191 |
"chess.engine",
|
| 192 |
"json",
|
| 193 |
"whisper",
|
|
|
|
| 194 |
],
|
| 195 |
name="code_agent",
|
| 196 |
description="""
|
|
@@ -213,7 +290,7 @@ class BasicAgent:
|
|
| 213 |
model=InferenceClientModel(
|
| 214 |
"Qwen/Qwen2.5-32B-Instruct"
|
| 215 |
),
|
| 216 |
-
tools=[get_file, audio_to_text],
|
| 217 |
managed_agents=[
|
| 218 |
self.multimodal_agent,
|
| 219 |
self.code_agent],
|
|
@@ -235,6 +312,7 @@ class BasicAgent:
|
|
| 235 |
"chess.engine",
|
| 236 |
"whisper",
|
| 237 |
"json",
|
|
|
|
| 238 |
],
|
| 239 |
planning_interval=5,
|
| 240 |
max_steps=15,
|
|
|
|
| 24 |
# --- Constants ---
|
| 25 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 26 |
|
| 27 |
+
|
| 28 |
+
@tool
def extract_table_from_html(html: str, match: str | None = None) -> list:
    """
    Extract HTML ``<table>`` elements from HTML content as pandas DataFrames.

    Uses ``pandas.read_html()`` to parse tables from the provided content,
    which may be either a raw HTML string or a URL (pandas supports both).
    Tables can optionally be filtered by a text pattern.

    Args:
        html (str): The HTML content containing tables to extract. This can
            be raw HTML string content or a URL to a webpage.
        match (str | None, optional): A string or regular expression pattern
            to match against table text content. Only tables containing this
            pattern are returned. If None, all tables are extracted.
            The match is case-sensitive. Defaults to None.

    Returns:
        list: A list of pandas DataFrames, one per table found in the HTML
            content. Returns an empty list if no tables are found.

    Raises:
        ValueError: If the HTML content is invalid or cannot be parsed.
        Exception: If HTML parsing fails or other unexpected errors occur.

    Example:
        >>> html_content = '''
        ... <table>
        ...   <tr><th>Name</th><th>Age</th></tr>
        ...   <tr><td>John</td><td>25</td></tr>
        ... </table>
        ... '''
        >>> tables = extract_table_from_html(html_content)
        >>> print(f"Found {len(tables)} tables")
        >>> tables = extract_table_from_html(html_content, match="Name")

    Note:
        - ``pandas.read_html()`` requires lxml, html5lib, or BeautifulSoup4.
        - Tables must be properly formatted HTML ``<table>`` elements.
        - Returns an empty list instead of raising when no tables are found.
    """
    # Imported locally so the tool remains self-contained when serialized
    # for an agent sandbox.
    import pandas as pd

    try:
        # pandas defaults `match` to ".+" (match any table), so only forward
        # the argument when the caller actually supplied a pattern.
        if match is not None:
            tables = pd.read_html(html, match=match)
        else:
            tables = pd.read_html(html)

        # read_html returns a non-empty list on success; the fallback is a
        # defensive guarantee that callers always receive a list.
        return tables if tables else []

    except ValueError as e:
        # pandas signals "no tables" via ValueError — treat that case as an
        # empty result rather than an error.
        if "No tables found" in str(e):
            return []
        # Chain with `from e` so the original parser traceback is preserved.
        raise ValueError(f"Error extracting tables from HTML content: {e}") from e
    except Exception as e:
        raise Exception(f"Failed to extract tables from HTML content: {e}") from e
|
| 100 |
+
|
| 101 |
@tool
|
| 102 |
def audio_to_text(file_path: str) -> str:
|
| 103 |
"""
|
|
|
|
| 157 |
|
| 158 |
return json.dumps({"transcript": transcript_data})
|
| 159 |
|
| 160 |
+
@tool
|
| 161 |
|
| 162 |
|
| 163 |
@tool
|
|
|
|
| 235 |
"numpy",
|
| 236 |
"json",
|
| 237 |
"whisper",
|
| 238 |
+
"openpyxl"
|
| 239 |
],
|
| 240 |
name="multimodal_agent",
|
| 241 |
description="""
|
|
|
|
| 245 |
)
|
| 246 |
|
| 247 |
self.code_agent = CodeAgent(
|
| 248 |
+
tools=[VisitWebpageTool(), DuckDuckGoSearchTool(), get_file, audio_to_text, extract_table_from_html],
|
| 249 |
model=InferenceClientModel(
|
| 250 |
model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
|
| 251 |
),
|
|
|
|
| 267 |
"chess.engine",
|
| 268 |
"json",
|
| 269 |
"whisper",
|
| 270 |
+
"openpyxl"
|
| 271 |
],
|
| 272 |
name="code_agent",
|
| 273 |
description="""
|
|
|
|
| 290 |
model=InferenceClientModel(
|
| 291 |
"Qwen/Qwen2.5-32B-Instruct"
|
| 292 |
),
|
| 293 |
+
tools=[get_file, audio_to_text, extract_table_from_html],
|
| 294 |
managed_agents=[
|
| 295 |
self.multimodal_agent,
|
| 296 |
self.code_agent],
|
|
|
|
| 312 |
"chess.engine",
|
| 313 |
"whisper",
|
| 314 |
"json",
|
| 315 |
+
"openpyxl"
|
| 316 |
],
|
| 317 |
planning_interval=5,
|
| 318 |
max_steps=15,
|
requirements.txt
CHANGED
|
@@ -14,3 +14,4 @@ opencv-python
|
|
| 14 |
numpy
|
| 15 |
html5lib
|
| 16 |
openai-whisper
|
|
|
|
|
|
| 14 |
numpy
|
| 15 |
html5lib
|
| 16 |
openai-whisper
|
| 17 |
+
openpyxl
|