aelin committed on
Commit
f791164
·
1 Parent(s): d0210fc

Adds web page to markdown conversion tool

Browse files

Introduces a tool to fetch a web page by URL and convert its content to markdown, with error handling and output length limits. Also updates agent context initialization to ensure proper usage across runs.

Enhances web content extraction and improves agent reliability.

Files changed (2) hide show
  1. _tools.py +31 -0
  2. app.py +4 -2
_tools.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import requests
2
  import io
3
 
@@ -78,6 +83,25 @@ def _extract_text_from_audio_file(file_bytes: bytes) -> str:
78
  """Extract text from an audio file."""
79
  return client.automatic_speech_recognition(file_bytes, model="openai/whisper-large-v2").text
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  # Initialize tools
82
  search_tool = FunctionTool.from_defaults(
83
  _search_tool,
@@ -133,6 +157,12 @@ extract_text_from_audio_file_tool = FunctionTool.from_defaults(
133
  description="Extract text from an audio file."
134
  )
135
 
 
 
 
 
 
 
136
  tools = [
137
  search_tool,
138
  fetch_file_bytes_tool,
@@ -143,5 +173,6 @@ tools = [
143
  extract_text_from_code_file_tool,
144
  extract_text_from_audio_file_tool,
145
  xlsx_to_text_tool,
 
146
  ]
147
 
 
1
+
2
+ import re
3
+ from markdownify import markdownify
4
+
5
+
6
  import requests
7
  import io
8
 
 
83
  """Extract text from an audio file."""
84
  return client.automatic_speech_recognition(file_bytes, model="openai/whisper-large-v2").text
85
 
def _webpage_to_markdown(url: str) -> str:
    """
    Access a web page and return its content as markdown.

    Limits output to 10,000 characters to avoid excessive responses.

    Args:
        url: Address of the web page to fetch.

    Returns:
        The page body converted to markdown (runs of three or more newlines
        collapsed to a single blank line, truncated to 10,000 characters),
        or a human-readable error message when the request or the
        conversion fails. Errors are returned as text rather than raised.
    """
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()

        # When a text/* response declares no charset, requests decodes
        # `.text` as ISO-8859-1, which garbles UTF-8 pages. In that case use
        # the encoding detected from the body instead.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding or response.encoding

        markdown_content = markdownify(response.text).strip()
        # Collapse excessive vertical whitespace left over from the HTML layout.
        markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
        return markdown_content[:10000]
    except requests.exceptions.Timeout:
        return "Request timed out. Please try again later or check the URL."
    except requests.exceptions.RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        # Tool boundary: report conversion failures as text instead of raising.
        return f"Unexpected error: {str(e)}"
103
+
104
+
105
  # Initialize tools
106
  search_tool = FunctionTool.from_defaults(
107
  _search_tool,
 
157
  description="Extract text from an audio file."
158
  )
159
 
160
+ webpage_to_markdown_tool = FunctionTool.from_defaults(
161
+ _webpage_to_markdown,
162
+ name="Webpage to Markdown",
163
+ description="Access a web page by URL and return the content as markdown. Use to read web pages."
164
+ )
165
+
166
  tools = [
167
  search_tool,
168
  fetch_file_bytes_tool,
 
173
  extract_text_from_code_file_tool,
174
  extract_text_from_audio_file_tool,
175
  xlsx_to_text_tool,
176
+ webpage_to_markdown_tool,
177
  ]
178
 
app.py CHANGED
@@ -12,7 +12,6 @@ import asyncio
12
  from utils import cache_answers, update_cache_answer, get_cached_answer, load_cache
13
 
14
 
15
- context = Context()
16
  # (Keep Constants as is)
17
  # --- Constants ---
18
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -42,7 +41,10 @@ class BasicAgent:
42
  Don't use any other format than the one above and limit your attempts to answer the question to 3 times.
43
  """,
44
  )
 
 
45
  self.agent = agent
 
46
 
47
  async def run(self, question: Question) -> str:
48
  question_text = question["question"]
@@ -69,7 +71,7 @@ class BasicAgent:
69
 
70
  return str(cached["answer"])
71
 
72
- answer = await self.agent.run(prompt, ctx=context)
73
 
74
  print(f"Agent returning answer: {answer}")
75
 
 
12
  from utils import cache_answers, update_cache_answer, get_cached_answer, load_cache
13
 
14
 
 
15
  # (Keep Constants as is)
16
  # --- Constants ---
17
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
41
  Don't use any other format than the one above and limit your attempts to answer the question to 3 times.
42
  """,
43
  )
44
+
45
+ context = Context(agent)
46
  self.agent = agent
47
+ self.context = context
48
 
49
  async def run(self, question: Question) -> str:
50
  question_text = question["question"]
 
71
 
72
  return str(cached["answer"])
73
 
74
+ answer = await self.agent.run(prompt, ctx=self.context)
75
 
76
  print(f"Agent returning answer: {answer}")
77