Spaces:
Sleeping
Sleeping
aelin committed on
Commit ·
f791164
1
Parent(s): d0210fc
Adds web page to markdown conversion tool
Browse files
Introduces a tool to fetch a web page by URL and convert its content to markdown, with error handling and output length limits. Also updates agent context initialization to ensure proper usage across runs.
Enhances web content extraction and improves agent reliability.
_tools.py
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import requests
|
| 2 |
import io
|
| 3 |
|
|
@@ -78,6 +83,25 @@ def _extract_text_from_audio_file(file_bytes: bytes) -> str:
|
|
| 78 |
"""Extract text from an audio file."""
|
| 79 |
return client.automatic_speech_recognition(file_bytes, model="openai/whisper-large-v2").text
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
# Initialize tools
|
| 82 |
search_tool = FunctionTool.from_defaults(
|
| 83 |
_search_tool,
|
|
@@ -133,6 +157,12 @@ extract_text_from_audio_file_tool = FunctionTool.from_defaults(
|
|
| 133 |
description="Extract text from an audio file."
|
| 134 |
)
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
tools = [
|
| 137 |
search_tool,
|
| 138 |
fetch_file_bytes_tool,
|
|
@@ -143,5 +173,6 @@ tools = [
|
|
| 143 |
extract_text_from_code_file_tool,
|
| 144 |
extract_text_from_audio_file_tool,
|
| 145 |
xlsx_to_text_tool,
|
|
|
|
| 146 |
]
|
| 147 |
|
|
|
|
| 1 |
+
|
| 2 |
+
import re
|
| 3 |
+
from markdownify import markdownify
|
| 4 |
+
|
| 5 |
+
|
| 6 |
import requests
|
| 7 |
import io
|
| 8 |
|
|
|
|
| 83 |
"""Extract text from an audio file."""
|
| 84 |
return client.automatic_speech_recognition(file_bytes, model="openai/whisper-large-v2").text
|
| 85 |
|
| 86 |
+
def _webpage_to_markdown(url: str) -> str:
    """
    Access a web page and return its content as markdown.

    The page is fetched with a 20-second timeout, converted from HTML to
    markdown, and runs of three or more newlines are collapsed to a single
    blank line. Limits output to 10,000 characters to avoid excessive
    responses.

    Args:
        url: Address of the web page to fetch.

    Returns:
        The page content as markdown (at most 10,000 characters), or a
        human-readable error message if the page could not be fetched or
        converted. Errors are returned as strings rather than raised.
    """
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        # When the server declares no charset, requests falls back to
        # ISO-8859-1 for text/* responses, which garbles UTF-8 pages.
        # Use the encoding detected from the body in that case.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding
        markdown_content = markdownify(response.text).strip()
        # Collapse excessive vertical whitespace left over from the HTML.
        markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
        return markdown_content[:10000]
    except requests.exceptions.Timeout:
        return "Request timed out. Please try again later or check the URL."
    except requests.exceptions.RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        # Broad on purpose: any other failure (e.g. during conversion) is
        # reported as text instead of propagating to the caller.
        return f"Unexpected error: {str(e)}"
|
| 103 |
+
|
| 104 |
+
|
| 105 |
# Initialize tools
|
| 106 |
search_tool = FunctionTool.from_defaults(
|
| 107 |
_search_tool,
|
|
|
|
| 157 |
description="Extract text from an audio file."
|
| 158 |
)
|
| 159 |
|
| 160 |
+
# Wrap the web page reader in a FunctionTool with a name and description.
webpage_to_markdown_tool = FunctionTool.from_defaults(
    fn=_webpage_to_markdown,
    name="Webpage to Markdown",
    description="Access a web page by URL and return the content as markdown. Use to read web pages.",
)
|
| 165 |
+
|
| 166 |
tools = [
|
| 167 |
search_tool,
|
| 168 |
fetch_file_bytes_tool,
|
|
|
|
| 173 |
extract_text_from_code_file_tool,
|
| 174 |
extract_text_from_audio_file_tool,
|
| 175 |
xlsx_to_text_tool,
|
| 176 |
+
webpage_to_markdown_tool,
|
| 177 |
]
|
| 178 |
|
app.py
CHANGED
|
@@ -12,7 +12,6 @@ import asyncio
|
|
| 12 |
from utils import cache_answers, update_cache_answer, get_cached_answer, load_cache
|
| 13 |
|
| 14 |
|
| 15 |
-
context = Context()
|
| 16 |
# (Keep Constants as is)
|
| 17 |
# --- Constants ---
|
| 18 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
@@ -42,7 +41,10 @@ class BasicAgent:
|
|
| 42 |
Don't use any other format than the one above and limit your attempts to answer the question to 3 times.
|
| 43 |
""",
|
| 44 |
)
|
|
|
|
|
|
|
| 45 |
self.agent = agent
|
|
|
|
| 46 |
|
| 47 |
async def run(self, question: Question) -> str:
|
| 48 |
question_text = question["question"]
|
|
@@ -69,7 +71,7 @@ class BasicAgent:
|
|
| 69 |
|
| 70 |
return str(cached["answer"])
|
| 71 |
|
| 72 |
-
answer = await self.agent.run(prompt, ctx=context)
|
| 73 |
|
| 74 |
print(f"Agent returning answer: {answer}")
|
| 75 |
|
|
|
|
| 12 |
from utils import cache_answers, update_cache_answer, get_cached_answer, load_cache
|
| 13 |
|
| 14 |
|
|
|
|
| 15 |
# (Keep Constants as is)
|
| 16 |
# --- Constants ---
|
| 17 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
|
|
| 41 |
Don't use any other format than the one above and limit your attempts to answer the question to 3 times.
|
| 42 |
""",
|
| 43 |
)
|
| 44 |
+
|
| 45 |
+
context = Context(agent)
|
| 46 |
self.agent = agent
|
| 47 |
+
self.context = context
|
| 48 |
|
| 49 |
async def run(self, question: Question) -> str:
|
| 50 |
question_text = question["question"]
|
|
|
|
| 71 |
|
| 72 |
return str(cached["answer"])
|
| 73 |
|
| 74 |
+
answer = await self.agent.run(prompt, ctx=self.context)
|
| 75 |
|
| 76 |
print(f"Agent returning answer: {answer}")
|
| 77 |
|