# Hugging Face Space application file (size when captured: 3,422 bytes).
import os
import yaml
import requests
from bs4 import BeautifulSoup
from smolagents import (
load_tool, tool, Tool,
DuckDuckGoSearchTool, FinalAnswerTool,
CodeAgent, InferenceClientModel,
)
from Gradio_UI import GradioUI
class SiteContentFetcher(Tool):
    """Agent tool that downloads a web page and returns its readable text.

    The page is requested over HTTP(S), script/style/noscript elements are
    removed, and the remaining text is returned with blank lines dropped.
    Failures are reported as human-readable strings instead of raised
    exceptions, so the calling agent receives the error text as the tool's
    result.
    """

    name = "site_content_fetcher"
    description = (
        "This tool fetches and cleans readable text from the specified URL. Normally used after some web_search_tool."
    )
    inputs = {
        "url": {
            "type": "string",
            "description": "The full URL of the website to fetch content from, including the protocol (http or https).",
        }
    }
    output_type = "string"

    def __init__(self):
        # Run the base Tool initializer so the state the framework expects on
        # every tool instance is set up before the tool is first called.
        super().__init__()
        self.MAX_CHARS = 100_000  # Upper bound on the size of returned content

    def forward(self, url: str) -> str:
        """Fetch ``url`` and return its cleaned text, or an error message.

        Args:
            url: Full URL of the page, including the ``http://`` or
                ``https://`` scheme.

        Returns:
            At most ``self.MAX_CHARS`` characters of cleaned page text, or a
            short description of the failure if the page could not be
            fetched or parsed.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; SiteContentFetcher/1.0)"
        }
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            cleaned_text = self._clean_html(response.text)
            return cleaned_text[:self.MAX_CHARS]
        except (
            requests.exceptions.MissingSchema,
            requests.exceptions.InvalidSchema,
            requests.exceptions.InvalidURL,
        ):
            # All three mean the URL itself is unusable (missing or
            # unsupported scheme, malformed address), so they share one hint.
            return "Invalid URL format. Make sure it starts with http:// or https://"
        except requests.exceptions.Timeout:
            # Must precede ConnectionError: a connect timeout is both.
            return "The request timed out. The site may be too slow or unresponsive."
        except requests.exceptions.ConnectionError:
            return f"Failed to connect to {url}. Please check if the site is reachable."
        except requests.exceptions.HTTPError as e:
            return f"HTTP error occurred: {e.response.status_code} {e.response.reason}"
        except Exception as e:
            # Deliberate catch-all at the tool boundary: the error is returned
            # as text rather than propagated to the caller.
            return f"An unexpected error occurred: {e}"

    def _clean_html(self, html: str) -> str:
        """Return the visible text of ``html``, one non-empty line per row."""
        soup = BeautifulSoup(html, "html.parser")

        # Remove script, style, and noscript tags
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        # Extract text, trim each line, and drop the lines that end up empty
        text = soup.get_text(separator="\n")
        stripped_lines = (line.strip() for line in text.splitlines())
        return "\n".join(line for line in stripped_lines if line)
# @tool
# def my_custom_tool(arg1: str) -> str:
# """ Description
# Args:
# arg1: the first argument
# """
# pass
# Model init
# If the agent does not answer, the model may be overloaded. In that case, use another
# model or the following Hugging Face Endpoint, which also serves Qwen2.5 Coder:
# model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'
model = InferenceClientModel(
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
    max_tokens=2096,
    temperature=0.5,
    provider="auto",
    # token=os.environ["HF_TOKEN"],  # this env var is used by default
)

# # Import tool from Hub
# image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

# with open("prompts.yaml", 'r') as stream:
#     prompt_templates = yaml.safe_load(stream)

# Code agent wired to web search plus the page-content fetcher defined above.
agent = CodeAgent(
    model=model,
    tools=[DuckDuckGoSearchTool(), SiteContentFetcher()],  # add your tools here (don't remove final answer)
    max_steps=5,
    verbosity_level=1,
    # grammar=None,
    # planning_interval=None,
    # name=None,
    # description=None,
    # prompt_templates=prompt_templates
)

# Start the Gradio web UI for the agent.
GradioUI(agent).launch()