File size: 3,422 Bytes
9c8d535
c19d193
e96076b
9c8d535
755b8aa
 
e96076b
9e73ba8
 
5231f49
e96076b
8fe992b
9b5b26a
 
1b2d92d
9e73ba8
be1e877
9e73ba8
a1422d2
9e73ba8
 
 
 
 
 
 
 
 
755b8aa
 
 
9e73ba8
 
 
 
 
 
 
 
755b8aa
 
9e73ba8
 
 
 
 
 
 
 
 
 
 
755b8aa
 
 
 
 
 
 
 
 
 
 
 
 
9e73ba8
 
 
 
 
 
 
 
 
9b5b26a
e96076b
5b496d7
56016da
5b496d7
2728c1b
9d94df1
2728c1b
 
 
 
 
 
dd0c5f5
 
8c01ffb
dd0c5f5
 
2728c1b
e96076b
8c01ffb
8fe992b
0c8769a
dd0c5f5
8c01ffb
dd0c5f5
 
 
 
 
8fe992b
 
8c01ffb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import yaml
import requests

from bs4 import BeautifulSoup

from smolagents import (
    load_tool, tool, Tool,
    DuckDuckGoSearchTool, FinalAnswerTool,
    CodeAgent, InferenceClientModel,
)

from Gradio_UI import GradioUI


class SiteContentFetcher(Tool):
    """Tool that downloads a web page and returns its visible text.

    The page is requested over HTTP(S), parsed with BeautifulSoup, stripped of
    ``script``/``style``/``noscript`` elements, and reduced to its non-empty
    text lines. Failures are reported as human-readable strings instead of
    being raised, so the calling agent always receives a string.
    """

    name = "site_content_fetcher"
    description = (
        "This tool fetches and cleans readable text from the specified URL. Normally used after some web_search_tool."
    )
    inputs = {
        "url": {
            "type": "string",
            "description": "The full URL of the website to fetch content from, including the protocol (http or https).",
        }
    }
    output_type = "string"

    def __init__(self):
        # Run the base-class initializer so the state it sets up exists
        # before the tool is ever called; the original override skipped it.
        super().__init__()
        self.MAX_CHARS = 100_000  # Upper bound on the size of returned content

    def forward(self, url: str) -> str:
        """Fetch ``url`` and return its cleaned text, or an error message.

        Args:
            url: Full URL of the page, including the ``http``/``https`` scheme.

        Returns:
            The page's cleaned text truncated to ``self.MAX_CHARS`` characters,
            or a short description of what went wrong. No exception escapes
            this method.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; SiteContentFetcher/1.0)"
        }

        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            cleaned_text = self._clean_html(response.text)
            return cleaned_text[:self.MAX_CHARS]
        except (
            requests.exceptions.MissingSchema,
            requests.exceptions.InvalidSchema,
            requests.exceptions.InvalidURL,
        ):
            # All three mean the URL itself is unusable (no scheme, an
            # unsupported scheme, or a malformed address).
            return "Invalid URL format. Make sure it starts with http:// or https://"
        except requests.exceptions.Timeout:
            return "The request timed out. The site may be too slow or unresponsive."
        except requests.exceptions.ConnectionError:
            return f"Failed to connect to {url}. Please check if the site is reachable."
        except requests.exceptions.HTTPError as e:
            return f"HTTP error occurred: {e.response.status_code} {e.response.reason}"
        except Exception as e:
            # Deliberate catch-all: the agent expects a string result, never a raise.
            return f"An unexpected error occurred: {str(e)}"

    def _clean_html(self, html: str) -> str:
        """Return the non-empty, whitespace-stripped text lines of ``html``.

        Args:
            html: Raw HTML markup.

        Returns:
            The text content joined with newlines, with ``script``, ``style``
            and ``noscript`` elements removed and blank lines dropped.
        """
        soup = BeautifulSoup(html, "html.parser")

        # Remove script, style, and noscript tags
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        # Extract and clean text
        text = soup.get_text(separator="\n")
        lines = [line.strip() for line in text.splitlines()]
        cleaned_lines = [line for line in lines if line]
        return "\n".join(cleaned_lines)


# @tool
# def my_custom_tool(arg1: str) -> str:
#     """ Description
#     Args:
#         arg1: the first argument
#     """
#     pass


# Model init
# If the agent does not answer, the model may be overloaded; in that case use another model or the following Hugging Face Endpoint, which also serves Qwen2.5 Coder:
# model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'

# Language model that drives the agent.
model = InferenceClientModel(
    provider="auto",
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
    temperature=0.5,
    max_tokens=2096,
    # token=os.environ["HF_TOKEN"],  # this env var is used by default
)

# # Import tool from Hub
# image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

# with open("prompts.yaml", 'r') as stream:
#     prompt_templates = yaml.safe_load(stream)


# Tools made available to the agent; add your own tools to this list.
# NOTE(review): the original note says not to remove the final-answer tool,
# yet FinalAnswerTool is not listed here — confirm CodeAgent registers it itself.
agent_tools = [DuckDuckGoSearchTool(), SiteContentFetcher()]

agent = CodeAgent(
    tools=agent_tools,
    model=model,
    verbosity_level=1,
    max_steps=5,
    # Optional settings, left disabled:
    # grammar=None,
    # planning_interval=None,
    # name=None,
    # description=None,
    # prompt_templates=prompt_templates
)

# Wrap the agent in the Gradio front end and start serving it.
GradioUI(agent).launch()