File size: 2,405 Bytes
030fe8e
 
9b5b26a
a509455
 
6aae614
9b5b26a
 
 
a509455
030fe8e
9b5b26a
030fe8e
 
 
9b5b26a
 
a509455
030fe8e
a509455
030fe8e
 
a509455
8c01ffb
030fe8e
a509455
8c01ffb
030fe8e
 
 
 
 
 
 
ae7a494
030fe8e
 
 
 
 
 
 
 
 
 
 
 
 
a509455
 
 
 
ae7a494
030fe8e
d8b50c3
030fe8e
 
 
 
13d500a
8c01ffb
 
8fe992b
a509455
 
 
 
976827d
8c01ffb
 
 
a509455
030fe8e
 
8fe992b
 
8c01ffb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from smolagents import CodeAgent, HfApiModel, tool
import os
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify
from tools.final_answer import FinalAnswerTool
from Gradio_UI import GradioUI

@tool
def webpage_scraper(url: str) -> str:
    """A tool that scrapes and summarizes webpage content from a given URL.

    Args:
        url: URL of the webpage to scrape and summarize

    Returns:
        str: A summary of the webpage content including title and main text,
        or an "Error scraping webpage: ..." message if fetching/parsing fails.
    """
    try:
        # Browser-like UA: some sites reject the default python-requests agent.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Get title. A <title> tag can exist with a None/empty .string,
        # which previously produced "Title: None" — guard both cases.
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        else:
            title = "No title"

        # Find main content: prefer semantic containers, fall back to <body>.
        main_content = (
            soup.find('main') or
            soup.find('article') or
            soup.find('div', class_='content') or
            soup.body
        )

        if main_content:
            # Remove non-content elements before conversion
            for tag in main_content.find_all(['script', 'style', 'nav', 'footer', 'aside', 'header']):
                tag.decompose()

            # Convert to markdown and clean up
            content = markdownify(str(main_content), heading_style="ATX")
            # Only truncate when the page is actually long. The original
            # applied rsplit('.') + '...' unconditionally, which dropped
            # everything after the last period even on short pages and
            # always appended a spurious ellipsis.
            if len(content) > 2000:
                # Cut at the last complete sentence within the first 2000 chars.
                content = content[:2000].rsplit('.', 1)[0] + '...'
        else:
            content = "Could not find main content"

        return f"Title: {title}\n\nContent Summary:\n{content}"
    except Exception as e:
        # Best-effort tool: surface the error as a string so the agent can
        # see and react to it rather than crashing the run.
        return f"Error scraping webpage: {str(e)}"

# Tool that lets the agent terminate a run by emitting its final answer.
final_answer = FinalAnswerTool()

# Keep the same model endpoint
model = HfApiModel(
    # model_id here is a dedicated HF Inference Endpoint URL, not a hub repo id.
    model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud',
    max_tokens=2096,  # NOTE(review): 2096 looks like a typo for 2048 or 4096 — confirm intended limit
    temperature=0.5,  # moderate sampling randomness
    custom_role_conversions=None,
)

# Code-executing agent wired with the web scraper plus the final_answer tool
# (smolagents requires a final-answer tool to end a run).
agent = CodeAgent(
    model=model,
    tools=[
        final_answer,
        webpage_scraper
    ],
    max_steps=5,  # cap on the think/act loop before the agent is forced to stop
    verbosity_level=1,
    grammar=None,  # no constrained-decoding grammar
    planning_interval=None,  # no periodic re-planning steps
    name="Web Scraping Agent",
    description="An agent capable of scraping and analyzing web content",
    prompt_templates=None  # Remove prompts.yaml dependency
)

# Module-level side effect: starts the Gradio web UI when this file runs.
GradioUI(agent).launch()