# NOTE(review): removed Hugging Face file-viewer residue (page chrome, git blame
# hashes, line-number gutter) that was captured when this file was scraped —
# it was not Python and broke parsing.
from smolagents import CodeAgent, HfApiModel, tool
import os
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify
from tools.final_answer import FinalAnswerTool
from Gradio_UI import GradioUI
@tool
def webpage_scraper(url: str) -> str:
    """A tool that scrapes and summarizes webpage content from a given URL.

    Args:
        url: URL of the webpage to scrape and summarize

    Returns:
        str: A summary of the webpage content including title and main text,
        or an error message string if the request/parsing fails.
    """
    try:
        # Browser-like UA: many sites reject requests with the default
        # python-requests user agent.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # soup.title.string is None when the <title> tag is empty or contains
        # nested markup, so guard both the tag and its string.
        title = soup.title.string if soup.title and soup.title.string else "No title"
        # Prefer semantic containers; fall back to <body> as a last resort.
        main_content = (
            soup.find('main') or
            soup.find('article') or
            soup.find('div', class_='content') or
            soup.body
        )
        if main_content:
            # Strip boilerplate/noise elements before conversion.
            for tag in main_content.find_all(['script', 'style', 'nav', 'footer', 'aside', 'header']):
                tag.decompose()
            # Convert the remaining HTML to markdown for readability.
            content = markdownify(str(main_content), heading_style="ATX")
            # Only truncate pages longer than 2000 chars; cut back to the last
            # complete sentence when a period is present, otherwise hard-cut.
            if len(content) > 2000:
                head = content[:2000]
                sentence, sep, _ = head.rpartition('.')
                content = (sentence + '.' if sep else head) + '...'
        else:
            content = "Could not find main content"
        return f"Title: {title}\n\nContent Summary:\n{content}"
    except Exception as e:
        # Return the error as a string so the agent can observe the failure
        # instead of crashing the run.
        return f"Error scraping webpage: {str(e)}"
# Tool the agent uses to deliver its final response.
final_answer = FinalAnswerTool()
# Keep the same model endpoint
model = HfApiModel(
    model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud',
    max_tokens=2096,
    temperature=0.5,
    custom_role_conversions=None,
)
# Code-writing agent wired with the scraper tool; capped at 5 reasoning steps.
agent = CodeAgent(
    model=model,
    tools=[
        final_answer,
        webpage_scraper
    ],
    max_steps=5,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="Web Scraping Agent",
    description="An agent capable of scraping and analyzing web content",
    prompt_templates=None  # Remove prompts.yaml dependency
)
# Launch the Gradio web UI (blocking call).
# NOTE(review): removed a stray trailing "|" left over from the file-viewer
# gutter — it was a syntax error.
GradioUI(agent).launch()