Spaces:

muhammadmaazuddin
/

smgp

No application file

App Files Files Community

smgp / src /agent_dir /web_inspector_agent.py

muhammadmaazuddin

working on agent

78e0552 4 months ago

raw

history blame contribute delete

3.91 kB

	# type: ignore
	import asyncio
	import re
	from urllib.parse import urljoin

	import requests
	from agents import Agent, RunContextWrapper, function_tool
	from bs4 import BeautifulSoup
	from langchain_community.tools import DuckDuckGoSearchResults
	from markdownify import markdownify
	from requests.exceptions import RequestException

	from model import get_model

	@function_tool
	async def extract_og_image(url: str) -> str:
	    """Extracts the Open Graph (OG) image from a given website URL.

	    Args:
	        url: Address of the page to inspect.

	    Returns:
	        The absolute URL of the page's ``og:image``; the string
	        ``"No OG image found"`` when the tag is absent or its ``content``
	        is empty; or an ``"Error extracting OG image: ..."`` message when
	        fetching or parsing fails. Failures are reported as text, never
	        raised.
	    """
	    try:
	        # requests is a blocking client; run it in a worker thread so this
	        # coroutine does not stall the event loop while waiting on the network.
	        resp = await asyncio.to_thread(requests.get, url, timeout=10)
	        resp.raise_for_status()
	        soup = BeautifulSoup(resp.text, "html.parser")

	        og_image = soup.find("meta", property="og:image")
	        if og_image is None:
	            return "No OG image found"

	        # Treat a missing or whitespace-only content attribute as "not found"
	        # instead of resolving it to the page URL itself.
	        content = (og_image.get("content") or "").strip()
	        if not content:
	            return "No OG image found"

	        # The tag may hold a relative path; resolve it against the page URL.
	        return urljoin(url, content)
	    except Exception as e:
	        # Tool boundary: hand the failure back to the caller as text.
	        return f"Error extracting OG image: {e}"

	@function_tool
	def collect_theme_data(url: str) -> dict:
	    """Collects raw theme-related data from a website.

	    Args:
	        url: Address of the page to inspect.

	    Returns:
	        A dict with three lists: ``meta_theme`` (non-empty ``content`` values
	        of ``<meta name="theme-color">`` tags), ``inline_styles`` (the value
	        of every ``style`` attribute in the document) and ``linked_css``
	        (absolute URLs of ``<link rel="stylesheet">`` tags). On any failure
	        the result is ``{"error": <message>}`` instead.
	    """
	    try:
	        resp = requests.get(url, timeout=10)
	        resp.raise_for_status()
	        soup = BeautifulSoup(resp.text, "html.parser")

	        # Skip theme-color tags without a usable content attribute so the
	        # list never contains None or empty entries.
	        meta_theme = [
	            meta["content"]
	            for meta in soup.find_all("meta", attrs={"name": "theme-color"})
	            if meta.get("content")
	        ]

	        inline_styles = [tag["style"] for tag in soup.find_all(style=True)]

	        linked_css = []
	        for link in soup.find_all("link", rel="stylesheet"):
	            href = link.get("href")
	            if not href:
	                continue
	            # Relative and protocol-relative hrefs are resolved against the
	            # page URL; absolute http(s) hrefs are kept verbatim.
	            if not href.startswith(("http://", "https://")):
	                href = urljoin(url, href)
	            linked_css.append(href)

	        return {
	            "meta_theme": meta_theme,
	            "inline_styles": inline_styles,
	            "linked_css": linked_css,
	        }
	    except Exception as e:
	        # Tool boundary: report the failure in the payload rather than raising.
	        return {"error": str(e)}

	@function_tool
	async def extract_text_tool(url: str) -> str:
	    """Visits a webpage and returns its content as markdown.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The page converted to markdown with runs of three or more newlines
	        collapsed to a single blank line. If the request fails the result is
	        an ``"Error fetching the webpage: ..."`` message; any other failure
	        yields ``"An unexpected error occurred: ..."``.
	    """
	    try:
	        # A timeout keeps an unresponsive host from hanging the tool forever,
	        # and the worker thread keeps the blocking call off the event loop.
	        response = await asyncio.to_thread(requests.get, url, timeout=10)
	        response.raise_for_status()

	        markdown_content = markdownify(response.text).strip()
	        # Collapse excess vertical whitespace left behind by the conversion.
	        return re.sub(r"\n{3,}", "\n\n", markdown_content)
	    except RequestException as e:
	        return f"Error fetching the webpage: {e}"
	    except Exception as e:
	        return f"An unexpected error occurred: {e}"

	@function_tool
	async def Web_search_tool(query: str):
	    """Performs a web search using DuckDuckGo.

	    Args:
	        query: Search terms to submit.

	    Returns:
	        The value produced by invoking ``DuckDuckGoSearchResults`` configured
	        with ``output_format="list"``, or an
	        ``"An unexpected error occurred: ..."`` message if the search fails.
	    """
	    try:
	        print("Searching the web for:", query)
	        search = DuckDuckGoSearchResults(output_format="list")
	        # invoke() is synchronous; run it in a worker thread so the event
	        # loop stays responsive while the search is in flight.
	        return await asyncio.to_thread(search.invoke, query)
	    except Exception as e:
	        print(e)
	        return f"An unexpected error occurred: {e}"

	def webInspectorPrompt(context: RunContextWrapper, agent: Agent) -> str:
	    """Return the instruction text for the WebInspector agent.

	    Neither ``context`` nor ``agent`` is read; the same fixed string is
	    returned on every call.
	    """
	    instructions = """
	You are WebInspector Agent.
	Your role is to extract and analyze data from websites.

	You can use the available tools to load web pages and retrieve content such as:
	- Page text and headings
	- Colors (from inline styles, CSS, or computed values)
	- Links and metadata (title, description, keywords)
	- Layout or structural information (DOM hierarchy, tag types)

	Guidelines:
	- Always extract text in a clean and structured format.
	- If asked about styles (like colors, fonts), parse them from the HTML/CSS.
	- Provide clear summaries when possible, instead of raw HTML.
	- When following links, only fetch up to N pages to avoid overload.
	- If extraction fails, explain the reason.
	"""
	    return instructions

	# Module-level agent that combines the prompt callable, the model returned by
	# get_model, and the scraping/search tools defined in this module.
	# NOTE(review): extract_og_image is defined in this module but is not listed
	# in tools below - confirm whether that omission is intentional.
	WebInspectorAgent = Agent(
	    name="WebInspectorAgent",
	    model=get_model("gemini-2.0-flash"),
	    instructions=webInspectorPrompt,
	    tools=[
	        collect_theme_data,
	        extract_text_tool,
	        Web_search_tool,
	    ],
	)