Spaces:
No application file
No application file
| # type: ignore | |
| from agents import Agent, RunContextWrapper, function_tool | |
| from model import get_model | |
| import re | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from markdownify import markdownify | |
| from requests.exceptions import RequestException | |
| from langchain_community.tools import DuckDuckGoSearchResults | |
| from urllib.parse import urljoin | |
async def extract_og_image(url: str) -> str:
    """Return the absolute URL of a page's Open Graph (``og:image``) image.

    Args:
        url: Address of the page to inspect.

    Returns:
        The ``og:image`` value resolved against ``url``; the string
        ``"No OG image found"`` when the page has no usable tag; or a message
        starting with ``"Error extracting OG image:"`` when fetching or
        parsing fails. Failures are reported in the return value, not raised.
    """
    import asyncio  # local import: the file header does not import asyncio

    try:
        # requests is a blocking client; run it on a worker thread so this
        # coroutine does not stall the event loop while waiting on the network.
        resp = await asyncio.to_thread(requests.get, url, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        og_tag = soup.find("meta", property="og:image")
        # Treat a missing, empty, or whitespace-only content attribute alike.
        content = (og_tag.get("content") or "").strip() if og_tag else ""
        if content:
            # og:image may be relative; resolve it against the page URL.
            return urljoin(url, content)
        return "No OG image found"
    except Exception as e:
        # Tool-style boundary: hand the failure back as text for the caller.
        return f"Error extracting OG image: {e}"
def collect_theme_data(url: str) -> dict:
    """Collect raw theme-related data from a web page.

    Args:
        url: Address of the page to inspect.

    Returns:
        On success, a dict with three lists:
        ``"meta_theme"`` (``content`` values of ``<meta name="theme-color">``
        tags), ``"inline_styles"`` (raw ``style`` attribute values) and
        ``"linked_css"`` (stylesheet URLs resolved against ``url``).
        On any failure, ``{"error": <message>}``; errors are not raised.
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # Skip tags without a content attribute so the list never holds None.
        meta_theme = [
            meta["content"]
            for meta in soup.find_all("meta", attrs={"name": "theme-color"})
            if meta.get("content")
        ]

        inline_styles = [tag["style"] for tag in soup.find_all(style=True)]

        # urljoin passes absolute hrefs through and resolves relative ones,
        # so no separate scheme check is needed.
        linked_css = [
            urljoin(url, link["href"])
            for link in soup.find_all("link", rel="stylesheet")
            if link.get("href")
        ]

        return {
            "meta_theme": meta_theme,
            "inline_styles": inline_styles,
            "linked_css": linked_css,
        }
    except Exception as e:
        # Tool-style boundary: report the failure as data for the caller.
        return {"error": str(e)}
async def extract_text_tool(url: str) -> str:
    """Fetch a web page and return its content converted to markdown.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page converted with ``markdownify``, stripped, with runs of three
        or more newlines collapsed to two. On failure, a message starting
        with ``"Error fetching the webpage:"`` (request errors, including
        timeouts and HTTP error statuses) or ``"An unexpected error
        occurred:"`` (anything else).
    """
    import asyncio  # local import: the file header does not import asyncio

    try:
        # Without a timeout requests can wait forever on a stalled server;
        # 10 s matches the other fetch helpers in this file. The call runs on
        # a worker thread so the event loop is not blocked meanwhile.
        response = await asyncio.to_thread(requests.get, url, timeout=10)
        response.raise_for_status()
        markdown_content = markdownify(response.text).strip()
        # Collapse excess blank lines left behind by the HTML conversion.
        return re.sub(r"\n{3,}", "\n\n", markdown_content)
    except RequestException as e:
        return f"Error fetching the webpage: {e}"
    except Exception as e:
        return f"An unexpected error occurred: {e}"
async def Web_search_tool(query: str):
    """Run a DuckDuckGo web search for ``query``.

    Args:
        query: Search terms to look up.

    Returns:
        Whatever ``DuckDuckGoSearchResults(output_format="list").invoke``
        returns for the query, or a message starting with
        ``"An unexpected error occurred:"`` if the search fails.
    """
    import asyncio  # local import: the file header does not import asyncio

    try:
        print("Searching the web for:", query)
        search = DuckDuckGoSearchResults(output_format="list")
        # invoke() is synchronous; run it on a worker thread so the event
        # loop stays responsive while the search is in flight.
        return await asyncio.to_thread(search.invoke, query)
    except Exception as e:
        print(e)
        return f"An unexpected error occurred: {e}"
def webInspectorPrompt(context: RunContextWrapper, agent: Agent) -> str:
    """Return the static system prompt for the WebInspector agent.

    Neither ``context`` nor ``agent`` is read by the body; the function is
    passed as the agent's ``instructions`` callable, so the two-argument
    signature is presumably what the agents SDK calls it with.
    """
    # NOTE(review): the "up to N pages" guideline never gives N a value, so
    # the model sees the literal letter "N" - confirm the intended limit.
    prompt = """
You are WebInspector Agent.
Your role is to extract and analyze data from websites.
You can use the available tools to load web pages and retrieve content such as:
- Page text and headings
- Colors (from inline styles, CSS, or computed values)
- Links and metadata (title, description, keywords)
- Layout or structural information (DOM hierarchy, tag types)
Guidelines:
- Always extract text in a clean and structured format.
- If asked about styles (like colors, fonts), parse them from the HTML/CSS.
- Provide clear summaries when possible, instead of raw HTML.
- When following links, only fetch up to N pages to avoid overload.
- If extraction fails, explain the reason.
"""
    return prompt
# Agent that inspects websites using the scraping and search helpers above.
# The agents SDK expects tool objects in ``tools``, not bare callables, so
# each helper is wrapped with function_tool here; the helpers themselves stay
# plain functions that can still be called directly.
# NOTE(review): extract_og_image is defined in this file but not registered
# as a tool - confirm whether that omission is intentional.
WebInspectorAgent = Agent(
    name="WebInspectorAgent",
    instructions=webInspectorPrompt,
    model=get_model('gemini-2.0-flash'),
    tools=[
        function_tool(collect_theme_data),
        function_tool(extract_text_tool),
        function_tool(Web_search_tool),
    ],
)