Spaces:

muhammadmaazuddin
/

smgp

No application file

File size: 3,912 Bytes

78e0552

# type: ignore
from agents import Agent, RunContextWrapper, function_tool
from model import get_model
import re
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify
from requests.exceptions import RequestException
from langchain_community.tools import DuckDuckGoSearchResults
from urllib.parse import urljoin

@function_tool
async def extract_og_image(url: str) -> str:
    """Extracts the Open Graph (OG) image from a given website URL."""
    # NOTE(review): the docstring above is left as-is because function_tool
    # presumably exposes it as the tool description -- confirm before rewording.
    #
    # Contract: returns the og:image content joined against `url`, the literal
    # "No OG image found" when the tag or its content attribute is missing, or
    # an "Error extracting OG image: ..." string when anything raises.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        document = BeautifulSoup(response.text, "html.parser")

        meta_tag = document.find("meta", property="og:image")
        image_ref = meta_tag.get("content") if meta_tag else None
        if not image_ref:
            return "No OG image found"

        # Relative image paths are resolved against the page URL.
        return urljoin(url, image_ref)
    except Exception as exc:
        # Failures are reported as text instead of being raised to the caller.
        return f"Error extracting OG image: {exc}"

@function_tool
def collect_theme_data(url: str) -> dict:
    """Collects raw theme-related data from a website."""
    # NOTE(review): the docstring above is left as-is because function_tool
    # presumably exposes it as the tool description -- confirm before rewording.
    #
    # Contract: returns a dict with three lists --
    #   "meta_theme":    content of every <meta name="theme-color"> tag
    #                    (None for a tag that has no content attribute),
    #   "inline_styles": the raw value of every style="..." attribute,
    #   "linked_css":    href of every <link rel="stylesheet">, made absolute.
    # On any failure the result is {"error": "<message>"} instead.
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        dom = BeautifulSoup(page.text, "html.parser")

        theme_colors = [
            tag.get("content")
            for tag in dom.find_all("meta", attrs={"name": "theme-color"})
        ]

        style_attrs = [node["style"] for node in dom.find_all(style=True)]

        # hrefs that already start with http(s) are kept verbatim; everything
        # else is resolved against the page URL. Empty/missing hrefs are skipped.
        raw_hrefs = [
            link.get("href") for link in dom.find_all("link", rel="stylesheet")
        ]
        stylesheets = [
            ref if ref.startswith(("http://", "https://")) else urljoin(url, ref)
            for ref in raw_hrefs
            if ref
        ]

        return {
            "meta_theme": theme_colors,
            "inline_styles": style_attrs,
            "linked_css": stylesheets,
        }
    except Exception as exc:
        return {"error": str(exc)}

@function_tool
async def extract_text_tool(url: str) -> str:
    """Visits a webpage and returns its content as markdown."""
    # NOTE(review): the docstring above is left as-is because function_tool
    # presumably exposes it as the tool description -- confirm before rewording.
    #
    # Contract: returns the page converted to markdown with runs of three or
    # more newlines collapsed to a single blank line. Failures are returned as
    # text: "Error fetching the webpage: ..." for request-level errors
    # (including timeouts) and "An unexpected error occurred: ..." otherwise.
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        # Use the same 10 second bound as the other fetch tools in this file.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        markdown_content = markdownify(response.text).strip()
        # Collapse excessive vertical whitespace left behind by the conversion.
        markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
        return markdown_content

    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"

@function_tool
async def Web_search_tool(query: str):
    """Performs a web search using DuckDuckGo."""
    # NOTE(review): the docstring above is left as-is because function_tool
    # presumably exposes it as the tool description -- confirm before rewording.
    #
    # Contract: returns whatever DuckDuckGoSearchResults(output_format="list")
    # yields for `query`; on any exception the error is printed and an
    # "An unexpected error occurred: ..." string is returned instead.
    try:
        print("Searching the web for:", query)
        engine = DuckDuckGoSearchResults(output_format="list")
        return engine.invoke(query)
    except Exception as exc:
        print(exc)
        return f"An unexpected error occurred: {exc!s}"

def webInspectorPrompt(context: RunContextWrapper, agent: Agent) -> str:
    """Return the fixed system prompt for the WebInspector agent.

    Both arguments are accepted but not used; the same text is returned on
    every call.
    """
    # Trailing double spaces on individual lines are part of the prompt text
    # and are therefore spelled out explicitly. The leading and trailing empty
    # entries produce the newline at the start and end of the prompt.
    prompt_lines = (
        "",
        "You are WebInspector Agent.  ",
        "Your role is to extract and analyze data from websites.  ",
        "",
        "You can use the available tools to load web pages and retrieve content such as:  ",
        "- Page text and headings  ",
        "- Colors (from inline styles, CSS, or computed values)  ",
        "- Links and metadata (title, description, keywords)  ",
        "- Layout or structural information (DOM hierarchy, tag types)",
        "",
        "Guidelines:",
        "- Always extract text in a clean and structured format.  ",
        "- If asked about styles (like colors, fonts), parse them from the HTML/CSS.  ",
        "- Provide clear summaries when possible, instead of raw HTML.  ",
        "- When following links, only fetch up to N pages to avoid overload.  ",
        "- If extraction fails, explain the reason.  ",
        "",
    )
    return "\n".join(prompt_lines)

# Module-level agent instance: uses webInspectorPrompt as its instructions
# callable and registers three of the tools defined above.
# NOTE(review): extract_og_image is defined in this file but is not registered
# here -- confirm whether that is intentional.
WebInspectorAgent = Agent(
    name="WebInspectorAgent",
    instructions=webInspectorPrompt,
    model=get_model("gemini-2.0-flash"),
    tools=[
        collect_theme_data,
        extract_text_tool,
        Web_search_tool,
    ],
)