Olaemad committed on
Commit e94d3a9 · 1 Parent(s): 237ef97

Add Searcher, Scraper, and Analyzer functionality with custom domain and site search tools

src/analyzer/openai_analyzer.py ADDED
@@ -0,0 +1,46 @@
+
+ from typing import List
+ from openai import OpenAI
+
+ from src.core.types import DEFAULT_OPENAI_ANALYZER, DEFAULT_SYSTEM_PROMPT, DEFAULT_USER_PROMPT
+ from src.models.analyzer_models import AnalyzerResult
+ from src.models.scrape_models import ScrapeResult
+ from src.core.interface.analyzer_interface import AnalyzerInterface
+
+
+ class OpenaiAnalyzer(AnalyzerInterface):
+     def __init__(self, api_key, model_name=DEFAULT_OPENAI_ANALYZER):
+         self.client = OpenAI(api_key=api_key)
+         self.model_name = model_name
+
+     def analyze_search_result(self, query: str, search_results: List[ScrapeResult]) -> AnalyzerResult:
+         """
+         Analyzes the provided search results based on the given query.
+
+         Args:
+             query (str): The search query string.
+             search_results (List[ScrapeResult]): A list of scraped results to be analyzed.
+
+         Returns:
+             AnalyzerResult: The parsed result of the analysis.
+
+         Raises:
+             Exception: If the OpenAI request or response parsing fails.
+         """
+         try:
+             # Substitute the query and the scraped results into the prompt template.
+             user_prompt = DEFAULT_USER_PROMPT.replace("query", query).replace(
+                 "scrape_results", f"{search_results}"
+             )
+             completion = self.client.beta.chat.completions.parse(
+                 model=self.model_name,
+                 messages=[
+                     {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
+                     {"role": "user", "content": user_prompt},
+                 ],
+                 # Parse the model output directly into the AnalyzerResult schema.
+                 response_format=AnalyzerResult,
+             )
+             return completion.choices[0].message.parsed
+         except Exception as e:
+             raise Exception(f"Error while analyzing search result: {str(e)}") from e
src/models/__init__.py ADDED
File without changes
src/models/analyzer_models.py ADDED
@@ -0,0 +1,16 @@
+ from typing import List, Optional
+ from pydantic import BaseModel, Field
+
+
+ class Citation(BaseModel):
+     citation_type: str = Field(description="Citation type.")
+     url: str = Field(description="Citation URL.")
+     start_index: int = Field(description="Citation start index in response.")
+     end_index: int = Field(description="Citation end index in response.")
+
+
+ class AnalyzerResult(BaseModel):
+     response_str: str = Field(description="Final response string.")
+     citation: Optional[List[Citation]] = Field(
+         default=[], description="Citations supporting the final response."
+     )
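To illustrate how these models fit together, a small sketch; treating start_index/end_index as character offsets into response_str is an assumption, since the commit does not define them:

from src.models.analyzer_models import AnalyzerResult, Citation

result = AnalyzerResult(
    response_str="Generative AI adoption grew in 2024.",
    citation=[
        Citation(
            citation_type="url_citation",  # hypothetical type label
            url="https://example.com/report",
            start_index=0,
            end_index=36,  # assumed: character offsets into response_str
        )
    ],
)
print(result.model_dump_json(indent=2))  # assumes pydantic v2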
src/models/scrape_models.py ADDED
@@ -0,0 +1,15 @@
+ from typing import Optional
+ from pydantic import BaseModel, Field
+
+ from src.models.search_models import SearchItemResult
+
+
+ class ScrapeQuery(SearchItemResult):
+     pass
+
+
+ class ScrapeResult(BaseModel):
+     url: str = Field(description="URL.")
+     content: Optional[str] = Field("", description="URL content.")
+     title: str = Field(description="Title of result item.")
+     description: str = Field(description="Description of result item.")
src/models/search_models.py ADDED
@@ -0,0 +1,14 @@
+ from typing import List, Optional
+ from pydantic import BaseModel, Field
+
+
+ class SearchItemResult(BaseModel):
+     url: str = Field(description="URL of result item.")
+     title: str = Field(description="Title of result item.")
+     description: str = Field(description="Description of result item.")
+
+
+ class SearchResult(BaseModel):
+     items: Optional[List[SearchItemResult]] = Field(
+         default=[], description="Search result items."
+     )
src/scraper/__init__.py ADDED
File without changes
src/scraper/trafilatura_scraper.py ADDED
@@ -0,0 +1,33 @@
+ from src.core.interface.scraper_interface import ScraperInterface
+ from src.models.scrape_models import ScrapeQuery, ScrapeResult
+
+ import trafilatura
+
+
+ class TrafilaturaScraper(ScraperInterface):
+     def __init__(self):
+         pass
+
+     def get_url_content(self, url_parameters: ScrapeQuery) -> ScrapeResult:
+         """
+         Fetches and extracts the main textual content from the specified URL using trafilatura.
+
+         Args:
+             url_parameters (ScrapeQuery): The URL parameters of the web page to scrape.
+
+         Returns:
+             ScrapeResult: An object containing the extracted content from the URL.
+
+         Raises:
+             Exception: If an error occurs during fetching or extraction.
+         """
+         try:
+             downloaded = trafilatura.fetch_url(url_parameters.url)
+             # trafilatura.extract returns None when no main content can be found.
+             result = trafilatura.extract(downloaded)
+
+             return ScrapeResult(
+                 content=result,
+                 url=url_parameters.url,
+                 title=url_parameters.title,
+                 description=url_parameters.description,
+             )
+
+         except Exception as e:
+             raise Exception(f"Error occurred while getting url content: {str(e)}") from e
src/searcher/__init__.py ADDED
File without changes
src/searcher/open_google_search.py ADDED
@@ -0,0 +1,93 @@
+ from typing import List, Optional
+ from googlesearch import search
+
+ from src.core.interface.searcher_interface import SearchInterface
+ from src.models.search_models import SearchItemResult, SearchResult
+
+
+ class GoogleSearch(SearchInterface):
+     def __init__(self):
+         pass
+
+     def search_custom_sites(
+         self, query: str, sites: Optional[list] = None
+     ) -> SearchResult:
+         """
+         Performs a Google search restricted to a dynamic list of specific sites.
+
+         Args:
+             query (str): The user's search query (e.g., "generative AI").
+             sites (list): A list of websites to search within (e.g., ['wired.com', 'theverge.com']).
+
+         Returns:
+             SearchResult: The search results, or an empty result if nothing was found.
+
+         Raises:
+             Exception: If the underlying Google search fails.
+         """
+         try:
+             # 1. Construct the dynamic query string:
+             # join the sites with " OR ", each prefixed with the "site:" operator.
+             site_restriction = (
+                 " OR ".join([f"site:{site}" for site in sites]) if sites else ""
+             )
+             full_query = f"{query} {site_restriction}"
+
+             # 2. Execute the search
+             result = search(full_query, num_results=5, advanced=True)
+
+             # 3. Collect the returned results
+             items = [
+                 SearchItemResult(
+                     url=item.url, title=item.title, description=item.description
+                 )
+                 for item in result
+             ]
+
+             urls = [item.url for item in items if item.url]
+             if not urls:
+                 return SearchResult(items=[])
+
+             return SearchResult(items=items)
+
+         except Exception as e:
+             raise Exception(f"An error occurred while searching in Google: {str(e)}") from e
+
+     def search_custom_domains(
+         self, query: str, domains: Optional[List[str]] = None
+     ) -> SearchResult:
+         """
+         Performs a Google search restricted to a list of domains.
+
+         Args:
+             query (str): The user's search query (e.g., "generative AI").
+             domains (List[str]): A list of domains to search within (e.g., ['.edu']).
+
+         Returns:
+             SearchResult: The search results, or an empty result if nothing was found.
+
+         Raises:
+             Exception: If the underlying Google search fails.
+         """
+         try:
+             # 1. Construct the dynamic query string.
+             # Note: the "site:" operator must not be followed by a space.
+             domain_restriction = (
+                 " OR ".join([f"site:{domain}" for domain in domains])
+                 if domains
+                 else ""
+             )
+             full_query = f"{query} {domain_restriction}"
+
+             # 2. Execute the search
+             result = search(full_query, num_results=5, advanced=True)
+
+             # 3. Collect the returned results
+             items = [
+                 SearchItemResult(
+                     url=item.url, title=item.title, description=item.description
+                 )
+                 for item in result
+             ]
+
+             urls = [item.url for item in items if item.url]
+             if not urls:
+                 return SearchResult(items=[])
+
+             return SearchResult(items=items)
+
+         except Exception as e:
+             raise Exception(f"An error occurred while searching in Google: {str(e)}") from e
src/tools/__init__.py ADDED
File without changes
src/tools/custom_domains_search_tool.py ADDED
@@ -0,0 +1,52 @@
+ from typing import List, Optional
+
+ from src.core.factory.analyzer_facrory import AnalyzerFactory
+ from src.core.factory.scraper_factory import ScraperFactory
+ from src.core.factory.searcher_factory import SearcherFactory
+ from src.core.types import AnalyzerType, ScraperType, SearcherType
+ from src.models.analyzer_models import AnalyzerResult
+
+ searcher = SearcherFactory.initialize_searcher(
+     searcher_type=SearcherType.OPEN_GOOGLE_SEARCH
+ )
+ scraper = ScraperFactory.initialize_scraper(ScraperType.TRAFILATURA_SCRAPER)
+ analyzer = AnalyzerFactory.initialize_analyzer(AnalyzerType.OPENAI_ANALYZER)
+
+
+ def search_custom_domain(query: str, domains: Optional[List[str]] = None) -> AnalyzerResult:
+     """
+     Performs a custom domain search for the given query and domains, scrapes the resulting URLs, and analyzes the search results.
+
+     Args:
+         query (str): The search query string. Must not be empty.
+         domains (List[str], optional): A list of domain strings to restrict the search (e.g., ['edu', 'gov']). Must not be empty.
+
+     Returns:
+         AnalyzerResult: The analyzed result of the search, as returned by the analyzer.
+
+     Raises:
+         ValueError: If the query or domains are empty.
+         Exception: Propagates any exceptions raised during the search, scraping, or analysis process.
+     """
+     try:
+         # 1. Validate search parameters
+         if not query or query.strip() == "":
+             raise ValueError("Query can't be empty.")
+
+         if not domains:
+             raise ValueError("Domains can't be empty (e.g. 'edu', 'gov').")
+
+         # 2. Run initial search
+         searcher_result = searcher.search_custom_domains(query=query, domains=domains)
+
+         # 3. Scrape each search result
+         scrape_result = []
+         for item in searcher_result.items:
+             url_scrape_result = scraper.get_url_content(url_parameters=item)
+             scrape_result.append(url_scrape_result)
+
+         # 4. Analyze the scraped results
+         final_result = analyzer.analyze_search_result(
+             query=query, search_results=scrape_result
+         )
+
+         return final_result
+     except Exception:
+         raise
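A usage sketch; the query and domains are illustrative, and the factories above are assumed to pick up any credentials the OpenAI analyzer needs:

from src.tools.custom_domains_search_tool import search_custom_domain

result = search_custom_domain(query="generative AI research", domains=["edu", "gov"])
print(result.response_str)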
src/tools/custom_sites_search_tool.py ADDED
@@ -0,0 +1,53 @@
+ from typing import Optional
+
+ from src.core.factory.analyzer_facrory import AnalyzerFactory
+ from src.core.factory.scraper_factory import ScraperFactory
+ from src.core.factory.searcher_factory import SearcherFactory
+ from src.core.types import AnalyzerType, ScraperType, SearcherType
+ from src.models.analyzer_models import AnalyzerResult
+ from src.utils.url_validator import validate_urls
+
+ searcher = SearcherFactory.initialize_searcher(
+     searcher_type=SearcherType.OPEN_GOOGLE_SEARCH
+ )
+ scraper = ScraperFactory.initialize_scraper(ScraperType.TRAFILATURA_SCRAPER)
+ analyzer = AnalyzerFactory.initialize_analyzer(AnalyzerType.OPENAI_ANALYZER)
+
+
+ def search_custom_sites(query: str, sites: Optional[list] = None) -> AnalyzerResult:
+     """
+     Performs a custom site search, scrapes the resulting URLs, and analyzes the content.
+
+     Args:
+         query (str): The search query string. Must not be empty.
+         sites (Optional[list], optional): A list of site URLs to restrict the search to. Defaults to None.
+
+     Returns:
+         AnalyzerResult: The analyzed result of the scraped search results.
+
+     Raises:
+         ValueError: If the query is empty or any site URL is invalid.
+         Exception: Propagates any exception raised during validation, searching, scraping, or analysis.
+     """
+     try:
+         # 1. Validate search parameters
+         if not query or query.strip() == "":
+             raise ValueError("Query can't be empty.")
+
+         # Only validate when sites were provided; None means an unrestricted search.
+         if sites:
+             validate_urls(urls=sites)
+
+         # 2. Run initial search
+         searcher_result = searcher.search_custom_sites(query=query, sites=sites)
+
+         # 3. Scrape each search result
+         scrape_result = []
+         for item in searcher_result.items:
+             url_scrape_result = scraper.get_url_content(url_parameters=item)
+             scrape_result.append(url_scrape_result)
+
+         # 4. Analyze the scraped results
+         final_result = analyzer.analyze_search_result(
+             query=query, search_results=scrape_result
+         )
+
+         return final_result
+     except Exception:
+         raise
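A usage sketch; note that validate_urls relies on validators.url, which expects full URLs with a scheme, so bare hostnames such as 'wired.com' would fail validation here:

from src.tools.custom_sites_search_tool import search_custom_sites

# Full URLs, since validators.url rejects bare hostnames.
result = search_custom_sites(
    query="generative AI",
    sites=["https://www.wired.com", "https://www.theverge.com"],
)
print(result.response_str)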
src/tools/search_on_web_tool.py ADDED
@@ -0,0 +1,46 @@
+ from src.core.factory.analyzer_facrory import AnalyzerFactory
+ from src.core.factory.scraper_factory import ScraperFactory
+ from src.core.factory.searcher_factory import SearcherFactory
+ from src.core.types import AnalyzerType, ScraperType, SearcherType
+ from src.models.analyzer_models import AnalyzerResult
+
+ searcher = SearcherFactory.initialize_searcher(
+     searcher_type=SearcherType.OPEN_GOOGLE_SEARCH
+ )
+ scraper = ScraperFactory.initialize_scraper(ScraperType.TRAFILATURA_SCRAPER)
+ analyzer = AnalyzerFactory.initialize_analyzer(AnalyzerType.OPENAI_ANALYZER)
+
+
+ def search_on_web(query: str) -> AnalyzerResult:
+     """
+     Performs a general web search, scrapes the resulting URLs, and analyzes the search results.
+
+     Args:
+         query (str): The search query string. Must not be empty.
+
+     Returns:
+         AnalyzerResult: The analyzed result of the search, as returned by the analyzer.
+
+     Raises:
+         ValueError: If the query is empty.
+         Exception: Propagates any exceptions raised during the search, scraping, or analysis process.
+     """
+     try:
+         # 1. Validate search parameters
+         if not query or query.strip() == "":
+             raise ValueError("Query can't be empty.")
+
+         # 2. Run initial search; with no domains given, this is an unrestricted search.
+         searcher_result = searcher.search_custom_domains(query=query)
+
+         # 3. Scrape each search result
+         scrape_result = []
+         for item in searcher_result.items:
+             url_scrape_result = scraper.get_url_content(url_parameters=item)
+             scrape_result.append(url_scrape_result)
+
+         # 4. Analyze the scraped results
+         final_result = analyzer.analyze_search_result(
+             query=query, search_results=scrape_result
+         )
+
+         return final_result
+     except Exception:
+         raise
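And the unrestricted variant, with an illustrative query:

from src.tools.search_on_web_tool import search_on_web

result = search_on_web(query="latest developments in generative AI")
print(result.response_str)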
src/utils/url_validator.py ADDED
@@ -0,0 +1,13 @@
+ from typing import List
+ import validators
+
+
+ def validate_url(url: str):
+     if not validators.url(url):
+         raise ValueError(f"Provided url: {url} is not valid.")
+
+
+ def validate_urls(urls: List[str]):
+     for url in urls:
+         validate_url(url)