Olaemad committed on
Commit e94d3a9 · 1 Parent(s): 237ef97

Add Searcher, Scraper, and Analyzer functionality with custom domain and site search tools

src/analyzer/openai_analyzer.py ADDED
@@ -0,0 +1,46 @@
+
+ from typing import List
+ from openai import OpenAI
+
+ from src.core.types import DEFAULT_OPENAI_ANALYZER, DEFAULT_SYSTEM_PROMPT, DEFAULT_USER_PROMPT
+ from src.models.analyzer_models import AnalyzerResult
+ from src.models.scrape_models import ScrapeResult
+ from src.core.interface.analyzer_interface import AnalyzerInterface
+
+
+ class OpenaiAnalyzer(AnalyzerInterface):
+     def __init__(self, api_key, model_name=DEFAULT_OPENAI_ANALYZER):
+         self.client = OpenAI(api_key=api_key)
+         self.model_name = model_name
+
+     def analyze_search_result(self, query: str, search_results: List[ScrapeResult]) -> AnalyzerResult:
+         """
+         Analyzes the provided search results based on the given query.
+
+         Args:
+             query (str): The search query string.
+             search_results (List[ScrapeResult]): A list of scraped results to be analyzed.
+
+         Returns:
+             AnalyzerResult: The parsed result of the analysis.
+
+         Raises:
+             Exception: If the OpenAI request or response parsing fails.
+         """
+         try:
+             # Substitute the query and the scraped results into the prompt template.
+             user_prompt = DEFAULT_USER_PROMPT.replace("query", query).replace(
+                 "scrape_results", f"{search_results}"
+             )
+             completion = self.client.beta.chat.completions.parse(
+                 model=self.model_name,
+                 messages=[
+                     {"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
+                     {"role": "user", "content": user_prompt},
+                 ],
+                 # Parse the model output directly into the AnalyzerResult schema.
+                 response_format=AnalyzerResult,
+             )
+             return completion.choices[0].message.parsed
+         except Exception as e:
+             raise Exception(f"Error while analyzing search result: {str(e)}") from e
src/models/__init__.py ADDED
File without changes
src/models/analyzer_models.py ADDED
@@ -0,0 +1,16 @@
+ from typing import List, Optional
+ from pydantic import BaseModel, Field
+
+
+ class Citation(BaseModel):
+     citation_type: str = Field(description="Citation type.")
+     url: str = Field(description="Citation URL.")
+     start_index: int = Field(description="Citation start index in response.")
+     end_index: int = Field(description="Citation end index in response.")
+
+
+ class AnalyzerResult(BaseModel):
+     response_str: str = Field(description="Final response string.")
+     citation: Optional[List[Citation]] = Field(
+         default=[], description="Citations supporting the final response."
+     )
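To illustrate how these models fit together, a small sketch; treating start_index/end_index as character offsets into response_str is an assumption, since the commit does not define them:

from src.models.analyzer_models import AnalyzerResult, Citation

result = AnalyzerResult(
    response_str="Generative AI adoption grew in 2024.",
    citation=[
        Citation(
            citation_type="url_citation",  # hypothetical type label
            url="https://example.com/report",
            start_index=0,
            end_index=36,  # assumed: character offsets into response_str
        )
    ],
)
print(result.model_dump_json(indent=2))  # assumes pydantic v2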
src/models/scrape_models.py ADDED
@@ -0,0 +1,15 @@
+ from typing import Optional
+ from pydantic import BaseModel, Field
+
+ from src.models.search_models import SearchItemResult
+
+
+ class ScrapeQuery(SearchItemResult):
+     pass
+
+
+ class ScrapeResult(BaseModel):
+     url: str = Field(description="URL.")
+     content: Optional[str] = Field("", description="URL content.")
+     title: str = Field(description="Title of result item.")
+     description: str = Field(description="Description of result item.")
src/models/search_models.py ADDED
@@ -0,0 +1,14 @@
+ from typing import List, Optional
+ from pydantic import BaseModel, Field
+
+
+ class SearchItemResult(BaseModel):
+     url: str = Field(description="URL of result item.")
+     title: str = Field(description="Title of result item.")
+     description: str = Field(description="Description of result item.")
+
+
+ class SearchResult(BaseModel):
+     items: Optional[List[SearchItemResult]] = Field(
+         default=[], description="Search result items."
+     )
src/scraper/__init__.py ADDED
File without changes
src/scraper/trafilatura_scraper.py ADDED
@@ -0,0 +1,33 @@
+ from src.core.interface.scraper_interface import ScraperInterface
+ from src.models.scrape_models import ScrapeQuery, ScrapeResult
+
+ import trafilatura
+
+
+ class TrafilaturaScraper(ScraperInterface):
+     def __init__(self):
+         pass
+
+     def get_url_content(self, url_parameters: ScrapeQuery) -> ScrapeResult:
+         """
+         Fetches and extracts the main textual content from the specified URL using trafilatura.
+
+         Args:
+             url_parameters (ScrapeQuery): The URL parameters of the web page to scrape.
+
+         Returns:
+             ScrapeResult: An object containing the extracted content from the URL.
+
+         Raises:
+             Exception: If an error occurs during fetching or extraction.
+         """
+         try:
+             downloaded = trafilatura.fetch_url(url_parameters.url)
+             # trafilatura.extract returns None when no main content can be found.
+             result = trafilatura.extract(downloaded)
+
+             return ScrapeResult(
+                 content=result,
+                 url=url_parameters.url,
+                 title=url_parameters.title,
+                 description=url_parameters.description,
+             )
+
+         except Exception as e:
+             raise Exception(f"Error occurred while getting url content: {str(e)}") from e
src/searcher/__init__.py ADDED
File without changes
src/searcher/open_google_search.py ADDED
@@ -0,0 +1,93 @@
+ from typing import List, Optional
+ from googlesearch import search
+
+ from src.core.interface.searcher_interface import SearchInterface
+ from src.models.search_models import SearchItemResult, SearchResult
+
+
+ class GoogleSearch(SearchInterface):
+     def __init__(self):
+         pass
+
+     def search_custom_sites(
+         self, query: str, sites: Optional[list] = None
+     ) -> SearchResult:
+         """
+         Performs a Google search restricted to a dynamic list of specific sites.
+
+         Args:
+             query (str): The user's search query (e.g., "generative AI").
+             sites (list): A list of websites to search within (e.g., ['wired.com', 'theverge.com']).
+
+         Returns:
+             SearchResult: The search results, or an empty result if nothing was found.
+
+         Raises:
+             Exception: If the underlying Google search fails.
+         """
+         try:
+             # 1. Construct the dynamic query string:
+             # join the sites with " OR ", each prefixed with the "site:" operator.
+             site_restriction = (
+                 " OR ".join([f"site:{site}" for site in sites]) if sites else ""
+             )
+             full_query = f"{query} {site_restriction}"
+
+             # 2. Execute the search
+             result = search(full_query, num_results=5, advanced=True)
+
+             # 3. Collect the returned results
+             items = [
+                 SearchItemResult(
+                     url=item.url, title=item.title, description=item.description
+                 )
+                 for item in result
+             ]
+
+             urls = [item.url for item in items if item.url]
+             if not urls:
+                 return SearchResult(items=[])
+
+             return SearchResult(items=items)
+
+         except Exception as e:
+             raise Exception(f"An error occurred while searching in Google: {str(e)}") from e
+
+     def search_custom_domains(
+         self, query: str, domains: Optional[List[str]] = None
+     ) -> SearchResult:
+         """
+         Performs a Google search restricted to a list of domains.
+
+         Args:
+             query (str): The user's search query (e.g., "generative AI").
+             domains (List[str]): A list of domains to search within (e.g., ['.edu']).
+
+         Returns:
+             SearchResult: The search results, or an empty result if nothing was found.
+
+         Raises:
+             Exception: If the underlying Google search fails.
+         """
+         try:
+             # 1. Construct the dynamic query string.
+             # Note: the "site:" operator must not be followed by a space.
+             domain_restriction = (
+                 " OR ".join([f"site:{domain}" for domain in domains])
+                 if domains
+                 else ""
+             )
+             full_query = f"{query} {domain_restriction}"
+
+             # 2. Execute the search
+             result = search(full_query, num_results=5, advanced=True)
+
+             # 3. Collect the returned results
+             items = [
+                 SearchItemResult(
+                     url=item.url, title=item.title, description=item.description
+                 )
+                 for item in result
+             ]
+
+             urls = [item.url for item in items if item.url]
+             if not urls:
+                 return SearchResult(items=[])
+
+             return SearchResult(items=items)
+
+         except Exception as e:
+             raise Exception(f"An error occurred while searching in Google: {str(e)}") from e
src/tools/__init__.py ADDED
File without changes
src/tools/custom_domains_search_tool.py ADDED
@@ -0,0 +1,52 @@
+ from typing import List, Optional
+
+ from src.core.factory.analyzer_facrory import AnalyzerFactory
+ from src.core.factory.scraper_factory import ScraperFactory
+ from src.core.factory.searcher_factory import SearcherFactory
+ from src.core.types import AnalyzerType, ScraperType, SearcherType
+ from src.models.analyzer_models import AnalyzerResult
+
+ searcher = SearcherFactory.initialize_searcher(
+     searcher_type=SearcherType.OPEN_GOOGLE_SEARCH
+ )
+ scraper = ScraperFactory.initialize_scraper(ScraperType.TRAFILATURA_SCRAPER)
+ analyzer = AnalyzerFactory.initialize_analyzer(AnalyzerType.OPENAI_ANALYZER)
+
+
+ def search_custom_domain(query: str, domains: Optional[List[str]] = None) -> AnalyzerResult:
+     """
+     Performs a custom domain search for the given query and domains, scrapes the resulting URLs, and analyzes the search results.
+
+     Args:
+         query (str): The search query string. Must not be empty.
+         domains (List[str], optional): A list of domain strings to restrict the search (e.g., ['edu', 'gov']). Must not be empty.
+
+     Returns:
+         AnalyzerResult: The analyzed result of the search, as returned by the analyzer.
+
+     Raises:
+         ValueError: If the query or domains are empty.
+         Exception: Propagates any exceptions raised during the search, scraping, or analysis process.
+     """
+     try:
+         # 1. Validate search parameters
+         if not query or query.strip() == "":
+             raise ValueError("Query can't be empty.")
+
+         if not domains:
+             raise ValueError("Domains can't be empty (e.g. 'edu', 'gov').")
+
+         # 2. Run initial search
+         searcher_result = searcher.search_custom_domains(query=query, domains=domains)
+
+         # 3. Scrape each search result
+         scrape_result = []
+         for item in searcher_result.items:
+             url_scrape_result = scraper.get_url_content(url_parameters=item)
+             scrape_result.append(url_scrape_result)
+
+         # 4. Analyze the scraped results
+         final_result = analyzer.analyze_search_result(
+             query=query, search_results=scrape_result
+         )
+
+         return final_result
+     except Exception:
+         raise
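A usage sketch; the query and domains are illustrative, and the factories above are assumed to pick up any credentials the OpenAI analyzer needs:

from src.tools.custom_domains_search_tool import search_custom_domain

result = search_custom_domain(query="generative AI research", domains=["edu", "gov"])
print(result.response_str)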
src/tools/custom_sites_search_tool.py ADDED
@@ -0,0 +1,53 @@
+ from typing import Optional
+
+ from src.core.factory.analyzer_facrory import AnalyzerFactory
+ from src.core.factory.scraper_factory import ScraperFactory
+ from src.core.factory.searcher_factory import SearcherFactory
+ from src.core.types import AnalyzerType, ScraperType, SearcherType
+ from src.models.analyzer_models import AnalyzerResult
+ from src.utils.url_validator import validate_urls
+
+ searcher = SearcherFactory.initialize_searcher(
+     searcher_type=SearcherType.OPEN_GOOGLE_SEARCH
+ )
+ scraper = ScraperFactory.initialize_scraper(ScraperType.TRAFILATURA_SCRAPER)
+ analyzer = AnalyzerFactory.initialize_analyzer(AnalyzerType.OPENAI_ANALYZER)
+
+
+ def search_custom_sites(query: str, sites: Optional[list] = None) -> AnalyzerResult:
+     """
+     Performs a custom site search, scrapes the resulting URLs, and analyzes the content.
+
+     Args:
+         query (str): The search query string. Must not be empty.
+         sites (Optional[list], optional): A list of site URLs to restrict the search to. Defaults to None.
+
+     Returns:
+         AnalyzerResult: The analyzed result of the scraped search results.
+
+     Raises:
+         ValueError: If the query is empty or any site URL is invalid.
+         Exception: Propagates any exception raised during validation, searching, scraping, or analysis.
+     """
+     try:
+         # 1. Validate search parameters
+         if not query or query.strip() == "":
+             raise ValueError("Query can't be empty.")
+
+         # Only validate when sites were provided; None means an unrestricted search.
+         if sites:
+             validate_urls(urls=sites)
+
+         # 2. Run initial search
+         searcher_result = searcher.search_custom_sites(query=query, sites=sites)
+
+         # 3. Scrape each search result
+         scrape_result = []
+         for item in searcher_result.items:
+             url_scrape_result = scraper.get_url_content(url_parameters=item)
+             scrape_result.append(url_scrape_result)
+
+         # 4. Analyze the scraped results
+         final_result = analyzer.analyze_search_result(
+             query=query, search_results=scrape_result
+         )
+
+         return final_result
+     except Exception:
+         raise
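A usage sketch; note that validate_urls relies on validators.url, which expects full URLs with a scheme, so bare hostnames such as 'wired.com' would fail validation here:

from src.tools.custom_sites_search_tool import search_custom_sites

# Full URLs, since validators.url rejects bare hostnames.
result = search_custom_sites(
    query="generative AI",
    sites=["https://www.wired.com", "https://www.theverge.com"],
)
print(result.response_str)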
src/tools/search_on_web_tool.py ADDED
@@ -0,0 +1,46 @@
+ from src.core.factory.analyzer_facrory import AnalyzerFactory
+ from src.core.factory.scraper_factory import ScraperFactory
+ from src.core.factory.searcher_factory import SearcherFactory
+ from src.core.types import AnalyzerType, ScraperType, SearcherType
+ from src.models.analyzer_models import AnalyzerResult
+
+ searcher = SearcherFactory.initialize_searcher(
+     searcher_type=SearcherType.OPEN_GOOGLE_SEARCH
+ )
+ scraper = ScraperFactory.initialize_scraper(ScraperType.TRAFILATURA_SCRAPER)
+ analyzer = AnalyzerFactory.initialize_analyzer(AnalyzerType.OPENAI_ANALYZER)
+
+
+ def search_on_web(query: str) -> AnalyzerResult:
+     """
+     Performs a general web search, scrapes the resulting URLs, and analyzes the search results.
+
+     Args:
+         query (str): The search query string. Must not be empty.
+
+     Returns:
+         AnalyzerResult: The analyzed result of the search, as returned by the analyzer.
+
+     Raises:
+         ValueError: If the query is empty.
+         Exception: Propagates any exceptions raised during the search, scraping, or analysis process.
+     """
+     try:
+         # 1. Validate search parameters
+         if not query or query.strip() == "":
+             raise ValueError("Query can't be empty.")
+
+         # 2. Run initial search; with no domains given, this is an unrestricted search.
+         searcher_result = searcher.search_custom_domains(query=query)
+
+         # 3. Scrape each search result
+         scrape_result = []
+         for item in searcher_result.items:
+             url_scrape_result = scraper.get_url_content(url_parameters=item)
+             scrape_result.append(url_scrape_result)
+
+         # 4. Analyze the scraped results
+         final_result = analyzer.analyze_search_result(
+             query=query, search_results=scrape_result
+         )
+
+         return final_result
+     except Exception:
+         raise
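And the unrestricted variant, with an illustrative query:

from src.tools.search_on_web_tool import search_on_web

result = search_on_web(query="latest developments in generative AI")
print(result.response_str)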
src/utils/url_validator.py ADDED
@@ -0,0 +1,13 @@
+ from typing import List
+ import validators
+
+
+ def validate_url(url: str):
+     if not validators.url(url):
+         raise ValueError(f"Provided url: {url} is not valid.")
+
+
+ def validate_urls(urls: List[str]):
+     for url in urls:
+         validate_url(url)