Olaemad committed on
Commit
237ef97
·
1 Parent(s): bcfcfc4

Add initial project structure with configuration and factory classes

Browse files

- Created pyproject.toml and requirements.txt for project dependencies.
- Implemented AnalyzerFactory, ScraperFactory, and SearcherFactory classes.
- Defined interfaces for analyzers, scrapers, and searchers.
- Added type enumerations for searchers, scrapers, and analyzers.

pyproject.toml ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "search-tool"
3
+ version = "0.1.0"
4
+ description = "Search tool with pluggable searchers, scrapers, and analyzers"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "annotated-types==0.7.0",
9
+ "anyio==4.9.0",
10
+ "babel==2.17.0",
11
+ "beautifulsoup4==4.13.4",
12
+ "cachetools==5.5.2",
13
+ "certifi==2025.4.26",
14
+ "charset-normalizer==3.4.2",
15
+ "click==8.2.1",
16
+ "courlan==1.3.2",
17
+ "dateparser==1.2.1",
18
+ "distro==1.9.0",
19
+ "exceptiongroup==1.3.0",
20
+ "google-api-core==2.25.0",
21
+ "google-api-python-client==2.171.0",
22
+ "google-auth==2.40.3",
23
+ "google-auth-httplib2==0.2.0",
24
+ "googleapis-common-protos==1.70.0",
25
+ "googlesearch-python==1.3.0",
26
+ "h11==0.16.0",
27
+ "htmldate==1.9.3",
28
+ "httpcore==1.0.9",
29
+ "httplib2==0.22.0",
30
+ "httpx==0.28.1",
31
+ "httpx-sse==0.4.0",
32
+ "idna==3.10",
33
+ "jiter==0.10.0",
34
+ "justext==3.0.2",
35
+ "lxml==5.4.0",
36
+ "lxml-html-clean==0.4.2",
37
+ "mcp==1.9.3",
38
+ "openai==1.85.0",
39
+ "proto-plus==1.26.1",
40
+ "protobuf==6.31.1",
41
+ "pyasn1==0.6.1",
42
+ "pyasn1-modules==0.4.2",
43
+ "pydantic==2.11.5",
44
+ "pydantic-core==2.33.2",
45
+ "pydantic-settings==2.9.1",
46
+ "pyparsing==3.2.3",
47
+ "python-dateutil==2.9.0.post0",
48
+ "python-dotenv==1.1.0",
49
+ "python-multipart==0.0.20",
50
+ "pytz==2025.2",
51
+ "regex==2024.11.6",
52
+ "requests==2.32.4",
53
+ "rsa==4.9.1",
54
+ "six==1.17.0",
55
+ "sniffio==1.3.1",
56
+ "soupsieve==2.7",
57
+ "sse-starlette==2.3.6",
58
+ "starlette==0.47.0",
59
+ "tld==0.13.1",
60
+ "tqdm==4.67.1",
61
+ "trafilatura==2.0.0",
62
+ "typing-extensions==4.14.0",
63
+ "typing-inspection==0.4.1",
64
+ "tzlocal==5.3.1",
65
+ "uritemplate==4.2.0",
66
+ "urllib3==2.4.0",
67
+ "uvicorn==0.34.3",
68
+ "validators==0.35.0",
69
+ ]
requirements.txt ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.7.0
2
+ anyio==4.9.0
3
+ babel==2.17.0
4
+ beautifulsoup4==4.13.4
5
+ cachetools==5.5.2
6
+ certifi==2025.4.26
7
+ charset-normalizer==3.4.2
8
+ click==8.2.1
9
+ courlan==1.3.2
10
+ dateparser==1.2.1
11
+ distro==1.9.0
12
+ exceptiongroup==1.3.0
13
+ google-api-core==2.25.0
14
+ google-api-python-client==2.171.0
15
+ google-auth==2.40.3
16
+ google-auth-httplib2==0.2.0
17
+ googleapis-common-protos==1.70.0
18
+ googlesearch-python==1.3.0
19
+ h11==0.16.0
20
+ htmldate==1.9.3
21
+ httpcore==1.0.9
22
+ httplib2==0.22.0
23
+ httpx==0.28.1
24
+ httpx-sse==0.4.0
25
+ idna==3.10
26
+ jiter==0.10.0
27
+ justext==3.0.2
28
+ lxml==5.4.0
29
+ lxml-html-clean==0.4.2
30
+ mcp==1.9.3
31
+ openai==1.85.0
32
+ proto-plus==1.26.1
33
+ protobuf==6.31.1
34
+ pyasn1==0.6.1
35
+ pyasn1-modules==0.4.2
36
+ pydantic==2.11.5
37
+ pydantic-core==2.33.2
38
+ pydantic-settings==2.9.1
39
+ pyparsing==3.2.3
40
+ python-dateutil==2.9.0.post0
41
+ python-dotenv==1.1.0
42
+ python-multipart==0.0.20
43
+ pytz==2025.2
44
+ regex==2024.11.6
45
+ requests==2.32.4
46
+ rsa==4.9.1
47
+ six==1.17.0
48
+ sniffio==1.3.1
49
+ soupsieve==2.7
50
+ sse-starlette==2.3.6
51
+ starlette==0.47.0
52
+ tld==0.13.1
53
+ tqdm==4.67.1
54
+ trafilatura==2.0.0
55
+ typing-extensions==4.14.0
56
+ typing-inspection==0.4.1
57
+ tzlocal==5.3.1
58
+ uritemplate==4.2.0
59
+ urllib3==2.4.0
60
+ uvicorn==0.34.3
61
+ validators==0.35.0
src/core/factory/__init__.py ADDED
File without changes
src/core/factory/analyzer_facrory.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from dotenv import load_dotenv

from src.core.types import AnalyzerType
from src.analyzer.openai_analyzer import OpenaiAnalyzer

load_dotenv()  # Load OPENAI_API_KEY (and friends) from a .env file, if present.


class AnalyzerFactory:
    """Creates analyzer instances from an AnalyzerType identifier."""

    @staticmethod
    def initialize_analyzer(analyzer_type: str) -> OpenaiAnalyzer:
        """
        Instantiate the analyzer matching *analyzer_type*.

        Args:
            analyzer_type (str): One of the AnalyzerType enum values.

        Returns:
            OpenaiAnalyzer: A configured analyzer instance.

        Raises:
            ValueError: If *analyzer_type* is not a supported AnalyzerType.
        """
        if analyzer_type == AnalyzerType.OPENAI_ANALYZER:
            return OpenaiAnalyzer(api_key=os.getenv("OPENAI_API_KEY"))
        # Iterating the Enum lists its members reliably; __annotations__ is a
        # fragile way to enumerate them and is not part of the Enum protocol.
        supported = [member.value for member in AnalyzerType]
        raise ValueError(
            f"Unsupported analyzer type {analyzer_type!r}; please choose from {supported}"
        )
src/core/factory/scraper_factory.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from src.core.types import ScraperType
from src.scraper.trafilatura_scraper import TrafilaturaScraper


class ScraperFactory:
    """Creates scraper instances from a ScraperType identifier."""

    @staticmethod
    def initialize_scraper(scraper_type: str) -> TrafilaturaScraper:
        """
        Instantiate the scraper matching *scraper_type*.

        Args:
            scraper_type (str): One of the ScraperType enum values.

        Returns:
            TrafilaturaScraper: A scraper instance.

        Raises:
            ValueError: If *scraper_type* is not a supported ScraperType.
        """
        if scraper_type == ScraperType.TRAFILATURA_SCRAPER:
            return TrafilaturaScraper()
        # Enum members, not __annotations__, are the reliable source of the
        # supported options for the error message.
        supported = [member.value for member in ScraperType]
        raise ValueError(
            f"Unsupported scraper type {scraper_type!r}; please choose from {supported}"
        )
src/core/factory/searcher_factory.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from src.core.types import SearcherType
from src.searcher.open_google_search import GoogleSearch


class SearcherFactory:
    """Creates searcher instances from a SearcherType identifier."""

    @staticmethod
    def initialize_searcher(searcher_type: str) -> GoogleSearch:
        """
        Instantiate the searcher matching *searcher_type*.

        Args:
            searcher_type (str): One of the SearcherType enum values.

        Returns:
            GoogleSearch: A searcher instance.

        Raises:
            ValueError: If *searcher_type* is not a supported SearcherType.
        """
        if searcher_type == SearcherType.OPEN_GOOGLE_SEARCH:
            return GoogleSearch()
        # Enum members, not __annotations__, are the reliable source of the
        # supported options for the error message.
        supported = [member.value for member in SearcherType]
        raise ValueError(
            f"Unsupported searcher type {searcher_type!r}; please choose from {supported}"
        )
src/core/interface/__init__.py ADDED
File without changes
src/core/interface/analyzer_interface.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from abc import ABC, abstractmethod
from typing import List

from src.models.scrape_models import ScrapeResult
from src.models.analyzer_models import AnalyzerResult


class AnalyzerInterface(ABC):
    """Contract for analyzers that answer a query from scraped search results."""

    @abstractmethod
    def analyze_search_result(
        self, query: str, search_result: List[ScrapeResult]
    ) -> AnalyzerResult:
        """
        Analyzes the provided search results based on the given query.

        Args:
            query (str): The search query string.
            search_result (List[ScrapeResult]): A list of search results to be analyzed.

        Returns:
            AnalyzerResult: The result of the analysis.

        Raises:
            NotImplementedError: If the method is not implemented by a subclass.
        """
        raise NotImplementedError
src/core/interface/scraper_interface.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from abc import ABC, abstractmethod

from src.models.scrape_models import ScrapeQuery, ScrapeResult


class ScraperInterface(ABC):
    """Contract for scrapers that extract page content for a URL."""

    @abstractmethod
    def get_url_content(self, url_parameters: ScrapeQuery) -> ScrapeResult:
        """
        Fetches the content of the specified URL and returns the result as a ScrapeResult object.

        Args:
            url_parameters (ScrapeQuery): The URL parameters to fetch content from.

        Returns:
            ScrapeResult: An object containing the scraped content and related metadata.

        Raises:
            NotImplementedError: This method should be implemented by subclasses.
        """
        raise NotImplementedError
src/core/interface/searcher_interface.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from abc import ABC, abstractmethod
from typing import List, Optional

from src.models.search_models import SearchResult


class SearchInterface(ABC):
    """Contract for search backends that run a query, optionally scoped to sites or domains."""

    @abstractmethod
    def search_custom_sites(
        self, query: str, sites: Optional[List[str]] = None
    ) -> SearchResult:
        """
        Searches for the given query across a list of custom sites.

        Args:
            query (str): The search query string.
            sites (Optional[List[str]], optional): A list of site URLs or identifiers
                to restrict the search to. If None, searches all available custom sites.

        Returns:
            SearchResult: An object containing the search results.

        Raises:
            NotImplementedError: This method must be implemented by subclasses.
        """
        raise NotImplementedError

    @abstractmethod
    def search_custom_domains(
        self, query: str, domains: Optional[List[str]] = None
    ) -> SearchResult:
        """
        Searches for the given query within the specified custom domains.

        Args:
            query (str): The search query string.
            domains (Optional[List[str]], optional): The custom domains to restrict
                the search to. Defaults to None.

        Returns:
            SearchResult: The result of the search operation.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
        raise NotImplementedError
src/core/types.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from enum import Enum


class SearcherType(str, Enum):
    """Identifiers for the available search backends (consumed by SearcherFactory)."""
    # NOTE(review): the `: str` member annotation is redundant on a str-Enum,
    # but the factories read `__annotations__` for their error messages, so
    # it is left in place here.
    OPEN_GOOGLE_SEARCH: str = "Open Google Search"


class ScraperType(str, Enum):
    """Identifiers for the available page scrapers (consumed by ScraperFactory)."""
    TRAFILATURA_SCRAPER: str = "trafilatura_scraper"


class AnalyzerType(str, Enum):
    """Identifiers for the available LLM analyzers (consumed by AnalyzerFactory)."""
    OPENAI_ANALYZER: str = "openai_analyzer"


# DEFAULT VALUES AND CONSTANTS

# System prompt: instructs the model to answer strictly from the supplied
# ScrapeResult items and to return an AnalyzerResult with url_citation entries.
DEFAULT_SYSTEM_PROMPT = """You are an intelligent assistant designed to answer user questions strictly based on the provided list of search result items. Each item includes a title, description, content, and URL. You must not use external knowledge or make assumptions beyond what is explicitly available in the search results.

Your task is to generate a concise and informative response to the user’s query, ensuring that any factual claims in your answer are supported by specific excerpts from the `ScrapeResult` list. For each piece of information used from a scrape result, create a corresponding `Citation` object.

You must return the result in the form of an `AnalyzerResult`, which includes:

- `response_str`: The complete response text.
- `citation`: A list of `Citation` entries referencing the exact part of the `response_str` that came from the scraped content.

Each `Citation` must include:
- `citation_type`: Always "url_citation".
- `url`: The source URL as provided.
- `start_index` and `end_index`: The exact character indices of the corresponding information in the `response_str`.

Only include citations for parts that directly come from the `ScrapeResult`.

Do not fabricate information. If the scraped results do not contain enough detail to fully answer the question, mention that in your answer.
"""

# User prompt template; `{query}` and `{scrape_results}` placeholders are
# filled in by the caller (presumably via str.format — TODO confirm).
DEFAULT_USER_PROMPT = """
"query": {query},
"scrape_results": {scrape_results}
"""

# Default OpenAI model name used by the OpenAI analyzer.
DEFAULT_OPENAI_ANALYZER = "gpt-4o-mini"