Spaces:
Runtime error
Runtime error
Add initial project structure with configuration and factory classes
- Created pyproject.toml and requirements.txt for project dependencies.
- Implemented AnalyzerFactory, ScraperFactory, and SearcherFactory classes.
- Defined interfaces for analyzers, scrapers, and searchers.
- Added type enumerations for searchers, scrapers, and analyzers.
- pyproject.toml +69 -0
- requirements.txt +61 -0
- src/core/factory/__init__.py +0 -0
- src/core/factory/analyzer_facrory.py +18 -0
- src/core/factory/scraper_factory.py +13 -0
- src/core/factory/searcher_factory.py +13 -0
- src/core/interface/__init__.py +0 -0
- src/core/interface/analyzer_interface.py +23 -0
- src/core/interface/scraper_interface.py +18 -0
- src/core/interface/searcher_interface.py +41 -0
- src/core/types.py +41 -0
pyproject.toml
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "search-tool"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Add your description here"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"annotated-types==0.7.0",
|
| 9 |
+
"anyio==4.9.0",
|
| 10 |
+
"babel==2.17.0",
|
| 11 |
+
"beautifulsoup4==4.13.4",
|
| 12 |
+
"cachetools==5.5.2",
|
| 13 |
+
"certifi==2025.4.26",
|
| 14 |
+
"charset-normalizer==3.4.2",
|
| 15 |
+
"click==8.2.1",
|
| 16 |
+
"courlan==1.3.2",
|
| 17 |
+
"dateparser==1.2.1",
|
| 18 |
+
"distro==1.9.0",
|
| 19 |
+
"exceptiongroup==1.3.0",
|
| 20 |
+
"google-api-core==2.25.0",
|
| 21 |
+
"google-api-python-client==2.171.0",
|
| 22 |
+
"google-auth==2.40.3",
|
| 23 |
+
"google-auth-httplib2==0.2.0",
|
| 24 |
+
"googleapis-common-protos==1.70.0",
|
| 25 |
+
"googlesearch-python==1.3.0",
|
| 26 |
+
"h11==0.16.0",
|
| 27 |
+
"htmldate==1.9.3",
|
| 28 |
+
"httpcore==1.0.9",
|
| 29 |
+
"httplib2==0.22.0",
|
| 30 |
+
"httpx==0.28.1",
|
| 31 |
+
"httpx-sse==0.4.0",
|
| 32 |
+
"idna==3.10",
|
| 33 |
+
"jiter==0.10.0",
|
| 34 |
+
"justext==3.0.2",
|
| 35 |
+
"lxml==5.4.0",
|
| 36 |
+
"lxml-html-clean==0.4.2",
|
| 37 |
+
"mcp==1.9.3",
|
| 38 |
+
"openai==1.85.0",
|
| 39 |
+
"proto-plus==1.26.1",
|
| 40 |
+
"protobuf==6.31.1",
|
| 41 |
+
"pyasn1==0.6.1",
|
| 42 |
+
"pyasn1-modules==0.4.2",
|
| 43 |
+
"pydantic==2.11.5",
|
| 44 |
+
"pydantic-core==2.33.2",
|
| 45 |
+
"pydantic-settings==2.9.1",
|
| 46 |
+
"pyparsing==3.2.3",
|
| 47 |
+
"python-dateutil==2.9.0.post0",
|
| 48 |
+
"python-dotenv==1.1.0",
|
| 49 |
+
"python-multipart==0.0.20",
|
| 50 |
+
"pytz==2025.2",
|
| 51 |
+
"regex==2024.11.6",
|
| 52 |
+
"requests==2.32.4",
|
| 53 |
+
"rsa==4.9.1",
|
| 54 |
+
"six==1.17.0",
|
| 55 |
+
"sniffio==1.3.1",
|
| 56 |
+
"soupsieve==2.7",
|
| 57 |
+
"sse-starlette==2.3.6",
|
| 58 |
+
"starlette==0.47.0",
|
| 59 |
+
"tld==0.13.1",
|
| 60 |
+
"tqdm==4.67.1",
|
| 61 |
+
"trafilatura==2.0.0",
|
| 62 |
+
"typing-extensions==4.14.0",
|
| 63 |
+
"typing-inspection==0.4.1",
|
| 64 |
+
"tzlocal==5.3.1",
|
| 65 |
+
"uritemplate==4.2.0",
|
| 66 |
+
"urllib3==2.4.0",
|
| 67 |
+
"uvicorn==0.34.3",
|
| 68 |
+
"validators==0.35.0",
|
| 69 |
+
]
|
requirements.txt
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
annotated-types==0.7.0
|
| 2 |
+
anyio==4.9.0
|
| 3 |
+
babel==2.17.0
|
| 4 |
+
beautifulsoup4==4.13.4
|
| 5 |
+
cachetools==5.5.2
|
| 6 |
+
certifi==2025.4.26
|
| 7 |
+
charset-normalizer==3.4.2
|
| 8 |
+
click==8.2.1
|
| 9 |
+
courlan==1.3.2
|
| 10 |
+
dateparser==1.2.1
|
| 11 |
+
distro==1.9.0
|
| 12 |
+
exceptiongroup==1.3.0
|
| 13 |
+
google-api-core==2.25.0
|
| 14 |
+
google-api-python-client==2.171.0
|
| 15 |
+
google-auth==2.40.3
|
| 16 |
+
google-auth-httplib2==0.2.0
|
| 17 |
+
googleapis-common-protos==1.70.0
|
| 18 |
+
googlesearch-python==1.3.0
|
| 19 |
+
h11==0.16.0
|
| 20 |
+
htmldate==1.9.3
|
| 21 |
+
httpcore==1.0.9
|
| 22 |
+
httplib2==0.22.0
|
| 23 |
+
httpx==0.28.1
|
| 24 |
+
httpx-sse==0.4.0
|
| 25 |
+
idna==3.10
|
| 26 |
+
jiter==0.10.0
|
| 27 |
+
justext==3.0.2
|
| 28 |
+
lxml==5.4.0
|
| 29 |
+
lxml-html-clean==0.4.2
|
| 30 |
+
mcp==1.9.3
|
| 31 |
+
openai==1.85.0
|
| 32 |
+
proto-plus==1.26.1
|
| 33 |
+
protobuf==6.31.1
|
| 34 |
+
pyasn1==0.6.1
|
| 35 |
+
pyasn1-modules==0.4.2
|
| 36 |
+
pydantic==2.11.5
|
| 37 |
+
pydantic-core==2.33.2
|
| 38 |
+
pydantic-settings==2.9.1
|
| 39 |
+
pyparsing==3.2.3
|
| 40 |
+
python-dateutil==2.9.0.post0
|
| 41 |
+
python-dotenv==1.1.0
|
| 42 |
+
python-multipart==0.0.20
|
| 43 |
+
pytz==2025.2
|
| 44 |
+
regex==2024.11.6
|
| 45 |
+
requests==2.32.4
|
| 46 |
+
rsa==4.9.1
|
| 47 |
+
six==1.17.0
|
| 48 |
+
sniffio==1.3.1
|
| 49 |
+
soupsieve==2.7
|
| 50 |
+
sse-starlette==2.3.6
|
| 51 |
+
starlette==0.47.0
|
| 52 |
+
tld==0.13.1
|
| 53 |
+
tqdm==4.67.1
|
| 54 |
+
trafilatura==2.0.0
|
| 55 |
+
typing-extensions==4.14.0
|
| 56 |
+
typing-inspection==0.4.1
|
| 57 |
+
tzlocal==5.3.1
|
| 58 |
+
uritemplate==4.2.0
|
| 59 |
+
urllib3==2.4.0
|
| 60 |
+
uvicorn==0.34.3
|
| 61 |
+
validators==0.35.0
|
src/core/factory/__init__.py
ADDED
|
File without changes
|
src/core/factory/analyzer_facrory.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

from dotenv import load_dotenv

from src.core.types import AnalyzerType
from src.analyzer.openai_analyzer import OpenaiAnalyzer

load_dotenv()  # Load environment variables (e.g. OPENAI_API_KEY) from a .env file.


class AnalyzerFactory:
    """Factory that builds analyzer instances from an ``AnalyzerType`` value."""

    @staticmethod
    def initialize_analyzer(analyzer_type: str):
        """Create the analyzer matching ``analyzer_type``.

        Args:
            analyzer_type (str): One of the ``AnalyzerType`` enum values.

        Returns:
            OpenaiAnalyzer: The initialized analyzer (currently the only kind).

        Raises:
            ValueError: If ``analyzer_type`` is not a supported analyzer type.
        """
        if analyzer_type == AnalyzerType.OPENAI_ANALYZER:
            # The key may be None if OPENAI_API_KEY is unset; OpenaiAnalyzer
            # is expected to validate it — TODO confirm.
            return OpenaiAnalyzer(api_key=os.getenv("OPENAI_API_KEY"))
        # Enumerate supported values from the enum members themselves rather
        # than __annotations__, which only works while members carry type
        # annotations. ValueError is the idiomatic error for a bad argument
        # (and is still caught by callers handling Exception).
        supported = [member.value for member in AnalyzerType]
        raise ValueError(
            f"Unsupported analyzer type {analyzer_type!r}; please choose from {supported}"
        )
src/core/factory/scraper_factory.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from src.core.types import ScraperType
from src.scraper.trafilatura_scraper import TrafilaturaScraper


class ScraperFactory:
    """Factory that builds scraper instances from a ``ScraperType`` value."""

    @staticmethod
    def initialize_scraper(scraper_type: str):
        """Create the scraper matching ``scraper_type``.

        Args:
            scraper_type (str): One of the ``ScraperType`` enum values.

        Returns:
            TrafilaturaScraper: The initialized scraper (currently the only kind).

        Raises:
            ValueError: If ``scraper_type`` is not a supported scraper type.
        """
        if scraper_type == ScraperType.TRAFILATURA_SCRAPER:
            return TrafilaturaScraper()
        # Enumerate supported values from the enum members themselves rather
        # than __annotations__, which only works while members carry type
        # annotations. ValueError is the idiomatic error for a bad argument.
        supported = [member.value for member in ScraperType]
        raise ValueError(
            f"Unsupported scraper type {scraper_type!r}; please choose from {supported}"
        )
|
src/core/factory/searcher_factory.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from src.core.types import SearcherType
from src.searcher.open_google_search import GoogleSearch


class SearcherFactory:
    """Factory that builds searcher instances from a ``SearcherType`` value."""

    @staticmethod
    def initialize_searcher(searcher_type: str):
        """Create the searcher matching ``searcher_type``.

        Args:
            searcher_type (str): One of the ``SearcherType`` enum values.

        Returns:
            GoogleSearch: The initialized searcher (currently the only kind).

        Raises:
            ValueError: If ``searcher_type`` is not a supported searcher type.
        """
        if searcher_type == SearcherType.OPEN_GOOGLE_SEARCH:
            return GoogleSearch()
        # Enumerate supported values from the enum members themselves rather
        # than __annotations__, which only works while members carry type
        # annotations. ValueError is the idiomatic error for a bad argument.
        supported = [member.value for member in SearcherType]
        raise ValueError(
            f"Unsupported searcher type {searcher_type!r}; please choose from {supported}"
        )
|
src/core/interface/__init__.py
ADDED
|
File without changes
|
src/core/interface/analyzer_interface.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from abc import ABC, abstractmethod
from typing import List

from src.models.scrape_models import ScrapeResult
from src.models.analyzer_models import AnalyzerResult


class AnalyzerInterface(ABC):
    """Contract for analyzers that turn scraped search results into an answer."""

    @abstractmethod
    def analyze_search_result(
        self, query: str, search_result: List[ScrapeResult]
    ) -> AnalyzerResult:
        """
        Analyzes the provided search results based on the given query.

        Args:
            query (str): The search query string.
            search_result (List[ScrapeResult]): A list of search results to be analyzed.

        Returns:
            AnalyzerResult: The result of the analysis.

        Raises:
            NotImplementedError: If the method is not implemented by a subclass.
        """
        # BUG FIX: the original signature omitted ``self``, so the first
        # positional argument would have been bound to the instance when
        # called on an implementation.
        raise NotImplementedError
|
src/core/interface/scraper_interface.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from abc import ABC, abstractmethod

from src.models.scrape_models import ScrapeQuery, ScrapeResult


class ScraperInterface(ABC):
    """Contract for scrapers that fetch and extract content from a URL."""

    @abstractmethod
    def get_url_content(self, url_parameters: ScrapeQuery) -> ScrapeResult:
        """
        Fetches the content of the specified URL and returns the result as a ScrapeResult object.

        Args:
            url_parameters (ScrapeQuery): The URL parameters to fetch content from.

        Returns:
            ScrapeResult: An object containing the scraped content and related metadata.

        Raises:
            NotImplementedError: This method should be implemented by subclasses.
        """
        # BUG FIX: the original signature omitted ``self``, so the first
        # positional argument would have been bound to the instance when
        # called on an implementation.
        raise NotImplementedError
|
src/core/interface/searcher_interface.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from abc import ABC, abstractmethod
from typing import List, Optional

from src.models.search_models import SearchResult


class SearchInterface(ABC):
    """Contract for searchers that run a query against sites or domains."""

    @abstractmethod
    def search_custom_sites(
        self, query: str, sites: Optional[List[str]] = None
    ) -> SearchResult:
        """
        Searches for the given query across a list of custom sites.

        Args:
            query (str): The search query string.
            sites (Optional[List[str]], optional): A list of site URLs or identifiers to restrict the search to.
                If None, searches all available custom sites.

        Returns:
            SearchResult: An object containing the search results.

        Raises:
            NotImplementedError: This method must be implemented by subclasses.
        """
        # BUG FIX: the original signature omitted ``self``, so the first
        # positional argument would have been bound to the instance when
        # called on an implementation.
        raise NotImplementedError

    @abstractmethod
    def search_custom_domains(
        self, query: str, domains: Optional[List[str]] = None
    ) -> SearchResult:
        """
        Searches for the given query within the specified custom domains.

        Args:
            query (str): The search query string.
            domains (Optional[List[str]], optional): The custom domains to restrict the search to. Defaults to None.

        Returns:
            SearchResult: The result of the search operation.

        Raises:
            NotImplementedError: If the method is not implemented.
        """
        # BUG FIX: ``self`` added here as well; the docstring also documented
        # a non-existent ``domain`` parameter (actual name: ``domains``).
        raise NotImplementedError
|
src/core/types.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from enum import Enum


# NOTE: the ``: str`` annotations on members below are load-bearing — the
# factory classes build their error messages by iterating
# ``<EnumType>.__annotations__``; removing an annotation silently drops the
# member from those messages.
class SearcherType(str, Enum):
    """Supported searcher implementations (values are factory selector strings)."""

    OPEN_GOOGLE_SEARCH: str = "Open Google Search"


class ScraperType(str, Enum):
    """Supported scraper implementations (values are factory selector strings)."""

    TRAFILATURA_SCRAPER: str = "trafilatura_scraper"


class AnalyzerType(str, Enum):
    """Supported analyzer implementations (values are factory selector strings)."""

    OPENAI_ANALYZER: str = "openai_analyzer"


# DEFAULT VALUES AND CONSTANTS

# System prompt sent to the LLM analyzer: instructs it to answer strictly
# from the supplied scrape results and to emit url_citation entries with
# character offsets into its response.
DEFAULT_SYSTEM_PROMPT = """You are an intelligent assistant designed to answer user questions strictly based on the provided list of search result items. Each item includes a title, description, content, and URL. You must not use external knowledge or make assumptions beyond what is explicitly available in the search results.

Your task is to generate a concise and informative response to the user’s query, ensuring that any factual claims in your answer are supported by specific excerpts from the `ScrapeResult` list. For each piece of information used from a scrape result, create a corresponding `Citation` object.

You must return the result in the form of an `AnalyzerResult`, which includes:

- `response_str`: The complete response text.
- `citation`: A list of `Citation` entries referencing the exact part of the `response_str` that came from the scraped content.

Each `Citation` must include:
- `citation_type`: Always "url_citation".
- `url`: The source URL as provided.
- `start_index` and `end_index`: The exact character indices of the corresponding information in the `response_str`.

Only include citations for parts that directly come from the `ScrapeResult`.

Do not fabricate information. If the scraped results do not contain enough detail to fully answer the question, mention that in your answer.
"""

# User-prompt template; callers fill {query} and {scrape_results} via
# str.format before sending.
DEFAULT_USER_PROMPT = """
"query": {query},
"scrape_results": {scrape_results}
"""

# Default OpenAI model name used by the analyzer when none is specified.
DEFAULT_OPENAI_ANALYZER = "gpt-4o-mini"
|