Add Searcher, Scraper and Analyzer functionality with custom domain and site search tools

Files changed:
- src/analyzer/openai_analyzer.py  +46 -0
- src/models/__init__.py  +0 -0
- src/models/analyzer_models.py  +16 -0
- src/models/scrape_models.py  +15 -0
- src/models/search_models.py  +14 -0
- src/scraper/__init__.py  +0 -0
- src/scraper/trafilatura_scraper.py  +33 -0
- src/searcher/__init__.py  +0 -0
- src/searcher/open_google_search.py  +93 -0
- src/tools/__init__.py  +0 -0
- src/tools/custom_domains_search_tool.py  +52 -0
- src/tools/custom_sites_search_tool.py  +53 -0
- src/tools/search_on_web_tool.py  +46 -0
- src/utils/url_validator.py  +13 -0
src/analyzer/openai_analyzer.py
ADDED
@@ -0,0 +1,46 @@
+
+from typing import List
+from openai import OpenAI
+
+from src.core.types import DEFAULT_OPENAI_ANALYZER, DEFAULT_SYSTEM_PROMPT, DEFAULT_USER_PROMPT
+
+from src.models.analyzer_models import AnalyzerResult
+from src.models.scrape_models import ScrapeResult
+
+from src.core.interface.analyzer_interface import AnalyzerInterface
+
+
+class OpenaiAnalyzer(AnalyzerInterface):
+    def __init__(self, api_key, model_name=DEFAULT_OPENAI_ANALYZER):
+        self.client = OpenAI(api_key=api_key)
+        self.model_name = model_name
+
+    def analyze_search_result(self, query: str, search_results: List[ScrapeResult]) -> AnalyzerResult:
+        """
+        Analyzes the provided search results based on the given query.
+        Args:
+            query (str): The search query string.
+            search_results (List[ScrapeResult]): A list of search results to be analyzed.
+        Returns:
+            AnalyzerResult: The result of the analysis.
+        Raises:
+            Exception: If the analysis request fails.
+        """
+        try:
+            user_prompt = DEFAULT_USER_PROMPT.replace("query", query).replace("scrape_results", f"{search_results}")
+            completion = self.client.beta.chat.completions.parse(model=self.model_name,
+                                                                 messages=[
+                                                                     {
+                                                                         "role": "system",
+                                                                         "content": DEFAULT_SYSTEM_PROMPT
+                                                                     },
+                                                                     {
+                                                                         "role": "user",
+                                                                         "content": user_prompt
+                                                                     }
+                                                                 ],
+                                                                 response_format=AnalyzerResult)
+            response = completion.choices[0].message.parsed
+            return response
+        except Exception as e:
+            raise Exception(f"Error while analyzing search result: {str(e)}")
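For reference, a minimal usage sketch of the analyzer on its own. It assumes an OpenAI key in the environment and that DEFAULT_USER_PROMPT contains the literal substrings "query" and "scrape_results", since the plain str.replace substitution above depends on them:

    import os

    from src.analyzer.openai_analyzer import OpenaiAnalyzer
    from src.models.scrape_models import ScrapeResult

    analyzer = OpenaiAnalyzer(api_key=os.environ["OPENAI_API_KEY"])
    result = analyzer.analyze_search_result(
        query="generative AI",
        search_results=[
            ScrapeResult(
                url="https://example.com",
                content="Example article text ...",
                title="Example",
                description="Example page",
            )
        ],
    )
    print(result.response_str)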
src/models/__init__.py
ADDED
(empty file)
src/models/analyzer_models.py
ADDED
@@ -0,0 +1,16 @@
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+
+class Citation(BaseModel):
+    citation_type: str = Field(description="Citation type.")
+    url: str = Field(description="Citation URL.")
+    start_index: int = Field(description="Citation start index in response.")
+    end_index: int = Field(description="Citation end index in response.")
+
+
+class AnalyzerResult(BaseModel):
+    response_str: str = Field(description="Final response string.")
+    citation: Optional[List[Citation]] = Field(
+        default=[], description="Citations supporting the response."
+    )
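A hand-built instance, to show the intended shape (the values are illustrative only):

    from src.models.analyzer_models import AnalyzerResult, Citation

    result = AnalyzerResult(
        response_str="Generative AI refers to ...",
        citation=[
            Citation(citation_type="url", url="https://example.com", start_index=0, end_index=24)
        ],
    )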
src/models/scrape_models.py
ADDED
@@ -0,0 +1,15 @@
+from typing import Optional
+from pydantic import BaseModel, Field
+
+from src.models.search_models import SearchItemResult
+
+
+class ScrapeQuery(SearchItemResult):
+    pass
+
+
+class ScrapeResult(BaseModel):
+    url: str = Field(description="URL.")
+    content: Optional[str] = Field("", description="URL content.")
+    title: str = Field(description="Title of result item.")
+    description: str = Field(description="Description of result item.")
src/models/search_models.py
ADDED
@@ -0,0 +1,14 @@
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+
+class SearchItemResult(BaseModel):
+    url: str = Field(description="URL of result item.")
+    title: str = Field(description="Title of result item.")
+    description: str = Field(description="Description of result item.")
+
+
+class SearchResult(BaseModel):
+    items: Optional[List[SearchItemResult]] = Field(
+        default=[], description="Search result items."
+    )
src/scraper/__init__.py
ADDED
(empty file)
src/scraper/trafilatura_scraper.py
ADDED
@@ -0,0 +1,33 @@
+from src.core.interface.scraper_interface import ScraperInterface
+from src.models.scrape_models import ScrapeQuery, ScrapeResult
+
+import trafilatura
+
+
+class TrafilaturaScraper(ScraperInterface):
+    def __init__(self):
+        pass
+
+    def get_url_content(self, url_parameters: ScrapeQuery) -> ScrapeResult:
+        """
+        Fetches and extracts the main textual content from the specified URL using trafilatura.
+        Args:
+            url_parameters (ScrapeQuery): The URL parameters of the web page to scrape.
+        Returns:
+            ScrapeResult: An object containing the extracted content from the URL.
+        Raises:
+            Exception: If an error occurs during fetching or extraction.
+        """
+        try:
+            downloaded = trafilatura.fetch_url(url_parameters.url)
+            result = trafilatura.extract(downloaded)
+
+            return ScrapeResult(
+                content=result,
+                url=url_parameters.url,
+                title=url_parameters.title,
+                description=url_parameters.description,
+            )
+
+        except Exception as e:
+            raise Exception(f"Error occurred while getting url content: {str(e)}")
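A quick sketch of the scraper in isolation (assumes the trafilatura package is installed and the URL is reachable; ScrapeQuery simply reuses the url/title/description fields of a search item):

    from src.models.scrape_models import ScrapeQuery
    from src.scraper.trafilatura_scraper import TrafilaturaScraper

    scraper = TrafilaturaScraper()
    page = scraper.get_url_content(
        url_parameters=ScrapeQuery(
            url="https://example.com", title="Example", description="Example domain"
        )
    )
    print(page.content)  # may be None when trafilatura extracts nothing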
src/searcher/__init__.py
ADDED
(empty file)
src/searcher/open_google_search.py
ADDED
@@ -0,0 +1,93 @@
+from typing import List, Optional
+from googlesearch import search
+
+from src.core.interface.searcher_interface import SearchInterface
+from src.models.search_models import SearchItemResult, SearchResult
+
+
+class GoogleSearch(SearchInterface):
+    def __init__(self):
+        pass
+
+    def search_custom_sites(
+        self, query: str, sites: Optional[list] = None
+    ) -> SearchResult:
+        """
+        Performs a Google search restricted to a dynamic list of specific sites.
+
+        Args:
+            query (str): The user's search query (e.g., "generative AI").
+            sites (list): A list of websites to search within (e.g., ['wired.com', 'theverge.com']).
+
+        Returns:
+            SearchResult: The search results; empty if nothing was found.
+        """
+        try:
+            # 1. Construct the dynamic query string
+            # Joins the sites with " OR " and formats them with the "site:" operator
+            site_restriction = (
+                " OR ".join([f"site:{site}" for site in sites]) if sites else ""
+            )
+            full_query = f"{query} {site_restriction}"
+
+            # 2. Execute the search
+            result = search(full_query, num_results=5, advanced=True)
+
+            # 3. Collect the returned results
+            items = [
+                SearchItemResult(
+                    url=item.url, title=item.title, description=item.description
+                )
+                for item in result
+            ]
+
+            urls = [item.url for item in items if item.url]
+            if not urls:
+                return SearchResult(items=[])
+
+            return SearchResult(items=items)
+
+        except Exception as e:
+            raise Exception(f"An error occurred while searching in Google: {str(e)}")
+
+    def search_custom_domains(
+        self, query: str, domains: Optional[List[str]] = None
+    ) -> SearchResult:
+        """
+        Performs a Google search restricted to a list of custom domains.
+
+        Args:
+            query (str): The user's search query (e.g., "generative AI").
+            domains (List[str]): A list of domains to search within (e.g., ['edu']).
+
+        Returns:
+            SearchResult: The search results; empty if nothing was found.
+        """
+        try:
+            # 1. Construct the dynamic query string
+            domain_restriction = (
+                " OR ".join([f"site:{domain}" for domain in domains])
+                if domains
+                else ""
+            )
+            full_query = f"{query} {domain_restriction}"
+
+            # 2. Execute the search
+            result = search(full_query, num_results=5, advanced=True)
+
+            # 3. Collect the returned results
+            items = [
+                SearchItemResult(
+                    url=item.url, title=item.title, description=item.description
+                )
+                for item in result
+            ]
+
+            urls = [item.url for item in items if item.url]
+            if not urls:
+                return SearchResult(items=[])
+
+            return SearchResult(items=items)
+
+        except Exception as e:
+            raise Exception(f"An error occurred while searching in Google: {str(e)}")
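The searcher can also be exercised directly. This sketch assumes the googlesearch-python package, whose search(..., advanced=True) yields result objects with url, title and description attributes:

    from src.searcher.open_google_search import GoogleSearch

    searcher = GoogleSearch()
    news = searcher.search_custom_sites(
        query="generative AI", sites=["wired.com", "theverge.com"]
    )
    papers = searcher.search_custom_domains(query="generative AI", domains=["edu"])
    for item in news.items:
        print(item.url, "-", item.title)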
src/tools/__init__.py
ADDED
(empty file)
src/tools/custom_domains_search_tool.py
ADDED
@@ -0,0 +1,52 @@
+from typing import List, Optional
+
+from src.core.factory.analyzer_facrory import AnalyzerFactory
+from src.core.factory.scraper_factory import ScraperFactory
+from src.core.factory.searcher_factory import SearcherFactory
+from src.core.types import AnalyzerType, ScraperType, SearcherType
+from src.models.analyzer_models import AnalyzerResult
+
+searcher = SearcherFactory.initialize_searcher(
+    searcher_type=SearcherType.OPEN_GOOGLE_SEARCH
+)
+scraper = ScraperFactory.initialize_scraper(ScraperType.TRAFILATURA_SCRAPER)
+analyzer = AnalyzerFactory.initialize_analyzer(AnalyzerType.OPENAI_ANALYZER)
+
+
+def search_custom_domain(query: str, domains: Optional[List[str]] = None) -> AnalyzerResult:
+    """
+    Performs a custom domain search for the given query and domains, scrapes the resulting URLs, and analyzes the search results.
+    Args:
+        query (str): The search query string. Must not be empty.
+        domains (List[str], optional): A list of domain strings to restrict the search (e.g., ['edu', 'gov']). Must not be empty.
+    Returns:
+        AnalyzerResult: The analyzed result of the search, as returned by the analyzer.
+    Raises:
+        ValueError: If the query or domains are empty.
+        Exception: Propagates any exceptions raised during the search, scraping, or analysis process.
+    """
+    try:
+        # 1. Validate search parameters
+        if not query or query.strip() == "":
+            raise ValueError("Query can't be empty.")
+
+        if not domains:
+            raise ValueError("Domains can't be empty (e.g. 'edu', 'gov').")
+
+        # 2. Run initial search
+        searcher_result = searcher.search_custom_domains(query=query, domains=domains)
+
+        # 3. Scrape search results
+        scrape_result = []
+        for item in searcher_result.items:
+            url_scrape_result = scraper.get_url_content(url_parameters=item)
+            scrape_result.append(url_scrape_result)
+
+        # 4. Analyze search results
+        final_result = analyzer.analyze_search_result(
+            query=query, search_results=scrape_result
+        )
+
+        return final_result
+    except Exception as e:
+        raise e
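With the module-level factories initialized, the tool itself is a single call (assuming the factories can build their defaults, e.g. the OpenAI analyzer finds its API key in the environment):

    from src.tools.custom_domains_search_tool import search_custom_domain

    result = search_custom_domain(query="generative AI", domains=["edu", "gov"])
    print(result.response_str)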
src/tools/custom_sites_search_tool.py
ADDED
@@ -0,0 +1,53 @@
+from typing import Optional
+
+from src.core.factory.analyzer_facrory import AnalyzerFactory
+from src.core.factory.scraper_factory import ScraperFactory
+from src.core.factory.searcher_factory import SearcherFactory
+from src.core.types import AnalyzerType, ScraperType, SearcherType
+from src.models.analyzer_models import AnalyzerResult
+from src.utils.url_validator import validate_urls
+
+searcher = SearcherFactory.initialize_searcher(
+    searcher_type=SearcherType.OPEN_GOOGLE_SEARCH
+)
+scraper = ScraperFactory.initialize_scraper(ScraperType.TRAFILATURA_SCRAPER)
+analyzer = AnalyzerFactory.initialize_analyzer(AnalyzerType.OPENAI_ANALYZER)
+
+
+def search_custom_sites(query: str, sites: Optional[list] = None) -> AnalyzerResult:
+    """
+    Performs a custom site search, scrapes the resulting URLs, and analyzes the content.
+    Args:
+        query (str): The search query string. Must not be empty.
+        sites (Optional[list], optional): A list of site URLs to restrict the search to. Defaults to None.
+    Returns:
+        AnalyzerResult: The analyzed result of the scraped search results.
+    Raises:
+        ValueError: If the query is empty or any site URL is invalid.
+        Exception: Propagates any exception raised during validation, searching, scraping, or analysis.
+    """
+
+    try:
+        # 1. Validate search parameters
+        if not query or query.strip() == "":
+            raise ValueError("Query can't be empty.")
+
+        validate_urls(urls=sites)
+
+        # 2. Run initial search
+        searcher_result = searcher.search_custom_sites(query=query, sites=sites)
+
+        # 3. Scrape search results
+        scrape_result = []
+        for item in searcher_result.items:
+            url_scrape_result = scraper.get_url_content(url_parameters=item)
+            scrape_result.append(url_scrape_result)
+
+        # 4. Analyze search results
+        final_result = analyzer.analyze_search_result(
+            query=query, search_results=scrape_result
+        )
+
+        return final_result
+    except Exception as e:
+        raise e
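Same pattern for the site-restricted tool. Note that validate_urls (see src/utils/url_validator.py below) relies on validators.url, which accepts only absolute URLs with a scheme, so bare hostnames like "wired.com" would be rejected here:

    from src.tools.custom_sites_search_tool import search_custom_sites

    result = search_custom_sites(
        query="generative AI", sites=["https://wired.com", "https://theverge.com"]
    )
    print(result.response_str)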
src/tools/search_on_web_tool.py
ADDED
@@ -0,0 +1,46 @@
+from src.core.factory.analyzer_facrory import AnalyzerFactory
+from src.core.factory.scraper_factory import ScraperFactory
+from src.core.factory.searcher_factory import SearcherFactory
+from src.core.types import AnalyzerType, ScraperType, SearcherType
+from src.models.analyzer_models import AnalyzerResult
+
+searcher = SearcherFactory.initialize_searcher(
+    searcher_type=SearcherType.OPEN_GOOGLE_SEARCH
+)
+scraper = ScraperFactory.initialize_scraper(ScraperType.TRAFILATURA_SCRAPER)
+analyzer = AnalyzerFactory.initialize_analyzer(AnalyzerType.OPENAI_ANALYZER)
+
+
+def search_on_web(query: str) -> AnalyzerResult:
+    """
+    Performs a general search on the web, scrapes the resulting URLs, and analyzes the search results.
+    Args:
+        query (str): The search query string. Must not be empty.
+    Returns:
+        AnalyzerResult: The analyzed result of the search, as returned by the analyzer.
+    Raises:
+        ValueError: If the query is empty.
+        Exception: Propagates any exceptions raised during the search, scraping, or analysis process.
+    """
+    try:
+        # 1. Validate search parameters
+        if not query or query.strip() == "":
+            raise ValueError("Query can't be empty.")
+
+        # 2. Run initial search (no domain restriction, so this is a plain web search)
+        searcher_result = searcher.search_custom_domains(query=query)
+
+        # 3. Scrape search results
+        scrape_result = []
+        for item in searcher_result.items:
+            url_scrape_result = scraper.get_url_content(url_parameters=item)
+            scrape_result.append(url_scrape_result)
+
+        # 4. Analyze search results
+        final_result = analyzer.analyze_search_result(
+            query=query, search_results=scrape_result
+        )
+
+        return final_result
+    except Exception as e:
+        raise e
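And the unrestricted variant, under the same environment assumptions as above:

    from src.tools.search_on_web_tool import search_on_web

    result = search_on_web(query="latest developments in generative AI")
    print(result.response_str)
    for c in result.citation or []:
        print(c.url)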
src/utils/url_validator.py
ADDED
@@ -0,0 +1,13 @@
+from typing import List
+import validators
+
+
+def validate_url(url: str):
+    if not validators.url(url):
+        raise ValueError("Provided url is not valid.")
+
+
+def validate_urls(urls: List[str]):
+    for url in urls or []:
+        if not validators.url(url):
+            raise ValueError(f"Provided url: {url} is not valid.")
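A quick check of the validator's behavior; as noted, validators.url treats scheme-less hostnames as invalid:

    from src.utils.url_validator import validate_url, validate_urls

    validate_urls(["https://wired.com", "https://theverge.com"])  # passes silently
    validate_url("wired.com")  # raises ValueError: no scheme, not a valid URL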