import asyncio
from urllib.parse import quote

import aiohttp
import requests

from config import CONTENT_EXTRACTION, SEARCH_SELECTION
from src.core.web_loader import web_loader


class BrowserEngine:
    """Fetches page content and search results through a content-reader API,
    using async aiohttp paths with blocking requests-based fallbacks."""

    def __init__(self, configuration):
        self.config = configuration
    def generate_headers(self):
        """Build a browser-like header set from values supplied by web_loader."""
        ipv4 = web_loader.get_ipv4()
        ipv6 = web_loader.get_ipv6()
        user_agent = web_loader.get_user_agent()
        origin = web_loader.get_origin()
        referrer = web_loader.get_referrer()
        location = web_loader.get_location()

        return {
            "User-Agent": user_agent,
            # The forwarded-IP headers all report the same generated client address.
            "X-Forwarded-For": f"{ipv4}, {ipv6}",
            "X-Real-IP": ipv4,
            "X-Originating-IP": ipv4,
            "X-Remote-IP": ipv4,
            "X-Remote-Addr": ipv4,
            "X-Client-IP": ipv4,
            "X-Forwarded-Host": origin.replace("https://", "").replace("http://", ""),
            "Origin": origin,
            "Referer": referrer,
            "Accept-Language": f"{location['language']},en;q=0.9",
            "Accept": "application/json,text/html,application/xhtml+xml,application/xml;q=0.9,image/*,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "cross-site",
            "Sec-Fetch-User": "?1",
            "Cache-Control": "max-age=0",
            "X-Country": location["country"],
            "X-Timezone": location["timezone"],
        }

    def _build_search_url_and_selector(self, search_query: str, search_provider: str = "google"):
        """Return (search URL, CSS selector for the results container) for the given provider."""
        if search_provider == "baidu":
            return (
                f"{self.config.content_reader_api}{self.config.baidu_endpoint}?wd={quote(search_query)}",
                "#content_left",
            )
        # SearXNG bang prefixes: "!go" scopes the query to Google, "!bi" to Bing.
        provider_prefix = "!go" if search_provider == "google" else "!bi"
        return (
            f"{self.config.content_reader_api}{self.config.searxng_endpoint}?q={quote(f'{provider_prefix} {search_query}')}",
            "#urls",
        )

    async def _async_post(self, url: str, data: dict, headers: dict):
        """POST via aiohttp; raise ClientResponseError carrying the body text on HTTP errors."""
        timeout = aiohttp.ClientTimeout(total=self.config.request_timeout)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(url, data=data, headers=headers) as response:
                text = await response.text()
                if response.status >= 400:
                    raise aiohttp.ClientResponseError(
                        request_info=response.request_info,
                        history=response.history,
                        status=response.status,
                        message=text,
                        headers=response.headers,
                    )
                return text

    async def _async_get(self, url: str, headers: dict):
        """GET via aiohttp; raise ClientResponseError carrying the body text on HTTP errors."""
        timeout = aiohttp.ClientTimeout(total=self.config.request_timeout)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url, headers=headers) as response:
                text = await response.text()
                if response.status >= 400:
                    raise aiohttp.ClientResponseError(
                        request_info=response.request_info,
                        history=response.history,
                        status=response.status,
                        message=text,
                        headers=response.headers,
                    )
                return text

    def _sync_post(self, url: str, data: dict, headers: dict):
        """Blocking POST via requests; used when the async path is unavailable."""
        response = requests.post(url, data=data, headers=headers, timeout=self.config.request_timeout)
        response.raise_for_status()
        return response.text

    def _sync_get(self, url: str, headers: dict):
        """Blocking GET via requests; used when the async path is unavailable."""
        response = requests.get(url, headers=headers, timeout=self.config.request_timeout)
        response.raise_for_status()
        return response.text

    async def async_extract_page_content(self, target_url: str) -> str:
        """POST the target URL to the content-reader API and append the extraction prompt."""
        headers = self.generate_headers()
        payload = {"url": target_url}
        extracted_content = await self._async_post(self.config.content_reader_api, payload, headers)
        return f"{extracted_content}\n\n\n{CONTENT_EXTRACTION}\n\n\n"

    def extract_page_content(self, target_url: str) -> str:
        """Synchronous wrapper: try the async path, then fall back to blocking requests.

        asyncio.run() raises when called from an already-running event loop, so
        the requests-based fallback keeps this method usable in that case too.
        """
        try:
            return asyncio.run(self.async_extract_page_content(target_url))
        except Exception:
            try:
                headers = self.generate_headers()
                payload = {"url": target_url}
                extracted_content = self._sync_post(self.config.content_reader_api, payload, headers)
                return f"{extracted_content}\n\n\n{CONTENT_EXTRACTION}\n\n\n"
            except Exception as error:
                return f"Error reading URL: {error}"

    async def async_perform_search(self, search_query: str, search_provider: str = "google") -> str:
        """Run a search through the content-reader API and append the selection prompt."""
        headers = self.generate_headers()
        full_url, selector = self._build_search_url_and_selector(search_query, search_provider)
        # Tell the reader service which CSS selector holds the result list.
        headers["X-Target-Selector"] = selector
        search_results = await self._async_get(full_url, headers)
        return f"{search_results}\n\n\n{SEARCH_SELECTION}\n\n\n"

    def perform_search(self, search_query: str, search_provider: str = "google") -> str:
        """Synchronous wrapper: try the async path, then fall back to blocking requests."""
        try:
            return asyncio.run(self.async_perform_search(search_query, search_provider))
        except Exception:
            try:
                headers = self.generate_headers()
                full_url, selector = self._build_search_url_and_selector(search_query, search_provider)
                headers["X-Target-Selector"] = selector
                search_results = self._sync_get(full_url, headers)
                return f"{search_results}\n\n\n{SEARCH_SELECTION}\n\n\n"
            except Exception as error:
                return f"Error during search: {error}"