# NOTE: recovered from a Hugging Face Spaces deployment of CognitiveKernel-Launchpad
# ("feat: complete Hugging Face Spaces deployment with production-ready CognitiveKernel-Launchpad").
"""
Google Search Engine implementation for CognitiveKernel-Pro
Embedded anti-bot bypass techniques from googlesearch library
"""
import random
import time
from typing import List, Generator
from urllib.parse import unquote

from .base import BaseSearchEngine, SearchResult, SearchEngine, SearchEngineError

# Hard third-party dependencies: fail loudly at import time with an
# actionable installation hint instead of a bare ImportError later on.
try:
    import requests
    from bs4 import BeautifulSoup
except ImportError as e:
    raise SearchEngineError(
        "Required dependencies not installed. Install with: pip install requests beautifulsoup4"
    ) from e
| def _get_random_user_agent() -> str: | |
| """Generate random Lynx-based user agent to avoid detection""" | |
| lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}" | |
| libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}" | |
| ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}" | |
| openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}" | |
| return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}" | |
def _google_search_request(query: str, num_results: int, timeout: int = 10) -> requests.Response:
    """Issue a single Google web-search GET request with anti-bot measures.

    Args:
        query: Raw search query string.
        num_results: Desired number of results; two extra are requested so
            downstream filtering can still fill the quota.
        timeout: Per-request timeout in seconds.

    Returns:
        The successful ``requests.Response`` (non-2xx raises via
        ``raise_for_status``).
    """
    headers = {
        "User-Agent": _get_random_user_agent(),
        "Accept": "*/*",
    }
    params = {
        "q": query,
        "num": num_results + 2,  # over-fetch to absorb filtered-out entries
        "hl": "en",
        "gl": "us",
        "safe": "off",
    }
    # Pre-seeded consent cookies skip Google's consent interstitial page.
    cookies = {
        'CONSENT': 'PENDING+987',
        'SOCS': 'CAESHAgBEhIaAB',
    }
    response = requests.get(
        url="https://www.google.com/search",
        headers=headers,
        params=params,
        timeout=timeout,
        verify=True,
        cookies=cookies,
    )
    response.raise_for_status()
    return response
def _parse_google_results(html: str) -> Generator[SearchResult, None, None]:
    """Yield SearchResult items parsed from a Google results page.

    Fixes over the previous version: the ``if link_tag else None`` guard was
    dead code (a ``continue`` already ensured ``link_tag`` is set), and the
    ``"No title"`` fallback was unreachable because the branch required a
    truthy ``title_tag``. Yield behavior is unchanged: blocks without a link
    or title are skipped; a missing description falls back to a placeholder.

    Args:
        html: Raw HTML of a Google search results page.

    Yields:
        SearchResult with cleaned title, decoded URL, and description.
    """
    soup = BeautifulSoup(html, "html.parser")
    # "ezO2md" is the container class Google currently uses per organic hit
    # for this (Lynx-style) user agent — NOTE(review): subject to markup drift.
    for block in soup.find_all("div", class_="ezO2md"):
        link_tag = block.find("a", href=True)
        if not link_tag:
            continue
        title_tag = link_tag.find("span", class_="CVA68e")
        if not title_tag:
            # Link without a recognizable title (ads, widgets) — skip it.
            continue
        raw_url = link_tag["href"]
        # Google wraps targets as "/url?q=<real-url>&...": strip the wrapper
        # and percent-decode; direct URLs pass through untouched.
        if raw_url.startswith("/url?q="):
            url = unquote(raw_url.split("&")[0].replace("/url?q=", ""))
        else:
            url = raw_url
        description_tag = block.find("span", class_="FrIlee")
        description = description_tag.text.strip() if description_tag else "No description"
        yield SearchResult(title=title_tag.text.strip(), url=url, description=description)
class GoogleSearchEngine(BaseSearchEngine):
    """Google Search implementation with embedded anti-bot bypass techniques."""

    def __init__(self, max_results: int = 7, sleep_interval: float = 0.5):
        super().__init__(max_results)
        # Post-search pause (seconds) to stay under Google's rate limits.
        self.sleep_interval = sleep_interval

    def engine_type(self) -> SearchEngine:
        """Identify this engine as the Google backend."""
        return SearchEngine.GOOGLE

    def search(self, query: str) -> List[SearchResult]:
        """Run a Google search and return up to ``max_results`` hits.

        Args:
            query: Search query string (must be non-empty after stripping).

        Returns:
            List of SearchResult objects, at most ``self.max_results`` long.

        Raises:
            SearchEngineError: On empty query, network failure, anti-bot
                blocking, or any other parsing/request failure — LET IT CRASH!
        """
        if not query or not query.strip():
            raise SearchEngineError("Query cannot be empty")
        try:
            response = _google_search_request(
                query=query.strip(),
                num_results=self.max_results,
                timeout=10,
            )
            # Parse, then cap at the caller's requested count.
            hits = list(_parse_google_results(response.text))[: self.max_results]
            # Brief pause so back-to-back searches don't trip rate limiting.
            if self.sleep_interval > 0:
                time.sleep(self.sleep_interval)
            return hits
        except requests.RequestException as e:
            # Transport-level failure (DNS, timeout, non-2xx status).
            raise SearchEngineError(f"Google search network error: {str(e)}") from e
        except Exception as e:
            # Distinguish anti-bot blocking from generic failures by message.
            lowered = str(e).lower()
            block_markers = ('blocked', 'captcha', 'unusual traffic', 'rate limit', 'consent')
            if any(marker in lowered for marker in block_markers):
                raise SearchEngineError(
                    f"Google blocked the request (anti-bot protection): {str(e)}. "
                    "Try increasing sleep_interval or using a proxy."
                ) from e
            raise SearchEngineError(f"Google search failed: {str(e)}") from e