update app/hunter/strategies/search.py
Browse files- app/hunter/strategies/search.py +100 -0
app/hunter/strategies/search.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import urllib.parse
|
| 3 |
+
from typing import List
|
| 4 |
+
import aiohttp
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 7 |
+
|
| 8 |
+
from app.hunter.strategy import BaseStrategy
|
| 9 |
+
from app.db_storage import db_storage
|
| 10 |
+
from app.database import get_db
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class SearchStrategy(BaseStrategy):
    """Discover proxy-config source URLs via DuckDuckGo dork queries.

    Each search is routed through a random working proxy taken from the
    database, so repeated scraping does not get the host IP rate-limited.
    """

    SEARCH_URL = "https://html.duckduckgo.com/html/"

    # Dork queries that tend to surface pages containing proxy/VPN configs.
    QUERIES = [
        'site:pastebin.com "vmess://"',
        'site:github.com "clash config" "proxies"',
        'intitle:"proxy list" "ss://"',
    ]

    # How many different proxies to try per query before giving up.
    MAX_PROXY_ATTEMPTS = 5

    @property
    def name(self) -> str:
        """Stable identifier used to register/select this strategy."""
        return "search"

    async def discover(self) -> List[str]:
        """Run every dork query and return a de-duplicated list of result URLs.

        A DB session is needed to pick proxies. Strategies are not handed a
        session by the caller, so we pull a single one from the ``get_db``
        async generator and reuse it for all queries.
        """
        found_urls: List[str] = []

        async for session in get_db():
            for query in self.QUERIES:
                found_urls.extend(
                    await self._search_with_rotation(session, query)
                )
            break  # Just need one session context

        return list(set(found_urls))

    async def _search_with_rotation(
        self, session: AsyncSession, query: str
    ) -> List[str]:
        """Execute *query*, rotating through proxies until one succeeds.

        Returns an empty list when no working proxy is available or every
        attempt fails.
        """
        for _ in range(self.MAX_PROXY_ATTEMPTS):
            proxy = await db_storage.get_random_proxy(
                session=session,
                is_working=True,
                protocol="http",  # aiohttp prefers http/https proxies
            )

            if not proxy:
                logger.warning("No working proxies available for search strategy")
                return []

            proxy_url = f"{proxy.protocol}://{proxy.ip}:{proxy.port}"

            try:
                return await self._execute_search(query, proxy_url)
            except Exception as e:
                # A dead/blocked proxy is expected; log and try the next one.
                logger.debug(f"Search failed with proxy {proxy.ip}: {str(e)}")

        return []

    async def _execute_search(self, query: str, proxy_url: str) -> List[str]:
        """POST *query* to DuckDuckGo's HTML endpoint through *proxy_url*.

        Raises on any non-200 response or network error so the caller can
        rotate to another proxy.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        data = {"q": query}

        async with aiohttp.ClientSession() as session:
            async with session.post(
                self.SEARCH_URL,
                data=data,
                headers=headers,
                proxy=proxy_url,
                # Bare-int timeouts are deprecated in aiohttp 3.x; use an
                # explicit ClientTimeout with the same 10s total budget.
                timeout=aiohttp.ClientTimeout(total=10),
            ) as resp:
                if resp.status != 200:
                    raise Exception(f"Status {resp.status}")

                html = await resp.text()
                return self._parse_duckduckgo(html)

    def _parse_duckduckgo(self, html: str) -> List[str]:
        """Extract target URLs from a DuckDuckGo HTML results page.

        Result anchors (``.result__a``) point at a DDG redirect of the form
        ``/l/?kh=-1&uddg=https%3A%2F%2Fpastebin.com%2F...``; the real URL is
        recovered from the percent-encoded ``uddg`` query parameter.
        """
        soup = BeautifulSoup(html, "html.parser")
        urls: List[str] = []
        for link in soup.select(".result__a"):
            href = link.get("href")
            # Skip anchors without an href or without the redirect wrapper.
            if not href or "uddg=" not in href:
                continue
            parsed = urllib.parse.urlparse(href)
            qs = urllib.parse.parse_qs(parsed.query)
            real_url = qs.get("uddg", [None])[0]
            if real_url:
                urls.append(real_url)
        return urls