paijo77 committed on
Commit
4a2f950
·
verified ·
1 Parent(s): 0e9cd60

update app/hunter/strategies/search.py

Browse files
Files changed (1) hide show
  1. app/hunter/strategies/search.py +100 -0
app/hunter/strategies/search.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import urllib.parse
3
+ from typing import List
4
+ import aiohttp
5
+ from bs4 import BeautifulSoup
6
+ from sqlalchemy.ext.asyncio import AsyncSession
7
+
8
+ from app.hunter.strategy import BaseStrategy
9
+ from app.db_storage import db_storage
10
+ from app.database import get_db
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class SearchStrategy(BaseStrategy):
    """Discover candidate config/proxy URLs by "dorking" DuckDuckGo.

    Search queries are issued through working HTTP proxies taken from the
    database, rotating to a different proxy when a request fails.
    """

    SEARCH_URL = "https://html.duckduckgo.com/html/"

    # Desktop Chrome UA so the HTML endpoint serves a full results page.
    _USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )

    # How many different proxies to try per query before giving up.
    _MAX_PROXY_ATTEMPTS = 5

    @property
    def name(self) -> str:
        """Strategy identifier used by the hunter service."""
        return "search"

    async def discover(self) -> List[str]:
        """Run all search dorks and return de-duplicated result URLs.

        A DB session is required to fetch proxies; one is obtained from the
        ``get_db`` async generator and reused for every query.

        Returns:
            Unique result URLs, in first-seen order.
        """
        found_urls: List[str] = []

        queries = [
            'site:pastebin.com "vmess://"',
            'site:github.com "clash config" "proxies"',
            'intitle:"proxy list" "ss://"',
        ]

        async for session in get_db():
            for query in queries:
                found_urls.extend(await self._search_with_rotation(session, query))
            break  # Just need one session context

        # dict.fromkeys de-duplicates while keeping a deterministic,
        # first-seen order (a bare set() would randomize ordering).
        return list(dict.fromkeys(found_urls))

    async def _search_with_rotation(
        self, session: AsyncSession, query: str
    ) -> List[str]:
        """Try *query* through up to ``_MAX_PROXY_ATTEMPTS`` proxies.

        Args:
            session: Open async DB session used to fetch random proxies.
            query: DuckDuckGo search query string.

        Returns:
            Result URLs from the first proxy that succeeds, else ``[]``.
        """
        for _ in range(self._MAX_PROXY_ATTEMPTS):
            proxy = await db_storage.get_random_proxy(
                session=session,
                is_working=True,
                protocol="http",  # aiohttp prefers http/https proxies
            )

            if not proxy:
                logger.warning("No working proxies available for search strategy")
                return []

            proxy_url = f"{proxy.protocol}://{proxy.ip}:{proxy.port}"

            try:
                return await self._execute_search(query, proxy_url)
            except Exception as e:
                # Lazy %-args: the message is only built when DEBUG is on.
                logger.debug("Search failed with proxy %s: %s", proxy.ip, e)
                # Continue to next proxy

        return []

    async def _execute_search(self, query: str, proxy_url: str) -> List[str]:
        """POST *query* to the DuckDuckGo HTML endpoint via *proxy_url*.

        Raises on any non-200 status or network error so the caller can
        rotate to another proxy.
        """
        headers = {"User-Agent": self._USER_AGENT}
        data = {"q": query}
        # Plain-int timeouts are deprecated in aiohttp; use ClientTimeout.
        timeout = aiohttp.ClientTimeout(total=10)

        async with aiohttp.ClientSession(timeout=timeout) as http:
            async with http.post(
                self.SEARCH_URL, data=data, headers=headers, proxy=proxy_url
            ) as resp:
                if resp.status != 200:
                    raise Exception(f"Status {resp.status}")

                html = await resp.text()
                return self._parse_duckduckgo(html)

    def _parse_duckduckgo(self, html: str) -> List[str]:
        """Extract real destination URLs from a DuckDuckGo results page.

        Result anchors (``.result__a``) wrap the destination in a redirect
        link of the form ``/l/?kh=-1&uddg=https%3A%2F%2F...``; the target is
        recovered from the ``uddg`` query parameter (``parse_qs`` unquotes
        it). Links without ``uddg=`` are skipped.
        """
        soup = BeautifulSoup(html, "html.parser")
        urls = []
        for link in soup.select(".result__a"):
            href = link.get("href")
            if href:
                # DuckDuckGo redirect link
                # format: /l/?kh=-1&uddg=https%3A%2F%2Fpastebin.com%2F...
                if "uddg=" in href:
                    parsed = urllib.parse.urlparse(href)
                    qs = urllib.parse.parse_qs(parsed.query)
                    real_url = qs.get("uddg", [None])[0]
                    if real_url:
                        urls.append(real_url)
        return urls