paijo77 committed on
Commit
fafa4d5
·
verified ·
1 Parent(s): a3f299b

update app/hunter/service.py

Browse files
Files changed (1) hide show
  1. app/hunter/service.py +141 -0
app/hunter/service.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import logging
from datetime import datetime
from typing import Any, List, Optional

import aiohttp
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession

from app.database import get_db
from app.db_models import CandidateSource, ProxySource
from app.hunter.extractor import UniversalExtractor
from app.hunter.strategies.ai import AIStrategy
from app.hunter.strategies.github import GitHubStrategy
from app.hunter.strategies.search import SearchStrategy

logger = logging.getLogger(__name__)
17
+
18
+
19
class HunterService:
    """Discovers new proxy-source URLs via pluggable strategies and stores them.

    Each strategy's ``discover()`` coroutine yields candidate URLs; new URLs
    are fetched, scored heuristically, and persisted as ``CandidateSource``
    rows for later review.
    """

    def __init__(self):
        # Strategies are run concurrently in run_hunt(); each is expected to
        # expose a .name attribute and an async .discover() returning URLs.
        self.strategies = [GitHubStrategy(), AIStrategy(), SearchStrategy()]

    async def run_hunt(self):
        """
        Execute all discovery strategies and process results.

        A failing strategy is logged and skipped; URLs are de-duplicated
        (per URL + discovering strategy) before being persisted.
        """
        logger.info("Starting Hunter Protocol...")

        discovered_urls = set()

        # 1. Run all strategies concurrently. return_exceptions=True keeps
        # one failing strategy from cancelling the others.
        tasks = [strategy.discover() for strategy in self.strategies]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        for strategy, result in zip(self.strategies, results):
            strategy_name = strategy.name
            if isinstance(result, Exception):
                logger.error(f"Strategy {strategy_name} failed: {result}")
                continue

            if result:
                logger.info(f"Strategy {strategy_name} found {len(result)} URLs")
                for url in result:
                    discovered_urls.add((url, strategy_name))

        # 2. Process unique URLs in a single session, committed once at the end.
        logger.info(f"Total unique candidates found: {len(discovered_urls)}")

        async for session in get_db():
            for url, method in discovered_urls:
                await self.process_candidate(session, url, method)
            await session.commit()

        logger.info("Hunter Protocol complete.")

    async def process_candidate(self, session: AsyncSession, url: str, method: str):
        """
        Check if URL is new, save it, and score it.

        Args:
            session: Open async DB session; the caller is responsible for commit.
            url: Candidate source URL to evaluate.
            method: Name of the strategy that discovered the URL.
        """
        # Skip URLs already recorded as candidates.
        stmt = select(CandidateSource).where(CandidateSource.url == url)
        result = await session.execute(stmt)
        if result.scalar_one_or_none():
            logger.debug(f"Candidate already exists: {url}")
            return

        # Skip URLs already promoted to active sources.
        stmt = select(ProxySource).where(ProxySource.url == url)
        result = await session.execute(stmt)
        if result.scalar_one_or_none():
            logger.debug(f"Source already active: {url}")
            return

        # Fetch and analyze; any network/parse failure only skips this URL.
        try:
            content = await self._fetch_content(url)
            proxies = UniversalExtractor.extract_proxies(content, source_url=url)

            confidence = self._calculate_confidence(url, proxies)

            # Save
            candidate = CandidateSource(
                url=url,
                domain=self._extract_domain(url),
                discovery_method=method,
                status="pending",
                confidence_score=confidence,
                proxies_found_count=len(proxies),
                # NOTE(review): naive UTC timestamp; datetime.utcnow() is
                # deprecated since Python 3.12 -- migrate to
                # datetime.now(timezone.utc) once the column is confirmed
                # timezone-aware.
                last_checked_at=datetime.utcnow(),
            )
            session.add(candidate)
            logger.info(
                f"Added candidate: {url} (Score: {confidence}, Proxies: {len(proxies)})"
            )

        except Exception as e:
            logger.warning(f"Failed to process candidate {url}: {str(e)}")
            # We might still save it as 'failed' or 'pending' retry?
            # For now, skip invalid URLs to keep DB clean.

    async def _fetch_content(self, url: str) -> str:
        """Download *url* and return the response body as text.

        Raises:
            Exception: on any non-200 HTTP status or client/network error.
        """
        # aiohttp deprecated passing a bare number as ``timeout``; use an
        # explicit ClientTimeout (10s total budget for the whole request).
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as resp:
                if resp.status != 200:
                    raise Exception(f"HTTP {resp.status}")
                return await resp.text()

    def _calculate_confidence(self, url: str, proxies: List[Any]) -> int:
        """Heuristically score a candidate source on a 0-100 scale.

        Combines domain trust, proxy volume, and protocol diversity.
        ``proxies`` items are expected to expose a ``.protocol`` attribute.
        (Fixed annotation: ``List[any]`` referenced the builtin function and
        fails at import time on Python <= 3.10.)
        """
        score = 0

        # Domain Trust: GitHub-hosted lists are the most reliable.
        if "github.com" in url or "raw.githubusercontent.com" in url:
            score += 20
        elif "pastebin.com" in url:
            score += 10

        # Content Volume: tiered bonus for larger lists.
        count = len(proxies)
        if count > 0:
            score += 10
        if count > 50:
            score += 20
        if count > 500:
            score += 20

        # Protocol Diversity: mixed and modern protocols score higher.
        protocols = {p.protocol for p in proxies}
        if len(protocols) > 1:
            score += 10
        if "vmess" in protocols or "vless" in protocols:
            score += 10

        return min(score, 100)

    def _extract_domain(self, url: str) -> str:
        """Return the network location of *url*, or "" if it cannot be parsed."""
        from urllib.parse import urlparse

        try:
            return urlparse(url).netloc
        # Narrowed from a bare ``except:`` which would also swallow
        # KeyboardInterrupt/SystemExit.
        except Exception:
            return ""