paijo77 committed on
Commit
a3f299b
·
verified ·
1 Parent(s): 0de63d4

update app/hunter/extractor.py

Browse files
Files changed (1) hide show
  1. app/hunter/extractor.py +106 -0
app/hunter/extractor.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import base64
3
+ import logging
4
+ from typing import List, Optional
5
+ import aiohttp
6
+ from bs4 import BeautifulSoup
7
+
8
+ from app.models.proxy import Proxy
9
+ from app.grabber.patterns import ProxyPatterns
10
+ from app.grabber.parsers import VMessParser, VLESSParser, TrojanParser, SSParser
11
+ from app.utils.base64_decoder import SubscriptionDecoder
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class UniversalExtractor:
    """Extract proxies from arbitrary text content (HTML, Base64, or raw).

    Stateless utility: every method is a classmethod, and all parsing
    failures are handled internally so callers always receive a (possibly
    empty) list of Proxy objects.
    """

    @classmethod
    def extract_proxies(
        cls, content: str, source_url: str = "discovered"
    ) -> List[Proxy]:
        """Decode *content* and return the unique proxies found in it.

        Args:
            content: Raw text — may be HTML, Base64-encoded, or plain.
            source_url: Label recorded on each proxy as its origin.

        Returns:
            De-duplicated list of Proxy objects. The dedup key includes
            the protocol so distinct protocols sharing one ip:port are
            all preserved.
        """
        proxies: List[Proxy] = []

        # 1. Strip markup first so payloads embedded in HTML survive.
        if cls._is_html(content):
            content = cls._strip_html(content)

        # 2. Optimistically try Base64 decoding (subscription feeds).
        decoded_content = cls._try_decode(content)

        # 3. Parse both raw and decoded text: valid plain-text entries are
        #    sometimes mixed with base64 blobs in the same document.
        proxies.extend(cls._parse_text(content, source_url))
        if decoded_content != content:
            proxies.extend(cls._parse_text(decoded_content, source_url))

        # Deduplicate. Keying on protocol as well as ip:port fixes the bug
        # where e.g. an HTTP proxy and a VMess node on the same endpoint
        # were collapsed into a single entry.
        unique_proxies = {f"{p.protocol}:{p.ip}:{p.port}": p for p in proxies}
        return list(unique_proxies.values())

    @classmethod
    def _is_html(cls, text: str) -> bool:
        """Cheap heuristic: does *text* look like an HTML document?"""
        return bool(re.search(r"<!DOCTYPE html>|<html", text, re.IGNORECASE))

    @classmethod
    def _strip_html(cls, html: str) -> str:
        """Return the visible text of *html*; on parser failure, the input."""
        try:
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text(separator="\n")
        except Exception:
            # Best-effort: malformed markup falls through unchanged.
            return html

    @classmethod
    def _try_decode(cls, text: str) -> str:
        """Base64-decode *text* if possible, otherwise return it unchanged."""
        try:
            return SubscriptionDecoder.decode(text)
        except Exception:
            return text

    @classmethod
    def _parse_text(cls, text: str, source_url: str) -> List[Proxy]:
        """Extract every recognizable proxy entry from plain *text*."""
        proxies: List[Proxy] = []

        # Plain HTTP/S proxies (bare ip:port). Protocol is assumed to be
        # "http" until verified elsewhere.
        for ip, port in ProxyPatterns.extract_http_proxies(text):
            proxies.append(
                Proxy(ip=ip, port=int(port), protocol="http", source=source_url)
            )

        # Scheme-based URLs: one (extractor, parser) pair per protocol
        # replaces four copy-pasted loops. Parse failures stay non-fatal
        # but are now logged instead of being silently swallowed.
        url_parsers = (
            (ProxyPatterns.extract_vmess_urls, VMessParser),
            (ProxyPatterns.extract_vless_urls, VLESSParser),
            (ProxyPatterns.extract_trojan_urls, TrojanParser),
            (ProxyPatterns.extract_ss_urls, SSParser),
        )
        for extract, parser in url_parsers:
            for url in extract(text):
                try:
                    proxies.append(parser.parse(url))
                except Exception:
                    logger.debug("Failed to parse proxy URL: %s", url)

        return proxies