File size: 3,199 Bytes
a3f299b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import re
import base64
import logging
from typing import List, Optional
import aiohttp
from bs4 import BeautifulSoup

from app.models.proxy import Proxy
from app.grabber.patterns import ProxyPatterns
from app.grabber.parsers import VMessParser, VLESSParser, TrojanParser, SSParser
from app.utils.base64_decoder import SubscriptionDecoder

logger = logging.getLogger(__name__)


class UniversalExtractor:
    """
    Extracts proxies from any text content (HTML, Base64, or raw text).

    Stateless: every method is a classmethod operating only on its
    arguments, so the class is effectively a namespace.
    """

    @classmethod
    def extract_proxies(
        cls, content: str, source_url: str = "discovered"
    ) -> List[Proxy]:
        """
        Main entry point. Tries to decode and parse proxies from string content.

        Args:
            content: Arbitrary text that may contain proxy definitions
                (HTML pages, Base64 subscription blobs, or plain lists).
            source_url: Provenance tag attached to every returned proxy.

        Returns:
            Proxies deduplicated by ``ip:port`` (the last occurrence wins).
        """
        if not content:
            return []

        # 1. Strip markup first so proxy URLs are not hidden inside tags.
        if cls._is_html(content):
            content = cls._strip_html(content)

        # 2. Optimistically try Base64 decoding (subscription format).
        decoded_content = cls._try_decode(content)

        # 3. Parse both the raw and the decoded forms: valid plain text is
        #    sometimes mixed with Base64 payloads in the same document.
        proxies: List[Proxy] = cls._parse_text(content, source_url)
        if decoded_content != content:
            proxies.extend(cls._parse_text(decoded_content, source_url))

        # Deduplicate by endpoint. dict preserves insertion order, so a
        # later duplicate overwrites an earlier one for the same ip:port.
        unique_proxies = {f"{p.ip}:{p.port}": p for p in proxies}
        return list(unique_proxies.values())

    @classmethod
    def _is_html(cls, text: str) -> bool:
        """Heuristic check: does *text* look like an HTML document?"""
        return bool(re.search(r"<!DOCTYPE html>|<html", text, re.IGNORECASE))

    @classmethod
    def _strip_html(cls, html: str) -> str:
        """Return the visible text of *html*; fall back to raw input on parser errors."""
        try:
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text(separator="\n")
        except Exception:
            # Best-effort: a broken page should not abort extraction.
            logger.debug("HTML stripping failed; using raw content", exc_info=True)
            return html

    @classmethod
    def _try_decode(cls, text: str) -> str:
        """Try subscription/Base64 decoding; return the input unchanged on failure."""
        try:
            return SubscriptionDecoder.decode(text)
        except Exception:
            return text

    @classmethod
    def _parse_text(cls, text: str, source_url: str) -> List[Proxy]:
        """
        Extract every supported proxy flavor from *text*.

        Individual malformed URLs are skipped (logged at DEBUG) so that one
        bad entry cannot abort the whole extraction.
        """
        proxies: List[Proxy] = []

        # Plain HTTP(S) proxies appear as bare IP:Port pairs. We assume
        # HTTP until a later verification step proves otherwise.
        for ip, port in ProxyPatterns.extract_http_proxies(text):
            proxies.append(
                Proxy(ip=ip, port=int(port), protocol="http", source=source_url)
            )

        # URL-scheme proxies: one (extractor, parser) pair per protocol
        # replaces four structurally identical loops.
        scheme_parsers = (
            (ProxyPatterns.extract_vmess_urls, VMessParser),
            (ProxyPatterns.extract_vless_urls, VLESSParser),
            (ProxyPatterns.extract_trojan_urls, TrojanParser),
            (ProxyPatterns.extract_ss_urls, SSParser),
        )
        for extract, parser in scheme_parsers:
            for url in extract(text):
                try:
                    proxies.append(parser.parse(url))
                except Exception:
                    # Truncate: proxy URLs can be very long Base64 blobs.
                    logger.debug("Skipping unparseable proxy URL: %s", url[:80])

        return proxies