# app/hunter/extractor.py
# Uploaded by paijo77 — "update app/hunter/extractor.py", commit a3f299b (verified)
import re
import base64
import logging
from typing import List, Optional
import aiohttp
from bs4 import BeautifulSoup
from app.models.proxy import Proxy
from app.grabber.patterns import ProxyPatterns
from app.grabber.parsers import VMessParser, VLESSParser, TrojanParser, SSParser
from app.utils.base64_decoder import SubscriptionDecoder
logger = logging.getLogger(__name__)
class UniversalExtractor:
    """Extract proxies from arbitrary text content (HTML, Base64, or raw).

    Stateless namespace of classmethods implementing the pipeline:

        strip HTML -> optimistic Base64 decode -> pattern parse -> dedupe
    """

    @classmethod
    def extract_proxies(
        cls, content: str, source_url: str = "discovered"
    ) -> List[Proxy]:
        """
        Main entry point. Tries to decode and parse proxies from string content.

        Args:
            content: Raw payload (HTML page, Base64 subscription, or plain text).
            source_url: Label recorded as the ``source`` of each extracted proxy.

        Returns:
            Proxies deduplicated by ``ip:port`` (last occurrence wins).
        """
        proxies: List[Proxy] = []

        # 1. Strip markup so proxy URLs aren't broken up by tags/attributes.
        if cls._is_html(content):
            content = cls._strip_html(content)

        # 2. Optimistic Base64 decode (subscription feeds are usually encoded).
        decoded_content = cls._try_decode(content)

        # 3. Parse both raw and decoded views: valid plain-text entries are
        #    sometimes mixed in with Base64 content, so neither can be skipped.
        proxies.extend(cls._parse_text(content, source_url))
        if decoded_content != content:
            proxies.extend(cls._parse_text(decoded_content, source_url))

        # Deduplicate by endpoint; the dict keeps the last proxy per ip:port.
        unique_proxies = {f"{p.ip}:{p.port}": p for p in proxies}
        return list(unique_proxies.values())

    @classmethod
    def _is_html(cls, text: str) -> bool:
        """Heuristically detect HTML via a doctype or opening <html> tag."""
        return bool(re.search(r"<!DOCTYPE html>|<html", text, re.IGNORECASE))

    @classmethod
    def _strip_html(cls, html: str) -> str:
        """Return the visible text of ``html``; fall back to the input as-is."""
        try:
            soup = BeautifulSoup(html, "html.parser")
            return soup.get_text(separator="\n")
        except Exception:
            # Malformed markup: better to scan the raw text than to drop it.
            return html

    @classmethod
    def _try_decode(cls, text: str) -> str:
        """Base64-decode ``text`` if possible; return it unchanged otherwise."""
        try:
            return SubscriptionDecoder.decode(text)
        except Exception:
            return text

    @classmethod
    def _parse_text(cls, text: str, source_url: str) -> List[Proxy]:
        """Extract every supported proxy flavor from plain text.

        Malformed URLs are skipped (logged at DEBUG) so that one bad entry
        cannot abort the whole extraction.
        """
        proxies: List[Proxy] = []

        # Bare IP:port pairs — assume HTTP unless verified otherwise.
        for ip, port in ProxyPatterns.extract_http_proxies(text):
            proxies.append(
                Proxy(ip=ip, port=int(port), protocol="http", source=source_url)
            )

        # Table-driven parsing: one loop over (extractor, parser) pairs
        # replaces four copy-pasted try/except blocks.
        scheme_parsers = (
            (ProxyPatterns.extract_vmess_urls, VMessParser),    # VMess
            (ProxyPatterns.extract_vless_urls, VLESSParser),    # VLESS
            (ProxyPatterns.extract_trojan_urls, TrojanParser),  # Trojan
            (ProxyPatterns.extract_ss_urls, SSParser),          # Shadowsocks
        )
        for extract_urls, parser in scheme_parsers:
            for url in extract_urls(text):
                try:
                    proxies.append(parser.parse(url))
                except Exception:
                    # Best-effort: skip the bad entry, but leave a trace
                    # instead of silently swallowing the failure.
                    logger.debug(
                        "Failed to parse proxy URL %r from %s", url, source_url
                    )
        return proxies