# taobao_scraper / proxy_manager.py
# meccatronis's picture
# Upload proxy_manager.py with huggingface_hub
# 2a2927a verified
#!/usr/bin/env python3
"""
Gerenciador de Proxies Chineses
Busca, testa e retorna proxies funcionais
"""
import re
import json
import time
from typing import List, Optional
from dataclasses import dataclass
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
@dataclass
class Proxy:
    """A single proxy endpoint plus the name of the list it came from.

    Attributes:
        host: Address of the proxy server.
        port: Port number the proxy listens on.
        protocol: Scheme used to build the proxy URL (http, https, socks5).
        source: Label of the proxy list that supplied this entry.
    """

    host: str
    port: int
    protocol: str  # http, https, socks5
    source: str

    @property
    def url(self) -> str:
        """Return the proxy address formatted as ``protocol://host:port``."""
        return f"{self.protocol}://{self.host}:{self.port}"

    @property
    def dict(self) -> dict:
        """Return a mapping that routes both http and https traffic to this proxy."""
        endpoint = self.url
        return {scheme: endpoint for scheme in ("http", "https")}
def fetch_url(url: str, timeout: int = 30) -> Optional[str]:
    """Download *url* with urllib and return the body as text.

    The body is decoded as UTF-8 first; if that fails it is decoded as GBK
    with undecodable bytes dropped, which covers the Chinese sites this
    module scrapes.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        The decoded response body, or ``None`` when the URL is malformed or
        the request fails for any reason.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    try:
        # Request() raises ValueError for a malformed URL, so it has to live
        # inside the try block to honour the "None on failure" contract.
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as response:
            content = response.read()
    except Exception:
        # Deliberately broad: this is a best-effort fetch and callers only
        # distinguish "got text" from "got nothing".
        return None

    try:
        return content.decode('utf-8')
    except UnicodeDecodeError:
        # Not UTF-8: fall back to the legacy Chinese encoding.
        return content.decode('gbk', errors='ignore')
class ChinaProxyFinder:
    """Collect free Chinese proxy candidates from several public proxy lists.

    Every ``fetch_*`` method queries one source and is best-effort: a failure
    is reported on stdout and produces an empty (or partial) list instead of
    an exception.  ``find_all`` merges the sources, drops duplicate
    ``host:port`` pairs and stores the outcome in ``self.proxies``.
    """

    # Loose "IP <non-digits> port" matcher for pages without a stable layout.
    _LOOSE_ENDPOINT = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\D+(\d{2,5})')
    # Strict "IP:port" matcher applied to the text of a single table cell.
    _CELL_ENDPOINT = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)')

    def __init__(self):
        self.proxies: List["Proxy"] = []

    @staticmethod
    def _is_valid_endpoint(host: str, port: int) -> bool:
        """Return True if *host* is a dotted IPv4 address and *port* is in 1..65535."""
        octets = host.split(".")
        return (
            len(octets) == 4
            and all(o.isascii() and o.isdigit() and int(o) <= 255 for o in octets)
            and 0 < port <= 65535
        )

    @staticmethod
    def _geonode_endpoints(payload: str, limit: int = 30) -> List[tuple]:
        """Extract ``(ip, port)`` pairs from a Geonode JSON response body.

        Only the first *limit* records are inspected.  Records without an
        ``ip``/``port`` key, or with a port that is not a number in 1..65535,
        are skipped individually.  Raises ``ValueError`` if *payload* is not
        valid JSON.
        """
        result = json.loads(payload)
        # The API wraps the records as {"data": [...]}; a bare top-level list
        # is still accepted so older/alternative payloads keep working.
        if isinstance(result, dict):
            result = result.get("data")
        if not isinstance(result, list):
            return []

        endpoints = []
        for item in result[:limit]:
            if not isinstance(item, dict) or 'ip' not in item or 'port' not in item:
                continue
            try:
                port = int(item['port'])
            except (TypeError, ValueError):
                # One malformed record must not discard the remaining ones.
                continue
            if 0 < port <= 65535:
                endpoints.append((item['ip'], port))
        return endpoints

    @classmethod
    def _scan_endpoints(cls, html: str, limit: int = 30) -> List[tuple]:
        """Return up to *limit* plausible ``(ip, port)`` pairs found in *html*."""
        endpoints = []
        for found in cls._LOOSE_ENDPOINT.finditer(html):
            ip, port = found.group(1), int(found.group(2))
            # The pattern is permissive (e.g. 999.1.1.1 or port 99999), so
            # discard anything that cannot be a real endpoint.
            if not cls._is_valid_endpoint(ip, port):
                continue
            endpoints.append((ip, port))
            if len(endpoints) >= limit:
                break
        return endpoints

    def fetch_geonode_api(self) -> List["Proxy"]:
        """Fetch HTTP proxies from the Geonode JSON API.

        Returns an empty list when the request fails or the payload cannot be
        parsed; the error is printed rather than raised.
        """
        try:
            url = "https://proxylist.geonode.com/api/proxy-list?limit=50&country=cn&protocol=http&format=json"
            data = fetch_url(url)
            if not data:
                return []
            return [
                Proxy(host=ip, port=port, protocol="http", source="geonode")
                for ip, port in self._geonode_endpoints(data)
            ]
        except Exception as e:
            print(f"Erro geonode: {e}")
            return []

    def fetch_spys_one(self) -> List["Proxy"]:
        """Fetch proxies from spys.one by scanning the page for IP/port pairs.

        The page is script-formatted, so the HTML is searched with a loose
        pattern instead of being parsed structurally.  Errors are printed and
        result in an empty list.
        """
        try:
            html = fetch_url("https://spys.one/free-proxy-list/CN/")
            if not html:
                return []
            return [
                Proxy(host=ip, port=port, protocol="http", source="spys.one")
                for ip, port in self._scan_endpoints(html)
            ]
        except Exception as e:
            print(f"Erro spys.one: {e}")
            return []

    def fetch_proxylist(self) -> List["Proxy"]:
        """Fetch proxies from the 89ip.cn list (alternative source).

        Looks at the first 100 ``<td>`` cells of the page and keeps at most
        20 cells whose text starts with a valid ``IP:port``.  Errors are
        printed; proxies collected before the error are still returned.
        """
        proxies = []
        try:
            url = "http://www.89ip.cn/tiqv.html?id=1"  # Chinese proxy list
            html = fetch_url(url)
            if html:
                soup = BeautifulSoup(html, 'lxml')
                for td in soup.find_all('td')[:100]:
                    match = self._CELL_ENDPOINT.match(td.get_text(strip=True))
                    if not match:
                        continue
                    host, port = match.group(1), int(match.group(2))
                    if not self._is_valid_endpoint(host, port):
                        continue
                    proxies.append(Proxy(
                        host=host,
                        port=port,
                        protocol="http",
                        source="89ip"
                    ))
                    if len(proxies) >= 20:
                        break
        except Exception as e:
            print(f"Erro proxylist: {e}")
        return proxies

    def find_all(self) -> List["Proxy"]:
        """Query every source sequentially and return the de-duplicated proxies.

        Duplicates are detected by ``(host, port)``; the first occurrence wins,
        so sources queried earlier take precedence.  The result is also stored
        in ``self.proxies``.
        """
        print("🔍 Buscando proxies chineses...")
        all_proxies = []

        # Geonode API first: structured JSON, the most reliable source.
        print(" - Geonode API...")
        all_proxies.extend(self.fetch_geonode_api())

        print(" - Spys.one...")
        all_proxies.extend(self.fetch_spys_one())

        print(" - Outras fontes...")
        all_proxies.extend(self.fetch_proxylist())

        # Drop duplicates while preserving discovery order.
        seen = set()
        unique = []
        for p in all_proxies:
            key = (p.host, p.port)
            if key not in seen:
                seen.add(key)
                unique.append(p)

        self.proxies = unique
        print(f"✅ Encontrados {len(self.proxies)} proxies únicos")
        return self.proxies
class ProxyTester:
    """Probe proxies to find out whether they actually relay traffic."""

    @staticmethod
    def _build_opener(proxy: "Proxy") -> urllib.request.OpenerDirector:
        """Return a urllib opener that sends http AND https requests via *proxy*."""
        # ProxyHandler keys are the scheme of the *requested* URL, not the
        # proxy's own protocol.  Keying on ``proxy.protocol`` alone meant an
        # "http" proxy was never used for the HTTPS Taobao request (it went
        # out directly), and a proxy labelled with any other protocol was
        # bypassed even for the plain-HTTP probe.  ``proxy.dict`` maps both
        # schemes to the proxy URL.
        handler = urllib.request.ProxyHandler(proxy.dict)
        return urllib.request.build_opener(handler)

    def test_proxy(self, proxy: "Proxy", timeout: int = 10) -> bool:
        """Return True if an HTTP request to httpbin.org through *proxy* yields 200.

        Args:
            proxy: Proxy to probe.
            timeout: Timeout in seconds for the request.

        Any exception (connection refused, timeout, HTTP error, ...) is
        treated as "proxy does not work" and results in False.
        """
        try:
            req = urllib.request.Request(
                "http://httpbin.org/ip",
                headers={"User-Agent": "Mozilla/5.0"}
            )
            with self._build_opener(proxy).open(req, timeout=timeout) as response:
                return response.status == 200
        except Exception:
            # Best-effort probe: every failure mode means "unusable".
            return False

    def test_for_taobao(self, proxy: "Proxy", timeout: int = 15) -> bool:
        """Return True if a Taobao search page can be loaded through *proxy*.

        The page counts as loaded when it is longer than 10000 characters and
        does not contain the verification marker "验证" (used here as the sign
        of a block/captcha page).  Any exception results in False.

        Args:
            proxy: Proxy to probe.
            timeout: Timeout in seconds for the request.
        """
        try:
            req = urllib.request.Request(
                "https://s.taobao.com/search?q=test",
                headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
            )
            with self._build_opener(proxy).open(req, timeout=timeout) as response:
                text = response.read().decode('utf-8', errors='ignore')
            return "验证" not in text and len(text) > 10000
        except Exception:
            # Best-effort probe: every failure mode means "unusable".
            return False

    def find_working(self, proxies: List["Proxy"], count: int = 3, test_taobao: bool = False,
                     max_tests: int = 30) -> List["Proxy"]:
        """Test proxies in order and return the first *count* that work.

        Args:
            proxies: Candidates to test, in priority order.
            count: Stop as soon as this many working proxies were found.
            test_taobao: Probe against Taobao (10 s timeout) instead of the
                basic httpbin probe (5 s timeout).
            max_tests: Upper bound on how many candidates are tested.

        Returns:
            The working proxies found, possibly fewer than *count*.
        """
        print(f"🧪 Testando {len(proxies)} proxies...")
        working = []
        for i, proxy in enumerate(proxies[:max_tests]):
            if len(working) >= count:
                break
            # flush so the progress line shows up before the (slow) probe runs
            print(f" [{i+1}] {proxy.host}:{proxy.port}... ", end="", flush=True)
            if test_taobao:
                success = self.test_for_taobao(proxy, timeout=10)
            else:
                success = self.test_proxy(proxy, timeout=5)
            if success:
                print("✅")
                working.append(proxy)
            else:
                print("❌")
        return working
def main():
    """Search for Chinese proxies, test them and save the ones that work.

    Working proxies are written to ``/tmp/taobao_proxies.txt`` (one URL per
    line) and ``/tmp/taobao_proxies.json`` (url/host/port records).

    Returns:
        The first working proxy, or ``None`` when no proxy was found or none
        of the candidates passed the test.
    """
    finder = ChinaProxyFinder()
    tester = ProxyTester()

    candidates = finder.find_all()
    if not candidates:
        print("❌ Nenhum proxy encontrado")
        return None

    working = tester.find_working(candidates, count=5)
    if not working:
        print("\n❌ Nenhum proxy funcional encontrado")
        print("\n💡 Sugestões:")
        print(" 1. Use proxy pago (mais confiável)")
        print(" 2. Tente novamente em alguns minutos")
        print(" 3. Use VPN chinês")
        return None

    print(f"\n✅ {len(working)} proxies funcionais:")
    for p in working:
        print(f" {p.url}")

    # Plain-text list: one proxy URL per line.
    with open("/tmp/taobao_proxies.txt", "w") as txt_file:
        txt_file.writelines(f"{p.url}\n" for p in working)

    # Same data as structured JSON records.
    with open("/tmp/taobao_proxies.json", "w") as json_file:
        records = [{"url": p.url, "host": p.host, "port": p.port} for p in working]
        json.dump(records, json_file, indent=2)

    print(f"\n💾 Salvo em /tmp/taobao_proxies.txt e /tmp/taobao_proxies.json")
    return working[0]
if __name__ == "__main__":
    # Script entry point: run the full search and, when a working proxy was
    # found, tell the user which one to use.
    result = main()
    if result:
        print(f"\n🎯 Use este proxy: {result.url}")