# robin / scrape.py
# pjpjq's picture
# Allow custom model IDs and ignore env proxies for Tor
# d396650
import logging
import random
import threading
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Suppress every warning for the whole process: no category or module filter
# is given, so warnings raised by other libraries are hidden as well.
warnings.filterwarnings("ignore")

# Pool of desktop browser User-Agent strings. The scraper picks one at random
# per request so that consecutive requests do not share a single fingerprint.
USER_AGENTS = [
    # Chrome 135 on Windows / macOS / Linux
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    # Firefox 137 on Windows / macOS / Linux
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.7; rv:137.0) Gecko/20100101 Firefox/137.0",
    "Mozilla/5.0 (X11; Linux i686; rv:137.0) Gecko/20100101 Firefox/137.0",
    # Safari 18.3 on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Safari/605.1.15",
    # Edge 135 on Windows / macOS
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.3179.54",
]
def get_tor_session():
    """
    Build a requests Session that talks through the local Tor SOCKS proxy.

    The session ignores proxy settings from the environment, retries failed
    requests automatically, and sends both HTTP and HTTPS traffic to
    ``socks5h://127.0.0.1:9050``.

    Returns:
        requests.Session: the configured session. The caller owns it and is
        responsible for closing it.
    """
    # The "socks5h" scheme asks the proxy to resolve hostnames, per the
    # requests SOCKS documentation.
    tor_proxy = "socks5h://127.0.0.1:9050"

    retry_policy = Retry(
        total=3,
        read=3,
        connect=3,
        backoff_factor=0.3,
        status_forcelist=[500, 502, 503, 504],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    tor_session = requests.Session()
    # Ignore shell-level proxy variables (http_proxy/https_proxy/all_proxy).
    tor_session.trust_env = False
    for scheme in ("http://", "https://"):
        tor_session.mount(scheme, retrying_adapter)
    tor_session.proxies = {"http": tor_proxy, "https": tor_proxy}
    return tor_session
def scrape_single(url_data, rotate=False, rotate_interval=5, control_port=9051, control_password=None):
    """
    Scrapes a single URL, using a Tor session when the link contains ".onion".

    Args:
        url_data: Mapping with a ``'link'`` entry (the URL to fetch) and a
            ``'title'`` entry (used as prefix and as fallback text).
        rotate, rotate_interval, control_port, control_password: Accepted for
            interface compatibility; this function does not use them.

    Returns:
        tuple: ``(url, scraped_text)``. On an HTTP 200 response the text is
        ``"<title> - <whitespace-normalized page text>"``. On any other
        status, or when fetching/parsing raises, it is the title alone so
        the reference is not lost.

    Raises:
        KeyError: If ``url_data`` lacks ``'link'`` or ``'title'``.
    """
    url = url_data['link']
    title = url_data['title']
    use_tor = ".onion" in url
    headers = {
        "User-Agent": random.choice(USER_AGENTS)
    }
    log = logging.getLogger(__name__)

    try:
        if use_tor:
            # Close the per-request session once the response is in hand so
            # its pooled connections are released instead of leaking. The
            # body is already downloaded at this point (no stream=True).
            with get_tor_session() as session:
                # Increased timeout for Tor latency
                response = session.get(url, headers=headers, timeout=45)
        else:
            # Fallback for clearweb if needed, though tool focuses on dark web
            response = requests.get(url, headers=headers, timeout=30)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            # Clean up text: remove scripts/styles
            for tag in soup(["script", "style"]):
                tag.extract()
            text = soup.get_text(separator=' ')
            # Normalize whitespace
            text = ' '.join(text.split())
            scraped_text = f"{title} - {text}"
        else:
            log.debug("Non-200 status %s for %s; keeping title only",
                      response.status_code, url)
            scraped_text = title
    except Exception as exc:
        # Best-effort boundary: return title only on failure, so we don't
        # lose the reference, but leave a trace for debugging.
        log.debug("Scrape failed for %s: %s", url, exc)
        scraped_text = title
    return url, scraped_text
def scrape_multiple(urls_data, max_workers=5, max_chars=2000):
    """
    Scrapes multiple URLs concurrently using a thread pool.

    Args:
        urls_data: Iterable of mappings, each passed unchanged to
            ``scrape_single``.
        max_workers: Size of the thread pool.
        max_chars: Maximum number of characters kept per result. Longer
            content is cut to this length and suffixed with
            ``"...(truncated)"``. ``None`` disables truncation.

    Returns:
        dict: Maps each URL to its (possibly truncated) scraped text. Items
        whose worker raised are omitted from the result.
    """
    results = {}
    log = logging.getLogger(__name__)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(scrape_single, url_data): url_data
            for url_data in urls_data
        }
        for future in as_completed(future_to_url):
            try:
                url, content = future.result()
                if max_chars is not None and len(content) > max_chars:
                    content = content[:max_chars] + "...(truncated)"
                results[url] = content
            except Exception as exc:
                # Best-effort: one bad item must not abort the batch, but
                # record which one was skipped and why.
                log.debug("Skipping %r: %s", future_to_url[future], exc)
    return results