Muttered3 committed on
Commit
7a8a39d
·
verified ·
1 Parent(s): 72c50c0

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +6 -15
scraper.py CHANGED
@@ -14,19 +14,11 @@ USER_AGENTS = [
14
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0"
15
  ]
16
 
17
- # Track adaptive rate limits per proxy instance in memory safely
18
- _rate_limited_until = 0.0
19
-
20
  async def check_fragment(word: str, proxy_url: str = None) -> str:
21
- global _rate_limited_until
22
  url = f"https://fragment.com/username/{word}"
23
 
24
  # 4 Retries matching your advanced optimization matrix
25
  for attempt in range(1, 5):
26
- current_time = time.time()
27
- if current_time < _rate_limited_until:
28
- await asyncio.sleep(_rate_limited_until - current_time + random.uniform(0.1, 0.4))
29
-
30
  try:
31
  headers = {
32
  "User-Agent": random.choice(USER_AGENTS),
@@ -53,14 +45,14 @@ async def check_fragment(word: str, proxy_url: str = None) -> str:
53
 
54
  # Case B: Firewall Soft-Ban Redirect (Kicked back to clean root index)
55
  elif location == "/" or location.rstrip('/') in ["https://fragment.com", "https://www.fragment.com"]:
56
- backoff = (2 ** attempt) + random.uniform(0.5, 1.5)
57
- _rate_limited_until = time.time() + backoff
58
- log.warning(f"⚠️ Proxy soft-banned on '{word}'. Backing off {backoff:.2f}s...")
59
  continue
60
 
61
  if status in [429, 403]:
62
- backoff = 4 + (2 ** attempt) + random.uniform(0.5, 1.5)
63
- _rate_limited_until = time.time() + backoff
64
  continue
65
 
66
  if status == 404:
@@ -94,7 +86,6 @@ async def check_fragment(word: str, proxy_url: str = None) -> str:
94
 
95
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
96
  # ENGINE 2: FAILSAFE CONVERTED SEARCH LOOKUP LOOP
97
- # (Runs if 200 OK was returned but page layout looks structural like raw queries)
98
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
99
  clean_html = re.sub(r'\s+', ' ', html)
100
  search_regex = re.compile(rf'>@{word}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*TL', re.IGNORECASE)
@@ -113,4 +104,4 @@ async def check_fragment(word: str, proxy_url: str = None) -> str:
113
  log.error(f"Execution fault for '{word}' over proxy: {str(e)}")
114
  await asyncio.sleep(1.5 * attempt)
115
 
116
- return "ERROR"
 
14
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0"
15
  ]
16
 
 
 
 
17
  async def check_fragment(word: str, proxy_url: str = None) -> str:
 
18
  url = f"https://fragment.com/username/{word}"
19
 
20
  # 4 Retries matching your advanced optimization matrix
21
  for attempt in range(1, 5):
 
 
 
 
22
  try:
23
  headers = {
24
  "User-Agent": random.choice(USER_AGENTS),
 
45
 
46
  # Case B: Firewall Soft-Ban Redirect (Kicked back to clean root index)
47
  elif location == "/" or location.rstrip('/') in ["https://fragment.com", "https://www.fragment.com"]:
48
+ backoff = (2 ** attempt) + random.uniform(1.0, 3.0)
49
+ log.warning(f"⚠️ Rate limit target hit on '{word}'. Backing off loop thread {backoff:.2f}s...")
50
+ await asyncio.sleep(backoff)
51
  continue
52
 
53
  if status in [429, 403]:
54
+ backoff = 4 + (2 ** attempt) + random.uniform(1.5, 3.5)
55
+ await asyncio.sleep(backoff)
56
  continue
57
 
58
  if status == 404:
 
86
 
87
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
88
  # ENGINE 2: FAILSAFE CONVERTED SEARCH LOOKUP LOOP
 
89
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
90
  clean_html = re.sub(r'\s+', ' ', html)
91
  search_regex = re.compile(rf'>@{word}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*TL', re.IGNORECASE)
 
104
  log.error(f"Execution fault for '{word}' over proxy: {str(e)}")
105
  await asyncio.sleep(1.5 * attempt)
106
 
107
+ return "ERROR"