Muttered3 committed on
Commit
935b8a6
·
verified ·
1 Parent(s): b4a56aa

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +24 -28
scraper.py CHANGED
@@ -1,28 +1,35 @@
1
  import asyncio
2
  import random
3
  import re
4
- from curl_cffi.requests import AsyncSession
5
  from logger import get_logger
6
 
7
  log = get_logger()
8
 
 
 
 
 
 
 
9
  async def check_fragment(word: str, proxy_url: str = None) -> str:
10
  url = f"https://fragment.com/username/{word}"
11
- proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
12
-
13
- try:
14
- async with AsyncSession(impersonate="chrome120", proxies=proxies) as session:
15
- for attempt in range(3):
16
- try:
17
- resp = await session.get(url, timeout=15, allow_redirects=True)
18
 
19
- if resp.status_code in [429, 403]:
20
- log.warning(f"Rate limited on {word}. Retrying...")
 
 
 
 
 
 
21
  await asyncio.sleep(random.uniform(2.0, 4.0))
22
  continue
23
 
24
- html = resp.text
25
 
 
26
  if "Just a moment..." in html or "cloudflare" in html.lower():
27
  await asyncio.sleep(random.uniform(2, 4))
28
  continue
@@ -42,32 +49,21 @@ async def check_fragment(word: str, proxy_url: str = None) -> str:
42
  if "auction" in s: return "ON_AUCTION"
43
  if "available" in s: return "AVAILABLE"
44
  if "sale" in s or "purchase" in s: return "FOR_SALE"
 
45
 
46
- # If it found a status span but we don't recognize the text
47
- clean = s.upper()
48
- log.warning(f"UNKNOWN STATUS TEXT for {word}: '{clean}'")
49
- return clean
50
-
51
- # 3. FALLBACK HTML PARSING (Fixing the False Unavailables)
52
  if 'class="tm-status-taken"' in html: return "TAKEN"
53
 
54
- # 'tm-status-unavail' is used for BOTH Sold and Unavailable. We must differentiate.
55
  if 'class="tm-status-unavail"' in html:
56
  if ">Sold<" in html or ">sold<" in html.lower() or "recently sold" in html.lower():
57
  return "SOLD"
58
  return "UNAVAILABLE"
59
 
60
- # If absolutely no markers exist, it might be a new UI layout.
61
- log.error(f"FAILED TO PARSE UI FOR: {word}. HTML snippet logged.")
62
- with open("failed_parse.log", "a", encoding="utf-8") as f:
63
- f.write(f"\n--- {word} ---\n{html[:1000]}\n")
64
  return "ERROR"
65
 
66
- except Exception as e:
67
- log.error(f"Request error for {word} (Attempt {attempt+1}): {str(e)}")
68
- await asyncio.sleep(1.5)
69
-
70
- except Exception as e:
71
- log.error(f"Session error for {word}: {str(e)}")
72
 
73
  return "ERROR"
 
1
  import asyncio
2
  import random
3
  import re
4
+ import aiohttp
5
  from logger import get_logger
6
 
7
  log = get_logger()
8
 
9
+ USER_AGENTS = [
10
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124.0 Safari/537.36",
11
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 Safari/605.1.15",
12
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/123.0 Safari/537.36"
13
+ ]
14
+
15
  async def check_fragment(word: str, proxy_url: str = None) -> str:
16
  url = f"https://fragment.com/username/{word}"
17
+ headers = {"User-Agent": random.choice(USER_AGENTS)}
 
 
 
 
 
 
18
 
19
+ for attempt in range(3):
20
+ try:
21
+ # We open a fresh session per attempt to prevent dead proxies from leaking connections
22
+ timeout = aiohttp.ClientTimeout(total=15)
23
+ async with aiohttp.ClientSession(timeout=timeout) as session:
24
+ async with session.get(url, headers=headers, proxy=proxy_url, allow_redirects=True) as resp:
25
+
26
+ if resp.status in [429, 403]:
27
  await asyncio.sleep(random.uniform(2.0, 4.0))
28
  continue
29
 
30
+ html = await resp.text()
31
 
32
+ # Cloudflare block
33
  if "Just a moment..." in html or "cloudflare" in html.lower():
34
  await asyncio.sleep(random.uniform(2, 4))
35
  continue
 
49
  if "auction" in s: return "ON_AUCTION"
50
  if "available" in s: return "AVAILABLE"
51
  if "sale" in s or "purchase" in s: return "FOR_SALE"
52
+ return s.upper()
53
 
54
+ # 3. FALLBACK HTML PARSING
 
 
 
 
 
55
  if 'class="tm-status-taken"' in html: return "TAKEN"
56
 
 
57
  if 'class="tm-status-unavail"' in html:
58
  if ">Sold<" in html or ">sold<" in html.lower() or "recently sold" in html.lower():
59
  return "SOLD"
60
  return "UNAVAILABLE"
61
 
 
 
 
 
62
  return "ERROR"
63
 
64
+ except Exception as e:
65
+ # This logs the specific aiohttp timeout/connection error so you can see if a proxy dies
66
+ log.error(f"Request error for {word} via {proxy_url}: {str(e)}")
67
+ await asyncio.sleep(1.5)
 
 
68
 
69
  return "ERROR"