Muttered3 committed on
Commit
69f02a2
·
verified ·
1 Parent(s): ab6960a

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +44 -37
scraper.py CHANGED
@@ -2,54 +2,61 @@ import asyncio
2
  import random
3
  from curl_cffi.requests import AsyncSession
4
 
 
 
 
 
 
 
 
5
  async def check_fragment(word: str) -> str:
6
- url = f"https://fragment.com/username/{word}"
7
 
8
- # Impersonates a real Chrome browser to bypass Cloudflare
9
- async with AsyncSession(impersonate="chrome110") as session:
10
- for _ in range(4):
 
 
 
 
 
 
 
11
  try:
12
- resp = await session.get(url, timeout=15)
 
13
 
14
- # Wait out rate limits
15
- if resp.status_code in [429, 403]:
16
- await asyncio.sleep(random.uniform(2, 5))
17
- continue
18
-
 
 
 
19
  if resp.status_code == 200:
20
- html = resp.text
21
 
22
- # Double-check if Cloudflare intercepted the page anyway
23
- if "Just a moment..." in html or "Cloudflare" in html:
24
- await asyncio.sleep(random.uniform(3, 6))
25
- continue
26
-
27
- await asyncio.sleep(random.uniform(0.15, 0.6))
28
-
29
- # --- HTML PARSER ---
30
- if "tm-status-avail" in html: return "AVAILABLE"
31
 
32
  if "tm-status-taken" in html:
33
  if "On auction" in html or "tm-section-subscribe" in html: return "ON_AUCTION"
34
  if "For sale" in html: return "FOR_SALE"
35
  if ">Sold for" in html or ">Sold at" in html or 'class="tm-status-text">Sold' in html: return "SOLD"
36
  return "TAKEN"
37
-
38
- if "tm-status-unavail" in html: return "UNAVAILABLE"
 
39
 
40
- # Fallbacks
41
- if "Available" in html: return "AVAILABLE"
42
- if "Taken" in html: return "TAKEN"
43
-
44
- # If Fragment redirects to the search page, the word is usually available to claim
45
- if "/?query=" in str(resp.url):
46
- return "AVAILABLE"
47
-
48
- return "UNAVAILABLE"
49
 
50
- except Exception:
51
- pass
52
-
53
- await asyncio.sleep(random.uniform(1.0, 2.0))
54
-
55
- return "UNAVAILABLE"
 
 
2
  import random
3
  from curl_cffi.requests import AsyncSession
4
 
5
+ # We use free CORS proxies to hide your Server IP from Cloudflare
6
+ PROXIES = [
7
+ "", # 1st Attempt: Direct Connection
8
+ "https://api.allorigins.win/raw?url=", # 2nd Attempt: Proxy Router 1
9
+ "https://corsproxy.io/?", # 3rd Attempt: Proxy Router 2
10
+ ]
11
+
12
  async def check_fragment(word: str) -> str:
13
+ target_url = f"https://fragment.com/username/{word}"
14
 
15
+ headers = {
16
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
17
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
18
+ }
19
+
20
+ # Spoof a real Google Chrome browser fingerprint
21
+ async with AsyncSession(impersonate="chrome120") as session:
22
+ for proxy in PROXIES:
23
+ url = f"{proxy}{target_url}"
24
+
25
  try:
26
+ resp = await session.get(url, headers=headers, timeout=15)
27
+ html = resp.text
28
 
29
+ # 1. Did we hit the Cloudflare Wall?
30
+ if resp.status_code in [403, 429] or "Just a moment..." in html or "cf-browser-verification" in html:
31
+ proxy_name = "Proxy" if proxy else "Direct HF IP"
32
+ print(f"⚠️ [CLOUDFLARE BLOCK] Blocked on {proxy_name} for @{word}. Retrying...")
33
+ await asyncio.sleep(random.uniform(2, 4))
34
+ continue # Try the next proxy in the list
35
+
36
+ # 2. Success! Parse the HTML accurately
37
  if resp.status_code == 200:
 
38
 
39
+ if "tm-status-avail" in html:
40
+ print(f" [SUCCESS] @{word} is AVAILABLE!")
41
+ return "AVAILABLE"
 
 
 
 
 
 
42
 
43
  if "tm-status-taken" in html:
44
  if "On auction" in html or "tm-section-subscribe" in html: return "ON_AUCTION"
45
  if "For sale" in html: return "FOR_SALE"
46
  if ">Sold for" in html or ">Sold at" in html or 'class="tm-status-text">Sold' in html: return "SOLD"
47
  return "TAKEN"
48
+
49
+ if "tm-status-unavail" in html:
50
+ return "UNAVAILABLE"
51
 
52
+ # If Fragment loads but the layout changed
53
+ print(f" [UNKNOWN HTML] Couldn't parse @{word}. Fragment may have changed their website.")
54
+ return "ERROR"
 
 
 
 
 
 
55
 
56
+ except Exception as e:
57
+ print(f"❌ [NETWORK ERROR] Failed to load @{word}: {str(e)}")
58
+ continue
59
+
60
+ # If all proxies fail, do NOT label the username as UNAVAILABLE. Return ERROR to protect the database.
61
+ print(f"💀 [FATAL] All proxies blocked for @{word}.")
62
+ return "ERROR"