Muttered3 committed on
Commit
4b0add0
·
verified ·
1 Parent(s): 4718934

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +53 -35
scraper.py CHANGED
@@ -1,55 +1,73 @@
1
  import asyncio
2
  import random
3
  import re
4
- import aiofiles
5
  from curl_cffi.requests import AsyncSession
 
 
 
6
 
7
  async def check_fragment(word: str, proxy_url: str = None) -> str:
8
  url = f"https://fragment.com/username/{word}"
9
-
10
  proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
11
 
12
- async with AsyncSession(impersonate="chrome120", proxies=proxies) as session:
13
- for attempt in range(4):
14
- try:
15
- resp = await session.get(url, timeout=15, allow_redirects=True)
 
 
 
 
 
 
16
 
17
- if resp.status_code in [429, 403]:
18
- await asyncio.sleep(random.uniform(1.5, 3.5))
19
- continue
20
 
21
- html = resp.text
 
 
22
 
23
- if "Just a moment..." in html or "cloudflare" in html.lower():
24
- await asyncio.sleep(random.uniform(2, 4))
25
- continue
 
26
 
27
- canonical = re.search(r'<link rel="canonical" href="([^"]+)"', html, re.IGNORECASE)
28
- if canonical and "/username/" not in canonical.group(1):
29
- return "UNAVAILABLE"
 
 
 
 
 
 
 
30
 
31
- status_match = re.search(r'class="tm-section-header-status[^"]*"[^>]*>\s*([^<]+?)\s*</span>', html, re.IGNORECASE)
 
 
 
32
 
33
- if status_match:
34
- s = status_match.group(1).strip().lower()
35
- if "sold" in s: return "SOLD"
36
- if "taken" in s: return "TAKEN"
37
- if "auction" in s: return "ON_AUCTION"
38
- if "available" in s: return "AVAILABLE"
39
- if "sale" in s or "purchase" in s: return "FOR_SALE"
 
40
 
41
- clean = s.upper()
42
- async with aiofiles.open("other.txt", "a", encoding="utf-8") as f:
43
- await f.write(f"{word} - {clean} (Proxy: {proxy_url})\n")
44
- return clean
 
45
 
46
- if "tm-status-taken" in html: return "TAKEN"
47
- if "tm-status-unavail" in html:
48
- if ">Sold<" in html or ">sold<" in html.lower():
49
- return "SOLD"
50
- return "UNAVAILABLE"
51
 
52
- except Exception:
53
- await asyncio.sleep(1)
54
 
55
  return "ERROR"
 
1
  import asyncio
2
  import random
3
  import re
 
4
  from curl_cffi.requests import AsyncSession
5
+ from logger import get_logger
6
+
7
+ log = get_logger()
8
 
# Compiled once at import time; both patterns are applied to every fetched page.
_CANONICAL_RE = re.compile(r'<link rel="canonical" href="([^"]+)"', re.IGNORECASE)
_STATUS_RE = re.compile(
    r'class="tm-section-header-status[^"]*"[^>]*>\s*([^<]+?)\s*</span>',
    re.IGNORECASE,
)

# Ordered (keyword, result) pairs matched by substring against the lower-cased
# status label. Order matters: "unavailable" contains "available", so it must
# be tested first or an unavailable name would be reported as AVAILABLE.
_STATUS_KEYWORDS = (
    ("sold", "SOLD"),
    ("taken", "TAKEN"),
    ("auction", "ON_AUCTION"),
    ("unavailable", "UNAVAILABLE"),
    ("available", "AVAILABLE"),
    ("sale", "FOR_SALE"),
    ("purchase", "FOR_SALE"),
)


def _match_status_text(text):
    """Return the status code for lower-cased label *text*, or None if unknown."""
    for keyword, result in _STATUS_KEYWORDS:
        if keyword in text:
            return result
    return None


def _parse_status(word, html):
    """Derive the status of *word* from the HTML of its username page.

    Checks, in order: the canonical link, the status label in the section
    header, and finally the bare ``tm-status-*`` class markers. If nothing
    matches, the first 1000 characters of the page are appended to
    ``failed_parse.log`` and ``"ERROR"`` is returned.
    """
    # 1. Canonical check: a canonical URL that is not a username page is
    #    treated as "no such listing" (soft 404).
    canonical = _CANONICAL_RE.search(html)
    if canonical and "fragment.com/username/" not in canonical.group(1):
        return "UNAVAILABLE"

    # 2. Status label inside the section header.
    status_match = _STATUS_RE.search(html)
    if status_match:
        text = status_match.group(1).strip().lower()
        known = _match_status_text(text)
        if known is not None:
            return known

        # A status span exists but its text is not recognized: surface the
        # raw (upper-cased) text so the caller can see what the page said.
        clean = text.upper()
        log.warning(f"UNKNOWN STATUS TEXT for {word}: '{clean}'")
        return clean

    # 3. Fallback on bare class markers.
    if 'class="tm-status-taken"' in html:
        return "TAKEN"

    # 'tm-status-unavail' is used for both sold and unavailable names, so the
    # page text has to disambiguate.
    if 'class="tm-status-unavail"' in html:
        lowered = html.lower()
        if ">sold<" in lowered or "recently sold" in lowered:
            return "SOLD"
        return "UNAVAILABLE"

    # No known marker at all: possibly a changed page layout. Keep a snippet
    # for later inspection.
    # NOTE(review): this is a synchronous file write inside a coroutine's call
    # path; it is small, but it does block the event loop briefly.
    log.error(f"FAILED TO PARSE UI FOR: {word}. HTML snippet logged.")
    with open("failed_parse.log", "a", encoding="utf-8") as f:
        f.write(f"\n--- {word} ---\n{html[:1000]}\n")
    return "ERROR"


async def check_fragment(word: str, proxy_url: str = None) -> str:
    """Fetch the fragment.com username page for *word* and return its status.

    Makes up to three attempts through a single ``AsyncSession``. A 429/403
    response or a page that looks like an anti-bot challenge causes a
    randomized sleep and a retry; any exception raised during an attempt is
    logged and retried after 1.5 seconds.

    Args:
        word: Username to look up; interpolated into the request URL as-is.
        proxy_url: Optional proxy URL used for both http and https traffic.
            When falsy, no proxy is configured.

    Returns:
        One of ``"SOLD"``, ``"TAKEN"``, ``"ON_AUCTION"``, ``"AVAILABLE"``,
        ``"FOR_SALE"``, ``"UNAVAILABLE"``; the upper-cased status label when
        the page shows one that is not recognized; or ``"ERROR"`` when the
        page could not be parsed, every attempt failed, or the session itself
        raised.
    """
    url = f"https://fragment.com/username/{word}"
    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None

    try:
        async with AsyncSession(impersonate="chrome120", proxies=proxies) as session:
            for attempt in range(3):
                try:
                    resp = await session.get(url, timeout=15, allow_redirects=True)

                    if resp.status_code in (429, 403):
                        log.warning(f"Rate limited on {word}. Retrying...")
                        await asyncio.sleep(random.uniform(2.0, 4.0))
                        continue

                    html = resp.text

                    # NOTE(review): the bare "cloudflare" substring test also
                    # matches any ordinary page that merely mentions it (for
                    # example an asset URL) -- confirm this cannot trigger on
                    # real listing pages.
                    if "Just a moment..." in html or "cloudflare" in html.lower():
                        await asyncio.sleep(random.uniform(2, 4))
                        continue

                    return _parse_status(word, html)

                except Exception as e:
                    # Best-effort retry: request and parse failures are logged
                    # and the next attempt proceeds after a short pause.
                    log.error(f"Request error for {word} (Attempt {attempt+1}): {e}")
                    await asyncio.sleep(1.5)

    except Exception as e:
        # Failures creating or closing the session itself.
        log.error(f"Session error for {word}: {e}")

    return "ERROR"