Spaces:
Paused
Paused
Update scraper.py
Browse files- scraper.py +53 -35
scraper.py
CHANGED
|
@@ -1,55 +1,73 @@
|
|
| 1 |
import asyncio
|
| 2 |
import random
|
| 3 |
import re
|
| 4 |
-
import aiofiles
|
| 5 |
from curl_cffi.requests import AsyncSession
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
async def check_fragment(word: str, proxy_url: str = None) -> str:
|
| 8 |
url = f"https://fragment.com/username/{word}"
|
| 9 |
-
|
| 10 |
proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
await asyncio.sleep(random.uniform(1.5, 3.5))
|
| 19 |
-
continue
|
| 20 |
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
if "
|
| 38 |
-
|
| 39 |
-
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
return "SOLD"
|
| 50 |
-
return "UNAVAILABLE"
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
|
| 55 |
return "ERROR"
|
|
|
|
| 1 |
import asyncio
|
| 2 |
import random
|
| 3 |
import re
|
|
|
|
| 4 |
from curl_cffi.requests import AsyncSession
|
| 5 |
+
from logger import get_logger
|
| 6 |
+
|
| 7 |
+
log = get_logger()
|
| 8 |
|
| 9 |
async def check_fragment(word: str, proxy_url: str = None) -> str:
    """Fetch the fragment.com username page for *word* and classify its status.

    The page is requested up to three times. Rate-limit responses (HTTP 429
    or 403), anti-bot interstitials and request exceptions each consume one
    attempt and are followed by a short sleep before the next try.

    Args:
        word: Username to look up; interpolated into the page URL as-is.
        proxy_url: Optional proxy URL used for both HTTP and HTTPS traffic.
            When falsy, no proxy is configured.

    Returns:
        One of "UNAVAILABLE", "SOLD", "TAKEN", "ON_AUCTION", "AVAILABLE" or
        "FOR_SALE" when the page could be classified; the upper-cased status
        text when a status element was found but its text is not recognised;
        "ERROR" when the page could not be parsed, every attempt failed, or
        the session itself raised.
    """
    url = f"https://fragment.com/username/{word}"
    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None

    try:
        async with AsyncSession(impersonate="chrome120", proxies=proxies) as session:
            for attempt in range(3):
                try:
                    resp = await session.get(url, timeout=15, allow_redirects=True)

                    if resp.status_code in (429, 403):
                        log.warning(f"Rate limited on {word}. Retrying...")
                        await asyncio.sleep(random.uniform(2.0, 4.0))
                        continue

                    html = resp.text
                    # Lower-case once; several checks below are case-insensitive.
                    html_lower = html.lower()

                    # Anti-bot interstitial instead of the real page: back off and retry.
                    if "Just a moment..." in html or "cloudflare" in html_lower:
                        await asyncio.sleep(random.uniform(2, 4))
                        continue

                    # 1. CANONICAL CHECK (Soft-404 Banishment)
                    # A canonical link pointing away from /username/ means the
                    # response is not a username page at all.
                    canonical = re.search(
                        r'<link rel="canonical" href="([^"]+)"', html, re.IGNORECASE
                    )
                    if canonical and "fragment.com/username/" not in canonical.group(1):
                        return "UNAVAILABLE"

                    # 2. PRECISE STATUS EXTRACTION
                    status_match = re.search(
                        r'class="tm-section-header-status[^"]*"[^>]*>\s*([^<]+?)\s*</span>',
                        html,
                        re.IGNORECASE,
                    )

                    if status_match:
                        s = status_match.group(1).strip().lower()
                        # "unavailable" contains "available", so it must be
                        # tested first or it would be reported as AVAILABLE.
                        if "unavailable" in s:
                            return "UNAVAILABLE"
                        if "sold" in s:
                            return "SOLD"
                        if "taken" in s:
                            return "TAKEN"
                        if "auction" in s:
                            return "ON_AUCTION"
                        if "available" in s:
                            return "AVAILABLE"
                        if "sale" in s or "purchase" in s:
                            return "FOR_SALE"

                        # If it found a status span but we don't recognize the text
                        clean = s.upper()
                        log.warning(f"UNKNOWN STATUS TEXT for {word}: '{clean}'")
                        return clean

                    # 3. FALLBACK HTML PARSING (Fixing the False Unavailables)
                    if 'class="tm-status-taken"' in html:
                        return "TAKEN"

                    # 'tm-status-unavail' is used for BOTH Sold and Unavailable. We must differentiate.
                    if 'class="tm-status-unavail"' in html:
                        if ">sold<" in html_lower or "recently sold" in html_lower:
                            return "SOLD"
                        return "UNAVAILABLE"

                    # If absolutely no markers exist, it might be a new UI layout.
                    log.error(f"FAILED TO PARSE UI FOR: {word}. HTML snippet logged.")
                    # NOTE: synchronous append inside a coroutine; it blocks the
                    # event loop for the duration of the write.
                    with open("failed_parse.log", "a", encoding="utf-8") as f:
                        f.write(f"\n--- {word} ---\n{html[:1000]}\n")
                    return "ERROR"

                except Exception as e:
                    # Best-effort retry: log the failure and move to the next attempt.
                    log.error(f"Request error for {word} (Attempt {attempt+1}): {str(e)}")
                    await asyncio.sleep(1.5)

    except Exception as e:
        log.error(f"Session error for {word}: {str(e)}")

    # Reached when all attempts were exhausted or the session failed.
    return "ERROR"
|