Spaces:
Paused
Paused
Update scraper.py
Browse files
- scraper.py +24 -28
scraper.py
CHANGED
|
@@ -1,28 +1,35 @@
|
|
| 1 |
import asyncio
|
| 2 |
import random
|
| 3 |
import re
|
| 4 |
-
|
| 5 |
from logger import get_logger
|
| 6 |
|
| 7 |
log = get_logger()
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
async def check_fragment(word: str, proxy_url: str = None) -> str:
|
| 10 |
url = f"https://fragment.com/username/{word}"
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
try:
|
| 14 |
-
async with AsyncSession(impersonate="chrome120", proxies=proxies) as session:
|
| 15 |
-
for attempt in range(3):
|
| 16 |
-
try:
|
| 17 |
-
resp = await session.get(url, timeout=15, allow_redirects=True)
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
await asyncio.sleep(random.uniform(2.0, 4.0))
|
| 22 |
continue
|
| 23 |
|
| 24 |
-
html = resp.text
|
| 25 |
|
|
|
|
| 26 |
if "Just a moment..." in html or "cloudflare" in html.lower():
|
| 27 |
await asyncio.sleep(random.uniform(2, 4))
|
| 28 |
continue
|
|
@@ -42,32 +49,21 @@ async def check_fragment(word: str, proxy_url: str = None) -> str:
|
|
| 42 |
if "auction" in s: return "ON_AUCTION"
|
| 43 |
if "available" in s: return "AVAILABLE"
|
| 44 |
if "sale" in s or "purchase" in s: return "FOR_SALE"
|
|
|
|
| 45 |
|
| 46 |
-
|
| 47 |
-
clean = s.upper()
|
| 48 |
-
log.warning(f"UNKNOWN STATUS TEXT for {word}: '{clean}'")
|
| 49 |
-
return clean
|
| 50 |
-
|
| 51 |
-
# 3. FALLBACK HTML PARSING (Fixing the False Unavailables)
|
| 52 |
if 'class="tm-status-taken"' in html: return "TAKEN"
|
| 53 |
|
| 54 |
-
# 'tm-status-unavail' is used for BOTH Sold and Unavailable. We must differentiate.
|
| 55 |
if 'class="tm-status-unavail"' in html:
|
| 56 |
if ">Sold<" in html or ">sold<" in html.lower() or "recently sold" in html.lower():
|
| 57 |
return "SOLD"
|
| 58 |
return "UNAVAILABLE"
|
| 59 |
|
| 60 |
-
# If absolutely no markers exist, it might be a new UI layout.
|
| 61 |
-
log.error(f"FAILED TO PARSE UI FOR: {word}. HTML snippet logged.")
|
| 62 |
-
with open("failed_parse.log", "a", encoding="utf-8") as f:
|
| 63 |
-
f.write(f"\n--- {word} ---\n{html[:1000]}\n")
|
| 64 |
return "ERROR"
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
except Exception as e:
|
| 71 |
-
log.error(f"Session error for {word}: {str(e)}")
|
| 72 |
|
| 73 |
return "ERROR"
|
|
|
|
| 1 |
import asyncio
|
| 2 |
import random
|
| 3 |
import re
|
| 4 |
+
import aiohttp
|
| 5 |
from logger import get_logger
|
| 6 |
|
| 7 |
log = get_logger()
|
| 8 |
|
| 9 |
+
USER_AGENTS = [
|
| 10 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124.0 Safari/537.36",
|
| 11 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 Safari/605.1.15",
|
| 12 |
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/123.0 Safari/537.36"
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
async def check_fragment(word: str, proxy_url: str = None) -> str:
|
| 16 |
url = f"https://fragment.com/username/{word}"
|
| 17 |
+
headers = {"User-Agent": random.choice(USER_AGENTS)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
for attempt in range(3):
|
| 20 |
+
try:
|
| 21 |
+
# We open a fresh session per attempt to prevent dead proxies from leaking connections
|
| 22 |
+
timeout = aiohttp.ClientTimeout(total=15)
|
| 23 |
+
async with aiohttp.ClientSession(timeout=timeout) as session:
|
| 24 |
+
async with session.get(url, headers=headers, proxy=proxy_url, allow_redirects=True) as resp:
|
| 25 |
+
|
| 26 |
+
if resp.status in [429, 403]:
|
| 27 |
await asyncio.sleep(random.uniform(2.0, 4.0))
|
| 28 |
continue
|
| 29 |
|
| 30 |
+
html = await resp.text()
|
| 31 |
|
| 32 |
+
# Cloudflare block
|
| 33 |
if "Just a moment..." in html or "cloudflare" in html.lower():
|
| 34 |
await asyncio.sleep(random.uniform(2, 4))
|
| 35 |
continue
|
|
|
|
| 49 |
if "auction" in s: return "ON_AUCTION"
|
| 50 |
if "available" in s: return "AVAILABLE"
|
| 51 |
if "sale" in s or "purchase" in s: return "FOR_SALE"
|
| 52 |
+
return s.upper()
|
| 53 |
|
| 54 |
+
# 3. FALLBACK HTML PARSING
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
if 'class="tm-status-taken"' in html: return "TAKEN"
|
| 56 |
|
|
|
|
| 57 |
if 'class="tm-status-unavail"' in html:
|
| 58 |
if ">Sold<" in html or ">sold<" in html.lower() or "recently sold" in html.lower():
|
| 59 |
return "SOLD"
|
| 60 |
return "UNAVAILABLE"
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
return "ERROR"
|
| 63 |
|
| 64 |
+
except Exception as e:
|
| 65 |
+
# This logs the specific aiohttp timeout/connection error so you can see if a proxy dies
|
| 66 |
+
log.error(f"Request error for {word} via {proxy_url}: {str(e)}")
|
| 67 |
+
await asyncio.sleep(1.5)
|
|
|
|
|
|
|
| 68 |
|
| 69 |
return "ERROR"
|