Spaces:
Paused
Paused
Update parser.py
Browse files
parser.py
CHANGED
|
@@ -1,65 +1,42 @@
|
|
| 1 |
import re
|
| 2 |
-
from bs4 import BeautifulSoup
|
| 3 |
|
| 4 |
def parse_html(html: str, final_url: str, word: str) -> str:
|
| 5 |
"""
|
| 6 |
-
|
| 7 |
-
|
| 8 |
"""
|
| 9 |
if not html or not html.strip():
|
| 10 |
-
return
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
# Pre-process raw text buffers cleanly
|
| 15 |
-
full_text = " ".join(soup.get_text(separator=' ', strip=True).split()).lower()
|
| 16 |
word_clean = word.strip().replace("@", "").lower()
|
| 17 |
-
|
| 18 |
-
# ────────────────────────────────────────────────────
|
| 19 |
-
# APPROACH 1: DIRECT PROFILE RESOLUTION SIGNATURES
|
| 20 |
-
# ────────────────────────────────────────────────────
|
| 21 |
-
# Safely checks for structural components if the payload is a full profile DOM
|
| 22 |
-
if '/username/' in final_url.lower() and '?query=' not in final_url.lower():
|
| 23 |
-
status_tag = soup.find(class_='tm-section-header-status')
|
| 24 |
-
if status_tag:
|
| 25 |
-
t = status_tag.get_text(strip=True).lower()
|
| 26 |
-
if 'sold' in t: return 'SOLD'
|
| 27 |
-
if 'taken' in t: return 'TAKEN'
|
| 28 |
-
if 'available' in t: return 'AVAILABLE'
|
| 29 |
-
if 'on auction' in t or 'bidding' in t: return 'ON_AUCTION'
|
| 30 |
-
if 'sale' in t or 'purchase' in t: return 'FOR_SALE'
|
| 31 |
-
|
| 32 |
-
if 'is already taken' in full_text or 'make an offer' in full_text:
|
| 33 |
-
return 'TAKEN'
|
| 34 |
|
| 35 |
# ────────────────────────────────────────────────────
|
| 36 |
-
#
|
| 37 |
# ────────────────────────────────────────────────────
|
| 38 |
-
|
| 39 |
-
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
if
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
if 'available' in row_text: return 'AVAILABLE'
|
| 50 |
-
if 'sold' in row_text: return 'SOLD'
|
| 51 |
-
if 'taken' in row_text or 'make an offer' in row_text: return 'TAKEN'
|
| 52 |
-
if 'unavailable' in row_text: return 'UNAVAILABLE'
|
| 53 |
|
| 54 |
# ────────────────────────────────────────────────────
|
| 55 |
-
#
|
| 56 |
# ────────────────────────────────────────────────────
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
if "
|
| 60 |
-
if "sold" in
|
| 61 |
-
if "taken" in
|
| 62 |
-
if "
|
| 63 |
-
if "
|
|
|
|
| 64 |
|
| 65 |
-
return
|
|
|
|
| 1 |
import re
|
|
|
|
| 2 |
|
| 3 |
def parse_html(html: str, final_url: str, word: str) -> str:
    """Classify a username's status from an HTML (or AJAX fragment) payload.

    Two strategies are tried in order:

    1. Row match: find ``>@<word><`` in the markup and read the text of the
       next element whose ``class`` attribute contains ``status``.
    2. Global fallback: scan the whole lower-cased payload for known phrases.

    Args:
        html: Raw markup to inspect.
        final_url: Not used by this implementation; kept so existing callers
            that pass it keep working.
        word: Username to look up. Surrounding whitespace and every ``@`` are
            removed and the result is lower-cased before matching.

    Returns:
        ``"ON_AUCTION"``, ``"SOLD"``, ``"UNAVAILABLE"``, ``"TAKEN"``,
        ``"FOR_SALE"`` or ``"AVAILABLE"``. When the row match finds a status
        text that contains none of the known keywords, that text is returned
        upper-cased as-is. An empty or whitespace-only payload yields
        ``"AVAILABLE"``; a payload with no recognizable phrase yields
        ``"UNAVAILABLE"``.
    """
    if not html or not html.strip():
        # NOTE(review): an empty payload is reported as AVAILABLE, while a
        # non-empty payload with no recognizable phrase is UNAVAILABLE.
        # Confirm with the caller that this asymmetry is intended.
        return "AVAILABLE"

    # Collapse every run of whitespace (newlines, tabs) into a single space so
    # the regex below never has to cross a line break.
    clean_html = " ".join(html.split())
    word_clean = word.strip().replace("@", "").lower()

    # ----------------------------------------------------
    # ENGINE 1: EXACT ROW PATTERN VALIDATION
    # ----------------------------------------------------
    # re.escape keeps regex metacharacters in the username literal: without it
    # "a.b" would also match "@axb", and "a(b" would raise re.error.
    row_pattern = re.compile(
        rf'>@{re.escape(word_clean)}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*<',
        re.IGNORECASE,
    )
    match = row_pattern.search(clean_html)

    if match:
        status_text = match.group(1).strip().lower()
        # Order matters: "unavailable" must be tested before anything that
        # could match a broader keyword, and the first hit wins.
        row_rules = (
            (("auction", "bidding"), "ON_AUCTION"),
            (("sold",), "SOLD"),
            (("unavailable",), "UNAVAILABLE"),
            (("taken", "offer"), "TAKEN"),
            (("sale", "purchase"), "FOR_SALE"),
        )
        for keywords, label in row_rules:
            if any(keyword in status_text for keyword in keywords):
                return label
        # Unknown wording: surface the raw status text, upper-cased.
        return status_text.upper()

    # ----------------------------------------------------
    # ENGINE 2: GLOBAL TEXT BOUNDARY FALLBACK SCANS
    # ----------------------------------------------------
    fallback_text = clean_html.lower()

    # "unavailable" is checked before "available" because the latter is a
    # substring of the former.
    fallback_rules = (
        (("on auction",), "ON_AUCTION"),
        (("sold for", "recently sold"), "SOLD"),
        (("taken", "make an offer"), "TAKEN"),
        (("for sale", "purchase"), "FOR_SALE"),
        (("unavailable",), "UNAVAILABLE"),
        (("available",), "AVAILABLE"),
    )
    for phrases, label in fallback_rules:
        if any(phrase in fallback_text for phrase in phrases):
            return label

    return "UNAVAILABLE"
|