Spaces:
Paused
Paused
Update parser.py
Browse files
parser.py
CHANGED
|
@@ -1,33 +1,65 @@
|
|
|
|
|
| 1 |
from bs4 import BeautifulSoup
|
| 2 |
|
| 3 |
def parse_html(html: str, final_url: str, word: str) -> str:
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
status_tag = soup.find(class_='tm-section-header-status')
|
| 10 |
if status_tag:
|
| 11 |
t = status_tag.get_text(strip=True).lower()
|
| 12 |
if 'sold' in t: return 'SOLD'
|
| 13 |
if 'taken' in t: return 'TAKEN'
|
| 14 |
if 'available' in t: return 'AVAILABLE'
|
| 15 |
-
if 'on auction' in t: return 'ON_AUCTION'
|
| 16 |
-
if '
|
| 17 |
-
|
| 18 |
-
if 'is already taken' in full_text:
|
| 19 |
return 'TAKEN'
|
| 20 |
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
for row in rows:
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
if '
|
| 29 |
-
if '
|
| 30 |
-
if '
|
| 31 |
-
if '
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
return 'UNCERTAIN'
|
|
|
|
| 1 |
+
import re
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
|
| 4 |
def parse_html(html: str, final_url: str, word: str) -> str:
    """Classify the status of ``word`` from a fetched HTML payload.

    Three strategies are tried in order; the first one that yields a
    status wins:

    1. Profile page -- only when ``final_url`` contains ``/username/`` and
       no ``?query=``: read the ``tm-section-header-status`` element, then
       fall back to page-wide "taken" phrases.
    2. Result rows -- scan row-like elements whose text mentions ``word``
       and classify by the status keywords found in that row.
    3. Page-wide keyword scan over all visible text.

    Args:
        html: Raw HTML (full page or fragment). Empty or whitespace-only
            input is reported as ``'AVAILABLE'``.
        final_url: URL the payload was served from; compared
            case-insensitively to decide whether strategy 1 applies.
        word: Name to look for. Surrounding whitespace and every ``@`` are
            removed, and matching is case-insensitive.

    Returns:
        One of ``'SOLD'``, ``'TAKEN'``, ``'AVAILABLE'``, ``'UNAVAILABLE'``,
        ``'ON_AUCTION'``, ``'FOR_SALE'`` or ``'UNCERTAIN'`` (no strategy
        matched).
    """
    if not html or not html.strip():
        return 'AVAILABLE'

    soup = BeautifulSoup(html, 'html.parser')

    # Visible text with all whitespace runs collapsed to single spaces.
    full_text = " ".join(soup.get_text(separator=' ', strip=True).split()).lower()
    word_clean = word.strip().replace("@", "").lower()

    # ------------------------------------------------------------------
    # Strategy 1: direct profile page
    # ------------------------------------------------------------------
    # NOTE(review): nesting below was reconstructed from a diff view that
    # dropped indentation; the page-wide "taken" fallback is assumed to
    # belong to the profile branch -- confirm against the real file.
    url_lower = final_url.lower()
    if '/username/' in url_lower and '?query=' not in url_lower:
        status_tag = soup.find(class_='tm-section-header-status')
        if status_tag:
            status_text = status_tag.get_text(strip=True).lower()
            if 'sold' in status_text:
                return 'SOLD'
            if 'taken' in status_text:
                return 'TAKEN'
            # "available" is a substring of "unavailable", so the longer
            # word must be tested first.
            if 'unavailable' in status_text:
                return 'UNAVAILABLE'
            if 'available' in status_text:
                return 'AVAILABLE'
            if 'on auction' in status_text or 'bidding' in status_text:
                return 'ON_AUCTION'
            if 'sale' in status_text or 'purchase' in status_text:
                return 'FOR_SALE'

        if 'is already taken' in full_text or 'make an offer' in full_text:
            return 'TAKEN'

    # ------------------------------------------------------------------
    # Strategy 2: result rows
    # ------------------------------------------------------------------
    # A class containing "row" anywhere (case-insensitive) qualifies; this
    # already covers "tm-table-row" and "tm-row". If no such element
    # exists, fall back to every <tr>.
    row_class = re.compile(r'row', re.IGNORECASE)
    rows = soup.find_all(['tr', 'div', 'a'], class_=row_class) or soup.find_all('tr')

    # An empty search word would be "contained" in every row and make the
    # first row win regardless of content, so skip row matching entirely.
    if word_clean:
        for row in rows:
            row_text = " ".join(row.get_text(separator=' ', strip=True).split()).lower()
            # Plain containment also matches the "@word" form.
            if word_clean not in row_text:
                continue

            if 'on auction' in row_text or 'bidding' in row_text:
                return 'ON_AUCTION'
            if 'for sale' in row_text and 'not for sale' not in row_text:
                return 'FOR_SALE'
            # Must precede the "available" test (substring clash).
            if 'unavailable' in row_text:
                return 'UNAVAILABLE'
            if 'available' in row_text:
                return 'AVAILABLE'
            if 'sold' in row_text:
                return 'SOLD'
            if 'taken' in row_text or 'make an offer' in row_text:
                return 'TAKEN'

    # ------------------------------------------------------------------
    # Strategy 3: page-wide keyword fallback
    # ------------------------------------------------------------------
    if "on auction" in full_text:
        return 'ON_AUCTION'
    if "for sale" in full_text and "not for sale" not in full_text:
        return 'FOR_SALE'
    if "sold" in full_text:
        return 'SOLD'
    if "taken" in full_text or "make an offer" in full_text:
        return 'TAKEN'
    if "unavailable" in full_text:
        return 'UNAVAILABLE'
    if "available" in full_text:
        return 'AVAILABLE'

    return 'UNCERTAIN'
|