Spaces:

Muttered3
/

Dvs

Paused

App Files Files Community

Muttered3 committed 10 days ago

Commit

02a8ebc

verified ·

1 Parent(s): e2fa3ac

Update parser.py

Browse files

Files changed (1) hide show

parser.py +26 -49

parser.py CHANGED Viewed

@@ -1,65 +1,42 @@
 import re
-from bs4 import BeautifulSoup
 def parse_html(html: str, final_url: str, word: str) -> str:
     """Classify the status of a username from an HTML payload.

     Three strategies are tried in order:

     1. If ``final_url`` looks like a direct profile page (contains
        ``/username/`` and no ``?query=``), read the element with class
        ``tm-section-header-status``.
     2. Scan row-like elements whose text mentions ``word`` and map the
        row text to a status.
     3. Fall back to keyword matching over the whole page text.

     Args:
         html: Raw HTML (full page or partial response).
         final_url: URL the HTML was fetched from; only used to detect the
             direct-profile case.
         word: Username to look for; a leading ``@`` and surrounding
             whitespace are ignored, comparison is case-insensitive.

     Returns:
         One of ``'AVAILABLE'``, ``'UNAVAILABLE'``, ``'SOLD'``, ``'TAKEN'``,
         ``'ON_AUCTION'``, ``'FOR_SALE'`` or ``'UNCERTAIN'``. Empty or
         whitespace-only ``html`` yields ``'AVAILABLE'``.
     """
     if not html or not html.strip():
         return 'AVAILABLE'

     soup = BeautifulSoup(html, 'html.parser')
     # Collapse all whitespace runs so multi-word phrases match reliably.
     full_text = " ".join(soup.get_text(separator=' ', strip=True).split()).lower()
     word_clean = word.strip().replace("@", "").lower()

     # --- Strategy 1: direct profile page -----------------------------------
     url_lower = final_url.lower()
     if '/username/' in url_lower and '?query=' not in url_lower:
         status_tag = soup.find(class_='tm-section-header-status')
         if status_tag:
             t = status_tag.get_text(strip=True).lower()
             if 'sold' in t:
                 return 'SOLD'
             if 'taken' in t:
                 return 'TAKEN'
             # "unavailable" contains "available", so it must be tested first.
             if 'unavailable' in t:
                 return 'UNAVAILABLE'
             if 'available' in t:
                 return 'AVAILABLE'
             if 'on auction' in t or 'bidding' in t:
                 return 'ON_AUCTION'
             if 'sale' in t or 'purchase' in t:
                 return 'FOR_SALE'
         if 'is already taken' in full_text or 'make an offer' in full_text:
             return 'TAKEN'

     # --- Strategy 2: row-level scan ----------------------------------------
     row_class = re.compile(r'row|tm-table-row|tm-row', re.IGNORECASE)
     rows = soup.find_all(['tr', 'div', 'a'], class_=row_class) or soup.find_all('tr')
     for row in rows:
         row_text = " ".join(row.get_text(separator=' ', strip=True).split()).lower()
         # A match on "@word" implies a match on "word", so one test suffices.
         if word_clean not in row_text:
             continue
         if 'on auction' in row_text or 'bidding' in row_text:
             return 'ON_AUCTION'
         if 'for sale' in row_text and 'not for sale' not in row_text:
             return 'FOR_SALE'
         # Test "unavailable" before "available": the latter is a substring
         # of the former, which previously made this branch unreachable.
         if 'unavailable' in row_text:
             return 'UNAVAILABLE'
         if 'available' in row_text:
             return 'AVAILABLE'
         if 'sold' in row_text:
             return 'SOLD'
         if 'taken' in row_text or 'make an offer' in row_text:
             return 'TAKEN'

     # --- Strategy 3: whole-page keyword fallback ---------------------------
     if "on auction" in full_text:
         return 'ON_AUCTION'
     if "for sale" in full_text and "not for sale" not in full_text:
         return 'FOR_SALE'
     if "sold" in full_text:
         return 'SOLD'
     if "taken" in full_text or "make an offer" in full_text:
         return 'TAKEN'
     if "unavailable" in full_text:
         return 'UNAVAILABLE'
     if "available" in full_text:
         return 'AVAILABLE'
     return 'UNCERTAIN'

 import re
 def parse_html(html: str, final_url: str, word: str) -> str:
     """Classify the status of a username from raw HTML using text matching.

     Two strategies are tried in order:

     1. Locate ``>@<word><`` in the markup and read the text of the first
        following element whose ``class`` attribute contains ``status``.
     2. Fall back to keyword matching over the whole (lower-cased) markup.

     Args:
         html: Raw HTML (full page or partial response).
         final_url: Accepted for interface compatibility; not used here.
         word: Username to look for; a leading ``@`` and surrounding
             whitespace are ignored, matching is case-insensitive.

     Returns:
         ``"ON_AUCTION"``, ``"SOLD"``, ``"UNAVAILABLE"``, ``"TAKEN"``,
         ``"FOR_SALE"`` or ``"AVAILABLE"``. When a status element is found
         but its text matches no known keyword, that text is returned
         upper-cased. Empty or whitespace-only ``html`` yields
         ``"AVAILABLE"``; when nothing matches at all the result is
         ``"UNAVAILABLE"``.
     """
     if not html or not html.strip():
         return "AVAILABLE"

     # Collapse newlines/tabs so the non-DOTALL ".*?" below can span what were
     # originally separate lines.
     clean_html = " ".join(html.split())
     word_clean = word.strip().replace("@", "").lower()

     # --- Engine 1: status cell that follows the exact handle ---------------
     # The handle is caller-supplied text, so escape it: an unescaped "." or
     # "(" would either match the wrong handle or raise re.error.
     search_regex = re.compile(
         rf'>@{re.escape(word_clean)}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*<',
         re.IGNORECASE,
     )
     match = search_regex.search(clean_html)
     if match:
         status_text = match.group(1).strip().lower()
         if "auction" in status_text or "bidding" in status_text:
             return "ON_AUCTION"
         if "sold" in status_text:
             return "SOLD"
         if "unavailable" in status_text:
             return "UNAVAILABLE"
         if "taken" in status_text or "offer" in status_text:
             return "TAKEN"
         if "sale" in status_text or "purchase" in status_text:
             return "FOR_SALE"
         # Unknown status label: surface it verbatim (upper-cased).
         return status_text.upper()

     # --- Engine 2: whole-document keyword fallback -------------------------
     fallback_text = clean_html.lower()
     if "on auction" in fallback_text:
         return "ON_AUCTION"
     if "sold for" in fallback_text or "recently sold" in fallback_text:
         return "SOLD"
     if "taken" in fallback_text or "make an offer" in fallback_text:
         return "TAKEN"
     # "not for sale" contains "for sale"; exclude it so a negative statement
     # is not reported as FOR_SALE.
     is_for_sale = "for sale" in fallback_text and "not for sale" not in fallback_text
     if is_for_sale or "purchase" in fallback_text:
         return "FOR_SALE"
     # "unavailable" must be tested before its substring "available".
     if "unavailable" in fallback_text:
         return "UNAVAILABLE"
     if "available" in fallback_text:
         return "AVAILABLE"
     return "UNAVAILABLE"