Spaces:
Paused
Paused
Update parser.py
#1
by UNUSUALxd - opened
parser.py
CHANGED
|
@@ -1,26 +1,32 @@
|
|
| 1 |
import re
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
def parse_html(html: str, final_url: str, word: str) -> str:
|
| 4 |
"""
|
| 5 |
High-Performance Text Boundary Token Mapper optimized for reading
|
| 6 |
-
direct username landing pages and search
|
| 7 |
"""
|
| 8 |
if not html or not html.strip():
|
|
|
|
| 9 |
return "AVAILABLE"
|
| 10 |
|
| 11 |
-
# Flatten trailing breaks and spaces into
|
| 12 |
clean_html = " ".join(html.split())
|
| 13 |
word_clean = word.strip().replace("@", "").lower()
|
| 14 |
final_url_lower = final_url.lower()
|
| 15 |
|
| 16 |
# ────────────────────────────────────────────────────
|
| 17 |
-
# ENGINE 1: DIRECT DOM
|
| 18 |
# ────────────────────────────────────────────────────
|
| 19 |
if '/username/' in final_url_lower and '?query=' not in final_url_lower:
|
| 20 |
-
|
|
|
|
| 21 |
status_match = re.search(r'class="tm-section-header-status[^"]*"[^>]*>\s*([^<]+?)\s*</span>', clean_html, re.IGNORECASE)
|
| 22 |
if status_match:
|
| 23 |
s = status_match.group(1).strip().lower()
|
|
|
|
| 24 |
if "sold" in s: return "SOLD"
|
| 25 |
if "taken" in s: return "TAKEN"
|
| 26 |
if "auction" in s: return "ON_AUCTION"
|
|
@@ -28,24 +34,31 @@ def parse_html(html: str, final_url: str, word: str) -> str:
|
|
| 28 |
if "sale" in s or "purchase" in s: return "FOR_SALE"
|
| 29 |
|
| 30 |
if 'is already taken' in clean_html.lower() or 'make an offer' in clean_html.lower():
|
|
|
|
| 31 |
return "TAKEN"
|
| 32 |
|
| 33 |
# ────────────────────────────────────────────────────
|
| 34 |
-
# ENGINE 2: REDIRECTED SEARCH RESULTS
|
| 35 |
# ────────────────────────────────────────────────────
|
|
|
|
| 36 |
search_regex = re.compile(rf'>@{word_clean}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*<', re.IGNORECASE)
|
| 37 |
match = search_regex.search(clean_html)
|
| 38 |
|
| 39 |
if match:
|
| 40 |
s = match.group(1).strip().lower()
|
|
|
|
| 41 |
if "auction" in s or "bidding" in s: return "ON_AUCTION"
|
| 42 |
if "sold" in s: return "SOLD"
|
| 43 |
if "unavailable" in s: return "UNAVAILABLE"
|
| 44 |
if "taken" in s or "offer" in s: return "TAKEN"
|
| 45 |
if "sale" in s or "purchase" in s: return "FOR_SALE"
|
| 46 |
|
| 47 |
-
#
|
|
|
|
|
|
|
|
|
|
| 48 |
fallback_text = clean_html.lower()
|
|
|
|
| 49 |
if "on auction" in fallback_text: return "ON_AUCTION"
|
| 50 |
if "sold for" in fallback_text or "recently sold" in fallback_text: return "SOLD"
|
| 51 |
if "taken" in fallback_text or "make an offer" in fallback_text: return "TAKEN"
|
|
|
|
| 1 |
import re
|
| 2 |
+
from logger import get_logger
|
| 3 |
+
|
| 4 |
+
log = get_logger()
|
| 5 |
|
| 6 |
def parse_html(html: str, final_url: str, word: str) -> str:
|
| 7 |
"""
|
| 8 |
High-Performance Text Boundary Token Mapper optimized for reading
|
| 9 |
+
direct username landing pages and search loop arrays cleanly.
|
| 10 |
"""
|
| 11 |
if not html or not html.strip():
|
| 12 |
+
log.warning(f"๐ Empty text payload passed to evaluation parser engine for target: {word}")
|
| 13 |
return "AVAILABLE"
|
| 14 |
|
| 15 |
+
# Flatten trailing breaks, tabs, and layout spaces into single line strings
|
| 16 |
clean_html = " ".join(html.split())
|
| 17 |
word_clean = word.strip().replace("@", "").lower()
|
| 18 |
final_url_lower = final_url.lower()
|
| 19 |
|
| 20 |
# ────────────────────────────────────────────────────
|
| 21 |
+
# ENGINE 1: DIRECT DOM PROFILE SCANNING
|
| 22 |
# ────────────────────────────────────────────────────
|
| 23 |
if '/username/' in final_url_lower and '?query=' not in final_url_lower:
|
| 24 |
+
log.info(f"๐ Converted Parser executing Profile Mode parsing path for word: {word_clean}")
|
| 25 |
+
|
| 26 |
status_match = re.search(r'class="tm-section-header-status[^"]*"[^>]*>\s*([^<]+?)\s*</span>', clean_html, re.IGNORECASE)
|
| 27 |
if status_match:
|
| 28 |
s = status_match.group(1).strip().lower()
|
| 29 |
+
log.info(f"๐ฎ Direct Class Marker found for '{word_clean}': '{s}'")
|
| 30 |
if "sold" in s: return "SOLD"
|
| 31 |
if "taken" in s: return "TAKEN"
|
| 32 |
if "auction" in s: return "ON_AUCTION"
|
|
|
|
| 34 |
if "sale" in s or "purchase" in s: return "FOR_SALE"
|
| 35 |
|
| 36 |
if 'is already taken' in clean_html.lower() or 'make an offer' in clean_html.lower():
|
| 37 |
+
log.info(f"๐ฎ Found fallback text marker 'taken/offer' for: {word_clean}")
|
| 38 |
return "TAKEN"
|
| 39 |
|
| 40 |
# ────────────────────────────────────────────────────
|
| 41 |
+
# ENGINE 2: REDIRECTED SEARCH RESULTS ROW LOOKUP
|
| 42 |
# ────────────────────────────────────────────────────
|
| 43 |
+
log.info(f"๐ Converted Parser executing Grid Mode search loop path for word: {word_clean}")
|
| 44 |
search_regex = re.compile(rf'>@{word_clean}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*<', re.IGNORECASE)
|
| 45 |
match = search_regex.search(clean_html)
|
| 46 |
|
| 47 |
if match:
|
| 48 |
s = match.group(1).strip().lower()
|
| 49 |
+
log.info(f"๐ฎ Search Grid Table status mapped for '{word_clean}': '{s}'")
|
| 50 |
if "auction" in s or "bidding" in s: return "ON_AUCTION"
|
| 51 |
if "sold" in s: return "SOLD"
|
| 52 |
if "unavailable" in s: return "UNAVAILABLE"
|
| 53 |
if "taken" in s or "offer" in s: return "TAKEN"
|
| 54 |
if "sale" in s or "purchase" in s: return "FOR_SALE"
|
| 55 |
|
| 56 |
+
# ────────────────────────────────────────────────────
|
| 57 |
+
# ENGINE 3: GLOBAL LAYOUT REGEX FALLBACK SCANS
|
| 58 |
+
# ────────────────────────────────────────────────────
|
| 59 |
+
log.info(f"โ ๏ธ Row boundaries missed for '{word_clean}'. Executing global text scans...")
|
| 60 |
fallback_text = clean_html.lower()
|
| 61 |
+
|
| 62 |
if "on auction" in fallback_text: return "ON_AUCTION"
|
| 63 |
if "sold for" in fallback_text or "recently sold" in fallback_text: return "SOLD"
|
| 64 |
if "taken" in fallback_text or "make an offer" in fallback_text: return "TAKEN"
|