UNUSUALxd committed on
Commit
fa97c3c
·
verified ·
1 Parent(s): 1b2a7fe

Update parser.py

Browse files
Files changed (1) hide show
  1. parser.py +19 -6
parser.py CHANGED
@@ -1,26 +1,32 @@
1
  import re
 
 
 
2
 
3
  def parse_html(html: str, final_url: str, word: str) -> str:
4
  """
5
  High-Performance Text Boundary Token Mapper optimized for reading
6
- direct username landing pages and search loops cleanly.
7
  """
8
  if not html or not html.strip():
 
9
  return "AVAILABLE"
10
 
11
- # Flatten trailing breaks and spaces into clean strings
12
  clean_html = " ".join(html.split())
13
  word_clean = word.strip().replace("@", "").lower()
14
  final_url_lower = final_url.lower()
15
 
16
  # โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
17
- # ENGINE 1: DIRECT DOM VIEW REACTION
18
  # โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
19
  if '/username/' in final_url_lower and '?query=' not in final_url_lower:
20
- # Isolate class definitions safely without heavy BeautifulSoup dependency overhead
 
21
  status_match = re.search(r'class="tm-section-header-status[^"]*"[^>]*>\s*([^<]+?)\s*</span>', clean_html, re.IGNORECASE)
22
  if status_match:
23
  s = status_match.group(1).strip().lower()
 
24
  if "sold" in s: return "SOLD"
25
  if "taken" in s: return "TAKEN"
26
  if "auction" in s: return "ON_AUCTION"
@@ -28,24 +34,31 @@ def parse_html(html: str, final_url: str, word: str) -> str:
28
  if "sale" in s or "purchase" in s: return "FOR_SALE"
29
 
30
  if 'is already taken' in clean_html.lower() or 'make an offer' in clean_html.lower():
 
31
  return "TAKEN"
32
 
33
  # โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
34
- # ENGINE 2: REDIRECTED SEARCH RESULTS TABLE ROW LOOKUP
35
  # โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
 
36
  search_regex = re.compile(rf'>@{word_clean}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*<', re.IGNORECASE)
37
  match = search_regex.search(clean_html)
38
 
39
  if match:
40
  s = match.group(1).strip().lower()
 
41
  if "auction" in s or "bidding" in s: return "ON_AUCTION"
42
  if "sold" in s: return "SOLD"
43
  if "unavailable" in s: return "UNAVAILABLE"
44
  if "taken" in s or "offer" in s: return "TAKEN"
45
  if "sale" in s or "purchase" in s: return "FOR_SALE"
46
 
47
- # Global text layout backup verification block scans
 
 
 
48
  fallback_text = clean_html.lower()
 
49
  if "on auction" in fallback_text: return "ON_AUCTION"
50
  if "sold for" in fallback_text or "recently sold" in fallback_text: return "SOLD"
51
  if "taken" in fallback_text or "make an offer" in fallback_text: return "TAKEN"
 
1
  import re
2
+ from logger import get_logger
3
+
4
+ log = get_logger()
5
 
6
  def parse_html(html: str, final_url: str, word: str) -> str:
7
  """
8
  High-Performance Text Boundary Token Mapper optimized for reading
9
+ direct username landing pages and search loop arrays cleanly.
10
  """
11
  if not html or not html.strip():
12
+ log.warning(f"๐Ÿ” Empty text payload passed to evaluation parser engine for target: {word}")
13
  return "AVAILABLE"
14
 
15
+ # Flatten trailing breaks, tabs, and layout spaces into single line strings
16
  clean_html = " ".join(html.split())
17
  word_clean = word.strip().replace("@", "").lower()
18
  final_url_lower = final_url.lower()
19
 
20
  # โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
21
+ # ENGINE 1: DIRECT DOM PROFILE SCANNING
22
  # โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
23
  if '/username/' in final_url_lower and '?query=' not in final_url_lower:
24
+ log.info(f"๐Ÿ”Ž Converted Parser executing Profile Mode parsing path for word: {word_clean}")
25
+
26
  status_match = re.search(r'class="tm-section-header-status[^"]*"[^>]*>\s*([^<]+?)\s*</span>', clean_html, re.IGNORECASE)
27
  if status_match:
28
  s = status_match.group(1).strip().lower()
29
+ log.info(f"๐Ÿ”ฎ Direct Class Marker found for '{word_clean}': '{s}'")
30
  if "sold" in s: return "SOLD"
31
  if "taken" in s: return "TAKEN"
32
  if "auction" in s: return "ON_AUCTION"
 
34
  if "sale" in s or "purchase" in s: return "FOR_SALE"
35
 
36
  if 'is already taken' in clean_html.lower() or 'make an offer' in clean_html.lower():
37
+ log.info(f"๐Ÿ”ฎ Found fallback text marker 'taken/offer' for: {word_clean}")
38
  return "TAKEN"
39
 
40
  # โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
41
+ # ENGINE 2: REDIRECTED SEARCH RESULTS ROW LOOKUP
42
  # โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
43
+ log.info(f"๐Ÿ”Ž Converted Parser executing Grid Mode search loop path for word: {word_clean}")
44
  search_regex = re.compile(rf'>@{word_clean}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*<', re.IGNORECASE)
45
  match = search_regex.search(clean_html)
46
 
47
  if match:
48
  s = match.group(1).strip().lower()
49
+ log.info(f"๐Ÿ”ฎ Search Grid Table status mapped for '{word_clean}': '{s}'")
50
  if "auction" in s or "bidding" in s: return "ON_AUCTION"
51
  if "sold" in s: return "SOLD"
52
  if "unavailable" in s: return "UNAVAILABLE"
53
  if "taken" in s or "offer" in s: return "TAKEN"
54
  if "sale" in s or "purchase" in s: return "FOR_SALE"
55
 
56
+ # โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
57
+ # ENGINE 3: GLOBAL LAYOUT REGEX FALLBACK SCANS
58
+ # โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
59
+ log.info(f"โš ๏ธ Row boundaries missed for '{word_clean}'. Executing global text scans...")
60
  fallback_text = clean_html.lower()
61
+
62
  if "on auction" in fallback_text: return "ON_AUCTION"
63
  if "sold for" in fallback_text or "recently sold" in fallback_text: return "SOLD"
64
  if "taken" in fallback_text or "make an offer" in fallback_text: return "TAKEN"