Muttered3 committed on
Commit
02a8ebc
·
verified ·
1 Parent(s): e2fa3ac

Update parser.py

Browse files
Files changed (1) hide show
  1. parser.py +26 -49
parser.py CHANGED
@@ -1,65 +1,42 @@
1
  import re
2
- from bs4 import BeautifulSoup
3
 
4
  def parse_html(html: str, final_url: str, word: str) -> str:
5
  """
6
- Highly resilient HTML payload engine optimized for processing
7
- both direct page DOM structures and AJAX search response grids safely.
8
  """
9
  if not html or not html.strip():
10
- return 'AVAILABLE'
11
 
12
- soup = BeautifulSoup(html, 'html.parser')
13
-
14
- # Pre-process raw text buffers cleanly
15
- full_text = " ".join(soup.get_text(separator=' ', strip=True).split()).lower()
16
  word_clean = word.strip().replace("@", "").lower()
17
-
18
- # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
19
- # APPROACH 1: DIRECT PROFILE RESOLUTION SIGNATURES
20
- # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
21
- # Safely checks for structural components if the payload is a full profile DOM
22
- if '/username/' in final_url.lower() and '?query=' not in final_url.lower():
23
- status_tag = soup.find(class_='tm-section-header-status')
24
- if status_tag:
25
- t = status_tag.get_text(strip=True).lower()
26
- if 'sold' in t: return 'SOLD'
27
- if 'taken' in t: return 'TAKEN'
28
- if 'available' in t: return 'AVAILABLE'
29
- if 'on auction' in t or 'bidding' in t: return 'ON_AUCTION'
30
- if 'sale' in t or 'purchase' in t: return 'FOR_SALE'
31
-
32
- if 'is already taken' in full_text or 'make an offer' in full_text:
33
- return 'TAKEN'
34
 
35
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
36
- # APPROACH 2: AJAX DATA GRID / TABLE ROW INTERCEPTIONS
37
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
38
- # Iterate through all functional rows or element dividers cleanly
39
- rows = soup.find_all(['tr', 'div', 'a'], class_=re.compile(re.escape('row') + r'|tm-table-row|tm-row', re.IGNORECASE)) or soup.find_all('tr')
40
 
41
- for row in rows:
42
- row_text = " ".join(row.get_text(separator=' ', strip=True).split()).lower()
43
-
44
- # Performance check mapping boundaries cleanly using exact string containment
45
- if f"@{word_clean}" in row_text or word_clean in row_text:
46
- # Check explicit validation statuses matching your matrix boundaries
47
- if 'on auction' in row_text or 'bidding' in row_text: return 'ON_AUCTION'
48
- if 'for sale' in row_text and 'not for sale' not in row_text: return 'FOR_SALE'
49
- if 'available' in row_text: return 'AVAILABLE'
50
- if 'sold' in row_text: return 'SOLD'
51
- if 'taken' in row_text or 'make an offer' in row_text: return 'TAKEN'
52
- if 'unavailable' in row_text: return 'UNAVAILABLE'
53
 
54
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
55
- # APPROACH 3: GLOBAL FAILSAFE PATTERN RESOLUTION
56
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
57
- # Ultimate boundary backup scan across text layers if explicit row bindings fail
58
- if "on auction" in full_text: return 'ON_AUCTION'
59
- if "for sale" in full_text and "not for sale" not in full_text: return 'FOR_SALE'
60
- if "sold" in full_text: return 'SOLD'
61
- if "taken" in full_text or "make an offer" in full_text: return 'TAKEN'
62
- if "unavailable" in full_text: return 'UNAVAILABLE'
63
- if "available" in full_text: return 'AVAILABLE'
 
64
 
65
- return 'UNCERTAIN'
 
1
  import re
 
2
 
3
  def parse_html(html: str, final_url: str, word: str) -> str:
4
  """
5
+ High-Performance Text Boundary Token Mapper optimized for reading
6
+ asynchronous AJAX responses with structural fallbacks.
7
  """
8
  if not html or not html.strip():
9
+ return "AVAILABLE"
10
 
11
+ # Flatten trailing breaks and tabs into clean space arrays
12
+ clean_html = " ".join(html.split())
 
 
13
  word_clean = word.strip().replace("@", "").lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
16
+ # ENGINE 1: EXACT ROW PATTERN VALIDATION
17
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
18
+ search_regex = re.compile(rf'>@{word_clean}<.*?class="[^"]*status[^"]*"[^>]*>\s*([^<]+?)\s*<', re.IGNORECASE)
19
+ match = search_regex.search(clean_html)
20
 
21
+ if match:
22
+ status_text = match.group(1).strip().lower()
23
+ if "auction" in status_text or "bidding" in status_text: return "ON_AUCTION"
24
+ if "sold" in status_text: return "SOLD"
25
+ if "unavailable" in status_text: return "UNAVAILABLE"
26
+ if "taken" in status_text or "offer" in status_text: return "TAKEN"
27
+ if "sale" in status_text or "purchase" in status_text: return "FOR_SALE"
28
+ return status_text.upper()
 
 
 
 
29
 
30
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
31
+ # ENGINE 2: GLOBAL TEXT BOUNDARY FALLBACK SCANS
32
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
33
+ fallback_text = clean_html.lower()
34
+
35
+ if "on auction" in fallback_text: return "ON_AUCTION"
36
+ if "sold for" in fallback_text or "recently sold" in fallback_text: return "SOLD"
37
+ if "taken" in fallback_text or "make an offer" in fallback_text: return "TAKEN"
38
+ if "for sale" in fallback_text or "purchase" in fallback_text: return "FOR_SALE"
39
+ if "unavailable" in fallback_text: return "UNAVAILABLE"
40
+ if "available" in fallback_text: return "AVAILABLE"
41
 
42
+ return "UNAVAILABLE"