Muttered3 committed on
Commit
e2fa3ac
·
verified ·
1 Parent(s): 8e12b43

Update parser.py

Browse files
Files changed (1) hide show
  1. parser.py +50 -18
parser.py CHANGED
@@ -1,33 +1,65 @@
 
1
  from bs4 import BeautifulSoup
2
 
3
  def parse_html(html: str, final_url: str, word: str) -> str:
4
- soup = BeautifulSoup(html, 'lxml')
5
- full_text = soup.get_text(separator=' ', strip=True).lower()
6
- is_direct = '/username/' in final_url and '?query=' not in final_url
 
 
 
7
 
8
- if is_direct:
 
 
 
 
 
 
 
 
 
 
9
  status_tag = soup.find(class_='tm-section-header-status')
10
  if status_tag:
11
  t = status_tag.get_text(strip=True).lower()
12
  if 'sold' in t: return 'SOLD'
13
  if 'taken' in t: return 'TAKEN'
14
  if 'available' in t: return 'AVAILABLE'
15
- if 'on auction' in t: return 'ON_AUCTION'
16
- if 'for sale' in t: return 'FOR_SALE'
17
-
18
- if 'is already taken' in full_text:
19
  return 'TAKEN'
20
 
21
- rows = soup.find_all('tr')
 
 
 
 
 
22
  for row in rows:
23
- text = row.get_text(separator=' ', strip=True).lower()
24
- if f"@{word}" in text.split():
25
- if 'unavailable' in text and 'unknown' in text: return 'UNAVAILABLE'
26
- if 'on auction' in text: return 'ON_AUCTION'
27
- if 'for sale' in text and 'not for sale' not in text: return 'FOR_SALE'
28
- if 'available' in text: return 'AVAILABLE'
29
- if 'sold' in text: return 'SOLD'
30
- if 'taken' in text: return 'TAKEN'
31
- if 'unavailable' in text: return 'UNAVAILABLE'
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  return 'UNCERTAIN'
 
1
+ import re
2
  from bs4 import BeautifulSoup
3
 
4
  def parse_html(html: str, final_url: str, word: str) -> str:
5
+ """
6
+ Highly resilient HTML payload engine optimized for processing
7
+ both direct page DOM structures and AJAX search response grids safely.
8
+ """
9
+ if not html or not html.strip():
10
+ return 'AVAILABLE'
11
 
12
+ soup = BeautifulSoup(html, 'html.parser')
13
+
14
+ # Pre-process raw text buffers cleanly
15
+ full_text = " ".join(soup.get_text(separator=' ', strip=True).split()).lower()
16
+ word_clean = word.strip().replace("@", "").lower()
17
+
18
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
19
+ # APPROACH 1: DIRECT PROFILE RESOLUTION SIGNATURES
20
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
21
+ # Safely checks for structural components if the payload is a full profile DOM
22
+ if '/username/' in final_url.lower() and '?query=' not in final_url.lower():
23
  status_tag = soup.find(class_='tm-section-header-status')
24
  if status_tag:
25
  t = status_tag.get_text(strip=True).lower()
26
  if 'sold' in t: return 'SOLD'
27
  if 'taken' in t: return 'TAKEN'
28
  if 'available' in t: return 'AVAILABLE'
29
+ if 'on auction' in t or 'bidding' in t: return 'ON_AUCTION'
30
+ if 'sale' in t or 'purchase' in t: return 'FOR_SALE'
31
+
32
+ if 'is already taken' in full_text or 'make an offer' in full_text:
33
  return 'TAKEN'
34
 
35
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
36
+ # APPROACH 2: AJAX DATA GRID / TABLE ROW INTERCEPTIONS
37
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
38
+ # Iterate through all functional rows or element dividers cleanly
39
+ rows = soup.find_all(['tr', 'div', 'a'], class_=re.compile(re.escape('row') + r'|tm-table-row|tm-row', re.IGNORECASE)) or soup.find_all('tr')
40
+
41
  for row in rows:
42
+ row_text = " ".join(row.get_text(separator=' ', strip=True).split()).lower()
43
+
44
+ # Performance check mapping boundaries cleanly using exact string containment
45
+ if f"@{word_clean}" in row_text or word_clean in row_text:
46
+ # Check explicit validation statuses matching your matrix boundaries
47
+ if 'on auction' in row_text or 'bidding' in row_text: return 'ON_AUCTION'
48
+ if 'for sale' in row_text and 'not for sale' not in row_text: return 'FOR_SALE'
49
+ if 'available' in row_text: return 'AVAILABLE'
50
+ if 'sold' in row_text: return 'SOLD'
51
+ if 'taken' in row_text or 'make an offer' in row_text: return 'TAKEN'
52
+ if 'unavailable' in row_text: return 'UNAVAILABLE'
53
+
54
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
55
+ # APPROACH 3: GLOBAL FAILSAFE PATTERN RESOLUTION
56
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
57
+ # Ultimate boundary backup scan across text layers if explicit row bindings fail
58
+ if "on auction" in full_text: return 'ON_AUCTION'
59
+ if "for sale" in full_text and "not for sale" not in full_text: return 'FOR_SALE'
60
+ if "sold" in full_text: return 'SOLD'
61
+ if "taken" in full_text or "make an offer" in full_text: return 'TAKEN'
62
+ if "unavailable" in full_text: return 'UNAVAILABLE'
63
+ if "available" in full_text: return 'AVAILABLE'
64
 
65
  return 'UNCERTAIN'