Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
File size: 4,720 Bytes
#!/usr/bin/env python3
"""
Debug script to examine eBoard page structure
"""
import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
async def main(
    url: str = "https://simbli.eboardsolutions.com/SB_Meetings/SB_MeetingListing.aspx?S=2088",
    html_dump_path: str = "/tmp/eboard_page.html",
) -> None:
    """Render an eBoard meeting-listing page and print a summary of its structure.

    The page is loaded in headless Chromium, the rendered HTML is written to
    *html_dump_path*, and then link categories, tables, script counts and a
    couple of marker elements are reported on stdout.

    Args:
        url: Page to load. Defaults to the meeting listing this script was
            written against.
        html_dump_path: Where the rendered HTML is saved for manual inspection.
    """
    print(f"Loading: {url}\n")

    content = await _fetch_rendered_html(url)
    print(f"Page size: {len(content)} bytes\n")

    # Explicit UTF-8 so non-ASCII page content cannot fail on a locale whose
    # default encoding is narrower.
    with open(html_dump_path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"Saved full HTML to {html_dump_path}\n")

    soup = BeautifulSoup(content, 'html.parser')
    _report_links(soup)
    _report_page_structure(soup)


async def _fetch_rendered_html(url: str) -> str:
    """Open *url* in stealth-patched headless Chromium and return the rendered HTML."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=[
                '--disable-blink-features=AutomationControlled',
                '--disable-dev-shm-usage',
                '--no-sandbox',
            ],
        )
        # try/finally so the browser is closed even when navigation times out
        # or any later step raises.
        try:
            user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
            context = await browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent=user_agent,
                locale='en-US',
                timezone_id='America/Chicago',
            )
            page = await context.new_page()

            # Apply stealth
            stealth = Stealth()
            await stealth.apply_stealth_async(page)

            # Navigate
            response = await page.goto(url, wait_until='networkidle', timeout=60000)
            # goto() may return None (e.g. about:blank or a hash-only
            # navigation); don't crash on .status in that case.
            status = response.status if response is not None else "no response"
            print(f"Response status: {status}")

            # Give client-side JavaScript extra time after network idle.
            await page.wait_for_timeout(5000)
            return await page.content()
        finally:
            await browser.close()


def _print_link_group(heading, links, limit):
    """Print *heading* with the group size, then at most *limit* truncated entries."""
    print(f"{heading}: {len(links)}")
    for href, text in links[:limit]:
        print(f" - {text[:60]}: {href[:80]}")


def _report_links(soup):
    """Bucket every <a href> in *soup* by URL pattern and print each bucket."""
    all_links = soup.find_all('a', href=True)
    print(f"Total links found: {len(all_links)}\n")

    mid_links = []
    meetingdetail_links = []
    pdf_links = []
    other_links = []

    for link in all_links:
        href = link.get('href', '')
        text = link.get_text().strip()

        # Order matters: a link is placed in the first bucket it matches.
        if 'MID=' in href.upper():
            mid_links.append((href, text))
        elif 'meetingdetail' in href.lower():
            meetingdetail_links.append((href, text))
        elif href.lower().endswith('.pdf'):
            pdf_links.append((href, text))
        elif href and not href.startswith(('#', 'javascript:')):
            # Skip empty hrefs, in-page anchors and javascript: pseudo-links.
            other_links.append((href, text[:50]))

    _print_link_group("Links with MID=", mid_links, 10)
    _print_link_group("\nLinks with 'meetingdetail'", meetingdetail_links, 10)
    _print_link_group("\nPDF links", pdf_links, 10)
    _print_link_group("\nOther significant links", other_links, 20)


def _report_page_structure(soup):
    """Print ViewState, table, script and meeting-list-element findings for *soup*."""
    print("\n" + "=" * 80)
    print("Page Analysis:")
    print("=" * 80)

    viewstate = soup.find('input', {'id': '__VIEWSTATE'})
    if viewstate is not None:
        print(f"✓ ASP.NET ViewState present ({len(viewstate.get('value', ''))} chars)")

    # Look for tables or grids that might contain meetings
    tables = soup.find_all('table')
    print(f"Tables found: {len(tables)}")
    for number, table in enumerate(tables[:5], start=1):
        rows = table.find_all('tr')
        print(f" Table {number}: {len(rows)} rows")
        if rows:
            first_row_text = rows[0].get_text().strip()[:100]
            print(f" First row: {first_row_text}")

    # A script count hints at how much of the page is JavaScript-driven.
    scripts = soup.find_all('script')
    print(f"\nJavaScript blocks: {len(scripts)}")

    # Any element whose id contains "meeting" followed later by "list".
    meeting_list_elem = soup.find(id=re.compile(r'meeting.*list', re.I))
    if meeting_list_elem is not None:
        print(f"✓ Found element with 'meeting' and 'list' in ID: {meeting_list_elem.get('id')}")
# Script entry point: drive the async main() to completion when this file is
# executed directly; importing the module runs nothing.
if __name__ == "__main__":
    debug_run = main()
    asyncio.run(debug_run)
|