Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
File size: 11,029 Bytes
61d29fc | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 | """
Alternative eBoard scraper built on undetected-chromedriver.
It is intended to get past Incapsula without manually supplied cookies.
"""
import asyncio
import re
from typing import Dict, Any, List
from pathlib import Path
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
import hashlib
from loguru import logger
class UndetectedEboardScraper:
    """
    Scrape eBoard using undetected-chromedriver to bypass Incapsula.

    Per the original author's notes (not verified here), the library patches
    Selenium ChromeDriver to avoid detection by:
    - Removing Selenium markers from navigator.webdriver
    - Randomizing browser fingerprints
    - Using real Chrome instead of ChromeDriver
    """

    # Host that meeting pages live on and that relative links are resolved against.
    BASE_URL = "https://simbli.eboardsolutions.com"
    # Upper bound on links processed per run, to avoid overwhelming the site.
    MAX_LINKS = 50

    @staticmethod
    def _extract_school_id(url: str):
        """Return the digits of the ``s=`` query parameter in *url*, or None."""
        match = re.search(r'[?&]s=(\d+)', url, re.IGNORECASE)
        return match.group(1) if match else None

    @staticmethod
    def _is_pdf_href(href: str) -> bool:
        """Return True when *href* points at a ``.pdf`` resource.

        Matches when the whole href ends in ``.pdf`` (e.g.
        ``Download.aspx?file=a.pdf``) or when the part before any query
        string / fragment does (e.g. ``agenda.pdf?ver=2``).
        """
        lowered = href.lower()
        if lowered.endswith('.pdf'):
            return True
        path_only = lowered.split('#', 1)[0].split('?', 1)[0]
        return path_only.endswith('.pdf')

    @classmethod
    def _classify_href(cls, href: str):
        """Return ``'meeting'``, ``'pdf'`` or None for a raw href value.

        The meeting test runs first, so a link carrying ``MID=`` is always
        treated as a meeting page even if it also ends in ``.pdf``.
        """
        if 'MID=' in href.upper() or 'meetingdetail' in href.lower():
            return 'meeting'
        if cls._is_pdf_href(href):
            return 'pdf'
        return None

    @classmethod
    def _collect_links(cls, page_source: str) -> List[Dict[str, str]]:
        """Parse *page_source* and return unique meeting/PDF links in page order.

        Each entry is a dict with ``url`` (absolute), ``text`` (stripped anchor
        text) and ``type`` (``'meeting'`` or ``'pdf'``).
        """
        soup = BeautifulSoup(page_source, 'html.parser')
        links = []
        seen_urls = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor.get('href', '')
            link_type = cls._classify_href(href)
            if link_type is None:
                continue
            full_url = urljoin(cls.BASE_URL, href)
            # The same target is often linked more than once on a page;
            # keep only the first occurrence so it is not processed twice.
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            links.append({
                'url': full_url,
                'text': anchor.get_text().strip(),
                'type': link_type,
            })
        return links

    @staticmethod
    def _build_document(
        source_url: str,
        title: str,
        municipality: str,
        state: str,
        school_id: str,
        meeting_page: str = None
    ) -> Dict[str, Any]:
        """Build one document record for a discovered PDF URL.

        ``document_id`` is the MD5 hex digest of ``source_url + municipality``.
        ``meeting_page`` is added to the metadata only when provided.
        """
        metadata = {'platform': 'eboard'}
        if meeting_page is not None:
            metadata['meeting_page'] = meeting_page
        metadata['school_id'] = school_id
        metadata['scraped_with'] = 'undetected_chromedriver'
        return {
            'document_id': hashlib.md5(f"{source_url}{municipality}".encode()).hexdigest(),
            'source_url': source_url,
            'municipality': municipality,
            'state': state,
            # NOTE: this is the scrape time, not a date parsed from the page.
            'meeting_date': datetime.now(),
            'meeting_type': 'Board Meeting',
            'title': title,
            'content': '',  # PDF text extraction is not implemented
            'metadata': metadata,
        }

    async def scrape_eboard(
        self,
        url: str,
        municipality: str,
        state: str,
        school_id: str = None
    ) -> List[Dict[str, Any]]:
        """
        Scrape eBoard platform without manual cookies.

        NOTE: although this is a coroutine, it drives Selenium synchronously
        and uses ``time.sleep``, so it blocks the event loop while it runs.

        Args:
            url: eBoard URL
            municipality: School district name
            state: State code
            school_id: Optional school ID (extracted from URL if not provided)

        Returns:
            List of meeting documents. An empty list is returned when the
            dependencies are missing, the school ID cannot be determined,
            the page still looks blocked, or an unexpected error occurs.
        """
        import random
        import time

        try:
            import undetected_chromedriver as uc
            from selenium.webdriver.common.by import By
            from selenium.webdriver.support.ui import WebDriverWait
            from selenium.webdriver.support import expected_conditions as EC
        except ImportError:
            logger.error("Missing undetected-chromedriver. Install: pip install undetected-chromedriver")
            return []

        # Extract school ID
        if not school_id:
            school_id = self._extract_school_id(url)
        if not school_id:
            logger.error(f"Could not extract school ID from URL: {url}")
            return []

        meetings_url = f"{self.BASE_URL}/SB_Meetings/SB_MeetingListing.aspx?S={school_id}"
        logger.info("Using undetected-chromedriver to bypass Incapsula")
        logger.info(f"Target: {meetings_url}")

        documents = []
        seen_ids = set()

        def record(doc) -> bool:
            """Append *doc* unless a document with the same ID was already stored."""
            if doc['document_id'] in seen_ids:
                return False
            seen_ids.add(doc['document_id'])
            documents.append(doc)
            return True

        driver = None
        try:
            # Create undetected Chrome instance. Headless mode is deliberately
            # not enabled: per the original notes it may still be detected.
            options = uc.ChromeOptions()
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            options.add_argument('--disable-blink-features=AutomationControlled')

            # version_main=None leaves Chrome version selection to the library.
            driver = uc.Chrome(options=options, version_main=None)
            logger.info("Chrome launched with anti-detection patches")

            # Navigate to meetings page
            driver.get(meetings_url)
            logger.info(f"Loaded page: {driver.title[:100]}")

            # Give the Incapsula challenge time to complete before reading the page.
            wait_time = random.uniform(5.0, 8.0)
            logger.info(f"Waiting {wait_time:.1f}s for Incapsula challenge...")
            time.sleep(wait_time)

            # Heuristic: a short page that mentions Incapsula is treated as
            # the block page rather than real content.
            page_source = driver.page_source
            if 'Incapsula' in page_source and len(page_source) < 10000:
                logger.error("Still blocked by Incapsula")
                logger.warning("Try running with headless=False or use Option 2 (Residential Proxies)")
                return []
            logger.success(f"✓ Bypassed Incapsula! Page size: {len(page_source)} bytes")

            meeting_links = self._collect_links(page_source)
            logger.info(f"Found {len(meeting_links)} meeting/document links")

            # If no links found, the list may be rendered by JavaScript:
            # wait for anchors to appear and parse again.
            if not meeting_links:
                logger.warning("No links found in HTML, checking for JavaScript-rendered content...")
                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, "a"))
                    )
                    time.sleep(3)  # Additional wait for JS
                    meeting_links = self._collect_links(driver.page_source)
                    logger.info(f"After JS wait: Found {len(meeting_links)} links")
                except Exception as e:
                    logger.warning(f"JS content wait failed: {e}")

            # Process meeting links (limit to prevent overwhelming)
            selected = meeting_links[:self.MAX_LINKS]
            for idx, meeting_info in enumerate(selected):
                if idx > 0 and idx % 10 == 0:
                    logger.info(f"Progress: {idx}/{len(selected)}")
                try:
                    meeting_url = meeting_info['url']
                    meeting_title = meeting_info['text']

                    if meeting_info['type'] == 'pdf':
                        logger.debug(f"  Downloading PDF: {meeting_title[:50]}")
                        # TODO: Implement PDF download
                        # For now only the URL is recorded; no request is made,
                        # so no delay is needed here.
                        record(self._build_document(
                            meeting_url, meeting_title, municipality, state, school_id
                        ))
                        continue

                    # Human-like delay before each page navigation
                    time.sleep(random.uniform(2.0, 5.0))
                    logger.debug(f"  Loading meeting: {meeting_title[:50]}")
                    driver.get(meeting_url)
                    time.sleep(random.uniform(2.0, 4.0))
                    meeting_soup = BeautifulSoup(driver.page_source, 'html.parser')

                    # Extract PDFs from meeting page
                    for link in meeting_soup.find_all('a', href=True):
                        href = link.get('href', '')
                        if not self._is_pdf_href(href):
                            continue
                        doc_url = urljoin(self.BASE_URL, href)
                        doc_title = link.get_text().strip()
                        added = record(self._build_document(
                            doc_url,
                            doc_title or meeting_title,
                            municipality,
                            state,
                            school_id,
                            meeting_page=meeting_url,
                        ))
                        if added:
                            logger.success(f"    ✓ Found: {doc_title[:50]}")
                except Exception as e:
                    # Best effort: one bad link must not abort the whole run.
                    logger.error(f"Error processing {meeting_info.get('text', 'unknown')}: {e}")
                    continue
        except Exception as e:
            logger.error(f"Error in undetected scraper: {e}")
            import traceback
            logger.error(traceback.format_exc())
            return []
        finally:
            # Always shut the browser down, including on errors and early
            # returns, so Chrome processes are not leaked.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as quit_error:
                    logger.warning(f"Failed to close Chrome cleanly: {quit_error}")

        logger.success(f"Scraping complete: {len(documents)} documents")
        return documents
# Example usage
async def main():
    """Demo run: scrape one sample district and print how many documents came back."""
    sample_request = {
        "url": "http://simbli.eboardsolutions.com/index.aspx?s=2088",
        "municipality": "Tuscaloosa City Schools",
        "state": "AL",
    }
    results = await UndetectedEboardScraper().scrape_eboard(**sample_request)
    print(f"Scraped {len(results)} documents")
if __name__ == "__main__":
    # Run the demo coroutine only when this file is executed as a script.
    asyncio.run(main())
|