File size: 11,029 Bytes
61d29fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
"""
Alternative eBoard scraper using undetected-chromedriver
This bypasses Incapsula without manual cookies
"""
import asyncio
import re
from typing import Dict, Any, List
from pathlib import Path
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
import hashlib

from loguru import logger


class UndetectedEboardScraper:
    """
    Scrape eBoard (Simbli) meeting listings through undetected-chromedriver.

    The scraper opens the district's meeting listing page in a Chrome
    instance started by ``undetected_chromedriver``, waits for the Incapsula
    challenge to settle, then collects links to meeting pages and PDF
    documents. Meeting pages are visited one by one to harvest the PDFs they
    link to.

    NOTE(review): the anti-detection behavior itself comes from the
    third-party library (it is described upstream as patching ChromeDriver
    so that common automation markers are not exposed) - confirm against its
    documentation before relying on specifics.
    """

    # Root of the hosted eBoard site; the meeting listing URL is built from it.
    BASE_URL = "https://simbli.eboardsolutions.com"
    # Upper bound on listing links processed per run, to keep a run bounded.
    MAX_LINKS = 50
    # A page mentioning "Incapsula" that is shorter than this many characters
    # is treated as the challenge/block page rather than real content.
    BLOCK_PAGE_MAX_CHARS = 10000

    async def scrape_eboard(
        self,
        url: str,
        municipality: str,
        state: str,
        school_id: str = None
    ) -> List[Dict[str, Any]]:
        """
        Scrape eBoard platform without manual cookies.

        The browser automation is synchronous (Selenium calls and sleeps), so
        it is executed in the event loop's default executor thread; awaiting
        this coroutine therefore does not stall other tasks on the loop.
        Cancelling the awaiting task does not interrupt a scrape that is
        already running in the worker thread.

        Args:
            url: eBoard URL
            municipality: School district name
            state: State code
            school_id: Optional school ID (extracted from URL if not provided)

        Returns:
            List of meeting documents. An empty list is returned when a
            dependency is missing, the school ID cannot be determined, the
            page is still blocked, or an unexpected error occurs.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self._scrape_blocking, url, municipality, state, school_id
        )

    @staticmethod
    def _extract_school_id(url):
        """Return the numeric ``s=``/``S=`` query value from *url*, or None if absent."""
        if not url:
            return None
        match = re.search(r'[?&]s=(\d+)', url, re.IGNORECASE)
        return match.group(1) if match else None

    @staticmethod
    def _is_pdf_href(href):
        """Return True when the path part of *href* ends in ``.pdf`` (case-insensitive)."""
        # Drop fragment and query so "agenda.pdf?download=1" still counts as a PDF.
        path = href.split('#', 1)[0].split('?', 1)[0]
        return path.lower().endswith('.pdf')

    @staticmethod
    def _classify_href(href):
        """Return 'meeting', 'pdf', or None for an anchor's href value."""
        # Meeting links are checked first so a meeting URL is never mistaken for a file.
        if 'MID=' in href.upper() or 'meetingdetail' in href.lower():
            return 'meeting'
        if UndetectedEboardScraper._is_pdf_href(href):
            return 'pdf'
        return None

    @staticmethod
    def _build_document(source_url, title, municipality, state, school_id, meeting_page=None):
        """Build one document record; ``meeting_page`` is recorded only when given."""
        metadata = {'platform': 'eboard'}
        if meeting_page is not None:
            metadata['meeting_page'] = meeting_page
        metadata['school_id'] = school_id
        metadata['scraped_with'] = 'undetected_chromedriver'
        return {
            # MD5 is used only as a stable identifier, not for security.
            'document_id': hashlib.md5(f"{source_url}{municipality}".encode()).hexdigest(),
            'source_url': source_url,
            'municipality': municipality,
            'state': state,
            # NOTE: this is the scrape timestamp; the real meeting date is not parsed.
            'meeting_date': datetime.now(),
            'meeting_type': 'Board Meeting',
            'title': title,
            'content': '',  # Would need PDF download + text extraction
            'metadata': metadata,
        }

    @classmethod
    def _collect_links(cls, soup, page_url):
        """Return unique meeting/PDF links found in *soup*, resolved against *page_url*."""
        links = {}
        for anchor in soup.find_all('a', href=True):
            href = anchor.get('href', '')
            link_type = cls._classify_href(href)
            if link_type is None:
                continue
            # Relative hrefs must be resolved against the page they appear on,
            # not the site root, or directory-relative links get the wrong path.
            full_url = urljoin(page_url, href)
            text = anchor.get_text().strip()
            known = links.get(full_url)
            if known is None:
                links[full_url] = {'url': full_url, 'text': text, 'type': link_type}
            elif not known['text'] and text:
                # The same target is often linked twice (icon + title); keep the title.
                known['text'] = text
        return list(links.values())

    def _scrape_blocking(self, url, municipality, state, school_id=None):
        """Synchronous implementation of ``scrape_eboard``; runs in a worker thread."""
        try:
            import undetected_chromedriver as uc
            from selenium.webdriver.common.by import By
            from selenium.webdriver.support.ui import WebDriverWait
            from selenium.webdriver.support import expected_conditions as EC
            import time
            import random
        except ImportError:
            logger.error("Missing undetected-chromedriver. Install: pip install undetected-chromedriver")
            return []

        # Extract school ID
        if not school_id:
            school_id = self._extract_school_id(url)

        if not school_id:
            logger.error(f"Could not extract school ID from URL: {url}")
            return []

        meetings_url = f"{self.BASE_URL}/SB_Meetings/SB_MeetingListing.aspx?S={school_id}"

        logger.info("Using undetected-chromedriver to bypass Incapsula")
        logger.info(f"Target: {meetings_url}")

        documents = []
        seen_ids = set()
        driver = None

        def add_document(doc):
            """Append *doc* unless a record with the same ID was already collected."""
            if doc['document_id'] in seen_ids:
                return False
            seen_ids.add(doc['document_id'])
            documents.append(doc)
            return True

        try:
            # Create undetected Chrome instance. Headless mode is deliberately
            # not enabled because it may still be detected.
            options = uc.ChromeOptions()
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            options.add_argument('--disable-blink-features=AutomationControlled')

            # version_main=None lets the library pick the driver version itself.
            driver = uc.Chrome(options=options, version_main=None)

            logger.info("Chrome launched with anti-detection patches")

            # Navigate to meetings page
            driver.get(meetings_url)
            logger.info(f"Loaded page: {driver.title[:100]}")

            # Give the Incapsula challenge time to complete; a randomized wait
            # avoids a fixed, machine-like cadence.
            wait_time = random.uniform(5.0, 8.0)
            logger.info(f"Waiting {wait_time:.1f}s for Incapsula challenge...")
            time.sleep(wait_time)

            # Check if we bypassed Incapsula
            page_source = driver.page_source

            if 'Incapsula' in page_source and len(page_source) < self.BLOCK_PAGE_MAX_CHARS:
                logger.error("Still blocked by Incapsula")
                logger.warning("Try running with headless=False or use Option 2 (Residential Proxies)")
                return []

            logger.success(f"✓ Bypassed Incapsula! Page size: {len(page_source)} bytes")

            # The browser may have been redirected; resolve links against where we ended up.
            listing_url = driver.current_url or meetings_url
            meeting_links = self._collect_links(
                BeautifulSoup(page_source, 'html.parser'), listing_url
            )
            logger.info(f"Found {len(meeting_links)} meeting/document links")

            # If no links found, wait for JavaScript-rendered content and re-parse
            if not meeting_links:
                logger.warning("No links found in HTML, checking for JavaScript-rendered content...")
                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, "a"))
                    )
                    time.sleep(3)  # Additional wait for JS

                    meeting_links = self._collect_links(
                        BeautifulSoup(driver.page_source, 'html.parser'), listing_url
                    )
                    logger.info(f"After JS wait: Found {len(meeting_links)} links")
                except Exception as e:
                    logger.warning(f"JS content wait failed: {e}")

            # Process meeting links (limit to prevent overwhelming)
            batch = meeting_links[:self.MAX_LINKS]
            for idx, meeting_info in enumerate(batch):
                if idx > 0 and idx % 10 == 0:
                    logger.info(f"Progress: {idx}/{len(batch)}")

                try:
                    meeting_url = meeting_info['url']
                    meeting_title = meeting_info['text']

                    if meeting_info['type'] == 'pdf':
                        # No request is made for direct PDF links, so no delay
                        # is needed. TODO: implement the actual PDF download.
                        logger.debug(f"  Downloading PDF: {meeting_title[:50]}")
                        add_document(self._build_document(
                            meeting_url, meeting_title, municipality, state, school_id
                        ))
                        continue

                    # Human-like delay before each navigation
                    time.sleep(random.uniform(2.0, 5.0))

                    # Navigate to meeting detail page
                    logger.debug(f"  Loading meeting: {meeting_title[:50]}")
                    driver.get(meeting_url)
                    time.sleep(random.uniform(2.0, 4.0))

                    detail_url = driver.current_url or meeting_url
                    meeting_soup = BeautifulSoup(driver.page_source, 'html.parser')

                    # Extract PDFs from meeting page
                    for link in meeting_soup.find_all('a', href=True):
                        href = link.get('href', '')
                        if not self._is_pdf_href(href):
                            continue
                        doc_url = urljoin(detail_url, href)
                        doc_title = link.get_text().strip()
                        added = add_document(self._build_document(
                            doc_url,
                            doc_title or meeting_title,
                            municipality,
                            state,
                            school_id,
                            meeting_page=meeting_url,
                        ))
                        if added:
                            logger.success(f"    ✓ Found: {doc_title[:50]}")

                except Exception as e:
                    # One bad meeting page must not abort the whole run.
                    logger.error(f"Error processing {meeting_info.get('text', 'unknown')}: {e}")
                    continue

            logger.success(f"Scraping complete: {len(documents)} documents")
            return documents

        except Exception as e:
            logger.error(f"Error in undetected scraper: {e}")
            import traceback
            logger.error(traceback.format_exc())
            return []

        finally:
            # Always shut the browser down, including on errors, so no Chrome
            # process is left running.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as quit_error:
                    logger.warning(f"Failed to close Chrome cleanly: {quit_error}")


# Example usage
async def main():
    """Run one sample scrape against a single district and print the document count."""
    sample_target = {
        "url": "http://simbli.eboardsolutions.com/index.aspx?s=2088",
        "municipality": "Tuscaloosa City Schools",
        "state": "AL",
    }
    found = await UndetectedEboardScraper().scrape_eboard(**sample_target)
    print(f"Scraped {len(found)} documents")


# Script entry point: run the example scrape only when this file is executed
# directly, so importing the module never launches a browser.
if __name__ == "__main__":
    asyncio.run(main())