Spaces:
Paused
Paused
itsOwen committed on
Commit ·
e0f5917
1
Parent(s): 2317670
Fix multipage scraping with captcha mode - now scrapes all pages after CAPTCHA solved
Browse files
src/scrapers/playwright_scraper.py
CHANGED
|
@@ -180,7 +180,7 @@ class PlaywrightScraper(BaseScraper):
|
|
| 180 |
context = None
|
| 181 |
try:
|
| 182 |
if handle_captcha:
|
| 183 |
-
# For CAPTCHA mode: create context, handle CAPTCHA, then scrape
|
| 184 |
context = await self.create_context(browser, proxy)
|
| 185 |
page = await context.new_page()
|
| 186 |
|
|
@@ -191,10 +191,27 @@ class PlaywrightScraper(BaseScraper):
|
|
| 191 |
await self.handle_captcha(page, url)
|
| 192 |
|
| 193 |
# After CAPTCHA is solved, get content from the current page
|
| 194 |
-
# instead of creating a new context
|
| 195 |
await asyncio.sleep(self.config.delay_after_load)
|
| 196 |
-
|
| 197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
else:
|
| 199 |
# Normal mode: use scrape_multiple_pages
|
| 200 |
contents = await self.scrape_multiple_pages(browser, url, pages, url_pattern, proxy)
|
|
|
|
| 180 |
context = None
|
| 181 |
try:
|
| 182 |
if handle_captcha:
|
| 183 |
+
# For CAPTCHA mode: create context, handle CAPTCHA, then scrape pages
|
| 184 |
context = await self.create_context(browser, proxy)
|
| 185 |
page = await context.new_page()
|
| 186 |
|
|
|
|
| 191 |
await self.handle_captcha(page, url)
|
| 192 |
|
| 193 |
# After CAPTCHA is solved, get content from the current page
|
|
|
|
| 194 |
await asyncio.sleep(self.config.delay_after_load)
|
| 195 |
+
first_page_content = await page.content()
|
| 196 |
+
|
| 197 |
+
# Check if we need to scrape multiple pages
|
| 198 |
+
if pages:
|
| 199 |
+
page_numbers = self.parse_page_numbers(pages)
|
| 200 |
+
if not url_pattern:
|
| 201 |
+
url_pattern = self.detect_url_pattern(url)
|
| 202 |
+
|
| 203 |
+
contents = [first_page_content] # First page already scraped
|
| 204 |
+
|
| 205 |
+
# Scrape remaining pages (skip first one since we already have it)
|
| 206 |
+
for page_num in page_numbers[1:]:
|
| 207 |
+
page_url = self.apply_url_pattern(url, url_pattern, page_num) if url_pattern else url
|
| 208 |
+
self.logger.info(f"Scraping page {page_num}: {page_url}")
|
| 209 |
+
await page.goto(page_url, wait_until=self.config.wait_for, timeout=self.config.timeout)
|
| 210 |
+
await asyncio.sleep(self.config.delay_after_load)
|
| 211 |
+
content = await page.content()
|
| 212 |
+
contents.append(content)
|
| 213 |
+
else:
|
| 214 |
+
contents = [first_page_content]
|
| 215 |
else:
|
| 216 |
# Normal mode: use scrape_multiple_pages
|
| 217 |
contents = await self.scrape_multiple_pages(browser, url, pages, url_pattern, proxy)
|