itsOwen committed on
Commit e0f5917 · 1 Parent(s): 2317670

Fix multipage scraping with captcha mode - now scrapes all pages after CAPTCHA solved

Files changed (1): src/scrapers/playwright_scraper.py +21 -4
src/scrapers/playwright_scraper.py CHANGED
@@ -180,7 +180,7 @@ class PlaywrightScraper(BaseScraper):
         context = None
         try:
             if handle_captcha:
-                # For CAPTCHA mode: create context, handle CAPTCHA, then scrape from same page
+                # For CAPTCHA mode: create context, handle CAPTCHA, then scrape pages
                 context = await self.create_context(browser, proxy)
                 page = await context.new_page()
 
@@ -191,10 +191,27 @@ class PlaywrightScraper(BaseScraper):
                 await self.handle_captcha(page, url)
 
                 # After CAPTCHA is solved, get content from the current page
-                # instead of creating a new context
                 await asyncio.sleep(self.config.delay_after_load)
-                content = await page.content()
-                contents = [content]
+                first_page_content = await page.content()
+
+                # Check if we need to scrape multiple pages
+                if pages:
+                    page_numbers = self.parse_page_numbers(pages)
+                    if not url_pattern:
+                        url_pattern = self.detect_url_pattern(url)
+
+                    contents = [first_page_content]  # First page already scraped
+
+                    # Scrape remaining pages (skip first one since we already have it)
+                    for page_num in page_numbers[1:]:
+                        page_url = self.apply_url_pattern(url, url_pattern, page_num) if url_pattern else url
+                        self.logger.info(f"Scraping page {page_num}: {page_url}")
+                        await page.goto(page_url, wait_until=self.config.wait_for, timeout=self.config.timeout)
+                        await asyncio.sleep(self.config.delay_after_load)
+                        content = await page.content()
+                        contents.append(content)
+                else:
+                    contents = [first_page_content]
             else:
                 # Normal mode: use scrape_multiple_pages
                 contents = await self.scrape_multiple_pages(browser, url, pages, url_pattern, proxy)
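For context, a minimal usage sketch of the fixed CAPTCHA path. The constructor and the scrape() entry point below are assumptions for illustration (this commit only changes the internals shown above); the behavior it relies on is taken from the diff: with handle_captcha enabled, the scraper now reuses the CAPTCHA-solved page to fetch the remaining pages instead of returning only the first one.

import asyncio

from src.scrapers.playwright_scraper import PlaywrightScraper


async def main():
    # Hypothetical invocation; the exact constructor/scrape() signature is not
    # part of this diff. 'pages' is the spec parsed by parse_page_numbers(),
    # and url_pattern is auto-detected via detect_url_pattern() when omitted.
    scraper = PlaywrightScraper()
    contents = await scraper.scrape(
        url="https://example.com/items?page=1",
        pages="1-3",
        url_pattern=None,
        handle_captcha=True,
    )
    print(f"Scraped {len(contents)} page(s) after solving the CAPTCHA once")


asyncio.run(main())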