Karim shoair committed on
Commit
0335389
·
1 Parent(s): 50f8455

fix: A workaround for a Playwright issue with retrieving page content on Windows

Browse files
scrapling/engines/_browsers/_camoufox.py CHANGED
@@ -14,6 +14,7 @@ from playwright.async_api import (
14
  Locator as AsyncLocator,
15
  Page as async_Page,
16
  )
 
17
 
18
  from ._validators import validate, CamoufoxConfig
19
  from ._base import SyncSession, AsyncSession, StealthySessionMixin
@@ -201,20 +202,34 @@ class StealthySession(StealthySessionMixin, SyncSession):
201
 
202
  self._closed = True
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
205
  """Solve the cloudflare challenge displayed on the playwright page passed
206
 
207
  :param page: The targeted page
208
  :return:
209
  """
210
- challenge_type = self._detect_cloudflare(page.content())
211
  if not challenge_type:
212
  log.error("No Cloudflare challenge found.")
213
  return
214
  else:
215
  log.info(f'The turnstile version discovered is "{challenge_type}"')
216
  if challenge_type == "non-interactive":
217
- while "<title>Just a moment...</title>" in (page.content()):
218
  log.info("Waiting for Cloudflare wait page to disappear.")
219
  page.wait_for_timeout(1000)
220
  page.wait_for_load_state()
@@ -222,7 +237,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
222
  return
223
 
224
  else:
225
- while "Verifying you are human." in page.content():
226
  # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
227
  page.wait_for_timeout(500)
228
 
@@ -506,20 +521,34 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
506
 
507
  self._closed = True
508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  async def _solve_cloudflare(self, page: async_Page):
510
  """Solve the cloudflare challenge displayed on the playwright page passed. The async version
511
 
512
  :param page: The async targeted page
513
  :return:
514
  """
515
- challenge_type = self._detect_cloudflare(await page.content())
516
  if not challenge_type:
517
  log.error("No Cloudflare challenge found.")
518
  return
519
  else:
520
  log.info(f'The turnstile version discovered is "{challenge_type}"')
521
  if challenge_type == "non-interactive": # pragma: no cover
522
- while "<title>Just a moment...</title>" in (await page.content()):
523
  log.info("Waiting for Cloudflare wait page to disappear.")
524
  await page.wait_for_timeout(1000)
525
  await page.wait_for_load_state()
@@ -527,7 +556,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
527
  return
528
 
529
  else:
530
- while "Verifying you are human." in (await page.content()):
531
  # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
532
  await page.wait_for_timeout(500)
533
 
 
14
  Locator as AsyncLocator,
15
  Page as async_Page,
16
  )
17
+ from playwright._impl._errors import Error as PlaywrightError
18
 
19
  from ._validators import validate, CamoufoxConfig
20
  from ._base import SyncSession, AsyncSession, StealthySessionMixin
 
202
 
203
  self._closed = True
204
 
205
@staticmethod
def _get_page_content(page: Page, *, retry_delay: int = 1000, max_retries: int | None = None) -> str:
    """Return the page's HTML, retrying whenever Playwright fails to read it.

    A workaround for a Playwright issue with `page.content()` on Windows.
    Ref.: https://github.com/microsoft/playwright/issues/16108

    :param page: The page to extract content from.
    :param retry_delay: Milliseconds to wait after a failed read before trying again. Defaults to 1000.
    :param max_retries: How many retries are allowed after a failed read. ``None`` (the default) keeps
        retrying until a read succeeds.
    :return: The page content, or an empty string when `page.content()` returns a falsy value. Never ``None``.
    :raises PlaywrightError: Only when ``max_retries`` is set and every attempt failed; the last error is re-raised.
    """
    failed_reads = 0
    while True:
        try:
            return page.content() or ""
        except PlaywrightError:
            # With the default `max_retries=None` this branch is never taken, so the helper retries forever.
            if max_retries is not None and failed_reads >= max_retries:
                raise
            failed_reads += 1
            # Pause before the next attempt instead of hammering the page.
            page.wait_for_timeout(retry_delay)
218
+
219
  def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
220
  """Solve the cloudflare challenge displayed on the playwright page passed
221
 
222
  :param page: The targeted page
223
  :return:
224
  """
225
+ challenge_type = self._detect_cloudflare(self._get_page_content(page))
226
  if not challenge_type:
227
  log.error("No Cloudflare challenge found.")
228
  return
229
  else:
230
  log.info(f'The turnstile version discovered is "{challenge_type}"')
231
  if challenge_type == "non-interactive":
232
+ while "<title>Just a moment...</title>" in (self._get_page_content(page)):
233
  log.info("Waiting for Cloudflare wait page to disappear.")
234
  page.wait_for_timeout(1000)
235
  page.wait_for_load_state()
 
237
  return
238
 
239
  else:
240
+ while "Verifying you are human." in self._get_page_content(page):
241
  # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
242
  page.wait_for_timeout(500)
243
 
 
521
 
522
  self._closed = True
523
 
524
@staticmethod
async def _get_page_content(page: async_Page, *, retry_delay: int = 1000, max_retries: int | None = None) -> str:
    """Return the page's HTML, retrying whenever Playwright fails to read it. The async version

    A workaround for a Playwright issue with `page.content()` on Windows.
    Ref.: https://github.com/microsoft/playwright/issues/16108

    :param page: The async page to extract content from.
    :param retry_delay: Milliseconds to wait after a failed read before trying again. Defaults to 1000.
    :param max_retries: How many retries are allowed after a failed read. ``None`` (the default) keeps
        retrying until a read succeeds.
    :return: The page content, or an empty string when `page.content()` returns a falsy value. Never ``None``.
    :raises PlaywrightError: Only when ``max_retries`` is set and every attempt failed; the last error is re-raised.
    """
    failed_reads = 0
    while True:
        try:
            return (await page.content()) or ""
        except PlaywrightError:
            # With the default `max_retries=None` this branch is never taken, so the helper retries forever.
            if max_retries is not None and failed_reads >= max_retries:
                raise
            failed_reads += 1
            # Pause before the next attempt instead of hammering the page.
            await page.wait_for_timeout(retry_delay)
537
+
538
  async def _solve_cloudflare(self, page: async_Page):
539
  """Solve the cloudflare challenge displayed on the playwright page passed. The async version
540
 
541
  :param page: The async targeted page
542
  :return:
543
  """
544
+ challenge_type = self._detect_cloudflare(await self._get_page_content(page))
545
  if not challenge_type:
546
  log.error("No Cloudflare challenge found.")
547
  return
548
  else:
549
  log.info(f'The turnstile version discovered is "{challenge_type}"')
550
  if challenge_type == "non-interactive": # pragma: no cover
551
+ while "<title>Just a moment...</title>" in (await self._get_page_content(page)):
552
  log.info("Waiting for Cloudflare wait page to disappear.")
553
  await page.wait_for_timeout(1000)
554
  await page.wait_for_load_state()
 
556
  return
557
 
558
  else:
559
+ while "Verifying you are human." in (await self._get_page_content(page)):
560
  # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
561
  await page.wait_for_timeout(500)
562