Karim shoair committed on
Commit ·
0335389
1
Parent(s): 50f8455
fix: A workaround for the Playwright issue with retrieving page content on Windows
Browse files
scrapling/engines/_browsers/_camoufox.py
CHANGED
|
@@ -14,6 +14,7 @@ from playwright.async_api import (
|
|
| 14 |
Locator as AsyncLocator,
|
| 15 |
Page as async_Page,
|
| 16 |
)
|
|
|
|
| 17 |
|
| 18 |
from ._validators import validate, CamoufoxConfig
|
| 19 |
from ._base import SyncSession, AsyncSession, StealthySessionMixin
|
|
@@ -201,20 +202,34 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
| 201 |
|
| 202 |
self._closed = True
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
|
| 205 |
"""Solve the cloudflare challenge displayed on the playwright page passed
|
| 206 |
|
| 207 |
:param page: The targeted page
|
| 208 |
:return:
|
| 209 |
"""
|
| 210 |
-
challenge_type = self._detect_cloudflare(
|
| 211 |
if not challenge_type:
|
| 212 |
log.error("No Cloudflare challenge found.")
|
| 213 |
return
|
| 214 |
else:
|
| 215 |
log.info(f'The turnstile version discovered is "{challenge_type}"')
|
| 216 |
if challenge_type == "non-interactive":
|
| 217 |
-
while "<title>Just a moment...</title>" in (
|
| 218 |
log.info("Waiting for Cloudflare wait page to disappear.")
|
| 219 |
page.wait_for_timeout(1000)
|
| 220 |
page.wait_for_load_state()
|
|
@@ -222,7 +237,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
|
|
| 222 |
return
|
| 223 |
|
| 224 |
else:
|
| 225 |
-
while "Verifying you are human." in
|
| 226 |
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
| 227 |
page.wait_for_timeout(500)
|
| 228 |
|
|
@@ -506,20 +521,34 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
| 506 |
|
| 507 |
self._closed = True
|
| 508 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
async def _solve_cloudflare(self, page: async_Page):
|
| 510 |
"""Solve the cloudflare challenge displayed on the playwright page passed. The async version
|
| 511 |
|
| 512 |
:param page: The async targeted page
|
| 513 |
:return:
|
| 514 |
"""
|
| 515 |
-
challenge_type = self._detect_cloudflare(await
|
| 516 |
if not challenge_type:
|
| 517 |
log.error("No Cloudflare challenge found.")
|
| 518 |
return
|
| 519 |
else:
|
| 520 |
log.info(f'The turnstile version discovered is "{challenge_type}"')
|
| 521 |
if challenge_type == "non-interactive": # pragma: no cover
|
| 522 |
-
while "<title>Just a moment...</title>" in (await
|
| 523 |
log.info("Waiting for Cloudflare wait page to disappear.")
|
| 524 |
await page.wait_for_timeout(1000)
|
| 525 |
await page.wait_for_load_state()
|
|
@@ -527,7 +556,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
|
|
| 527 |
return
|
| 528 |
|
| 529 |
else:
|
| 530 |
-
while "Verifying you are human." in (await
|
| 531 |
# Waiting for the verify spinner to disappear, checking every 1s if it disappeared
|
| 532 |
await page.wait_for_timeout(500)
|
| 533 |
|
|
|
|
| 14 |
Locator as AsyncLocator,
|
| 15 |
Page as async_Page,
|
| 16 |
)
|
| 17 |
+
from playwright._impl._errors import Error as PlaywrightError
|
| 18 |
|
| 19 |
from ._validators import validate, CamoufoxConfig
|
| 20 |
from ._base import SyncSession, AsyncSession, StealthySessionMixin
|
|
|
|
| 202 |
|
| 203 |
self._closed = True
|
| 204 |
|
| 205 |
+
@staticmethod
def _get_page_content(page: Page) -> str:
    """Return the page's HTML, retrying for as long as `page.content()` raises a Playwright error.

    A workaround for the Playwright issue with `page.content()` on Windows.
    Ref.: https://github.com/microsoft/playwright/issues/16108

    :param page: The page to extract content from.
    :return: The page content as a string. A falsy result from `page.content()` is
        normalised to an empty string, so this never returns `None`.
    """
    while True:
        try:
            # Callers run substring checks (`in`) on the result, so always hand back a `str`.
            return page.content() or ""
        except PlaywrightError:
            # NOTE(review): presumably the error is transient (page still navigating) — confirm.
            # There is no retry cap: the loop only ends when `page.content()` succeeds or
            # when `wait_for_timeout` itself raises.
            page.wait_for_timeout(1000)
|
| 218 |
+
|
| 219 |
def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
|
| 220 |
"""Solve the cloudflare challenge displayed on the playwright page passed
|
| 221 |
|
| 222 |
:param page: The targeted page
|
| 223 |
:return:
|
| 224 |
"""
|
| 225 |
+
challenge_type = self._detect_cloudflare(self._get_page_content(page))
|
| 226 |
if not challenge_type:
|
| 227 |
log.error("No Cloudflare challenge found.")
|
| 228 |
return
|
| 229 |
else:
|
| 230 |
log.info(f'The turnstile version discovered is "{challenge_type}"')
|
| 231 |
if challenge_type == "non-interactive":
|
| 232 |
+
while "<title>Just a moment...</title>" in (self._get_page_content(page)):
|
| 233 |
log.info("Waiting for Cloudflare wait page to disappear.")
|
| 234 |
page.wait_for_timeout(1000)
|
| 235 |
page.wait_for_load_state()
|
|
|
|
| 237 |
return
|
| 238 |
|
| 239 |
else:
|
| 240 |
+
while "Verifying you are human." in self._get_page_content(page):
|
| 241 |
# Waiting for the verify spinner to disappear, checking every 500ms if it disappeared
|
| 242 |
page.wait_for_timeout(500)
|
| 243 |
|
|
|
|
| 521 |
|
| 522 |
self._closed = True
|
| 523 |
|
| 524 |
+
@staticmethod
async def _get_page_content(page: async_Page) -> str:
    """Return the page's HTML, retrying for as long as `page.content()` raises a Playwright error. The async version

    A workaround for the Playwright issue with `page.content()` on Windows.
    Ref.: https://github.com/microsoft/playwright/issues/16108

    :param page: The async page to extract content from.
    :return: The page content as a string. A falsy result from `page.content()` is
        normalised to an empty string, so this never returns `None`.
    """
    while True:
        try:
            # Callers run substring checks (`in`) on the awaited result, so always hand back a `str`.
            return (await page.content()) or ""
        except PlaywrightError:
            # NOTE(review): presumably the error is transient (page still navigating) — confirm.
            # There is no retry cap: the loop only ends when `page.content()` succeeds or
            # when `wait_for_timeout` itself raises.
            await page.wait_for_timeout(1000)
|
| 537 |
+
|
| 538 |
async def _solve_cloudflare(self, page: async_Page):
|
| 539 |
"""Solve the cloudflare challenge displayed on the playwright page passed. The async version
|
| 540 |
|
| 541 |
:param page: The async targeted page
|
| 542 |
:return:
|
| 543 |
"""
|
| 544 |
+
challenge_type = self._detect_cloudflare(await self._get_page_content(page))
|
| 545 |
if not challenge_type:
|
| 546 |
log.error("No Cloudflare challenge found.")
|
| 547 |
return
|
| 548 |
else:
|
| 549 |
log.info(f'The turnstile version discovered is "{challenge_type}"')
|
| 550 |
if challenge_type == "non-interactive": # pragma: no cover
|
| 551 |
+
while "<title>Just a moment...</title>" in (await self._get_page_content(page)):
|
| 552 |
log.info("Waiting for Cloudflare wait page to disappear.")
|
| 553 |
await page.wait_for_timeout(1000)
|
| 554 |
await page.wait_for_load_state()
|
|
|
|
| 556 |
return
|
| 557 |
|
| 558 |
else:
|
| 559 |
+
while "Verifying you are human." in (await self._get_page_content(page)):
|
| 560 |
# Waiting for the verify spinner to disappear, checking every 500ms if it disappeared
|
| 561 |
await page.wait_for_timeout(500)
|
| 562 |
|