Karim shoair committed on
Commit
8400659
·
1 Parent(s): 887b306

refactor(StealthyFetcher): Remove virtual mode and use persistent context

Browse files
scrapling/cli.py CHANGED
@@ -774,7 +774,7 @@ def stealthy_fetch(
774
 
775
  :param url: Target url.
776
  :param output_file: Output file path (.md for Markdown, .html for HTML).
777
- :param headless: Run the browser in headless/hidden, virtual screen mode, or headful/visible mode.
778
  :param block_images: Prevent the loading of images through Firefox preferences.
779
  :param disable_resources: Drop requests of unnecessary resources for a speed boost.
780
  :param block_webrtc: Blocks WebRTC entirely.
 
774
 
775
  :param url: Target url.
776
  :param output_file: Output file path (.md for Markdown, .html for HTML).
777
+ :param headless: Run the browser in headless/hidden, or headful/visible mode.
778
  :param block_images: Prevent the loading of images through Firefox preferences.
779
  :param disable_resources: Drop requests of unnecessary resources for a speed boost.
780
  :param block_webrtc: Blocks WebRTC entirely.
scrapling/core/ai.py CHANGED
@@ -410,7 +410,7 @@ class ScraplingMCPServer:
410
  extraction_type: extraction_types = "markdown",
411
  css_selector: Optional[str] = None,
412
  main_content_only: bool = True,
413
- headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
414
  block_images: bool = False,
415
  disable_resources: bool = False,
416
  block_webrtc: bool = False,
@@ -443,7 +443,7 @@ class ScraplingMCPServer:
443
  - Text will return the text content of the page.
444
  :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
445
  :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
446
- :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
447
  :param block_images: Prevent the loading of images through Firefox preferences.
448
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
449
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
@@ -510,7 +510,7 @@ class ScraplingMCPServer:
510
  extraction_type: extraction_types = "markdown",
511
  css_selector: Optional[str] = None,
512
  main_content_only: bool = True,
513
- headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
514
  block_images: bool = False,
515
  disable_resources: bool = False,
516
  block_webrtc: bool = False,
@@ -543,7 +543,7 @@ class ScraplingMCPServer:
543
  - Text will return the text content of the page.
544
  :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
545
  :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
546
- :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
547
  :param block_images: Prevent the loading of images through Firefox preferences.
548
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
549
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
 
410
  extraction_type: extraction_types = "markdown",
411
  css_selector: Optional[str] = None,
412
  main_content_only: bool = True,
413
+ headless: Union[bool] = True, # noqa: F821
414
  block_images: bool = False,
415
  disable_resources: bool = False,
416
  block_webrtc: bool = False,
 
443
  - Text will return the text content of the page.
444
  :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
445
  :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
446
+ :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
447
  :param block_images: Prevent the loading of images through Firefox preferences.
448
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
449
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
 
510
  extraction_type: extraction_types = "markdown",
511
  css_selector: Optional[str] = None,
512
  main_content_only: bool = True,
513
+ headless: Union[bool] = True, # noqa: F821
514
  block_images: bool = False,
515
  disable_resources: bool = False,
516
  block_webrtc: bool = False,
 
543
  - Text will return the text content of the page.
544
  :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
545
  :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
546
+ :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
547
  :param block_images: Prevent the loading of images through Firefox preferences.
548
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
549
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
scrapling/engines/_browsers/_camoufox.py CHANGED
@@ -2,12 +2,11 @@ from time import time, sleep
2
  from re import compile as re_compile
3
  from asyncio import sleep as asyncio_sleep, Lock
4
 
5
- from camoufox import AsyncNewBrowser, NewBrowser, DefaultAddons
 
6
  from playwright.sync_api import (
7
  Response as SyncPlaywrightResponse,
8
  sync_playwright,
9
- BrowserType,
10
- Browser,
11
  BrowserContext,
12
  Playwright,
13
  Locator,
@@ -16,8 +15,6 @@ from playwright.sync_api import (
16
  from playwright.async_api import (
17
  async_playwright,
18
  Response as AsyncPlaywrightResponse,
19
- BrowserType as AsyncBrowserType,
20
- Browser as AsyncBrowser,
21
  BrowserContext as AsyncBrowserContext,
22
  Playwright as AsyncPlaywright,
23
  Locator as AsyncLocator,
@@ -32,7 +29,6 @@ from scrapling.core._types import (
32
  Optional,
33
  Union,
34
  Callable,
35
- Literal,
36
  List,
37
  SelectorWaitStates,
38
  )
@@ -82,14 +78,13 @@ class StealthySession:
82
  "page_pool",
83
  "_closed",
84
  "launch_options",
85
- "context_options",
86
  "_headers_keys",
87
  )
88
 
89
  def __init__(
90
  self,
91
  max_pages: int = 1,
92
- headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
93
  block_images: bool = False,
94
  disable_resources: bool = False,
95
  block_webrtc: bool = False,
@@ -115,7 +110,7 @@ class StealthySession:
115
  ):
116
  """A Browser session manager with page pooling
117
 
118
- :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
119
  :param block_images: Prevent the loading of images through Firefox preferences.
120
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
121
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
@@ -199,7 +194,6 @@ class StealthySession:
199
  self.additional_arguments = config.additional_arguments
200
 
201
  self.playwright: Optional[Playwright] = None
202
- self.browser: Optional[Union[BrowserType, Browser]] = None
203
  self.context: Optional[BrowserContext] = None
204
  self.page_pool = PagePool(self.max_pages)
205
  self._closed = False
@@ -214,28 +208,31 @@ class StealthySession:
214
 
215
  def __initiate_browser_options__(self):
216
  """Initiate browser options."""
217
- self.launch_options = {
218
- "geoip": self.geoip,
219
- "proxy": dict(self.proxy) if self.proxy else self.proxy,
220
- "enable_cache": True,
221
- "addons": self.addons,
222
- "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
223
- "headless": self.headless,
224
- "humanize": True if self.solve_cloudflare else self.humanize,
225
- "i_know_what_im_doing": True, # To turn warnings off with the user configurations
226
- "allow_webgl": self.allow_webgl,
227
- "block_webrtc": self.block_webrtc,
228
- "block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
229
- "os": None if self.os_randomize else get_os_name(),
230
- **self.additional_arguments,
231
- }
232
- self.context_options = {}
 
 
233
 
234
  def __create__(self):
235
  """Create a browser for this instance and context."""
236
  self.playwright = sync_playwright().start()
237
- self.browser = NewBrowser(self.playwright, **self.launch_options)
238
- self.context = self.browser.new_context(**self.context_options)
 
239
  if self.cookies:
240
  self.context.add_cookies(self.cookies)
241
 
@@ -255,10 +252,6 @@ class StealthySession:
255
  self.context.close()
256
  self.context = None
257
 
258
- if self.browser:
259
- self.browser.close()
260
- self.browser = None
261
-
262
  if self.playwright:
263
  self.playwright.stop()
264
  self.playwright = None
@@ -468,7 +461,7 @@ class AsyncStealthySession(StealthySession):
468
  def __init__(
469
  self,
470
  max_pages: int = 1,
471
- headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
472
  block_images: bool = False,
473
  disable_resources: bool = False,
474
  block_webrtc: bool = False,
@@ -494,7 +487,7 @@ class AsyncStealthySession(StealthySession):
494
  ):
495
  """A Browser session manager with page pooling
496
 
497
- :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
498
  :param block_images: Prevent the loading of images through Firefox preferences.
499
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
500
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
@@ -550,7 +543,6 @@ class AsyncStealthySession(StealthySession):
550
  additional_arguments,
551
  )
552
  self.playwright: Optional[AsyncPlaywright] = None
553
- self.browser: Optional[Union[AsyncBrowserType, AsyncBrowser]] = None
554
  self.context: Optional[AsyncBrowserContext] = None
555
  self._lock = Lock()
556
  self.__enter__ = None
@@ -559,9 +551,10 @@ class AsyncStealthySession(StealthySession):
559
  async def __create__(self):
560
  """Create a browser for this instance and context."""
561
  self.playwright: AsyncPlaywright = await async_playwright().start()
562
- self.browser = await AsyncNewBrowser(self.playwright, **self.launch_options)
563
- self.context: AsyncBrowserContext = await self.browser.new_context(
564
- **self.context_options
 
565
  )
566
  if self.cookies:
567
  await self.context.add_cookies(self.cookies)
@@ -582,10 +575,6 @@ class AsyncStealthySession(StealthySession):
582
  await self.context.close()
583
  self.context = None
584
 
585
- if self.browser:
586
- await self.browser.close()
587
- self.browser = None
588
-
589
  if self.playwright:
590
  await self.playwright.stop()
591
  self.playwright = None
 
2
  from re import compile as re_compile
3
  from asyncio import sleep as asyncio_sleep, Lock
4
 
5
+ from camoufox import DefaultAddons
6
+ from camoufox.utils import launch_options as generate_launch_options
7
  from playwright.sync_api import (
8
  Response as SyncPlaywrightResponse,
9
  sync_playwright,
 
 
10
  BrowserContext,
11
  Playwright,
12
  Locator,
 
15
  from playwright.async_api import (
16
  async_playwright,
17
  Response as AsyncPlaywrightResponse,
 
 
18
  BrowserContext as AsyncBrowserContext,
19
  Playwright as AsyncPlaywright,
20
  Locator as AsyncLocator,
 
29
  Optional,
30
  Union,
31
  Callable,
 
32
  List,
33
  SelectorWaitStates,
34
  )
 
78
  "page_pool",
79
  "_closed",
80
  "launch_options",
 
81
  "_headers_keys",
82
  )
83
 
84
  def __init__(
85
  self,
86
  max_pages: int = 1,
87
+ headless: Union[bool] = True, # noqa: F821
88
  block_images: bool = False,
89
  disable_resources: bool = False,
90
  block_webrtc: bool = False,
 
110
  ):
111
  """A Browser session manager with page pooling
112
 
113
+ :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
114
  :param block_images: Prevent the loading of images through Firefox preferences.
115
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
116
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
 
194
  self.additional_arguments = config.additional_arguments
195
 
196
  self.playwright: Optional[Playwright] = None
 
197
  self.context: Optional[BrowserContext] = None
198
  self.page_pool = PagePool(self.max_pages)
199
  self._closed = False
 
208
 
209
  def __initiate_browser_options__(self):
210
  """Initiate browser options."""
211
+ self.launch_options = generate_launch_options(
212
+ **{
213
+ "geoip": self.geoip,
214
+ "proxy": dict(self.proxy) if self.proxy else self.proxy,
215
+ "enable_cache": True,
216
+ "addons": self.addons,
217
+ "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
218
+ "headless": self.headless,
219
+ "humanize": True if self.solve_cloudflare else self.humanize,
220
+ "i_know_what_im_doing": True, # To turn warnings off with the user configurations
221
+ "allow_webgl": self.allow_webgl,
222
+ "block_webrtc": self.block_webrtc,
223
+ "block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
224
+ "os": None if self.os_randomize else get_os_name(),
225
+ "user_data_dir": "",
226
+ **self.additional_arguments,
227
+ }
228
+ )
229
 
230
  def __create__(self):
231
  """Create a browser for this instance and context."""
232
  self.playwright = sync_playwright().start()
233
+ self.context = self.playwright.firefox.launch_persistent_context(
234
+ **self.launch_options
235
+ )
236
  if self.cookies:
237
  self.context.add_cookies(self.cookies)
238
 
 
252
  self.context.close()
253
  self.context = None
254
 
 
 
 
 
255
  if self.playwright:
256
  self.playwright.stop()
257
  self.playwright = None
 
461
  def __init__(
462
  self,
463
  max_pages: int = 1,
464
+ headless: Union[bool] = True, # noqa: F821
465
  block_images: bool = False,
466
  disable_resources: bool = False,
467
  block_webrtc: bool = False,
 
487
  ):
488
  """A Browser session manager with page pooling
489
 
490
+ :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
491
  :param block_images: Prevent the loading of images through Firefox preferences.
492
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
493
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
 
543
  additional_arguments,
544
  )
545
  self.playwright: Optional[AsyncPlaywright] = None
 
546
  self.context: Optional[AsyncBrowserContext] = None
547
  self._lock = Lock()
548
  self.__enter__ = None
 
551
  async def __create__(self):
552
  """Create a browser for this instance and context."""
553
  self.playwright: AsyncPlaywright = await async_playwright().start()
554
+ self.context: AsyncBrowserContext = (
555
+ await self.playwright.firefox.launch_persistent_context(
556
+ **self.launch_options
557
+ )
558
  )
559
  if self.cookies:
560
  await self.context.add_cookies(self.cookies)
 
575
  await self.context.close()
576
  self.context = None
577
 
 
 
 
 
578
  if self.playwright:
579
  await self.playwright.stop()
580
  self.playwright = None
scrapling/engines/_browsers/_validators.py CHANGED
@@ -82,7 +82,7 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
82
  """Configuration struct for validation"""
83
 
84
  max_pages: int = 1
85
- headless: Union[bool, Literal["virtual"]] = True # noqa: F821
86
  block_images: bool = False
87
  disable_resources: bool = False
88
  block_webrtc: bool = False
 
82
  """Configuration struct for validation"""
83
 
84
  max_pages: int = 1
85
+ headless: Union[bool] = True # noqa: F821
86
  block_images: bool = False
87
  disable_resources: bool = False
88
  block_webrtc: bool = False
scrapling/fetchers.py CHANGED
@@ -52,7 +52,7 @@ class StealthyFetcher(BaseFetcher):
52
  def fetch(
53
  cls,
54
  url: str,
55
- headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
56
  block_images: bool = False,
57
  disable_resources: bool = False,
58
  block_webrtc: bool = False,
@@ -80,7 +80,7 @@ class StealthyFetcher(BaseFetcher):
80
  Opens up a browser and do your request based on your chosen options below.
81
 
82
  :param url: Target url.
83
- :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
84
  :param block_images: Prevent the loading of images through Firefox preferences.
85
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
86
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
@@ -148,7 +148,7 @@ class StealthyFetcher(BaseFetcher):
148
  async def async_fetch(
149
  cls,
150
  url: str,
151
- headless: Union[bool, Literal["virtual"]] = True, # noqa: F821
152
  block_images: bool = False,
153
  disable_resources: bool = False,
154
  block_webrtc: bool = False,
@@ -176,7 +176,7 @@ class StealthyFetcher(BaseFetcher):
176
  Opens up a browser and do your request based on your chosen options below.
177
 
178
  :param url: Target url.
179
- :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
180
  :param block_images: Prevent the loading of images through Firefox preferences.
181
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
182
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
 
52
  def fetch(
53
  cls,
54
  url: str,
55
+ headless: Union[bool] = True, # noqa: F821
56
  block_images: bool = False,
57
  disable_resources: bool = False,
58
  block_webrtc: bool = False,
 
80
  Opens up a browser and do your request based on your chosen options below.
81
 
82
  :param url: Target url.
83
+ :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
84
  :param block_images: Prevent the loading of images through Firefox preferences.
85
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
86
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
 
148
  async def async_fetch(
149
  cls,
150
  url: str,
151
+ headless: Union[bool] = True, # noqa: F821
152
  block_images: bool = False,
153
  disable_resources: bool = False,
154
  block_webrtc: bool = False,
 
176
  Opens up a browser and do your request based on your chosen options below.
177
 
178
  :param url: Target url.
179
+ :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
180
  :param block_images: Prevent the loading of images through Firefox preferences.
181
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
182
  :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.