Karim shoair committed on
Commit
887b306
·
1 Parent(s): ae12e0c

feat: Add an mcp server

Browse files
Files changed (3) hide show
  1. scrapling/cli.py +8 -0
  2. scrapling/core/ai.py +613 -0
  3. scrapling/core/shell.py +27 -21
scrapling/cli.py CHANGED
@@ -133,6 +133,13 @@ def install(force):
133
  print("The dependencies are already installed")
134
 
135
 
 
 
 
 
 
 
 
136
  @command(help="Interactive scraping console")
137
  @option(
138
  "-c",
@@ -824,3 +831,4 @@ def main():
824
  main.add_command(install)
825
  main.add_command(shell)
826
  main.add_command(extract)
 
 
133
  print("The dependencies are already installed")
134
 
135
 
136
@command(help="Run Scrapling's MCP server (Check the docs for more info).")
def mcp():
    """Start Scrapling's MCP server over stdio."""
    # Imported lazily so the CLI doesn't pay the MCP/pydantic import cost
    # unless this subcommand is actually invoked.
    from scrapling.core.ai import ScraplingMCPServer

    server = ScraplingMCPServer()
    server.serve()
141
+
142
+
143
  @command(help="Interactive scraping console")
144
  @option(
145
  "-c",
 
831
  main.add_command(install)
832
  main.add_command(shell)
833
  main.add_command(extract)
834
+ main.add_command(mcp)
scrapling/core/ai.py ADDED
@@ -0,0 +1,613 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from asyncio import gather
2
+
3
+ from mcp.server.fastmcp import FastMCP
4
+ from pydantic import BaseModel, Field
5
+
6
+ from scrapling.core.shell import Convertor
7
+ from scrapling.engines.toolbelt import Response as _ScraplingResponse
8
+ from scrapling.fetchers import (
9
+ Fetcher,
10
+ FetcherSession,
11
+ DynamicFetcher,
12
+ AsyncDynamicSession,
13
+ StealthyFetcher,
14
+ AsyncStealthySession,
15
+ )
16
+ from scrapling.core._types import (
17
+ Optional,
18
+ Literal,
19
+ Tuple,
20
+ extraction_types,
21
+ Union,
22
+ Mapping,
23
+ Dict,
24
+ List,
25
+ SelectorWaitStates,
26
+ Generator,
27
+ )
28
+ from curl_cffi.requests import (
29
+ BrowserTypeLiteral,
30
+ )
31
+
32
+
33
class ResponseModel(BaseModel):
    """Request's response information structure."""

    # HTTP status code of the final response.
    status: int = Field(description="The status code returned by the website.")
    # One entry per extracted element (multiple when a CSS selector matches
    # several nodes). Uses the module-wide `List` alias for consistency with
    # the rest of this file instead of the builtin `list[...]` form.
    content: List[str] = Field(
        description="The content as Markdown/HTML or the text content of the page."
    )
    # The URL the caller requested (as reported by the response object).
    url: str = Field(
        description="The URL given by the user that resulted in this response."
    )
43
+
44
+
45
def _ContentTranslator(
    content: Generator[str, None, None], page: _ScraplingResponse
) -> ResponseModel:
    """Materialize a content generator into a single ResponseModel.

    :param content: Generator yielding extracted content strings
        (one item per matched element).
    :param page: The response the content came from; supplies the
        status code and URL.
    :return: A ``ResponseModel`` bundling status, content list, and URL.
    """
    # `list(content)` consumes the generator in one pass; the original
    # identity comprehension `[result for result in content]` did the
    # same thing with extra noise (ruff C416).
    return ResponseModel(status=page.status, content=list(content), url=page.url)
52
+
53
+
54
class ScraplingMCPServer:
    """MCP server exposing Scrapling's fetchers as tools (served over stdio).

    Each tool fetches one or more URLs with a different engine (plain HTTP,
    Playwright, or Camoufox) and returns the page content converted via
    `Convertor._extract_content`. NOTE: the docstrings below are sent to MCP
    clients as the tool descriptions, so they must stay accurate.
    """

    # Single FastMCP instance shared by all tool registrations below; the
    # `@_server.tool()` decorators run at class-definition time.
    _server = FastMCP(name="Scrapling")

    @staticmethod
    @_server.tool()
    def get(
        url: str,
        impersonate: Optional[BrowserTypeLiteral] = "chrome",
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        params: Optional[Union[Dict, List, Tuple]] = None,
        headers: Optional[Mapping[str, Optional[str]]] = None,
        cookies: Optional[Union[dict[str, str], list[tuple[str, str]]]] = None,
        timeout: Optional[Union[int, float]] = 30,
        follow_redirects: bool = True,
        max_redirects: int = 30,
        retries: Optional[int] = 3,
        retry_delay: Optional[int] = 1,
        proxy: Optional[str] = None,
        proxy_auth: Optional[Tuple[str, str]] = None,
        auth: Optional[Tuple[str, str]] = None,
        verify: Optional[bool] = True,
        http3: Optional[bool] = False,
        stealthy_headers: Optional[bool] = True,
    ) -> ResponseModel:
        """Make GET HTTP request to a URL and return a structured output of the result.
        Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param url: The URL to request.
        :param impersonate: Browser version to impersonate its fingerprint. It's using the latest chrome version by default.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param params: Query string parameters for the request.
        :param headers: Headers to include in the request.
        :param cookies: Cookies to use in the request.
        :param timeout: Number of seconds to wait before timing out.
        :param follow_redirects: Whether to follow redirects. Defaults to True.
        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
        :param retries: Number of retry attempts. Defaults to 3.
        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
        :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
        :param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
        :param verify: Whether to verify HTTPS certificates.
        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
        """
        page = Fetcher.get(
            url,
            auth=auth,
            proxy=proxy,
            http3=http3,
            verify=verify,
            params=params,
            proxy_auth=proxy_auth,
            retry_delay=retry_delay,
            stealthy_headers=stealthy_headers,
            impersonate=impersonate,
            headers=headers,
            cookies=cookies,
            timeout=timeout,
            retries=retries,
            max_redirects=max_redirects,
            follow_redirects=follow_redirects,
        )
        return _ContentTranslator(
            Convertor._extract_content(
                page,
                css_selector=css_selector,
                extraction_type=extraction_type,
                main_content_only=main_content_only,
            ),
            page,
        )

    @staticmethod
    @_server.tool()
    async def bulk_get(
        urls: Tuple[str, ...],
        impersonate: Optional[BrowserTypeLiteral] = "chrome",
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        params: Optional[Union[Dict, List, Tuple]] = None,
        headers: Optional[Mapping[str, Optional[str]]] = None,
        cookies: Optional[Union[dict[str, str], list[tuple[str, str]]]] = None,
        timeout: Optional[Union[int, float]] = 30,
        follow_redirects: bool = True,
        max_redirects: int = 30,
        retries: Optional[int] = 3,
        retry_delay: Optional[int] = 1,
        proxy: Optional[str] = None,
        proxy_auth: Optional[Tuple[str, str]] = None,
        auth: Optional[Tuple[str, str]] = None,
        verify: Optional[bool] = True,
        http3: Optional[bool] = False,
        stealthy_headers: Optional[bool] = True,
    ) -> List[ResponseModel]:
        """Make GET HTTP request to a group of URLs and for each URL, return a structured output of the result.
        Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param urls: A tuple of the URLs to request.
        :param impersonate: Browser version to impersonate its fingerprint. It's using the latest chrome version by default.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param params: Query string parameters for the request.
        :param headers: Headers to include in the request.
        :param cookies: Cookies to use in the request.
        :param timeout: Number of seconds to wait before timing out.
        :param follow_redirects: Whether to follow redirects. Defaults to True.
        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
        :param retries: Number of retry attempts. Defaults to 3.
        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
        :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
        :param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
        :param verify: Whether to verify HTTPS certificates.
        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
        """
        async with FetcherSession() as session:
            # Fire all requests concurrently through one session.
            tasks = [
                session.get(
                    url,
                    auth=auth,
                    proxy=proxy,
                    http3=http3,
                    verify=verify,
                    params=params,
                    headers=headers,
                    cookies=cookies,
                    timeout=timeout,
                    retries=retries,
                    proxy_auth=proxy_auth,
                    retry_delay=retry_delay,
                    impersonate=impersonate,
                    max_redirects=max_redirects,
                    follow_redirects=follow_redirects,
                    stealthy_headers=stealthy_headers,
                )
                for url in urls
            ]
            responses = await gather(*tasks)
            return [
                _ContentTranslator(
                    Convertor._extract_content(
                        page,
                        css_selector=css_selector,
                        extraction_type=extraction_type,
                        main_content_only=main_content_only,
                    ),
                    page,
                )
                for page in responses
            ]

    @staticmethod
    @_server.tool()
    async def fetch(
        url: str,
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        headless: bool = False,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: Union[int, float] = 0,
        proxy: Optional[Union[str, Dict[str, str]]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: Union[int, float] = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Optional[List[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
    ) -> ResponseModel:
        """Use playwright to open a browser to fetch a URL and return a structured output of the result.
        Note: This is only suitable for low-mid protection levels.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param url: The URL to request.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param headless: Run the browser in headful/visible mode (the default), or headless/hidden mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request. It should be a list of dictionaries in the format Playwright accepts.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        """
        page = await DynamicFetcher.async_fetch(
            url,
            wait=wait,
            proxy=proxy,
            locale=locale,
            timeout=timeout,
            cookies=cookies,
            stealth=stealth,
            cdp_url=cdp_url,
            headless=headless,
            useragent=useragent,
            hide_canvas=hide_canvas,
            real_chrome=real_chrome,
            network_idle=network_idle,
            wait_selector=wait_selector,
            disable_webgl=disable_webgl,
            extra_headers=extra_headers,
            google_search=google_search,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
        )
        return _ContentTranslator(
            Convertor._extract_content(
                page,
                css_selector=css_selector,
                extraction_type=extraction_type,
                main_content_only=main_content_only,
            ),
            page,
        )

    @staticmethod
    @_server.tool()
    async def bulk_fetch(
        urls: Tuple[str, ...],
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        headless: bool = False,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: Union[int, float] = 0,
        proxy: Optional[Union[str, Dict[str, str]]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: Union[int, float] = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Optional[List[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
    ) -> List[ResponseModel]:
        """Use playwright to open a browser, then fetch a group of URLs at the same time, and for each page return a structured output of the result.
        Note: This is only suitable for low-mid protection levels.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param urls: A tuple of the URLs to request.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param headless: Run the browser in headful/visible mode (the default), or headless/hidden mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request. It should be a list of dictionaries in the format Playwright accepts.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        """
        # One page per URL so all fetches can run concurrently in one browser.
        async with AsyncDynamicSession(
            wait=wait,
            proxy=proxy,
            locale=locale,
            timeout=timeout,
            cookies=cookies,
            stealth=stealth,
            cdp_url=cdp_url,
            headless=headless,
            max_pages=len(urls),
            useragent=useragent,
            hide_canvas=hide_canvas,
            real_chrome=real_chrome,
            network_idle=network_idle,
            wait_selector=wait_selector,
            google_search=google_search,
            disable_webgl=disable_webgl,
            extra_headers=extra_headers,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
        ) as session:
            tasks = [session.fetch(url) for url in urls]
            responses = await gather(*tasks)
            return [
                _ContentTranslator(
                    Convertor._extract_content(
                        page,
                        css_selector=css_selector,
                        extraction_type=extraction_type,
                        main_content_only=main_content_only,
                    ),
                    page,
                )
                for page in responses
            ]

    @staticmethod
    @_server.tool()
    async def stealthy_fetch(
        url: str,
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        headless: Union[bool, Literal["virtual"]] = True,  # noqa: F821
        block_images: bool = False,
        disable_resources: bool = False,
        block_webrtc: bool = False,
        allow_webgl: bool = True,
        network_idle: bool = False,
        humanize: Union[bool, float] = True,
        solve_cloudflare: bool = False,
        wait: Union[int, float] = 0,
        timeout: Union[int, float] = 30000,
        wait_selector: Optional[str] = None,
        addons: Optional[List[str]] = None,
        wait_selector_state: SelectorWaitStates = "attached",
        cookies: Optional[List[Dict]] = None,
        google_search: bool = True,
        extra_headers: Optional[Dict[str, str]] = None,
        proxy: Optional[Union[str, Dict[str, str]]] = None,
        os_randomize: bool = False,
        disable_ads: bool = False,
        geoip: bool = False,
        additional_arguments: Optional[Dict] = None,
    ) -> ResponseModel:
        """Use Scrapling's version of the Camoufox browser to fetch a URL and return a structured output of the result.
        Note: This is best suitable for high protection levels. It's slower than the other tools.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param url: The URL to request.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
        :param block_images: Prevent the loading of images through Firefox preferences.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param block_webrtc: Blocks WebRTC entirely.
        :param cookies: Set cookies for the next request.
        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
        """
        page = await StealthyFetcher.async_fetch(
            url,
            wait=wait,
            proxy=proxy,
            geoip=geoip,
            addons=addons,
            timeout=timeout,
            cookies=cookies,
            headless=headless,
            humanize=humanize,
            allow_webgl=allow_webgl,
            disable_ads=disable_ads,
            network_idle=network_idle,
            block_images=block_images,
            block_webrtc=block_webrtc,
            os_randomize=os_randomize,
            wait_selector=wait_selector,
            google_search=google_search,
            extra_headers=extra_headers,
            solve_cloudflare=solve_cloudflare,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
            additional_arguments=additional_arguments,
        )
        return _ContentTranslator(
            Convertor._extract_content(
                page,
                css_selector=css_selector,
                extraction_type=extraction_type,
                main_content_only=main_content_only,
            ),
            page,
        )

    @staticmethod
    @_server.tool()
    async def bulk_stealthy_fetch(
        urls: Tuple[str, ...],
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        headless: Union[bool, Literal["virtual"]] = True,  # noqa: F821
        block_images: bool = False,
        disable_resources: bool = False,
        block_webrtc: bool = False,
        allow_webgl: bool = True,
        network_idle: bool = False,
        humanize: Union[bool, float] = True,
        solve_cloudflare: bool = False,
        wait: Union[int, float] = 0,
        timeout: Union[int, float] = 30000,
        wait_selector: Optional[str] = None,
        addons: Optional[List[str]] = None,
        wait_selector_state: SelectorWaitStates = "attached",
        cookies: Optional[List[Dict]] = None,
        google_search: bool = True,
        extra_headers: Optional[Dict[str, str]] = None,
        proxy: Optional[Union[str, Dict[str, str]]] = None,
        os_randomize: bool = False,
        disable_ads: bool = False,
        geoip: bool = False,
        additional_arguments: Optional[Dict] = None,
    ) -> List[ResponseModel]:
        """Use Scrapling's version of the Camoufox browser to fetch a group of URLs at the same time, and for each page return a structured output of the result.
        Note: This is best suitable for high protection levels. It's slower than the other tools.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param urls: A tuple of the URLs to request.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
        :param block_images: Prevent the loading of images through Firefox preferences.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param block_webrtc: Blocks WebRTC entirely.
        :param cookies: Set cookies for the next request.
        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
        """
        # One page per URL so all fetches can run concurrently in one browser.
        async with AsyncStealthySession(
            wait=wait,
            proxy=proxy,
            geoip=geoip,
            addons=addons,
            timeout=timeout,
            cookies=cookies,
            headless=headless,
            humanize=humanize,
            max_pages=len(urls),
            allow_webgl=allow_webgl,
            disable_ads=disable_ads,
            block_images=block_images,
            block_webrtc=block_webrtc,
            network_idle=network_idle,
            os_randomize=os_randomize,
            wait_selector=wait_selector,
            google_search=google_search,
            extra_headers=extra_headers,
            solve_cloudflare=solve_cloudflare,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
            additional_arguments=additional_arguments,
        ) as session:
            tasks = [session.fetch(url) for url in urls]
            responses = await gather(*tasks)
            return [
                _ContentTranslator(
                    Convertor._extract_content(
                        page,
                        css_selector=css_selector,
                        extraction_type=extraction_type,
                        main_content_only=main_content_only,
                    ),
                    page,
                )
                for page in responses
            ]

    def serve(self):
        """Serve the MCP server over the stdio transport (blocks until exit)."""
        self._server.run(transport="stdio")
scrapling/core/shell.py CHANGED
@@ -36,6 +36,7 @@ from scrapling.core._types import (
36
  Any,
37
  Union,
38
  extraction_types,
 
39
  )
40
  from scrapling.fetchers import (
41
  Fetcher,
@@ -589,7 +590,7 @@ class Convertor:
589
  extraction_type: extraction_types = "markdown",
590
  css_selector: Optional[str] = None,
591
  main_content_only: bool = False,
592
- ) -> str:
593
  """Extract the content of an Adaptor"""
594
  if not page or not isinstance(page, Adaptor):
595
  raise TypeError("Input must be of type `Adaptor`")
@@ -599,24 +600,25 @@ class Convertor:
599
  if main_content_only:
600
  page = page.css_first("body") or page
601
 
602
- page = page if not css_selector else page.css_first(css_selector)
603
- match extraction_type:
604
- case "markdown":
605
- return cls._convert_to_markdown(page.body)
606
- case "html":
607
- return page.body
608
- case "text":
609
- txt_content = page.get_all_text(strip=True)
610
- for s in (
611
- "\n",
612
- "\r",
613
- "\t",
614
- " ",
615
- ):
616
- # Remove consecutive white-spaces
617
- txt_content = re_sub(f"[{s}]+", s, txt_content)
618
- return txt_content
619
- return ""
 
620
 
621
  @classmethod
622
  def write_content_to_file(
@@ -635,7 +637,11 @@ class Convertor:
635
  with open(filename, "w", encoding="utf-8") as f:
636
  extension = filename.split(".")[-1]
637
  f.write(
638
- cls._extract_content(
639
- page, cls._extension_map[extension], css_selector=css_selector
 
 
 
 
640
  )
641
  )
 
36
  Any,
37
  Union,
38
  extraction_types,
39
+ Generator,
40
  )
41
  from scrapling.fetchers import (
42
  Fetcher,
 
590
  extraction_type: extraction_types = "markdown",
591
  css_selector: Optional[str] = None,
592
  main_content_only: bool = False,
593
+ ) -> Generator[str, None, None]:
594
  """Extract the content of an Adaptor"""
595
  if not page or not isinstance(page, Adaptor):
596
  raise TypeError("Input must be of type `Adaptor`")
 
600
  if main_content_only:
601
  page = page.css_first("body") or page
602
 
603
+ pages = [page] if not css_selector else page.css(css_selector)
604
+ for page in pages:
605
+ match extraction_type:
606
+ case "markdown":
607
+ yield cls._convert_to_markdown(page.body)
608
+ case "html":
609
+ yield page.body
610
+ case "text":
611
+ txt_content = page.get_all_text(strip=True)
612
+ for s in (
613
+ "\n",
614
+ "\r",
615
+ "\t",
616
+ " ",
617
+ ):
618
+ # Remove consecutive white-spaces
619
+ txt_content = re_sub(f"[{s}]+", s, txt_content)
620
+ yield txt_content
621
+ yield ""
622
 
623
  @classmethod
624
  def write_content_to_file(
 
637
  with open(filename, "w", encoding="utf-8") as f:
638
  extension = filename.split(".")[-1]
639
  f.write(
640
+ "".join(
641
+ cls._extract_content(
642
+ page,
643
+ cls._extension_map[extension],
644
+ css_selector=css_selector,
645
+ )
646
  )
647
  )