v0.4.2 (#183)
Browse files- .github/workflows/tests.yml +1 -1
- README.md +0 -1
- agent-skill/Scrapling-Skill.zip +0 -0
- agent-skill/Scrapling-Skill/SKILL.md +2 -2
- agent-skill/Scrapling-Skill/examples/README.md +1 -1
- agent-skill/Scrapling-Skill/references/fetching/dynamic.md +3 -3
- agent-skill/Scrapling-Skill/references/fetching/static.md +1 -1
- agent-skill/Scrapling-Skill/references/fetching/stealthy.md +4 -5
- agent-skill/Scrapling-Skill/references/mcp-server.md +2 -2
- docs/README_AR.md +0 -1
- docs/README_CN.md +0 -1
- docs/README_DE.md +0 -1
- docs/README_ES.md +0 -1
- docs/README_FR.md +0 -1
- docs/README_JP.md +0 -1
- docs/README_RU.md +0 -1
- docs/fetching/dynamic.md +3 -3
- docs/fetching/static.md +1 -1
- docs/fetching/stealthy.md +4 -5
- docs/overview.md +2 -3
- pyproject.toml +5 -4
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +1 -2
- scrapling/core/ai.py +10 -10
- scrapling/engines/_browsers/_base.py +2 -2
- scrapling/engines/_browsers/_controllers.py +10 -15
- scrapling/engines/_browsers/_stealth.py +10 -15
- scrapling/engines/static.py +4 -4
- scrapling/engines/toolbelt/fingerprints.py +3 -32
- scrapling/fetchers/chrome.py +2 -2
- scrapling/fetchers/stealth_chrome.py +4 -4
- scrapling/parser.py +24 -10
- server.json +2 -2
- setup.cfg +1 -1
- tests/fetchers/test_utils.py +0 -17
- tests/parser/test_parser_advanced.py +27 -0
- tests/requirements.txt +1 -1
- tox.ini +2 -2
.github/workflows/tests.yml
CHANGED
|
@@ -73,7 +73,7 @@ jobs:
|
|
| 73 |
- name: Install all browsers dependencies
|
| 74 |
run: |
|
| 75 |
python3 -m pip install --upgrade pip
|
| 76 |
-
python3 -m pip install playwright==1.
|
| 77 |
|
| 78 |
- name: Get Playwright version
|
| 79 |
id: playwright-version
|
|
|
|
| 73 |
- name: Install all browsers dependencies
|
| 74 |
run: |
|
| 75 |
python3 -m pip install --upgrade pip
|
| 76 |
+
python3 -m pip install playwright==1.58.0 patchright==1.58.2
|
| 77 |
|
| 78 |
- name: Get Playwright version
|
| 79 |
id: playwright-version
|
README.md
CHANGED
|
@@ -144,7 +144,6 @@ MySpider().start()
|
|
| 144 |
|
| 145 |
<!-- sponsors -->
|
| 146 |
|
| 147 |
-
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 148 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 149 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 150 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
|
|
|
| 144 |
|
| 145 |
<!-- sponsors -->
|
| 146 |
|
|
|
|
| 147 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 148 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 149 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
agent-skill/Scrapling-Skill.zip
CHANGED
|
Binary files a/agent-skill/Scrapling-Skill.zip and b/agent-skill/Scrapling-Skill.zip differ
|
|
|
agent-skill/Scrapling-Skill/SKILL.md
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
---
|
| 2 |
name: scrapling-official
|
| 3 |
description: Scrape web pages using Scrapling with anti-bot bypass (like Cloudflare Turnstile), stealth headless browsing, spiders framework, adaptive scraping, and JavaScript rendering. Use when asked to scrape, crawl, or extract data from websites; web_fetch fails; the site has anti-bot protections; write Python code to scrape/crawl; or write spiders.
|
| 4 |
-
version: 0.4.1
|
| 5 |
license: Complete terms in LICENSE.txt
|
| 6 |
---
|
| 7 |
|
|
@@ -22,7 +22,7 @@ Blazing fast crawls with real-time stats and streaming. Built by Web Scrapers fo
|
|
| 22 |
|
| 23 |
Create a virtual Python environment through any way available, like `venv`, then inside the environment do:
|
| 24 |
|
| 25 |
-
`pip install "scrapling[all]>=0.4.1"`
|
| 26 |
|
| 27 |
Then do this to download all the browsers' dependencies:
|
| 28 |
|
|
|
|
| 1 |
---
|
| 2 |
name: scrapling-official
|
| 3 |
description: Scrape web pages using Scrapling with anti-bot bypass (like Cloudflare Turnstile), stealth headless browsing, spiders framework, adaptive scraping, and JavaScript rendering. Use when asked to scrape, crawl, or extract data from websites; web_fetch fails; the site has anti-bot protections; write Python code to scrape/crawl; or write spiders.
|
| 4 |
+
version: 0.4.2
|
| 5 |
license: Complete terms in LICENSE.txt
|
| 6 |
---
|
| 7 |
|
|
|
|
| 22 |
|
| 23 |
Create a virtual Python environment through any way available, like `venv`, then inside the environment do:
|
| 24 |
|
| 25 |
+
`pip install "scrapling[all]>=0.4.2"`
|
| 26 |
|
| 27 |
Then do this to download all the browsers' dependencies:
|
| 28 |
|
agent-skill/Scrapling-Skill/examples/README.md
CHANGED
|
@@ -9,7 +9,7 @@ All examples collect **all 100 quotes across 10 pages**.
|
|
| 9 |
Make sure Scrapling is installed:
|
| 10 |
|
| 11 |
```bash
|
| 12 |
-
pip install "scrapling[all]>=0.4.1"
|
| 13 |
scrapling install --force
|
| 14 |
```
|
| 15 |
|
|
|
|
| 9 |
Make sure Scrapling is installed:
|
| 10 |
|
| 11 |
```bash
|
| 12 |
+
pip install "scrapling[all]>=0.4.2"
|
| 13 |
scrapling install --force
|
| 14 |
```
|
| 15 |
|
agent-skill/Scrapling-Skill/references/fetching/dynamic.md
CHANGED
|
@@ -64,8 +64,8 @@ All arguments for `DynamicFetcher` and its session classes:
|
|
| 64 |
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
|
| 65 |
| init_script | An absolute path to a JavaScript file to be executed on page creation for all pages in this session. | ✔️ |
|
| 66 |
| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
| 67 |
-
| google_search | Enabled by default, Scrapling will set
|
| 68 |
-
| extra_headers | A dictionary of extra headers to add to the request. _The referer set by
|
| 69 |
| proxy | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'. | ✔️ |
|
| 70 |
| real_chrome | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser. | ✔️ |
|
| 71 |
| locale | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. Defaults to the system default locale. | ✔️ |
|
|
@@ -84,7 +84,7 @@ In session classes, all these arguments can be set globally for the session. Sti
|
|
| 84 |
|
| 85 |
**Notes:**
|
| 86 |
1. The `disable_resources` option made requests ~25% faster in tests for some websites and can help save proxy usage, but be careful with it, as it can cause some websites to never finish loading.
|
| 87 |
-
2. The `google_search` argument is enabled by default for all requests,
|
| 88 |
3. Since version 0.3.13, the `stealth` option has been removed here in favor of the `StealthyFetcher` class, and the `hide_canvas` option has been moved to it. The `disable_webgl` argument has been moved to the `StealthyFetcher` class and renamed as `allow_webgl`.
|
| 89 |
4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
|
| 90 |
|
|
|
|
| 64 |
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
|
| 65 |
| init_script | An absolute path to a JavaScript file to be executed on page creation for all pages in this session. | ✔️ |
|
| 66 |
| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
| 67 |
+
| google_search | Enabled by default, Scrapling will set a Google referer header. | ✔️ |
|
| 68 |
+
| extra_headers | A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._ | ✔️ |
|
| 69 |
| proxy | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'. | ✔️ |
|
| 70 |
| real_chrome | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser. | ✔️ |
|
| 71 |
| locale | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. Defaults to the system default locale. | ✔️ |
|
|
|
|
| 84 |
|
| 85 |
**Notes:**
|
| 86 |
1. The `disable_resources` option made requests ~25% faster in tests for some websites and can help save proxy usage, but be careful with it, as it can cause some websites to never finish loading.
|
| 87 |
+
2. The `google_search` argument is enabled by default for all requests, setting the referer to `https://www.google.com/`. If used together with `extra_headers`, it takes priority over the referer set there.
|
| 88 |
3. Since version 0.3.13, the `stealth` option has been removed here in favor of the `StealthyFetcher` class, and the `hide_canvas` option has been moved to it. The `disable_webgl` argument has been moved to the `StealthyFetcher` class and renamed as `allow_webgl`.
|
| 89 |
4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
|
| 90 |
|
agent-skill/Scrapling-Skill/references/fetching/static.md
CHANGED
|
@@ -14,7 +14,7 @@ Check out how to configure the parsing options [here](choosing.md#parser-configu
|
|
| 14 |
All methods for making requests here share some arguments, so let's discuss them first.
|
| 15 |
|
| 16 |
- **url**: The targeted URL
|
| 17 |
-
- **stealthy_headers**: If enabled (default), it creates and adds real browser headers. It also sets
|
| 18 |
- **follow_redirects**: As the name implies, tell the fetcher to follow redirections. **Enabled by default**
|
| 19 |
- **timeout**: The number of seconds to wait for each request to be finished. **Defaults to 30 seconds**.
|
| 20 |
- **retries**: The number of retries that the fetcher will do for failed requests. **Defaults to three retries**.
|
|
|
|
| 14 |
All methods for making requests here share some arguments, so let's discuss them first.
|
| 15 |
|
| 16 |
- **url**: The targeted URL
|
| 17 |
+
- **stealthy_headers**: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.
|
| 18 |
- **follow_redirects**: As the name implies, tell the fetcher to follow redirections. **Enabled by default**
|
| 19 |
- **timeout**: The number of seconds to wait for each request to be finished. **Defaults to 30 seconds**.
|
| 20 |
- **retries**: The number of retries that the fetcher will do for failed requests. **Defaults to three retries**.
|
agent-skill/Scrapling-Skill/references/fetching/stealthy.md
CHANGED
|
@@ -21,8 +21,7 @@ The `StealthyFetcher` class is a stealthy version of the [DynamicFetcher](dynami
|
|
| 21 |
3. It isolates JS execution, removes many Playwright fingerprints, and stops detection through some of the known behaviors that bots do.
|
| 22 |
4. It generates canvas noise to prevent fingerprinting through canvas.
|
| 23 |
5. It automatically patches known methods to detect running in headless mode and provides an option to defeat timezone mismatch attacks.
|
| 24 |
-
6.
|
| 25 |
-
7. and other anti-protection options...
|
| 26 |
|
| 27 |
## Full list of arguments
|
| 28 |
Scrapling provides many options with this fetcher and its session classes. Before jumping to the [examples](#examples), here's the full list of arguments
|
|
@@ -43,8 +42,8 @@ Scrapling provides many options with this fetcher and its session classes. Befor
|
|
| 43 |
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
|
| 44 |
| init_script | An absolute path to a JavaScript file to be executed on page creation for all pages in this session. | ✔️ |
|
| 45 |
| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
| 46 |
-
| google_search | Enabled by default, Scrapling will set
|
| 47 |
-
| extra_headers | A dictionary of extra headers to add to the request. _The referer set by
|
| 48 |
| proxy | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'. | ✔️ |
|
| 49 |
| real_chrome | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser. | ✔️ |
|
| 50 |
| locale | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. Defaults to the system default locale. | ✔️ |
|
|
@@ -69,7 +68,7 @@ In session classes, all these arguments can be set globally for the session. Sti
|
|
| 69 |
|
| 70 |
1. It's basically the same arguments as [DynamicFetcher](dynamic.md) class, but with these additional arguments: `solve_cloudflare`, `block_webrtc`, `hide_canvas`, and `allow_webgl`.
|
| 71 |
2. The `disable_resources` option made requests ~25% faster in tests for some websites and can help save proxy usage, but be careful with it, as it can cause some websites to never finish loading.
|
| 72 |
-
3. The `google_search` argument is enabled by default for all requests,
|
| 73 |
4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
|
| 74 |
|
| 75 |
## Examples
|
|
|
|
| 21 |
3. It isolates JS execution, removes many Playwright fingerprints, and stops detection through some of the known behaviors that bots do.
|
| 22 |
4. It generates canvas noise to prevent fingerprinting through canvas.
|
| 23 |
5. It automatically patches known methods to detect running in headless mode and provides an option to defeat timezone mismatch attacks.
|
| 24 |
+
6. and other anti-protection options...
|
|
|
|
| 25 |
|
| 26 |
## Full list of arguments
|
| 27 |
Scrapling provides many options with this fetcher and its session classes. Before jumping to the [examples](#examples), here's the full list of arguments
|
|
|
|
| 42 |
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
|
| 43 |
| init_script | An absolute path to a JavaScript file to be executed on page creation for all pages in this session. | ✔️ |
|
| 44 |
| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
| 45 |
+
| google_search | Enabled by default, Scrapling will set a Google referer header. | ✔️ |
|
| 46 |
+
| extra_headers | A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._ | ✔️ |
|
| 47 |
| proxy | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'. | ✔️ |
|
| 48 |
| real_chrome | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser. | ✔️ |
|
| 49 |
| locale | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. Defaults to the system default locale. | ✔️ |
|
|
|
|
| 68 |
|
| 69 |
1. It's basically the same arguments as [DynamicFetcher](dynamic.md) class, but with these additional arguments: `solve_cloudflare`, `block_webrtc`, `hide_canvas`, and `allow_webgl`.
|
| 70 |
2. The `disable_resources` option made requests ~25% faster in tests for some websites and can help save proxy usage, but be careful with it, as it can cause some websites to never finish loading.
|
| 71 |
+
3. The `google_search` argument is enabled by default for all requests, setting the referer to `https://www.google.com/`. If used together with `extra_headers`, it takes priority over the referer set there.
|
| 72 |
4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
|
| 73 |
|
| 74 |
## Examples
|
agent-skill/Scrapling-Skill/references/mcp-server.md
CHANGED
|
@@ -25,7 +25,7 @@ Fast HTTP GET with browser fingerprint impersonation (TLS, headers). Suitable fo
|
|
| 25 |
| `timeout` | number | 30 | Seconds before timeout |
|
| 26 |
| `retries` | int | 3 | Retry attempts on failure |
|
| 27 |
| `retry_delay` | int | 1 | Seconds between retries |
|
| 28 |
-
| `stealthy_headers` | bool | true | Generate realistic browser headers and Google
|
| 29 |
| `http3` | bool | false | Use HTTP/3 (may conflict with `impersonate`) |
|
| 30 |
| `follow_redirects` | bool | true | Follow HTTP redirects |
|
| 31 |
| `max_redirects` | int | 30 | Max redirects (-1 for unlimited) |
|
|
@@ -58,7 +58,7 @@ Opens a Chromium browser via Playwright to render JavaScript. Suitable for dynam
|
|
| 58 |
| `wait_selector_state` | str | `"attached"` | State for wait_selector: `"attached"` / `"visible"` / `"hidden"` / `"detached"` |
|
| 59 |
| `network_idle` | bool | false | Wait until no network activity for 500ms |
|
| 60 |
| `disable_resources` | bool | false | Block fonts, images, media, stylesheets, etc. for speed |
|
| 61 |
-
| `google_search` | bool | true | Set
|
| 62 |
| `real_chrome` | bool | false | Use locally installed Chrome instead of bundled Chromium |
|
| 63 |
| `cdp_url` | str or null | null | Connect to existing browser via CDP URL |
|
| 64 |
| `extra_headers` | dict or null | null | Additional request headers |
|
|
|
|
| 25 |
| `timeout` | number | 30 | Seconds before timeout |
|
| 26 |
| `retries` | int | 3 | Retry attempts on failure |
|
| 27 |
| `retry_delay` | int | 1 | Seconds between retries |
|
| 28 |
+
| `stealthy_headers` | bool | true | Generate realistic browser headers and Google referer |
|
| 29 |
| `http3` | bool | false | Use HTTP/3 (may conflict with `impersonate`) |
|
| 30 |
| `follow_redirects` | bool | true | Follow HTTP redirects |
|
| 31 |
| `max_redirects` | int | 30 | Max redirects (-1 for unlimited) |
|
|
|
|
| 58 |
| `wait_selector_state` | str | `"attached"` | State for wait_selector: `"attached"` / `"visible"` / `"hidden"` / `"detached"` |
|
| 59 |
| `network_idle` | bool | false | Wait until no network activity for 500ms |
|
| 60 |
| `disable_resources` | bool | false | Block fonts, images, media, stylesheets, etc. for speed |
|
| 61 |
+
| `google_search` | bool | true | Set a Google referer header |
|
| 62 |
| `real_chrome` | bool | false | Use locally installed Chrome instead of bundled Chromium |
|
| 63 |
| `cdp_url` | str or null | null | Connect to existing browser via CDP URL |
|
| 64 |
| `extra_headers` | dict or null | null | Additional request headers |
|
docs/README_AR.md
CHANGED
|
@@ -140,7 +140,6 @@ MySpider().start()
|
|
| 140 |
|
| 141 |
<!-- sponsors -->
|
| 142 |
|
| 143 |
-
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 144 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 145 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 146 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
|
|
|
| 140 |
|
| 141 |
<!-- sponsors -->
|
| 142 |
|
|
|
|
| 143 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 144 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 145 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
docs/README_CN.md
CHANGED
|
@@ -140,7 +140,6 @@ MySpider().start()
|
|
| 140 |
|
| 141 |
<!-- sponsors -->
|
| 142 |
|
| 143 |
-
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 144 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 145 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 146 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
|
|
|
| 140 |
|
| 141 |
<!-- sponsors -->
|
| 142 |
|
|
|
|
| 143 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 144 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 145 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
docs/README_DE.md
CHANGED
|
@@ -140,7 +140,6 @@ MySpider().start()
|
|
| 140 |
|
| 141 |
<!-- sponsors -->
|
| 142 |
|
| 143 |
-
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 144 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 145 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 146 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
|
|
|
| 140 |
|
| 141 |
<!-- sponsors -->
|
| 142 |
|
|
|
|
| 143 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 144 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 145 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
docs/README_ES.md
CHANGED
|
@@ -140,7 +140,6 @@ MySpider().start()
|
|
| 140 |
|
| 141 |
<!-- sponsors -->
|
| 142 |
|
| 143 |
-
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 144 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 145 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 146 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
|
|
|
| 140 |
|
| 141 |
<!-- sponsors -->
|
| 142 |
|
|
|
|
| 143 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 144 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 145 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
docs/README_FR.md
CHANGED
|
@@ -140,7 +140,6 @@ MySpider().start()
|
|
| 140 |
|
| 141 |
<!-- sponsors -->
|
| 142 |
|
| 143 |
-
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 144 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 145 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 146 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
|
|
|
| 140 |
|
| 141 |
<!-- sponsors -->
|
| 142 |
|
|
|
|
| 143 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 144 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 145 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
docs/README_JP.md
CHANGED
|
@@ -140,7 +140,6 @@ MySpider().start()
|
|
| 140 |
|
| 141 |
<!-- sponsors -->
|
| 142 |
|
| 143 |
-
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 144 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 145 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 146 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
|
|
|
| 140 |
|
| 141 |
<!-- sponsors -->
|
| 142 |
|
|
|
|
| 143 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 144 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 145 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
docs/README_RU.md
CHANGED
|
@@ -143,7 +143,6 @@ MySpider().start()
|
|
| 143 |
|
| 144 |
<!-- sponsors -->
|
| 145 |
|
| 146 |
-
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 147 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 148 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 149 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
|
|
|
| 143 |
|
| 144 |
<!-- sponsors -->
|
| 145 |
|
|
|
|
| 146 |
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 147 |
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 148 |
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
docs/fetching/dynamic.md
CHANGED
|
@@ -76,8 +76,8 @@ Scrapling provides many options with this fetcher and its session classes. To ma
|
|
| 76 |
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
|
| 77 |
| init_script | An absolute path to a JavaScript file to be executed on page creation for all pages in this session. | ✔️ |
|
| 78 |
| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
| 79 |
-
| google_search | Enabled by default, Scrapling will set
|
| 80 |
-
| extra_headers | A dictionary of extra headers to add to the request. _The referer set by
|
| 81 |
| proxy | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'. | ✔️ |
|
| 82 |
| real_chrome | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser. | ✔️ |
|
| 83 |
| locale | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. Defaults to the system default locale. | ✔️ |
|
|
@@ -97,7 +97,7 @@ In session classes, all these arguments can be set globally for the session. Sti
|
|
| 97 |
!!! note "Notes:"
|
| 98 |
|
| 99 |
1. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading.
|
| 100 |
-
2. The `google_search` argument is enabled by default for all requests,
|
| 101 |
3. Since version 0.3.13, the `stealth` option has been removed here in favor of the `StealthyFetcher` class, and the `hide_canvas` option has been moved to it. The `disable_webgl` argument has been moved to the `StealthyFetcher` class and renamed as `allow_webgl`.
|
| 102 |
4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
|
| 103 |
|
|
|
|
| 76 |
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
|
| 77 |
| init_script | An absolute path to a JavaScript file to be executed on page creation for all pages in this session. | ✔️ |
|
| 78 |
| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
| 79 |
+
| google_search | Enabled by default, Scrapling will set a Google referer header. | ✔️ |
|
| 80 |
+
| extra_headers | A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._ | ✔️ |
|
| 81 |
| proxy | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'. | ✔️ |
|
| 82 |
| real_chrome | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser. | ✔️ |
|
| 83 |
| locale | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. Defaults to the system default locale. | ✔️ |
|
|
|
|
| 97 |
!!! note "Notes:"
|
| 98 |
|
| 99 |
1. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading.
|
| 100 |
+
2. The `google_search` argument is enabled by default for all requests, setting the referer to `https://www.google.com/`. If used together with `extra_headers`, the referer set by `google_search` takes priority over the referer set there.
|
| 101 |
3. Since version 0.3.13, the `stealth` option has been removed here in favor of the `StealthyFetcher` class, and the `hide_canvas` option has been moved to it. The `disable_webgl` argument has been moved to the `StealthyFetcher` class and renamed as `allow_webgl`.
|
| 102 |
4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
|
| 103 |
|
docs/fetching/static.md
CHANGED
|
@@ -20,7 +20,7 @@ Check out how to configure the parsing options [here](choosing.md#parser-configu
|
|
| 20 |
All methods for making requests here share some arguments, so let's discuss them first.
|
| 21 |
|
| 22 |
- **url**: The targeted URL
|
| 23 |
-
- **stealthy_headers**: If enabled (default), it creates and adds real browser headers. It also sets
|
| 24 |
- **follow_redirects**: As the name implies, tell the fetcher to follow redirections. **Enabled by default**
|
| 25 |
- **timeout**: The number of seconds to wait for each request to be finished. **Defaults to 30 seconds**.
|
| 26 |
- **retries**: The number of retries that the fetcher will do for failed requests. **Defaults to three retries**.
|
|
|
|
| 20 |
All methods for making requests here share some arguments, so let's discuss them first.
|
| 21 |
|
| 22 |
- **url**: The targeted URL
|
| 23 |
+
- **stealthy_headers**: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.
|
| 24 |
- **follow_redirects**: As the name implies, tell the fetcher to follow redirections. **Enabled by default**
|
| 25 |
- **timeout**: The number of seconds to wait for each request to be finished. **Defaults to 30 seconds**.
|
| 26 |
- **retries**: The number of retries that the fetcher will do for failed requests. **Defaults to three retries**.
|
docs/fetching/stealthy.md
CHANGED
|
@@ -32,8 +32,7 @@ The `StealthyFetcher` class is a stealthy version of the [DynamicFetcher](dynami
|
|
| 32 |
3. It isolates JS execution, removes many Playwright fingerprints, and stops detection through some of the known behaviors that bots do.
|
| 33 |
4. It generates canvas noise to prevent fingerprinting through canvas.
|
| 34 |
5. It automatically patches known methods to detect running in headless mode and provides an option to defeat timezone mismatch attacks.
|
| 35 |
-
6.
|
| 36 |
-
7. and other anti-protection options...
|
| 37 |
|
| 38 |
## Full list of arguments
|
| 39 |
Scrapling provides many options with this fetcher and its session classes. Before jumping to the [examples](#examples), here's the full list of arguments
|
|
@@ -54,8 +53,8 @@ Scrapling provides many options with this fetcher and its session classes. Befor
|
|
| 54 |
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
|
| 55 |
| init_script | An absolute path to a JavaScript file to be executed on page creation for all pages in this session. | ✔️ |
|
| 56 |
| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
| 57 |
-
| google_search | Enabled by default, Scrapling will set
|
| 58 |
-
| extra_headers | A dictionary of extra headers to add to the request. _The referer set by
|
| 59 |
| proxy | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'. | ✔️ |
|
| 60 |
| real_chrome | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser. | ✔️ |
|
| 61 |
| locale | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. Defaults to the system default locale. | ✔️ |
|
|
@@ -80,7 +79,7 @@ In session classes, all these arguments can be set globally for the session. Sti
|
|
| 80 |
|
| 81 |
1. It's basically the same arguments as [DynamicFetcher](dynamic.md#introduction) class, but with these additional arguments: `solve_cloudflare`, `block_webrtc`, `hide_canvas`, and `allow_webgl`.
|
| 82 |
2. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading.
|
| 83 |
-
3. The `google_search` argument is enabled by default for all requests,
|
| 84 |
4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
|
| 85 |
|
| 86 |
## Examples
|
|
|
|
| 32 |
3. It isolates JS execution, removes many Playwright fingerprints, and stops detection through some of the known behaviors that bots do.
|
| 33 |
4. It generates canvas noise to prevent fingerprinting through canvas.
|
| 34 |
5. It automatically patches known methods to detect running in headless mode and provides an option to defeat timezone mismatch attacks.
|
| 35 |
+
6. and other anti-protection options...
|
|
|
|
| 36 |
|
| 37 |
## Full list of arguments
|
| 38 |
Scrapling provides many options with this fetcher and its session classes. Before jumping to the [examples](#examples), here's the full list of arguments
|
|
|
|
| 53 |
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
|
| 54 |
| init_script | An absolute path to a JavaScript file to be executed on page creation for all pages in this session. | ✔️ |
|
| 55 |
| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
| 56 |
+
| google_search | Enabled by default, Scrapling will set a Google referer header. | ✔️ |
|
| 57 |
+
| extra_headers | A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._ | ✔️ |
|
| 58 |
| proxy | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'. | ✔️ |
|
| 59 |
| real_chrome | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser. | ✔️ |
|
| 60 |
| locale | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. Defaults to the system default locale. | ✔️ |
|
|
|
|
| 79 |
|
| 80 |
1. It's basically the same arguments as [DynamicFetcher](dynamic.md#introduction) class, but with these additional arguments: `solve_cloudflare`, `block_webrtc`, `hide_canvas`, and `allow_webgl`.
|
| 81 |
2. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading.
|
| 82 |
+
3. The `google_search` argument is enabled by default for all requests, setting the referer to `https://www.google.com/`. If used together with `extra_headers`, the referer set by `google_search` takes priority over the referer set there.
|
| 83 |
4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
|
| 84 |
|
| 85 |
## Examples
|
docs/overview.md
CHANGED
|
@@ -280,7 +280,7 @@ For Async requests, you will replace the import like below:
|
|
| 280 |
|
| 281 |
!!! note "Notes:"
|
| 282 |
|
| 283 |
-
1. You have the `stealthy_headers` argument, which, when enabled, makes requests to generate real browser headers and use them, including a
|
| 284 |
2. The `impersonate` argument lets you fake the TLS fingerprint for a specific browser version.
|
| 285 |
3. There's also the `http3` argument, which, when enabled, makes the fetcher use HTTP/3 for requests, which makes your requests more authentic
|
| 286 |
|
|
@@ -320,8 +320,7 @@ Some of the things it does:
|
|
| 320 |
3. It isolates JS execution, removes many Playwright fingerprints, and stops detection through some of the known behaviors that bots do.
|
| 321 |
4. It generates canvas noise to prevent fingerprinting through canvas.
|
| 322 |
5. It automatically patches known methods to detect running in headless mode and provides an option to defeat timezone mismatch attacks.
|
| 323 |
-
6.
|
| 324 |
-
7. and other anti-protection options...
|
| 325 |
|
| 326 |
```python
|
| 327 |
>>> from scrapling.fetchers import StealthyFetcher
|
|
|
|
| 280 |
|
| 281 |
!!! note "Notes:"
|
| 282 |
|
| 283 |
+
1. You have the `stealthy_headers` argument, which, when enabled, generates real browser headers and uses them for requests, including a Google referer header. It's enabled by default.
|
| 284 |
2. The `impersonate` argument lets you fake the TLS fingerprint for a specific browser version.
|
| 285 |
3. There's also the `http3` argument, which, when enabled, makes the fetcher use HTTP/3 for requests, which makes your requests more authentic
|
| 286 |
|
|
|
|
| 320 |
3. It isolates JS execution, removes many Playwright fingerprints, and stops detection through some of the known behaviors that bots do.
|
| 321 |
4. It generates canvas noise to prevent fingerprinting through canvas.
|
| 322 |
5. It automatically patches known methods to detect running in headless mode and provides an option to defeat timezone mismatch attacks.
|
| 323 |
+
6. and other anti-protection options...
|
|
|
|
| 324 |
|
| 325 |
```python
|
| 326 |
>>> from scrapling.fetchers import StealthyFetcher
|
pyproject.toml
CHANGED
|
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
|
|
| 5 |
[project]
|
| 6 |
name = "scrapling"
|
| 7 |
# Static version instead of a dynamic version so we can get better layer caching while building docker, check the docker file to understand
|
| 8 |
-
version = "0.4.1"
|
| 9 |
description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!"
|
| 10 |
readme = {file = "README.md", content-type = "text/markdown"}
|
| 11 |
license = {file = "LICENSE"}
|
|
@@ -64,7 +64,7 @@ dependencies = [
|
|
| 64 |
"lxml>=6.0.2",
|
| 65 |
"cssselect>=1.4.0",
|
| 66 |
"orjson>=3.11.7",
|
| 67 |
-
"tld>=0.13.
|
| 68 |
"w3lib>=2.4.0",
|
| 69 |
"typing_extensions",
|
| 70 |
]
|
|
@@ -73,9 +73,10 @@ dependencies = [
|
|
| 73 |
fetchers = [
|
| 74 |
"click>=8.3.0",
|
| 75 |
"curl_cffi>=0.14.0",
|
| 76 |
-
"playwright==1.
|
| 77 |
-
"patchright==1.
|
| 78 |
"browserforge>=1.2.4",
|
|
|
|
| 79 |
"msgspec>=0.20.0",
|
| 80 |
"anyio>=4.12.1"
|
| 81 |
]
|
|
|
|
| 5 |
[project]
|
| 6 |
name = "scrapling"
|
| 7 |
# Static version instead of a dynamic version so we can get better layer caching while building docker, check the docker file to understand
|
| 8 |
+
version = "0.4.2"
|
| 9 |
description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!"
|
| 10 |
readme = {file = "README.md", content-type = "text/markdown"}
|
| 11 |
license = {file = "LICENSE"}
|
|
|
|
| 64 |
"lxml>=6.0.2",
|
| 65 |
"cssselect>=1.4.0",
|
| 66 |
"orjson>=3.11.7",
|
| 67 |
+
"tld>=0.13.2",
|
| 68 |
"w3lib>=2.4.0",
|
| 69 |
"typing_extensions",
|
| 70 |
]
|
|
|
|
| 73 |
fetchers = [
|
| 74 |
"click>=8.3.0",
|
| 75 |
"curl_cffi>=0.14.0",
|
| 76 |
+
"playwright==1.58.0",
|
| 77 |
+
"patchright==1.58.2",
|
| 78 |
"browserforge>=1.2.4",
|
| 79 |
+
"apify-fingerprint-datapoints>=0.11.0",
|
| 80 |
"msgspec>=0.20.0",
|
| 81 |
"anyio>=4.12.1"
|
| 82 |
]
|
scrapling/__init__.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
| 2 |
-
__version__ = "0.4.1"
|
| 3 |
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
| 4 |
|
| 5 |
from typing import Any, TYPE_CHECKING
|
|
|
|
| 1 |
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
| 2 |
+
__version__ = "0.4.2"
|
| 3 |
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
| 4 |
|
| 5 |
from typing import Any, TYPE_CHECKING
|
scrapling/core/_types.py
CHANGED
|
@@ -4,7 +4,6 @@ Type definitions for type checking purposes.
|
|
| 4 |
|
| 5 |
from typing import (
|
| 6 |
TYPE_CHECKING,
|
| 7 |
-
TypedDict,
|
| 8 |
TypeAlias,
|
| 9 |
cast,
|
| 10 |
overload,
|
|
@@ -32,7 +31,7 @@ from typing import (
|
|
| 32 |
Coroutine,
|
| 33 |
SupportsIndex,
|
| 34 |
)
|
| 35 |
-
from typing_extensions import Self, Unpack
|
| 36 |
|
| 37 |
# Proxy can be a string URL or a dict (Playwright format: {"server": "...", "username": "...", "password": "..."})
|
| 38 |
ProxyType = Union[str, Dict[str, str]]
|
|
|
|
| 4 |
|
| 5 |
from typing import (
|
| 6 |
TYPE_CHECKING,
|
|
|
|
| 7 |
TypeAlias,
|
| 8 |
cast,
|
| 9 |
overload,
|
|
|
|
| 31 |
Coroutine,
|
| 32 |
SupportsIndex,
|
| 33 |
)
|
| 34 |
+
from typing_extensions import Self, Unpack, TypedDict
|
| 35 |
|
| 36 |
# Proxy can be a string URL or a dict (Playwright format: {"server": "...", "username": "...", "password": "..."})
|
| 37 |
ProxyType = Union[str, Dict[str, str]]
|
scrapling/core/ai.py
CHANGED
|
@@ -105,7 +105,7 @@ class ScraplingMCPServer:
|
|
| 105 |
:param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
|
| 106 |
:param verify: Whether to verify HTTPS certificates.
|
| 107 |
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
|
| 108 |
-
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets
|
| 109 |
"""
|
| 110 |
normalized_proxy_auth = _normalize_credentials(proxy_auth)
|
| 111 |
normalized_auth = _normalize_credentials(auth)
|
|
@@ -186,7 +186,7 @@ class ScraplingMCPServer:
|
|
| 186 |
:param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
|
| 187 |
:param verify: Whether to verify HTTPS certificates.
|
| 188 |
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
|
| 189 |
-
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets
|
| 190 |
"""
|
| 191 |
normalized_proxy_auth = _normalize_credentials(proxy_auth)
|
| 192 |
normalized_auth = _normalize_credentials(auth)
|
|
@@ -276,8 +276,8 @@ class ScraplingMCPServer:
|
|
| 276 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 277 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 278 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 279 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 280 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 281 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 282 |
"""
|
| 283 |
page = await DynamicFetcher.async_fetch(
|
|
@@ -358,8 +358,8 @@ class ScraplingMCPServer:
|
|
| 358 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 359 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 360 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 361 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 362 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 363 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 364 |
"""
|
| 365 |
async with AsyncDynamicSession(
|
|
@@ -454,8 +454,8 @@ class ScraplingMCPServer:
|
|
| 454 |
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 455 |
:param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
|
| 456 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 457 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 458 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 459 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 460 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 461 |
"""
|
|
@@ -551,8 +551,8 @@ class ScraplingMCPServer:
|
|
| 551 |
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 552 |
:param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
|
| 553 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 554 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 555 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 556 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 557 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 558 |
"""
|
|
|
|
| 105 |
:param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
|
| 106 |
:param verify: Whether to verify HTTPS certificates.
|
| 107 |
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
|
| 108 |
+
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.
|
| 109 |
"""
|
| 110 |
normalized_proxy_auth = _normalize_credentials(proxy_auth)
|
| 111 |
normalized_auth = _normalize_credentials(auth)
|
|
|
|
| 186 |
:param auth: HTTP basic auth in dictionary format with `username` and `password` keys.
|
| 187 |
:param verify: Whether to verify HTTPS certificates.
|
| 188 |
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used with `impersonate`.
|
| 189 |
+
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.
|
| 190 |
"""
|
| 191 |
normalized_proxy_auth = _normalize_credentials(proxy_auth)
|
| 192 |
normalized_auth = _normalize_credentials(auth)
|
|
|
|
| 276 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 277 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 278 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 279 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 280 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 281 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 282 |
"""
|
| 283 |
page = await DynamicFetcher.async_fetch(
|
|
|
|
| 358 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 359 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 360 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 361 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 362 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 363 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 364 |
"""
|
| 365 |
async with AsyncDynamicSession(
|
|
|
|
| 454 |
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 455 |
:param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
|
| 456 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 457 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 458 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 459 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 460 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 461 |
"""
|
|
|
|
| 551 |
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
| 552 |
:param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.
|
| 553 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 554 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 555 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 556 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 557 |
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
| 558 |
"""
|
scrapling/engines/_browsers/_base.py
CHANGED
|
@@ -419,7 +419,7 @@ class BaseSessionMixin:
|
|
| 419 |
if not config.cdp_url:
|
| 420 |
flags = self._browser_options["args"]
|
| 421 |
if config.extra_flags or extra_flags:
|
| 422 |
-
flags = list(set(flags + (config.extra_flags or extra_flags)))
|
| 423 |
|
| 424 |
self._browser_options.update(
|
| 425 |
{
|
|
@@ -480,7 +480,7 @@ class StealthySessionMixin(BaseSessionMixin):
|
|
| 480 |
config = cast(StealthConfig, self._config)
|
| 481 |
flags: Tuple[str, ...] = tuple()
|
| 482 |
if not config.cdp_url:
|
| 483 |
-
flags = DEFAULT_ARGS + STEALTH_ARGS
|
| 484 |
|
| 485 |
if config.block_webrtc:
|
| 486 |
flags += (
|
|
|
|
| 419 |
if not config.cdp_url:
|
| 420 |
flags = self._browser_options["args"]
|
| 421 |
if config.extra_flags or extra_flags:
|
| 422 |
+
flags = list(set(tuple(flags) + tuple(config.extra_flags or extra_flags or ())))
|
| 423 |
|
| 424 |
self._browser_options.update(
|
| 425 |
{
|
|
|
|
| 480 |
config = cast(StealthConfig, self._config)
|
| 481 |
flags: Tuple[str, ...] = tuple()
|
| 482 |
if not config.cdp_url:
|
| 483 |
+
flags = tuple(DEFAULT_ARGS) + tuple(STEALTH_ARGS)
|
| 484 |
|
| 485 |
if config.block_webrtc:
|
| 486 |
flags += (
|
scrapling/engines/_browsers/_controllers.py
CHANGED
|
@@ -14,7 +14,6 @@ from scrapling.core.utils import log
|
|
| 14 |
from scrapling.core._types import Optional, ProxyType, Unpack
|
| 15 |
from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
|
| 16 |
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
| 17 |
-
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
| 18 |
from scrapling.engines._browsers._types import PlaywrightSession, PlaywrightFetchParams
|
| 19 |
from scrapling.engines._browsers._base import SyncSession, AsyncSession, DynamicSessionMixin
|
| 20 |
from scrapling.engines._browsers._validators import validate_fetch as _validate, PlaywrightConfig
|
|
@@ -58,8 +57,8 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 58 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 59 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 60 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 61 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 62 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 63 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 64 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 65 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
@@ -103,11 +102,11 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 103 |
"""Opens up the browser and do your request based on your chosen options.
|
| 104 |
|
| 105 |
:param url: The Target url.
|
| 106 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 107 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 108 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 109 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 110 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 111 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 112 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 113 |
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
|
@@ -127,9 +126,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin):
|
|
| 127 |
|
| 128 |
request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
|
| 129 |
referer = (
|
| 130 |
-
|
| 131 |
-
if (params.google_search and "referer" not in request_headers_keys)
|
| 132 |
-
else None
|
| 133 |
)
|
| 134 |
|
| 135 |
for attempt in range(self._config.retries):
|
|
@@ -226,8 +223,8 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 226 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 227 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 228 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 229 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 230 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 231 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 232 |
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 233 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
|
@@ -271,11 +268,11 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 271 |
"""Opens up the browser and do your request based on your chosen options.
|
| 272 |
|
| 273 |
:param url: The Target url.
|
| 274 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 275 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 276 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 277 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 278 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 279 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 280 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 281 |
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
|
@@ -296,9 +293,7 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin):
|
|
| 296 |
|
| 297 |
request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
|
| 298 |
referer = (
|
| 299 |
-
|
| 300 |
-
if (params.google_search and "referer" not in request_headers_keys)
|
| 301 |
-
else None
|
| 302 |
)
|
| 303 |
|
| 304 |
for attempt in range(self._config.retries):
|
|
|
|
| 14 |
from scrapling.core._types import Optional, ProxyType, Unpack
|
| 15 |
from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
|
| 16 |
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
|
|
|
| 17 |
from scrapling.engines._browsers._types import PlaywrightSession, PlaywrightFetchParams
|
| 18 |
from scrapling.engines._browsers._base import SyncSession, AsyncSession, DynamicSessionMixin
|
| 19 |
from scrapling.engines._browsers._validators import validate_fetch as _validate, PlaywrightConfig
|
|
|
|
| 57 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 58 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 59 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 60 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 61 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 62 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 63 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 64 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
|
|
| 102 |
"""Opens up the browser and do your request based on your chosen options.
|
| 103 |
|
| 104 |
:param url: The Target url.
|
| 105 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 106 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 107 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 108 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 109 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 110 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 111 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 112 |
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
|
|
|
| 126 |
|
| 127 |
request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
|
| 128 |
referer = (
|
| 129 |
+
"https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
|
|
|
|
|
|
|
| 130 |
)
|
| 131 |
|
| 132 |
for attempt in range(self._config.retries):
|
|
|
|
| 223 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 224 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 225 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 226 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 227 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 228 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 229 |
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
| 230 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
|
|
|
| 268 |
"""Opens up the browser and do your request based on your chosen options.
|
| 269 |
|
| 270 |
:param url: The Target url.
|
| 271 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 272 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 273 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 274 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 275 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 276 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 277 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 278 |
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
|
|
|
| 293 |
|
| 294 |
request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
|
| 295 |
referer = (
|
| 296 |
+
"https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
|
|
|
|
|
|
|
| 297 |
)
|
| 298 |
|
| 299 |
for attempt in range(self._config.retries):
|
scrapling/engines/_browsers/_stealth.py
CHANGED
|
@@ -16,7 +16,6 @@ from scrapling.core.utils import log
|
|
| 16 |
from scrapling.core._types import Any, Optional, ProxyType, Unpack
|
| 17 |
from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
|
| 18 |
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
| 19 |
-
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
| 20 |
from scrapling.engines._browsers._types import StealthSession, StealthFetchParams
|
| 21 |
from scrapling.engines._browsers._base import SyncSession, AsyncSession, StealthySessionMixin
|
| 22 |
from scrapling.engines._browsers._validators import validate_fetch as _validate, StealthConfig
|
|
@@ -66,8 +65,8 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 66 |
:param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 67 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 68 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 69 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 70 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 71 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 72 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 73 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
@@ -189,11 +188,11 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 189 |
"""Opens up the browser and do your request based on your chosen options.
|
| 190 |
|
| 191 |
:param url: The Target url.
|
| 192 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 193 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 194 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 195 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 196 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 197 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 198 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 199 |
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
|
@@ -214,9 +213,7 @@ class StealthySession(SyncSession, StealthySessionMixin):
|
|
| 214 |
|
| 215 |
request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
|
| 216 |
referer = (
|
| 217 |
-
|
| 218 |
-
if (params.google_search and "referer" not in request_headers_keys)
|
| 219 |
-
else None
|
| 220 |
)
|
| 221 |
|
| 222 |
for attempt in range(self._config.retries):
|
|
@@ -322,8 +319,8 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 322 |
:param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 323 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 324 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 325 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 326 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 327 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 328 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 329 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
@@ -444,11 +441,11 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 444 |
"""Opens up the browser and do your request based on your chosen options.
|
| 445 |
|
| 446 |
:param url: The Target url.
|
| 447 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 448 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 449 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 450 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 451 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 452 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 453 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 454 |
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
|
@@ -470,9 +467,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin):
|
|
| 470 |
|
| 471 |
request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
|
| 472 |
referer = (
|
| 473 |
-
|
| 474 |
-
if (params.google_search and "referer" not in request_headers_keys)
|
| 475 |
-
else None
|
| 476 |
)
|
| 477 |
|
| 478 |
for attempt in range(self._config.retries):
|
|
|
|
| 16 |
from scrapling.core._types import Any, Optional, ProxyType, Unpack
|
| 17 |
from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error
|
| 18 |
from scrapling.engines.toolbelt.convertor import Response, ResponseFactory
|
|
|
|
| 19 |
from scrapling.engines._browsers._types import StealthSession, StealthFetchParams
|
| 20 |
from scrapling.engines._browsers._base import SyncSession, AsyncSession, StealthySessionMixin
|
| 21 |
from scrapling.engines._browsers._validators import validate_fetch as _validate, StealthConfig
|
|
|
|
| 65 |
:param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 66 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 67 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 68 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 69 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 70 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 71 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 72 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
|
|
| 188 |
"""Opens up the browser and do your request based on your chosen options.
|
| 189 |
|
| 190 |
:param url: The Target url.
|
| 191 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 192 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 193 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 194 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 195 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 196 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 197 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 198 |
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
|
|
|
| 213 |
|
| 214 |
request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
|
| 215 |
referer = (
|
| 216 |
+
"https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
|
|
|
|
|
|
|
| 217 |
)
|
| 218 |
|
| 219 |
for attempt in range(self._config.retries):
|
|
|
|
| 319 |
:param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 320 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 321 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 322 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 323 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 324 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 325 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 326 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
|
|
| 441 |
"""Opens up the browser and do your request based on your chosen options.
|
| 442 |
|
| 443 |
:param url: The Target url.
|
| 444 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 445 |
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
| 446 |
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
| 447 |
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
| 448 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 449 |
:param disable_resources: Drop requests for unnecessary resources for a speed boost.
|
| 450 |
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 451 |
:param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too).
|
|
|
|
| 467 |
|
| 468 |
request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()
|
| 469 |
referer = (
|
| 470 |
+
"https://www.google.com/" if (params.google_search and "referer" not in request_headers_keys) else None
|
|
|
|
|
|
|
| 471 |
)
|
| 472 |
|
| 473 |
for attempt in range(self._config.retries):
|
scrapling/engines/static.py
CHANGED
|
@@ -26,7 +26,7 @@ from .toolbelt.custom import Response
|
|
| 26 |
from .toolbelt.convertor import ResponseFactory
|
| 27 |
from .toolbelt.proxy_rotation import ProxyRotator, is_proxy_error
|
| 28 |
from ._browsers._types import RequestsSession, GetRequestParams, DataRequestParams, ImpersonateType
|
| 29 |
-
from .toolbelt.fingerprints import
|
| 30 |
|
| 31 |
_NO_SESSION: Any = object()
|
| 32 |
|
|
@@ -166,14 +166,14 @@ class _ConfigurationLogic(ABC):
|
|
| 166 |
"""
|
| 167 |
1. Adds a useragent to the headers if it doesn't have one
|
| 168 |
2. Generates real headers and append them to current headers
|
| 169 |
-
3.
|
| 170 |
"""
|
| 171 |
# Merge session headers with request headers, request takes precedence (if it was set)
|
| 172 |
final_headers = {**self._default_headers, **(headers if headers else {})}
|
| 173 |
headers_keys = {k.lower() for k in final_headers}
|
| 174 |
if stealth:
|
| 175 |
if "referer" not in headers_keys:
|
| 176 |
-
final_headers["referer"] =
|
| 177 |
|
| 178 |
if not impersonate_enabled: # Curl will generate the suitable headers
|
| 179 |
extra_headers = generate_headers(browser_mode=False)
|
|
@@ -672,7 +672,7 @@ class FetcherSession:
|
|
| 672 |
"""
|
| 673 |
:param impersonate: Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version)
|
| 674 |
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
| 675 |
-
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets
|
| 676 |
:param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
|
| 677 |
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
| 678 |
Cannot be used together with the `proxies` parameter.
|
|
|
|
| 26 |
from .toolbelt.convertor import ResponseFactory
|
| 27 |
from .toolbelt.proxy_rotation import ProxyRotator, is_proxy_error
|
| 28 |
from ._browsers._types import RequestsSession, GetRequestParams, DataRequestParams, ImpersonateType
|
| 29 |
+
from .toolbelt.fingerprints import generate_headers, __default_useragent__
|
| 30 |
|
| 31 |
_NO_SESSION: Any = object()
|
| 32 |
|
|
|
|
| 166 |
"""
|
| 167 |
1. Adds a useragent to the headers if it doesn't have one
|
| 168 |
2. Generates real headers and append them to current headers
|
| 169 |
+
3. Sets a Google referer header.
|
| 170 |
"""
|
| 171 |
# Merge session headers with request headers, request takes precedence (if it was set)
|
| 172 |
final_headers = {**self._default_headers, **(headers if headers else {})}
|
| 173 |
headers_keys = {k.lower() for k in final_headers}
|
| 174 |
if stealth:
|
| 175 |
if "referer" not in headers_keys:
|
| 176 |
+
final_headers["referer"] = "https://www.google.com/"
|
| 177 |
|
| 178 |
if not impersonate_enabled: # Curl will generate the suitable headers
|
| 179 |
extra_headers = generate_headers(browser_mode=False)
|
|
|
|
| 672 |
"""
|
| 673 |
:param impersonate: Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version)
|
| 674 |
:param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
|
| 675 |
+
:param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.
|
| 676 |
:param proxies: Dict of proxies to use. Format: {"http": proxy_url, "https": proxy_url}.
|
| 677 |
:param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
|
| 678 |
Cannot be used together with the `proxies` parameter.
|
scrapling/engines/toolbelt/fingerprints.py
CHANGED
|
@@ -5,45 +5,16 @@ Functions related to generating headers and fingerprints generally
|
|
| 5 |
from functools import lru_cache
|
| 6 |
from platform import system as platform_system
|
| 7 |
|
| 8 |
-
from tld import get_tld, Result
|
| 9 |
from browserforge.headers import Browser, HeaderGenerator
|
| 10 |
from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS
|
| 11 |
|
| 12 |
-
from scrapling.core._types import Dict, Literal, Tuple
|
| 13 |
|
| 14 |
__OS_NAME__ = platform_system()
|
| 15 |
OSName = Literal["linux", "macos", "windows"]
|
| 16 |
# Current versions hardcoded for now (Playwright doesn't allow to know the version of a browser without launching it)
|
| 17 |
-
chromium_version =
|
| 18 |
-
chrome_version =
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
@lru_cache(10, typed=True)
|
| 22 |
-
def generate_convincing_referer(url: str) -> str | None:
|
| 23 |
-
"""Takes the domain from the URL without the subdomain/suffix and make it look like you were searching Google for this website
|
| 24 |
-
|
| 25 |
-
>>> generate_convincing_referer('https://www.somewebsite.com/blah')
|
| 26 |
-
'https://www.google.com/search?q=somewebsite'
|
| 27 |
-
|
| 28 |
-
:param url: The URL you are about to fetch.
|
| 29 |
-
:return: Google's search URL of the domain name, or None for localhost/IP addresses
|
| 30 |
-
"""
|
| 31 |
-
# Fixing the inaccurate return type hint in `get_tld`
|
| 32 |
-
extracted: Result | None = cast(Result, get_tld(url, as_object=True, fail_silently=True))
|
| 33 |
-
if not extracted:
|
| 34 |
-
return None
|
| 35 |
-
|
| 36 |
-
website_name = extracted.domain
|
| 37 |
-
|
| 38 |
-
# Skip generating referer for localhost, IP addresses, or when there's no valid domain
|
| 39 |
-
if not website_name or not extracted.tld or website_name in ("localhost", "127.0.0.1", "::1"):
|
| 40 |
-
return None
|
| 41 |
-
|
| 42 |
-
# Check if it's an IP address (simple check for IPv4)
|
| 43 |
-
if all(part.isdigit() for part in website_name.split(".") if part):
|
| 44 |
-
return None
|
| 45 |
-
|
| 46 |
-
return f"https://www.google.com/search?q={website_name}"
|
| 47 |
|
| 48 |
|
| 49 |
@lru_cache(1, typed=True)
|
|
|
|
| 5 |
from functools import lru_cache
|
| 6 |
from platform import system as platform_system
|
| 7 |
|
|
|
|
| 8 |
from browserforge.headers import Browser, HeaderGenerator
|
| 9 |
from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS
|
| 10 |
|
| 11 |
+
from scrapling.core._types import Dict, Literal, Tuple
|
| 12 |
|
| 13 |
__OS_NAME__ = platform_system()
|
| 14 |
OSName = Literal["linux", "macos", "windows"]
|
| 15 |
# Current versions hardcoded for now (Playwright doesn't allow to know the version of a browser without launching it)
|
| 16 |
+
chromium_version = 145
|
| 17 |
+
chrome_version = 145
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
@lru_cache(1, typed=True)
|
scrapling/fetchers/chrome.py
CHANGED
|
@@ -28,7 +28,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 28 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 29 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 30 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 31 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 32 |
:param extra_headers: A dictionary of extra headers to add to the request.
|
| 33 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 34 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
@@ -68,7 +68,7 @@ class DynamicFetcher(BaseFetcher):
|
|
| 68 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 69 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 70 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 71 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 72 |
:param extra_headers: A dictionary of extra headers to add to the request.
|
| 73 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 74 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
|
|
| 28 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 29 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 30 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 31 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 32 |
:param extra_headers: A dictionary of extra headers to add to the request.
|
| 33 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 34 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
|
|
| 68 |
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
| 69 |
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
| 70 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 71 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 72 |
:param extra_headers: A dictionary of extra headers to add to the request.
|
| 73 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 74 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
scrapling/fetchers/stealth_chrome.py
CHANGED
|
@@ -39,8 +39,8 @@ class StealthyFetcher(BaseFetcher):
|
|
| 39 |
:param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 40 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 41 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 42 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 43 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 44 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 45 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 46 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
@@ -88,8 +88,8 @@ class StealthyFetcher(BaseFetcher):
|
|
| 88 |
:param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 89 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 90 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 91 |
-
:param google_search: Enabled by default, Scrapling will set
|
| 92 |
-
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by
|
| 93 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 94 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 95 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
|
|
| 39 |
:param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 40 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 41 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 42 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 43 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 44 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 45 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 46 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
|
|
| 88 |
:param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
| 89 |
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
| 90 |
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
| 91 |
+
:param google_search: Enabled by default, Scrapling will set a Google referer header.
|
| 92 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._
|
| 93 |
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
| 94 |
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
| 95 |
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
scrapling/parser.py
CHANGED
|
@@ -58,6 +58,7 @@ _find_all_elements = XPath(".//*")
|
|
| 58 |
_find_all_elements_with_spaces = XPath(
|
| 59 |
".//*[normalize-space(text())]"
|
| 60 |
) # This selector gets all elements with text content
|
|
|
|
| 61 |
|
| 62 |
|
| 63 |
class Selector(SelectorsGeneration):
|
|
@@ -299,18 +300,31 @@ class Selector(SelectorsGeneration):
|
|
| 299 |
|
| 300 |
ignored_elements: set[Any] = set()
|
| 301 |
if ignore_tags:
|
| 302 |
-
|
| 303 |
-
ignored_elements.add(element)
|
| 304 |
-
ignored_elements.update(cast(list, _find_all_elements(element)))
|
| 305 |
|
| 306 |
_all_strings = []
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
return cast(TextHandler, TextHandler(separator).join(_all_strings))
|
| 316 |
|
|
|
|
| 58 |
_find_all_elements_with_spaces = XPath(
|
| 59 |
".//*[normalize-space(text())]"
|
| 60 |
) # This selector gets all elements with text content
|
| 61 |
+
_find_all_text_nodes = XPath(".//text()")
|
| 62 |
|
| 63 |
|
| 64 |
class Selector(SelectorsGeneration):
|
|
|
|
| 300 |
|
| 301 |
ignored_elements: set[Any] = set()
|
| 302 |
if ignore_tags:
|
| 303 |
+
ignored_elements.update(self._root.iter(*ignore_tags))
|
|
|
|
|
|
|
| 304 |
|
| 305 |
_all_strings = []
|
| 306 |
+
|
| 307 |
+
def append_text(text: str) -> None:
|
| 308 |
+
processed_text = text.strip() if strip else text
|
| 309 |
+
if not valid_values or processed_text.strip():
|
| 310 |
+
_all_strings.append(processed_text)
|
| 311 |
+
|
| 312 |
+
def is_visible_text_node(text_node: _ElementUnicodeResult) -> bool:
|
| 313 |
+
parent = text_node.getparent()
|
| 314 |
+
if parent is None:
|
| 315 |
+
return False
|
| 316 |
+
|
| 317 |
+
owner = parent.getparent() if text_node.is_tail else parent
|
| 318 |
+
while owner is not None:
|
| 319 |
+
if owner in ignored_elements:
|
| 320 |
+
return False
|
| 321 |
+
owner = owner.getparent()
|
| 322 |
+
return True
|
| 323 |
+
|
| 324 |
+
for text_node in cast(list[_ElementUnicodeResult], _find_all_text_nodes(self._root)):
|
| 325 |
+
text = str(text_node)
|
| 326 |
+
if text and is_visible_text_node(text_node):
|
| 327 |
+
append_text(text)
|
| 328 |
|
| 329 |
return cast(TextHandler, TextHandler(separator).join(_all_strings))
|
| 330 |
|
server.json
CHANGED
|
@@ -14,12 +14,12 @@
|
|
| 14 |
"mimeType": "image/png"
|
| 15 |
}
|
| 16 |
],
|
| 17 |
-
"version": "0.4.1",
|
| 18 |
"packages": [
|
| 19 |
{
|
| 20 |
"registryType": "pypi",
|
| 21 |
"identifier": "scrapling",
|
| 22 |
-
"version": "0.4.1",
|
| 23 |
"runtimeHint": "uvx",
|
| 24 |
"packageArguments": [
|
| 25 |
{
|
|
|
|
| 14 |
"mimeType": "image/png"
|
| 15 |
}
|
| 16 |
],
|
| 17 |
+
"version": "0.4.2",
|
| 18 |
"packages": [
|
| 19 |
{
|
| 20 |
"registryType": "pypi",
|
| 21 |
"identifier": "scrapling",
|
| 22 |
+
"version": "0.4.2",
|
| 23 |
"runtimeHint": "uvx",
|
| 24 |
"packageArguments": [
|
| 25 |
{
|
setup.cfg
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[metadata]
|
| 2 |
name = scrapling
|
| 3 |
-
version = 0.4.1
|
| 4 |
author = Karim Shoair
|
| 5 |
author_email = karim.shoair@pm.me
|
| 6 |
description = Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
|
|
|
|
| 1 |
[metadata]
|
| 2 |
name = scrapling
|
| 3 |
+
version = 0.4.2
|
| 4 |
author = Karim Shoair
|
| 5 |
author_email = karim.shoair@pm.me
|
| 6 |
description = Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
|
tests/fetchers/test_utils.py
CHANGED
|
@@ -7,7 +7,6 @@ from scrapling.engines.toolbelt.navigation import (
|
|
| 7 |
create_async_intercept_handler,
|
| 8 |
)
|
| 9 |
from scrapling.engines.toolbelt.fingerprints import (
|
| 10 |
-
generate_convincing_referer,
|
| 11 |
get_os_name,
|
| 12 |
generate_headers
|
| 13 |
)
|
|
@@ -204,22 +203,6 @@ class TestConstructProxyDict:
|
|
| 204 |
class TestFingerprintFunctions:
|
| 205 |
"""Test fingerprint generation functions"""
|
| 206 |
|
| 207 |
-
def test_generate_convincing_referer(self):
|
| 208 |
-
"""Test referer generation"""
|
| 209 |
-
url = "https://sub.example.com/page.html"
|
| 210 |
-
result = generate_convincing_referer(url)
|
| 211 |
-
|
| 212 |
-
assert result.startswith("https://www.google.com/search?q=")
|
| 213 |
-
assert "example" in result
|
| 214 |
-
|
| 215 |
-
def test_generate_convincing_referer_caching(self):
|
| 216 |
-
"""Test referer generation caching"""
|
| 217 |
-
url = "https://example.com"
|
| 218 |
-
result1 = generate_convincing_referer(url)
|
| 219 |
-
result2 = generate_convincing_referer(url)
|
| 220 |
-
|
| 221 |
-
assert result1 == result2
|
| 222 |
-
|
| 223 |
def test_get_os_name(self):
|
| 224 |
"""Test OS name detection"""
|
| 225 |
result = get_os_name()
|
|
|
|
| 7 |
create_async_intercept_handler,
|
| 8 |
)
|
| 9 |
from scrapling.engines.toolbelt.fingerprints import (
|
|
|
|
| 10 |
get_os_name,
|
| 11 |
generate_headers
|
| 12 |
)
|
|
|
|
| 203 |
class TestFingerprintFunctions:
|
| 204 |
"""Test fingerprint generation functions"""
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
def test_get_os_name(self):
|
| 207 |
"""Test OS name detection"""
|
| 208 |
result = get_os_name()
|
tests/parser/test_parser_advanced.py
CHANGED
|
@@ -183,6 +183,33 @@ class TestAdvancedSelectors:
|
|
| 183 |
text = page.get_all_text(valid_values=False)
|
| 184 |
assert text != ""
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
class TestTextHandlerAdvanced:
|
| 188 |
"""Test advanced TextHandler functionality"""
|
|
|
|
| 183 |
text = page.get_all_text(valid_values=False)
|
| 184 |
assert text != ""
|
| 185 |
|
| 186 |
+
def test_get_all_text_preserves_interleaved_text_nodes(self):
|
| 187 |
+
"""Test get_all_text preserves interleaved text nodes"""
|
| 188 |
+
html = """
|
| 189 |
+
<html>
|
| 190 |
+
<body>
|
| 191 |
+
<main>
|
| 192 |
+
string1
|
| 193 |
+
<b>string2</b>
|
| 194 |
+
string3
|
| 195 |
+
<div>
|
| 196 |
+
<span>string4</span>
|
| 197 |
+
</div>
|
| 198 |
+
string5
|
| 199 |
+
<script>ignored</script>
|
| 200 |
+
string6
|
| 201 |
+
<style>ignored</style>
|
| 202 |
+
string7
|
| 203 |
+
</main>
|
| 204 |
+
</body>
|
| 205 |
+
</html>
|
| 206 |
+
"""
|
| 207 |
+
|
| 208 |
+
page = Selector(html, adaptive=False)
|
| 209 |
+
node = page.css("main")[0]
|
| 210 |
+
|
| 211 |
+
assert node.get_all_text("\n", strip=True) == "string1\nstring2\nstring3\nstring4\nstring5\nstring6\nstring7"
|
| 212 |
+
|
| 213 |
|
| 214 |
class TestTextHandlerAdvanced:
|
| 215 |
"""Test advanced TextHandler functionality"""
|
tests/requirements.txt
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
pytest>=2.8.0,<9
|
| 2 |
pytest-cov
|
| 3 |
-
playwright==1.
|
| 4 |
werkzeug<3.0.0
|
| 5 |
pytest-httpbin==2.1.0
|
| 6 |
pytest-asyncio
|
|
|
|
| 1 |
pytest>=2.8.0,<9
|
| 2 |
pytest-cov
|
| 3 |
+
playwright==1.58.0
|
| 4 |
werkzeug<3.0.0
|
| 5 |
pytest-httpbin==2.1.0
|
| 6 |
pytest-asyncio
|
tox.ini
CHANGED
|
@@ -10,8 +10,8 @@ envlist = pre-commit,py{310,311,312,313}
|
|
| 10 |
usedevelop = True
|
| 11 |
changedir = tests
|
| 12 |
deps =
|
| 13 |
-
playwright==1.
|
| 14 |
-
patchright==1.
|
| 15 |
-r{toxinidir}/tests/requirements.txt
|
| 16 |
extras = ai,shell
|
| 17 |
commands =
|
|
|
|
| 10 |
usedevelop = True
|
| 11 |
changedir = tests
|
| 12 |
deps =
|
| 13 |
+
playwright==1.58.0
|
| 14 |
+
patchright==1.58.2
|
| 15 |
-r{toxinidir}/tests/requirements.txt
|
| 16 |
extras = ai,shell
|
| 17 |
commands =
|