Karim shoair commited on
Commit
a889c5d
·
2 Parent(s): 9eb5bf46d9ffc6
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .bandit.yml +3 -1
  2. .github/ISSUE_TEMPLATE/04-docs_issue.yml +40 -0
  3. .github/workflows/code-quality.yml +35 -1
  4. .gitignore +6 -0
  5. .readthedocs.yaml +11 -15
  6. docs/README.md → README.md +149 -85
  7. benchmarks.py +1 -1
  8. docs/README_AR.md +210 -138
  9. docs/README_CN.md +210 -138
  10. docs/README_DE.md +206 -134
  11. docs/README_ES.md +205 -133
  12. docs/README_JP.md +217 -145
  13. docs/README_RU.md +219 -147
  14. docs/ai/mcp-server.md +2 -2
  15. docs/api-reference/mcp-server.md +1 -1
  16. docs/api-reference/proxy-rotation.md +18 -0
  17. docs/api-reference/response.md +18 -0
  18. docs/api-reference/spiders.md +42 -0
  19. docs/benchmarks.md +14 -13
  20. docs/cli/extract-commands.md +8 -9
  21. docs/cli/interactive-shell.md +9 -9
  22. docs/development/adaptive_storage_system.md +3 -1
  23. docs/development/scrapling_custom_types.md +2 -0
  24. docs/fetching/choosing.md +14 -6
  25. docs/fetching/dynamic.md +60 -22
  26. docs/fetching/static.md +65 -31
  27. docs/fetching/stealthy.md +32 -27
  28. docs/index.md +71 -24
  29. docs/overview.md +27 -13
  30. docs/parsing/adaptive.md +14 -13
  31. docs/parsing/main_classes.md +75 -37
  32. docs/parsing/selection.md +50 -50
  33. docs/requirements.txt +4 -4
  34. docs/spiders/advanced.md +313 -0
  35. docs/spiders/architecture.md +98 -0
  36. docs/spiders/getting-started.md +159 -0
  37. docs/spiders/proxy-blocking.md +244 -0
  38. docs/spiders/requests-responses.md +202 -0
  39. docs/spiders/sessions.md +218 -0
  40. docs/tutorials/migrating_from_beautifulsoup.md +11 -9
  41. mkdocs.yml +0 -180
  42. pyproject.toml +27 -6
  43. scrapling/__init__.py +1 -1
  44. scrapling/cli.py +3 -0
  45. scrapling/core/_html_utils.py +0 -342
  46. scrapling/core/_types.py +5 -21
  47. scrapling/core/ai.py +2 -2
  48. scrapling/core/custom_types.py +6 -8
  49. scrapling/core/mixins.py +14 -10
  50. scrapling/core/shell.py +10 -7
.bandit.yml CHANGED
@@ -6,4 +6,6 @@ skips:
6
  - B404 # Using subprocess library
7
  - B602 # subprocess call with shell=True identified
8
  - B110 # Try, Except, Pass detected.
9
- - B104 # Possible binding to all interfaces.
 
 
 
6
  - B404 # Using subprocess library
7
  - B602 # subprocess call with shell=True identified
8
  - B110 # Try, Except, Pass detected.
9
+ - B104 # Possible binding to all interfaces.
10
+ - B301 # Pickle and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue.
11
+ - B108 # Probable insecure usage of temp file/directory.
.github/ISSUE_TEMPLATE/04-docs_issue.yml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Documentation issue
2
+ description: Report incorrect, unclear, or missing documentation.
3
+ labels: [documentation]
4
+ body:
5
+ - type: checkboxes
6
+ attributes:
7
+ label: Have you searched if there is an existing issue for this?
8
+ description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/documentation).
9
+ options:
10
+ - label: I have searched the existing issues
11
+ required: true
12
+
13
+ - type: input
14
+ attributes:
15
+ label: "Page URL"
16
+ description: "Link to the documentation page with the issue."
17
+ placeholder: "https://scrapling.readthedocs.io/en/latest/..."
18
+ validations:
19
+ required: true
20
+
21
+ - type: dropdown
22
+ attributes:
23
+ label: "Type of issue"
24
+ options:
25
+ - Incorrect information
26
+ - Unclear or confusing
27
+ - Missing information
28
+ - Typo or formatting
29
+ - Broken link
30
+ - Other
31
+ default: 0
32
+ validations:
33
+ required: true
34
+
35
+ - type: textarea
36
+ attributes:
37
+ label: "Description"
38
+ description: "Describe what's wrong and what you expected to find."
39
+ validations:
40
+ required: true
.github/workflows/code-quality.yml CHANGED
@@ -50,7 +50,9 @@ jobs:
50
  - name: Install dependencies
51
  run: |
52
  python -m pip install --upgrade pip
53
- pip install bandit[toml] ruff vermin
 
 
54
 
55
  - name: Run Bandit (Security Linter)
56
  id: bandit
@@ -85,6 +87,22 @@ jobs:
85
  vermin -t=3.10- --violations --eval-annotations --no-tips scrapling/
86
  echo "::endgroup::"
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  - name: Check results and create summary
89
  if: always()
90
  run: |
@@ -126,6 +144,22 @@ jobs:
126
  all_passed=false
127
  fi
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  echo "" >> $GITHUB_STEP_SUMMARY
130
 
131
  if [ "$all_passed" == "true" ]; then
 
50
  - name: Install dependencies
51
  run: |
52
  python -m pip install --upgrade pip
53
+ pip install bandit[toml] ruff vermin mypy pyright
54
+ pip install -e ".[all]"
55
+ pip install lxml-stubs
56
 
57
  - name: Run Bandit (Security Linter)
58
  id: bandit
 
87
  vermin -t=3.10- --violations --eval-annotations --no-tips scrapling/
88
  echo "::endgroup::"
89
 
90
+ - name: Run Mypy (Static Type Checker)
91
+ id: mypy
92
+ continue-on-error: true
93
+ run: |
94
+ echo "::group::Mypy - Static Type Checker"
95
+ mypy scrapling/
96
+ echo "::endgroup::"
97
+
98
+ - name: Run Pyright (Static Type Checker)
99
+ id: pyright
100
+ continue-on-error: true
101
+ run: |
102
+ echo "::group::Pyright - Static Type Checker"
103
+ pyright scrapling/
104
+ echo "::endgroup::"
105
+
106
  - name: Check results and create summary
107
  if: always()
108
  run: |
 
144
  all_passed=false
145
  fi
146
 
147
+ # Check Mypy
148
+ if [ "${{ steps.mypy.outcome }}" == "success" ]; then
149
+ echo "✅ **Mypy (Type Checker)**: Passed" >> $GITHUB_STEP_SUMMARY
150
+ else
151
+ echo "❌ **Mypy (Type Checker)**: Failed" >> $GITHUB_STEP_SUMMARY
152
+ all_passed=false
153
+ fi
154
+
155
+ # Check Pyright
156
+ if [ "${{ steps.pyright.outcome }}" == "success" ]; then
157
+ echo "✅ **Pyright (Type Checker)**: Passed" >> $GITHUB_STEP_SUMMARY
158
+ else
159
+ echo "❌ **Pyright (Type Checker)**: Failed" >> $GITHUB_STEP_SUMMARY
160
+ all_passed=false
161
+ fi
162
+
163
  echo "" >> $GITHUB_STEP_SUMMARY
164
 
165
  if [ "$all_passed" == "true" ]; then
.gitignore CHANGED
@@ -1,3 +1,9 @@
 
 
 
 
 
 
1
  # cached files
2
  __pycache__/
3
  *.py[cod]
 
1
+ site/*
2
+
3
+ # AI related files
4
+ .claude/*
5
+ CLAUDE.md
6
+
7
  # cached files
8
  __pycache__/
9
  *.py[cod]
.readthedocs.yaml CHANGED
@@ -1,25 +1,21 @@
1
- # Read the Docs configuration file
2
- # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3
 
4
- # Required
5
  version: 2
6
 
7
- # Set the OS, Python version, and other tools you might need
8
  build:
9
  os: ubuntu-24.04
10
  apt_packages:
11
  - pngquant
12
  tools:
13
  python: "3.13"
14
-
15
- # Build documentation with Mkdocs
16
- mkdocs:
17
- configuration: mkdocs.yml
18
-
19
- # Optionally, but recommended,
20
- # declare the Python requirements required to build your documentation
21
- # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
22
- python:
23
  install:
24
- - requirements: docs/requirements.txt
25
-
 
 
 
 
 
 
 
1
+ # See https://docs.readthedocs.com/platform/stable/intro/zensical.html for details
2
+ # Example: https://github.com/readthedocs/test-builds/tree/zensical
3
 
 
4
  version: 2
5
 
 
6
  build:
7
  os: ubuntu-24.04
8
  apt_packages:
9
  - pngquant
10
  tools:
11
  python: "3.13"
12
+ jobs:
 
 
 
 
 
 
 
 
13
  install:
14
+ - pip install -r docs/requirements.txt
15
+ - pip install ".[all]"
16
+ build:
17
+ html:
18
+ - zensical build
19
+ post_build:
20
+ - mkdir -p $READTHEDOCS_OUTPUT/html/
21
+ - cp --recursive site/* $READTHEDOCS_OUTPUT/html/
docs/README.md → README.md RENAMED
@@ -1,13 +1,17 @@
1
- Automated translations: [العربيه](https://github.com/D4Vinci/Scrapling/blob/main/docs/README_AR.md) | [Español](https://github.com/D4Vinci/Scrapling/blob/main/docs/README_ES.md) | [Deutsch](https://github.com/D4Vinci/Scrapling/blob/main/docs/README_DE.md) | [简体中文](https://github.com/D4Vinci/Scrapling/blob/main/docs/README_CN.md) | [日本語](https://github.com/D4Vinci/Scrapling/blob/main/docs/README_JP.md) | [Русский](https://github.com/D4Vinci/Scrapling/blob/main/docs/README_RU.md)
2
-
 
 
 
 
 
 
 
 
3
 
4
- <p align=center>
5
- <br>
6
- <a href="https://scrapling.readthedocs.io/en/latest/" target="_blank"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png" style="width: 50%; height: 100%;" alt="main poster"/></a>
7
- <br>
8
- <i><code>Easy, effortless Web Scraping as it should be!</code></i>
9
- </p>
10
  <p align="center">
 
 
11
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
12
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
13
  <a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
@@ -27,44 +31,45 @@ Automated translations: [العربيه](https://github.com/D4Vinci/Scrapling/bl
27
  </p>
28
 
29
  <p align="center">
30
- <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
31
- Selection methods
32
- </a>
33
- ·
34
- <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/">
35
- Choosing a fetcher
36
- </a>
37
- ·
38
- <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
39
- CLI
40
- </a>
41
- ·
42
- <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
43
- MCP mode
44
- </a>
45
- ·
46
- <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
47
- Migrating from Beautifulsoup
48
- </a>
49
  </p>
50
 
51
- **Stop fighting anti-bot systems. Stop rewriting selectors after every website update.**
52
 
53
- Scrapling isn't just another Web Scraping library. It's the first **adaptive** scraping library that learns from website changes and evolves with them. While other libraries break when websites update their structure, Scrapling automatically relocates your elements and keeps your scrapers running.
54
 
55
- Built for the modern Web, Scrapling features **its own rapid parsing engine** and fetchers to handle all Web Scraping challenges you face or will face. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
56
 
57
  ```python
58
- >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
59
- >> StealthyFetcher.adaptive = True
60
- # Fetch websites' source under the radar!
61
- >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
62
- >> print(page.status)
63
- 200
64
- >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
65
- >> # Later, if the website structure changes, pass `adaptive=True`
66
- >> products = page.css('.product', adaptive=True) # and Scrapling still finds them!
67
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  # Sponsors
70
 
@@ -90,16 +95,27 @@ Built for the modern Web, Scrapling features **its own rapid parsing engine** an
90
 
91
  ## Key Features
92
 
 
 
 
 
 
 
 
 
 
93
  ### Advanced Websites Fetching with Session Support
94
- - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP3.
95
  - **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium and Google's Chrome.
96
  - **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can easily bypass all types of Cloudflare's Turnstile/Interstitial with automation.
97
  - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
 
 
98
  - **Async Support**: Complete async support across all fetchers and dedicated async session classes.
99
 
100
  ### Adaptive Scraping & AI Integration
101
  - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
102
- - 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
103
  - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
104
  - 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
105
 
@@ -111,51 +127,107 @@ Built for the modern Web, Scrapling features **its own rapid parsing engine** an
111
 
112
  ### Developer/Web Scraper Friendly Experience
113
  - 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping scripts development, like converting curl requests to Scrapling requests and viewing requests results in your browser.
114
- - 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single code!
115
  - 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
116
  - 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.
117
  - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
118
  - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
119
- - 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion.
120
  - 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
121
 
122
  ## Getting Started
123
 
 
 
124
  ### Basic Usage
 
125
  ```python
126
- from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher
127
- from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
128
 
129
- # HTTP requests with session support
130
  with FetcherSession(impersonate='chrome') as session: # Use latest version of Chrome's TLS fingerprint
131
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
132
- quotes = page.css('.quote .text::text')
133
 
134
  # Or use one-off requests
135
  page = Fetcher.get('https://quotes.toscrape.com/')
136
- quotes = page.css('.quote .text::text')
 
 
 
 
137
 
138
- # Advanced stealth mode (Keep the browser open until you finish)
139
- with StealthySession(headless=True, solve_cloudflare=True) as session:
140
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
141
- data = page.css('#padded_content a')
142
 
143
  # Or use one-off request style, it opens the browser for this request, then closes it after finishing
144
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
145
- data = page.css('#padded_content a')
146
-
147
- # Full browser automation (Keep the browser open until you finish)
148
- with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:
 
 
 
149
  page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
150
- data = page.xpath('//span[@class="text"]/text()') # XPath selector if you prefer it
151
 
152
  # Or use one-off request style, it opens the browser for this request, then closes it after finishing
153
  page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
154
- data = page.css('.quote .text::text')
155
  ```
156
 
157
- > [!NOTE]
158
- > There's a wonderful guide to get you started quickly with Scrapling [here](https://substack.thewebscraping.club/p/scrapling-hands-on-guide) written by The Web Scraping Club. In case you find it easier to get you started than the [documentation website](https://scrapling.readthedocs.io/en/latest/).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
  ### Advanced Parsing & Navigation
161
  ```python
@@ -176,10 +248,9 @@ quotes = page.find_all(class_='quote') # and so on...
176
  quotes = page.find_by_text('quote', tag='div')
177
 
178
  # Advanced navigation
179
- first_quote = page.css_first('.quote')
180
- quote_text = first_quote.css('.text::text')
181
- quote_text = page.css('.quote').css_first('.text::text') # Chained selectors
182
- quote_text = page.css_first('.quote .text').text # Using `css_first` is faster than `css` if you want the first element
183
  author = first_quote.next_sibling.css('.author::text')
184
  parent_container = first_quote.parent
185
 
@@ -220,7 +291,7 @@ async with AsyncStealthySession(max_pages=2) as session:
220
 
221
  ## CLI & Interactive Shell
222
 
223
- Scrapling v0.3 includes a powerful command-line interface:
224
 
225
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
226
 
@@ -237,34 +308,34 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
237
  ```
238
 
239
  > [!NOTE]
240
- > There are many additional features, but we want to keep this page concise, such as the MCP server and the interactive Web Scraping Shell. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
241
 
242
  ## Performance Benchmarks
243
 
244
- Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 have delivered exceptional performance improvements across all operations. The following benchmarks compare Scrapling's parser with other popular libraries.
245
 
246
  ### Text Extraction Speed Test (5000 nested elements)
247
 
248
  | # | Library | Time (ms) | vs Scrapling |
249
  |---|:-----------------:|:---------:|:------------:|
250
- | 1 | Scrapling | 1.99 | 1.0x |
251
- | 2 | Parsel/Scrapy | 2.01 | 1.01x |
252
- | 3 | Raw Lxml | 2.5 | 1.256x |
253
- | 4 | PyQuery | 22.93 | ~11.5x |
254
- | 5 | Selectolax | 80.57 | ~40.5x |
255
- | 6 | BS4 with Lxml | 1541.37 | ~774.6x |
256
- | 7 | MechanicalSoup | 1547.35 | ~777.6x |
257
- | 8 | BS4 with html5lib | 3410.58 | ~1713.9x |
258
 
259
 
260
  ### Element Similarity & Text Search Performance
261
 
262
  Scrapling's adaptive element finding capabilities significantly outperform alternatives:
263
 
264
- | Library | Time (ms) | vs Scrapling |
265
  |-------------|:---------:|:------------:|
266
- | Scrapling | 2.46 | 1.0x |
267
- | AutoScraper | 13.3 | 5.407x |
268
 
269
 
270
  > All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
@@ -277,7 +348,7 @@ Scrapling requires Python 3.10 or higher:
277
  pip install scrapling
278
  ```
279
 
280
- Starting with v0.3.2, this installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies.
281
 
282
  ### Optional Dependencies
283
 
@@ -334,12 +405,5 @@ This work is licensed under the BSD-3-Clause License.
334
  This project includes code adapted from:
335
  - Parsel (BSD License)—Used for [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule
336
 
337
- ## Thanks and References
338
-
339
- - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
340
- - [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
341
- - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
342
- - [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
343
-
344
  ---
345
  <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
 
1
+ <h1 align="center">
2
+ <a href="https://scrapling.readthedocs.io">
3
+ <picture>
4
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
5
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
6
+ </picture>
7
+ </a>
8
+ <br>
9
+ <small>Effortless Web Scraping for the Modern Web</small>
10
+ </h1>
11
 
 
 
 
 
 
 
12
  <p align="center">
13
+ <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_AR.md">العربيه</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_ES.md">Español</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_DE.md">Deutsch</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_CN.md">简体中文</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_JP.md">日本語</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_RU.md">Русский</a>
14
+ <br/>
15
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
16
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
17
  <a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
 
31
  </p>
32
 
33
  <p align="center">
34
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Selection methods</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Choosing a fetcher</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCP mode</strong></a>
41
+ &middot;
42
+ <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>Migrating from Beautifulsoup</strong></a>
 
 
 
 
 
 
 
 
 
 
43
  </p>
44
 
45
+ Scrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl.
46
 
47
+ Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation — all in a few lines of Python. One library, zero compromises.
48
 
49
+ Blazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
50
 
51
  ```python
52
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
53
+ StealthyFetcher.adaptive = True
54
+ page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Fetch website under the radar!
55
+ products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
56
+ products = page.css('.product', adaptive=True) # Later, if the website structure changes, pass `adaptive=True` to find them!
 
 
 
 
57
  ```
58
+ Or scale up to full crawls
59
+ ```python
60
+ from scrapling.spiders import Spider, Response
61
+
62
+ class MySpider(Spider):
63
+ name = "demo"
64
+ start_urls = ["https://example.com/"]
65
+
66
+ async def parse(self, response: Response):
67
+ for item in response.css('.product'):
68
+ yield {"title": item.css('h2::text').get()}
69
+
70
+ MySpider().start()
71
+ ```
72
+
73
 
74
  # Sponsors
75
 
 
95
 
96
  ## Key Features
97
 
98
+ ### Spiders — A Full Crawling Framework
99
+ - 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects.
100
+ - ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays.
101
+ - 🔄 **Multi-Session Support**: Unified interface for HTTP requests, and stealthy headless browsers in a single spider — route requests to different sessions by ID.
102
+ - 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off.
103
+ - 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats — ideal for UI, pipelines, and long-running crawls.
104
+ - 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic.
105
+ - 📦 **Built-in Export**: Export results through hooks and your own pipeline or the built-in JSON/JSONL with `result.items.to_json()` / `result.items.to_jsonl()` respectively.
106
+
107
  ### Advanced Websites Fetching with Session Support
108
+ - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP/3.
109
  - **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium and Google's Chrome.
110
  - **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can easily bypass all types of Cloudflare's Turnstile/Interstitial with automation.
111
  - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
112
+ - **Proxy Rotation**: Built-in `ProxyRotator` with cyclic or custom rotation strategies across all session types, plus per-request proxy overrides.
113
+ - **Domain Blocking**: Block requests to specific domains (and their subdomains) in browser-based fetchers.
114
  - **Async Support**: Complete async support across all fetchers and dedicated async session classes.
115
 
116
  ### Adaptive Scraping & AI Integration
117
  - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
118
+ - 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
119
  - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
120
  - 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
121
 
 
127
 
128
  ### Developer/Web Scraper Friendly Experience
129
  - 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping scripts development, like converting curl requests to Scrapling requests and viewing requests results in your browser.
130
+ - 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code!
131
  - 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
132
  - 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.
133
  - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
134
  - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
135
+ - 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. The entire codebase is automatically scanned with **PyRight** and **MyPy** with each change.
136
  - 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
137
 
138
  ## Getting Started
139
 
140
+ Let's give you a quick glimpse of what Scrapling can do without deep diving.
141
+
142
  ### Basic Usage
143
+ HTTP requests with session support
144
  ```python
145
+ from scrapling.fetchers import Fetcher, FetcherSession
 
146
 
 
147
  with FetcherSession(impersonate='chrome') as session: # Use latest version of Chrome's TLS fingerprint
148
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
149
+ quotes = page.css('.quote .text::text').getall()
150
 
151
  # Or use one-off requests
152
  page = Fetcher.get('https://quotes.toscrape.com/')
153
+ quotes = page.css('.quote .text::text').getall()
154
+ ```
155
+ Advanced stealth mode
156
+ ```python
157
+ from scrapling.fetchers import StealthyFetcher, StealthySession
158
 
159
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # Keep the browser open until you finish
 
160
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
161
+ data = page.css('#padded_content a').getall()
162
 
163
  # Or use one-off request style, it opens the browser for this request, then closes it after finishing
164
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
165
+ data = page.css('#padded_content a').getall()
166
+ ```
167
+ Full browser automation
168
+ ```python
169
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
170
+
171
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Keep the browser open until you finish
172
  page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
173
+ data = page.xpath('//span[@class="text"]/text()').getall() # XPath selector if you prefer it
174
 
175
  # Or use one-off request style, it opens the browser for this request, then closes it after finishing
176
  page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
177
+ data = page.css('.quote .text::text').getall()
178
  ```
179
 
180
+ ### Spiders
181
+ Build full crawlers with concurrent requests, multiple session types, and pause/resume:
182
+ ```python
183
+ from scrapling.spiders import Spider, Request, Response
184
+
185
+ class QuotesSpider(Spider):
186
+ name = "quotes"
187
+ start_urls = ["https://quotes.toscrape.com/"]
188
+ concurrent_requests = 10
189
+
190
+ async def parse(self, response: Response):
191
+ for quote in response.css('.quote'):
192
+ yield {
193
+ "text": quote.css('.text::text').get(),
194
+ "author": quote.css('.author::text').get(),
195
+ }
196
+
197
+ next_page = response.css('.next a')
198
+ if next_page:
199
+ yield response.follow(next_page[0].attrib['href'])
200
+
201
+ result = QuotesSpider().start()
202
+ print(f"Scraped {len(result.items)} quotes")
203
+ result.items.to_json("quotes.json")
204
+ ```
205
+ Use multiple session types in a single spider:
206
+ ```python
207
+ from scrapling.spiders import Spider, Request, Response
208
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
209
+
210
+ class MultiSessionSpider(Spider):
211
+ name = "multi"
212
+ start_urls = ["https://example.com/"]
213
+
214
+ def configure_sessions(self, manager):
215
+ manager.add("fast", FetcherSession(impersonate="chrome"))
216
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
217
+
218
+ async def parse(self, response: Response):
219
+ for link in response.css('a::attr(href)').getall():
220
+ # Route protected pages through the stealth session
221
+ if "protected" in link:
222
+ yield Request(link, sid="stealth")
223
+ else:
224
+ yield Request(link, sid="fast", callback=self.parse) # explicit callback
225
+ ```
226
+ Pause and resume long crawls with checkpoints by running the spider like this:
227
+ ```python
228
+ QuotesSpider(crawldir="./crawl_data").start()
229
+ ```
230
+ Press Ctrl+C to pause gracefully — progress is saved automatically. Later, when you start the spider again, pass the same `crawldir`, and it will resume from where it stopped.
231
 
232
  ### Advanced Parsing & Navigation
233
  ```python
 
248
  quotes = page.find_by_text('quote', tag='div')
249
 
250
  # Advanced navigation
251
+ quote_text = page.css('.quote')[0].css('.text::text').get()
252
+ quote_text = page.css('.quote').css('.text::text').getall() # Chained selectors
253
+ first_quote = page.css('.quote')[0]
 
254
  author = first_quote.next_sibling.css('.author::text')
255
  parent_container = first_quote.parent
256
 
 
291
 
292
  ## CLI & Interactive Shell
293
 
294
+ Scrapling includes a powerful command-line interface:
295
 
296
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
297
 
 
308
  ```
309
 
310
  > [!NOTE]
311
+ > There are many additional features, but we want to keep this page concise, including the MCP server and the interactive Web Scraping Shell. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
312
 
313
  ## Performance Benchmarks
314
 
315
+ Scrapling isn't just powerful—it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries.
316
 
317
  ### Text Extraction Speed Test (5000 nested elements)
318
 
319
  | # | Library | Time (ms) | vs Scrapling |
320
  |---|:-----------------:|:---------:|:------------:|
321
+ | 1 | Scrapling | 2.02 | 1.0x |
322
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
323
+ | 3 | Raw Lxml | 2.54 | 1.257x |
324
+ | 4 | PyQuery | 24.17 | ~12x |
325
+ | 5 | Selectolax | 82.63 | ~41x |
326
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
327
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
328
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
329
 
330
 
331
  ### Element Similarity & Text Search Performance
332
 
333
  Scrapling's adaptive element finding capabilities significantly outperform alternatives:
334
 
335
+ | Library | Time (ms) | vs Scrapling |
336
  |-------------|:---------:|:------------:|
337
+ | Scrapling | 2.39 | 1.0x |
338
+ | AutoScraper | 12.45 | 5.209x |
339
 
340
 
341
  > All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
 
348
  pip install scrapling
349
  ```
350
 
351
+ This installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies.
352
 
353
  ### Optional Dependencies
354
 
 
405
  This project includes code adapted from:
406
  - Parsel (BSD License)—Used for [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule
407
 
 
 
 
 
 
 
 
408
  ---
409
  <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
benchmarks.py CHANGED
@@ -75,7 +75,7 @@ def test_scrapling():
75
  # No need to do `.extract()` like parsel to extract text
76
  # Also, this is faster than `[t.text for t in Selector(large_html, adaptive=False).css('.item')]`
77
  # for obvious reasons, of course.
78
- return ScraplingSelector(large_html, adaptive=False).css(".item::text")
79
 
80
 
81
  @benchmark
 
75
  # No need to do `.extract()` like parsel to extract text
76
  # Also, this is faster than `[t.text for t in Selector(large_html, adaptive=False).css('.item')]`
77
  # for obvious reasons, of course.
78
+ return ScraplingSelector(large_html, adaptive=False).css(".item::text").getall()
79
 
80
 
81
  @benchmark
docs/README_AR.md CHANGED
@@ -1,9 +1,14 @@
1
- <p align=center>
2
- <br>
3
- <a href="https://scrapling.readthedocs.io/en/latest/" target="_blank"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png" style="width: 50%; height: 100%;" alt="main poster"/></a>
4
- <br>
5
- <i><code>استخراج بيانات الويب بسهولة ويسر كما يجب أن يكون!</code></i>
6
- </p>
 
 
 
 
 
7
  <p align="center">
8
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
9
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
@@ -24,46 +29,47 @@
24
  </p>
25
 
26
  <p align="center">
27
- <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
28
- طرق الاختيار
29
- </a>
30
- ·
31
- <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/">
32
- اختيار الجالب
33
- </a>
34
- ·
35
- <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
36
- واجهة سطر الأوامر
37
- </a>
38
- ·
39
- <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
40
- وضع MCP
41
- </a>
42
- ·
43
- <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
44
- الانتقال من Beautifulsoup
45
- </a>
46
  </p>
47
 
48
- **توقف عن محاربة أنظمة مكافحة الروبوتات. توقف عن إعادة كتابة المحددات بعد كل تحديث للموقع.**
49
 
50
- Scrapling ليست مجرد مكتبة أخرى لاستخراج بيانات الويب. إنها أول مكتبة استخراج **تكيفية** تتعلم من تغييرات المواقع وتتطور معها. بينما تتعطل المكتبات الأخرى عندما تحدث المواقع بنيتها، يعيد Scrapling تحديد موقع عناصرك تلقائياً ويحافظ على عمل أدوات الاستخراج الخاصة بك.
51
 
52
- مبني للويب الحديث، يتميز Scrapling **بمحرك تحليل سريع خاص به** وجوالب للتعامل مع جميع تحديات استخراج بيانات الويب التي تواجهها أو ستواجهها. مبني بواسطة مستخرجي الويب لمستخرجي الويب والمستخدمين العاديين، هناك شيء للجميع.
53
 
54
  ```python
55
- >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
56
- >> StealthyFetcher.adaptive = True
57
- # احصل على كود المصدر للمواقع بشكل خفي!
58
- >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
59
- >> print(page.status)
60
- 200
61
- >> products = page.css('.product', auto_save=True) # استخرج البيانات التي تنجو من تغييرات تصميم الموقع!
62
- >> # لاحقاً، إذا تغيرت بنية الموقع، مرر `adaptive=True`
63
- >> products = page.css('.product', adaptive=True) # و Scrapling لا يزال يجدها!
64
  ```
 
 
 
65
 
66
- # الرعاة
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  <!-- sponsors -->
69
 
@@ -87,138 +93,211 @@ Scrapling ليست مجرد مكتبة أخرى لاستخراج بيانات ا
87
 
88
  ## الميزات الرئيسية
89
 
 
 
 
 
 
 
 
 
 
90
  ### جلب متقدم للمواقع مع دعم الجلسات
91
- - **طلبات HTTP**: طلبات HTTP سريعة وخفية مع فئة `Fetcher`. يمكنها تقليد بصمة TLS للمتصفح والرؤوس واستخدام HTTP3.
92
  - **التحميل الديناميكي**: جلب المواقع الديناميكية مع أتمتة كاملة للمتصفح من خلال فئة `DynamicFetcher` التي تدعم Chromium من Playwright و Google Chrome.
93
- - **تجاوز مكافحة الروبوتات**: قدرات تخفي متقدمة مع `StealthyFetcher` وانتحال البصمات. يمكنه تجاوز جميع أنواع Turnstile/Interstitial من Cloudflare بسهولة بالأتمتة.
94
  - **إدارة الجلسات**: دعم الجلسات المستمرة مع فئات `FetcherSession` و`StealthySession` و`DynamicSession` لإدارة ملفات تعريف الارتباط والحالة عبر الطلبات.
 
 
95
  - **دعم Async**: دعم async كامل عبر جميع الجوالب وفئات الجلسات async المخصصة.
96
 
97
  ### الاستخراج التكيفي والتكامل مع الذكاء الاصطناعي
98
  - 🔄 **تتبع العناصر الذكي**: إعادة تحديد موقع العناصر بعد تغييرات الموقع باستخدام خوارزميات التشابه الذكية.
99
  - 🎯 **الاختيار المرن الذكي**: محددات CSS، محددات XPath، البحث القائم على الفلاتر، البحث النصي، البحث بالتعبيرات العادية والمزيد.
100
  - 🔍 **البحث عن عناصر مشابهة**: تحديد العناصر المشابهة للعناصر الموجودة تلقائياً.
101
- - 🤖 **خادم MCP للاستخدام مع الذكاء الاصطناعي**: خادم MCP مدمج لاستخراج بيانات الويب بمساعدة الذكاء الاصطناعي واستخراج البيانات. يتميز خادم MCP بقدرات قوية مخصصة تستفيد من Scrapling لاستخراج المحتوى المستهدف قبل تمريره إلى الذكاء الاصطناعي (Claude/Cursor/إلخ)، وبالتالي تسريع العمليات وتقليل التكاليف عن طريق تقليل استخدام الرموز. ([فيديو توضيحي](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
102
 
103
- ### بنية عالية الأداء ومختبرة في المعارك
104
- - 🚀 **سريع كالبرق**: أداء محسّن يتفوق على معظم مكتبات استخراج Python.
105
  - 🔋 **فعال في استخدام الذاكرة**: هياكل بيانات محسّنة وتحميل كسول لأقل استخدام للذاكرة.
106
  - ⚡ **تسلسل JSON سريع**: أسرع 10 مرات من المكتبة القياسية.
107
- - 🏗️ **مُختبر في المعارك**: لا يمتلك Scrapling فقط تغطية اختبار بنسبة 92٪ وتغطية كاملة لتلميحات الأنواع، ولكن تم استخدامه يومياً من قبل مئات مستخرجي الويب خلال العام الماضي.
108
 
109
  ### تجربة صديقة للمطورين/مستخرجي الويب
110
- - 🎯 **غلاف استخراج ويب تفاعلي**: غلاف IPython مدمج اختياري مع تكامل Scrapling، واختصارات، وأدوات جديدة لتسريع تطوير سكريبتات استخراج الويب، مثل تحويل طلبات curl إلى طلبات Scrapling وعرض نتائج الطلبات في متصفحك.
111
  - 🚀 **استخدمه مباشرة من الطرفية**: اختيارياً، يمكنك استخدام Scrapling لاستخراج عنوان URL دون كتابة سطر واحد من الكود!
112
- - 🛠️ **واجهة برمجة تطبيقات التنقل الغنية**: اجتياز DOM متقدم مع طرق التنقل بين الوالدين والأشقاء والأطفال.
113
- - 🧬 **معالجة نصوص محسّنة**: تعبيرات عادية مدمجة وطرق تنظيف وعمليات سلسلة محسّنة.
114
- - 📝 **إنشاء محدد تلقائي**: إنشاء محددات CSS/XPath قوية لأي عنصر.
115
- - 🔌 **واجهة برمجة تطبيقات مألوفة**: مشابه لـ Scrapy/BeautifulSoup مع نفس العناصر الزائفة المستخدمة في Scrapy/Parsel.
116
- - 📘 **تغطية كاملة للأنواع**: تلميحات نوع كاملة لدعم IDE ممتاز وإكمال الكود.
117
  - 🔋 **صورة Docker جاهزة**: مع كل إصدار، يتم بناء ودفع صورة Docker تحتوي على جميع المتصفحات تلقائياً.
118
 
119
  ## البدء
120
 
 
 
121
  ### الاستخدام الأساسي
 
122
  ```python
123
- from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher
124
- from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
125
 
126
- # طلبات HTTP مع دعم الجلسات
127
  with FetcherSession(impersonate='chrome') as session: # استخدم أحدث إصدار من بصمة TLS لـ Chrome
128
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
129
- quotes = page.css('.quote .text::text')
130
 
131
  # أو استخدم طلبات لمرة واحدة
132
  page = Fetcher.get('https://quotes.toscrape.com/')
133
- quotes = page.css('.quote .text::text')
 
 
 
 
134
 
135
- # وضع التخفي المتقدم (احتفظ بالمتصفح مفتوحاً حتى تنتهي)
136
- with StealthySession(headless=True, solve_cloudflare=True) as session:
137
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
138
- data = page.css('#padded_content a')
139
 
140
  # أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء
141
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
142
- data = page.css('#padded_content a')
143
-
144
- # أتمتة المتصفح الكاملة (احتفظ بالمتصفح مفتوحاً حتى تنتهي)
145
- with DynamicSession(headless=True) as session:
146
- page = session.fetch('https://quotes.toscrape.com/', network_idle=True)
147
- quotes = page.css('.quote .text::text')
148
-
149
- # أو استخدم نمط الطلب لمرة واحدة
150
- page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True)
151
- quotes = page.css('.quote .text::text')
 
 
 
152
  ```
153
 
154
- ### اختيار العناصر
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  ```python
156
- # محددات CSS
157
- page.css('a::text') # استخراج النص
158
- page.css('a::attr(href)') # استخراج السمات
159
- page.css('a', recursive=False) # العناصر المباشرة فقط
160
- page.css('a', auto_save=True) # حفظ مواضع العناصر تلقائياً
161
-
162
- # XPath
163
- page.xpath('//a/text()')
164
-
165
- # بحث مرن
166
- page.find_by_text('Python', first_match=True) # البحث بالنص
167
- page.find_by_regex(r'\d{4}') # البحث بنمط التعبير العادي
168
- page.find('div', {'class': 'container'}) # البحث بالسمات
169
-
170
- # التنقل
171
- element.parent # الحصول على العنصر الوالد
172
- element.next_sibling # الحصول على الشقيق التالي
173
- element.children # الحصول على الأطفال
174
-
175
- # عناصر مشابهة
176
- similar = page.get_similar(element) # البحث عن عناصر مشابهة
177
-
178
- # الاستخراج التكيفي
179
- saved_elements = page.css('.product', auto_save=True)
180
- # لاحقاً، عندما يتغير الموقع:
181
- page.css('.product', adaptive=True) # البحث عن العناصر باستخدام المواضع المحفوظة
182
  ```
 
183
 
184
- ### استخدام الجلسة
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  ```python
186
- from scrapling.fetchers import FetcherSession, AsyncFetcherSession
187
-
188
- # جلسة متزامنة
189
- with FetcherSession() as session:
190
- # يتم الاحتفاظ بملفات تعريف الارتباط تلقائياً
191
- page1 = session.get('https://quotes.toscrape.com/login')
192
- page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'})
193
-
194
- # تبديل بصمة المتصفح إذا لزم الأمر
 
 
 
 
195
  page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
196
 
197
  # استخدام جلسة async
198
  async with AsyncStealthySession(max_pages=2) as session:
199
  tasks = []
200
  urls = ['https://example.com/page1', 'https://example.com/page2']
201
-
202
  for url in urls:
203
  task = session.fetch(url)
204
  tasks.append(task)
205
-
206
  print(session.get_pool_stats()) # اختياري - حالة مجموعة علامات تبويب المتصفح (مشغول/حر/خطأ)
207
  results = await asyncio.gather(*tasks)
208
  print(session.get_pool_stats())
209
  ```
210
 
211
- ## واجهة سطر الأوامر والغلاف التفاعلي
212
 
213
- يتضمن Scrapling v0.3 واجهة سطر أوامر قوية:
214
 
215
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
216
 
217
- تشغيل غلاف استخراج الويب التفاعلي
218
  ```bash
219
  scrapling shell
220
  ```
221
- استخراج الصفحات إلى ملف مباشرة دون برمجة (يستخرج المحتوى داخل وسم `body` افتراضياً). إذا انتهى ملف الإخراج بـ `.txt`، فسيتم استخراج محتوى النص للهدف. إذا انتهى بـ `.md`، فسيكون تمثيل Markdown لمحتوى HTML؛ إذا انتهى بـ `.html`، فسيكون محتوى HTML نفسه.
222
  ```bash
223
  scrapling extract get 'https://example.com' content.md
224
  scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # جميع العناصر المطابقة لمحدد CSS '#fromSkipToProducts'
@@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
227
  ```
228
 
229
  > [!NOTE]
230
- > هناك العديد من الميزات الإضافية، لكننا نريد إبقاء هذه الصفحة موجزة، مثل خادم MCP وغلاف استخراج الويب التفاعلي. تحقق من الوثائق الكاملة [هنا](https://scrapling.readthedocs.io/en/latest/)
231
 
232
  ## معايير الأداء
233
 
234
- Scrapling ليس قوياً فقط - إنه أيضاً سريع بشكل مذهل، والتحديثات منذ الإصدار 0.3 قدمت تحسينات أداء استثنائية عبر جميع العمليات. تقارن المعايير التالية محلل Scrapling مع المكتبات الشائعة الأخرى.
235
 
236
  ### اختبار سرعة استخراج النص (5000 عنصر متداخل)
237
 
238
- | # | المكتبة | الوقت (ms) | vs Scrapling |
239
  |---|:-----------------:|:----------:|:------------:|
240
- | 1 | Scrapling | 1.99 | 1.0x |
241
- | 2 | Parsel/Scrapy | 2.01 | 1.01x |
242
- | 3 | Raw Lxml | 2.5 | 1.256x |
243
- | 4 | PyQuery | 22.93 | ~11.5x |
244
- | 5 | Selectolax | 80.57 | ~40.5x |
245
- | 6 | BS4 with Lxml | 1541.37 | ~774.6x |
246
- | 7 | MechanicalSoup | 1547.35 | ~777.6x |
247
- | 8 | BS4 with html5lib | 3410.58 | ~1713.9x |
248
 
249
 
250
  ### أداء تشابه العناصر والبحث النصي
@@ -253,39 +332,39 @@ Scrapling ليس قوياً فقط - إنه أيضاً سريع بشكل مذه
253
 
254
  | المكتبة | الوقت (ms) | vs Scrapling |
255
  |-------------|:----------:|:------------:|
256
- | Scrapling | 2.46 | 1.0x |
257
- | AutoScraper | 13.3 | 5.407x |
258
 
259
 
260
  > تمثل جميع المعايير متوسطات أكثر من 100 تشغيل. انظر [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) للمنهجية.
261
 
262
  ## التثبيت
263
 
264
- يتطلب Scrapling Python 3.10 أو أعلى:
265
 
266
  ```bash
267
  pip install scrapling
268
  ```
269
 
270
- بدءاً من v0.3.2، يتضمن هذا التثبيت فقط محرك المحلل وتبعياته، بدون أي جوالب أو تبعيات سطر أوامر.
271
 
272
  ### التبعيات الاختيارية
273
 
274
  1. إذا كنت ستستخدم أياً من الميزات الإضافية أدناه، أو الجوالب، أو فئاتها، فستحتاج إلى تثبيت تبعيات الجوالب وتبعيات المتصفح الخاصة بها على النحو التالي:
275
  ```bash
276
  pip install "scrapling[fetchers]"
277
-
278
  scrapling install
279
  ```
280
 
281
- يقوم هذا بتنزيل جميع المتصفحات، إلى جانب تبعيات النظام وتبعيات معالجة البصمات الخاصة بها.
282
 
283
  2. ميزات إضافية:
284
  - تثبيت ميزة خادم MCP:
285
  ```bash
286
  pip install "scrapling[ai]"
287
  ```
288
- - تثبيت ميزات الغلاف (غلاف استخراج الويب وأمر `extract`):
289
  ```bash
290
  pip install "scrapling[shell]"
291
  ```
@@ -322,14 +401,7 @@ docker pull ghcr.io/d4vinci/scrapling:latest
322
  ## الشكر والتقدير
323
 
324
  يتضمن هذا المشروع كوداً معدلاً من:
325
- - Parsel (ترخيص BSD) - يستخدم للوحدة الفرعية [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
326
-
327
- ## الشكر والمراجع
328
-
329
- - العمل الرائع لـ [Daijro](https://github.com/daijro) على [BrowserForge](https://github.com/daijro/browserforge) و[Camoufox](https://github.com/daijro/camoufox)
330
- - العمل الرائع لـ [Vinyzu](https://github.com/Vinyzu) على [Botright](https://github.com/Vinyzu/Botright) و[PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
331
- - [brotector](https://github.com/kaliiiiiiiiii/brotector) لتقنيات تجاوز اكتشاف المتصفح
332
- - [fakebrowser](https://github.com/kkoooqq/fakebrowser) و[BotBrowser](https://github.com/botswin/BotBrowser) لأبحاث البصمات
333
 
334
  ---
335
- <div align="center"><small>مصمم ومصنوع بـ ❤️ بواسطة كريم شعير.</small></div><br>
 
1
+ <h1 align="center">
2
+ <a href="https://scrapling.readthedocs.io">
3
+ <picture>
4
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
5
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
6
+ </picture>
7
+ </a>
8
+ <br>
9
+ <small>Effortless Web Scraping for the Modern Web</small>
10
+ </h1>
11
+
12
  <p align="center">
13
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
14
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
 
29
  </p>
30
 
31
  <p align="center">
32
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>طرق الاختيار</strong></a>
33
+ &middot;
34
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>اختيار Fetcher</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>واجهة سطر الأوامر</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>وضع MCP</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>الانتقال من Beautifulsoup</strong></a>
 
 
 
 
 
 
 
 
 
 
41
  </p>
42
 
43
+ Scrapling هو إطار عمل تكيفي لـ Web Scraping يتعامل مع كل شيء من طلب واحد إلى زحف كامل النطاق.
44
 
45
+ محلله يتعلم من تغييرات المواقع ويعيد تحديد موقع عناصرك تلقائياً عند تحديث الصفحات. جوالبه تتجاوز أنظمة مكافحة الروبوتات مثل Cloudflare Turnstile مباشرةً. وإطار عمل Spider الخاص به يتيح لك التوسع إلى عمليات زحف متزامنة ومتعددة الجلسات مع إيقاف/استئناف وتدوير تلقائي لـ Proxy - كل ذلك في بضعة أسطر من Python. مكتبة واحدة، بدون تنازلات.
46
 
47
+ زحف سريع للغاية مع إحصائيات فورية و Streaming. مبني بواسطة مستخرجي الويب لمستخرجي الويب والمستخدمين العاديين، هناك شيء للجميع.
48
 
49
  ```python
50
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
51
+ StealthyFetcher.adaptive = True
52
+ page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # احصل على الموقع بشكل خفي!
53
+ products = page.css('.product', auto_save=True) # استخرج بيانات تنجو من تغييرات تصميم الموقع!
54
+ products = page.css('.product', adaptive=True) # لاحقاً، إذا تغيرت بنية الموقع، مرر `adaptive=True` للعثور عليها!
 
 
 
 
55
  ```
56
+ أو توسع إلى عمليات زحف كاملة
57
+ ```python
58
+ from scrapling.spiders import Spider, Response
59
 
60
+ class MySpider(Spider):
61
+ name = "demo"
62
+ start_urls = ["https://example.com/"]
63
+
64
+ async def parse(self, response: Response):
65
+ for item in response.css('.product'):
66
+ yield {"title": item.css('h2::text').get()}
67
+
68
+ MySpider().start()
69
+ ```
70
+
71
+
72
+ # الرعاة
73
 
74
  <!-- sponsors -->
75
 
 
93
 
94
  ## الميزات الرئيسية
95
 
96
+ ### Spiders — إطار عمل زحف كامل
97
+ - 🕷️ **واجهة Spider شبيهة بـ Scrapy**: عرّف Spiders مع `start_urls`، و async `parse` callbacks، وكائنات `Request`/`Response`.
98
+ - ⚡ **زحف متزامن**: حدود تزامن قابلة للتكوين، وتحكم بالسرعة حسب النطاق، وتأخيرات التنزيل.
99
+ - 🔄 **دعم الجلسات المتعددة**: واجهة موحدة لطلبات HTTP، ومتصفحات خفية بدون واجهة في Spider واحد — وجّه الطلبات إلى جلسات مختلفة بالمعرّف.
100
+ - 💾 **إيقاف واستئناف**: استمرارية الزحف القائمة على Checkpoint. اضغط Ctrl+C للإيقاف بسلاسة؛ أعد التشغيل للاستئناف من حيث توقفت.
101
+ - 📡 **وضع Streaming**: بث العناصر المستخرجة فور وصولها عبر `async for item in spider.stream()` مع إحصائيات فورية — مثالي لواجهات المستخدم وخطوط الأنابيب وعمليات الزحف الطويلة.
102
+ - 🛡️ **كشف الطلبات المحظورة**: كشف تلقائي وإعادة محاولة للطلبات المحظورة مع منطق قابل للتخصيص.
103
+ - 📦 **تصدير مدمج**: صدّر النتائج عبر الخطافات وخط الأنابيب الخاص بك أو JSON/JSONL المدمج مع `result.items.to_json()` / `result.items.to_jsonl()` على التوالي.
104
+
105
  ### جلب متقدم للمواقع مع دعم الجلسات
106
+ - **طلبات HTTP**: طلبات HTTP سريعة وخفية مع فئة `Fetcher`. يمكنها تقليد بصمة TLS للمتصفح والرؤوس واستخدام HTTP/3.
107
  - **التحميل الديناميكي**: جلب المواقع الديناميكية مع أتمتة كاملة للمتصفح من خلال فئة `DynamicFetcher` التي تدعم Chromium من Playwright و Google Chrome.
108
+ - **تجاوز مكافحة الروبوتات**: قدرات تخفي متقدمة مع `StealthyFetcher` وانتحال fingerprint. يمكنه تجاوز جميع أنواع Turnstile/Interstitial من Cloudflare بسهولة بالأتمتة.
109
  - **إدارة الجلسات**: دعم الجلسات المستمرة مع فئات `FetcherSession` و`StealthySession` و`DynamicSession` لإدارة ملفات تعريف الارتباط والحالة عبر الطلبات.
110
+ - **تدوير Proxy**: `ProxyRotator` مدمج مع استراتيجيات التدوير الدوري أو المخصصة عبر جميع أنواع الجلسات، بالإضافة إلى تجاوزات Proxy لكل طلب.
111
+ - **حظر النطاقات**: حظر الطلبات إلى نطاقات محددة (ونطاقاتها الفرعية) في الجوالب المعتمدة على المتصفح.
112
  - **دعم Async**: دعم async كامل عبر جميع الجوالب وفئات الجلسات async المخصصة.
113
 
114
  ### الاستخراج التكيفي والتكامل مع الذكاء الاصطناعي
115
  - 🔄 **تتبع العناصر الذكي**: إعادة تحديد موقع العناصر بعد تغييرات الموقع باستخدام خوارزميات التشابه الذكية.
116
  - 🎯 **الاختيار المرن الذكي**: محددات CSS، محددات XPath، البحث القائم على الفلاتر، البحث النصي، البحث بالتعبيرات العادية والمزيد.
117
  - 🔍 **البحث عن عناصر مشابهة**: تحديد العناصر المشابهة للعناصر الموجودة تلقائياً.
118
+ - 🤖 **خادم MCP للاستخدام مع الذكاء الاصطناعي**: خادم MCP مدمج لـ Web Scraping بمساعدة الذكاء الاصطناعي واستخراج البيانات. يتميز خادم MCP بقدرات قوية مخصصة تستفيد من Scrapling لاستخراج المحتوى المستهدف قبل تمريره إلى الذكاء الاصطناعي (Claude/Cursor/إلخ)، وبالتالي تسريع العمليات وتقليل التكاليف عن طريق تقليل استخدام الرموز. ([فيديو توضيحي](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
119
 
120
+ ### بنية عالية الأداء ومختبرة ميدانياً
121
+ - 🚀 **سريع كالبرق**: أداء محسّن يتفوق على معظم مكتبات Web Scraping في Python.
122
  - 🔋 **فعال في استخدام الذاكرة**: هياكل بيانات محسّنة وتحميل كسول لأقل استخدام للذاكرة.
123
  - ⚡ **تسلسل JSON سريع**: أسرع 10 مرات من المكتبة القياسية.
124
+ - 🏗️ **مُختبر ميدانياً**: لا يمتلك Scrapling فقط تغطية اختبار بنسبة 92٪ وتغطية كاملة لتلميحات الأنواع، بل تم استخدامه يومياً من قبل مئات مستخرجي الويب خلال العام الماضي.
125
 
126
  ### تجربة صديقة للمطورين/مستخرجي الويب
127
+ - 🎯 **Shell تفاعلي لـ Web Scraping**: Shell IPython مدمج اختياري مع تكامل Scrapling، واختصارات، وأدوات جديدة لتسريع تطوير سكريبتات Web Scraping، مثل تحويل طلبات curl إلى طلبات Scrapling وعرض نتائج الطلبات في متصفحك.
128
  - 🚀 **استخدمه مباشرة من الطرفية**: اختيارياً، يمكنك استخدام Scrapling لاستخراج عنوان URL دون كتابة سطر واحد من الكود!
129
+ - 🛠️ **واجهة تنقل غنية**: اجتياز DOM متقدم مع طرق التنقل بين العناصر الوالدية والشقيقة والفرعية.
130
+ - 🧬 **معالجة نصوص محسّنة**: تعبيرات عادية مدمجة وطرق تنظيف وعمليات نصية محسّنة.
131
+ - 📝 **إنشاء محددات تلقائي**: إنشاء محددات CSS/XPath قوية لأي عنصر.
132
+ - 🔌 **واجهة مألوفة**: مشابه لـ Scrapy/BeautifulSoup مع نفس العناصر الزائفة المستخدمة في Scrapy/Parsel.
133
+ - 📘 **تغطية كاملة للأنواع**: تلميحات نوع كاملة لدعم IDE ممتاز وإكمال الكود. يتم فحص قاعدة الكود بالكامل تلقائياً بواسطة **PyRight** و**MyPy** مع كل تغيير.
134
  - 🔋 **صورة Docker جاهزة**: مع كل إصدار، يتم بناء ودفع صورة Docker تحتوي على جميع المتصفحات تلقائياً.
135
 
136
  ## البدء
137
 
138
+ لنلقِ نظرة سريعة على ما يمكن لـ Scrapling فعله دون التعمق.
139
+
140
  ### الاستخدام الأساسي
141
+ طلبات HTTP مع دعم الجلسات
142
  ```python
143
+ from scrapling.fetchers import Fetcher, FetcherSession
 
144
 
 
145
  with FetcherSession(impersonate='chrome') as session: # استخدم أحدث إصدار من بصمة TLS لـ Chrome
146
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
147
+ quotes = page.css('.quote .text::text').getall()
148
 
149
  # أو استخدم طلبات لمرة واحدة
150
  page = Fetcher.get('https://quotes.toscrape.com/')
151
+ quotes = page.css('.quote .text::text').getall()
152
+ ```
153
+ وضع التخفي المتقدم
154
+ ```python
155
+ from scrapling.fetchers import StealthyFetcher, StealthySession
156
 
157
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # أبقِ المتصفح مفتوحاً حتى تنتهي
 
158
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
159
+ data = page.css('#padded_content a').getall()
160
 
161
  # أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء
162
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
163
+ data = page.css('#padded_content a').getall()
164
+ ```
165
+ أتمتة المتصفح الكاملة
166
+ ```python
167
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
168
+
169
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # أبقِ المتصفح مفتوحاً حتى تنتهي
170
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
171
+ data = page.xpath('//span[@class="text"]/text()').getall() # محدد XPath إذا كنت تفضله
172
+
173
+ # أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء
174
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
175
+ data = page.css('.quote .text::text').getall()
176
  ```
177
 
178
+ ### Spiders
179
+ ابنِ زواحف كاملة مع طلبات متزامنة وأنواع جلسات متعددة وإيقاف/استئناف:
180
+ ```python
181
+ from scrapling.spiders import Spider, Request, Response
182
+
183
+ class QuotesSpider(Spider):
184
+ name = "quotes"
185
+ start_urls = ["https://quotes.toscrape.com/"]
186
+ concurrent_requests = 10
187
+
188
+ async def parse(self, response: Response):
189
+ for quote in response.css('.quote'):
190
+ yield {
191
+ "text": quote.css('.text::text').get(),
192
+ "author": quote.css('.author::text').get(),
193
+ }
194
+
195
+ next_page = response.css('.next a')
196
+ if next_page:
197
+ yield response.follow(next_page[0].attrib['href'])
198
+
199
+ result = QuotesSpider().start()
200
+ print(f"Scraped {len(result.items)} quotes")
201
+ result.items.to_json("quotes.json")
202
+ ```
203
+ استخدم أنواع جلسات متعددة في Spider واحد:
204
+ ```python
205
+ from scrapling.spiders import Spider, Request, Response
206
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
207
+
208
+ class MultiSessionSpider(Spider):
209
+ name = "multi"
210
+ start_urls = ["https://example.com/"]
211
+
212
+ def configure_sessions(self, manager):
213
+ manager.add("fast", FetcherSession(impersonate="chrome"))
214
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
215
+
216
+ async def parse(self, response: Response):
217
+ for link in response.css('a::attr(href)').getall():
218
+ # وجّه الصفحات المحمية عبر جلسة التخفي
219
+ if "protected" in link:
220
+ yield Request(link, sid="stealth")
221
+ else:
222
+ yield Request(link, sid="fast", callback=self.parse) # callback صريح
223
+ ```
224
+ أوقف واستأنف عمليات الزحف الطويلة مع Checkpoints بتشغيل Spider هكذا:
225
  ```python
226
+ QuotesSpider(crawldir="./crawl_data").start()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  ```
228
+ اضغط Ctrl+C للإيقاف بسلاسة — يتم حفظ التقدم تلقائياً. لاحقاً، عند تشغيل Spider مرة أخرى، مرر نفس `crawldir`، وسيستأنف من حيث توقف.
229
 
230
+ ### التحليل المتقدم والتنقل
231
+ ```python
232
+ from scrapling.fetchers import Fetcher
233
+
234
+ # اختيار عناصر غني وتنقل
235
+ page = Fetcher.get('https://quotes.toscrape.com/')
236
+
237
+ # احصل على الاقتباسات بطرق اختيار متعددة
238
+ quotes = page.css('.quote') # محدد CSS
239
+ quotes = page.xpath('//div[@class="quote"]') # XPath
240
+ quotes = page.find_all('div', {'class': 'quote'}) # بأسلوب BeautifulSoup
241
+ # نفس الشيء مثل
242
+ quotes = page.find_all('div', class_='quote')
243
+ quotes = page.find_all(['div'], class_='quote')
244
+ quotes = page.find_all(class_='quote') # وهكذا...
245
+ # البحث عن عنصر بمحتوى النص
246
+ quotes = page.find_by_text('quote', tag='div')
247
+
248
+ # التنقل المتقدم
249
+ quote_text = page.css('.quote')[0].css('.text::text').get()
250
+ quote_text = page.css('.quote').css('.text::text').getall() # محددات متسلسلة
251
+ first_quote = page.css('.quote')[0]
252
+ author = first_quote.next_sibling.css('.author::text')
253
+ parent_container = first_quote.parent
254
+
255
+ # علاقات العناصر والتشابه
256
+ similar_elements = first_quote.find_similar()
257
+ below_elements = first_quote.below_elements()
258
+ ```
259
+ يمكنك استخدام المحلل مباشرة إذا كنت لا تريد جلب المواقع كما يلي:
260
  ```python
261
+ from scrapling.parser import Selector
262
+
263
+ page = Selector("<html>...</html>")
264
+ ```
265
+ وهو يعمل بنفس الطريقة تماماً!
266
+
267
+ ### أمثلة إدارة الجلسات بشكل Async
268
+ ```python
269
+ import asyncio
270
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
271
+
272
+ async with FetcherSession(http3=True) as session: # `FetcherSession` واعٍ بالسياق ويعمل في كلا النمطين المتزامن/async
273
+ page1 = session.get('https://quotes.toscrape.com/')
274
  page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
275
 
276
  # استخدام جلسة async
277
  async with AsyncStealthySession(max_pages=2) as session:
278
  tasks = []
279
  urls = ['https://example.com/page1', 'https://example.com/page2']
280
+
281
  for url in urls:
282
  task = session.fetch(url)
283
  tasks.append(task)
284
+
285
  print(session.get_pool_stats()) # اختياري - حالة مجموعة علامات تبويب المتصفح (مشغول/حر/خطأ)
286
  results = await asyncio.gather(*tasks)
287
  print(session.get_pool_stats())
288
  ```
289
 
290
+ ## واجهة سطر الأوامر والـ Shell التفاعلي
291
 
292
+ يتضمن Scrapling واجهة سطر أوامر قوية:
293
 
294
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
295
 
296
+ تشغيل Shell الـ Web Scraping التفاعلي
297
  ```bash
298
  scrapling shell
299
  ```
300
+ استخرج الصفحات إلى ملف مباشرة دون برمجة (يستخرج المحتوى داخل وسم `body` افتراضياً). إذا انتهى ملف الإخراج بـ `.txt`، فسيتم استخراج محتوى النص للهدف. إذا انتهى بـ `.md`، فسيكون تمثيل Markdown لمحتوى HTML؛ إذا انتهى بـ `.html`، فسيكون محتوى HTML نفسه.
301
  ```bash
302
  scrapling extract get 'https://example.com' content.md
303
  scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # جميع العناصر المطابقة لمحدد CSS '#fromSkipToProducts'
 
306
  ```
307
 
308
  > [!NOTE]
309
+ > هناك العديد من الميزات الإضافية، لكننا نريد إبقاء هذه الصفحة موجزة، بما في ذلك خادم MCP والـ Shell التفاعلي لـ Web Scraping. تحقق من الوثائق الكاملة [هنا](https://scrapling.readthedocs.io/en/latest/)
310
 
311
  ## معايير الأداء
312
 
313
+ Scrapling ليس قوياً فحسب بل هو أيضاً سريع بشكل مذهل. تقارن المعايير التالية محلل Scrapling مع أحدث إصدارات المكتبات الشائعة الأخرى.
314
 
315
  ### اختبار سرعة استخراج النص (5000 عنصر متداخل)
316
 
317
+ | # | المكتبة | الوقت (ms) | vs Scrapling |
318
  |---|:-----------------:|:----------:|:------------:|
319
+ | 1 | Scrapling | 2.02 | 1.0x |
320
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
321
+ | 3 | Raw Lxml | 2.54 | 1.257x |
322
+ | 4 | PyQuery | 24.17 | ~12x |
323
+ | 5 | Selectolax | 82.63 | ~41x |
324
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
325
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
326
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
327
 
328
 
329
  ### أداء تشابه العناصر والبحث النصي
 
332
 
333
  | المكتبة | الوقت (ms) | vs Scrapling |
334
  |-------------|:----------:|:------------:|
335
+ | Scrapling | 2.39 | 1.0x |
336
+ | AutoScraper | 12.45 | 5.209x |
337
 
338
 
339
  > تمثل جميع المعايير متوسطات أكثر من 100 تشغيل. انظر [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) للمنهجية.
340
 
341
  ## التثبيت
342
 
343
+ يتطلب Scrapling إصدار Python 3.10 أو أعلى:
344
 
345
  ```bash
346
  pip install scrapling
347
  ```
348
 
349
+ يتضمن هذا التثبيت فقط محرك المحلل وتبعياته، بدون أي جوالب أو تبعيات سطر الأوامر.
350
 
351
  ### التبعيات الاختيارية
352
 
353
  1. إذا كنت ستستخدم أياً من الميزات الإضافية أدناه، أو الجوالب، أو فئاتها، فستحتاج إلى تثبيت تبعيات الجوالب وتبعيات المتصفح الخاصة بها على النحو التالي:
354
  ```bash
355
  pip install "scrapling[fetchers]"
356
+
357
  scrapling install
358
  ```
359
 
360
+ يقوم هذا بتنزيل جميع المتصفحات، إلى جانب تبعيات النظام وتبعيات معالجة fingerprint الخاصة بها.
361
 
362
  2. ميزات إضافية:
363
  - تثبيت ميزة خادم MCP:
364
  ```bash
365
  pip install "scrapling[ai]"
366
  ```
367
+ - تثبيت ميزات Shell (Shell الـ Web Scraping وأمر `extract`):
368
  ```bash
369
  pip install "scrapling[shell]"
370
  ```
 
401
  ## الشكر والتقدير
402
 
403
  يتضمن هذا المشروع كوداً معدلاً من:
404
+ - Parsel (ترخيص BSD) يُستخدم للوحدة الفرعية [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
 
 
 
 
 
 
 
405
 
406
  ---
407
+ <div align="center"><small>مصمم ومصنوع بـ ❤️ بواسطة كريم شعير.</small></div><br>
docs/README_CN.md CHANGED
@@ -1,9 +1,14 @@
1
- <p align=center>
2
- <br>
3
- <a href="https://scrapling.readthedocs.io/en/latest/" target="_blank"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png" style="width: 50%; height: 100%;" alt="main poster"/></a>
4
- <br>
5
- <i><code>简单、轻松的网页抓取,本该如此!</code></i>
6
- </p>
 
 
 
 
 
7
  <p align="center">
8
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
9
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
@@ -24,46 +29,47 @@
24
  </p>
25
 
26
  <p align="center">
27
- <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
28
- 选择方法
29
- </a>
30
- ·
31
- <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/">
32
- 选择获取器
33
- </a>
34
- ·
35
- <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
36
- 命令行界面
37
- </a>
38
- ·
39
- <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
40
- MCP模式
41
- </a>
42
- ·
43
- <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
44
- 从Beautifulsoup迁移
45
- </a>
46
  </p>
47
 
48
- **停止与反机器人系统斗争停止在每次网站更新后重写选择器。**
49
 
50
- Scrapling不仅仅是另一个网页抓取库。是第一个**自适应**抓取库,能够从网站变化中学习并与之共同进化。当其他库网站更新结构失效,Scrapling会自动重新定位您的元素并保持抓器运行。
51
 
52
- 为现代网络而构建,Scrapling具有**自己的快解析引擎**和获器来处理您面临或将要面临的所有网页抓取挑战。由网页抓取者网页抓取者和普通用户构建,适合每个人。
53
 
54
  ```python
55
- >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
56
- >> StealthyFetcher.adaptive = True
57
- # 隐秘地获取网站源代码
58
- >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
59
- >> print(page.status)
60
- 200
61
- >> products = page.css('.product', auto_save=True) # 抓取在网站设计变更后仍能存活的数据!
62
- >> # 之后,如果网站结构改变,传递 `adaptive=True`
63
- >> products = page.css('.product', adaptive=True) # Scrapling仍然能找到它们!
64
  ```
 
 
 
65
 
66
- # 赞助商
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  <!-- sponsors -->
69
 
@@ -87,122 +93,195 @@ Scrapling不仅仅是另一个网页��取库。它是第一个**自适应**抓
87
 
88
  ## 主要特性
89
 
90
- ### 支持会话高级网站获
91
- - **HTTP请求**:使用`Fetcher`类进行快速隐秘的HTTP请求可以模拟浏览器的TLS指纹、标头并使用HTTP3。
 
 
 
 
 
 
 
 
 
92
  - **动态加载**:通过`DynamicFetcher`类使用完整的浏览器自动化获取动态网站,支持Playwright的Chromium和Google Chrome。
93
- - **反机器人绕过**:使用`StealthyFetcher`的高级隐秘功能和指纹伪装。可以轻松自动绕过所有类型的CloudflareTurnstile/Interstitial。
94
- - **会话管理**:使用`FetcherSession`、`StealthySession`和`DynamicSession`类持久化会话支持,用于跨请求的cookie和状态管理。
95
- - **异步支持**:所有获取器和专用异步会话的完整异步支持。
 
 
96
 
97
  ### 自适应抓取和AI集成
98
  - 🔄 **智能元素跟踪**:使用智能相似性算法在网站更改后重新定位元素。
99
  - 🎯 **智能灵活选择**:CSS选择器、XPath选择器、基于过滤器的搜索、文本搜索、正则表达式搜索等。
100
- - 🔍 **查找相似元素**:自动定位与找到元素相似的元素。
101
- - 🤖 **与AI一起使用的MCP服务器**:内置MCP服务器用于AI辅助网页抓取和数据提取。MCP服务器具有强大的自定义功能,利用Scrapling在将内容传递给AI(Claude/Cursor等)之前提取目标内容,从而加快操作并通过最小化令牌使用来降低成本。([演示视频](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
102
 
103
  ### 高性能和经过实战测试的架构
104
  - 🚀 **闪电般快速**:优化性能超越大多数Python抓取库。
105
  - 🔋 **内存高效**:优化的数据结构和延迟加载,最小内存占用。
106
  - ⚡ **快速JSON序列化**:比标准库快10倍。
107
- - 🏗️ **经过实战测试**:Scrapling不仅拥有92%的测试覆盖率和完整的类型提示覆盖率,而且在过去一年中每天被数百名网页抓取者使用。
108
 
109
- ### 对开发者/网页抓取者友好的体验
110
- - 🎯 **交互式网页抓取Shell**:可选的内置IPython shell,具有Scrapling集成、快捷方式和新工具,可加快网页抓取脚本开发,例如将curl请求转换为Scrapling请求并在浏览器中查看请求结果。
111
  - 🚀 **直接从终端使用**:可选地,您可以使用Scrapling抓取URL而无需编��任何代码!
112
  - 🛠️ **丰富的导航API**:使用父级、兄弟级和子级导航方法进行高级DOM遍历。
113
  - 🧬 **增强的文本处理**:内置正则表达式、清理方法和优化的字符串操作。
114
  - 📝 **自动选择器生成**:为任何元素生成强大的CSS/XPath选择器。
115
  - 🔌 **熟悉的API**:类似于Scrapy/BeautifulSoup,使用与Scrapy/Parsel相同的伪元素。
116
- - 📘 **完整的类型覆盖**:完整的类型提示,出色的IDE支持和代码补全。
117
  - 🔋 **现成的Docker镜像**:每次发布时,包含所有浏览器的Docker镜像会自动构建和推送。
118
 
119
  ## 入门
120
 
 
 
121
  ### 基本用法
 
122
  ```python
123
- from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher
124
- from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
125
 
126
- # 支持会话HTTP请求
127
- with FetcherSession(impersonate='chrome') as session: # 使用Chrome的最新版本TLS指纹
128
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
129
- quotes = page.css('.quote .text::text')
130
 
131
  # 或使用一次性请求
132
  page = Fetcher.get('https://quotes.toscrape.com/')
133
- quotes = page.css('.quote .text::text')
 
 
 
 
134
 
135
- # 高级隐秘模式(保持浏览器打开直到完成
136
- with StealthySession(headless=True, solve_cloudflare=True) as session:
137
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
138
- data = page.css('#padded_content a')
139
 
140
  # 或使用一次性请求样式,为此请求打开浏览器,完成后关闭
141
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
142
- data = page.css('#padded_content a')
143
-
144
- # 完整的浏览器自动化(保持浏览器打开直到完成)
145
- with DynamicSession(headless=True) as session:
146
- page = session.fetch('https://quotes.toscrape.com/', network_idle=True)
147
- quotes = page.css('.quote .text::text')
148
-
149
- # 或使用一次性请求样式
150
- page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True)
151
- quotes = page.css('.quote .text::text')
152
  ```
 
 
 
 
 
 
 
153
 
154
- ### 元素选择
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  ```python
156
- # CSS选择器
157
- page.css('a::text') # 提取文本
158
- page.css('a::attr(href)') # 提取属性
159
- page.css('a', recursive=False) # 仅直接元素
160
- page.css('a', auto_save=True) # 自动保存元素位置
161
-
162
- # XPath
163
- page.xpath('//a/text()')
164
-
165
- # 灵活搜索
166
- page.find_by_text('Python', first_match=True) # 按文本查找
167
- page.find_by_regex(r'\d{4}') # 按正则表达式模式查找
168
- page.find('div', {'class': 'container'}) # 按属性查找
169
-
170
- # 导航
171
- element.parent # 获取父元素
172
- element.next_sibling # 获取下一个兄弟元素
173
- element.children # 获取子元素
174
-
175
- # 相似元素
176
- similar = page.get_similar(element) # 查找相似元素
177
-
178
- # 自适应抓取
179
- saved_elements = page.css('.product', auto_save=True)
180
- # 之后,当网站更改时:
181
- page.css('.product', adaptive=True) # 使用保存的位置查找元素
182
  ```
 
183
 
184
- ### 会话使用
185
  ```python
186
- from scrapling.fetchers import FetcherSession, AsyncFetcherSession
187
-
188
- # 同步会话
189
- with FetcherSession() as session:
190
- # Cookie自动保持
191
- page1 = session.get('https://quotes.toscrape.com/login')
192
- page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'})
193
-
194
- # 如需要,切换浏览器指纹
195
  page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
196
 
197
- # 异步会话使
198
  async with AsyncStealthySession(max_pages=2) as session:
199
  tasks = []
200
  urls = ['https://example.com/page1', 'https://example.com/page2']
201
-
202
  for url in urls:
203
  task = session.fetch(url)
204
  tasks.append(task)
205
-
206
  print(session.get_pool_stats()) # 可选 - 浏览器标签池的状态(忙/空闲/错误)
207
  results = await asyncio.gather(*tasks)
208
  print(session.get_pool_stats())
@@ -210,11 +289,11 @@ async with AsyncStealthySession(max_pages=2) as session:
210
 
211
  ## CLI和交互式Shell
212
 
213
- Scrapling v0.3包含强大的命令行界面:
214
 
215
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
216
 
217
- 启动交互式网页抓取shell
218
  ```bash
219
  scrapling shell
220
  ```
@@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
227
  ```
228
 
229
  > [!NOTE]
230
- > 还有许多其他功能,但我们希望保持此页面简洁,例如MCP服务器和交互式网页抓取Shell。查看完整文档[这里](https://scrapling.readthedocs.io/en/latest/)
231
 
232
  ## 性能基准
233
 
234
- Scrapling不仅功能强大——它还速度极快,自0.3版本以来的更新在所有操作中都提供了卓越的性能改进。以下基准测试将Scrapling的析器与其他流行库进行了比较。
235
 
236
  ### 文本提取速度测试(5000个嵌套元素)
237
 
238
- | # | 库 | 时间(ms) | vs Scrapling |
239
- |---|:-----------------:|:-------:|:------------:|
240
- | 1 | Scrapling | 1.99 | 1.0x |
241
- | 2 | Parsel/Scrapy | 2.01 | 1.01x |
242
- | 3 | Raw Lxml | 2.5 | 1.256x |
243
- | 4 | PyQuery | 22.93 | ~11.5x |
244
- | 5 | Selectolax | 80.57 | ~40.5x |
245
- | 6 | BS4 with Lxml | 1541.37 | ~774.6x |
246
- | 7 | MechanicalSoup | 1547.35 | ~777.6x |
247
- | 8 | BS4 with html5lib | 3410.58 | ~1713.9x |
248
 
249
 
250
  ### 元素相似性和文本搜索性能
@@ -252,9 +331,9 @@ Scrapling不仅功能强大——它还速度极快,自0.3版本以来的更
252
  Scrapling的自适应元素查找功能明显优于替代方案:
253
 
254
  | 库 | 时间(ms) | vs Scrapling |
255
- |-------------|:------:|:------------:|
256
- | Scrapling | 2.46 | 1.0x |
257
- | AutoScraper | 13.3 | 5.407x |
258
 
259
 
260
  > 所有基准测试代表100+次运行的平均值。请参阅[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)了解方法。
@@ -267,25 +346,25 @@ Scrapling需要Python 3.10或更高版本:
267
  pip install scrapling
268
  ```
269
 
270
- 从v0.3.2开始,此安装仅包括解析器引擎及其依赖项,没有任何获取器或命令行依赖项。
271
 
272
  ### 可选依赖项
273
 
274
- 1. 如果您要使用以下任何额外功能、获取器或它们的类,您将需要安装获取器的依赖项和它们的浏览器依赖项,如下所示:
275
  ```bash
276
  pip install "scrapling[fetchers]"
277
-
278
  scrapling install
279
  ```
280
 
281
- 这会下载所有浏览器,以及它们的系统依赖项和指纹操作依赖项。
282
 
283
  2. 额外功能:
284
  - 安装MCP服务器功能:
285
  ```bash
286
  pip install "scrapling[ai]"
287
  ```
288
- - 安装shell功能(网页抓取shell和`extract`命令):
289
  ```bash
290
  pip install "scrapling[shell]"
291
  ```
@@ -324,12 +403,5 @@ docker pull ghcr.io/d4vinci/scrapling:latest
324
  此项目包含改编自以下内容的代码:
325
  - Parsel(BSD许可证)——用于[translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)子模块
326
 
327
- ## 感谢和参考
328
-
329
- - [Daijro](https://github.com/daijro)在[BrowserForge](https://github.com/daijro/browserforge)和[Camoufox](https://github.com/daijro/camoufox)上的出色工作
330
- - [Vinyzu](https://github.com/Vinyzu)在[Botright](https://github.com/Vinyzu/Botright)和[PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)上的出色工作
331
- - [brotector](https://github.com/kaliiiiiiiiii/brotector)提供的浏览器检测绕过技术
332
- - [fakebrowser](https://github.com/kkoooqq/fakebrowser)和[BotBrowser](https://github.com/botswin/BotBrowser)提供的指纹识别研究
333
-
334
  ---
335
- <div align="center"><small>由Karim Shoair用❤️设计和制作。</small></div><br>
 
1
+ <h1 align="center">
2
+ <a href="https://scrapling.readthedocs.io">
3
+ <picture>
4
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
5
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
6
+ </picture>
7
+ </a>
8
+ <br>
9
+ <small>Effortless Web Scraping for the Modern Web</small>
10
+ </h1>
11
+
12
  <p align="center">
13
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
14
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
 
29
  </p>
30
 
31
  <p align="center">
32
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>选择方法</strong></a>
33
+ &middot;
34
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>选择Fetcher</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCP模式</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>从Beautifulsoup迁移</strong></a>
 
 
 
 
 
 
 
 
 
 
41
  </p>
42
 
43
 Scrapling是一个自适应Web Scraping框架,能处理从单个请求到大规模爬取的一切需求。
44
 
45
+ 它的解析器能够从网站变化中学习并在页面更新时自动重新定位您的元素。它的Fetcher能够开箱即用地绕过Cloudflare Turnstile等反机器人系统。它的Spider框架让您可以扩展到并发、多Session爬取,支持暂停/恢复和自动Proxy轮换——只需几行Python代码。一个库,零妥协。
46
 
47
+ 支持实时统计和Streaming。由Web Scraper为Web Scraper和普通用户构建,每个人都能找到适合自己的功能。
48
 
49
  ```python
50
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
51
+ StealthyFetcher.adaptive = True
52
+ page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # 隐秘地获取网站!
53
+ products = page.css('.product', auto_save=True) # 抓取在网站设计变更后仍能存活的数据!
54
+ products = page.css('.product', adaptive=True) # 之后,如果网站结构改变,传递 `adaptive=True` 来找到它们!
 
 
 
 
55
  ```
56
+ 或扩展为完整爬取
57
+ ```python
58
+ from scrapling.spiders import Spider, Response
59
 
60
+ class MySpider(Spider):
61
+ name = "demo"
62
+ start_urls = ["https://example.com/"]
63
+
64
+ async def parse(self, response: Response):
65
+ for item in response.css('.product'):
66
+ yield {"title": item.css('h2::text').get()}
67
+
68
+ MySpider().start()
69
+ ```
70
+
71
+
72
+ # 赞助商
73
 
74
  <!-- sponsors -->
75
 
 
93
 
94
  ## 主要特性
95
 
96
+ ### Spider — 完整框架
97
+ - 🕷️ **类Scrapy的Spider API**:使用`start_urls`、async `parse` callback和`Request`/`Response`对象定义Spider。
98
+ - ⚡ **并发爬取**:可配置的并发限制、按域名节流和下载延迟。
99
+ - 🔄 **多Session支持**:统一接口,支持HTTP请求和隐秘无头浏览器在同一个Spider中使用——通过ID将请求路由到不同的Session。
100
+ - 💾 **暂停与恢复**:基于Checkpoint的爬取持久化。按Ctrl+C优雅关闭;重启后从上次停止的地方继续。
101
+ - 📡 **Streaming模式**:通过`async for item in spider.stream()`以实时统计Streaming抓取的数据——非常适合UI、管道和长时间运行的爬取。
102
+ - 🛡️ **被阻止请求检测**:自动检测并重试被阻止的请求,支持自定义逻辑。
103
+ - 📦 **内置导出**:通过钩子和您自己的管道导出结果,或使用内置的JSON/JSONL,分别通过`result.items.to_json()`/`result.items.to_jsonl()`。
104
+
105
+ ### 支持Session的高级网站获取
106
+ - **HTTP请求**:使用`Fetcher`类进行快速和隐秘的HTTP请求。可以模拟浏览器的TLS fingerprint、标头并使用HTTP/3。
107
  - **动态加载**:通过`DynamicFetcher`类使用完整的浏览器自动化获取动态网站,支持Playwright的Chromium和Google Chrome。
108
+ - **反机器人绕过**:使用`StealthyFetcher`的高级隐秘功能和fingerprint伪装。可以轻松自动绕过所有类型的Cloudflare Turnstile/Interstitial。
109
+ - **Session管理**:使用`FetcherSession`、`StealthySession`和`DynamicSession`类实现持久化Session支持,用于跨请求的cookie和状态管理。
110
+ - **Proxy轮换**:内置`ProxyRotator`,支持轮询或自定义策略,适用于所有Session类型,并支持按请求覆盖Proxy。
111
+ - **域名屏蔽**:在基于浏览器的Fetcher中屏蔽对特定域名(及其子域名)的请求。
112
+ - **Async支持**:所有Fetcher和专用async Session类的完整async支持。
113
 
114
  ### 自适应抓取和AI集成
115
  - 🔄 **智能元素跟踪**:使用智能相似性算法在网站更改后重新定位元素。
116
  - 🎯 **智能灵活选择**:CSS选择器、XPath选择器、基于过滤器的搜索、文本搜索、正则表达式搜索等。
117
+ - 🔍 **查找相似元素**:自动定位与找到元素相似的元素。
118
+ - 🤖 **与AI一起使用的MCP服务器**:内置MCP服务器用于AI辅助Web Scraping和数据提取。MCP服务器具有强大的自定义功能,利用Scrapling在将内容传递给AI(Claude/Cursor等)之前提取目标内容,从而加快操作并通过最小化token使用来降低成本。([演示视频](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
119
 
120
  ### 高性能和经过实战测试的架构
121
  - 🚀 **闪电般快速**:优化性能超越大多数Python抓取库。
122
  - 🔋 **内存高效**:优化的数据结构和延迟加载,最小内存占用。
123
  - ⚡ **快速JSON序列化**:比标准库快10倍。
124
+ - 🏗️ **经过实战测试**:Scrapling不仅拥有92%的测试覆盖率和完整的类型提示覆盖率,而且在过去一年中每天被数百名Web Scraper使用。
125
 
126
+ ### 对开发者/Web Scraper友好的体验
127
+ - 🎯 **交互式Web Scraping Shell**:可选的内置IPython Shell,具有Scrapling集成、快捷方式和新工具,可加快Web Scraping脚本开发,例如将curl请求转换为Scrapling请求并在浏览器中查看请求结果。
128
  - 🚀 **直接从终端使用**:可选地,您可以使用Scrapling抓取URL而无需编写任何代码!
129
  - 🛠️ **丰富的导航API**:使用父级、兄弟级和子级导航方法进行高级DOM遍历。
130
  - 🧬 **增强的文本处理**:内置正则表达式、清理方法和优化的字符串操作。
131
  - 📝 **自动选择器生成**:为任何元素生成强大的CSS/XPath选择器。
132
  - 🔌 **熟悉的API**:类似于Scrapy/BeautifulSoup,使用与Scrapy/Parsel相同的伪元素。
133
+ - 📘 **完整的类型覆盖**:完整的类型提示,出色的IDE支持和代码补全。整个代码库在每次更改时都会自动使用**PyRight**和**MyPy**扫描。
134
  - 🔋 **现成的Docker镜像**:每次发布时,包含所有浏览器的Docker镜像会自动构建和推送。
135
 
136
  ## 入门
137
 
138
+ 让我们快速展示Scrapling的功能,无需深入了解。
139
+
140
  ### 基本用法
141
+ 支持Session的HTTP请求
142
  ```python
143
+ from scrapling.fetchers import Fetcher, FetcherSession
 
144
 
145
+ with FetcherSession(impersonate='chrome') as session: # 使用Chrome最新版本TLS fingerprint
 
146
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
147
+ quotes = page.css('.quote .text::text').getall()
148
 
149
  # 或使用一次性请求
150
  page = Fetcher.get('https://quotes.toscrape.com/')
151
+ quotes = page.css('.quote .text::text').getall()
152
+ ```
153
+ 高级隐秘模式
154
+ ```python
155
+ from scrapling.fetchers import StealthyFetcher, StealthySession
156
 
157
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # 保持浏览器打开直到完成
 
158
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
159
+ data = page.css('#padded_content a').getall()
160
 
161
  # 或使用一次性请求样式,为此请求打开浏览器,完成后关闭
162
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
163
+ data = page.css('#padded_content a').getall()
 
 
 
 
 
 
 
 
 
164
  ```
165
+ 完整的浏览器自动化
166
+ ```python
167
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
168
+
169
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # 保持浏览器打开直到完成
170
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
171
+ data = page.xpath('//span[@class="text"]/text()').getall() # 如果您偏好XPath选择器
172
 
173
+ # 或使用一次性请求样式,为此请求打开浏览器,完成后关闭
174
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
175
+ data = page.css('.quote .text::text').getall()
176
+ ```
177
+
178
+ ### Spider
179
+ 构建具有并发请求、多种Session类型和暂停/恢复功能的完整爬虫:
180
+ ```python
181
+ from scrapling.spiders import Spider, Request, Response
182
+
183
+ class QuotesSpider(Spider):
184
+ name = "quotes"
185
+ start_urls = ["https://quotes.toscrape.com/"]
186
+ concurrent_requests = 10
187
+
188
+ async def parse(self, response: Response):
189
+ for quote in response.css('.quote'):
190
+ yield {
191
+ "text": quote.css('.text::text').get(),
192
+ "author": quote.css('.author::text').get(),
193
+ }
194
+
195
+ next_page = response.css('.next a')
196
+ if next_page:
197
+ yield response.follow(next_page[0].attrib['href'])
198
+
199
+ result = QuotesSpider().start()
200
+ print(f"抓取了 {len(result.items)} 条引用")
201
+ result.items.to_json("quotes.json")
202
+ ```
203
+ 在单个Spider中使用多种Session类型:
204
+ ```python
205
+ from scrapling.spiders import Spider, Request, Response
206
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
207
+
208
+ class MultiSessionSpider(Spider):
209
+ name = "multi"
210
+ start_urls = ["https://example.com/"]
211
+
212
+ def configure_sessions(self, manager):
213
+ manager.add("fast", FetcherSession(impersonate="chrome"))
214
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
215
+
216
+ async def parse(self, response: Response):
217
+ for link in response.css('a::attr(href)').getall():
218
+ # 将受保护的页面路由到隐秘Session
219
+ if "protected" in link:
220
+ yield Request(link, sid="stealth")
221
+ else:
222
+ yield Request(link, sid="fast", callback=self.parse) # 显式callback
223
+ ```
224
+ 通过如下方式运行Spider来暂停和恢复长时间爬取,使用Checkpoint:
225
+ ```python
226
+ QuotesSpider(crawldir="./crawl_data").start()
227
+ ```
228
+ 按Ctrl+C优雅暂停——进度会自动保存。之后,当您再次启动Spider时,传递相同的`crawldir`,它将从上次停止的地方继续。
229
+
230
+ ### 高级解析与导航
231
+ ```python
232
+ from scrapling.fetchers import Fetcher
233
+
234
+ # 丰富的元素选择和导航
235
+ page = Fetcher.get('https://quotes.toscrape.com/')
236
+
237
+ # 使用多种选择方法获取引用
238
+ quotes = page.css('.quote') # CSS选择器
239
+ quotes = page.xpath('//div[@class="quote"]') # XPath
240
+ quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup风格
241
+ # 等同于
242
+ quotes = page.find_all('div', class_='quote')
243
+ quotes = page.find_all(['div'], class_='quote')
244
+ quotes = page.find_all(class_='quote') # 等等...
245
+ # 按文本内容查找元素
246
+ quotes = page.find_by_text('quote', tag='div')
247
+
248
+ # 高级导航
249
+ quote_text = page.css('.quote')[0].css('.text::text').get()
250
+ quote_text = page.css('.quote').css('.text::text').getall() # 链式选择器
251
+ first_quote = page.css('.quote')[0]
252
+ author = first_quote.next_sibling.css('.author::text')
253
+ parent_container = first_quote.parent
254
+
255
+ # 元素关系和相似性
256
+ similar_elements = first_quote.find_similar()
257
+ below_elements = first_quote.below_elements()
258
+ ```
259
+ 如果您不想获取网站,可以直接使用解析器,如下所示:
260
  ```python
261
+ from scrapling.parser import Selector
262
+
263
+ page = Selector("<html>...</html>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  ```
265
+ 用法完全相同!
266
 
267
+ ### Async Session管理示例
268
  ```python
269
+ import asyncio
270
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
271
+
272
+ async with FetcherSession(http3=True) as session: # `FetcherSession`是上下文感知的,可以在sync/async模式下工作
273
+ page1 = session.get('https://quotes.toscrape.com/')
 
 
 
 
274
  page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
275
 
276
+ # Async Session
277
  async with AsyncStealthySession(max_pages=2) as session:
278
  tasks = []
279
  urls = ['https://example.com/page1', 'https://example.com/page2']
280
+
281
  for url in urls:
282
  task = session.fetch(url)
283
  tasks.append(task)
284
+
285
  print(session.get_pool_stats()) # 可选 - 浏览器标签池的状态(忙/空闲/错误)
286
  results = await asyncio.gather(*tasks)
287
  print(session.get_pool_stats())
 
289
 
290
  ## CLI和交互式Shell
291
 
292
+ Scrapling包含强大的命令行界面:
293
 
294
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
295
 
296
+ 启动交互式Web Scraping Shell
297
  ```bash
298
  scrapling shell
299
  ```
 
306
  ```
307
 
308
  > [!NOTE]
309
+ > 还有许多其他功能,但我们希望保持此页面简洁,包括MCP服务器和交互式Web Scraping Shell。查看完整文档[这里](https://scrapling.readthedocs.io/en/latest/)
310
 
311
  ## 性能基准
312
 
313
+ Scrapling不仅功能强大——它还速度极快。以下基准测试将Scrapling的解析器与其他流行库的最新版本进行了比较。
314
 
315
  ### 文本提取速度测试(5000个嵌套元素)
316
 
317
+ | # | 库 | 时间(ms) | vs Scrapling |
318
+ |---|:-----------------:|:---------:|:------------:|
319
+ | 1 | Scrapling | 2.02 | 1.0x |
320
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
321
+ | 3 | Raw Lxml | 2.54 | 1.257x |
322
+ | 4 | PyQuery | 24.17 | ~12x |
323
+ | 5 | Selectolax | 82.63 | ~41x |
324
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
325
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
326
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
327
 
328
 
329
  ### 元素相似性和文本搜索性能
 
331
  Scrapling的自适应元素查找功能明显优于替代方案:
332
 
333
  | 库 | 时间(ms) | vs Scrapling |
334
+ |-------------|:---------:|:------------:|
335
+ | Scrapling | 2.39 | 1.0x |
336
+ | AutoScraper | 12.45 | 5.209x |
337
 
338
 
339
  > 所有基准测试代表100+次运行的平均值。请参阅[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)了解方法。
 
346
  pip install scrapling
347
  ```
348
 
349
+ 此安装仅包括解析器引擎及其依赖项,没有任何Fetcher或命令行依赖项。
350
 
351
  ### 可选依赖项
352
 
353
+ 1. 如果您要使用以下任何额外功能、Fetcher或它们的类,您将需要安装Fetcher的依赖项和它们的浏览器依赖项,如下所示:
354
  ```bash
355
  pip install "scrapling[fetchers]"
356
+
357
  scrapling install
358
  ```
359
 
360
+ 这会下载所有浏览器,以及它们的系统依赖项和fingerprint操作依赖项。
361
 
362
  2. 额外功能:
363
  - 安装MCP服务器功能:
364
  ```bash
365
  pip install "scrapling[ai]"
366
  ```
367
+ - 安装Shell功能(Web Scraping Shell和`extract`命令):
368
  ```bash
369
  pip install "scrapling[shell]"
370
  ```
 
403
  此项目包含改编自以下内容的代码:
404
  - Parsel(BSD许可证)——用于[translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)子模块
405
 
 
 
 
 
 
 
 
406
  ---
407
+ <div align="center"><small>由Karim Shoair用❤️设计和制作。</small></div><br>
docs/README_DE.md CHANGED
@@ -1,9 +1,14 @@
1
- <p align=center>
2
- <br>
3
- <a href="https://scrapling.readthedocs.io/en/latest/" target="_blank"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png" style="width: 50%; height: 100%;" alt="main poster"/></a>
4
- <br>
5
- <i><code>Einfaches, müheloses Web Scraping, wie es sein sollte!</code></i>
6
- </p>
 
 
 
 
 
7
  <p align="center">
8
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
9
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
@@ -24,46 +29,47 @@
24
  </p>
25
 
26
  <p align="center">
27
- <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
28
- Auswahlmethoden
29
- </a>
30
- ·
31
- <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/">
32
- Fetcher wählen
33
- </a>
34
- ·
35
- <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
36
- CLI
37
- </a>
38
- ·
39
- <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
40
- MCP-Modus
41
- </a>
42
- ·
43
- <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
44
- Migration von Beautifulsoup
45
- </a>
46
  </p>
47
 
48
- **Hören Sie auf, gegen Anti-Bot-Systeme zu kämpfen. Hören Sie auf, Selektoren nach jedem Website-Update neu zu schreiben.**
49
 
50
- Scrapling ist nicht nur eine weitere Web-Scraping-Bibliothek. Es ist die erste **adaptive** Scraping-Bibliothek, die von Website-Änderungen lernt und sich mit ihnen weiterentwickelt. Während andere Bibliotheken brechen, wenn Websites ihre Struktur aktualisieren, lokalisiert Scrapling Ihre Elemente automatisch neu und hält Ihre Scraper am Laufen.
51
 
52
- Für das moderne Web entwickelt, bietet Scrapling **seine eigene schnelle Parsing-Engine** und Fetcher, um alle Web-Scraping-Herausforderungen zu bewältigen, denen Sie begegnen oder begegnen werden. Von Web Scrapern für Web Scraper und normale Benutzer entwickelt, ist für jeden etwas dabei.
53
 
54
  ```python
55
- >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
56
- >> StealthyFetcher.adaptive = True
57
- # Holen Sie sich Website-Quellcode unter dem Radar!
58
- >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
59
- >> print(page.status)
60
- 200
61
- >> products = page.css('.product', auto_save=True) # Scrapen Sie Daten, die Website-Designänderungen überleben!
62
- >> # Später, wenn sich die Website-Struktur ändert, übergeben Sie `adaptive=True`
63
- >> products = page.css('.product', adaptive=True) # und Scrapling findet sie trotzdem!
 
 
 
 
 
 
 
 
 
 
64
  ```
65
 
66
- # Sponsoren
 
67
 
68
  <!-- sponsors -->
69
 
@@ -87,12 +93,23 @@ Für das moderne Web entwickelt, bietet Scrapling **seine eigene schnelle Parsin
87
 
88
  ## Hauptmerkmale
89
 
90
- ### Erweiterte Website-Abruf mit Sitzungsunterstützung
91
- - **HTTP-Anfragen**: Schnelle und heimliche HTTP-Anfragen mit der `Fetcher`-Klasse. Kann Browser-TLS-Fingerabdrücke, Header imitieren und HTTP3 verwenden.
92
- - **Dynamisches Laden**: Abrufen dynamischer Websites mit vollständiger Browser-Automatisierung über die `DynamicFetcher`-Klasse, die Playwrights Chromium und Google Chrome unterstützt.
93
- - **Anti-Bot-Umgehung**: Erweiterte Stealth-Fähigkeiten mit `StealthyFetcher` und Fingerabdruck-Spoofing. Kann alle Arten von Cloudflares Turnstile/Interstitial einfach mit Automatisierung umgehen.
94
- - **Sitzungsverwaltung**: Persistente Sitzungsunterstützung mit den Klassen `FetcherSession`, `StealthySession` und `DynamicSession` für Cookie- und Zustandsverwaltung über Anfragen hinweg.
95
- - **Async-Unterstützung**: Vollständige Async-Unterstützung über alle Fetcher und dedizierte Async-Sitzungsklassen hinweg.
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  ### Adaptives Scraping & KI-Integration
98
  - 🔄 **Intelligente Element-Verfolgung**: Elemente nach Website-Änderungen mit intelligenten Ähnlichkeitsalgorithmen neu lokalisieren.
@@ -106,103 +123,165 @@ Für das moderne Web entwickelt, bietet Scrapling **seine eigene schnelle Parsin
106
  - ⚡ **Schnelle JSON-Serialisierung**: 10x schneller als die Standardbibliothek.
107
  - 🏗️ **Praxiserprobt**: Scrapling hat nicht nur eine Testabdeckung von 92% und eine vollständige Type-Hints-Abdeckung, sondern wird seit dem letzten Jahr täglich von Hunderten von Web Scrapern verwendet.
108
 
109
- ### Entwickler/Web-Scraper-freundliche Erfahrung
110
  - 🎯 **Interaktive Web-Scraping-Shell**: Optionale integrierte IPython-Shell mit Scrapling-Integration, Shortcuts und neuen Tools zur Beschleunigung der Web-Scraping-Skriptentwicklung, wie das Konvertieren von Curl-Anfragen in Scrapling-Anfragen und das Anzeigen von Anfrageergebnissen in Ihrem Browser.
111
  - 🚀 **Direkt vom Terminal aus verwenden**: Optional können Sie Scrapling verwenden, um eine URL zu scrapen, ohne eine einzige Codezeile zu schreiben!
112
  - 🛠️ **Umfangreiche Navigations-API**: Erweiterte DOM-Traversierung mit Eltern-, Geschwister- und Kind-Navigationsmethoden.
113
  - 🧬 **Verbesserte Textverarbeitung**: Integrierte Regex, Bereinigungsmethoden und optimierte String-Operationen.
114
  - 📝 **Automatische Selektorgenerierung**: Robuste CSS/XPath-Selektoren für jedes Element generieren.
115
  - 🔌 **Vertraute API**: Ähnlich wie Scrapy/BeautifulSoup mit denselben Pseudo-Elementen, die in Scrapy/Parsel verwendet werden.
116
- - 📘 **Vollständige Typabdeckung**: Vollständige Type Hints für hervorragende IDE-Unterstützung und Code-Vervollständigung.
117
  - 🔋 **Fertiges Docker-Image**: Mit jeder Veröffentlichung wird automatisch ein Docker-Image erstellt und gepusht, das alle Browser enthält.
118
 
119
  ## Erste Schritte
120
 
 
 
121
  ### Grundlegende Verwendung
 
122
  ```python
123
- from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher
124
- from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
125
 
126
- # HTTP-Anfragen mit Sitzungsunterstützung
127
- with FetcherSession(impersonate='chrome') as session: # Verwenden Sie die neueste Version von Chromes TLS-Fingerabdruck
128
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
129
- quotes = page.css('.quote .text::text')
130
 
131
- # Oder verwenden Sie einmalige Anfragen
132
  page = Fetcher.get('https://quotes.toscrape.com/')
133
- quotes = page.css('.quote .text::text')
 
 
 
 
134
 
135
- # Erweiterter Stealth-Modus (Browser offen halten, bis Sie fertig sind)
136
- with StealthySession(headless=True, solve_cloudflare=True) as session:
137
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
138
- data = page.css('#padded_content a')
139
 
140
- # Oder verwenden Sie den einmaligen Anfragenstil, öffnet den Browser für diese Anfrage und schließt ihn dann nach Abschluss
141
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
142
- data = page.css('#padded_content a')
143
-
144
- # Vollständige Browser-Automatisierung (Browser offen halten, bis Sie fertig sind)
145
- with DynamicSession(headless=True) as session:
146
- page = session.fetch('https://quotes.toscrape.com/', network_idle=True)
147
- quotes = page.css('.quote .text::text')
148
-
149
- # Oder verwenden Sie den einmaligen Anfragenstil
150
- page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True)
151
- quotes = page.css('.quote .text::text')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  ```
 
 
 
 
 
153
 
154
- ### Elementauswahl
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  ```python
156
- # CSS-Selektoren
157
- page.css('a::text') # Text extrahieren
158
- page.css('a::attr(href)') # Attribute extrahieren
159
- page.css('a', recursive=False) # Nur direkte Elemente
160
- page.css('a', auto_save=True) # Elementpositionen automatisch speichern
161
-
162
- # XPath
163
- page.xpath('//a/text()')
164
-
165
- # Flexible Suche
166
- page.find_by_text('Python', first_match=True) # Nach Text suchen
167
- page.find_by_regex(r'\d{4}') # Nach Regex-Muster suchen
168
- page.find('div', {'class': 'container'}) # Nach Attributen suchen
169
-
170
- # Navigation
171
- element.parent # Elternelement abrufen
172
- element.next_sibling # Nächstes Geschwister abrufen
173
- element.children # Kindelemente abrufen
174
-
175
- # Ähnliche Elemente
176
- similar = page.get_similar(element) # Ähnliche Elemente finden
177
-
178
- # Adaptives Scraping
179
- saved_elements = page.css('.product', auto_save=True)
180
- # Später, wenn sich die Website ändert:
181
- page.css('.product', adaptive=True) # Elemente mithilfe gespeicherter Positionen finden
182
  ```
 
183
 
184
- ### Sitzungsverwendung
185
  ```python
186
- from scrapling.fetchers import FetcherSession, AsyncFetcherSession
187
-
188
- # Synchrone Sitzung
189
- with FetcherSession() as session:
190
- # Cookies werden automatisch beibehalten
191
- page1 = session.get('https://quotes.toscrape.com/login')
192
- page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'})
193
-
194
- # Bei Bedarf Browser-Fingerabdruck wechseln
195
  page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
196
 
197
- # Async-Sitzungsverwendung
198
  async with AsyncStealthySession(max_pages=2) as session:
199
  tasks = []
200
  urls = ['https://example.com/page1', 'https://example.com/page2']
201
-
202
  for url in urls:
203
  task = session.fetch(url)
204
  tasks.append(task)
205
-
206
  print(session.get_pool_stats()) # Optional - Der Status des Browser-Tab-Pools (beschäftigt/frei/Fehler)
207
  results = await asyncio.gather(*tasks)
208
  print(session.get_pool_stats())
@@ -210,7 +289,7 @@ async with AsyncStealthySession(max_pages=2) as session:
210
 
211
  ## CLI & Interaktive Shell
212
 
213
- Scrapling v0.3 enthält eine leistungsstarke Befehlszeilenschnittstelle:
214
 
215
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
216
 
@@ -218,7 +297,7 @@ Interaktive Web-Scraping-Shell starten
218
  ```bash
219
  scrapling shell
220
  ```
221
- Seiten direkt ohne Programmierung in eine Datei extrahieren (Extrahiert standardmäßig den Inhalt im `body`-Tag). Wenn die Ausgabedatei mit `.txt` endet, wird der Textinhalt des Ziels extrahiert. Wenn sie mit `.md` endet, ist es eine Markdown-Darstellung des HTML-Inhalts; wenn sie mit `.html` endet, ist es der HTML-Inhalt selbst.
222
  ```bash
223
  scrapling extract get 'https://example.com' content.md
224
  scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Alle Elemente, die dem CSS-Selektor '#fromSkipToProducts' entsprechen
@@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
227
  ```
228
 
229
  > [!NOTE]
230
- > Es gibt viele zusätzliche Funktionen, aber wir möchten diese Seite prägnant halten, wie den MCP-Server und die interaktive Web-Scraping-Shell. Schauen Sie sich die vollständige Dokumentation [hier](https://scrapling.readthedocs.io/en/latest/) an
231
 
232
  ## Leistungsbenchmarks
233
 
234
- Scrapling ist nicht nur leistungsstark es ist auch blitzschnell, und die Updates seit Version 0.3 haben außergewöhnliche Leistungsverbesserungen bei allen Operationen gebracht. Die folgenden Benchmarks vergleichen den Parser von Scrapling mit anderen beliebten Bibliotheken.
235
 
236
  ### Textextraktions-Geschwindigkeitstest (5000 verschachtelte Elemente)
237
 
238
- | # | Bibliothek | Zeit (ms) | vs Scrapling |
239
  |---|:-----------------:|:---------:|:------------:|
240
- | 1 | Scrapling | 1.99 | 1.0x |
241
- | 2 | Parsel/Scrapy | 2.01 | 1.01x |
242
- | 3 | Raw Lxml | 2.5 | 1.256x |
243
- | 4 | PyQuery | 22.93 | ~11.5x |
244
- | 5 | Selectolax | 80.57 | ~40.5x |
245
- | 6 | BS4 with Lxml | 1541.37 | ~774.6x |
246
- | 7 | MechanicalSoup | 1547.35 | ~777.6x |
247
- | 8 | BS4 with html5lib | 3410.58 | ~1713.9x |
248
 
249
 
250
  ### Element-Ähnlichkeit & Textsuche-Leistung
@@ -253,8 +332,8 @@ Scraplings adaptive Element-Finding-Fähigkeiten übertreffen Alternativen deutl
253
 
254
  | Bibliothek | Zeit (ms) | vs Scrapling |
255
  |-------------|:---------:|:------------:|
256
- | Scrapling | 2.46 | 1.0x |
257
- | AutoScraper | 13.3 | 5.407x |
258
 
259
 
260
  > Alle Benchmarks stellen Durchschnittswerte von über 100 Durchläufen dar. Siehe [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) für die Methodik.
@@ -267,18 +346,18 @@ Scrapling erfordert Python 3.10 oder höher:
267
  pip install scrapling
268
  ```
269
 
270
- Ab v0.3.2 enthält diese Installation nur die Parser-Engine und ihre Abhängigkeiten, ohne Fetcher oder Kommandozeilenabhängigkeiten.
271
 
272
  ### Optionale Abhängigkeiten
273
 
274
  1. Wenn Sie eine der folgenden zusätzlichen Funktionen, die Fetcher oder ihre Klassen verwenden möchten, müssen Sie die Abhängigkeiten der Fetcher und ihre Browser-Abhängigkeiten wie folgt installieren:
275
  ```bash
276
  pip install "scrapling[fetchers]"
277
-
278
  scrapling install
279
  ```
280
 
281
- Dies lädt alle Browser zusammen mit ihren Systemabhängigkeiten und Fingerabdruck-Manipulationsabhängigkeiten herunter.
282
 
283
  2. Zusätzliche Funktionen:
284
  - MCP-Server-Funktion installieren:
@@ -322,14 +401,7 @@ Diese Arbeit ist unter der BSD-3-Clause-Lizenz lizenziert.
322
  ## Danksagungen
323
 
324
  Dieses Projekt enthält angepassten Code von:
325
- - Parsel (BSD-Lizenz) Verwendet für [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)-Submodul
326
-
327
- ## Dank und Referenzen
328
-
329
- - [Daijros](https://github.com/daijro) brillante Arbeit an [BrowserForge](https://github.com/daijro/browserforge) und [Camoufox](https://github.com/daijro/camoufox)
330
- - [Vinyzus](https://github.com/Vinyzu) brillante Arbeit an [Botright](https://github.com/Vinyzu/Botright) und [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
331
- - [brotector](https://github.com/kaliiiiiiiiii/brotector) für Browser-Erkennungs-Umgehungstechniken
332
- - [fakebrowser](https://github.com/kkoooqq/fakebrowser) und [BotBrowser](https://github.com/botswin/BotBrowser) für Fingerprinting-Forschung
333
 
334
  ---
335
- <div align="center"><small>Entworfen und hergestellt mit ❤️ von Karim Shoair.</small></div><br>
 
1
+ <h1 align="center">
2
+ <a href="https://scrapling.readthedocs.io">
3
+ <picture>
4
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
5
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
6
+ </picture>
7
+ </a>
8
+ <br>
9
+ <small>Effortless Web Scraping for the Modern Web</small>
10
+ </h1>
11
+
12
  <p align="center">
13
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
14
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
 
29
  </p>
30
 
31
  <p align="center">
32
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Auswahlmethoden</strong></a>
33
+ &middot;
34
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Einen Fetcher wählen</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCP-Modus</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>Migration von Beautifulsoup</strong></a>
 
 
 
 
 
 
 
 
 
 
41
  </p>
42
 
43
+ Scrapling ist ein adaptives Web-Scraping-Framework, das alles abdeckt -- von einer einzelnen Anfrage bis hin zu einem umfassenden Crawl.
44
 
45
+ Sein Parser lernt aus Website-Änderungen und lokalisiert Ihre Elemente automatisch neu, wenn sich Seiten aktualisieren. Seine Fetcher umgehen Anti-Bot-Systeme wie Cloudflare Turnstile direkt ab Werk. Und sein Spider-Framework ermöglicht es Ihnen, auf parallele Multi-Session-Crawls mit Pause & Resume und automatischer Proxy-Rotation hochzuskalieren -- alles in wenigen Zeilen Python. Eine Bibliothek, keine Kompromisse.
46
 
47
+ Blitzschnelle Crawls mit Echtzeit-Statistiken und Streaming. Von Web Scrapern für Web Scraper und normale Benutzer entwickelt – es ist für jeden etwas dabei.
48
 
49
  ```python
50
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
51
+ StealthyFetcher.adaptive = True
52
+ page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Website unbemerkt abrufen!
53
+ products = page.css('.product', auto_save=True) # Daten scrapen, die Website-Designänderungen überleben!
54
+ products = page.css('.product', adaptive=True) # Später, wenn sich die Website-Struktur ändert, `adaptive=True` übergeben, um sie zu finden!
55
+ ```
56
+ Oder auf vollständige Crawls hochskalieren
57
+ ```python
58
+ from scrapling.spiders import Spider, Response
59
+
60
+ class MySpider(Spider):
61
+ name = "demo"
62
+ start_urls = ["https://example.com/"]
63
+
64
+ async def parse(self, response: Response):
65
+ for item in response.css('.product'):
66
+ yield {"title": item.css('h2::text').get()}
67
+
68
+ MySpider().start()
69
  ```
70
 
71
+
72
+ # Sponsoren
73
 
74
  <!-- sponsors -->
75
 
 
93
 
94
  ## Hauptmerkmale
95
 
96
+ ### Spiders -- Ein vollständiges Crawling-Framework
97
+ - 🕷️ **Scrapy-ähnliche Spider-API**: Definieren Sie Spiders mit `start_urls`, async `parse` Callbacks und `Request`/`Response`-Objekten.
98
+ - **Paralleles Crawling**: Konfigurierbare Parallelitätslimits, domainbezogenes Throttling und Download-Verzögerungen.
99
+ - 🔄 **Multi-Session-Unterstützung**: Einheitliche Schnittstelle für HTTP-Anfragen und heimliche Headless-Browser in einem einzigen Spider -- leiten Sie Anfragen per ID an verschiedene Sessions weiter.
100
+ - 💾 **Pause & Resume**: Checkpoint-basierte Crawl-Persistenz. Drücken Sie Strg+C für ein kontrolliertes Herunterfahren; starten Sie neu, um dort fortzufahren, wo Sie aufgehört haben.
101
+ - 📡 **Streaming-Modus**: Gescrapte Elemente in Echtzeit streamen über `async for item in spider.stream()` mit Echtzeit-Statistiken -- ideal für UI, Pipelines und lang laufende Crawls.
102
+ - 🛡️ **Erkennung blockierter Anfragen**: Automatische Erkennung und Wiederholung blockierter Anfragen mit anpassbarer Logik.
103
+ - 📦 **Integrierter Export**: Ergebnisse über Hooks und Ihre eigene Pipeline oder den integrierten JSON/JSONL-Export mit `result.items.to_json()` / `result.items.to_jsonl()` exportieren.
104
+
105
+ ### Erweitertes Website-Abrufen mit Session-Unterstützung
106
+ - **HTTP-Anfragen**: Schnelle und heimliche HTTP-Anfragen mit der `Fetcher`-Klasse. Kann Browser-TLS-Fingerprints und Header imitieren und HTTP/3 verwenden.
107
+ - **Dynamisches Laden**: Dynamische Websites mit vollständiger Browser-Automatisierung über die `DynamicFetcher`-Klasse abrufen, die Playwrights Chromium und Google Chrome unterstützt.
108
+ - **Anti-Bot-Umgehung**: Erweiterte Stealth-Fähigkeiten mit `StealthyFetcher` und Fingerprint-Spoofing. Kann alle Arten von Cloudflares Turnstile/Interstitial einfach mit Automatisierung umgehen.
109
+ - **Session-Verwaltung**: Persistente Session-Unterstützung mit den Klassen `FetcherSession`, `StealthySession` und `DynamicSession` für Cookie- und Zustandsverwaltung über Anfragen hinweg.
110
+ - **Proxy-Rotation**: Integrierter `ProxyRotator` mit zyklischen oder benutzerdefinierten Rotationsstrategien über alle Session-Typen hinweg, plus Proxy-Überschreibungen pro Anfrage.
111
+ - **Domain-Blockierung**: Anfragen an bestimmte Domains (und deren Subdomains) in browserbasierten Fetchern blockieren.
112
+ - **Async-Unterstützung**: Vollständige async-Unterstützung über alle Fetcher und dedizierte async Session-Klassen hinweg.
113
 
114
  ### Adaptives Scraping & KI-Integration
115
  - 🔄 **Intelligente Element-Verfolgung**: Elemente nach Website-Änderungen mit intelligenten Ähnlichkeitsalgorithmen neu lokalisieren.
 
123
  - ⚡ **Schnelle JSON-Serialisierung**: 10x schneller als die Standardbibliothek.
124
  - 🏗️ **Praxiserprobt**: Scrapling hat nicht nur eine Testabdeckung von 92% und eine vollständige Type-Hints-Abdeckung, sondern wird seit dem letzten Jahr täglich von Hunderten von Web Scrapern verwendet.
125
 
126
+ ### Entwickler-/Web-Scraper-freundliche Erfahrung
127
  - 🎯 **Interaktive Web-Scraping-Shell**: Optionale integrierte IPython-Shell mit Scrapling-Integration, Shortcuts und neuen Tools zur Beschleunigung der Web-Scraping-Skriptentwicklung, wie das Konvertieren von Curl-Anfragen in Scrapling-Anfragen und das Anzeigen von Anfrageergebnissen in Ihrem Browser.
128
  - 🚀 **Direkt vom Terminal aus verwenden**: Optional können Sie Scrapling verwenden, um eine URL zu scrapen, ohne eine einzige Codezeile zu schreiben!
129
  - 🛠️ **Umfangreiche Navigations-API**: Erweiterte DOM-Traversierung mit Eltern-, Geschwister- und Kind-Navigationsmethoden.
130
  - 🧬 **Verbesserte Textverarbeitung**: Integrierte Regex, Bereinigungsmethoden und optimierte String-Operationen.
131
  - 📝 **Automatische Selektorgenerierung**: Robuste CSS/XPath-Selektoren für jedes Element generieren.
132
  - 🔌 **Vertraute API**: Ähnlich wie Scrapy/BeautifulSoup mit denselben Pseudo-Elementen, die in Scrapy/Parsel verwendet werden.
133
+ - 📘 **Vollständige Typabdeckung**: Vollständige Type Hints für hervorragende IDE-Unterstützung und Code-Vervollständigung. Die gesamte Codebasis wird bei jeder Änderung automatisch mit **PyRight** und **MyPy** gescannt.
134
  - 🔋 **Fertiges Docker-Image**: Mit jeder Veröffentlichung wird automatisch ein Docker-Image erstellt und gepusht, das alle Browser enthält.
135
 
136
  ## Erste Schritte
137
 
138
+ Hier ein kurzer Überblick über das, was Scrapling kann, ohne zu sehr ins Detail zu gehen.
139
+
140
  ### Grundlegende Verwendung
141
+ HTTP-Anfragen mit Session-Unterstützung
142
  ```python
143
+ from scrapling.fetchers import Fetcher, FetcherSession
 
144
 
145
+ with FetcherSession(impersonate='chrome') as session: # Neueste Version von Chromes TLS-Fingerprint verwenden
 
146
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
147
+ quotes = page.css('.quote .text::text').getall()
148
 
149
+ # Oder einmalige Anfragen verwenden
150
  page = Fetcher.get('https://quotes.toscrape.com/')
151
+ quotes = page.css('.quote .text::text').getall()
152
+ ```
153
+ Erweiterter Stealth-Modus
154
+ ```python
155
+ from scrapling.fetchers import StealthyFetcher, StealthySession
156
 
157
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # Browser offen halten, bis Sie fertig sind
 
158
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
159
+ data = page.css('#padded_content a').getall()
160
 
161
+ # Oder einmaligen Anfragenstil verwenden: öffnet den Browser für diese Anfrage und schließt ihn nach Abschluss
162
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
163
+ data = page.css('#padded_content a').getall()
164
+ ```
165
+ Vollständige Browser-Automatisierung
166
+ ```python
167
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
168
+
169
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Browser offen halten, bis Sie fertig sind
170
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
171
+ data = page.xpath('//span[@class="text"]/text()').getall() # XPath-Selektor, falls bevorzugt
172
+
173
+ # Oder einmaligen Anfragenstil verwenden: öffnet den Browser für diese Anfrage und schließt ihn nach Abschluss
174
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
175
+ data = page.css('.quote .text::text').getall()
176
+ ```
177
+
178
+ ### Spiders
179
+ Vollständige Crawler mit parallelen Anfragen, mehreren Session-Typen und Pause & Resume erstellen:
180
+ ```python
181
+ from scrapling.spiders import Spider, Request, Response
182
+
183
+ class QuotesSpider(Spider):
184
+ name = "quotes"
185
+ start_urls = ["https://quotes.toscrape.com/"]
186
+ concurrent_requests = 10
187
+
188
+ async def parse(self, response: Response):
189
+ for quote in response.css('.quote'):
190
+ yield {
191
+ "text": quote.css('.text::text').get(),
192
+ "author": quote.css('.author::text').get(),
193
+ }
194
+
195
+ next_page = response.css('.next a')
196
+ if next_page:
197
+ yield response.follow(next_page[0].attrib['href'])
198
+
199
+ result = QuotesSpider().start()
200
+ print(f"{len(result.items)} Zitate gescrapt")
201
+ result.items.to_json("quotes.json")
202
+ ```
203
+ Mehrere Session-Typen in einem einzigen Spider verwenden:
204
+ ```python
205
+ from scrapling.spiders import Spider, Request, Response
206
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
207
+
208
+ class MultiSessionSpider(Spider):
209
+ name = "multi"
210
+ start_urls = ["https://example.com/"]
211
+
212
+ def configure_sessions(self, manager):
213
+ manager.add("fast", FetcherSession(impersonate="chrome"))
214
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
215
+
216
+ async def parse(self, response: Response):
217
+ for link in response.css('a::attr(href)').getall():
218
+ # Geschützte Seiten über die Stealth-Session leiten
219
+ if "protected" in link:
220
+ yield Request(link, sid="stealth")
221
+ else:
222
+ yield Request(link, sid="fast", callback=self.parse) # Expliziter Callback
223
+ ```
224
+ Lange Crawls mit Checkpoints pausieren und fortsetzen, indem Sie den Spider so starten:
225
+ ```python
226
+ QuotesSpider(crawldir="./crawl_data").start()
227
  ```
228
+ Drücken Sie Strg+C, um kontrolliert zu pausieren -- der Fortschritt wird automatisch gespeichert. Wenn Sie den Spider später erneut starten, übergeben Sie dasselbe `crawldir`, und er setzt dort fort, wo er aufgehört hat.
229
+
230
+ ### Erweitertes Parsing & Navigation
231
+ ```python
232
+ from scrapling.fetchers import Fetcher
233
 
234
+ # Umfangreiche Elementauswahl und Navigation
235
+ page = Fetcher.get('https://quotes.toscrape.com/')
236
+
237
+ # Zitate mit verschiedenen Auswahlmethoden abrufen
238
+ quotes = page.css('.quote') # CSS-Selektor
239
+ quotes = page.xpath('//div[@class="quote"]') # XPath
240
+ quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup-Stil
241
+ # Gleich wie
242
+ quotes = page.find_all('div', class_='quote')
243
+ quotes = page.find_all(['div'], class_='quote')
244
+ quotes = page.find_all(class_='quote') # und so weiter...
245
+ # Element nach Textinhalt finden
246
+ quotes = page.find_by_text('quote', tag='div')
247
+
248
+ # Erweiterte Navigation
249
+ quote_text = page.css('.quote')[0].css('.text::text').get()
250
+ quote_text = page.css('.quote').css('.text::text').getall() # Verkettete Selektoren
251
+ first_quote = page.css('.quote')[0]
252
+ author = first_quote.next_sibling.css('.author::text')
253
+ parent_container = first_quote.parent
254
+
255
+ # Elementbeziehungen und Ähnlichkeit
256
+ similar_elements = first_quote.find_similar()
257
+ below_elements = first_quote.below_elements()
258
+ ```
259
+ Sie können den Parser direkt verwenden, wenn Sie keine Websites abrufen möchten, wie unten gezeigt:
260
  ```python
261
+ from scrapling.parser import Selector
262
+
263
+ page = Selector("<html>...</html>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  ```
265
+ Und es funktioniert genau auf die gleiche Weise!
266
 
267
+ ### Beispiele für async Session-Verwaltung
268
  ```python
269
+ import asyncio
270
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
271
+
272
+ async with FetcherSession(http3=True) as session: # `FetcherSession` ist kontextbewusst und kann sowohl in sync- als auch in async-Mustern arbeiten
273
+ page1 = session.get('https://quotes.toscrape.com/')
 
 
 
 
274
  page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
275
 
276
+ # Async-Session-Verwendung
277
  async with AsyncStealthySession(max_pages=2) as session:
278
  tasks = []
279
  urls = ['https://example.com/page1', 'https://example.com/page2']
280
+
281
  for url in urls:
282
  task = session.fetch(url)
283
  tasks.append(task)
284
+
285
  print(session.get_pool_stats()) # Optional - Der Status des Browser-Tab-Pools (beschäftigt/frei/Fehler)
286
  results = await asyncio.gather(*tasks)
287
  print(session.get_pool_stats())
 
289
 
290
  ## CLI & Interaktive Shell
291
 
292
+ Scrapling enthält eine leistungsstarke Befehlszeilenschnittstelle:
293
 
294
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
295
 
 
297
  ```bash
298
  scrapling shell
299
  ```
300
+ Seiten direkt ohne Programmierung in eine Datei extrahieren (extrahiert standardmäßig den Inhalt im `body`-Tag). Wenn die Ausgabedatei mit `.txt` endet, wird der Textinhalt des Ziels extrahiert. Wenn sie mit `.md` endet, ist es eine Markdown-Darstellung des HTML-Inhalts; wenn sie mit `.html` endet, ist es der HTML-Inhalt selbst.
301
  ```bash
302
  scrapling extract get 'https://example.com' content.md
303
  scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Alle Elemente, die dem CSS-Selektor '#fromSkipToProducts' entsprechen
 
306
  ```
307
 
308
  > [!NOTE]
309
+ > Es gibt viele zusätzliche Funktionen, aber wir möchten diese Seite prägnant halten, einschließlich des MCP-Servers und der interaktiven Web-Scraping-Shell. Schauen Sie sich die vollständige Dokumentation [hier](https://scrapling.readthedocs.io/en/latest/) an
310
 
311
  ## Leistungsbenchmarks
312
 
313
+ Scrapling ist nicht nur leistungsstark -- es ist auch blitzschnell. Die folgenden Benchmarks vergleichen Scraplings Parser mit den neuesten Versionen anderer beliebter Bibliotheken.
314
 
315
  ### Textextraktions-Geschwindigkeitstest (5000 verschachtelte Elemente)
316
 
317
+ | # | Bibliothek | Zeit (ms) | vs Scrapling |
318
  |---|:-----------------:|:---------:|:------------:|
319
+ | 1 | Scrapling | 2.02 | 1.0x |
320
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
321
+ | 3 | Raw Lxml | 2.54 | 1.257x |
322
+ | 4 | PyQuery | 24.17 | ~12x |
323
+ | 5 | Selectolax | 82.63 | ~41x |
324
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
325
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
326
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
327
 
328
 
329
  ### Element-Ähnlichkeit & Textsuche-Leistung
 
332
 
333
  | Bibliothek | Zeit (ms) | vs Scrapling |
334
  |-------------|:---------:|:------------:|
335
+ | Scrapling | 2.39 | 1.0x |
336
+ | AutoScraper | 12.45 | 5.209x |
337
 
338
 
339
  > Alle Benchmarks stellen Durchschnittswerte von über 100 Durchläufen dar. Siehe [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) für die Methodik.
 
346
  pip install scrapling
347
  ```
348
 
349
+ Diese Installation enthält nur die Parser-Engine und ihre Abhängigkeiten, ohne Fetcher oder Kommandozeilenabhängigkeiten.
350
 
351
  ### Optionale Abhängigkeiten
352
 
353
  1. Wenn Sie eine der folgenden zusätzlichen Funktionen, die Fetcher oder ihre Klassen verwenden möchten, müssen Sie die Abhängigkeiten der Fetcher und ihre Browser-Abhängigkeiten wie folgt installieren:
354
  ```bash
355
  pip install "scrapling[fetchers]"
356
+
357
  scrapling install
358
  ```
359
 
360
+ Dies lädt alle Browser zusammen mit ihren Systemabhängigkeiten und Fingerprint-Manipulationsabhängigkeiten herunter.
361
 
362
  2. Zusätzliche Funktionen:
363
  - MCP-Server-Funktion installieren:
 
401
  ## Danksagungen
402
 
403
  Dieses Projekt enthält angepassten Code von:
404
+ - Parsel (BSD-Lizenz) -- Verwendet für das [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)-Submodul
 
 
 
 
 
 
 
405
 
406
  ---
407
+ <div align="center"><small>Entworfen und hergestellt mit ❤️ von Karim Shoair.</small></div><br>
docs/README_ES.md CHANGED
@@ -1,9 +1,14 @@
1
- <p align=center>
2
- <br>
3
- <a href="https://scrapling.readthedocs.io/en/latest/" target="_blank"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png" style="width: 50%; height: 100%;" alt="main poster"/></a>
4
- <br>
5
- <i><code>¡Web Scraping fácil y sin esfuerzo como debería ser!</code></i>
6
- </p>
 
 
 
 
 
7
  <p align="center">
8
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
9
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
@@ -24,46 +29,47 @@
24
  </p>
25
 
26
  <p align="center">
27
- <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
28
- Métodos de selección
29
- </a>
30
- ·
31
- <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/">
32
- Elegir un fetcher
33
- </a>
34
- ·
35
- <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
36
- CLI
37
- </a>
38
- ·
39
- <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
40
- Modo MCP
41
- </a>
42
- ·
43
- <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
44
- Migrar desde Beautifulsoup
45
- </a>
46
  </p>
47
 
48
- **Deja de luchar contra sistemas anti-bot. Deja de reescribir selectores después de cada actualización del sitio web.**
49
 
50
- Scrapling no es solo otra biblioteca de Web Scraping. Es la primera biblioteca de scraping **adaptativa** que aprende de los cambios de los sitios web y evoluciona con ellos. Mientras que otras bibliotecas se rompen cuando los sitios web actualizan su estructura, Scrapling relocaliza automáticamente tus elementos y mantiene tus scrapers funcionando.
51
 
52
- Construido para la Web moderna, Scrapling presenta **su propio motor de análisis rápido** y fetchers para manejar todos los desafíos de Web Scraping que enfrentas o enfrentarás. Construido por Web Scrapers para Web Scrapers y usuarios regulares, hay algo para todos.
53
 
54
  ```python
55
- >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
56
- >> StealthyFetcher.adaptive = True
57
- # ¡Obtén el código fuente de sitios web bajo el radar!
58
- >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
59
- >> print(page.status)
60
- 200
61
- >> products = page.css('.product', auto_save=True) # ¡Extrae datos que sobreviven a cambios de diseño del sitio web!
62
- >> # Más tarde, si la estructura del sitio web cambia, pasa `adaptive=True`
63
- >> products = page.css('.product', adaptive=True) # ¡y Scrapling aún los encuentra!
64
  ```
 
 
 
65
 
66
- # Patrocinadores
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  <!-- sponsors -->
69
 
@@ -87,24 +93,35 @@ Construido para la Web moderna, Scrapling presenta **su propio motor de análisi
87
 
88
  ## Características Principales
89
 
90
- ### Obtención Avanzada de Sitios Web con Soporte de Sesión
91
- - **Solicitudes HTTP**: Solicitudes HTTP rápidas y sigilosas con la clase `Fetcher`. Puede imitar la huella TLS de los navegadores, encabezados y usar HTTP3.
 
 
 
 
 
 
 
 
 
92
  - **Carga Dinámica**: Obtén sitios web dinámicos con automatización completa del navegador a través de la clase `DynamicFetcher` compatible con Chromium de Playwright y Google Chrome.
93
- - **Evasión Anti-bot**: Capacidades de sigilo avanzadas con `StealthyFetcher` y falsificación de huellas digitales. Puede evadir fácilmente todos los tipos de Turnstile/Interstitial de Cloudflare con automatización.
94
- - **Gestión de Sesión**: Soporte de sesión persistente con las clases `FetcherSession`, `StealthySession` y `DynamicSession` para la gestión de cookies y estado entre solicitudes.
 
 
95
  - **Soporte Async**: Soporte async completo en todos los fetchers y clases de sesión async dedicadas.
96
 
97
  ### Scraping Adaptativo e Integración con IA
98
  - 🔄 **Seguimiento Inteligente de Elementos**: Relocaliza elementos después de cambios en el sitio web usando algoritmos inteligentes de similitud.
99
  - 🎯 **Selección Flexible Inteligente**: Selectores CSS, selectores XPath, búsqueda basada en filtros, búsqueda de texto, búsqueda regex y más.
100
  - 🔍 **Encontrar Elementos Similares**: Localiza automáticamente elementos similares a los elementos encontrados.
101
- - 🤖 **Servidor MCP para usar con IA**: Servidor MCP integrado para Web Scraping asistido por IA y extracción de datos. El servidor MCP presenta capacidades poderosas y personalizadas que aprovechan Scrapling para extraer contenido específico antes de pasarlo a la IA (Claude/Cursor/etc), acelerando así las operaciones y reduciendo costos al minimizar el uso de tokens. ([video demo](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
102
 
103
  ### Arquitectura de Alto Rendimiento y Probada en Batalla
104
- - 🚀 **Ultrarrápido**: Rendimiento optimizado que supera a la mayoría de las bibliotecas de scraping de Python.
105
  - 🔋 **Eficiente en Memoria**: Estructuras de datos optimizadas y carga diferida para una huella de memoria mínima.
106
  - ⚡ **Serialización JSON Rápida**: 10 veces más rápido que la biblioteca estándar.
107
- - 🏗️ **Probado en batalla**: Scrapling no solo tiene una cobertura de prueba del 92% y cobertura completa de type hints, sino que ha sido utilizado diariamente por cientos de Web Scrapers durante el último año.
108
 
109
  ### Experiencia Amigable para Desarrolladores/Web Scrapers
110
  - 🎯 **Shell Interactivo de Web Scraping**: Shell IPython integrado opcional con integración de Scrapling, atajos y nuevas herramientas para acelerar el desarrollo de scripts de Web Scraping, como convertir solicitudes curl a solicitudes Scrapling y ver resultados de solicitudes en tu navegador.
@@ -113,96 +130,158 @@ Construido para la Web moderna, Scrapling presenta **su propio motor de análisi
113
  - 🧬 **Procesamiento de Texto Mejorado**: Métodos integrados de regex, limpieza y operaciones de cadena optimizadas.
114
  - 📝 **Generación Automática de Selectores**: Genera selectores CSS/XPath robustos para cualquier elemento.
115
  - 🔌 **API Familiar**: Similar a Scrapy/BeautifulSoup con los mismos pseudo-elementos usados en Scrapy/Parsel.
116
- - 📘 **Cobertura Completa de Tipos**: Type hints completos para excelente soporte de IDE y autocompletado de código.
117
  - 🔋 **Imagen Docker Lista**: Con cada lanzamiento, se construye y publica automáticamente una imagen Docker que contiene todos los navegadores.
118
 
119
- ## Empezando
 
 
120
 
121
  ### Uso Básico
 
122
  ```python
123
- from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher
124
- from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
125
 
126
- # Solicitudes HTTP con soporte de sesión
127
- with FetcherSession(impersonate='chrome') as session: # Usa la última versión de la huella TLS de Chrome
128
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
129
- quotes = page.css('.quote .text::text')
130
 
131
  # O usa solicitudes de una sola vez
132
  page = Fetcher.get('https://quotes.toscrape.com/')
133
- quotes = page.css('.quote .text::text')
 
 
 
 
134
 
135
- # Modo sigiloso avanzado (Mantén el navegador abierto hasta que termines)
136
- with StealthySession(headless=True, solve_cloudflare=True) as session:
137
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
138
- data = page.css('#padded_content a')
139
 
140
  # O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar
141
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
142
- data = page.css('#padded_content a')
143
-
144
- # Automatización completa del navegador (Mantén el navegador abierto hasta que termines)
145
- with DynamicSession(headless=True) as session:
146
- page = session.fetch('https://quotes.toscrape.com/', network_idle=True)
147
- quotes = page.css('.quote .text::text')
148
-
149
- # O usa el estilo de solicitud de una sola vez
150
- page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True)
151
- quotes = page.css('.quote .text::text')
152
  ```
 
 
 
 
 
 
 
153
 
154
- ### Selección de Elementos
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  ```python
156
- # CSS selectors
157
- page.css('a::text') # Extracta texto
158
- page.css('a::attr(href)') # Extracta atributos
159
- page.css('a', recursive=False) # Solo elementos directos
160
- page.css('a', auto_save=True) # Guarda posiciones de los elementos automáticamente
161
-
162
- # XPath
163
- page.xpath('//a/text()')
164
-
165
- # Búsqueda flexible
166
- page.find_by_text('Python', first_match=True) # Encuentra por texto
167
- page.find_by_regex(r'\d{4}') # Encuentra por patrón regex
168
- page.find('div', {'class': 'container'}) # Encuentra por atributos
169
-
170
- # Navegación
171
- element.parent # Obtener elemento padre
172
- element.next_sibling # Obtener siguiente hermano
173
- element.children # Obtener hijos
174
-
175
- # Elementos similares
176
- similar = page.get_similar(element) # Encuentra elementos similares
177
-
178
- # Scraping adaptativo
179
- saved_elements = page.css('.product', auto_save=True)
180
- # Más tarde, cuando el sitio web cambia:
181
- page.css('.product', adaptive=True) # Encuentra elementos usando posiciones guardadas
182
  ```
 
183
 
184
- ### Uso de Sesión
185
  ```python
186
- from scrapling.fetchers import FetcherSession, AsyncFetcherSession
187
-
188
- # Sesión sincrónica
189
- with FetcherSession() as session:
190
- # Las cookies se mantienen automáticamente
191
- page1 = session.get('https://quotes.toscrape.com/login')
192
- page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'})
193
-
194
- # Cambiar fingerprint del navegador si es necesario
195
  page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
196
 
197
  # Uso de sesión async
198
  async with AsyncStealthySession(max_pages=2) as session:
199
  tasks = []
200
  urls = ['https://example.com/page1', 'https://example.com/page2']
201
-
202
  for url in urls:
203
  task = session.fetch(url)
204
  tasks.append(task)
205
-
206
  print(session.get_pool_stats()) # Opcional - El estado del pool de pestañas del navegador (ocupado/libre/error)
207
  results = await asyncio.gather(*tasks)
208
  print(session.get_pool_stats())
@@ -210,11 +289,11 @@ async with AsyncStealthySession(max_pages=2) as session:
210
 
211
  ## CLI y Shell Interactivo
212
 
213
- Scrapling v0.3 incluye una poderosa interfaz de línea de comandos:
214
 
215
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
216
 
217
- Lanzar shell interactivo de Web Scraping
218
  ```bash
219
  scrapling shell
220
  ```
@@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
227
  ```
228
 
229
  > [!NOTE]
230
- > Hay muchas características adicionales, pero queremos mantener esta página concisa, como el servidor MCP y el Shell Interactivo de Web Scraping. Consulta la documentación completa [aquí](https://scrapling.readthedocs.io/en/latest/)
231
 
232
  ## Benchmarks de Rendimiento
233
 
234
- Scrapling no solo es poderoso, también es increíblemente rápido, y las actualizaciones desde la versión 0.3 han brindado mejoras de rendimiento excepcionales en todas las operaciones. Los siguientes benchmarks comparan el analizador de Scrapling con otras bibliotecas populares.
235
 
236
  ### Prueba de Velocidad de Extracción de Texto (5000 elementos anidados)
237
 
238
- | # | Biblioteca | Tiempo (ms) | vs Scrapling |
239
  |---|:-----------------:|:-----------:|:------------:|
240
- | 1 | Scrapling | 1.99 | 1.0x |
241
- | 2 | Parsel/Scrapy | 2.01 | 1.01x |
242
- | 3 | Raw Lxml | 2.5 | 1.256x |
243
- | 4 | PyQuery | 22.93 | ~11.5x |
244
- | 5 | Selectolax | 80.57 | ~40.5x |
245
- | 6 | BS4 with Lxml | 1541.37 | ~774.6x |
246
- | 7 | MechanicalSoup | 1547.35 | ~777.6x |
247
- | 8 | BS4 with html5lib | 3410.58 | ~1713.9x |
248
 
249
 
250
  ### Rendimiento de Similitud de Elementos y Búsqueda de Texto
@@ -253,8 +332,8 @@ Las capacidades de búsqueda adaptativa de elementos de Scrapling superan signif
253
 
254
  | Biblioteca | Tiempo (ms) | vs Scrapling |
255
  |-------------|:-----------:|:------------:|
256
- | Scrapling | 2.46 | 1.0x |
257
- | AutoScraper | 13.3 | 5.407x |
258
 
259
 
260
  > Todos los benchmarks representan promedios de más de 100 ejecuciones. Ver [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) para la metodología.
@@ -267,29 +346,29 @@ Scrapling requiere Python 3.10 o superior:
267
  pip install scrapling
268
  ```
269
 
270
- A partir de v0.3.2, esta instalación solo incluye el motor de análisis y sus dependencias, sin ningún fetcher o dependencias de línea de comandos.
271
 
272
  ### Dependencias Opcionales
273
 
274
  1. Si vas a usar alguna de las características adicionales a continuación, los fetchers, o sus clases, necesitarás instalar las dependencias de los fetchers y sus dependencias del navegador de la siguiente manera:
275
  ```bash
276
  pip install "scrapling[fetchers]"
277
-
278
  scrapling install
279
  ```
280
 
281
- Esto descarga todos los navegadores, junto con sus dependencias del sistema y dependencias de manipulación de huellas digitales.
282
 
283
  2. Características adicionales:
284
  - Instalar la característica del servidor MCP:
285
  ```bash
286
  pip install "scrapling[ai]"
287
  ```
288
- - Instalar características del shell (shell de Web Scraping y el comando `extract`):
289
  ```bash
290
  pip install "scrapling[shell]"
291
  ```
292
- - Instalar todo:
293
  ```bash
294
  pip install "scrapling[all]"
295
  ```
@@ -324,12 +403,5 @@ Este trabajo está licenciado bajo la Licencia BSD-3-Clause.
324
  Este proyecto incluye código adaptado de:
325
  - Parsel (Licencia BSD)—Usado para el submódulo [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
326
 
327
- ## Agradecimientos y Referencias
328
-
329
- - El brillante trabajo de [Daijro](https://github.com/daijro) en [BrowserForge](https://github.com/daijro/browserforge) y [Camoufox](https://github.com/daijro/camoufox)
330
- - El brillante trabajo de [Vinyzu](https://github.com/Vinyzu) en [Botright](https://github.com/Vinyzu/Botright) y [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
331
- - [brotector](https://github.com/kaliiiiiiiiii/brotector) por técnicas de evasión de detección de navegador
332
- - [fakebrowser](https://github.com/kkoooqq/fakebrowser) y [BotBrowser](https://github.com/botswin/BotBrowser) por investigación de huellas digitales
333
-
334
  ---
335
- <div align="center"><small>Diseñado y elaborado con ❤️ por Karim Shoair.</small></div><br>
 
1
+ <h1 align="center">
2
+ <a href="https://scrapling.readthedocs.io">
3
+ <picture>
4
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
5
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
6
+ </picture>
7
+ </a>
8
+ <br>
9
+ <small>Effortless Web Scraping for the Modern Web</small>
10
+ </h1>
11
+
12
  <p align="center">
13
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
14
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
 
29
  </p>
30
 
31
  <p align="center">
32
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Metodos de seleccion</strong></a>
33
+ &middot;
34
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Elegir un fetcher</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>Modo MCP</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>Migrar desde Beautifulsoup</strong></a>
 
 
 
 
 
 
 
 
 
 
41
  </p>
42
 
43
+ Scrapling es un framework de Web Scraping adaptativo que se encarga de todo, desde una sola solicitud hasta un rastreo a gran escala.
44
 
45
+ Su parser aprende de los cambios de los sitios web y relocaliza automáticamente tus elementos cuando las páginas se actualizan. Sus fetchers evaden sistemas anti-bot como Cloudflare Turnstile de forma nativa. Y su framework Spider te permite escalar a rastreos concurrentes con múltiples sesiones, con Pause & Resume y rotación automática de Proxy, todo en unas pocas líneas de Python. Una biblioteca, cero compromisos.
46
 
47
+ Rastreos ultrarrápidos con estadísticas en tiempo real y Streaming. Construido por Web Scrapers para Web Scrapers y usuarios regulares, hay algo para todos.
48
 
49
  ```python
50
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
51
+ StealthyFetcher.adaptive = True
52
+ page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # ¡Obtén el sitio web bajo el radar!
53
+ products = page.css('.product', auto_save=True) # ¡Extrae datos que sobreviven a cambios de diseño del sitio web!
54
+ products = page.css('.product', adaptive=True) # Más tarde, si la estructura del sitio web cambia, ¡pasa `adaptive=True` para encontrarlos!
 
 
 
 
55
  ```
56
+ O escala a rastreos completos
57
+ ```python
58
+ from scrapling.spiders import Spider, Response
59
 
60
+ class MySpider(Spider):
61
+ name = "demo"
62
+ start_urls = ["https://example.com/"]
63
+
64
+ async def parse(self, response: Response):
65
+ for item in response.css('.product'):
66
+ yield {"title": item.css('h2::text').get()}
67
+
68
+ MySpider().start()
69
+ ```
70
+
71
+
72
+ # Patrocinadores
73
 
74
  <!-- sponsors -->
75
 
 
93
 
94
  ## Características Principales
95
 
96
+ ### Spiders — Un Framework Completo de Rastreo
97
+ - 🕷️ **API de Spider al estilo Scrapy**: Define spiders con `start_urls`, callbacks async `parse`, y objetos `Request`/`Response`.
98
+ - ⚡ **Rastreo Concurrente**: Límites de concurrencia configurables, limitación por dominio y retrasos de descarga.
99
+ - 🔄 **Soporte Multi-Session**: Interfaz unificada para solicitudes HTTP y navegadores headless sigilosos en un solo Spider — enruta solicitudes a diferentes sesiones por ID.
100
+ - 💾 **Pause & Resume**: Persistencia de rastreo basada en Checkpoint. Presiona Ctrl+C para un cierre ordenado; reinicia para continuar desde donde lo dejaste.
101
+ - 📡 **Modo Streaming**: Transmite elementos extraídos a medida que llegan con `async for item in spider.stream()` con estadísticas en tiempo real — ideal para UI, pipelines y rastreos de larga duración.
102
+ - 🛡️ **Detección de Solicitudes Bloqueadas**: Detección automática y reintento de solicitudes bloqueadas con lógica personalizable.
103
+ - 📦 **Exportación Integrada**: Exporta resultados a través de hooks y tu propio pipeline o el JSON/JSONL integrado con `result.items.to_json()` / `result.items.to_jsonl()` respectivamente.
104
+
105
+ ### Obtención Avanzada de Sitios Web con Soporte de Session
106
+ - **Solicitudes HTTP**: Solicitudes HTTP rápidas y sigilosas con la clase `Fetcher`. Puede imitar el fingerprint TLS de los navegadores, encabezados y usar HTTP/3.
107
  - **Carga Dinámica**: Obtén sitios web dinámicos con automatización completa del navegador a través de la clase `DynamicFetcher` compatible con Chromium de Playwright y Google Chrome.
108
+ - **Evasión Anti-bot**: Capacidades de sigilo avanzadas con `StealthyFetcher` y falsificación de fingerprint. Puede evadir fácilmente todos los tipos de Turnstile/Interstitial de Cloudflare con automatización.
109
+ - **Gestión de Session**: Soporte de sesión persistente con las clases `FetcherSession`, `StealthySession` y `DynamicSession` para la gestión de cookies y estado entre solicitudes.
110
+ - **Rotación de Proxy**: `ProxyRotator` integrado con estrategias de rotación cíclica o personalizadas en todos los tipos de sesión, además de sobrescrituras de Proxy por solicitud.
111
+ - **Bloqueo de Dominios**: Bloquea solicitudes a dominios específicos (y sus subdominios) en fetchers basados en navegador.
112
  - **Soporte Async**: Soporte async completo en todos los fetchers y clases de sesión async dedicadas.
113
 
114
  ### Scraping Adaptativo e Integración con IA
115
  - 🔄 **Seguimiento Inteligente de Elementos**: Relocaliza elementos después de cambios en el sitio web usando algoritmos inteligentes de similitud.
116
  - 🎯 **Selección Flexible Inteligente**: Selectores CSS, selectores XPath, búsqueda basada en filtros, búsqueda de texto, búsqueda regex y más.
117
  - 🔍 **Encontrar Elementos Similares**: Localiza automáticamente elementos similares a los elementos encontrados.
118
+ - 🤖 **Servidor MCP para usar con IA**: Servidor MCP integrado para Web Scraping asistido por IA y extracción de datos. El servidor MCP presenta capacidades potentes y personalizadas que aprovechan Scrapling para extraer contenido específico antes de pasarlo a la IA (Claude/Cursor/etc), acelerando así las operaciones y reduciendo costos al minimizar el uso de tokens. ([video demo](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
119
 
120
  ### Arquitectura de Alto Rendimiento y Probada en Batalla
121
+ - 🚀 **Ultrarrápido**: Rendimiento optimizado que supera a la mayoría de las bibliotecas de Web Scraping de Python.
122
  - 🔋 **Eficiente en Memoria**: Estructuras de datos optimizadas y carga diferida para una huella de memoria mínima.
123
  - ⚡ **Serialización JSON Rápida**: 10 veces más rápido que la biblioteca estándar.
124
+ - 🏗️ **Probado en batalla**: Scrapling no solo tiene una cobertura de pruebas del 92% y cobertura completa de type hints, sino que ha sido utilizado diariamente por cientos de Web Scrapers durante el último año.
125
 
126
  ### Experiencia Amigable para Desarrolladores/Web Scrapers
127
  - 🎯 **Shell Interactivo de Web Scraping**: Shell IPython integrado opcional con integración de Scrapling, atajos y nuevas herramientas para acelerar el desarrollo de scripts de Web Scraping, como convertir solicitudes curl a solicitudes Scrapling y ver resultados de solicitudes en tu navegador.
 
130
  - 🧬 **Procesamiento de Texto Mejorado**: Métodos integrados de regex, limpieza y operaciones de cadena optimizadas.
131
  - 📝 **Generación Automática de Selectores**: Genera selectores CSS/XPath robustos para cualquier elemento.
132
  - 🔌 **API Familiar**: Similar a Scrapy/BeautifulSoup con los mismos pseudo-elementos usados en Scrapy/Parsel.
133
+ - 📘 **Cobertura Completa de Tipos**: Type hints completos para excelente soporte de IDE y autocompletado de código. Todo el código fuente se escanea automáticamente con **PyRight** y **MyPy** en cada cambio.
134
  - 🔋 **Imagen Docker Lista**: Con cada lanzamiento, se construye y publica automáticamente una imagen Docker que contiene todos los navegadores.
135
 
136
+ ## Primeros Pasos
137
+
138
+ Aquí tienes un vistazo rápido de lo que Scrapling puede hacer sin entrar en profundidad.
139
 
140
  ### Uso Básico
141
+ Solicitudes HTTP con soporte de sesión
142
  ```python
143
+ from scrapling.fetchers import Fetcher, FetcherSession
 
144
 
145
+ with FetcherSession(impersonate='chrome') as session: # Usa la última versión del fingerprint TLS de Chrome
 
146
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
147
+ quotes = page.css('.quote .text::text').getall()
148
 
149
  # O usa solicitudes de una sola vez
150
  page = Fetcher.get('https://quotes.toscrape.com/')
151
+ quotes = page.css('.quote .text::text').getall()
152
+ ```
153
+ Modo sigiloso avanzado
154
+ ```python
155
+ from scrapling.fetchers import StealthyFetcher, StealthySession
156
 
157
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # Mantén el navegador abierto hasta que termines
 
158
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
159
+ data = page.css('#padded_content a').getall()
160
 
161
  # O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar
162
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
163
+ data = page.css('#padded_content a').getall()
 
 
 
 
 
 
 
 
 
164
  ```
165
+ Automatización completa del navegador
166
+ ```python
167
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
168
+
169
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Mantén el navegador abierto hasta que termines
170
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
171
+ data = page.xpath('//span[@class="text"]/text()').getall() # Selector XPath si lo prefieres
172
 
173
+ # O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar
174
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
175
+ data = page.css('.quote .text::text').getall()
176
+ ```
177
+
178
+ ### Spiders
179
+ Construye rastreadores completos con solicitudes concurrentes, múltiples tipos de sesión y Pause & Resume:
180
+ ```python
181
+ from scrapling.spiders import Spider, Request, Response
182
+
183
+ class QuotesSpider(Spider):
184
+ name = "quotes"
185
+ start_urls = ["https://quotes.toscrape.com/"]
186
+ concurrent_requests = 10
187
+
188
+ async def parse(self, response: Response):
189
+ for quote in response.css('.quote'):
190
+ yield {
191
+ "text": quote.css('.text::text').get(),
192
+ "author": quote.css('.author::text').get(),
193
+ }
194
+
195
+ next_page = response.css('.next a')
196
+ if next_page:
197
+ yield response.follow(next_page[0].attrib['href'])
198
+
199
+ result = QuotesSpider().start()
200
+ print(f"Se extrajeron {len(result.items)} citas")
201
+ result.items.to_json("quotes.json")
202
+ ```
203
+ Usa múltiples tipos de sesión en un solo Spider:
204
+ ```python
205
+ from scrapling.spiders import Spider, Request, Response
206
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
207
+
208
+ class MultiSessionSpider(Spider):
209
+ name = "multi"
210
+ start_urls = ["https://example.com/"]
211
+
212
+ def configure_sessions(self, manager):
213
+ manager.add("fast", FetcherSession(impersonate="chrome"))
214
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
215
+
216
+ async def parse(self, response: Response):
217
+ for link in response.css('a::attr(href)').getall():
218
+ # Enruta las páginas protegidas a través de la sesión sigilosa
219
+ if "protected" in link:
220
+ yield Request(link, sid="stealth")
221
+ else:
222
+ yield Request(link, sid="fast", callback=self.parse) # callback explícito
223
+ ```
224
+ Pausa y reanuda rastreos largos con checkpoints ejecutando el Spider así:
225
+ ```python
226
+ QuotesSpider(crawldir="./crawl_data").start()
227
+ ```
228
+ Presiona Ctrl+C para pausar de forma ordenada — el progreso se guarda automáticamente. Después, cuando inicies el Spider de nuevo, pasa el mismo `crawldir`, y continuará desde donde se detuvo.
229
+
230
+ ### Análisis Avanzado y Navegación
231
+ ```python
232
+ from scrapling.fetchers import Fetcher
233
+
234
+ # Selección rica de elementos y navegación
235
+ page = Fetcher.get('https://quotes.toscrape.com/')
236
+
237
+ # Obtén citas con múltiples métodos de selección
238
+ quotes = page.css('.quote') # Selector CSS
239
+ quotes = page.xpath('//div[@class="quote"]') # XPath
240
+ quotes = page.find_all('div', {'class': 'quote'}) # Estilo BeautifulSoup
241
+ # Igual que
242
+ quotes = page.find_all('div', class_='quote')
243
+ quotes = page.find_all(['div'], class_='quote')
244
+ quotes = page.find_all(class_='quote') # y así sucesivamente...
245
+ # Encuentra elementos por contenido de texto
246
+ quotes = page.find_by_text('quote', tag='div')
247
+
248
+ # Navegación avanzada
249
+ quote_text = page.css('.quote')[0].css('.text::text').get()
250
+ quote_text = page.css('.quote').css('.text::text').getall() # Selectores encadenados
251
+ first_quote = page.css('.quote')[0]
252
+ author = first_quote.next_sibling.css('.author::text')
253
+ parent_container = first_quote.parent
254
+
255
+ # Relaciones y similitud de elementos
256
+ similar_elements = first_quote.find_similar()
257
+ below_elements = first_quote.below_elements()
258
+ ```
259
+ Puedes usar el parser directamente si no necesitas obtener sitios web, como se muestra a continuación:
260
  ```python
261
+ from scrapling.parser import Selector
262
+
263
+ page = Selector("<html>...</html>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  ```
265
+ ¡Y funciona exactamente de la misma manera!
266
 
267
+ ### Ejemplos de Gestión de Session Async
268
  ```python
269
+ import asyncio
270
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
271
+
272
+ async with FetcherSession(http3=True) as session: # `FetcherSession` es consciente del contexto y puede funcionar tanto en patrones sync/async
273
+ page1 = session.get('https://quotes.toscrape.com/')
 
 
 
 
274
  page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
275
 
276
  # Uso de sesión async
277
  async with AsyncStealthySession(max_pages=2) as session:
278
  tasks = []
279
  urls = ['https://example.com/page1', 'https://example.com/page2']
280
+
281
  for url in urls:
282
  task = session.fetch(url)
283
  tasks.append(task)
284
+
285
  print(session.get_pool_stats()) # Opcional - El estado del pool de pestañas del navegador (ocupado/libre/error)
286
  results = await asyncio.gather(*tasks)
287
  print(session.get_pool_stats())
 
289
 
290
  ## CLI y Shell Interactivo
291
 
292
+ Scrapling incluye una poderosa interfaz de línea de comandos:
293
 
294
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
295
 
296
+ Lanzar el Shell interactivo de Web Scraping
297
  ```bash
298
  scrapling shell
299
  ```
 
306
  ```
307
 
308
  > [!NOTE]
309
+ > Hay muchas características adicionales, pero queremos mantener esta página concisa, incluyendo el servidor MCP y el Shell Interactivo de Web Scraping. Consulta la documentación completa [aquí](https://scrapling.readthedocs.io/en/latest/)
310
 
311
  ## Benchmarks de Rendimiento
312
 
313
+ Scrapling no solo es potente, también es ultrarrápido. Los siguientes benchmarks comparan el parser de Scrapling con las últimas versiones de otras bibliotecas populares.
314
 
315
  ### Prueba de Velocidad de Extracción de Texto (5000 elementos anidados)
316
 
317
+ | # | Biblioteca | Tiempo (ms) | vs Scrapling |
318
  |---|:-----------------:|:-----------:|:------------:|
319
+ | 1 | Scrapling | 2.02 | 1.0x |
320
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
321
+ | 3 | Raw Lxml | 2.54 | 1.257x |
322
+ | 4 | PyQuery | 24.17 | ~12x |
323
+ | 5 | Selectolax | 82.63 | ~41x |
324
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
325
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
326
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
327
 
328
 
329
  ### Rendimiento de Similitud de Elementos y Búsqueda de Texto
 
332
 
333
  | Biblioteca | Tiempo (ms) | vs Scrapling |
334
  |-------------|:-----------:|:------------:|
335
+ | Scrapling | 2.39 | 1.0x |
336
+ | AutoScraper | 12.45 | 5.209x |
337
 
338
 
339
  > Todos los benchmarks representan promedios de más de 100 ejecuciones. Ver [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) para la metodología.
 
346
  pip install scrapling
347
  ```
348
 
349
+ Esta instalación solo incluye el motor de análisis y sus dependencias, sin ningún fetcher ni dependencias de línea de comandos.
350
 
351
  ### Dependencias Opcionales
352
 
353
  1. Si vas a usar alguna de las características adicionales a continuación, los fetchers, o sus clases, necesitarás instalar las dependencias de los fetchers y sus dependencias del navegador de la siguiente manera:
354
  ```bash
355
  pip install "scrapling[fetchers]"
356
+
357
  scrapling install
358
  ```
359
 
360
+ Esto descarga todos los navegadores, junto con sus dependencias del sistema y dependencias de manipulación de fingerprint.
361
 
362
  2. Características adicionales:
363
  - Instalar la característica del servidor MCP:
364
  ```bash
365
  pip install "scrapling[ai]"
366
  ```
367
+ - Instalar características del Shell (Shell de Web Scraping y el comando `extract`):
368
  ```bash
369
  pip install "scrapling[shell]"
370
  ```
371
+ - Instalar todo:
372
  ```bash
373
  pip install "scrapling[all]"
374
  ```
 
403
  Este proyecto incluye código adaptado de:
404
  - Parsel (Licencia BSD)—Usado para el submódulo [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
405
 
 
 
 
 
 
 
 
406
  ---
407
+ <div align="center"><small>Diseñado y elaborado con ❤️ por Karim Shoair.</small></div><br>
docs/README_JP.md CHANGED
@@ -1,9 +1,14 @@
1
- <p align=center>
2
- <br>
3
- <a href="https://scrapling.readthedocs.io/en/latest/" target="_blank"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png" style="width: 50%; height: 100%;" alt="main poster"/></a>
4
- <br>
5
- <i><code>簡単で効率的なウェブスクレイピング、あるべき姿!</code></i>
6
- </p>
 
 
 
 
 
7
  <p align="center">
8
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
9
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
@@ -24,46 +29,47 @@
24
  </p>
25
 
26
  <p align="center">
27
- <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
28
- 選択メソッド
29
- </a>
30
- ·
31
- <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/">
32
- フェッチャーの選択
33
- </a>
34
- ·
35
- <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
36
- CLI
37
- </a>
38
- ·
39
- <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
40
- MCPモード
41
- </a>
42
- ·
43
- <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
44
- Beautifulsoupからの移行
45
- </a>
46
  </p>
47
 
48
- **アンチボットシステムと戦いをやめましょう。ウェブサイが更新されたびにセレクタを書き直のをやめましょう**
49
 
50
- Scrapling単なるウェブスクレイピングライブラリではありません。ウェブサイトの変更から学習し、それとともに進化する最初の**適応型**スクレイピングライブラリです。他のライブラリウェブサイトの構造が更新され壊れる一方で、Scraplingは自動的に要素を再配置しイパーを稼働し続けます。
51
 
52
- モダンウェブ向け構築されたScraplingは、**独自の高速パースエンジン**とフェッチャーを備えており、あなたが直面する、または直面するであろうすべてのウェブスレイピングの課題に対応します。ウェブスクレイパーによってウェブスクレイパーと一般ユーザーのために構築され、誰にでも何かがあります。
53
 
54
  ```python
55
- >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
56
- >> StealthyFetcher.adaptive = True
57
- # レーダーの下でウェブサイトのソースを取得!
58
- >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
59
- >> print(page.status)
60
- 200
61
- >> products = page.css('.product', auto_save=True) # ウェブサイトのデザイン変更に耐えるデタをクレイ
62
- >> # 後でウェブサイトの構造が変わったら、`adaptive=True`を渡す
63
- >> products = page.css('.product', adaptive=True) # そしてScraplingはまだそれらを見つけます!
 
 
 
 
 
 
 
 
 
 
64
  ```
65
 
66
- # スポンサー
 
67
 
68
  <!-- sponsors -->
69
 
@@ -87,138 +93,211 @@ Scraplingは単なるウェブスクレイピングライブラリではあり
87
 
88
  ## 主な機能
89
 
90
- ### セッションサポート付き高度ウェブサイト取得
91
- - **HTTPリクエスト**:`Fetcher`クラスで高速でステルスなHTTPリクエスト。ブラウザのTLSフィンガープリントヘッダーを模倣しHTTP3を使用きます
92
- - **動的読み込み**:Playwright's ChromiumとGoogle Chromeをサポートする`DynamicFetcher`クラスを通じた完全ブラウザ自動化で動的ウェブサイトを取得
93
- - **アンボッ回避**:`StealthyFetcher`とフィンガープ偽装による高度なステルス機能。自動化でCloudflareTurnstile/Interstitialのすべてのタプを簡単回避できます
94
- - **セッション管理**:リクエト間でCookieと状態を管理するため`FetcherSession`、`StealthySession`、`DynamicSession`ラスによる永続的なセッョンサポート。
95
- - **非同期サポ**:すべてのフェッチャー専用非同期セッションクラ全体の完全な非同期サポ
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  ### 適応型スクレイピングとAI統合
98
  - 🔄 **スマート要素追跡**:インテリジェントな類似性アルゴリズムを使用してウェブサイトの変更後に要素を再配置。
99
  - 🎯 **スマート柔軟選択**:CSSセレクタ、XPathセレクタ、フィルタベース検索、テキスト検索、正規表現検索など。
100
- - 🔍 **類似要素を見つける**:見つかった要素に類似した要素を自動的に特定。
101
- - 🤖 **AIと使用するMCPサーバー**:AI支援ウェブスクレイピングとデータ抽出のための組み込みMCPサーバー。MCPサーバーは、AI(Claude/Cursorなど)に渡す前にScraplingを活用してターゲットコンテンツを抽出する強力でカスタムな機能を備えており、操作を高速化し、トークン使用量を最小限に抑えることでコストを削減します。([デモビデオ](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
102
 
103
  ### 高性能で実戦テスト済みのアーキテクチャ
104
- - 🚀 **高速**:ほとんどのPythonスクレイピングライブラリを上回る最適化されたパフォーマンス。
105
  - 🔋 **メモリ効率**:最小のメモリフットプリントのための最適化されたデータ構造と遅延読み込み。
106
  - ⚡ **高速JSONシリアル化**:標準ライブラリの10倍の速度。
107
- - 🏗️ **実戦テスト済み**:Scraplingは92%のテストカバレッジと完全な型ヒントカバレッジを備えているだけでなく、過去1年間に数百人のウェブスクレイパーによって毎日使用されてきました。
108
 
109
- ### 開発者/ウェブスクレイパーにやさしい体験
110
- - 🎯 **インタラクティブウェブスクレイピングシェル**:Scraping統合、ショートカット、curlリクエストをScraplingリクエストに変換したり、ブラウザでリクエスト結果を表示したりするなどの新しいツールを備えたオプションの組み込みIPythonシェルで、ウェブスクレイピングスクリプトの開発を加速します
111
  - 🚀 **ターミナルから直接使用**:オプションで、コードを一行も書かずにScraplingを使用してURLをスクレイプできます!
112
  - 🛠️ **豊富なナビゲーションAPI**:親、兄弟、子のナビゲーションメソッドによる高度なDOMトラバーサル。
113
  - 🧬 **強化されたテキスト処理**:組み込みの正規表現、クリーニングメソッド、最適化された文字列操作。
114
  - 📝 **自動セレクタ生成**:任意の要素に対して堅牢なCSS/XPathセレクタを生成。
115
- - 🔌 **馴染みのあるAPI**:Scrapy/Parselで使用されている同じ疑似要素を持つScrapy/BeautifulSoupに似ています
116
- - 📘 **完全な型カバレッジ**:優れたIDEサポートとコード補完のための完全な型ヒント。
117
  - 🔋 **すぐに使えるDockerイメージ**:各リリースで、すべてのブラウザを含むDockerイメージが自動的にビルドおよびプッシュされます。
118
 
119
  ## はじめに
120
 
 
 
121
  ### 基本的な使い方
 
122
  ```python
123
- from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher
124
- from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
125
 
126
- # セッションサポート付きHTTPリクエスト
127
- with FetcherSession(impersonate='chrome') as session: # ChromeのTLS��ィンガープリントの最新バージョンを使用
128
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
129
- quotes = page.css('.quote .text::text')
130
 
131
  # または一回限りのリクエストを使用
132
  page = Fetcher.get('https://quotes.toscrape.com/')
133
- quotes = page.css('.quote .text::text')
 
 
 
 
134
 
135
- # 高度なステルスモード(完了するまでブラウザを開いたままにする
136
- with StealthySession(headless=True, solve_cloudflare=True) as session:
137
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
138
- data = page.css('#padded_content a')
139
 
140
- # または一回限りのリクエストスタイルを使用、このリクエストのためにブラウザを開き、完了後に閉じる
141
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
142
- data = page.css('#padded_content a')
143
-
144
- # 完全なブラウザ自動化(完了するまでブラウザを開いたままにする)
145
- with DynamicSession(headless=True) as session:
146
- page = session.fetch('https://quotes.toscrape.com/', network_idle=True)
147
- quotes = page.css('.quote .text::text')
148
-
149
- # または一回限りのリクエストスタイルを使用
150
- page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True)
151
- quotes = page.css('.quote .text::text')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  ```
 
 
 
 
 
153
 
154
- ### 要素選択
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  ```python
156
- # CSSセレクタ
157
- page.css('a::text') # テキストを抽出
158
- page.css('a::attr(href)') # 属性を抽出
159
- page.css('a', recursive=False) # 直接の要素のみ
160
- page.css('a', auto_save=True) # 要素の位置を自動保存
161
-
162
- # XPath
163
- page.xpath('//a/text()')
164
-
165
- # 柔軟な検索
166
- page.find_by_text('Python', first_match=True) # テキストで検索
167
- page.find_by_regex(r'\d{4}') # 正規表現パターンで検索
168
- page.find('div', {'class': 'container'}) # 属性で検索
169
-
170
- # ナビゲーション
171
- element.parent # 親要素を取得
172
- element.next_sibling # 次の兄弟を取得
173
- element.children # 子要素を取得
174
-
175
- # 類似要素
176
- similar = page.get_similar(element) # 類似要素を見つける
177
-
178
- # 適応型スクレイピング
179
- saved_elements = page.css('.product', auto_save=True)
180
- # 後でウェブサイトが変更されたとき:
181
- page.css('.product', adaptive=True) # 保存された位置を使用して要素を見つける
182
  ```
 
183
 
184
- ### セッション使用
185
  ```python
186
- from scrapling.fetchers import FetcherSession, AsyncFetcherSession
187
-
188
- # 同期セッション
189
- with FetcherSession() as session:
190
- # Cookieは自動的に維持されます
191
- page1 = session.get('https://quotes.toscrape.com/login')
192
- page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'})
193
-
194
- # 必要に応じてブラウザのフィンガープリントを切り替え
195
  page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
196
 
197
- # 非同期セッションの使用
198
  async with AsyncStealthySession(max_pages=2) as session:
199
  tasks = []
200
  urls = ['https://example.com/page1', 'https://example.com/page2']
201
-
202
  for url in urls:
203
  task = session.fetch(url)
204
  tasks.append(task)
205
-
206
  print(session.get_pool_stats()) # オプション - ブラウザタブプールのステータス(ビジー/フリー/エラー)
207
  results = await asyncio.gather(*tasks)
208
  print(session.get_pool_stats())
209
  ```
210
 
211
- ## CLIとインタラクティブシェル
212
 
213
- Scrapling v0.3には強力なコマンドラインインターフェースが含まれています:
214
 
215
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
216
 
217
- インタラクティブウェブスクレイピングシェルを起動
218
  ```bash
219
  scrapling shell
220
  ```
221
- プログラミングせずに直接ページをファイルに抽出(デフォルトで`body`タグ内のコンテンツを抽出)。出力ファイルが`.txt`で終わる場合、ターゲットのテキストコンテンツが抽出されます。`.md`で終わる場合、HTMLコンテンツのMarkdown表現になります`.html`で終わる場合、HTMLコンテンツそのものになります。
222
  ```bash
223
  scrapling extract get 'https://example.com' content.md
224
  scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # CSSセレクタ'#fromSkipToProducts'に一致するすべての要素
@@ -227,34 +306,34 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
227
  ```
228
 
229
  > [!NOTE]
230
- > MCPサーバーやインタラクティブウェブスクレイピングシェルなど、他にも多くの追加機能がありますが、このページは簡潔に保ちたいと思います。完全なドキュメントは[こちら](https://scrapling.readthedocs.io/en/latest/)をご覧ください
231
 
232
  ## パフォーマンスベンチマーク
233
 
234
- Scraplingは強力であるだけでなく、驚くほど高速で、バージョン0.3以降のアップデートはべての操作で優れたパフォーマンス向上を実現しています。以下のベンチマークは、Scraplingのパーサーを他の人気のあるライブラリと比較しています。
235
 
236
  ### テキスト抽出速度テスト(5000個のネストされた要素)
237
 
238
- | # | ライブラリ | 時間(ms) | vs Scrapling |
239
- |---|:-----------------:|:-------:|:------------:|
240
- | 1 | Scrapling | 1.99 | 1.0x |
241
- | 2 | Parsel/Scrapy | 2.01 | 1.01x |
242
- | 3 | Raw Lxml | 2.5 | 1.256x |
243
- | 4 | PyQuery | 22.93 | ~11.5x |
244
- | 5 | Selectolax | 80.57 | ~40.5x |
245
- | 6 | BS4 with Lxml | 1541.37 | ~774.6x |
246
- | 7 | MechanicalSoup | 1547.35 | ~777.6x |
247
- | 8 | BS4 with html5lib | 3410.58 | ~1713.9x |
248
 
249
 
250
  ### 要素類似性とテキスト検索のパフォーマンス
251
 
252
  Scraplingの適応型要素検索機能は代替手段を大幅に上回ります:
253
 
254
- | ライブラリ | 時間(ms) | vs Scrapling |
255
- |-------------|:------:|:------------:|
256
- | Scrapling | 2.46 | 1.0x |
257
- | AutoScraper | 13.3 | 5.407x |
258
 
259
 
260
  > すべてのベンチマークは100回以上の実行の平均を表します。方法論については[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)を参照してください。
@@ -267,25 +346,25 @@ ScraplingにはPython 3.10以上が必要です:
267
  pip install scrapling
268
  ```
269
 
270
- v0.3.2以降、このインストールにはパーサーエンジンとその依存関係のみが含まれており、フェッチャーやコマンドライン依存関係は含まれていません。
271
 
272
  ### オプションの依存関係
273
 
274
- 1. 以下の追加機能、フェッチャー、またはそれらのクラスのいずれかを使用する場合は、フェッチャーの依存関係とブラウザの依存関係を次のようにインストールする必要があります:
275
  ```bash
276
  pip install "scrapling[fetchers]"
277
-
278
  scrapling install
279
  ```
280
 
281
- これにより、すべてのブラウザ、およびそれらのシステム依存関係とフィンガープリント操作依存関係がダウンロードされます。
282
 
283
  2. 追加機能:
284
  - MCPサーバー機能をインストール:
285
  ```bash
286
  pip install "scrapling[ai]"
287
  ```
288
- - シェル機能(ウェブスクレイピングシェルと`extract`コマンド)をインストール:
289
  ```bash
290
  pip install "scrapling[shell]"
291
  ```
@@ -324,12 +403,5 @@ docker pull ghcr.io/d4vinci/scrapling:latest
324
  このプロジェクトには次から適応されたコードが含まれています:
325
  - Parsel(BSDライセンス)— [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)サブモジュールに使用
326
 
327
- ## 感謝と参考文献
328
-
329
- - [Daijro](https://github.com/daijro)の[BrowserForge](https://github.com/daijro/browserforge)と[Camoufox](https://github.com/daijro/camoufox)における素晴らしい仕事
330
- - [Vinyzu](https://github.com/Vinyzu)の[Botright](https://github.com/Vinyzu/Botright)と[PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)における素晴らしい仕事
331
- - ブラウザ検出回避技術を提供する[brotector](https://github.com/kaliiiiiiiiii/brotector)
332
- - フィンガープリント研究を提供する[fakebrowser](https://github.com/kkoooqq/fakebrowser)と[BotBrowser](https://github.com/botswin/BotBrowser)
333
-
334
  ---
335
- <div align="center"><small>Karim Shoairによって❤️でデザインおよび作成されました。</small></div><br>
 
1
+ <h1 align="center">
2
+ <a href="https://scrapling.readthedocs.io">
3
+ <picture>
4
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
5
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
6
+ </picture>
7
+ </a>
8
+ <br>
9
+ <small>Effortless Web Scraping for the Modern Web</small>
10
+ </h1>
11
+
12
  <p align="center">
13
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
14
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
 
29
  </p>
30
 
31
  <p align="center">
32
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>選択メソッド</strong></a>
33
+ &middot;
34
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Fetcherの選び方</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCPモード</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>Beautifulsoupからの移行</strong></a>
 
 
 
 
 
 
 
 
 
 
41
  </p>
42
 
43
+ Scraplingは、単一リクエストから本格的なクロールまですべてを処理する適応型Web Scrapingフレームワークです。
44
 
45
+ そのパーサーはウェブサイトの変更から学習し、ページが更新されたときに要素を自動的に再配置します。Fetcherはすぐに使えるCloudflare Turnstileなどのアンチボットシステムを回避します。そしてSpiderフレームワークにより、Pause & Resumeや自動Proxy回転機能を備えた並行マルチSessionクロールへとスケールアップできます — すべてわずか数行のPythonで。1つのライブラリ、妥協なし。
46
 
47
+ リアルタイム統計とStreamingによる超高速クロール。Web ScraperによってWeb Scraperと一般ユーザーのために構築されており、誰にでも何かがあります。
48
 
49
  ```python
50
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
51
+ StealthyFetcher.adaptive = True
52
+ page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # レーダーの下でウェブサイトを取得!
53
+ products = page.css('.product', auto_save=True) # ウェブサイトのデザイン変更に耐えるデータをスクレイプ!
54
+ products = page.css('.product', adaptive=True) # 後でウェブサイトの構造が変わったら、`adaptive=True`を渡して見つける!
55
+ ```
56
+ または本格的なクロールへスケールアップ:
57
+ ```python
58
+ from scrapling.spiders import Spider, Response
59
+
60
+ class MySpider(Spider):
61
+ name = "demo"
62
+ start_urls = ["https://example.com/"]
63
+
64
+ async def parse(self, response: Response):
65
+ for item in response.css('.product'):
66
+ yield {"title": item.css('h2::text').get()}
67
+
68
+ MySpider().start()
69
  ```
70
 
71
+
72
+ # スポンサー
73
 
74
  <!-- sponsors -->
75
 
 
93
 
94
  ## 主な機能
95
 
96
+ ### Spider — 本格的クロールフレームワーク
97
+ - 🕷️ **Scrapy風のSpider API**:`start_urls`、async `parse` callback、`Request`/`Response`オブジェクトでSpiderを定義
98
+ - **並行クロール**:設定可能な並行数制限、ドメインごとのスロットリング、ダウンロード遅延
99
+ - 🔄 **マルチSessionサポート**:HTTPリクエスト、ステルス、ヘッドレスブラウザの統一インターフェース — IDによって異なるSessionにリクエストをルーティング
100
+ - 💾 **Pause & Resume**:Checkpointベースのクロール永続化。Ctrl+Cで正常にシャットダウン;再起動すると中断したところから再開
101
+ - 📡 **Streamingモード**:`async for item in spider.stream()`でリアルタイム統計とともにスクレイプされたアイテムをStreamingで受信 — UI、パイプライン、長時間実行クロールに最適
102
+ - 🛡️ **ブロックされたリクエストの検出**:カスタマイズ可能なロジックによるブロックされたリクエストの自動検出とリトライ。
103
+ - 📦 **組み込みエクスポート**:フックや独自のパイプライン、または組み込みのJSON/JSONLで結果をエクスポート。それぞれ`result.items.to_json()` / `result.items.to_jsonl()`を使用。
104
+
105
+ ### Sessionサポート付き高度なウェブサイト取得
106
+ - **HTTPリクエスト**:`Fetcher`クラスで高速かつステルスなHTTPリクエスト。ブラウザのTLS fingerprint、ヘッダーを模倣し、HTTP/3を使用可能。
107
+ - **動的読み込み**:PlaywrightのChromiumとGoogle Chromeをサポートする`DynamicFetcher`クラスによる完全なブラウザ自動化で動的ウェブサイトを取得。
108
+ - **アンチボット回避**:`StealthyFetcher`とfingerprint偽装による高度なステルス機能。自動化でCloudflareのTurnstile/Interstitialのすべてのタイプを簡単に回避。
109
+ - **Session管理**:リクエスト間でCookieと状態を管理するための`FetcherSession`、`StealthySession`、`DynamicSession`クラスによる永続的なSessionサポート。
110
+ - **Proxy回転**:すべてのSessionタイプに対応したラウンドロビンまたはカスタム戦略の組み込み`ProxyRotator`、さらにリクエストごとのProxyオーバーライド。
111
+ - **ドメインブロック**:ブラウザベースのFetcherで特定のドメイン(およびそのサブドメイン)へのリクエストをブロック。
112
+ - **asyncサポート**:すべてのFetcherおよび専用asyncSessionクラス全体での完全なasyncサポート。
113
 
114
  ### 適応型スクレイピングとAI統合
115
  - 🔄 **スマート要素追跡**:インテリジェントな類似性アルゴリズムを使用してウェブサイトの変更後に要素を再配置。
116
  - 🎯 **スマート柔軟選択**:CSSセレクタ、XPathセレクタ、フィルタベース検索、テキスト検索、正規表現検索など。
117
+ - 🔍 **類似要素の検出**:見つかった要素に類似した要素を自動的に特定。
118
+ - 🤖 **AIと使用するMCPサーバー**:AI支援Web Scrapingとデータ抽出のための組み込みMCPサーバー。MCPサーバーは、AI(Claude/Cursorなど)に渡す前にScraplingを活用してターゲットコンテンツを抽出する強力でカスタムな機能を備えており、操作を高速化し、トークン使用量を最小限に抑えることでコストを削減します。([デモ動画](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
119
 
120
  ### 高性能で実戦テスト済みのアーキテクチャ
121
+ - 🚀 **高速**:ほとんどのPythonスクレイピングライブラリを上回る最適化されたパフォーマンス。
122
  - 🔋 **メモリ効率**:最小のメモリフットプリントのための最適化されたデータ構造と遅延読み込み。
123
  - ⚡ **高速JSONシリアル化**:標準ライブラリの10倍の速度。
124
+ - 🏗️ **実戦テスト済み**:Scraplingは92%のテストカバレッジと完全な型ヒントカバレッジを備えているだけでなく、過去1年間に数百人のWeb Scraperによって毎日使用されてきました。
125
 
126
+ ### 開発者/Web Scraperにやさしい体験
127
+ - 🎯 **インタラクティブWeb Scraping Shell**:Scrapling統合、ショートカット、curlリクエストをScraplingリクエストに変換したり、ブラウザでリクエスト結果を表示したりするなどの新しいツールを備えたオプションの組み込みIPython Shellで、Web Scrapingスクリプトの開発を加速。
128
  - 🚀 **ターミナルから直接使用**:オプションで、コードを一行も書かずにScraplingを使用してURLをスクレイプできます!
129
  - 🛠️ **豊富なナビゲーションAPI**:親、兄弟、子のナビゲーションメソッドによる高度なDOMトラバーサル。
130
  - 🧬 **強化されたテキスト処理**:組み込みの正規表現、クリーニングメソッド、最適化された文字列操作。
131
  - 📝 **自動セレクタ生成**:任意の要素に対して堅牢なCSS/XPathセレクタを生成。
132
+ - 🔌 **馴染みのあるAPI**:Scrapy/Parselで使用されている同じ疑似要素を持つScrapy/BeautifulSoupに似た設計
133
+ - 📘 **完全な型カバレッジ**:優れたIDEサポートとコード補完のための完全な型ヒント。コードベース全体が変更のたびに**PyRight**と**MyPy**で自動的にスキャンされます。
134
  - 🔋 **すぐに使えるDockerイメージ**:各リリースで、すべてのブラウザを含むDockerイメージが自動的にビルドおよびプッシュされます。
135
 
136
  ## はじめに
137
 
138
+ 深く掘り下げずに、Scraplingにできることの簡単な概要をお見せしましょう。
139
+
140
  ### 基本的な使い方
141
+ Sessionサポート付きHTTPリクエスト
142
  ```python
143
+ from scrapling.fetchers import Fetcher, FetcherSession
 
144
 
145
+ with FetcherSession(impersonate='chrome') as session: # ChromeのTLS fingerprintの最新バージョンを使用
 
146
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
147
+ quotes = page.css('.quote .text::text').getall()
148
 
149
  # または一回限りのリクエストを使用
150
  page = Fetcher.get('https://quotes.toscrape.com/')
151
+ quotes = page.css('.quote .text::text').getall()
152
+ ```
153
+ 高度なステルスモード
154
+ ```python
155
+ from scrapling.fetchers import StealthyFetcher, StealthySession
156
 
157
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # 完了するまでブラウザを開いたままにする
 
158
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
159
+ data = page.css('#padded_content a').getall()
160
 
161
+ # または一回限りのリクエストスタイル、このリクエストのためにブラウザを開き、完了後に閉じる
162
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
163
+ data = page.css('#padded_content a').getall()
164
+ ```
165
+ 完全なブラウザ自動化
166
+ ```python
167
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
168
+
169
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # 完了するまでブラウザを開いたままにする
170
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
171
+ data = page.xpath('//span[@class="text"]/text()').getall() # お好みであればXPathセレクタを使用
172
+
173
+ # または一回限りのリクエストスタイル、このリクエストのためにブラウザを開き、完了後に閉じる
174
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
175
+ data = page.css('.quote .text::text').getall()
176
+ ```
177
+
178
+ ### Spider
179
+ 並行リクエスト、複数のSessionタイプ、Pause & Resumeを備えた本格的なクローラーを構築:
180
+ ```python
181
+ from scrapling.spiders import Spider, Request, Response
182
+
183
+ class QuotesSpider(Spider):
184
+ name = "quotes"
185
+ start_urls = ["https://quotes.toscrape.com/"]
186
+ concurrent_requests = 10
187
+
188
+ async def parse(self, response: Response):
189
+ for quote in response.css('.quote'):
190
+ yield {
191
+ "text": quote.css('.text::text').get(),
192
+ "author": quote.css('.author::text').get(),
193
+ }
194
+
195
+ next_page = response.css('.next a')
196
+ if next_page:
197
+ yield response.follow(next_page[0].attrib['href'])
198
+
199
+ result = QuotesSpider().start()
200
+ print(f"{len(result.items)}件の引用をスクレイプしました")
201
+ result.items.to_json("quotes.json")
202
+ ```
203
+ 単一のSpiderで複数のSessionタイプを使用:
204
+ ```python
205
+ from scrapling.spiders import Spider, Request, Response
206
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
207
+
208
+ class MultiSessionSpider(Spider):
209
+ name = "multi"
210
+ start_urls = ["https://example.com/"]
211
+
212
+ def configure_sessions(self, manager):
213
+ manager.add("fast", FetcherSession(impersonate="chrome"))
214
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
215
+
216
+ async def parse(self, response: Response):
217
+ for link in response.css('a::attr(href)').getall():
218
+ # 保護されたページはステルスSessionを通してルーティング
219
+ if "protected" in link:
220
+ yield Request(link, sid="stealth")
221
+ else:
222
+ yield Request(link, sid="fast", callback=self.parse) # 明示的なcallback
223
+ ```
224
+ Checkpointを使用して長時間のクロールをPause & Resume:
225
+ ```python
226
+ QuotesSpider(crawldir="./crawl_data").start()
227
  ```
228
+ Ctrl+Cを押すと正常に一時停止し、進捗は自動的に保存されます。後でSpiderを再度起動する際に同じ`crawldir`を渡すと、中断したところから再開します。
229
+
230
+ ### 高度なパースとナビゲーション
231
+ ```python
232
+ from scrapling.fetchers import Fetcher
233
 
234
+ # 豊富な要素選択とナビゲーション
235
+ page = Fetcher.get('https://quotes.toscrape.com/')
236
+
237
+ # 複数の選択メソッドで引用を取得
238
+ quotes = page.css('.quote') # CSSセレクタ
239
+ quotes = page.xpath('//div[@class="quote"]') # XPath
240
+ quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoupスタイル
241
+ # 以下と同じ
242
+ quotes = page.find_all('div', class_='quote')
243
+ quotes = page.find_all(['div'], class_='quote')
244
+ quotes = page.find_all(class_='quote') # など...
245
+ # テキスト内容で要素を検索
246
+ quotes = page.find_by_text('quote', tag='div')
247
+
248
+ # 高度なナビゲーション
249
+ quote_text = page.css('.quote')[0].css('.text::text').get()
250
+ quote_text = page.css('.quote').css('.text::text').getall() # チェーンセレクタ
251
+ first_quote = page.css('.quote')[0]
252
+ author = first_quote.next_sibling.css('.author::text')
253
+ parent_container = first_quote.parent
254
+
255
+ # 要素の関連性と類似性
256
+ similar_elements = first_quote.find_similar()
257
+ below_elements = first_quote.below_elements()
258
+ ```
259
+ ウェブサイトを取得せずにパーサーをすぐに使用することもできます:
260
  ```python
261
+ from scrapling.parser import Selector
262
+
263
+ page = Selector("<html>...</html>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  ```
265
+ まったく同じ方法で動作します!
266
 
267
+ ### 非同期Session管理
268
  ```python
269
+ import asyncio
270
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
271
+
272
+ async with FetcherSession(http3=True) as session: # `FetcherSession`はコンテキストアウェアで、同期/非同期両方のパターンで動作可能
273
+ page1 = session.get('https://quotes.toscrape.com/')
 
 
 
 
274
  page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
275
 
276
+ # 非同期Sessionの使用
277
  async with AsyncStealthySession(max_pages=2) as session:
278
  tasks = []
279
  urls = ['https://example.com/page1', 'https://example.com/page2']
280
+
281
  for url in urls:
282
  task = session.fetch(url)
283
  tasks.append(task)
284
+
285
  print(session.get_pool_stats()) # オプション - ブラウザタブプールのステータス(ビジー/フリー/エラー)
286
  results = await asyncio.gather(*tasks)
287
  print(session.get_pool_stats())
288
  ```
289
 
290
+ ## CLIとインタラクティブShell
291
 
292
+ Scraplingには強力なコマンドラインインターフェースが含まれています:
293
 
294
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
295
 
296
+ インタラクティブWeb Scraping Shellを起動
297
  ```bash
298
  scrapling shell
299
  ```
300
  プログラミングせずに直接ページをファイルに抽出(デフォルトで`body`タグ内のコンテンツを抽出)。出力ファイルが`.txt`で終わる場合、ターゲットのテキストコンテンツが抽出されます。`.md`で終わる場合、HTMLコンテンツのMarkdown表現になります。`.html`で終わる場合、HTMLコンテンツそのものになります。
301
  ```bash
302
  scrapling extract get 'https://example.com' content.md
303
  scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # CSSセレクタ'#fromSkipToProducts'に一致するすべての要素
 
306
  ```
307
 
308
  > [!NOTE]
309
+ > MCPサーバーやインタラクティブWeb Scraping Shellなど、他にも多くの追加機能がありますが、このページは簡潔に保ちたいと思います。完全なドキュメントは[こちら](https://scrapling.readthedocs.io/en/latest/)をご覧ください
310
 
311
  ## パフォーマンスベンチマーク
312
 
313
+ Scraplingは強力であるだけでなく、高速です。以下のベンチマークは、Scraplingのパーサーを他の人気ライブラリの最新バージョンと比較しています。
314
 
315
  ### テキスト抽出速度テスト(5000個のネストされた要素)
316
 
317
+ | # | ライブラリ | 時間(ms) | vs Scrapling |
318
+ |---|:-----------------:|:---------:|:------------:|
319
+ | 1 | Scrapling | 2.02 | 1.0x |
320
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
321
+ | 3 | Raw Lxml | 2.54 | 1.257x |
322
+ | 4 | PyQuery | 24.17 | ~12x |
323
+ | 5 | Selectolax | 82.63 | ~41x |
324
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
325
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
326
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
327
 
328
 
329
  ### 要素類似性とテキスト検索のパフォーマンス
330
 
331
  Scraplingの適応型要素検索機能は代替手段を大幅に上回ります:
332
 
333
+ | ライブラリ | 時間(ms) | vs Scrapling |
334
+ |-------------|:---------:|:------------:|
335
+ | Scrapling | 2.39 | 1.0x |
336
+ | AutoScraper | 12.45 | 5.209x |
337
 
338
 
339
  > すべてのベンチマークは100回以上の実行の平均を表します。方法論については[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)を参照してください。
 
346
  pip install scrapling
347
  ```
348
 
349
+ このインストールにはパーサーエンジンとその依存関係のみが含まれており、Fetcherやコマンドライン依存関係は含まれていません。
350
 
351
  ### オプションの依存関係
352
 
353
+ 1. 以下の追加機能、Fetcher、またはそれらのクラスのいずれかを使用する場合は、Fetcherの依存関係とブラウザの依存関係を次のようにインストールする必要があります:
354
  ```bash
355
  pip install "scrapling[fetchers]"
356
+
357
  scrapling install
358
  ```
359
 
360
+ これにより、すべてのブラウザ、およびそれらのシステム依存関係とfingerprint操作依存関係がダウンロードされます。
361
 
362
  2. 追加機能:
363
  - MCPサーバー機能をインストール:
364
  ```bash
365
  pip install "scrapling[ai]"
366
  ```
367
+ - Shell機能(Web Scraping Shellと`extract`コマンド)をインストール:
368
  ```bash
369
  pip install "scrapling[shell]"
370
  ```
 
403
  このプロジェクトには次から適応されたコードが含まれています:
404
  - Parsel(BSDライセンス)— [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)サブモジュールに使用
405
 
 
 
 
 
 
 
 
406
  ---
407
+ <div align="center"><small>Karim Shoairによって❤️でデザインおよび作成されました。</small></div><br>
docs/README_RU.md CHANGED
@@ -1,9 +1,14 @@
1
- <p align=center>
2
- <br>
3
- <a href="https://scrapling.readthedocs.io/en/latest/" target="_blank"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png" style="width: 50%; height: 100%;" alt="main poster"/></a>
4
- <br>
5
- <i><code>Простой, легкий веб-скрапинг, каким он и должен быть!</code></i>
6
- </p>
 
 
 
 
 
7
  <p align="center">
8
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
9
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
@@ -24,46 +29,47 @@
24
  </p>
25
 
26
  <p align="center">
27
- <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
28
- Методы выбора
29
- </a>
30
- ·
31
- <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/">
32
- Выбор фетчера
33
- </a>
34
- ·
35
- <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
36
- CLI
37
- </a>
38
- ·
39
- <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
40
- Режим MCP
41
- </a>
42
- ·
43
- <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
44
- Миграция с Beautifulsoup
45
- </a>
46
  </p>
47
 
48
- **Прекратите бороться с анти-ботовыми системами. Прекратите переписывать селекторы после каждого обновления сайта.**
49
 
50
- Scrapling - это не просто очередная библиотека для веб-скрапинга. Это первая **адаптивная** библиотека для скрапинга, которая учится на изменениях сайтов и развивается вместе с ними. В то время как другие библиотеки ломаются, когда сайты обновляют свою структуру, Scrapling автоматически перемещает ваши элементы и поддерживает работу ваших скраперов.
51
 
52
- Созданный для современного веба, Scrapling имеет **собственный быстрый движок парсинга** и фетчеры для решения всех задач веб-скрапинга, с которыми вы сталкиваетесь или столкнетесь. Созданный веб-скраперами для веб-скраперов и обычных пользователей, здесь есть что-то для каждого.
53
 
54
  ```python
55
- >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
56
- >> StealthyFetcher.adaptive = True
57
- # Получайте исходный код сайтов незаметно!
58
- >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
59
- >> print(page.status)
60
- 200
61
- >> products = page.css('.product', auto_save=True) # Скрапьте данные, которые переживут изменения дизайна сайта!
62
- >> # Позже, если структура сайта изменится, передайте `adaptive=True`
63
- >> products = page.css('.product', adaptive=True) # и Scrapling все равно их найдет!
 
 
 
 
 
 
 
 
 
 
64
  ```
65
 
66
- # Спонсоры
 
67
 
68
  <!-- sponsors -->
69
 
@@ -87,138 +93,211 @@ Scrapling - это не просто очередная библиотека д
87
 
88
  ## Ключевые особенности
89
 
90
- ### Продвинутая загрузка сайтов с поддержкой сессий
91
- - **HTTP-запросы**: Быстрые и скрытные HTTP-запросы с классом `Fetcher`. Может имитировать TLS-отпечаток браузера, заголовки и использовать HTTP3.
 
 
 
 
 
 
 
 
 
92
  - **Динамическая загрузка**: Загрузка динамических сайтов с полной автоматизацией браузера через класс `DynamicFetcher`, поддерживающий Chromium от Playwright и Google Chrome.
93
- - **Обход анти-ботов**: Расширенные возможности скрытности с `StealthyFetcher` и подмену отпечатков. Может легко обойти все типы Turnstile/Interstitial от Cloudflare с помощью автоматизации.
94
  - **Управление сессиями**: Поддержка постоянных сессий с классами `FetcherSession`, `StealthySession` и `DynamicSession` для управления cookie и состоянием между запросами.
95
- - **Поддержка асинхронности**: Полная асинхронная поддержка во всех фетчерах и выделенных асинхронных классах сессий.
 
 
96
 
97
  ### Адаптивный скрапинг и интеграция с ИИ
98
  - 🔄 **Умное отслеживание элементов**: Перемещайте элементы после изменений сайта с помощью интеллектуальных алгоритмов подобия.
99
  - 🎯 **Умный гибкий выбор**: CSS-селекторы, XPath-селекторы, поиск на основе фильтров, текстовый поиск, поиск по регулярным выражениям и многое другое.
100
- - 🔍 **Поиск похожих элементов**: Автоматически находите элементы, похожие на найденные элементы.
101
- - 🤖 **MCP-сервер для использования с ИИ**: Встроенный MCP-сервер для веб-скрапинга с помощью ИИ и извлечения данных. MCP-сервер обладает мощными, пользовательскими возможностями, которые используют Scrapling для извлечения целевого контента перед передачей его ИИ (Claude/Cursor/и т.д.), тем самым ускоряя операции и снижая затраты за счет минимизации использования токенов. ([демо-видео](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
102
 
103
  ### Высокопроизводительная и проверенная в боях архитектура
104
- - 🚀 **Молниеносно быстро**: Оптимизированная производительность превосходит большинство библиотек скрапинга Python.
105
  - 🔋 **Эффективное использование памяти**: Оптимизированные структуры данных и ленивая загрузка для минимального потребления памяти.
106
- - ⚡ **Быстрая сериализация JSON**: В 10 раз быстрее, чем стандартная библиотека.
107
  - 🏗️ **Проверено в боях**: Scrapling имеет не только 92% покрытия тестами и полное покрытие type hints, но и ежедневно использовался сотнями веб-скраперов в течение последнего года.
108
 
109
  ### Удобный для разработчиков/веб-скраперов опыт
110
- - 🎯 **Интерактивная оболочка веб-скрапинга**: Опциональная встроенная оболочка IPython с интеграцией Scrapling, ярлыками и новыми инструментами для ускорения разработки скриптов веб-скрапинга, такими как преобразование curl-запросов в Scrapling-запросы и просмотр результатов запросов в вашем браузере.
111
  - 🚀 **Используйте прямо из терминала**: При желании вы можете использовать Scrapling для скрапинга URL без написания ни одной строки кода!
112
  - 🛠️ **Богатый API навигации**: Расширенный обход DOM с методами навигации по родителям, братьям и детям.
113
  - 🧬 **Улучшенная обработка текста**: Встроенные регулярные выражения, методы очистки и оптимизированные операции со строками.
114
- - 📝 **Автоматическая генерация селекторов**: Генерация надежных CSS/XPath селекторов для любого элемента.
115
  - 🔌 **Знакомый API**: Похож на Scrapy/BeautifulSoup с теми же псевдоэлементами, используемыми в Scrapy/Parsel.
116
- - 📘 **Полное покрытие типами**: Полные подсказки типов для отличной поддержки IDE и автодополнения кода.
117
- - 🔋 **Готовый Docker-образ**: С каждым релизом автоматически создается и отправляется Docker-образ, содержащий все браузеры.
118
 
119
  ## Начало работы
120
 
 
 
121
  ### Базовое использование
 
122
  ```python
123
- from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher
124
- from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
125
 
126
- # HTTP-запросы с поддержкой сессий
127
- with FetcherSession(impersonate='chrome') as session: # Используйте последнюю версию TLS-отпечатка Chrome
128
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
129
- quotes = page.css('.quote .text::text')
130
 
131
  # Или используйте одноразовые запросы
132
  page = Fetcher.get('https://quotes.toscrape.com/')
133
- quotes = page.css('.quote .text::text')
 
 
 
 
134
 
135
- # Расширенный режим скрытности (Держите браузер открытым до завершения)
136
- with StealthySession(headless=True, solve_cloudflare=True) as session:
137
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
138
- data = page.css('#padded_content a')
139
 
140
- # Или используйте стиль одноразового запроса, открывает браузер для этого запроса, затем закрывает его после завершения
141
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
142
- data = page.css('#padded_content a')
143
-
144
- # Полная автоматизация браузера (Держите браузер открытым до завершения)
145
- with DynamicSession(headless=True) as session:
146
- page = session.fetch('https://quotes.toscrape.com/', network_idle=True)
147
- quotes = page.css('.quote .text::text')
148
-
149
- # Или используйте стиль одноразового запроса
150
- page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True)
151
- quotes = page.css('.quote .text::text')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  ```
 
 
 
 
 
153
 
154
- ### Выбор элементов
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  ```python
156
- # CSS-селекторы
157
- page.css('a::text') # Извлечь текст
158
- page.css('a::attr(href)') # Извлечь атрибуты
159
- page.css('a', recursive=False) # Только прямые элементы
160
- page.css('a', auto_save=True) # Автоматически сохранять позиции элементов
161
-
162
- # XPath
163
- page.xpath('//a/text()')
164
-
165
- # Гибкий поиск
166
- page.find_by_text('Python', first_match=True) # Найти по тексту
167
- page.find_by_regex(r'\d{4}') # Найти по паттерну regex
168
- page.find('div', {'class': 'container'}) # Найти по атрибутам
169
-
170
- # Навигация
171
- element.parent # Получить родительский элемент
172
- element.next_sibling # Получить следующего брата
173
- element.children # Получить дочерние элементы
174
-
175
- # Похожие элементы
176
- similar = page.get_similar(element) # Найти похожие элементы
177
-
178
- # Адаптивный скрапинг
179
- saved_elements = page.css('.product', auto_save=True)
180
- # Позже, когда сайт изменится:
181
- page.css('.product', adaptive=True) # Найти элементы используя сохраненные позиции
182
  ```
 
183
 
184
- ### Использование сессий
185
  ```python
186
- from scrapling.fetchers import FetcherSession, AsyncFetcherSession
187
-
188
- # Синхронная сессия
189
- with FetcherSession() as session:
190
- # Cookie автоматически сохраняются
191
- page1 = session.get('https://quotes.toscrape.com/login')
192
- page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'})
193
-
194
- # При необходимости переключите отпечаток браузера
195
  page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
196
 
197
- # Использование асинхронной сессии
198
  async with AsyncStealthySession(max_pages=2) as session:
199
  tasks = []
200
  urls = ['https://example.com/page1', 'https://example.com/page2']
201
-
202
  for url in urls:
203
  task = session.fetch(url)
204
  tasks.append(task)
205
-
206
- print(session.get_pool_stats()) # Опционально - Статус пула вкладок браузера (занят/свободен/ошибка)
207
  results = await asyncio.gather(*tasks)
208
  print(session.get_pool_stats())
209
  ```
210
 
211
- ## CLI и интерактивная оболочка
212
 
213
- Scrapling v0.3 включает мощный интерфейс командной строки:
214
 
215
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
216
 
217
- Запустить интерактивную оболочку веб-скрапинга
218
  ```bash
219
  scrapling shell
220
  ```
221
- Извлечь страницы в файл напрямую без программирования (Извлекает содержимое внутри тега `body` по умолчанию). Если выходной файл заканчивается на `.txt`, то будет извлечено текстовое содержимое цели. Если заканчивается на `.md`, это будет Markdown-представление HTML-содержимого; если заканчивается на `.html`, это будет само HTML-содержимое.
222
  ```bash
223
  scrapling extract get 'https://example.com' content.md
224
  scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Все элементы, соответствующие CSS-селектору '#fromSkipToProducts'
@@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
227
  ```
228
 
229
  > [!NOTE]
230
- > Есть много дополнительных функций, но мы хотим сохранить эту страницу краткой, например, MCP-сервер и интерактивная оболочка веб-скрапинга. Ознакомьтесь с полной документацией [здесь](https://scrapling.readthedocs.io/en/latest/)
231
 
232
  ## Тесты производительности
233
 
234
- Scrapling не только мощный - он также невероятно быстрый, и обновления с версии 0.3 обеспечили исключительные улучшения производительности во всех операциях. Следующие тесты производительности сравнивают парсер Scrapling с другими популярными библиотеками.
235
 
236
  ### Тест скорости извлечения текста (5000 вложенных элементов)
237
 
238
- | # | Библиотека | Время (мс) | vs Scrapling |
239
  |---|:-----------------:|:----------:|:------------:|
240
- | 1 | Scrapling | 1.99 | 1.0x |
241
- | 2 | Parsel/Scrapy | 2.01 | 1.01x |
242
- | 3 | Raw Lxml | 2.5 | 1.256x |
243
- | 4 | PyQuery | 22.93 | ~11.5x |
244
- | 5 | Selectolax | 80.57 | ~40.5x |
245
- | 6 | BS4 with Lxml | 1541.37 | ~774.6x |
246
- | 7 | MechanicalSoup | 1547.35 | ~777.6x |
247
- | 8 | BS4 with html5lib | 3410.58 | ~1713.9x |
248
 
249
 
250
  ### Производительность подобия элементов и текстового поиска
@@ -253,8 +332,8 @@ Scrapling не только мощный - он также невероятно
253
 
254
  | Библиотека | Время (мс) | vs Scrapling |
255
  |-------------|:----------:|:------------:|
256
- | Scrapling | 2.46 | 1.0x |
257
- | AutoScraper | 13.3 | 5.407x |
258
 
259
 
260
  > Все тесты производительности представляют собой средние значения более 100 запусков. См. [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) для методологии.
@@ -267,33 +346,33 @@ Scrapling требует Python 3.10 или выше:
267
  pip install scrapling
268
  ```
269
 
270
- Начиная с v0.3.2, эта установка включает только движок парсера и его зависимости, без каких-либо фетчеров или зависимостей командной строки.
271
 
272
  ### Опциональные зависимости
273
 
274
- 1. Если вы собираетесь использовать какие-либо из дополнительных функций ниже, фетчеры или их классы, вам необходимо установить зависимости фетчеров и их зависимости браузера следующим образом:
275
  ```bash
276
  pip install "scrapling[fetchers]"
277
-
278
  scrapling install
279
  ```
280
 
281
- Это загрузит все браузеры вместе с их системными зависимостями и зависимостями манипуляции отпечатками.
282
 
283
- 2. Дополнительные функции:
284
  - Установить функцию MCP-сервера:
285
  ```bash
286
  pip install "scrapling[ai]"
287
  ```
288
- - Установить функции оболочки (оболочка веб-скрапинга и команда `extract`):
289
  ```bash
290
  pip install "scrapling[shell]"
291
  ```
292
- - Установить все:
293
  ```bash
294
  pip install "scrapling[all]"
295
  ```
296
- Помните, что вам нужно установить зависимости браузера с помощью `scrapling install` после любого из этих дополнений (если вы еще этого не сделали)
297
 
298
  ### Docker
299
  Вы также можете установить Docker-образ со всеми дополнениями и браузерами с помощью следующей команды из DockerHub:
@@ -304,11 +383,11 @@ docker pull pyd4vinci/scrapling
304
  ```bash
305
  docker pull ghcr.io/d4vinci/scrapling:latest
306
  ```
307
- Этот образ автоматически создается и отправляется с использованием GitHub Actions и основной ветки репозитория.
308
 
309
- ## Вклад
310
 
311
- Мы приветствуем вклад! Пожалуйста, прочитайте наши [руководства по внесению вклада](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) перед началом работы.
312
 
313
  ## Отказ от ответственности
314
 
@@ -324,12 +403,5 @@ docker pull ghcr.io/d4vinci/scrapling:latest
324
  Этот проект включает код, адаптированный из:
325
  - Parsel (лицензия BSD) — Используется для подмодуля [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
326
 
327
- ## Благодарности и ссылки
328
-
329
- - Блестящая работа [Daijro](https://github.com/daijro) над [BrowserForge](https://github.com/daijro/browserforge) и [Camoufox](https://github.com/daijro/camoufox)
330
- - Блестящая работа [Vinyzu](https://github.com/Vinyzu) над [Botright](https://github.com/Vinyzu/Botright) и [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
331
- - [brotector](https://github.com/kaliiiiiiiiii/brotector) за техники обхода обнаружения браузера
332
- [fakebrowser](https://github.com/kkoooqq/fakebrowser) и [BotBrowser](https://github.com/botswin/BotBrowser) за исследование отпечатков
333
-
334
  ---
335
- <div align="center"><small>Разработано и создано с ❤️ Карим Шоаир.</small></div><br>
 
1
+ <h1 align="center">
2
+ <a href="https://scrapling.readthedocs.io">
3
+ <picture>
4
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
5
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
6
+ </picture>
7
+ </a>
8
+ <br>
9
+ <small>Effortless Web Scraping for the Modern Web</small>
10
+ </h1>
11
+
12
  <p align="center">
13
  <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
14
  <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
 
29
  </p>
30
 
31
  <p align="center">
32
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Методы выбора</strong></a>
33
+ &middot;
34
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Выбор Fetcher</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>Режим MCP</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>Миграция с Beautifulsoup</strong></a>
 
 
 
 
 
 
 
 
 
 
41
  </p>
42
 
43
+ Scrapling — это адаптивный фреймворк для Web Scraping, который берёт на себя всё: от одного запроса до полномасштабного обхода сайтов.
44
 
45
+ Его парсер учится на изменениях сайтов и автоматически перемещает ваши элементы при обновлении страниц. Его Fetcher'ы обходят анти-бот системы вроде Cloudflare Turnstile прямо из коробки. А его Spider-фреймворк позволяет масштабироваться до параллельных, многосессионных обходов с Pause & Resume и автоматической ротацией Proxy — и всё это в нескольких строках Python. Одна библиотека, без компромиссов.
46
 
47
+ Молниеносно быстрые обходы с отслеживанием статистики в реальном времени и Streaming. Создано веб-скраперами для веб-скраперов и обычных пользователей — здесь есть что-то для каждого.
48
 
49
  ```python
50
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
51
+ StealthyFetcher.adaptive = True
52
+ page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Загрузите сайт незаметно!
53
+ products = page.css('.product', auto_save=True) # Скрапьте данные, которые переживут изменения дизайна сайта!
54
+ products = page.css('.product', adaptive=True) # Позже, если структура сайта изменится, передайте `adaptive=True`, чтобы найти их!
55
+ ```
56
+ Или масштабируйте до полного обхода
57
+ ```python
58
+ from scrapling.spiders import Spider, Response
59
+
60
+ class MySpider(Spider):
61
+ name = "demo"
62
+ start_urls = ["https://example.com/"]
63
+
64
+ async def parse(self, response: Response):
65
+ for item in response.css('.product'):
66
+ yield {"title": item.css('h2::text').get()}
67
+
68
+ MySpider().start()
69
  ```
70
 
71
+
72
+ # Спонсоры
73
 
74
  <!-- sponsors -->
75
 
 
93
 
94
  ## Ключевые особенности
95
 
96
+ ### Spider'ы — полноценный фреймворк для обхода сайтов
97
+ - 🕷️ **Scrapy-подобный Spider API**: Определяйте Spider'ов с `start_urls`, async `parse` callback'ами и объектами `Request`/`Response`.
98
+ - ⚡ **Параллельный обход**: Настраиваемые лимиты параллелизма, ограничение скорости по домену и задержки загрузки.
99
+ - 🔄 **Поддержка нескольких сессий**: Единый интерфейс для HTTP-запросов и скрытных headless-браузеров в одном Spider — маршрутизируйте запросы к разным сессиям по ID.
100
+ - 💾 **Pause & Resume**: Persistence обхода на основе Checkpoint'ов. Нажмите Ctrl+C для мягкой остановки; перезапустите, чтобы продолжить с того места, где вы остановились.
101
+ - 📡 **Режим Streaming**: Стримьте извлечённые элементы по мере их поступления через `async for item in spider.stream()` со статистикой в реальном времени — идеально для UI, конвейеров и длительных обходов.
102
+ - 🛡️ **Обнаружение заблокированных запросов**: Автоматическое обнаружение и повторная отправка заблокированных запросов с настраиваемой логикой.
103
+ - 📦 **Встроенный экспорт**: Экспортируйте результаты через хуки и собственный конвейер или встроенный JSON/JSONL с `result.items.to_json()` / `result.items.to_jsonl()` соответственно.
104
+
105
+ ### Продвинутая загрузка сайтов с поддержкой Session
106
+ - **HTTP-запросы**: Быстрые и скрытные HTTP-запросы с классом `Fetcher`. Может имитировать TLS fingerprint браузера, заголовки и использовать HTTP/3.
107
  - **Динамическая загрузка**: Загрузка динамических сайтов с полной автоматизацией браузера через класс `DynamicFetcher`, поддерживающий Chromium от Playwright и Google Chrome.
108
+ - **Обход анти-ботов**: Расширенные возможности скрытности с `StealthyFetcher` и подмену fingerprint'ов. Может легко обойти все типы Cloudflare Turnstile/Interstitial с помощью автоматизации.
109
  - **Управление сессиями**: Поддержка постоянных сессий с классами `FetcherSession`, `StealthySession` и `DynamicSession` для управления cookie и состоянием между запросами.
110
+ - **Ротация Proxy**: Встроенный `ProxyRotator` с циклической или пользовательскими стратегиями для всех типов сессий, а также переопределение Proxy для каждого запроса.
111
+ - **Блокировка доменов**: Блокируйте запросы к определённым доменам (и их поддоменам) в браузерных Fetcher'ах.
112
+ - **Поддержка async**: Полная async-поддержка во всех Fetcher'ах и выделенных async-классах сессий.
113
 
114
  ### Адаптивный скрапинг и интеграция с ИИ
115
  - 🔄 **Умное отслеживание элементов**: Перемещайте элементы после изменений сайта с помощью интеллектуальных алгоритмов подобия.
116
  - 🎯 **Умный гибкий выбор**: CSS-селекторы, XPath-селекторы, поиск на основе фильтров, текстовый поиск, поиск по регулярным выражениям и многое другое.
117
+ - 🔍 **Поиск похожих элементов**: Автоматически находите элементы, похожие на найденные.
118
+ - 🤖 **MCP-сервер для использования с ИИ**: Встроенный MCP-сервер для Web Scraping с помощью ИИ и извлечения данных. MCP-сервер обладает мощными пользовательскими возможностями, которые используют Scrapling для извлечения целевого контента перед передачей его ИИ (Claude/Cursor/и т.д.), тем самым ускоряя операции и снижая затраты за счёт минимизации использования токенов. ([демо-видео](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
119
 
120
  ### Высокопроизводительная и проверенная в боях архитектура
121
+ - 🚀 **Молниеносная скорость**: Оптимизированная производительность, превосходящая большинство Python-библиотек для скрапинга.
122
  - 🔋 **Эффективное использование памяти**: Оптимизированные структуры данных и ленивая загрузка для минимального потребления памяти.
123
+ - ⚡ **Быстрая сериализация JSON**: В 10 раз быстрее стандартной библиотеки.
124
  - 🏗️ **Проверено в боях**: Scrapling имеет не только 92% покрытия тестами и полное покрытие type hints, но и ежедневно использовался сотнями веб-скраперов в течение последнего года.
125
 
126
  ### Удобный для разработчиков/веб-скраперов опыт
127
+ - 🎯 **Интерактивная Web Scraping Shell**: Опциональная встроенная IPython-оболочка с интеграцией Scrapling, ярлыками и новыми инструментами для ускорения разработки скриптов Web Scraping, такими как преобразование curl-запросов в запросы Scrapling и просмотр результатов запросов в браузере.
128
  - 🚀 **Используйте прямо из терминала**: При желании вы можете использовать Scrapling для скрапинга URL без написания ни одной строки кода!
129
  - 🛠️ **Богатый API навигации**: Расширенный обход DOM с методами навигации по родителям, братьям и детям.
130
  - 🧬 **Улучшенная обработка текста**: Встроенные регулярные выражения, методы очистки и оптимизированные операции со строками.
131
+ - 📝 **Автоматическая генерация селекторов**: Генерация надёжных CSS/XPath-селекторов для любого элемента.
132
  - 🔌 **Знакомый API**: Похож на Scrapy/BeautifulSoup с теми же псевдоэлементами, используемыми в Scrapy/Parsel.
133
+ - 📘 **Полное покрытие типами**: Полные type hints для отличной поддержки IDE и автодополнения кода. Вся кодовая база автоматически проверяется **PyRight** и **MyPy** при каждом изменении.
134
+ - 🔋 **Готовый Docker-образ**: С каждым релизом автоматически создаётся и публикуется Docker-образ, содержащий все браузеры.
135
 
136
  ## Начало работы
137
 
138
+ Давайте кратко покажем, на что способен Scrapling, без глубокого погружения.
139
+
140
  ### Базовое использование
141
+ HTTP-запросы с поддержкой Session
142
  ```python
143
+ from scrapling.fetchers import Fetcher, FetcherSession
 
144
 
145
+ with FetcherSession(impersonate='chrome') as session: # Используйте последнюю версию TLS fingerprint Chrome
 
146
  page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
147
+ quotes = page.css('.quote .text::text').getall()
148
 
149
  # Или используйте одноразовые запросы
150
  page = Fetcher.get('https://quotes.toscrape.com/')
151
+ quotes = page.css('.quote .text::text').getall()
152
+ ```
153
+ Расширенный режим скрытности
154
+ ```python
155
+ from scrapling.fetchers import StealthyFetcher, StealthySession
156
 
157
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # Держите браузер открытым, пока не закончите
 
158
  page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
159
+ data = page.css('#padded_content a').getall()
160
 
161
+ # Или используйте стиль одноразового запроса — открывает браузер для этого запроса, затем закрывает его после завершения
162
  page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
163
+ data = page.css('#padded_content a').getall()
164
+ ```
165
+ Полная автоматизация браузера
166
+ ```python
167
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
168
+
169
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Держите браузер открытым, пока не закончите
170
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
171
+ data = page.xpath('//span[@class="text"]/text()').getall() # XPath-селектор, если вы предпочитаете его
172
+
173
+ # Или используйте стиль одноразового запроса — открывает браузер для этого запроса, затем закрывает его после завершения
174
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
175
+ data = page.css('.quote .text::text').getall()
176
+ ```
177
+
178
+ ### Spider'ы
179
+ Создавайте полноценные обходчики с параллельными запросами, несколькими типами сессий и Pause & Resume:
180
+ ```python
181
+ from scrapling.spiders import Spider, Request, Response
182
+
183
+ class QuotesSpider(Spider):
184
+ name = "quotes"
185
+ start_urls = ["https://quotes.toscrape.com/"]
186
+ concurrent_requests = 10
187
+
188
+ async def parse(self, response: Response):
189
+ for quote in response.css('.quote'):
190
+ yield {
191
+ "text": quote.css('.text::text').get(),
192
+ "author": quote.css('.author::text').get(),
193
+ }
194
+
195
+ next_page = response.css('.next a')
196
+ if next_page:
197
+ yield response.follow(next_page[0].attrib['href'])
198
+
199
+ result = QuotesSpider().start()
200
+ print(f"Извлечено {len(result.items)} цитат")
201
+ result.items.to_json("quotes.json")
202
+ ```
203
+ Используйте несколько типов сессий в одном Spider:
204
+ ```python
205
+ from scrapling.spiders import Spider, Request, Response
206
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
207
+
208
+ class MultiSessionSpider(Spider):
209
+ name = "multi"
210
+ start_urls = ["https://example.com/"]
211
+
212
+ def configure_sessions(self, manager):
213
+ manager.add("fast", FetcherSession(impersonate="chrome"))
214
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
215
+
216
+ async def parse(self, response: Response):
217
+ for link in response.css('a::attr(href)').getall():
218
+ # Направляйте защищённые страницы через stealth-сессию
219
+ if "protected" in link:
220
+ yield Request(link, sid="stealth")
221
+ else:
222
+ yield Request(link, sid="fast", callback=self.parse) # явный callback
223
+ ```
224
+ Приостанавливайте и возобновляйте длительные обходы с помощью Checkpoint'ов, запуская Spider следующим образом:
225
+ ```python
226
+ QuotesSpider(crawldir="./crawl_data").start()
227
  ```
228
+ Нажмите Ctrl+C для мягкой остановки — прогресс сохраняется автоматически. Позже, когда вы снова запустите Spider, передайте тот же `crawldir`, и он продолжит с того места, где остановился.
229
+
230
+ ### Продвинутый парсинг и навигация
231
+ ```python
232
+ from scrapling.fetchers import Fetcher
233
 
234
+ # Богатый выбор элементов и навигация
235
+ page = Fetcher.get('https://quotes.toscrape.com/')
236
+
237
+ # Получение цитат различными методами выбора
238
+ quotes = page.css('.quote') # CSS-селектор
239
+ quotes = page.xpath('//div[@class="quote"]') # XPath
240
+ quotes = page.find_all('div', {'class': 'quote'}) # В стиле BeautifulSoup
241
+ # То же самое, что
242
+ quotes = page.find_all('div', class_='quote')
243
+ quotes = page.find_all(['div'], class_='quote')
244
+ quotes = page.find_all(class_='quote') # и так далее...
245
+ # Найти элемент по текстовому содержимому
246
+ quotes = page.find_by_text('quote', tag='div')
247
+
248
+ # Продвинутая навигация
249
+ quote_text = page.css('.quote')[0].css('.text::text').get()
250
+ quote_text = page.css('.quote').css('.text::text').getall() # Цепочка селекторов
251
+ first_quote = page.css('.quote')[0]
252
+ author = first_quote.next_sibling.css('.author::text')
253
+ parent_container = first_quote.parent
254
+
255
+ # Связи элементов и подобие
256
+ similar_elements = first_quote.find_similar()
257
+ below_elements = first_quote.below_elements()
258
+ ```
259
+ Вы можете использовать парсер напрямую, если не хотите загружать сайты, как показано ниже:
260
  ```python
261
+ from scrapling.parser import Selector
262
+
263
+ page = Selector("<html>...</html>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  ```
265
+ И он работает точно так же!
266
 
267
+ ### Примеры async Session
268
  ```python
269
+ import asyncio
270
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
271
+
272
+ async with FetcherSession(http3=True) as session: # `FetcherSession` контекстно-осведомлён и может работать как в sync, так и в async-режимах
273
+ page1 = session.get('https://quotes.toscrape.com/')
 
 
 
 
274
  page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
275
 
276
+ # Использование async-сессии
277
  async with AsyncStealthySession(max_pages=2) as session:
278
  tasks = []
279
  urls = ['https://example.com/page1', 'https://example.com/page2']
280
+
281
  for url in urls:
282
  task = session.fetch(url)
283
  tasks.append(task)
284
+
285
+ print(session.get_pool_stats()) # Опционально статус пула вкладок браузера (занят/свободен/ошибка)
286
  results = await asyncio.gather(*tasks)
287
  print(session.get_pool_stats())
288
  ```
289
 
290
+ ## CLI и интерактивная Shell
291
 
292
+ Scrapling включает мощный интерфейс командной строки:
293
 
294
  [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
295
 
296
+ Запустить интерактивную Web Scraping Shell
297
  ```bash
298
  scrapling shell
299
  ```
300
+ Извлечь страницы в файл напрямую без программирования (по умолчанию извлекает содержимое внутри тега `body`). Если выходной файл заканчивается на `.txt`, будет извлечено текстовое содержимое цели. Если заканчивается на `.md`, это будет Markdown-представление HTML-содержимого; если заканчивается на `.html`, это будет само HTML-содержимое.
301
  ```bash
302
  scrapling extract get 'https://example.com' content.md
303
  scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Все элементы, соответствующие CSS-селектору '#fromSkipToProducts'
 
306
  ```
307
 
308
  > [!NOTE]
309
+ > Есть множество дополнительных возможностей, но мы хотим сохранить эту страницу краткой, включая MCP-сервер и интерактивную Web Scraping Shell. Ознакомьтесь с полной документацией [здесь](https://scrapling.readthedocs.io/en/latest/)
310
 
311
  ## Тесты производительности
312
 
313
+ Scrapling не только мощный — он ещё и невероятно быстрый. Следующие тесты производительности сравнивают парсер Scrapling с последними версиями других популярных библиотек.
314
 
315
  ### Тест скорости извлечения текста (5000 вложенных элементов)
316
 
317
+ | # | Библиотека | Время (мс) | vs Scrapling |
318
  |---|:-----------------:|:----------:|:------------:|
319
+ | 1 | Scrapling | 2.02 | 1.0x |
320
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
321
+ | 3 | Raw Lxml | 2.54 | 1.257x |
322
+ | 4 | PyQuery | 24.17 | ~12x |
323
+ | 5 | Selectolax | 82.63 | ~41x |
324
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
325
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
326
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
327
 
328
 
329
  ### Производительность подобия элементов и текстового поиска
 
332
 
333
  | Библиотека | Время (мс) | vs Scrapling |
334
  |-------------|:----------:|:------------:|
335
+ | Scrapling | 2.39 | 1.0x |
336
+ | AutoScraper | 12.45 | 5.209x |
337
 
338
 
339
  > Все тесты производительности представляют собой средние значения более 100 запусков. См. [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) для методологии.
 
346
  pip install scrapling
347
  ```
348
 
349
+ Эта установка включает только движок парсера и его зависимости, без каких-либо Fetcher'ов или зависимостей командной строки.
350
 
351
  ### Опциональные зависимости
352
 
353
+ 1. Если вы собираетесь использовать какие-либо из дополнительных возможностей ниже, Fetcher'ы или их классы, вам необходимо установить зависимости Fetcher'ов и браузеров следующим образом:
354
  ```bash
355
  pip install "scrapling[fetchers]"
356
+
357
  scrapling install
358
  ```
359
 
360
+ Это загрузит все браузеры вместе с их системными зависимостями и зависимостями для манипуляции fingerprint'ами.
361
 
362
+ 2. Дополнительные возможности:
363
  - Установить функцию MCP-сервера:
364
  ```bash
365
  pip install "scrapling[ai]"
366
  ```
367
+ - Установить функции Shell (Web Scraping Shell и команда `extract`):
368
  ```bash
369
  pip install "scrapling[shell]"
370
  ```
371
+ - Установить всё:
372
  ```bash
373
  pip install "scrapling[all]"
374
  ```
375
+ Помните, что вам нужно установить зависимости браузеров с помощью `scrapling install` после любого из этих дополнений (если вы ещё этого не сделали)
376
 
377
  ### Docker
378
  Вы также можете установить Docker-образ со всеми дополнениями и браузерами с помощью следующей команды из DockerHub:
 
383
  ```bash
384
  docker pull ghcr.io/d4vinci/scrapling:latest
385
  ```
386
+ Этот образ автоматически создаётся и публикуется с помощью GitHub Actions и основной ветки репозитория.
387
 
388
+ ## Участие в разработке
389
 
390
+ Мы приветствуем участие! Пожалуйста, прочитайте наши [руководства по участию в разработке](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) перед началом работы.
391
 
392
  ## Отказ от ответственности
393
 
 
403
  Этот проект включает код, адаптированный из:
404
  - Parsel (лицензия BSD) — Используется для подмодуля [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
405
 
 
 
 
 
 
 
 
406
  ---
407
+ <div align="center"><small>Разработано и создано с ❤️ Карим Шоаир.</small></div><br>
docs/ai/mcp-server.md CHANGED
@@ -179,7 +179,7 @@ We will gradually go from simple prompts to more complex ones. We will use Claud
179
  ```
180
  Use regular requests to scrape the main content from https://example.com and convert it to markdown format.
181
  ```
182
- This tells Claude which tool to use here, so it doesn't have to guess. Sometimes it will start using normal requests on its own, and at other times, it will assume browsers are better suited for this website without any apparent reason. As a general rule of thumb, you should always tell Claude which tool to use if you want to save time and money and get consistent results.
183
 
184
  2. **Targeted Data Extraction**
185
 
@@ -189,7 +189,7 @@ We will gradually go from simple prompts to more complex ones. We will use Claud
189
  Get all product titles from https://shop.example.com using the CSS selector '.product-title'. If the request fails, retry up to 5 times every 10 seconds.
190
  ```
191
 
192
- The server will extract only the elements matching your selector and return them as a structured list. Notice I told it to set the tool to try only 3 times in case the website has connection issues, but the default setting should be fine for most cases.
193
 
194
  3. **E-commerce Data Collection**
195
 
 
179
  ```
180
  Use regular requests to scrape the main content from https://example.com and convert it to markdown format.
181
  ```
182
+ This tells Claude which tool to use here, so it doesn't have to guess. Sometimes it will start using normal requests on its own, and at other times, it will assume browsers are better suited for this website without any apparent reason. As a rule of thumb, you should always tell Claude which tool to use to save time and money and get consistent results.
183
 
184
  2. **Targeted Data Extraction**
185
 
 
189
  Get all product titles from https://shop.example.com using the CSS selector '.product-title'. If the request fails, retry up to 5 times every 10 seconds.
190
  ```
191
 
192
+ The server will extract only the elements matching your selector and return them as a structured list. Notice I told it to set the tool to try up to 5 times in case the website has connection issues, but the default setting should be fine for most cases.
193
 
194
  3. **E-commerce Data Collection**
195
 
docs/api-reference/mcp-server.md CHANGED
@@ -19,7 +19,7 @@ Or import the server class directly:
19
  from scrapling.core.ai import ScraplingMCPServer
20
 
21
  server = ScraplingMCPServer()
22
- server.serve()
23
  ```
24
 
25
  ## Response Model
 
19
  from scrapling.core.ai import ScraplingMCPServer
20
 
21
  server = ScraplingMCPServer()
22
+ server.serve(http=False, host="0.0.0.0", port=8000)
23
  ```
24
 
25
  ## Response Model
docs/api-reference/proxy-rotation.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ search:
3
+ exclude: true
4
+ ---
5
+
6
+ # Proxy Rotation
7
+
8
+ The `ProxyRotator` class provides thread-safe proxy rotation for any fetcher or session.
9
+
10
+ You can import it directly like below:
11
+
12
+ ```python
13
+ from scrapling.fetchers import ProxyRotator
14
+ ```
15
+
16
+ ## ::: scrapling.engines.toolbelt.proxy_rotation.ProxyRotator
17
+ handler: python
18
+ :docstring:
docs/api-reference/response.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ search:
3
+ exclude: true
4
+ ---
5
+
6
+ # Response Class
7
+
8
+ The `Response` class wraps HTTP responses returned by all fetchers, providing access to status, headers, body, cookies, and a `Selector` for parsing.
9
+
10
+ You can import the `Response` class like below:
11
+
12
+ ```python
13
+ from scrapling.engines.toolbelt.custom import Response
14
+ ```
15
+
16
+ ## ::: scrapling.engines.toolbelt.custom.Response
17
+ handler: python
18
+ :docstring:
docs/api-reference/spiders.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ search:
3
+ exclude: true
4
+ ---
5
+
6
+ # Spider Classes
7
+
8
+ Here's the reference information for the spider framework classes' parameters, attributes, and methods.
9
+
10
+ You can import them directly like below:
11
+
12
+ ```python
13
+ from scrapling.spiders import Spider, Request, CrawlResult, SessionManager, Response
14
+ ```
15
+
16
+ ## ::: scrapling.spiders.Spider
17
+ handler: python
18
+ :docstring:
19
+
20
+ ## ::: scrapling.spiders.Request
21
+ handler: python
22
+ :docstring:
23
+
24
+ ## Result Classes
25
+
26
+ ## ::: scrapling.spiders.result.CrawlResult
27
+ handler: python
28
+ :docstring:
29
+
30
+ ## ::: scrapling.spiders.result.CrawlStats
31
+ handler: python
32
+ :docstring:
33
+
34
+ ## ::: scrapling.spiders.result.ItemList
35
+ handler: python
36
+ :docstring:
37
+
38
+ ## Session Management
39
+
40
+ ## ::: scrapling.spiders.session.SessionManager
41
+ handler: python
42
+ :docstring:
docs/benchmarks.md CHANGED
@@ -1,21 +1,20 @@
1
  # Performance Benchmarks
2
 
3
- Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 have delivered exceptional performance improvements across all operations. The following benchmarks compare Scrapling's parser with other popular libraries.
4
-
5
- ## Benchmark Results
6
 
7
  ### Text Extraction Speed Test (5000 nested elements)
8
 
9
  | # | Library | Time (ms) | vs Scrapling |
10
  |---|:-----------------:|:---------:|:------------:|
11
- | 1 | Scrapling | 1.99 | 1.0x |
12
- | 2 | Parsel/Scrapy | 2.01 | 1.01x |
13
- | 3 | Raw Lxml | 2.5 | 1.256x |
14
- | 4 | PyQuery | 22.93 | ~11.5x |
15
- | 5 | Selectolax | 80.57 | ~40.5x |
16
- | 6 | BS4 with Lxml | 1541.37 | ~774.6x |
17
- | 7 | MechanicalSoup | 1547.35 | ~777.6x |
18
- | 8 | BS4 with html5lib | 3410.58 | ~1713.9x |
 
19
 
20
  ### Element Similarity & Text Search Performance
21
 
@@ -23,5 +22,7 @@ Scrapling's adaptive element finding capabilities significantly outperform alter
23
 
24
  | Library | Time (ms) | vs Scrapling |
25
  |-------------|:---------:|:------------:|
26
- | Scrapling | 2.46 | 1.0x |
27
- | AutoScraper | 13.3 | 5.407x |
 
 
 
1
  # Performance Benchmarks
2
 
3
+ Scrapling isn't just powerful—it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries.
 
 
4
 
5
  ### Text Extraction Speed Test (5000 nested elements)
6
 
7
  | # | Library | Time (ms) | vs Scrapling |
8
  |---|:-----------------:|:---------:|:------------:|
9
+ | 1 | Scrapling | 2.02 | 1.0x |
10
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
11
+ | 3 | Raw Lxml | 2.54 | 1.257x |
12
+ | 4 | PyQuery | 24.17 | ~12x |
13
+ | 5 | Selectolax | 82.63 | ~41x |
14
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
15
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
16
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
17
+
18
 
19
  ### Element Similarity & Text Search Performance
20
 
 
22
 
23
  | Library | Time (ms) | vs Scrapling |
24
  |-------------|:---------:|:------------:|
25
+ | Scrapling | 2.39 | 1.0x |
26
+ | AutoScraper | 12.45 | 5.209x |
27
+
28
+ > All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
docs/cli/extract-commands.md CHANGED
@@ -4,12 +4,12 @@
4
 
5
  The `scrapling extract` command lets you download and extract content from websites directly from your terminal without writing any code. Ideal for beginners, researchers, and anyone requiring rapid web data extraction.
6
 
7
- > 💡 **Prerequisites:**
8
- >
9
- > 1. Youve completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
10
- > 2. Youve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
11
- > 3. Youve completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
12
- > 4. Youve completed or read at least one page from the fetchers section to use here for requests: [HTTP requests](../fetching/static.md), [Dynamic websites](../fetching/dynamic.md), or [Dynamic websites with hard protections](../fetching/stealthy.md).
13
 
14
 
15
  ## What is the Extract Command group?
@@ -280,7 +280,7 @@ We will go through each command in detail below.
280
  -s, --css-selector TEXT CSS selector to extract specific content from the page. It returns all matches.
281
  --wait-selector TEXT CSS selector to wait for before proceeding
282
  --locale TEXT Specify user locale. Defaults to the system default locale.
283
- ---real-chrome/--no-real-chrome If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)
284
  --proxy TEXT Proxy URL in format "http://username:password@host:port"
285
  -H, --extra-headers TEXT Extra headers in format "Key: Value" (can be used multiple times)
286
  --help Show this message and exit.
@@ -320,8 +320,7 @@ We will go through each command in detail below.
320
  --solve-cloudflare / --no-solve-cloudflare Solve Cloudflare challenges (default: False)
321
  --allow-webgl / --block-webgl Allow WebGL (default: True)
322
  --network-idle / --no-network-idle Wait for network idle (default: False)
323
- ---real-chrome/--no-real-chrom If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)
324
- --hide-canvas/--show-canvas Add noise to canvas operations (default: False)
325
  --timeout INTEGER Timeout in milliseconds (default: 30000)
326
  --wait INTEGER Additional wait time in milliseconds after page load (default: 0)
327
  -s, --css-selector TEXT CSS selector to extract specific content from the page. It returns all matches.
 
4
 
5
  The `scrapling extract` command lets you download and extract content from websites directly from your terminal without writing any code. Ideal for beginners, researchers, and anyone requiring rapid web data extraction.
6
 
7
+ !!! success "Prerequisites"
8
+
9
+ 1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
10
+ 2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
11
+ 3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
12
+ 4. You've completed or read at least one page from the fetchers section to use here for requests: [HTTP requests](../fetching/static.md), [Dynamic websites](../fetching/dynamic.md), or [Dynamic websites with hard protections](../fetching/stealthy.md).
13
 
14
 
15
  ## What is the Extract Command group?
 
280
  -s, --css-selector TEXT CSS selector to extract specific content from the page. It returns all matches.
281
  --wait-selector TEXT CSS selector to wait for before proceeding
282
  --locale TEXT Specify user locale. Defaults to the system default locale.
283
+ --real-chrome/--no-real-chrome If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)
284
  --proxy TEXT Proxy URL in format "http://username:password@host:port"
285
  -H, --extra-headers TEXT Extra headers in format "Key: Value" (can be used multiple times)
286
  --help Show this message and exit.
 
320
  --solve-cloudflare / --no-solve-cloudflare Solve Cloudflare challenges (default: False)
321
  --allow-webgl / --block-webgl Allow WebGL (default: True)
322
  --network-idle / --no-network-idle Wait for network idle (default: False)
323
+ --real-chrome/--no-real-chrome If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)
 
324
  --timeout INTEGER Timeout in milliseconds (default: 30000)
325
  --wait INTEGER Additional wait time in milliseconds after page load (default: 0)
326
  -s, --css-selector TEXT CSS selector to extract specific content from the page. It returns all matches.
docs/cli/interactive-shell.md CHANGED
@@ -1,17 +1,17 @@
1
  # Scrapling Interactive Shell Guide
2
 
3
- <script src="https://asciinema.org/a/736339.js" id="asciicast-736339" async data-autoplay="1" data-loop="1" data-cols="225" data-rows="40" data-start-at="00:06" data-speed="1.5"></script>
4
 
5
  **Powerful Web Scraping REPL for Developers and Data Scientists**
6
 
7
  The Scrapling Interactive Shell is an enhanced IPython-based environment designed specifically for Web Scraping tasks. It provides instant access to all Scrapling features, clever shortcuts, automatic page management, and advanced tools, such as conversion of the curl command.
8
 
9
- > 💡 **Prerequisites:**
10
- >
11
- > 1. Youve completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
12
- > 2. Youve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
13
- > 3. Youve completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
14
- > 4. Youve completed or read at least one page from the fetchers section to use here for requests: [HTTP requests](../fetching/static.md), [Dynamic websites](../fetching/dynamic.md), or [Dynamic websites with hard protections](../fetching/stealthy.md).
15
 
16
 
17
  ## Why use the Interactive Shell?
@@ -133,7 +133,7 @@ The shell provides a few functions to help you convert curl commands from the br
133
 
134
  First, you need to copy a request as a curl command like the following:
135
 
136
- <img src="../../assets/scrapling_shell_curl.png" title="Copying a request as a curl command from Chrome" alt="Copying a request as a curl command from Chrome" style="width: 70%;"/>
137
 
138
  - **Convert Curl command to Request Object**
139
 
@@ -174,7 +174,7 @@ The shell inherits all IPython capabilities:
174
  >>> %save filename.py 1-10 # Save commands 1-10 to file
175
 
176
  >>> # Tab completion works everywhere
177
- >>> page.c<TAB> # Shows: css, css_first, cookies, etc.
178
  >>> Fetcher.<TAB> # Shows all Fetcher methods
179
 
180
  >>> # Object inspection
 
1
  # Scrapling Interactive Shell Guide
2
 
3
+ <script src="https://asciinema.org/a/736339.js" id="asciicast-736339" async data-autoplay="1" data-loop="1" data-cols="225" data-rows="40" data-start-at="00:06" data-speed="1.5" data-theme="tango"></script>
4
 
5
  **Powerful Web Scraping REPL for Developers and Data Scientists**
6
 
7
  The Scrapling Interactive Shell is an enhanced IPython-based environment designed specifically for Web Scraping tasks. It provides instant access to all Scrapling features, clever shortcuts, automatic page management, and advanced tools, such as conversion of the curl command.
8
 
9
+ !!! success "Prerequisites"
10
+
11
+ 1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
12
+ 2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
13
+ 3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
14
+ 4. You've completed or read at least one page from the fetchers section to use here for requests: [HTTP requests](../fetching/static.md), [Dynamic websites](../fetching/dynamic.md), or [Dynamic websites with hard protections](../fetching/stealthy.md).
15
 
16
 
17
  ## Why use the Interactive Shell?
 
133
 
134
  First, you need to copy a request as a curl command like the following:
135
 
136
+ <img src="../assets/scrapling_shell_curl.png" title="Copying a request as a curl command from Chrome" alt="Copying a request as a curl command from Chrome" style="width: 70%;"/>
137
 
138
  - **Convert Curl command to Request Object**
139
 
 
174
  >>> %save filename.py 1-10 # Save commands 1-10 to file
175
 
176
  >>> # Tab completion works everywhere
177
+ >>> page.c<TAB> # Shows: css, cookies, etc.
178
  >>> Fetcher.<TAB> # Shows all Fetcher methods
179
 
180
  >>> # Object inspection
docs/development/adaptive_storage_system.md CHANGED
@@ -1,3 +1,5 @@
 
 
1
  Scrapling uses SQLite by default, but this tutorial shows how to write your own storage system to store element properties for the `adaptive` feature.
2
 
3
  You might want to use Firebase, for example, and share the database between multiple spiders on different machines. It's a great idea to use an online database like that because spiders can share adaptive data with each other.
@@ -54,7 +56,7 @@ class RedisStorage(StorageSystemMixin):
54
  orjson.dumps(element_dict)
55
  )
56
 
57
- def retrieve(self, identifier: str) -> dict:
58
  # Get data
59
  key = f"scrapling:{self._get_base_url()}:{identifier}"
60
  data = self.redis.get(key)
 
1
+ # Writing your own storage system
2
+
3
  Scrapling uses SQLite by default, but this tutorial shows how to write your own storage system to store element properties for the `adaptive` feature.
4
 
5
  You might want to use Firebase, for example, and share the database between multiple spiders on different machines. It's a great idea to use an online database like that because spiders can share adaptive data with each other.
 
56
  orjson.dumps(element_dict)
57
  )
58
 
59
+ def retrieve(self, identifier: str) -> dict | None:
60
  # Get data
61
  key = f"scrapling:{self._get_base_url()}:{identifier}"
62
  data = self.redis.get(key)
docs/development/scrapling_custom_types.md CHANGED
@@ -1,3 +1,5 @@
 
 
1
  > You can take advantage of the custom-made types for Scrapling and use them outside the library if you want. It's better than copying their code, after all :)
2
 
3
  ### All current types can be imported alone, like below
 
1
+ # Using Scrapling's custom types
2
+
3
  > You can take advantage of the custom-made types for Scrapling and use them outside the library if you want. It's better than copying their code, after all :)
4
 
5
  ### All current types can be imported alone, like below
docs/fetching/choosing.md CHANGED
@@ -1,3 +1,5 @@
 
 
1
  ## Introduction
2
  Fetchers are classes that can do requests or fetch pages for you easily in a single-line fashion with many features and then return a [Response](#response-object) object. Starting with v0.3, all fetchers have separate classes to keep the session running, so for example, a fetcher that uses a browser will keep the browser open till you finish all your requests through it instead of opening multiple browsers. So it depends on your use case.
3
 
@@ -38,21 +40,22 @@ Then you use it right away without initializing like this, and it will use the d
38
  If you want to configure the parser ([Selector class](../parsing/main_classes.md#selector)) that will be used on the response before returning it for you, then do this first:
39
  ```python
40
  >>> from scrapling.fetchers import Fetcher
41
- >>> Fetcher.configure(adaptive=True, encoding="utf-8", keep_comments=False, keep_cdata=False) # and the rest
42
  ```
43
  or
44
  ```python
45
  >>> from scrapling.fetchers import Fetcher
46
  >>> Fetcher.adaptive=True
47
- >>> Fetcher.encoding="utf-8"
48
  >>> Fetcher.keep_comments=False
49
  >>> Fetcher.keep_cdata=False # and the rest
50
  ```
51
  Then, continue your code as usual.
52
 
53
- The available configuration arguments are: `adaptive`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the [Selector](../parsing/main_classes.md#selector) class. You can display the current configuration anytime by running `<fetcher_class>.display_config()`.
 
 
54
 
55
- > Note: The `adaptive` argument is disabled by default; you must enable it to use that feature.
56
 
57
  ### Set parser config per request
58
  As you probably understand, the logic above for setting the parser config will apply globally to all requests/fetches made through that class, and it's intended for simplicity.
@@ -71,7 +74,12 @@ The `Response` object is the same as the [Selector](../parsing/main_classes.md#s
71
  >>> page.headers # Response headers
72
  >>> page.request_headers # Request headers
73
  >>> page.history # Response history of redirections, if any
74
- >>> page.body # Raw response body without any processing
75
  >>> page.encoding # Response encoding
 
76
  ```
77
- All fetchers return the `Response` object.
 
 
 
 
 
1
+ # Fetchers basics
2
+
3
  ## Introduction
4
  Fetchers are classes that can do requests or fetch pages for you easily in a single-line fashion with many features and then return a [Response](#response-object) object. Starting with v0.3, all fetchers have separate classes to keep the session running, so for example, a fetcher that uses a browser will keep the browser open till you finish all your requests through it instead of opening multiple browsers. So it depends on your use case.
5
 
 
40
  If you want to configure the parser ([Selector class](../parsing/main_classes.md#selector)) that will be used on the response before returning it for you, then do this first:
41
  ```python
42
  >>> from scrapling.fetchers import Fetcher
43
+ >>> Fetcher.configure(adaptive=True, keep_comments=False, keep_cdata=False) # and the rest
44
  ```
45
  or
46
  ```python
47
  >>> from scrapling.fetchers import Fetcher
48
  >>> Fetcher.adaptive=True
 
49
  >>> Fetcher.keep_comments=False
50
  >>> Fetcher.keep_cdata=False # and the rest
51
  ```
52
  Then, continue your code as usual.
53
 
54
+ The available configuration arguments are: `adaptive`, `adaptive_domain`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the [Selector](../parsing/main_classes.md#selector) class. You can display the current configuration anytime by running `<fetcher_class>.display_config()`.
55
+
56
+ !!! info
57
 
58
+ The `adaptive` argument is disabled by default; you must enable it to use that feature.
59
 
60
  ### Set parser config per request
61
  As you probably understand, the logic above for setting the parser config will apply globally to all requests/fetches made through that class, and it's intended for simplicity.
 
74
  >>> page.headers # Response headers
75
  >>> page.request_headers # Request headers
76
  >>> page.history # Response history of redirections, if any
77
+ >>> page.body # Raw response body as bytes
78
  >>> page.encoding # Response encoding
79
+ >>> page.meta # Response metadata dictionary (e.g., proxy used). Mainly helpful with the spiders system.
80
  ```
81
+ All fetchers return the `Response` object.
82
+
83
+ !!! note
84
+
85
+ Unlike the [Selector](../parsing/main_classes.md#selector) class, the `Response` class's body is always bytes since v0.4.
docs/fetching/dynamic.md CHANGED
@@ -1,14 +1,14 @@
1
- # Introduction
2
 
3
  Here, we will discuss the `DynamicFetcher` class (formerly `PlayWrightFetcher`). This class provides flexible browser automation with multiple configuration options and little under-the-hood stealth improvements.
4
 
5
  As we will explain later, to automate the page, you need some knowledge of [Playwright's Page API](https://playwright.dev/python/docs/api/class-page).
6
 
7
- > 💡 **Prerequisites:**
8
- >
9
- > 1. Youve completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
10
- > 2. Youve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
11
- > 3. Youve completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
12
 
13
  ## Basic Usage
14
  You have one primary way to import this Fetcher, which is the same for all fetchers.
@@ -20,7 +20,9 @@ Check out how to configure the parsing options [here](choosing.md#parser-configu
20
 
21
  Now, we will review most of the arguments one by one, using examples. If you want to jump to a table of all arguments for quick reference, [click here](#full-list-of-arguments)
22
 
23
- > Note: The async version of the `fetch` method is the `async_fetch` method, of course.
 
 
24
 
25
 
26
  This fetcher currently provides three main run options that can be combined as desired.
@@ -51,10 +53,10 @@ DynamicFetcher.fetch('https://example.com', cdp_url='ws://localhost:9222')
51
  Instead of launching a browser locally (Chromium/Google Chrome), you can connect to a remote browser through the [Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/).
52
 
53
 
54
- > Notes:
55
- >
56
- > * There was a `stealth` option here, but it was moved to the `StealthyFetcher` class, as explained on the next page, with additional features since version 0.3.13.<br/>
57
- > * This makes it less confusing for new users, easier to maintain, and provides other benefits, as explained on the [StealthyFetcher page](../fetching/stealthy.md).
58
 
59
  ## Full list of arguments
60
  Scrapling provides many options with this fetcher and its session classes. To make it as simple as possible, we will list the options here and give examples of how to use most of them.
@@ -85,15 +87,19 @@ Scrapling provides many options with this fetcher and its session classes. To ma
85
  | extra_flags | A list of additional browser flags to pass to the browser on launch. | ✔️ |
86
  | additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
87
  | selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
 
 
 
 
88
 
89
- In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, and `selector_config`.
90
 
91
- > 🔍 Notes:
92
- >
93
- > 1. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading.
94
- > 2. The `google_search` argument is enabled by default for all requests, making the request appear to come from a Google search page. So, a request for `https://example.com` will set the referer to `https://www.google.com/search?q=example`. Also, if used together, it takes priority over the referer set by the `extra_headers` argument.
95
- > 3. Since version 0.3.13, the `stealth` option has been removed here in favor of the `StealthyFetcher` class, and the `hide_canvas` option has been moved to it. The `disable_webgl` argument has been moved to the `StealthyFetcher` class and renamed as `allow_webgl`.
96
- > 4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
97
 
98
 
99
  ## Examples
@@ -106,6 +112,13 @@ It's easier to understand with examples, so let's take a look.
106
  page = DynamicFetcher.fetch('https://example.com', disable_resources=True) # Blocks fonts, images, media, etc.
107
  ```
108
 
 
 
 
 
 
 
 
109
  ### Network Control
110
 
111
  ```python
@@ -119,16 +132,41 @@ page = DynamicFetcher.fetch('https://example.com', timeout=30000) # 30 seconds
119
  page = DynamicFetcher.fetch('https://example.com', proxy='http://username:password@host:port')
120
  ```
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  ### Downloading Files
123
 
124
  ```python
125
- page = DynamicFetcher.fetch('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png')
126
 
127
- with open(file='poster.png', mode='wb') as f:
128
  f.write(page.body)
129
  ```
130
 
131
- The `body` attribute of the `Response` object is a `bytes` object containing the response body in case of non-HTML responses.
132
 
133
  ### Browser Automation
134
  This is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues.
@@ -206,7 +244,7 @@ def scrape_dynamic_content():
206
  content = page.css('.content')
207
 
208
  return {
209
- 'title': content.css_first('h1::text'),
210
  'items': [
211
  item.text for item in content.css('.item')
212
  ]
 
1
+ # Fetching dynamic websites
2
 
3
  Here, we will discuss the `DynamicFetcher` class (formerly `PlayWrightFetcher`). This class provides flexible browser automation with multiple configuration options and a few under-the-hood stealth improvements.
4
 
5
  As we will explain later, to automate the page, you need some knowledge of [Playwright's Page API](https://playwright.dev/python/docs/api/class-page).
6
 
7
+ !!! success "Prerequisites"
8
+
9
+ 1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
10
+ 2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
11
+ 3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
12
 
13
  ## Basic Usage
14
  You have one primary way to import this Fetcher, which is the same for all fetchers.
 
20
 
21
  Now, we will review most of the arguments one by one, using examples. If you want to jump to a table of all arguments for quick reference, [click here](#full-list-of-arguments)
22
 
23
+ !!! abstract
24
+
25
+ The async version of the `fetch` method is `async_fetch`, of course.
26
 
27
 
28
  This fetcher currently provides three main run options that can be combined as desired.
 
53
  Instead of launching a browser locally (Chromium/Google Chrome), you can connect to a remote browser through the [Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/).
54
 
55
 
56
+ !!! note "Notes:"
57
+
58
+ * There was a `stealth` option here, but it was moved to the `StealthyFetcher` class, as explained on the next page, with additional features since version 0.3.13.<br/>
59
+ * This makes it less confusing for new users, easier to maintain, and provides other benefits, as explained on the [StealthyFetcher page](../fetching/stealthy.md).
60
 
61
  ## Full list of arguments
62
  Scrapling provides many options with this fetcher and its session classes. To make it as simple as possible, we will list the options here and give examples of how to use most of them.
 
87
  | extra_flags | A list of additional browser flags to pass to the browser on launch. | ✔️ |
88
  | additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
89
  | selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
90
+ | blocked_domains | A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too). | ✔️ |
91
+ | proxy_rotator | A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy`. | ✔️ |
92
+ | retries | Number of retry attempts for failed requests. Defaults to 3. | ✔️ |
93
+ | retry_delay | Seconds to wait between retry attempts. Defaults to 1. | ✔️ |
94
 
95
+ In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `blocked_domains`, `proxy`, and `selector_config`.
96
 
97
+ !!! note "Notes:"
98
+
99
+ 1. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading.
100
+ 2. The `google_search` argument is enabled by default for all requests, making the request appear to come from a Google search page. So, a request for `https://example.com` will set the referer to `https://www.google.com/search?q=example`. Also, if used together, it takes priority over the referer set by the `extra_headers` argument.
101
+ 3. Since version 0.3.13, the `stealth` option has been removed here in favor of the `StealthyFetcher` class, and the `hide_canvas` option has been moved to it. The `disable_webgl` argument has been moved to the `StealthyFetcher` class and renamed as `allow_webgl`.
102
+ 4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
103
 
104
 
105
  ## Examples
 
112
  page = DynamicFetcher.fetch('https://example.com', disable_resources=True) # Blocks fonts, images, media, etc.
113
  ```
114
 
115
+ ### Domain Blocking
116
+
117
+ ```python
118
+ # Block requests to specific domains (and their subdomains)
119
+ page = DynamicFetcher.fetch('https://example.com', blocked_domains={"ads.example.com", "tracker.net"})
120
+ ```
121
+
122
  ### Network Control
123
 
124
  ```python
 
132
  page = DynamicFetcher.fetch('https://example.com', proxy='http://username:password@host:port')
133
  ```
134
 
135
+ ### Proxy Rotation
136
+
137
+ ```python
138
+ from scrapling.fetchers import DynamicSession, ProxyRotator
139
+
140
+ # Set up proxy rotation
141
+ rotator = ProxyRotator([
142
+ "http://proxy1:8080",
143
+ "http://proxy2:8080",
144
+ "http://proxy3:8080",
145
+ ])
146
+
147
+ # Use with session - rotates proxy automatically with each request
148
+ with DynamicSession(proxy_rotator=rotator, headless=True) as session:
149
+ page1 = session.fetch('https://example1.com')
150
+ page2 = session.fetch('https://example2.com')
151
+
152
+ # Override rotator for a specific request
153
+ page3 = session.fetch('https://example3.com', proxy='http://specific-proxy:8080')
154
+ ```
155
+
156
+ !!! warning
157
+
158
+ Remember that by default, all browser-based fetchers and sessions use a persistent browser context with a pool of tabs. However, since browsers can't set a proxy per tab, when you use a `ProxyRotator`, the fetcher will automatically open a separate context for each proxy, with one tab per context. Once the tab's job is done, both the tab and its context are closed.
159
+
160
  ### Downloading Files
161
 
162
  ```python
163
+ page = DynamicFetcher.fetch('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/main_cover.png')
164
 
165
+ with open(file='main_cover.png', mode='wb') as f:
166
  f.write(page.body)
167
  ```
168
 
169
+ The `body` attribute of the `Response` object is always a `bytes` object.
170
 
171
  ### Browser Automation
172
  This is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues.
 
244
  content = page.css('.content')
245
 
246
  return {
247
+ 'title': content.css('h1::text').get(),
248
  'items': [
249
  item.text for item in content.css('.item')
250
  ]
docs/fetching/static.md CHANGED
@@ -1,12 +1,12 @@
1
- # Introduction
2
 
3
  The `Fetcher` class provides rapid and lightweight HTTP requests using the high-performance `curl_cffi` library with a lot of stealth capabilities.
4
 
5
- > 💡 **Prerequisites:**
6
- >
7
- > 1. Youve completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
8
- > 2. Youve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
9
- > 3. Youve completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
10
 
11
  ## Basic Usage
12
  You have one primary way to import this Fetcher, which is the same for all fetchers.
@@ -31,18 +31,20 @@ All methods for making requests here share some arguments, so let's discuss them
31
  - **proxy**: As the name implies, the proxy for this request is used to route all traffic (HTTP and HTTPS). The format accepted here is `http://username:password@localhost:8030`.
32
  - **proxy_auth**: HTTP basic auth for proxy, tuple of (username, password).
33
  - **proxies**: Dict of proxies to use. Format: `{"http": proxy_url, "https": proxy_url}`.
 
34
  - **headers**: Headers to include in the request. Can override any header generated by the `stealthy_headers` argument
35
  - **max_redirects**: Maximum number of redirects. **Defaults to 30**, use -1 for unlimited.
36
  - **verify**: Whether to verify HTTPS certificates. **Defaults to True**.
37
  - **cert**: Tuple of (cert, key) filenames for the client certificate.
38
  - **selector_config**: A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class.
39
 
40
- > Note: <br/>
41
- > 1. The currently available browsers to impersonate are (`"edge"`, `"chrome"`, `"chrome_android"`, `"safari"`, `"safari_beta"`, `"safari_ios"`, `"safari_ios_beta"`, `"firefox"`, `"tor"`)<br/>
42
- > 2. The available browsers to impersonate, along with their corresponding versions, are automatically displayed in the argument autocompletion and updated with each `curl_cffi` update.<br/>
43
- > 3. If any of the arguments `impersonate` or `stealthy_headers` are enabled, the fetchers will automatically generate real browser headers that match the browser version used.
 
44
 
45
- Other than this, for further customization, you can pass any arguments that `curl_cffi` supports for any method if that method doesn't already support it.
46
 
47
  ### HTTP Methods
48
  There are additional arguments for each method, depending on the method, such as `params` for GET requests and `data`/`json` for POST/PUT/DELETE requests.
@@ -186,19 +188,50 @@ with FetcherSession(
186
  page1 = session.get('https://scrapling.requestcatcher.com/get')
187
  page2 = session.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'})
188
  page3 = session.get('https://api.github.com/events')
189
-
190
  # All requests share the same session and connection pool
191
  ```
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  And here's an async example
194
 
195
  ```python
196
  async with FetcherSession(impersonate='firefox', http3=True) as session:
197
  # All standard HTTP methods available
198
- response = async session.get('https://example.com')
199
- response = async session.post('https://scrapling.requestcatcher.com/post', json={'data': 'value'})
200
- response = async session.put('https://scrapling.requestcatcher.com/put', data={'update': 'info'})
201
- response = async session.delete('https://scrapling.requestcatcher.com/delete')
202
  ```
203
  or better
204
  ```python
@@ -239,11 +272,11 @@ page = Fetcher.get('https://example.com')
239
  # Check the status
240
  if page.status == 200:
241
  # Extract title
242
- title = page.css_first('title::text')
243
  print(f"Page title: {title}")
244
-
245
  # Extract all links
246
- links = page.css('a::attr(href)')
247
  print(f"Found {len(links)} links")
248
  ```
249
 
@@ -261,9 +294,9 @@ def scrape_products():
261
  results = []
262
  for product in products:
263
  results.append({
264
- 'title': product.css_first('.title::text'),
265
- 'price': product.css_first('.price::text').re_first(r'\d+\.\d{2}'),
266
- 'description': product.css_first('.description::text'),
267
  'in_stock': product.has_class('in-stock')
268
  })
269
 
@@ -275,8 +308,8 @@ def scrape_products():
275
  ```python
276
  from scrapling.fetchers import Fetcher
277
 
278
- page = Fetcher.get('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png')
279
- with open(file='poster.png', mode='wb') as f:
280
  f.write(page.body)
281
  ```
282
 
@@ -302,8 +335,8 @@ def scrape_all_pages():
302
  # Process products
303
  for product in products:
304
  all_products.append({
305
- 'name': product.css_first('.name::text'),
306
- 'price': product.css_first('.price::text')
307
  })
308
 
309
  # Next page
@@ -329,7 +362,7 @@ response = Fetcher.post(
329
  # Check login success
330
  if response.status == 200:
331
  # Extract user info
332
- user_name = response.css_first('.user-name::text')
333
  print(f"Logged in as: {user_name}")
334
  ```
335
 
@@ -342,7 +375,7 @@ def extract_table():
342
  page = Fetcher.get('https://example.com/data')
343
 
344
  # Find table
345
- table = page.css_first('table')
346
 
347
  # Extract headers
348
  headers = [
@@ -367,12 +400,13 @@ def extract_menu():
367
  page = Fetcher.get('https://example.com')
368
 
369
  # Find navigation
370
- nav = page.css_first('nav')
371
 
372
  menu = {}
373
  for item in nav.css('li'):
374
- link = item.css_first('a')
375
- if link:
 
376
  menu[link.text] = {
377
  'url': link['href'],
378
  'has_submenu': bool(item.css('.submenu'))
 
1
+ # HTTP requests
2
 
3
  The `Fetcher` class provides rapid and lightweight HTTP requests using the high-performance `curl_cffi` library with a lot of stealth capabilities.
4
 
5
+ !!! success "Prerequisites"
6
+
7
+ 1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
8
+ 2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
9
+ 3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
10
 
11
  ## Basic Usage
12
  You have one primary way to import this Fetcher, which is the same for all fetchers.
 
31
  - **proxy**: As the name implies, the proxy for this request is used to route all traffic (HTTP and HTTPS). The format accepted here is `http://username:password@localhost:8030`.
32
  - **proxy_auth**: HTTP basic auth for proxy, tuple of (username, password).
33
  - **proxies**: Dict of proxies to use. Format: `{"http": proxy_url, "https": proxy_url}`.
34
+ - **proxy_rotator**: A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy` or `proxies`.
35
  - **headers**: Headers to include in the request. Can override any header generated by the `stealthy_headers` argument
36
  - **max_redirects**: Maximum number of redirects. **Defaults to 30**, use -1 for unlimited.
37
  - **verify**: Whether to verify HTTPS certificates. **Defaults to True**.
38
  - **cert**: Tuple of (cert, key) filenames for the client certificate.
39
  - **selector_config**: A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class.
40
 
41
+ !!! note "Notes:"
42
+
43
+ 1. The currently available browsers to impersonate are (`"edge"`, `"chrome"`, `"chrome_android"`, `"safari"`, `"safari_beta"`, `"safari_ios"`, `"safari_ios_beta"`, `"firefox"`, `"tor"`)<br/>
44
+ 2. The available browsers to impersonate, along with their corresponding versions, are automatically displayed in the argument autocompletion and updated with each `curl_cffi` update.<br/>
45
+ 3. If any of the arguments `impersonate` or `stealthy_headers` are enabled, the fetchers will automatically generate real browser headers that match the browser version used.
46
 
47
+ Beyond these, for further customization, you can pass to any method any additional arguments that `curl_cffi` supports, as long as that method doesn't already accept them.
48
 
49
  ### HTTP Methods
50
  There are additional arguments for each method, depending on the method, such as `params` for GET requests and `data`/`json` for POST/PUT/DELETE requests.
 
188
  page1 = session.get('https://scrapling.requestcatcher.com/get')
189
  page2 = session.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'})
190
  page3 = session.get('https://api.github.com/events')
191
+
192
  # All requests share the same session and connection pool
193
  ```
194
 
195
+ You can also use a `ProxyRotator` with `FetcherSession` for automatic proxy rotation across requests:
196
+
197
+ ```python
198
+ from scrapling.fetchers import FetcherSession, ProxyRotator
199
+
200
+ rotator = ProxyRotator([
201
+ 'http://proxy1:8080',
202
+ 'http://proxy2:8080',
203
+ 'http://proxy3:8080',
204
+ ])
205
+
206
+ with FetcherSession(proxy_rotator=rotator, impersonate='chrome') as session:
207
+ # Each request automatically uses the next proxy in rotation
208
+ page1 = session.get('https://example.com/page1')
209
+ page2 = session.get('https://example.com/page2')
210
+
211
+ # You can check which proxy was used via the response metadata
212
+ print(page1.meta['proxy'])
213
+ ```
214
+
215
+ You can also override the session proxy (or rotator) for a specific request by passing `proxy=` directly to the request method:
216
+
217
+ ```python
218
+ with FetcherSession(proxy='http://default-proxy:8080') as session:
219
+ # Uses the session proxy
220
+ page1 = session.get('https://example.com/page1')
221
+
222
+ # Override the proxy for this specific request
223
+ page2 = session.get('https://example.com/page2', proxy='http://special-proxy:9090')
224
+ ```
225
+
226
  And here's an async example
227
 
228
  ```python
229
  async with FetcherSession(impersonate='firefox', http3=True) as session:
230
  # All standard HTTP methods available
231
+ response = await session.get('https://example.com')
232
+ response = await session.post('https://scrapling.requestcatcher.com/post', json={'data': 'value'})
233
+ response = await session.put('https://scrapling.requestcatcher.com/put', data={'update': 'info'})
234
+ response = await session.delete('https://scrapling.requestcatcher.com/delete')
235
  ```
236
  or better
237
  ```python
 
272
  # Check the status
273
  if page.status == 200:
274
  # Extract title
275
+ title = page.css('title::text').get()
276
  print(f"Page title: {title}")
277
+
278
  # Extract all links
279
+ links = page.css('a::attr(href)').getall()
280
  print(f"Found {len(links)} links")
281
  ```
282
 
 
294
  results = []
295
  for product in products:
296
  results.append({
297
+ 'title': product.css('.title::text').get(),
298
+ 'price': product.css('.price::text').re_first(r'\d+\.\d{2}'),
299
+ 'description': product.css('.description::text').get(),
300
  'in_stock': product.has_class('in-stock')
301
  })
302
 
 
308
  ```python
309
  from scrapling.fetchers import Fetcher
310
 
311
+ page = Fetcher.get('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/main_cover.png')
312
+ with open(file='main_cover.png', mode='wb') as f:
313
  f.write(page.body)
314
  ```
315
 
 
335
  # Process products
336
  for product in products:
337
  all_products.append({
338
+ 'name': product.css('.name::text').get(),
339
+ 'price': product.css('.price::text').get()
340
  })
341
 
342
  # Next page
 
362
  # Check login success
363
  if response.status == 200:
364
  # Extract user info
365
+ user_name = response.css('.user-name::text').get()
366
  print(f"Logged in as: {user_name}")
367
  ```
368
 
 
375
  page = Fetcher.get('https://example.com/data')
376
 
377
  # Find table
378
+ table = page.css('table')[0]
379
 
380
  # Extract headers
381
  headers = [
 
400
  page = Fetcher.get('https://example.com')
401
 
402
  # Find navigation
403
+ nav = page.css('nav')[0]
404
 
405
  menu = {}
406
  for item in nav.css('li'):
407
+ links = item.css('a')
408
+ if links:
409
+ link = links[0]
410
  menu[link.text] = {
411
  'url': link['href'],
412
  'has_submenu': bool(item.css('.submenu'))
docs/fetching/stealthy.md CHANGED
@@ -1,17 +1,15 @@
1
- # Introduction
2
 
3
  Here, we will discuss the `StealthyFetcher` class. This class is very similar to the [DynamicFetcher](dynamic.md#introduction) class, including the browsers, the automation, and the use of [Playwright's API](https://playwright.dev/python/docs/intro). The main difference is that this class provides advanced anti-bot protection bypass capabilities; most of them are handled automatically under the hood, and the rest is up to you to enable.
4
 
5
  As with [DynamicFetcher](dynamic.md#introduction), you will need some knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) to automate the page, as we will explain later.
6
 
7
- **Note:** _This fetcher was using a custom version of [Camoufox](https://github.com/daijro/camoufox) as an engine before version 0.3.13, which was replaced now with [patchright](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright) for many reasons. See [this section](#using-camoufox-as-an-engine) for information if you still need to use [Camoufox](https://github.com/daijro/camoufox). We might switch back to [Camoufox](https://github.com/daijro/camoufox) in the future if its development continues._
8
 
9
- > 💡 **Prerequisites:**
10
- >
11
- > 1. You've completed or read the [DynamicFetcher](dynamic.md#introduction) page since this class builds upon it, and we won't repeat the same information here for that reason.
12
- > 2. Youve completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
13
- > 3. You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
14
- > 4. You’ve completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
15
 
16
  ## Basic Usage
17
  You have one primary way to import this Fetcher, which is the same for all fetchers.
@@ -21,7 +19,9 @@ You have one primary way to import this Fetcher, which is the same for all fetch
21
  ```
22
  Check out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers)
23
 
24
- > Note: The async version of the `fetch` method is the `async_fetch` method, of course.
 
 
25
 
26
  ## What does it do?
27
 
@@ -69,15 +69,19 @@ Scrapling provides many options with this fetcher and its session classes. Befor
69
  | allow_webgl | Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended, as many WAFs now check if WebGL is enabled. | ✔️ |
70
  | additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
71
  | selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
 
 
 
 
 
 
72
 
73
- In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `solve_cloudflare`, and `selector_config`.
74
 
75
- > 🔍 Notes:
76
- >
77
- > 1. It's basically the same arguments as [DynamicFetcher](dynamic.md#introduction) class but with these additional arguments `solve_cloudflare`, `block_webrtc`, `hide_canvas`, and `allow_webgl`.
78
- > 2. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading.
79
- > 3. The `google_search` argument is enabled by default for all requests, making the request appear to come from a Google search page. So, a request for `https://example.com` will set the referer to `https://www.google.com/search?q=example`. Also, if used together, it takes priority over the referer set by the `extra_headers` argument.
80
- > 4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
81
 
82
  ## Examples
83
  It's easier to understand with examples, so we will now review most of the arguments individually. Since it's the same class as the [DynamicFetcher](dynamic.md#introduction), you can refer to that page for more examples, as we won't repeat all the examples from there.
@@ -108,11 +112,11 @@ The `solve_cloudflare` parameter enables automatic detection and solving all typ
108
 
109
  And even solves the custom pages with embedded captcha.
110
 
111
- > 🔍 **Important notes:**
112
- >
113
- > 1. Sometimes, with websites that use custom implementations, you will need to use `wait_selector` to make sure Scrapling waits for the real website content to be loaded after solving the captcha. Some websites can be the real definition of an edge case while we are trying to make the solver as generic as possible.
114
- > 2. The timeout should be at least 60 seconds when using the Cloudflare solver for sufficient challenge-solving time.
115
- > 3. This feature works seamlessly with proxies and other stealth options.
116
 
117
  ### Browser Automation
118
  This is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues.
@@ -172,14 +176,14 @@ def scrape_amazon_product(url):
172
 
173
  # Extract product details
174
  return {
175
- 'title': page.css_first('#productTitle::text').clean(),
176
- 'price': page.css_first('.a-price .a-offscreen::text'),
177
- 'rating': page.css_first('[data-feature-name="averageCustomerReviews"] .a-popover-trigger .a-color-base::text'),
178
  'reviews_count': page.css('#acrCustomerReviewText::text').re_first(r'[\d,]+'),
179
  'features': [
180
- li.clean() for li in page.css('#feature-bullets li span::text')
181
  ],
182
- 'availability': page.css_first('#availability').get_all_text(strip=True),
183
  'images': [
184
  img.attrib['src'] for img in page.css('#altImages img')
185
  ]
@@ -248,7 +252,8 @@ In versions 0.3 and 0.3.1, the pool was reusing finished tabs to save more resou
248
  - **Memory efficiency**: Better resource usage compared to launching new browsers with each fetch.
249
 
250
  ## Using Camoufox as an engine
251
- If you see that Camoufox is stable on your device, has no high memory issues, and want to continue using Camoufox as before v0.3.13. This section is for you.
 
252
 
253
  First, you will need to install the Camoufox library, browser, and Firefox system dependencies if you didn't already:
254
  ```commandline
 
1
+ # Fetching dynamic websites with hard protections
2
 
3
  Here, we will discuss the `StealthyFetcher` class. This class is very similar to the [DynamicFetcher](dynamic.md#introduction) class, including the browsers, the automation, and the use of [Playwright's API](https://playwright.dev/python/docs/intro). The main difference is that this class provides advanced anti-bot protection bypass capabilities; most of them are handled automatically under the hood, and the rest is up to you to enable.
4
 
5
  As with [DynamicFetcher](dynamic.md#introduction), you will need some knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) to automate the page, as we will explain later.
6
 
7
+ !!! success "Prerequisites"
8
 
9
+ 1. You've completed or read the [DynamicFetcher](dynamic.md#introduction) page since this class builds upon it, and we won't repeat the same information here for that reason.
10
+ 2. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
11
+ 3. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
12
+ 4. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
 
 
13
 
14
  ## Basic Usage
15
  You have one primary way to import this Fetcher, which is the same for all fetchers.
 
19
  ```
20
  Check out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers)
21
 
22
+ !!! abstract
23
+
24
+ The async version of the `fetch` method is `async_fetch`, of course.
25
 
26
  ## What does it do?
27
 
 
69
  | allow_webgl | Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended, as many WAFs now check if WebGL is enabled. | ✔️ |
70
  | additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
71
  | selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
72
+ | blocked_domains | A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too). | ✔️ |
73
+ | proxy_rotator | A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy`. | ✔️ |
74
+ | retries | Number of retry attempts for failed requests. Defaults to 3. | ✔️ |
75
+ | retry_delay | Seconds to wait between retry attempts. Defaults to 1. | ✔️ |
76
+
77
+ In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `solve_cloudflare`, `blocked_domains`, `proxy`, and `selector_config`.
78
 
79
+ !!! note "Notes:"
80
 
81
+ 1. It's basically the same arguments as [DynamicFetcher](dynamic.md#introduction) class, but with these additional arguments: `solve_cloudflare`, `block_webrtc`, `hide_canvas`, and `allow_webgl`.
82
+ 2. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading.
83
+ 3. The `google_search` argument is enabled by default for all requests, making the request appear to come from a Google search page. So, a request for `https://example.com` will set the referer to `https://www.google.com/search?q=example`. Also, if used together, it takes priority over the referer set by the `extra_headers` argument.
84
+ 4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
 
 
85
 
86
  ## Examples
87
  It's easier to understand with examples, so we will now review most of the arguments individually. Since it's the same class as the [DynamicFetcher](dynamic.md#introduction), you can refer to that page for more examples, as we won't repeat all the examples from there.
 
112
 
113
  And even solves the custom pages with embedded captcha.
114
 
115
+ !!! note "**Important notes:**"
116
+
117
+ 1. Sometimes, with websites that use custom implementations, you will need to use `wait_selector` to make sure Scrapling waits for the real website content to be loaded after solving the captcha. Some websites can be the real definition of an edge case while we are trying to make the solver as generic as possible.
118
+ 2. The timeout should be at least 60 seconds when using the Cloudflare solver for sufficient challenge-solving time.
119
+ 3. This feature works seamlessly with proxies and other stealth options.
120
 
121
  ### Browser Automation
122
  This is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues.
 
176
 
177
  # Extract product details
178
  return {
179
+ 'title': page.css('#productTitle::text').get().clean(),
180
+ 'price': page.css('.a-price .a-offscreen::text').get(),
181
+ 'rating': page.css('[data-feature-name="averageCustomerReviews"] .a-popover-trigger .a-color-base::text').get(),
182
  'reviews_count': page.css('#acrCustomerReviewText::text').re_first(r'[\d,]+'),
183
  'features': [
184
+ li.get().clean() for li in page.css('#feature-bullets li span::text')
185
  ],
186
+ 'availability': page.css('#availability')[0].get_all_text(strip=True),
187
  'images': [
188
  img.attrib['src'] for img in page.css('#altImages img')
189
  ]
 
252
  - **Memory efficiency**: Better resource usage compared to launching new browsers with each fetch.
253
 
254
  ## Using Camoufox as an engine
255
+
256
+ This fetcher used a custom version of [Camoufox](https://github.com/daijro/camoufox) as an engine before version 0.3.13, which was replaced by [patchright](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright) for many reasons. If you see that Camoufox is stable on your device, has no high memory issues, and you want to continue using it, then you can.
257
 
258
  First, you will need to install the Camoufox library, browser, and Firefox system dependencies if you didn't already:
259
  ```commandline
docs/index.md CHANGED
@@ -2,34 +2,46 @@
2
  .md-typeset h1 {
3
  display: none;
4
  }
 
 
5
  </style>
6
 
 
7
  <div align="center">
8
  <a href="https://scrapling.readthedocs.io/en/latest/" alt="poster">
9
- <img alt="poster" src="assets/poster.png" style="width: 50%; height: 100%;"></a>
 
 
10
  </div>
11
 
12
- <div align="center">
13
- <i><code>Easy, effortless Web Scraping as it should be!</code></i>
14
- <br/><br/>
15
- </div>
16
 
17
- **Stop fighting anti-bot systems. Stop rewriting selectors after every website update.**
18
 
19
- Scrapling isn't just another Web Scraping library. It's the first **adaptive** scraping library that learns from website changes and evolves with them. While other libraries break when websites update their structure, Scrapling automatically relocates your elements and keeps your scrapers running.
20
 
21
- Built for the modern Web, Scrapling features **its own rapid parsing engine** and fetchers to handle all Web Scraping challenges you face or will face. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
22
 
23
  ```python
24
- >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
25
- >> StealthyFetcher.adaptive = True
26
- # Fetch websites' source under the radar!
27
- >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
28
- >> print(page.status)
29
- 200
30
- >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
31
- >> # Later, if the website structure changes, pass `adaptive=True`
32
- >> products = page.css('.product', adaptive=True) # and Scrapling still finds them!
 
 
 
 
 
 
 
 
 
 
33
  ```
34
 
35
  ## Top Sponsors
@@ -51,16 +63,27 @@ Built for the modern Web, Scrapling features **its own rapid parsing engine** an
51
 
52
  ## Key Features
53
 
 
 
 
 
 
 
 
 
 
54
  ### Advanced Websites Fetching with Session Support
55
  - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP/3.
56
- - **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium, and Google's Chrome.
57
- - **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can bypass all types of Cloudflare's Turnstile/Interstitial with automation easily.
58
  - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
 
 
59
  - **Async Support**: Complete async support across all fetchers and dedicated async session classes.
60
 
61
  ### Adaptive Scraping & AI Integration
62
  - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
63
- - 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
64
  - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
65
  - 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
66
 
@@ -72,12 +95,12 @@ Built for the modern Web, Scrapling features **its own rapid parsing engine** an
72
 
73
  ### Developer/Web Scraper Friendly Experience
74
  - 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping scripts development, like converting curl requests to Scrapling requests and viewing requests results in your browser.
75
- - 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single code!
76
  - 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
77
  - 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.
78
  - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
79
  - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
80
- - 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion.
81
  - 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
82
 
83
 
@@ -86,10 +109,34 @@ Scrapling’s GitHub stars have grown steadily since its release (see chart belo
86
 
87
  <div id="chartContainer">
88
  <a href="https://github.com/D4Vinci/Scrapling">
89
- <img id="chartImage" alt="Star History Chart" loading="lazy" src="https://api.star-history.com/svg?repos=D4Vinci/Scrapling&type=date&legend=top-left&theme=dark" height="400"/>
90
  </a>
91
  </div>
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  ## Installation
95
  Scrapling requires Python 3.10 or higher:
@@ -98,7 +145,7 @@ Scrapling requires Python 3.10 or higher:
98
  pip install scrapling
99
  ```
100
 
101
- Starting with v0.3.2, this installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies.
102
 
103
  ### Optional Dependencies
104
 
 
2
  .md-typeset h1 {
3
  display: none;
4
  }
5
+ [data-md-color-scheme="default"] .only-dark { display: none; }
6
+ [data-md-color-scheme="slate"] .only-light { display: none; }
7
  </style>
8
 
9
+ <br/>
10
  <div align="center">
11
  <a href="https://scrapling.readthedocs.io/en/latest/" alt="poster">
12
+ <img alt="Scrapling" src="assets/cover_light.svg" class="only-light">
13
+ <img alt="Scrapling" src="assets/cover_dark.svg" class="only-dark">
14
+ </a>
15
  </div>
16
 
17
+ <h2 align="center"><i>Effortless Web Scraping for the Modern Web</i></h2><br>
 
 
 
18
 
19
+ Scrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl.
20
 
21
+ Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation — all in a few lines of Python. One library, zero compromises.
22
 
23
+ Blazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
24
 
25
  ```python
26
+ from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher
27
+ StealthyFetcher.adaptive = True
28
+ page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Fetch website under the radar!
29
+ products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
30
+ products = page.css('.product', adaptive=True) # Later, if the website structure changes, pass `adaptive=True` to find them!
31
+ ```
32
+ Or scale up to full crawls
33
+ ```python
34
+ from scrapling.spiders import Spider, Response
35
+
36
+ class MySpider(Spider):
37
+ name = "demo"
38
+ start_urls = ["https://example.com/"]
39
+
40
+ async def parse(self, response: Response):
41
+ for item in response.css('.product'):
42
+ yield {"title": item.css('h2::text').get()}
43
+
44
+ MySpider().start()
45
  ```
46
 
47
  ## Top Sponsors
 
63
 
64
  ## Key Features
65
 
66
+ ### Spiders — A Full Crawling Framework
67
+ - 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects.
68
+ - ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays.
69
+ - 🔄 **Multi-Session Support**: Unified interface for HTTP requests and stealthy headless browsers in a single spider — route requests to different sessions by ID.
70
+ - 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off.
71
+ - 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats — ideal for UI, pipelines, and long-running crawls.
72
+ - 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic.
73
+ - 📦 **Built-in Export**: Export results through hooks and your own pipeline or the built-in JSON/JSONL with `result.items.to_json()` / `result.items.to_jsonl()` respectively.
74
+
75
  ### Advanced Websites Fetching with Session Support
76
  - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP/3.
77
+ - **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium and Google's Chrome.
78
+ - **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can easily bypass all types of Cloudflare's Turnstile/Interstitial with automation.
79
  - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
80
+ - **Proxy Rotation**: Built-in `ProxyRotator` with cyclic or custom rotation strategies across all session types, plus per-request proxy overrides.
81
+ - **Domain Blocking**: Block requests to specific domains (and their subdomains) in browser-based fetchers.
82
  - **Async Support**: Complete async support across all fetchers and dedicated async session classes.
83
 
84
  ### Adaptive Scraping & AI Integration
85
  - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
86
+ - 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
87
  - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
88
  - 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
89
 
 
95
 
96
  ### Developer/Web Scraper Friendly Experience
97
  - 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping scripts development, like converting curl requests to Scrapling requests and viewing requests results in your browser.
98
+ - 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code!
99
  - 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
100
  - 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.
101
  - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
102
  - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
103
+ - 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. The entire codebase is automatically scanned with **PyRight** and **MyPy** with each change.
104
  - 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
105
 
106
 
 
109
 
110
  <div id="chartContainer">
111
  <a href="https://github.com/D4Vinci/Scrapling">
112
+ <img id="chartImage" alt="Star History Chart" loading="lazy" src="https://api.star-history.com/svg?repos=D4Vinci/Scrapling&type=Date" height="400"/>
113
  </a>
114
  </div>
115
 
116
+ <script>
117
+ const observer = new MutationObserver((mutations) => {
118
+ mutations.forEach((mutation) => {
119
+ if (mutation.attributeName === 'data-md-color-media') {
120
+ const colorMedia = document.body.getAttribute('data-md-color-media');
121
+ const isDarkScheme = document.body.getAttribute('data-md-color-scheme') === 'slate';
122
+ const chartImg = document.querySelector('#chartImage');
123
+ const baseUrl = 'https://api.star-history.com/svg?repos=D4Vinci/Scrapling&type=Date';
124
+
125
+ if (colorMedia === '(prefers-color-scheme)' ? isDarkScheme : colorMedia.includes('dark')) {
126
+ chartImg.src = `${baseUrl}&theme=dark`;
127
+ } else {
128
+ chartImg.src = baseUrl;
129
+ }
130
+ }
131
+ });
132
+ });
133
+
134
+ observer.observe(document.body, {
135
+ attributes: true,
136
+ attributeFilter: ['data-md-color-media', 'data-md-color-scheme']
137
+ });
138
+ </script>
139
+
140
 
141
  ## Installation
142
  Scrapling requires Python 3.10 or higher:
 
145
  pip install scrapling
146
  ```
147
 
148
+ This installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies.
149
 
150
  ### Optional Dependencies
151
 
docs/overview.md CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  We will start by quickly reviewing the parsing capabilities. Then we will fetch websites using custom browsers, make requests, and parse the responses.
2
 
3
  Here's an HTML document generated by ChatGPT that we will be using as an example throughout this page:
@@ -134,7 +148,7 @@ target_element.find_similar()
134
  ```
135
  Find the first element that matches a CSS selector
136
  ```python
137
- page.css_first('.product-list [data-id="1"]')
138
  # <data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>
139
  ```
140
  Find all elements that match a CSS selector
@@ -144,7 +158,7 @@ page.css('.product-list article')
144
  ```
145
  Find the first element that matches an XPath selector
146
  ```python
147
- page.xpath_first("//*[@id='products']/div/article")
148
  # <data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>
149
  ```
150
  Find all elements that match an XPath selector
@@ -220,14 +234,14 @@ Using the elements we found above
220
  [<data='<section id="reviews"><h2>Customer Revie...' parent='<main><section id="products" schema='{"j...'>]
221
  >>> section_element.next # gets the next element, the same logic applies to `quote.previous`.
222
  <data='<section id="reviews"><h2>Customer Revie...' parent='<main><section id="products" schema='{"j...'>
223
- >>> section_element.children.css('h2::text')
224
  ['Products']
225
- >>> page.css_first('[data-id="1"]').has_class('product')
226
  True
227
  ```
228
  If your case needs more than the element's parent, you can iterate over the whole ancestors' tree of any element, like the one below
229
  ```python
230
- for ancestor in quote.iterancestors():
231
  # do something with it...
232
  ```
233
  You can search for a specific ancestor of an element that satisfies a function; all you need to do is pass a function that takes a `Selector` object as an argument and returns `True` if the condition is satisfied or `False` otherwise, like below:
@@ -264,11 +278,11 @@ For Async requests, you will replace the import like below:
264
  >>> page = await AsyncFetcher.delete('https://scrapling.requestcatcher.com/delete')
265
  ```
266
 
267
- > Notes:
268
- >
269
- > 1. You have the `stealthy_headers` argument, which, when enabled, makes requests to generate real browser headers and use them, including a referer header, as if this request came from a Google search of this domain. It's enabled by default.
270
- > 2. The `impersonate` argument lets you fake the TLS fingerprint for a specific browser version.
271
- > 3. There's also the `http3` argument, which, when enabled, makes the fetcher use HTTP/3 for requests, which makes your requests more authentic
272
 
273
  This is just the tip of the iceberg with this fetcher; check out the rest from [here](fetching/static.md)
274
 
@@ -279,11 +293,11 @@ The `DynamicFetcher` class (formerly `PlayWrightFetcher`) offers many options fo
279
  ```python
280
  >>> from scrapling.fetchers import DynamicFetcher
281
  >>> page = DynamicFetcher.fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
282
- >>> page.css_first("#search a::attr(href)")
283
  'https://github.com/D4Vinci/Scrapling'
284
  >>> # The async version of fetch
285
  >>> page = await DynamicFetcher.async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True)
286
- >>> page.css_first("#search a::attr(href)")
287
  'https://github.com/D4Vinci/Scrapling'
288
  ```
289
  It's built on top of [Playwright](https://playwright.dev/python/), and it's currently providing two main run options that can be mixed as you want:
@@ -324,7 +338,7 @@ True
324
  True
325
  ```
326
 
327
- Again, this is just the tip of the iceberg with this fetcher. Check out the rest from [here](fetching/dynamic.md) for all details and the complete list of arguments.
328
 
329
  ---
330
 
 
1
+ ## Pick Your Path
2
+
3
+ Not sure where to start? Pick the path that matches what you're trying to do:
4
+
5
+ | I want to... | Start here |
6
+ |:---|:---|
7
+ | **Parse HTML** I already have | [Querying elements](parsing/selection.md) — CSS, XPath, and text-based selection |
8
+ | **Quickly scrape a page** and prototype | Pick a [fetcher](fetching/choosing.md) and test right away, or launch the [interactive shell](cli/interactive-shell.md) |
9
+ | **Build a crawler** that scales | [Spiders](spiders/getting-started.md) — concurrent, multi-session crawls with pause/resume |
10
+ | **Scrape without writing code** | [CLI extract commands](cli/extract-commands.md) or hook up the [MCP server](ai/mcp-server.md) to your favourite AI tool |
11
+ | **Migrate** from another library | [From BeautifulSoup](tutorials/migrating_from_beautifulsoup.md) or [Scrapy comparison](spiders/architecture.md#comparison-with-scrapy) |
12
+
13
+ ---
14
+
15
  We will start by quickly reviewing the parsing capabilities. Then we will fetch websites using custom browsers, make requests, and parse the responses.
16
 
17
  Here's an HTML document generated by ChatGPT that we will be using as an example throughout this page:
 
148
  ```
149
  Find the first element that matches a CSS selector
150
  ```python
151
+ page.css('.product-list [data-id="1"]')[0]
152
  # <data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>
153
  ```
154
  Find all elements that match a CSS selector
 
158
  ```
159
  Find the first element that matches an XPath selector
160
  ```python
161
+ page.xpath("//*[@id='products']/div/article")[0]
162
  # <data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>
163
  ```
164
  Find all elements that match an XPath selector
 
234
  [<data='<section id="reviews"><h2>Customer Revie...' parent='<main><section id="products" schema='{"j...'>]
235
  >>> section_element.next # gets the next element, the same logic applies to `quote.previous`.
236
  <data='<section id="reviews"><h2>Customer Revie...' parent='<main><section id="products" schema='{"j...'>
237
+ >>> section_element.children.css('h2::text').getall()
238
  ['Products']
239
+ >>> page.css('[data-id="1"]')[0].has_class('product')
240
  True
241
  ```
242
  If your case needs more than the element's parent, you can iterate over the whole ancestors' tree of any element, like the one below
243
  ```python
244
+ for ancestor in section_element.iterancestors():
245
  # do something with it...
246
  ```
247
  You can search for a specific ancestor of an element that satisfies a function; all you need to do is pass a function that takes a `Selector` object as an argument and returns `True` if the condition is satisfied or `False` otherwise, like below:
 
278
  >>> page = await AsyncFetcher.delete('https://scrapling.requestcatcher.com/delete')
279
  ```
280
 
281
+ !!! note "Notes:"
282
+
283
+ 1. You have the `stealthy_headers` argument, which, when enabled, makes requests to generate real browser headers and use them, including a referer header, as if this request came from a Google search of this domain. It's enabled by default.
284
+ 2. The `impersonate` argument lets you fake the TLS fingerprint for a specific browser version.
285
+ 3. There's also the `http3` argument, which, when enabled, makes the fetcher use HTTP/3 for requests, making your requests more authentic.
286
 
287
  This is just the tip of the iceberg with this fetcher; check out the rest from [here](fetching/static.md)
288
 
 
293
  ```python
294
  >>> from scrapling.fetchers import DynamicFetcher
295
  >>> page = DynamicFetcher.fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
296
+ >>> page.css("#search a::attr(href)").get()
297
  'https://github.com/D4Vinci/Scrapling'
298
  >>> # The async version of fetch
299
  >>> page = await DynamicFetcher.async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True)
300
+ >>> page.css("#search a::attr(href)").get()
301
  'https://github.com/D4Vinci/Scrapling'
302
  ```
303
  It's built on top of [Playwright](https://playwright.dev/python/), and it's currently providing two main run options that can be mixed as you want:
 
338
  True
339
  ```
340
 
341
+ Again, this is just the tip of the iceberg with this fetcher. Check out the rest from [here](fetching/stealthy.md) for all details and the complete list of arguments.
342
 
343
  ---
344
 
docs/parsing/adaptive.md CHANGED
@@ -1,10 +1,9 @@
1
- ## Introduction
2
 
3
- > 💡 **Prerequisites:**
4
- >
5
- > 1. Youve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector) object.
6
- > 2. Youve completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) class.
7
- > <br><br>
8
 
9
  Adaptive scraping (previously known as automatch) is one of Scrapling's most powerful features. It allows your scraper to survive website changes by intelligently tracking and relocating elements.
10
 
@@ -84,11 +83,11 @@ Now, let's test the same selector in both versions
84
  >> Fetcher.configure(adaptive = True, adaptive_domain='stackoverflow.com')
85
  >>
86
  >> page = Fetcher.get(old_url, timeout=30)
87
- >> element1 = page.css_first(selector, auto_save=True)
88
  >>
89
  >> # Same selector but used in the updated website
90
  >> page = Fetcher.get(new_url)
91
- >> element2 = page.css_first(selector, adaptive=True)
92
  >>
93
  >> if element1.text == element2.text:
94
  ... print('Scrapling found the same element in the old and new designs!')
@@ -100,7 +99,9 @@ The code will be the same in a real-world scenario, except it will use the same
100
 
101
  Hence, in the two examples above, I used both the `Selector` and `Fetcher` classes to show that the adaptive logic is the same.
102
 
103
- > Note: the main reason for creating the `adaptive_domain` argument was to handle if the website changed its URL while changing the design/structure. In that case, you can use it to continue using the previously stored adaptive data for the new URL. Otherwise, scrapling will consider it a new website and discard the old data.
 
 
104
 
105
  ## How the adaptive scraping feature works
106
  Adaptive scraping works in two phases:
@@ -144,7 +145,7 @@ Examples:
144
  >>> page = Selector(html_doc, adaptive=True)
145
  # OR
146
  >>> Fetcher.adaptive = True
147
- >>> page = Fetcher.fetch('https://example.com')
148
  ```
149
  If you are using the [Selector](main_classes.md#selector) class, you need to pass the url of the website you are using with the argument `url` so Scrapling can separate the properties saved for each element by domain.
150
 
@@ -157,7 +158,7 @@ Now that you've enabled the `adaptive` feature globally, you have two main ways
157
  ### The CSS/XPath Selection way
158
  As you have seen in the example above, first, you have to use the `auto_save` argument while selecting an element that exists on the page, like below
159
  ```python
160
- element = page.css('#p1' auto_save=True)
161
  ```
162
  And when the element doesn't exist, you can use the same selector and the `adaptive` argument, and the library will find it for you
163
  ```python
@@ -165,7 +166,7 @@ element = page.css('#p1', adaptive=True)
165
  ```
166
  Pretty simple, eh?
167
 
168
- Well, a lot happened under the hood here. Remember the identifier we mentioned before that you need to set to retrieve the element you want? Here, with the `css`/`css_first`/`xpath`/`xpath_first` methods, the identifier is set automatically as the selector you passed here to make things easier :)
169
 
170
  Additionally, for all these methods, you can pass the `identifier` argument to set it yourself. This is useful in some instances, or you can use it to save properties with the `auto_save` argument.
171
 
@@ -185,7 +186,7 @@ Now, later, when you want to retrieve it and relocate it inside the page with `a
185
  >>> element_dict = page.retrieve('my_special_element')
186
  >>> page.relocate(element_dict, selector_type=True)
187
  [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]
188
- >>> page.relocate(element_dict, selector_type=True).css('::text')
189
  ['Tipping the Velvet']
190
  ```
191
  Hence, the `retrieve` and `relocate` methods are used.
 
1
+ # Adaptive scraping
2
 
3
+ !!! success "Prerequisites"
4
+
5
+ 1. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector) object.
6
+ 2. You've completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) class.
 
7
 
8
  Adaptive scraping (previously known as automatch) is one of Scrapling's most powerful features. It allows your scraper to survive website changes by intelligently tracking and relocating elements.
9
 
 
83
  >> Fetcher.configure(adaptive = True, adaptive_domain='stackoverflow.com')
84
  >>
85
  >> page = Fetcher.get(old_url, timeout=30)
86
+ >> element1 = page.css(selector, auto_save=True)[0]
87
  >>
88
  >> # Same selector but used in the updated website
89
  >> page = Fetcher.get(new_url)
90
+ >> element2 = page.css(selector, adaptive=True)[0]
91
  >>
92
  >> if element1.text == element2.text:
93
  ... print('Scrapling found the same element in the old and new designs!')
 
99
 
100
  Hence, in the two examples above, I used both the `Selector` and `Fetcher` classes to show that the adaptive logic is the same.
101
 
102
+ !!! info
103
+
104
+ The main reason for creating the `adaptive_domain` argument was to handle cases where the website changes its URL while changing its design/structure. In that case, you can use it to continue using the previously stored adaptive data for the new URL. Otherwise, Scrapling will consider it a new website and discard the old data.
105
 
106
  ## How the adaptive scraping feature works
107
  Adaptive scraping works in two phases:
 
145
  >>> page = Selector(html_doc, adaptive=True)
146
  # OR
147
  >>> Fetcher.adaptive = True
148
+ >>> page = Fetcher.get('https://example.com')
149
  ```
150
  If you are using the [Selector](main_classes.md#selector) class, you need to pass the url of the website you are using with the argument `url` so Scrapling can separate the properties saved for each element by domain.
151
 
 
158
  ### The CSS/XPath Selection way
159
  As you have seen in the example above, first, you have to use the `auto_save` argument while selecting an element that exists on the page, like below
160
  ```python
161
+ element = page.css('#p1', auto_save=True)
162
  ```
163
  And when the element doesn't exist, you can use the same selector and the `adaptive` argument, and the library will find it for you
164
  ```python
 
166
  ```
167
  Pretty simple, eh?
168
 
169
+ Well, a lot happened under the hood here. Remember the identifier we mentioned before that you need to set to retrieve the element you want? Here, with the `css`/`xpath` methods, the identifier is automatically set to the selector you passed, to make things easier :)
170
 
171
  Additionally, for all these methods, you can pass the `identifier` argument to set it yourself. This is useful in some instances, or you can use it to save properties with the `auto_save` argument.
172
 
 
186
  >>> element_dict = page.retrieve('my_special_element')
187
  >>> page.relocate(element_dict, selector_type=True)
188
  [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]
189
+ >>> page.relocate(element_dict, selector_type=True).css('::text').getall()
190
  ['Tipping the Velvet']
191
  ```
192
  Hence, the `retrieve` and `relocate` methods are used.
docs/parsing/main_classes.md CHANGED
@@ -1,9 +1,8 @@
1
- ## Introduction
2
 
3
- > 💡 **Prerequisites:**
4
- >
5
- > - You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector) object.
6
- > <br><br>
7
 
8
  After exploring the various ways to select elements with Scrapling and its related features, let's take a step back and examine the [Selector](#selector) class in general, as well as other objects, to gain a better understanding of the parsing engine.
9
 
@@ -166,10 +165,10 @@ print(article.prettify())
166
  <div class="hidden stock">In stock: 5</div>
167
  </article>
168
  ```
169
- Use the `.body` property to get the raw content of the page
170
  ```python
171
  >>> page.body
172
- '<html>\n <head>\n <title>Some page</title>\n </head>\n <body>\n <div class="product-list">\n <article class="product" data-id="1">\n <h3>Product 1</h3>\n <p class="description">This is product 1</p>\n <span class="price">$10.99</span>\n <div class="hidden stock">In stock: 5</div>\n </article>\n\n <article class="product" data-id="2">\n <h3>Product 2</h3>\n <p class="description">This is product 2</p>\n <span class="price">$20.99</span>\n <div class="hidden stock">In stock: 3</div>\n </article>\n\n <article class="product" data-id="3">\n <h3>Product 3</h3>\n <p class="description">This is product 3</p>\n <span class="price">$15.99</span>\n <div class="hidden stock">Out of stock</div>\n </article>\n </div>\n\n <script id="page-data" type="application/json">\n {\n "lastUpdated": "2024-09-22T10:30:00Z",\n "totalProducts": 3\n }\n </script>\n </body>\n</html>'
173
  ```
174
  To get all the ancestors in the DOM tree of this element
175
  ```python
@@ -234,7 +233,7 @@ This element returns the same result as the `children` property because its chil
234
 
235
  Another example of using the element with the `product-list` class will clear the difference between the `children` property and the `below_elements` property
236
  ```python
237
- >>> products_list = page.css_first('.product-list')
238
  >>> products_list.children
239
  [<data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>,
240
  <data='<article class="product" data-id="2"><h3...' parent='<div class="product-list"> <article clas...'>,
@@ -263,7 +262,7 @@ Get the next element of the current element
263
  The same logic applies to the `previous` property
264
  ```python
265
  >>> article.previous # It's the first child, so it doesn't have a previous element
266
- >>> second_article = page.css_first('.product[data-id="2"]')
267
  >>> second_article.previous
268
  <data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>
269
  ```
@@ -277,7 +276,7 @@ If your case needs more than the element's parent, you can iterate over the whol
277
  for ancestor in article.iterancestors():
278
  # do something with it...
279
  ```
280
- You can search for a specific ancestor of an element that satisfies a search function; all you need to do is to pass a function that takes a [Selector](#selector) object as an argument and return `True` if the condition satisfies or `False` otherwise, like below:
281
  ```python
282
  >>> article.find_ancestor(lambda ancestor: ancestor.has_class('product-list'))
283
  <data='<div class="product-list"> <article clas...' parent='<body> <div class="product-list"> <artic...'>
@@ -288,33 +287,63 @@ You can search for a specific ancestor of an element that satisfies a search fun
288
  ## Selectors
289
  The class `Selectors` is the "List" version of the [Selector](#selector) class. It inherits from the Python standard `List` type, so it shares all `List` properties and methods while adding more methods to make the operations you want to execute on the [Selector](#selector) instances within more straightforward.
290
 
291
- In the [Selector](#selector) class, all methods/properties that should return a group of elements return them as a [Selectors](#selectors) class instance. The only exceptions are when you use the CSS/XPath methods as follows:
292
 
293
- - If you selected a text node with the selector, then the return type will be [TextHandler](#texthandler)/[TextHandlers](#texthandlers). <br/>Examples:
294
- ```python
295
- >>> page.css('a::text') # -> TextHandlers
296
- >>> page.xpath('//a/text()') # -> TextHandlers
297
- >>> page.css_first('a::text') # -> TextHandler
298
- >>> page.xpath_first('//a/text()') # -> TextHandler
299
- >>> page.css('a::attr(href)') # -> TextHandlers
300
- >>> page.xpath('//a/@href') # -> TextHandlers
301
- >>> page.css_first('a::attr(href)') # -> TextHandler
302
- >>> page.xpath_first('//a/@href') # -> TextHandler
303
- ```
304
- - If you used a combined selector that returns mixed types, the result will be a Python standard `List`. <br/>Examples:
305
- ```python
306
- >>> page.css('.price_color') # -> Selectors
307
- >>> page.css('.product_pod a::attr(href)') # -> TextHandlers
308
- >>> page.css('.price_color, .product_pod a::attr(href)') # -> List
309
- ```
 
 
 
 
 
 
 
310
 
311
- Let's see what [Selectors](#selectors) class adds to the table with that out of the way.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  ### Properties
313
  Apart from the standard operations on Python lists, such as iteration and slicing.
314
 
315
  You can do the following:
316
 
317
- Execute CSS and XPath selectors directly on the [Selector](#selector) instances it has, while the arguments and the return types are the same as [Selector](#selector)'s `css` and `xpath` methods. This, of course, makes chaining methods very straightforward.
318
  ```python
319
  >>> page.css('.product_pod a')
320
  [<data='<a href="catalogue/a-light-in-the-attic_...' parent='<div class="image_container"> <a href="c...'>,
@@ -370,6 +399,15 @@ You can use the `filter` method, too, which takes a function like the `search` m
370
  <data='<article class="product_pod"><div class=...' parent='<li class="col-xs-6 col-sm-4 col-md-3 co...'>,
371
  ...]
372
  ```
 
 
 
 
 
 
 
 
 
373
  If you are too lazy like me and want to know the number of [Selector](#selector) instances in a [Selectors](#selectors) instance. You can do this:
374
  ```python
375
  page.css('.product_pod').length
@@ -441,14 +479,14 @@ First, we start with the `re` and `re_first` methods. These are the same methods
441
 
442
  - You also have the `.json()` method, which tries to convert the content to a JSON object quickly if possible; otherwise, it throws an error
443
  ```python
444
- >>> page.css_first('#page-data::text')
445
  '\n {\n "lastUpdated": "2024-09-22T10:30:00Z",\n "totalProducts": 3\n }\n '
446
- >>> page.css_first('#page-data::text').json()
447
  {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
448
  ```
449
  Hence, if you didn't specify a text node while selecting an element (like the text content or an attribute text content), the text content will be selected automatically, like this
450
  ```python
451
- >>> page.css_first('#page-data').json()
452
  {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
453
  ```
454
  The [Selector](#selector) class adds one thing here, too; let's say this is the page we are working with:
@@ -469,12 +507,12 @@ First, we start with the `re` and `re_first` methods. These are the same methods
469
  The [Selector](#selector) class has the `get_all_text` method, which you should be aware of by now. This method returns a `TextHandler`, of course.<br/><br/>
470
  So, as you know here, if you did something like this
471
  ```python
472
- >>> page.css_first('div::text').json()
473
  ```
474
  You will get an error because the `div` tag doesn't have any direct text content that can be serialized to JSON; it doesn't have any direct text content at all.<br/><br/>
475
  In this case, the `get_all_text` method comes to the rescue, so you can do something like that
476
  ```python
477
- >>> page.css_first('div').get_all_text(ignore_tags=[]).json()
478
  {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
479
  ```
480
  I used the `ignore_tags` argument here because the default value of it is `('script', 'style',)`, as you are aware.<br/><br/>
@@ -493,7 +531,7 @@ First, we start with the `re` and `re_first` methods. These are the same methods
493
  {'some_key': 'some_value'}
494
  ```
495
  You might wonder how this happened, given that the `html` tag doesn't contain direct text.<br/>
496
- Well, for cases like JSON responses, I made the [Selector](#selector) class keep a raw copy of the content it receives. This way, when you use the `.json()` method, it checks for that raw copy and then converts it to JSON. If the raw copy is not available like the case with the elements, it checks for the current element text content, or otherwise it uses the `get_all_text` method directly.<br/>
497
 
498
  - Another handy method is `.clean()`, which will remove all white spaces and consecutive spaces for you and return a new `TextHandler` instance
499
  ```python
@@ -521,7 +559,7 @@ You probably guessed it: This class is similar to [Selectors](#selectors) and [S
521
  The only difference is that the `re_first` method logic here runs `re` on each [TextHandler](#texthandler) and returns the first result, or `None`. Nothing new needs to be explained here, but new methods will be added over time.
522
 
523
  ## AttributesHandler
524
- This is a read-only version of Python's standard dictionary, or `dict`, used solely to store the attributes of each element or [Selector](#selector) instance.
525
  ```python
526
  >>> print(page.find('script').attrib)
527
  {'id': 'page-data', 'type': 'application/json'}
 
1
+ # Parsing main classes
2
 
3
+ !!! success "Prerequisites"
4
+
5
+ - You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector) object.
 
6
 
7
  After exploring the various ways to select elements with Scrapling and its related features, let's take a step back and examine the [Selector](#selector) class in general, as well as other objects, to gain a better understanding of the parsing engine.
8
 
 
165
  <div class="hidden stock">In stock: 5</div>
166
  </article>
167
  ```
168
+ Use the `.body` property to get the raw content of the page. Starting from v0.4, when used on a `Response` object from fetchers, `.body` always returns `bytes`.
169
  ```python
170
  >>> page.body
171
+ '<html>\n <head>\n <title>Some page</title>\n </head>\n ...'
172
  ```
173
  To get all the ancestors in the DOM tree of this element
174
  ```python
 
233
 
234
  Another example of using the element with the `product-list` class will clear the difference between the `children` property and the `below_elements` property
235
  ```python
236
+ >>> products_list = page.css('.product-list')[0]
237
  >>> products_list.children
238
  [<data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>,
239
  <data='<article class="product" data-id="2"><h3...' parent='<div class="product-list"> <article clas...'>,
 
262
  The same logic applies to the `previous` property
263
  ```python
264
  >>> article.previous # It's the first child, so it doesn't have a previous element
265
+ >>> second_article = page.css('.product[data-id="2"]')[0]
266
  >>> second_article.previous
267
  <data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>
268
  ```
 
276
  for ancestor in article.iterancestors():
277
  # do something with it...
278
  ```
279
+ You can search for a specific ancestor of an element that satisfies a search function; all you need to do is pass a function that takes a [Selector](#selector) object as an argument and return `True` if the condition satisfies or `False` otherwise, like below:
280
  ```python
281
  >>> article.find_ancestor(lambda ancestor: ancestor.has_class('product-list'))
282
  <data='<div class="product-list"> <article clas...' parent='<body> <div class="product-list"> <artic...'>
 
287
  ## Selectors
288
  The class `Selectors` is the "List" version of the [Selector](#selector) class. It inherits from the Python standard `List` type, so it shares all `List` properties and methods while adding more methods to make the operations you want to execute on the [Selector](#selector) instances within more straightforward.
289
 
290
+ In the [Selector](#selector) class, all methods/properties that should return a group of elements return them as a [Selectors](#selectors) class instance.
291
 
292
+ Starting with v0.4, all selection methods consistently return [Selector](#selector)/[Selectors](#selectors) objects, even for text nodes and attribute values. Text nodes (selected via `::text`, `/text()`, `::attr()`, `/@attr`) are wrapped in [Selector](#selector) objects. These text node selectors have `tag` set to `"#text"`, and their `text` property returns the text value. You can still access the text value directly, and all other properties return empty/default values gracefully.
293
+
294
+ ```python
295
+ >>> page.css('a::text') # -> Selectors (of text node Selectors)
296
+ >>> page.xpath('//a/text()') # -> Selectors
297
+ >>> page.css('a::text').get() # -> TextHandler (the first text value)
298
+ >>> page.css('a::text').getall() # -> TextHandlers (all text values)
299
+ >>> page.css('a::attr(href)') # -> Selectors
300
+ >>> page.xpath('//a/@href') # -> Selectors
301
+ >>> page.css('.price_color') # -> Selectors
302
+ ```
303
+
304
+ ### Data extraction methods
305
+ Starting with v0.4, [Selector](#selector) and [Selectors](#selectors) both provide `get()`, `getall()`, and their aliases `extract_first` and `extract` (following Scrapy conventions). The old `get_all()` method has been removed.
306
+
307
+ **On a [Selector](#selector) object:**
308
+
309
+ - `get()` returns a `TextHandler` — for text node selectors, it returns the text value; for HTML element selectors, it returns the serialized outer HTML.
310
+ - `getall()` returns a `TextHandlers` list containing the single serialized string.
311
+ - `extract_first` is an alias for `get()`, and `extract` is an alias for `getall()`.
312
+
313
+ ```python
314
+ >>> page.css('h3')[0].get() # Outer HTML of the element
315
+ '<h3>Product 1</h3>'
316
 
317
+ >>> page.css('h3::text')[0].get() # Text value of the text node
318
+ 'Product 1'
319
+ ```
320
+
321
+ **On a [Selectors](#selectors) object:**
322
+
323
+ - `get(default=None)` returns the serialized string of the **first** element, or `default` if the list is empty.
324
+ - `getall()` serializes **all** elements and returns a `TextHandlers` list.
325
+ - `extract_first` is an alias for `get()`, and `extract` is an alias for `getall()`.
326
+
327
+ ```python
328
+ >>> page.css('.price::text').get() # First price text
329
+ '$10.99'
330
+
331
+ >>> page.css('.price::text').getall() # All price texts
332
+ ['$10.99', '$20.99', '$15.99']
333
+
334
+ >>> page.css('.price::text').get('') # With default value
335
+ '$10.99'
336
+ ```
337
+
338
+ These methods work seamlessly with all selection types (CSS, XPath, `find`, etc.) and are the recommended way to extract text and attribute values in a Scrapy-compatible style.
339
+
340
+ Now, let's see what [Selectors](#selectors) class adds to the table with that out of the way.
341
  ### Properties
342
  Apart from the standard operations on Python lists, such as iteration and slicing.
343
 
344
  You can do the following:
345
 
346
+ Execute CSS and XPath selectors directly on the [Selector](#selector) instances it has, while the return types are the same as [Selector](#selector)'s `css` and `xpath` methods. The arguments are similar, except the `adaptive` argument is not available here. This, of course, makes chaining methods very straightforward.
347
  ```python
348
  >>> page.css('.product_pod a')
349
  [<data='<a href="catalogue/a-light-in-the-attic_...' parent='<div class="image_container"> <a href="c...'>,
 
399
  <data='<article class="product_pod"><div class=...' parent='<li class="col-xs-6 col-sm-4 col-md-3 co...'>,
400
  ...]
401
  ```
402
+ You can safely access the first or last element without worrying about index errors:
403
+ ```python
404
+ >>> page.css('.product').first # First Selector or None
405
+ <data='<article class="product" data-id="1"><h3...'>
406
+ >>> page.css('.product').last # Last Selector or None
407
+ <data='<article class="product" data-id="3"><h3...'>
408
+ >>> page.css('.nonexistent').first # Returns None instead of raising IndexError
409
+ ```
410
+
411
  If you are too lazy like me and want to know the number of [Selector](#selector) instances in a [Selectors](#selectors) instance. You can do this:
412
  ```python
413
  page.css('.product_pod').length
 
479
 
480
  - You also have the `.json()` method, which tries to convert the content to a JSON object quickly if possible; otherwise, it throws an error
481
  ```python
482
+ >>> page.css('#page-data::text').get()
483
  '\n {\n "lastUpdated": "2024-09-22T10:30:00Z",\n "totalProducts": 3\n }\n '
484
+ >>> page.css('#page-data::text').get().json()
485
  {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
486
  ```
487
  Hence, if you didn't specify a text node while selecting an element (like the text content or an attribute text content), the text content will be selected automatically, like this
488
  ```python
489
+ >>> page.css('#page-data')[0].json()
490
  {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
491
  ```
492
  The [Selector](#selector) class adds one thing here, too; let's say this is the page we are working with:
 
507
  The [Selector](#selector) class has the `get_all_text` method, which you should be aware of by now. This method returns a `TextHandler`, of course.<br/><br/>
508
  So, as you know here, if you did something like this
509
  ```python
510
+ >>> page.css('div::text').get().json()
511
  ```
512
  You will get an error because the `div` tag doesn't have any direct text content that can be serialized to JSON; it doesn't have any direct text content at all.<br/><br/>
513
  In this case, the `get_all_text` method comes to the rescue, so you can do something like that
514
  ```python
515
+ >>> page.css('div')[0].get_all_text(ignore_tags=[]).json()
516
  {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
517
  ```
518
  I used the `ignore_tags` argument here because the default value of it is `('script', 'style',)`, as you are aware.<br/><br/>
 
531
  {'some_key': 'some_value'}
532
  ```
533
  You might wonder how this happened, given that the `html` tag doesn't contain direct text.<br/>
534
+ Well, for cases like JSON responses, I made the [Selector](#selector) class keep a raw copy of the content it receives. This way, when you use the `.json()` method, it checks for that raw copy and then converts it to JSON. If the raw copy is unavailable, as with the elements, it checks the current element's text content; otherwise, it uses the `get_all_text` method directly.<br/>
535
 
536
  - Another handy method is `.clean()`, which will remove all white spaces and consecutive spaces for you and return a new `TextHandler` instance
537
  ```python
 
559
  The only difference is that the `re_first` method logic here runs `re` on each [TextHandler](#texthandler) and returns the first result, or `None`. Nothing new needs to be explained here, but new methods will be added over time.
560
 
561
  ## AttributesHandler
562
+ This is a read-only version of Python's standard dictionary, or `dict`, used solely to store the attributes of each element/[Selector](#selector) instance.
563
  ```python
564
  >>> print(page.find('script').attrib)
565
  {'id': 'page-data', 'type': 'application/json'}
docs/parsing/selection.md CHANGED
@@ -1,4 +1,4 @@
1
- ## Introduction
2
  Scrapling currently supports parsing HTML pages exclusively, so it doesn't support XML feeds. This decision was made because the adaptive feature won't work with XML, but that might change soon, so stay tuned :)
3
 
4
  In Scrapling, there are five main ways to find elements:
@@ -27,16 +27,16 @@ Also, Scrapling implements some non-standard pseudo-elements like:
27
 
28
  In short, if you come from Scrapy/Parsel, you will find the same logic for selectors here to make it easier. No need to implement a stranger logic to the one that most of us are used to :)
29
 
30
- To select elements with CSS selectors, you have the `css` and `css_first` methods. The latter is ~10% faster and more valuable when you are interested in the first element it finds, or if it's just one element, etc. It's beneficial when there's more than one, as it returns `Selectors`.
31
 
32
  ### What are XPath selectors?
33
  [XPath](https://en.wikipedia.org/wiki/XPath) is a language for selecting nodes in XML documents, which can also be used with HTML. This [cheatsheet](https://devhints.io/xpath) is a good resource for learning about [XPath](https://en.wikipedia.org/wiki/XPath). Scrapling adds XPath selectors directly through [lxml](https://lxml.de/).
34
 
35
  In short, it is the same situation as CSS Selectors; if you come from Scrapy/Parsel, you will find the same logic for selectors here. However, Scrapling doesn't implement the XPath extension function `has-class` as Scrapy/Parsel does. Instead, it provides the `has_class` method, which can be used on elements returned for the same purpose.
36
 
37
- To select elements with XPath selectors, you have the `xpath` and `xpath_first` methods. Again, these methods follow the same logic as the CSS selectors methods above, and `xpath_first` is faster.
38
 
39
- > Note that each method of `css`, `css_first`, `xpath`, and `xpath_first` has additional arguments, but we didn't explain them here, as they are all about the adaptive feature. The adaptive feature will have its own page later to be described in detail.
40
 
41
  ### Selectors examples
42
  Let's see some shared examples of using CSS and XPath Selectors.
@@ -46,43 +46,40 @@ Select all elements with the class `product`.
46
  products = page.css('.product')
47
  products = page.xpath('//*[@class="product"]')
48
  ```
49
- Note: The XPath one won't be accurate if there's another class; **it's always better to rely on CSS for selecting by class**
 
 
50
 
51
  Select the first element with the class `product`.
52
  ```python
53
- product = page.css_first('.product')
54
- product = page.xpath_first('//*[@class="product"]')
55
- ```
56
- Which would be the same as doing (but a bit slower)
57
- ```python
58
  product = page.css('.product')[0]
59
  product = page.xpath('//*[@class="product"]')[0]
60
  ```
61
  Get the text of the first element with the `h1` tag name
62
  ```python
63
- title = page.css_first('h1::text')
64
- title = page.xpath_first('//h1//text()')
65
  ```
66
- Which is again the same as doing
67
  ```python
68
- title = page.css_first('h1').text
69
- title = page.xpath_first('//h1').text
70
  ```
71
- Get the `href` attribute of the first element with the `a` tag name
72
  ```python
73
- link = page.css_first('a::attr(href)')
74
- link = page.xpath_first('//a/@href')
75
  ```
76
  Select the text of the first element with the `h1` tag name, which contains `Phone`, and under an element with class `product`.
77
  ```python
78
- title = page.css_first('.product h1:contains("Phone")::text')
79
- title = page.page.xpath_first('//*[@class="product"]//h1[contains(text(),"Phone")]/text()')
80
  ```
81
  You can nest and chain selectors as you want, given that they return results
82
  ```python
83
- page.css_first('.product').css_first('h1:contains("Phone")::text')
84
- page.xpath_first('//*[@class="product"]').xpath_first('//h1[contains(text(),"Phone")]/text()')
85
- page.xpath_first('//*[@class="product"]').css_first('h1:contains("Phone")::text')
86
  ```
87
  Another example
88
 
@@ -91,7 +88,7 @@ All links that have 'image' in their 'href' attribute
91
  links = page.css('a[href*="image"]')
92
  links = page.xpath('//a[contains(@href, "image")]')
93
  for index, link in enumerate(links):
94
- link_value = link.attrib['href'] # Cleaner than link.css('::attr(href)')
95
  link_text = link.text
96
  print(f'Link number {index} points to this url {link_value} with text content as "{link_text}"')
97
  ```
@@ -114,7 +111,9 @@ By default, Scrapling searches for the exact matching of the text/pattern you pa
114
 
115
  * **partial**: If enabled, `find_by_text` will return elements that contain the input text. So it's not an exact match anymore
116
 
117
- Note: The method `find_by_regex` can accept both regular strings and a compiled regex pattern as its first argument, as you will see in the upcoming examples.
 
 
118
 
119
  ### Finding Similar Elements
120
  One of the most remarkable new features Scrapling puts on the table is the ability to tell Scrapling to find elements similar to the element at hand. This feature's inspiration came from the AutoScraper library, but in Scrapling, it can be used on elements found by any method. Most of its usage would likely occur after finding elements through text content, similar to how AutoScraper works, making it convenient to explain here.
@@ -239,9 +238,9 @@ To increase the complexity a little bit, let's say we want to get all the books'
239
  ```python
240
  >>> for product in element.parent.parent.find_similar():
241
  print({
242
- "name": product.css_first('h3 a::text'),
243
- "price": product.css_first('.price_color').re_first(r'[\d\.]+'),
244
- "stock": product.css('.availability::text')[-1].clean()
245
  })
246
  {'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}
247
  {'name': 'Soumission', 'price': '50.10', 'stock': 'In stock'}
@@ -264,10 +263,10 @@ def extract_product_grid(page):
264
 
265
  return [
266
  {
267
- 'name': p.css_first('h3::text'),
268
- 'price': p.css_first('.price::text').re_first(r'\d+\.\d{2}'),
269
  'stock': 'In stock' in p.text,
270
- 'rating': p.css_first('.rating').attrib.get('data-rating')
271
  }
272
  for p in products
273
  ]
@@ -276,16 +275,16 @@ Table Row Extraction
276
  ```python
277
  def extract_table_data(page):
278
  # Find the first data row
279
- first_row = page.css_first('table tbody tr')
280
 
281
  # Find similar rows
282
  rows = first_row.find_similar()
283
 
284
  return [
285
  {
286
- 'column1': row.css_first('td:nth-child(1)::text'),
287
- 'column2': row.css_first('td:nth-child(2)::text'),
288
- 'column3': row.css_first('td:nth-child(3)::text')
289
  }
290
  for row in rows
291
  ]
@@ -294,7 +293,7 @@ Form Field Extraction
294
  ```python
295
  def extract_form_fields(page):
296
  # Find first form field container
297
- first_field = page.css_first('input').find_ancestor(
298
  lambda e: e.has_class('form-field')
299
  )
300
 
@@ -303,9 +302,9 @@ def extract_form_fields(page):
303
 
304
  return [
305
  {
306
- 'label': f.css_first('label::text'),
307
- 'type': f.css_first('input').attrib.get('type'),
308
- 'required': 'required' in f.css_first('input').attrib
309
  }
310
  for f in fields
311
  ]
@@ -324,9 +323,9 @@ def extract_reviews(page):
324
 
325
  return [
326
  {
327
- 'text': r.css_first('.review-text::text'),
328
  'rating': r.attrib.get('data-rating'),
329
- 'author': r.css_first('.reviewer::text')
330
  }
331
  for r in all_reviews
332
  ]
@@ -354,10 +353,10 @@ It filters all elements in the current page/element in the following order:
354
  3. All elements that match all passed regex patterns are collected, or if previous filter(s) are used, then previously collected elements are filtered.
355
  4. All elements that fulfill all passed function(s) are collected; if a previous filter(s) is used, then previously collected elements are filtered.
356
 
357
- Notes:
358
 
359
- 1. As you probably understood, the filtering process always starts from the first filter it finds in the filtering order above. So, if no tag name(s) are passed but attributes are passed, the process starts from that step (number 2), and so on.
360
- 2. The order in which you pass the arguments doesn't matter. The only order considered is the one explained above.
361
 
362
  Check examples to clear any confusion :)
363
 
@@ -396,10 +395,10 @@ Find all elements with a class that equals `quote`.
396
  ```
397
  Find all div elements with a class that equals `quote` and contains the element `.text`, which contains the word 'world' in its content.
398
  ```python
399
- >>> page.find_all('div', {'class': 'quote'}, lambda e: "world" in e.css_first('.text::text'))
400
  [<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>]
401
  ```
402
- Find all elements that don't have children.
403
  ```python
404
  >>> page.find_all(lambda element: len(element.children) > 0)
405
  [<data='<html lang="en"><head><meta charset="UTF...'>,
@@ -427,7 +426,7 @@ Find all div and span elements with class 'quote' (No span elements like that, s
427
  ```
428
  Mix things up
429
  ```python
430
- >>> page.find_all({'itemtype':"http://schema.org/CreativeWork"}, 'div').css('.author::text')
431
  ['Albert Einstein',
432
  'J.K. Rowling',
433
  ...]
@@ -473,15 +472,16 @@ Generate a full XPath selector for the `url_element` element from the start of t
473
  >>> url_element.generate_full_xpath_selector
474
  '//body/div/div[2]/div/div/span[2]/a'
475
  ```
476
- > Note: <br>
477
- > When you tell Scrapling to create a short selector, it tries to find a unique element to use in generation as a stop point, like an element with an `id` attribute, but in our case, there wasn't any, so that's why the short and the full selector will be the same.
 
478
 
479
  ## Using selectors with regular expressions
480
  Similar to `parsel`/`scrapy`, `re` and `re_first` methods are available for extracting data using regular expressions. However, unlike the former libraries, these methods are in nearly all classes like `Selector`/`Selectors`/`TextHandler` and `TextHandlers`, which means you can use them directly on the element even if you didn't select a text node.
481
 
482
  We will have a deep look at it while explaining the [TextHandler](main_classes.md#texthandler) class, but in general, it works like the examples below:
483
  ```python
484
- >>> page.css_first('.price_color').re_first(r'[\d\.]+')
485
  '51.77'
486
 
487
  >>> page.css('.price_color').re_first(r'[\d\.]+')
 
1
+ # Querying elements
2
  Scrapling currently supports parsing HTML pages exclusively, so it doesn't support XML feeds. This decision was made because the adaptive feature won't work with XML, but that might change soon, so stay tuned :)
3
 
4
  In Scrapling, there are five main ways to find elements:
 
27
 
28
  In short, if you come from Scrapy/Parsel, you will find the same logic for selectors here to make it easier. No need to implement a stranger logic to the one that most of us are used to :)
29
 
30
+ To select elements with CSS selectors, use the `css` method, which returns `Selectors`. Use `[0]` to get the first element, or `.get()` / `.getall()` to extract text values from text/attribute pseudo-selectors.
31
 
32
  ### What are XPath selectors?
33
  [XPath](https://en.wikipedia.org/wiki/XPath) is a language for selecting nodes in XML documents, which can also be used with HTML. This [cheatsheet](https://devhints.io/xpath) is a good resource for learning about [XPath](https://en.wikipedia.org/wiki/XPath). Scrapling adds XPath selectors directly through [lxml](https://lxml.de/).
34
 
35
  In short, it is the same situation as CSS Selectors; if you come from Scrapy/Parsel, you will find the same logic for selectors here. However, Scrapling doesn't implement the XPath extension function `has-class` as Scrapy/Parsel does. Instead, it provides the `has_class` method, which can be used on elements returned for the same purpose.
36
 
37
+ To select elements with XPath selectors, you have the `xpath` method. Again, this method follows the same logic as the CSS selectors method above.
38
 
39
+ > Note that each method of `css` and `xpath` has additional arguments, but we didn't explain them here, as they are all about the adaptive feature. The adaptive feature will have its own page later to be described in detail.
40
 
41
  ### Selectors examples
42
  Let's see some shared examples of using CSS and XPath Selectors.
 
46
  products = page.css('.product')
47
  products = page.xpath('//*[@class="product"]')
48
  ```
49
+ !!! info "Note:"
50
+
51
+ The XPath one won't be accurate if there's another class; **it's always better to rely on CSS for selecting by class**
52
 
53
  Select the first element with the class `product`.
54
  ```python
 
 
 
 
 
55
  product = page.css('.product')[0]
56
  product = page.xpath('//*[@class="product"]')[0]
57
  ```
58
  Get the text of the first element with the `h1` tag name
59
  ```python
60
+ title = page.css('h1::text').get()
61
+ title = page.xpath('//h1//text()').get()
62
  ```
63
+ Which is the same as doing
64
  ```python
65
+ title = page.css('h1')[0].text
66
+ title = page.xpath('//h1')[0].text
67
  ```
68
+ Get the `href` attribute of the first element with the `a` tag name
69
  ```python
70
+ link = page.css('a::attr(href)').get()
71
+ link = page.xpath('//a/@href').get()
72
  ```
73
  Select the text of the first element with the `h1` tag name, which contains `Phone`, and under an element with class `product`.
74
  ```python
75
+ title = page.css('.product h1:contains("Phone")::text').get()
76
+ title = page.xpath('//*[@class="product"]//h1[contains(text(),"Phone")]/text()').get()
77
  ```
78
  You can nest and chain selectors as you want, given that they return results
79
  ```python
80
+ page.css('.product')[0].css('h1:contains("Phone")::text').get()
81
+ page.xpath('//*[@class="product"]')[0].xpath('//h1[contains(text(),"Phone")]/text()').get()
82
+ page.xpath('//*[@class="product"]')[0].css('h1:contains("Phone")::text').get()
83
  ```
84
  Another example
85
 
 
88
  links = page.css('a[href*="image"]')
89
  links = page.xpath('//a[contains(@href, "image")]')
90
  for index, link in enumerate(links):
91
+ link_value = link.attrib['href'] # Cleaner than link.css('::attr(href)').get()
92
  link_text = link.text
93
  print(f'Link number {index} points to this url {link_value} with text content as "{link_text}"')
94
  ```
 
111
 
112
  * **partial**: If enabled, `find_by_text` will return elements that contain the input text. So it's not an exact match anymore
113
 
114
+ !!! abstract "Note:"
115
+
116
+ The method `find_by_regex` can accept both regular strings and a compiled regex pattern as its first argument, as you will see in the upcoming examples.
117
 
118
  ### Finding Similar Elements
119
  One of the most remarkable new features Scrapling puts on the table is the ability to tell Scrapling to find elements similar to the element at hand. This feature's inspiration came from the AutoScraper library, but in Scrapling, it can be used on elements found by any method. Most of its usage would likely occur after finding elements through text content, similar to how AutoScraper works, making it convenient to explain here.
 
238
  ```python
239
  >>> for product in element.parent.parent.find_similar():
240
  print({
241
+ "name": product.css('h3 a::text').get(),
242
+ "price": product.css('.price_color')[0].re_first(r'[\d\.]+'),
243
+ "stock": product.css('.availability::text').getall()[-1].clean()
244
  })
245
  {'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}
246
  {'name': 'Soumission', 'price': '50.10', 'stock': 'In stock'}
 
263
 
264
  return [
265
  {
266
+ 'name': p.css('h3::text').get(),
267
+ 'price': p.css('.price::text').re_first(r'\d+\.\d{2}'),
268
  'stock': 'In stock' in p.text,
269
+ 'rating': p.css('.rating')[0].attrib.get('data-rating')
270
  }
271
  for p in products
272
  ]
 
275
  ```python
276
  def extract_table_data(page):
277
  # Find the first data row
278
+ first_row = page.css('table tbody tr')[0]
279
 
280
  # Find similar rows
281
  rows = first_row.find_similar()
282
 
283
  return [
284
  {
285
+ 'column1': row.css('td:nth-child(1)::text').get(),
286
+ 'column2': row.css('td:nth-child(2)::text').get(),
287
+ 'column3': row.css('td:nth-child(3)::text').get()
288
  }
289
  for row in rows
290
  ]
 
293
  ```python
294
  def extract_form_fields(page):
295
  # Find first form field container
296
+ first_field = page.css('input')[0].find_ancestor(
297
  lambda e: e.has_class('form-field')
298
  )
299
 
 
302
 
303
  return [
304
  {
305
+ 'label': f.css('label::text').get(),
306
+ 'type': f.css('input')[0].attrib.get('type'),
307
+ 'required': 'required' in f.css('input')[0].attrib
308
  }
309
  for f in fields
310
  ]
 
323
 
324
  return [
325
  {
326
+ 'text': r.css('.review-text::text').get(),
327
  'rating': r.attrib.get('data-rating'),
328
+ 'author': r.css('.reviewer::text').get()
329
  }
330
  for r in all_reviews
331
  ]
 
353
  3. All elements that match all passed regex patterns are collected, or if previous filter(s) are used, then previously collected elements are filtered.
354
  4. All elements that fulfill all passed function(s) are collected; if a previous filter(s) is used, then previously collected elements are filtered.
355
 
356
+ !!! note "Notes:"
357
 
358
+ 1. As you probably understood, the filtering process always starts from the first filter it finds in the filtering order above. So, if no tag name(s) are passed but attributes are passed, the process starts from that step (number 2), and so on.
359
+ 2. The order in which you pass the arguments doesn't matter. The only order considered is the one explained above.
360
 
361
  Check examples to clear any confusion :)
362
 
 
395
  ```
396
  Find all div elements with a class that equals `quote` and contains the element `.text`, which contains the word 'world' in its content.
397
  ```python
398
+ >>> page.find_all('div', {'class': 'quote'}, lambda e: "world" in e.css('.text::text').get())
399
  [<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>]
400
  ```
401
+ Find all elements that have children.
402
  ```python
403
  >>> page.find_all(lambda element: len(element.children) > 0)
404
  [<data='<html lang="en"><head><meta charset="UTF...'>,
 
426
  ```
427
  Mix things up
428
  ```python
429
+ >>> page.find_all({'itemtype':"http://schema.org/CreativeWork"}, 'div').css('.author::text').getall()
430
  ['Albert Einstein',
431
  'J.K. Rowling',
432
  ...]
 
472
  >>> url_element.generate_full_xpath_selector
473
  '//body/div/div[2]/div/div/span[2]/a'
474
  ```
475
+ !!! abstract "Note:"
476
+
477
+ When you tell Scrapling to create a short selector, it tries to find a unique element to use in generation as a stop point, like an element with an `id` attribute, but in our case, there wasn't any, so that's why the short and the full selector will be the same.
478
 
479
  ## Using selectors with regular expressions
480
  Similar to `parsel`/`scrapy`, `re` and `re_first` methods are available for extracting data using regular expressions. However, unlike the former libraries, these methods are in nearly all classes like `Selector`/`Selectors`/`TextHandler` and `TextHandlers`, which means you can use them directly on the element even if you didn't select a text node.
481
 
482
  We will have a deep look at it while explaining the [TextHandler](main_classes.md#texthandler) class, but in general, it works like the examples below:
483
  ```python
484
+ >>> page.css('.price_color')[0].re_first(r'[\d\.]+')
485
  '51.77'
486
 
487
  >>> page.css('.price_color').re_first(r'[\d\.]+')
docs/requirements.txt CHANGED
@@ -1,8 +1,8 @@
1
- mkdocstrings>=1.0.0
2
- mkdocstrings-python>=2.0.1
 
3
  griffe-inherited-docstrings
4
  griffe-runtime-objects
5
  griffe-sphinx
6
- mkdocs-material[imaging]>=9.7.1
7
- black>=25.12.0
8
  pngquant
 
1
+ zensical>=0.0.23
2
+ mkdocstrings>=1.0.3
3
+ mkdocstrings-python>=2.0.2
4
  griffe-inherited-docstrings
5
  griffe-runtime-objects
6
  griffe-sphinx
7
+ black>=26.1.0
 
8
  pngquant
docs/spiders/advanced.md ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Advanced usages
2
+
3
+ ## Introduction
4
+
5
+ !!! success "Prerequisites"
6
+
7
+ 1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider.
8
+
9
+ This page covers the spider system's advanced features: concurrency control, pause/resume, streaming, lifecycle hooks, statistics, and logging.
10
+
11
+ ## Concurrency Control
12
+
13
+ The spider system uses three class attributes to control how aggressively it crawls:
14
+
15
+ | Attribute | Default | Description |
16
+ |----------------------------------|---------|------------------------------------------------------------------|
17
+ | `concurrent_requests` | `4` | Maximum number of requests being processed at the same time |
18
+ | `concurrent_requests_per_domain` | `0` | Maximum concurrent requests per domain (0 = no per-domain limit) |
19
+ | `download_delay` | `0.0` | Seconds to wait before each request |
20
+
21
+ ```python
22
+ class PoliteSpider(Spider):
23
+ name = "polite"
24
+ start_urls = ["https://example.com"]
25
+
26
+ # Be gentle with the server
27
+ concurrent_requests = 4
28
+ concurrent_requests_per_domain = 2
29
+ download_delay = 1.0 # Wait 1 second between requests
30
+
31
+ async def parse(self, response: Response):
32
+ yield {"title": response.css("title::text").get("")}
33
+ ```
34
+
35
+ When `concurrent_requests_per_domain` is set, each domain gets its own concurrency limiter in addition to the global limit. This is useful when crawling multiple domains simultaneously — you can allow high global concurrency while being polite to each individual domain.
36
+
37
+ !!! tip
38
+
39
+ The `download_delay` parameter adds a fixed wait before every request, regardless of the domain. Use it for simple rate limiting.
40
+
41
+ ### Using uvloop
42
+
43
+ The `start()` method accepts a `use_uvloop` parameter to use the faster [uvloop](https://github.com/MagicStack/uvloop)/[winloop](https://github.com/Vizonex/Winloop) event loop implementation, if available:
44
+
45
+ ```python
46
+ result = MySpider().start(use_uvloop=True)
47
+ ```
48
+
49
+ This can improve throughput for I/O-heavy crawls. You'll need to install `uvloop` (Linux/macOS) or `winloop` (Windows) separately.
50
+
51
+ ## Pause & Resume
52
+
53
+ The spider supports graceful pause-and-resume via checkpointing. To enable it, pass a `crawldir` directory to the spider constructor:
54
+
55
+ ```python
56
+ spider = MySpider(crawldir="crawl_data/my_spider")
57
+ result = spider.start()
58
+
59
+ if result.paused:
60
+ print("Crawl was paused. Run again to resume.")
61
+ else:
62
+ print("Crawl completed!")
63
+ ```
64
+
65
+ ### How It Works
66
+
67
+ 1. **Pausing**: Press `Ctrl+C` during a crawl. The spider waits for all in-flight requests to finish, saves a checkpoint (pending requests + a set of seen request fingerprints), and then exits.
68
+ 2. **Force stopping**: Press `Ctrl+C` a second time to stop immediately without waiting for active tasks.
69
+ 3. **Resuming**: Run the spider again with the same `crawldir`. It detects the checkpoint, restores the queue and seen set, and continues from where it left off — skipping `start_requests()`.
70
+ 4. **Cleanup**: When a crawl completes normally (not paused), the checkpoint files are deleted automatically.
71
+
72
+ **Checkpoints are also saved periodically during the crawl (every 5 minutes by default).**
73
+
74
+ You can change the interval as follows:
75
+
76
+ ```python
77
+ # Save checkpoint every 2 minutes
78
+ spider = MySpider(crawldir="crawl_data/my_spider", interval=120.0)
79
+ ```
80
+
81
+ Checkpoint writes to disk are atomic, so it's safe to interrupt the spider at any point.
82
+
83
+ !!! tip
84
+
85
+ Pressing `Ctrl+C` during a crawl always causes the spider to close gracefully, even if the checkpoint system is not enabled. Doing it again without waiting forces the spider to close immediately.
86
+
87
+ ### Knowing If You're Resuming
88
+
89
+ The `on_start()` hook receives a `resuming` flag:
90
+
91
+ ```python
92
+ async def on_start(self, resuming: bool = False):
93
+ if resuming:
94
+ self.logger.info("Resuming from checkpoint!")
95
+ else:
96
+ self.logger.info("Starting fresh crawl")
97
+ ```
98
+
99
+ ## Streaming
100
+
101
+ For long-running spiders or applications that need real-time access to scraped items, use the `stream()` method instead of `start()`:
102
+
103
+ ```python
104
+ import anyio
105
+
106
+ async def main():
107
+ spider = MySpider()
108
+ async for item in spider.stream():
109
+ print(f"Got item: {item}")
110
+ # Access real-time stats
111
+ print(f"Items so far: {spider.stats.items_scraped}")
112
+ print(f"Requests made: {spider.stats.requests_count}")
113
+
114
+ anyio.run(main)
115
+ ```
116
+
117
+ Key differences from `start()`:
118
+
119
+ - `stream()` must be called from an async context
120
+ - Items are yielded one by one as they're scraped, not collected into a list
121
+ - You can access `spider.stats` during iteration for real-time statistics
122
+
123
+ !!! abstract
124
+
125
+ The full list of stats accessible through `spider.stats` is explained in the [Results & Statistics](#results--statistics) section below.
126
+
127
+ You can use it with the checkpoint system too, which makes it easy to build UIs on top of spiders — UIs that display real-time data and can be paused and resumed.
128
+
129
+ ```python
130
+ import anyio
131
+
132
+ async def main():
133
+ spider = MySpider(crawldir="crawl_data/my_spider")
134
+ async for item in spider.stream():
135
+ print(f"Got item: {item}")
136
+ # Access real-time stats
137
+ print(f"Items so far: {spider.stats.items_scraped}")
138
+ print(f"Requests made: {spider.stats.requests_count}")
139
+
140
+ anyio.run(main)
141
+ ```
142
+ You can also call `spider.pause()` in the code above to shut down the spider gracefully. If the checkpoint system isn't enabled, it simply closes the crawl without saving a checkpoint.
143
+
144
+ ## Lifecycle Hooks
145
+
146
+ The spider provides several hooks you can override to add custom behavior at different stages of the crawl:
147
+
148
+ ### on_start
149
+
150
+ Called before crawling begins. Use it for setup tasks like loading data or initializing resources:
151
+
152
+ ```python
153
+ async def on_start(self, resuming: bool = False):
154
+ self.logger.info("Spider starting up")
155
+ # Load seed URLs from a database, initialize counters, etc.
156
+ ```
157
+
158
+ ### on_close
159
+
160
+ Called after crawling finishes (whether completed or paused). Use it for cleanup:
161
+
162
+ ```python
163
+ async def on_close(self):
164
+ self.logger.info("Spider shutting down")
165
+ # Close database connections, flush buffers, etc.
166
+ ```
167
+
168
+ ### on_error
169
+
170
+ Called when a request fails with an exception. Use it for error tracking or custom recovery logic:
171
+
172
+ ```python
173
+ async def on_error(self, request: Request, error: Exception):
174
+ self.logger.error(f"Failed: {request.url} - {error}")
175
+ # Log to error tracker, save failed URL for later, etc.
176
+ ```
177
+
178
+ ### on_scraped_item
179
+
180
+ Called for every scraped item before it's added to the results. Return the item (modified or not) to keep it, or return `None` to drop it:
181
+
182
+ ```python
183
+ async def on_scraped_item(self, item: dict) -> dict | None:
184
+ # Drop items without a title
185
+ if not item.get("title"):
186
+ return None
187
+
188
+ # Modify items (e.g., add timestamps)
189
+ item["scraped_at"] = "2026-01-01"
190
+ return item
191
+ ```
192
+
193
+ !!! tip
194
+
195
+ This hook can also be used to direct items through your own pipelines and drop them from the spider.
196
+
197
+ ### start_requests
198
+
199
+ Override `start_requests()` for custom initial request generation instead of using `start_urls`:
200
+
201
+ ```python
202
+ async def start_requests(self):
203
+ # POST request to log in first
204
+ yield Request(
205
+ "https://example.com/login",
206
+ method="POST",
207
+ data={"user": "admin", "pass": "secret"},
208
+ callback=self.after_login,
209
+ )
210
+
211
+ async def after_login(self, response: Response):
212
+ # Now crawl the authenticated pages
213
+ yield response.follow("/dashboard", callback=self.parse)
214
+ ```
215
+
216
+ ## Results & Statistics
217
+
218
+ The `CrawlResult` returned by `start()` contains both the scraped items and detailed statistics:
219
+
220
+ ```python
221
+ result = MySpider().start()
222
+
223
+ # Items
224
+ print(f"Total items: {len(result.items)}")
225
+ result.items.to_json("output.json", indent=True)
226
+
227
+ # Did the crawl complete?
228
+ print(f"Completed: {result.completed}")
229
+ print(f"Paused: {result.paused}")
230
+
231
+ # Statistics
232
+ stats = result.stats
233
+ print(f"Requests: {stats.requests_count}")
234
+ print(f"Failed: {stats.failed_requests_count}")
235
+ print(f"Blocked: {stats.blocked_requests_count}")
236
+ print(f"Offsite filtered: {stats.offsite_requests_count}")
237
+ print(f"Items scraped: {stats.items_scraped}")
238
+ print(f"Items dropped: {stats.items_dropped}")
239
+ print(f"Response bytes: {stats.response_bytes}")
240
+ print(f"Duration: {stats.elapsed_seconds:.1f}s")
241
+ print(f"Speed: {stats.requests_per_second:.1f} req/s")
242
+ ```
243
+
244
+ ### Detailed Stats
245
+
246
+ The `CrawlStats` object tracks granular information:
247
+
248
+ ```python
249
+ stats = result.stats
250
+
251
+ # Status code distribution
252
+ print(stats.response_status_count)
253
+ # {'status_200': 150, 'status_404': 3, 'status_403': 1}
254
+
255
+ # Bytes downloaded per domain
256
+ print(stats.domains_response_bytes)
257
+ # {'example.com': 1234567, 'api.example.com': 45678}
258
+
259
+ # Requests per session
260
+ print(stats.sessions_requests_count)
261
+ # {'http': 120, 'stealth': 34}
262
+
263
+ # Proxies used during the crawl
264
+ print(stats.proxies)
265
+ # ['http://proxy1:8080', 'http://proxy2:8080']
266
+
267
+ # Log level counts
268
+ print(stats.log_levels_counter)
269
+ # {'debug': 200, 'info': 50, 'warning': 3, 'error': 1, 'critical': 0}
270
+
271
+ # Timing information
272
+ print(stats.start_time) # Unix timestamp when crawl started
273
+ print(stats.end_time) # Unix timestamp when crawl finished
274
+ print(stats.download_delay) # The download delay used (seconds)
275
+
276
+ # Concurrency settings used
277
+ print(stats.concurrent_requests) # Global concurrency limit
278
+ print(stats.concurrent_requests_per_domain) # Per-domain concurrency limit
279
+
280
+ # Custom stats (set by your spider code)
281
+ print(stats.custom_stats)
282
+ # {'login_attempts': 3, 'pages_with_errors': 5}
283
+
284
+ # Export everything as a dict
285
+ print(stats.to_dict())
286
+ ```
287
+
288
+ ## Logging
289
+
290
+ The spider has a built-in logger accessible via `self.logger`. It's pre-configured with the spider's name and supports several customization options:
291
+
292
+ | Attribute | Default | Description |
293
+ |-----------------------|--------------------------------------------------------------|----------------------------------------------------|
294
+ | `logging_level` | `logging.DEBUG` | Minimum log level |
295
+ | `logging_format` | `"[%(asctime)s]:({spider_name}) %(levelname)s: %(message)s"` | Log message format |
296
+ | `logging_date_format` | `"%Y-%m-%d %H:%M:%S"` | Date format in log messages |
297
+ | `log_file` | `None` | Path to a log file (in addition to console output) |
298
+
299
+ ```python
300
+ import logging
301
+
302
+ class MySpider(Spider):
303
+ name = "my_spider"
304
+ start_urls = ["https://example.com"]
305
+ logging_level = logging.INFO
306
+ log_file = "logs/my_spider.log"
307
+
308
+ async def parse(self, response: Response):
309
+ self.logger.info(f"Processing {response.url}")
310
+ yield {"title": response.css("title::text").get("")}
311
+ ```
312
+
313
+ The log file directory is created automatically if it doesn't exist. Both console and file output use the same format.
docs/spiders/architecture.md ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Spiders architecture
2
+
3
+ !!! success "Prerequisites"
4
+
5
+ 1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand the different fetcher types and when to use each one.
6
+ 2. You've completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) and [Response](../fetching/choosing.md#response-object) classes.
7
+
8
+ Scrapling's spider system is a Scrapy-inspired async crawling framework designed for concurrent, multi-session crawls with built-in pause/resume support. It brings together Scrapling's parsing engine and fetchers into a unified crawling API while adding scheduling, concurrency control, and checkpointing.
9
+
10
+ If you're familiar with Scrapy, you'll feel right at home. If not, don't worry — the system is designed to be straightforward.
11
+
12
+ ## Data Flow
13
+
14
+ The diagram below shows how data flows through the spider system when a crawl is running:
15
+
16
+ <img src="../assets/spider_architecture.png" title="Spider architecture diagram by @TrueSkills" alt="Spider architecture diagram by @TrueSkills" style="width: 70%;"/>
17
+
18
 + Here's a high-level, step-by-step overview of what happens when you run a spider:
19
+
20
+ 1. The **Spider** produces the first batch of `Request` objects. By default, it creates one request for each URL in `start_urls`, but you can override `start_requests()` for custom logic.
21
+ 2. The **Scheduler** receives requests and places them in a priority queue, and creates fingerprints for them. Higher-priority requests are dequeued first.
22
+ 3. The **Crawler Engine** asks the **Scheduler** to dequeue the next request, respecting concurrency limits (global and per-domain) and download delays. Once the **Crawler Engine** receives the request, it passes it to the **Session Manager**, which routes it to the correct session based on the request's `sid` (session ID).
23
+ 4. The **session** fetches the page and returns a [Response](../fetching/choosing.md#response-object) object to the **Crawler Engine**. The engine records statistics and checks for blocked responses. If the response is blocked, the engine retries the request up to `max_blocked_retries` times. Of course, the blocking detection and the retry logic for blocked requests can be customized.
24
+ 5. The **Crawler Engine** passes the [Response](../fetching/choosing.md#response-object) to the request's callback. The callback either yields a dictionary, which gets treated as a scraped item, or a follow-up request, which gets sent to the scheduler for queuing.
25
+ 6. The cycle repeats from step 2 until the scheduler is empty and no tasks are active, or the spider is paused.
26
+ 7. If `crawldir` is set while starting the spider, the **Crawler Engine** periodically saves a checkpoint (pending requests + seen URLs set) to disk. On graceful shutdown (Ctrl+C), a final checkpoint is saved. The next time the spider runs with the same `crawldir`, it resumes from where it left off — skipping `start_requests()` and restoring the scheduler state.
27
+
28
+
29
+ ## Components
30
+
31
+ ### Spider
32
+
33
+ The central class you interact with. You subclass `Spider`, define your `start_urls` and `parse()` method, and optionally configure sessions and override lifecycle hooks.
34
+
35
+ ```python
36
+ from scrapling.spiders import Spider, Response, Request
37
+
38
+ class MySpider(Spider):
39
+ name = "my_spider"
40
+ start_urls = ["https://example.com"]
41
+
42
+ async def parse(self, response: Response):
43
+ for link in response.css("a::attr(href)").getall():
44
+ yield response.follow(link, callback=self.parse_page)
45
+
46
+ async def parse_page(self, response: Response):
47
+ yield {"title": response.css("h1::text").get("")}
48
+ ```
49
+
50
+ ### Crawler Engine
51
+
52
+ The engine orchestrates the entire crawl. It manages the main loop, enforces concurrency limits, dispatches requests through the Session Manager, and processes results from callbacks. You don't interact with it directly — the `Spider.start()` and `Spider.stream()` methods handle it for you.
53
+
54
+ ### Scheduler
55
+
56
+ A priority queue with built-in URL deduplication. Requests are fingerprinted based on their URL, HTTP method, body, and session ID. The scheduler supports `snapshot()` and `restore()` for the checkpoint system, allowing the crawl state to be saved and resumed.
57
+
58
+ ### Session Manager
59
+
60
+ Manages one or more named session instances. Each session is one of:
61
+
62
+ - [FetcherSession](../fetching/static.md)
63
+ - [AsyncDynamicSession](../fetching/dynamic.md)
64
+ - [AsyncStealthySession](../fetching/stealthy.md)
65
+
66
+ When a request comes in, the Session Manager routes it to the correct session based on the request's `sid` field. Sessions can be started with the spider start (default) or lazily (started on the first use).
67
+
68
+ ### Checkpoint System
69
+
70
+ An optional system that, if enabled, saves the crawler's state (pending requests + seen URL fingerprints) to a pickle file on disk. Writes are atomic (temp file + rename) to prevent corruption. Checkpoints are saved periodically at a configurable interval and on graceful shutdown. Upon successful completion (not paused), checkpoint files are automatically cleaned up.
71
+
72
+ ### Output
73
+
74
+ Scraped items are collected in an `ItemList` (a list subclass with `to_json()` and `to_jsonl()` export methods). Crawl statistics are tracked in a `CrawlStats` dataclass which contains a lot of useful info.
75
+
76
+
77
+ ## Comparison with Scrapy
78
+
79
+ If you're coming from Scrapy, here's how Scrapling's spider system maps:
80
+
81
+ | Concept | Scrapy | Scrapling |
82
+ |--------------------|-------------------------------|-----------------------------------------------------------------|
83
+ | Spider definition | `scrapy.Spider` subclass | `scrapling.spiders.Spider` subclass |
84
+ | Initial requests | `start_requests()` | `async start_requests()` |
85
+ | Callbacks | `def parse(self, response)` | `async def parse(self, response)` |
86
+ | Following links | `response.follow(url)` | `response.follow(url)` |
87
+ | Item output | `yield dict` or `yield Item` | `yield dict` |
88
+ | Request scheduling | Scheduler + Dupefilter | Scheduler with built-in deduplication |
89
+ | Downloading | Downloader + Middlewares | Session Manager with multi-session support |
90
+ | Item processing | Item Pipelines | `on_scraped_item()` hook |
91
+ | Blocked detection | Through custom middlewares | Built-in `is_blocked()` + `retry_blocked_request()` hooks |
92
+ | Concurrency | `CONCURRENT_REQUESTS` setting | `concurrent_requests` class attribute |
93
+ | Domain filtering | `allowed_domains` | `allowed_domains` |
94
+ | Pause/Resume | `JOBDIR` setting | `crawldir` constructor argument |
95
+ | Export | Feed exports | `result.items.to_json()` / `to_jsonl()` or custom through hooks |
96
+ | Running | `scrapy crawl spider_name` | `MySpider().start()` |
97
+ | Streaming | N/A | `async for item in spider.stream()` |
98
+ | Multi-session | N/A | Multiple sessions with different types per spider |
docs/spiders/getting-started.md ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Getting started
2
+
3
+ ## Introduction
4
+
5
+ !!! success "Prerequisites"
6
+
7
+ 1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand the different fetcher types and when to use each one.
8
+ 2. You've completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) and [Response](../fetching/choosing.md#response-object) classes.
9
+ 3. You've read the [Architecture](architecture.md) page for a high-level overview of how the spider system works.
10
+
11
+ The spider system lets you build concurrent, multi-page crawlers in just a few lines of code. If you've used Scrapy before, the patterns will feel familiar. If not, this guide will walk you through everything you need to get started.
12
+
13
+ ## Your First Spider
14
+
15
+ A spider is a class that defines how to crawl and extract data from websites. Here's the simplest possible spider:
16
+
17
+ ```python
18
+ from scrapling.spiders import Spider, Response
19
+
20
+ class QuotesSpider(Spider):
21
+ name = "quotes"
22
+ start_urls = ["https://quotes.toscrape.com"]
23
+
24
+ async def parse(self, response: Response):
25
+ for quote in response.css("div.quote"):
26
+ yield {
27
+ "text": quote.css("span.text::text").get(""),
28
+ "author": quote.css("small.author::text").get(""),
29
+ }
30
+ ```
31
+
32
+ Every spider needs three things:
33
+
34
+ 1. **`name`** — A unique identifier for the spider.
35
+ 2. **`start_urls`** — A list of URLs to start crawling from.
36
+ 3. **`parse()`** — An async generator method that processes each response and yields results.
37
+
38
+ The `parse()` method is where the magic happens. You use the same selection methods you'd use with Scrapling's [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object), and `yield` dictionaries to output scraped items.
39
+
40
+ ## Running the Spider
41
+
42
+ To run your spider, create an instance and call `start()`:
43
+
44
+ ```python
45
+ result = QuotesSpider().start()
46
+ ```
47
+
48
+ The `start()` method handles all the async machinery internally — no need to worry about event loops. While the spider is running, everything that happens is logged to the terminal, and at the end of the crawl, you get very detailed stats.
49
+
50
+ Those stats are in the returned `CrawlResult` object, which gives you everything you need:
51
+
52
+ ```python
53
+ result = QuotesSpider().start()
54
+
55
+ # Access scraped items
56
+ for item in result.items:
57
+ print(item["text"], "-", item["author"])
58
+
59
+ # Check statistics
60
+ print(f"Scraped {result.stats.items_scraped} items")
61
+ print(f"Made {result.stats.requests_count} requests")
62
+ print(f"Took {result.stats.elapsed_seconds:.1f} seconds")
63
+
64
+ # Did the crawl finish or was it paused?
65
+ print(f"Completed: {result.completed}")
66
+ ```
67
+
68
+ ## Following Links
69
+
70
+ Most crawls need to follow links across multiple pages. Use `response.follow()` to create follow-up requests:
71
+
72
+ ```python
73
+ from scrapling.spiders import Spider, Response
74
+
75
+ class QuotesSpider(Spider):
76
+ name = "quotes"
77
+ start_urls = ["https://quotes.toscrape.com"]
78
+
79
+ async def parse(self, response: Response):
80
+ # Extract items from the current page
81
+ for quote in response.css("div.quote"):
82
+ yield {
83
+ "text": quote.css("span.text::text").get(""),
84
+ "author": quote.css("small.author::text").get(""),
85
+ }
86
+
87
+ # Follow the "next page" link
88
+ next_page = response.css("li.next a::attr(href)").get()
89
+ if next_page:
90
+ yield response.follow(next_page, callback=self.parse)
91
+ ```
92
+
93
+ `response.follow()` handles relative URLs automatically — it joins them with the current page's URL. It also sets the current page as the `Referer` header by default.
94
+
95
+ You can point follow-up requests at different callback methods for different page types:
96
+
97
+ ```python
98
+ async def parse(self, response: Response):
99
+ for link in response.css("a.product-link::attr(href)").getall():
100
+ yield response.follow(link, callback=self.parse_product)
101
+
102
+ async def parse_product(self, response: Response):
103
+ yield {
104
+ "name": response.css("h1::text").get(""),
105
+ "price": response.css(".price::text").get(""),
106
+ }
107
+ ```
108
+
109
+ !!! note
110
+
111
+ All callback methods must be async generators (using `async def` and `yield`).
112
+
113
+ ## Exporting Data
114
+
115
+ The `ItemList` returned in `result.items` has built-in export methods:
116
+
117
+ ```python
118
+ result = QuotesSpider().start()
119
+
120
+ # Export as JSON
121
+ result.items.to_json("quotes.json")
122
+
123
+ # Export as JSON with pretty-printing
124
+ result.items.to_json("quotes.json", indent=True)
125
+
126
+ # Export as JSON Lines (one JSON object per line)
127
+ result.items.to_jsonl("quotes.jsonl")
128
+ ```
129
+
130
+ Both methods create parent directories automatically if they don't exist.
131
+
132
+ ## Filtering Domains
133
+
134
+ Use `allowed_domains` to restrict the spider to specific domains. This prevents it from accidentally following links to external websites:
135
+
136
+ ```python
137
+ class MySpider(Spider):
138
+ name = "my_spider"
139
+ start_urls = ["https://example.com"]
140
+ allowed_domains = {"example.com"}
141
+
142
+ async def parse(self, response: Response):
143
+ for link in response.css("a::attr(href)").getall():
144
+ # Links to other domains are silently dropped
145
+ yield response.follow(link, callback=self.parse)
146
+ ```
147
+
148
+ Subdomains are matched automatically — setting `allowed_domains = {"example.com"}` also allows `sub.example.com`, `blog.example.com`, etc.
149
+
150
+ When a request is filtered out, it's counted in `stats.offsite_requests_count` so you can see how many were dropped.
151
+
152
+ ## What's Next
153
+
154
+ Now that you have the basics, you can explore:
155
+
156
+ - [Requests & Responses](requests-responses.md) — learn about request priority, deduplication, metadata, and more.
157
+ - [Sessions](sessions.md) — use multiple fetcher types (HTTP, browser, stealth) in a single spider.
158
+ - [Proxy management & blocking](proxy-blocking.md) — rotate proxies across requests and how to handle blocking in the spider.
159
+ - [Advanced features](advanced.md) — concurrency control, pause/resume, streaming, lifecycle hooks, and logging.
docs/spiders/proxy-blocking.md ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 + # Proxy management and handling blocks
2
+
3
+ ## Introduction
4
+
5
+ !!! success "Prerequisites"
6
+
7
+ 1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider.
8
+ 2. You've read the [Sessions](sessions.md) page and understand how to configure sessions.
9
+
10
+ When scraping at scale, you'll often need to rotate through multiple proxies to avoid rate limits and blocks. Scrapling's `ProxyRotator` makes this straightforward — it works with all session types and integrates with the spider's blocked request retry system.
11
+
12
+ If you don't know what a proxy is or how to choose a good one, [this guide can help](https://substack.thewebscraping.club/p/everything-about-proxies).
13
+
14
+ ## ProxyRotator
15
+
16
+ The `ProxyRotator` class manages a list of proxies and rotates through them automatically. Pass it to any session type via the `proxy_rotator` parameter:
17
+
18
+ ```python
19
+ from scrapling.spiders import Spider, Response
20
+ from scrapling.fetchers import FetcherSession, ProxyRotator
21
+
22
+ class MySpider(Spider):
23
+ name = "my_spider"
24
+ start_urls = ["https://example.com"]
25
+
26
+ def configure_sessions(self, manager):
27
+ rotator = ProxyRotator([
28
+ "http://proxy1:8080",
29
+ "http://proxy2:8080",
30
+ "http://user:pass@proxy3:8080",
31
+ ])
32
+ manager.add("default", FetcherSession(proxy_rotator=rotator))
33
+
34
+ async def parse(self, response: Response):
35
+ # Check which proxy was used
36
+ print(f"Proxy used: {response.meta.get('proxy')}")
37
+ yield {"title": response.css("title::text").get("")}
38
+ ```
39
+
40
+ Each request automatically gets the next proxy in the rotation. The proxy used is stored in `response.meta["proxy"]` so you can track which proxy fetched which page.
41
+
42
+
43
 + When you use it with browser sessions, you will need to make some adjustments, as shown below:
44
+
45
+ ```python
46
+ from scrapling.fetchers import AsyncDynamicSession, AsyncStealthySession, ProxyRotator
47
+
48
+ # String proxies work for all session types
49
+ rotator = ProxyRotator([
50
+ "http://proxy1:8080",
51
+ "http://proxy2:8080",
52
+ ])
53
+
54
+ # Dict proxies (Playwright format) work for browser sessions
55
+ rotator = ProxyRotator([
56
+ {"server": "http://proxy1:8080", "username": "user", "password": "pass"},
57
+ {"server": "http://proxy2:8080"},
58
+ ])
59
+
60
+ # Then inside the spider
61
+ def configure_sessions(self, manager):
62
+ rotator = ProxyRotator(["http://proxy1:8080", "http://proxy2:8080"])
63
+ manager.add("browser", AsyncStealthySession(proxy_rotator=rotator))
64
+ ```
65
+
66
+ !!! info
67
+
68
+ 1. You cannot use the `proxy_rotator` argument together with the static `proxy` or `proxies` parameters on the same session. Pick one approach when configuring the session, and override it per request later if you want, as we will show later.
69
+ 2. Remember that by default, all browser-based sessions use a persistent browser context with a pool of tabs. However, since browsers can't set a proxy per tab, when you use a `ProxyRotator`, the fetcher will automatically open a separate context for each proxy, with one tab per context. Once the tab's job is done, both the tab and its context are closed.
70
+
71
+ ## Custom Rotation Strategies
72
+
73
+ By default, `ProxyRotator` uses cyclic rotation — it iterates through proxies sequentially, wrapping around at the end.
74
+
75
+ You can provide a custom strategy function to change this behavior, but it has to match the below signature:
76
+
77
+ ```python
78
+ from scrapling.core._types import ProxyType
79
+
80
+ def my_strategy(proxies: list, current_index: int) -> tuple[ProxyType, int]:
81
+ ...
82
+ ```
83
+
84
+ It receives the list of proxies and the current index, and must return the chosen proxy and the next index.
85
+
86
+ Below are some examples of custom rotation strategies you can use.
87
+
88
+ ### Random Rotation
89
+
90
+ ```python
91
+ import random
92
+ from scrapling.fetchers import ProxyRotator
93
+
94
+ def random_strategy(proxies, current_index):
95
+ idx = random.randint(0, len(proxies) - 1)
96
+ return proxies[idx], idx
97
+
98
+ rotator = ProxyRotator(
99
+ ["http://proxy1:8080", "http://proxy2:8080", "http://proxy3:8080"],
100
+ strategy=random_strategy,
101
+ )
102
+ ```
103
+
104
+ ### Weighted Rotation
105
+
106
+ ```python
107
+ import random
108
+
109
+ def weighted_strategy(proxies, current_index):
110
+ # First proxy gets 60% of traffic, others split the rest
111
+ weights = [60] + [40 // (len(proxies) - 1)] * (len(proxies) - 1)
112
+ proxy = random.choices(proxies, weights=weights, k=1)[0]
113
+ return proxy, current_index # Index doesn't matter for weighted
114
+
115
+ rotator = ProxyRotator(proxies, strategy=weighted_strategy)
116
+ ```
117
+
118
+
119
+ ## Per-Request Proxy Override
120
+
121
+ You can override the rotator for individual requests by passing `proxy=` as a keyword argument:
122
+
123
+ ```python
124
+ async def parse(self, response: Response):
125
+ # This request uses the rotator's next proxy
126
+ yield response.follow("/page1", callback=self.parse_page)
127
+
128
+ # This request uses a specific proxy, bypassing the rotator
129
+ yield response.follow(
130
+ "/special-page",
131
+ callback=self.parse_page,
132
+ proxy="http://special-proxy:8080",
133
+ )
134
+ ```
135
+
136
+ This is useful when certain pages require a specific proxy (e.g., a geo-located proxy for region-specific content).
137
+
138
+ ## Blocked Request Handling
139
+
140
+ The spider has built-in blocked request detection and retry. By default, it considers the following HTTP status codes blocked: `401`, `403`, `407`, `429`, `444`, `500`, `502`, `503`, `504`.
141
+
142
+ The retry system works like this:
143
+
144
+ 1. After a response comes back, the spider calls the `is_blocked(response)` method.
145
+ 2. If blocked, it copies the request and calls the `retry_blocked_request()` method so you can modify it before retrying.
146
+ 3. The retried request is re-queued with `dont_filter=True` (bypassing deduplication) and lower priority, so it's not retried right away.
147
+ 4. This repeats up to `max_blocked_retries` times (default: 3).
148
+
149
+ !!! tip
150
+
151
+ 1. On retry, the previous `proxy`/`proxies` kwargs are cleared from the request automatically, so the rotator assigns a fresh proxy.
152
 + 2. The `max_blocked_retries` attribute is different from the session retries and doesn't share its counter.
153
+
154
+ ### Custom Block Detection
155
+
156
+ Override `is_blocked()` to add your own detection logic:
157
+
158
+ ```python
159
+ class MySpider(Spider):
160
+ name = "my_spider"
161
+ start_urls = ["https://example.com"]
162
+
163
+ async def is_blocked(self, response: Response) -> bool:
164
+ # Check status codes (default behavior)
165
+ if response.status in {403, 429, 503}:
166
+ return True
167
+
168
+ # Check response content
169
+ body = response.body.decode("utf-8", errors="ignore")
170
+ if "access denied" in body.lower() or "rate limit" in body.lower():
171
+ return True
172
+
173
+ return False
174
+
175
+ async def parse(self, response: Response):
176
+ yield {"title": response.css("title::text").get("")}
177
+ ```
178
+
179
+ ### Customizing Retries
180
+
181
+ Override `retry_blocked_request()` to modify the request before retrying. The `max_blocked_retries` attribute controls how many times a blocked request is retried (default: 3):
182
+
183
+ ```python
184
+ from scrapling.spiders import Spider, SessionManager, Request, Response
185
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
186
+
187
+
188
+ class MySpider(Spider):
189
+ name = "my_spider"
190
+ start_urls = ["https://example.com"]
191
+ max_blocked_retries = 5
192
+
193
+ def configure_sessions(self, manager: SessionManager) -> None:
194
+ manager.add('requests', FetcherSession(impersonate=['chrome', 'firefox', 'safari']))
195
+ manager.add('stealth', AsyncStealthySession(block_webrtc=True), lazy=True)
196
+
197
+ async def retry_blocked_request(self, request: Request, response: Response) -> Request:
198
+ request.sid = "stealth"
199
+ self.logger.info(f"Retrying blocked request: {request.url}")
200
+ return request
201
+
202
+ async def parse(self, response: Response):
203
+ yield {"title": response.css("title::text").get("")}
204
+ ```
205
+
206
 + In the example above, the blocking detection logic is left unchanged; the spider mainly uses plain HTTP requests until it gets blocked, then switches to the stealthy browser.
207
+
208
+
209
+ Putting it all together:
210
+
211
+ ```python
212
+ from scrapling.spiders import Spider, SessionManager, Request, Response
213
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, ProxyRotator
214
+
215
+
216
+ cheap_proxies = ProxyRotator([ "http://proxy1:8080", "http://proxy2:8080"])
217
+
218
+ # A format acceptable by the browser
219
+ expensive_proxies = ProxyRotator([
220
+ {"server": "http://residential_proxy1:8080", "username": "user", "password": "pass"},
221
+ {"server": "http://residential_proxy2:8080", "username": "user", "password": "pass"},
222
+ {"server": "http://mobile_proxy1:8080", "username": "user", "password": "pass"},
223
+ {"server": "http://mobile_proxy2:8080", "username": "user", "password": "pass"},
224
+ ])
225
+
226
+
227
+ class MySpider(Spider):
228
+ name = "my_spider"
229
+ start_urls = ["https://example.com"]
230
+ max_blocked_retries = 5
231
+
232
+ def configure_sessions(self, manager: SessionManager) -> None:
233
+ manager.add('requests', FetcherSession(impersonate=['chrome', 'firefox', 'safari'], proxy_rotator=cheap_proxies))
234
+ manager.add('stealth', AsyncStealthySession(block_webrtc=True, proxy_rotator=expensive_proxies), lazy=True)
235
+
236
+ async def retry_blocked_request(self, request: Request, response: Response) -> Request:
237
+ request.sid = "stealth"
238
+ self.logger.info(f"Retrying blocked request: {request.url}")
239
+ return request
240
+
241
+ async def parse(self, response: Response):
242
+ yield {"title": response.css("title::text").get("")}
243
+ ```
244
+ The above logic is: requests are made with cheap proxies, such as datacenter proxies, until they are blocked, then retried with higher-quality proxies, such as residential or mobile proxies.
docs/spiders/requests-responses.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requests & Responses
2
+
3
+ !!! success "Prerequisites"
4
+
5
+ 1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider.
6
+
7
+ This page covers the `Request` object in detail — how to construct requests, pass data between callbacks, control priority and deduplication, and use `response.follow()` for link-following.
8
+
9
+ ## The Request Object
10
+
11
+ A `Request` represents a URL to be fetched. You create requests either directly or via `response.follow()`:
12
+
13
+ ```python
14
+ from scrapling.spiders import Request
15
+
16
+ # Direct construction
17
+ request = Request(
18
+ "https://example.com/page",
19
+ callback=self.parse_page,
20
+ priority=5,
21
+ )
22
+
23
+ # Via response.follow (preferred in callbacks)
24
+ request = response.follow("/page", callback=self.parse_page)
25
+ ```
26
+
27
+ Here are all the arguments you can pass to `Request`:
28
+
29
+ | Argument | Type | Default | Description |
30
+ |---------------|------------|------------|-------------------------------------------------------------------------------------------------------|
31
+ | `url` | `str` | *required* | The URL to fetch |
32
+ | `sid` | `str` | `""` | Session ID — routes the request to a specific session (see [Sessions](sessions.md)) |
33
+ | `callback` | `callable` | `None` | Async generator method to process the response. Defaults to `parse()` |
34
+ | `priority` | `int` | `0` | Higher values are processed first |
35
+ | `dont_filter` | `bool` | `False` | If `True`, skip deduplication (allow duplicate requests) |
36
+ | `meta` | `dict` | `{}` | Arbitrary metadata passed through to the response |
37
+ | `**kwargs` | | | Additional keyword arguments passed to the session's fetch method (e.g., `headers`, `method`, `data`) |
38
+
39
+ Any extra keyword arguments are forwarded directly to the underlying session. For example, to make a POST request:
40
+
41
+ ```python
42
+ yield Request(
43
+ "https://example.com/api",
44
+ method="POST",
45
+ data={"key": "value"},
46
+ callback=self.parse_result,
47
+ )
48
+ ```
49
+
50
+ ## Response.follow()
51
+
52
+ `response.follow()` is the recommended way to create follow-up requests inside callbacks. It offers several advantages over constructing `Request` objects directly:
53
+
54
+ - **Relative URLs** are resolved automatically against the current page URL
55
+ - **Referer header** is set to the current page URL by default
56
+ - **Session kwargs** from the original request are inherited (headers, proxy settings, etc.)
57
+ - **Callback, session ID, and priority** are inherited from the original request if not specified
58
+
59
+ ```python
60
+ async def parse(self, response: Response):
61
+ # Minimal — inherits callback, sid, priority from current request
62
+ yield response.follow("/next-page")
63
+
64
+ # Override specific fields
65
+ yield response.follow(
66
+ "/product/123",
67
+ callback=self.parse_product,
68
+ priority=10,
69
+ )
70
+
71
+ # Pass additional metadata to
72
+ yield response.follow(
73
+ "/details",
74
+ callback=self.parse_details,
75
+ meta={"category": "electronics"},
76
+ )
77
+ ```
78
+
79
+ | Argument | Type | Default | Description |
80
+ |--------------------|------------|------------|------------------------------------------------------------|
81
+ | `url` | `str` | *required* | URL to follow (absolute or relative) |
82
+ | `sid` | `str` | `""` | Session ID (inherits from original request if empty) |
83
+ | `callback` | `callable` | `None` | Callback method (inherits from original request if `None`) |
84
+ | `priority` | `int` | `None` | Priority (inherits from original request if `None`) |
85
+ | `dont_filter` | `bool` | `False` | Skip deduplication |
86
+ | `meta` | `dict` | `None` | Metadata (merged with existing response meta) |
87
+ | **`referer_flow`** | `bool` | `True` | Set current URL as Referer header |
88
+ | `**kwargs` | | | Merged with original request's session kwargs |
89
+
90
+ ### Disabling Referer Flow
91
+
92
+ By default, `response.follow()` sets the `Referer` header to the current page URL. To disable this:
93
+
94
+ ```python
95
+ yield response.follow("/page", referer_flow=False)
96
+ ```
97
+
98
+ ## Callbacks
99
+
100
+ Callbacks are async generator methods on your spider that process responses. They must `yield` one of three types:
101
+
102
+ - **`dict`** — A scraped item, added to the results
103
+ - **`Request`** — A follow-up request, added to the queue
104
+ - **`None`** — Silently ignored
105
+
106
+ ```python
107
+ class MySpider(Spider):
108
+ name = "my_spider"
109
+ start_urls = ["https://example.com"]
110
+
111
+ async def parse(self, response: Response):
112
+ # Yield items (dicts)
113
+ yield {"url": response.url, "title": response.css("title::text").get("")}
114
+
115
+ # Yield follow-up requests
116
+ for link in response.css("a::attr(href)").getall():
117
+ yield response.follow(link, callback=self.parse_page)
118
+
119
+ async def parse_page(self, response: Response):
120
+ yield {"content": response.css("article::text").get("")}
121
+ ```
122
+
123
+ !!! tip "Note:"
124
+
125
+ All callback methods must be `async def` and use `yield` (not `return`). Even if a callback only yields items with no follow-up requests, it must still be an async generator.
126
+
127
+ ## Request Priority
128
+
129
+ Requests with higher priority values are processed first. This is useful when some pages are more important to be processed first before others:
130
+
131
+ ```python
132
+ async def parse(self, response: Response):
133
+ # High priority — process product pages first
134
+ for link in response.css("a.product::attr(href)").getall():
135
+ yield response.follow(link, callback=self.parse_product, priority=10)
136
+
137
+ # Low priority — pagination links processed after products
138
+ next_page = response.css("a.next::attr(href)").get()
139
+ if next_page:
140
+ yield response.follow(next_page, callback=self.parse, priority=0)
141
+ ```
142
+
143
+ When using `response.follow()`, the priority is inherited from the original request unless you specify a new one.
144
+
145
+ ## Deduplication
146
+
147
+ The spider automatically deduplicates requests based on a fingerprint computed from the URL, HTTP method, request body, and session ID. If two requests produce the same fingerprint, the second one is silently dropped.
148
+
149
+ To allow duplicate requests (e.g., re-visiting a page after login), set `dont_filter=True`:
150
+
151
+ ```python
152
+ yield Request("https://example.com/dashboard", dont_filter=True, callback=self.parse_dashboard)
153
+
154
+ # Or with response.follow
155
+ yield response.follow("/dashboard", dont_filter=True, callback=self.parse_dashboard)
156
+ ```
157
+
158
+ You can fine-tune what goes into the fingerprint using class attributes on your spider:
159
+
160
+ | Attribute | Default | Effect |
161
+ |----------------------|---------|-----------------------------------------------------------------------------------------------------------------|
162
+ | `fp_include_kwargs` | `False` | Include extra request kwargs (arguments you passed to the session fetch, like headers, etc.) in the fingerprint |
163
+ | `fp_keep_fragments` | `False` | Keep URL fragments (`#section`) when computing fingerprints |
164
+ | `fp_include_headers` | `False` | Include request headers in the fingerprint |
165
+
166
+ For example, if you need to treat `https://example.com/page#section1` and `https://example.com/page#section2` as different URLs:
167
+
168
+ ```python
169
+ class MySpider(Spider):
170
+ name = "my_spider"
171
+ fp_keep_fragments = True
172
+ # ...
173
+ ```
174
+
175
+ ## Request Meta
176
+
177
+ The `meta` dictionary lets you pass arbitrary data between callbacks. This is useful when you need context from one page to process another:
178
+
179
+ ```python
180
+ async def parse(self, response: Response):
181
+ for product in response.css("div.product"):
182
+ category = product.css("span.category::text").get("")
183
+ link = product.css("a::attr(href)").get()
184
+ if link:
185
+ yield response.follow(
186
+ link,
187
+ callback=self.parse_product,
188
+ meta={"category": category},
189
+ )
190
+
191
+ async def parse_product(self, response: Response):
192
+ yield {
193
+ "name": response.css("h1::text").get(""),
194
+ "price": response.css(".price::text").get(""),
195
+ # Access meta from the request
196
+ "category": response.meta.get("category", ""),
197
+ }
198
+ ```
199
+
200
+ When using `response.follow()`, the meta from the current response is merged with the new meta you provide (new values take precedence).
201
+
202
+ The spider system also automatically stores some metadata. For example, the proxy used for a request is available as `response.meta["proxy"]` when proxy rotation is enabled.
docs/spiders/sessions.md ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Spider Sessions
2
+
3
+ !!! success "Prerequisites"
4
+
5
+ 1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider.
6
+ 2. You're familiar with [Fetchers basics](../fetching/choosing.md) and the differences between HTTP, Dynamic, and Stealthy sessions.
7
+
8
+ A spider can use multiple fetcher sessions simultaneously — for example, a fast HTTP session for simple pages and a stealth browser session for protected pages. This page shows you how to configure and use sessions.
9
+
10
+ ## What are Sessions?
11
+
12
+ As you should already know, a session is a pre-configured fetcher instance that stays alive for the duration of the crawl. Instead of creating a new connection or browser for every request, the spider reuses sessions, which is faster and more resource-efficient.
13
+
14
+ By default, every spider creates a single [FetcherSession](../fetching/static.md). You can add more sessions or swap the default by overriding the `configure_sessions()` method, but note that only the async version of each session type can be used, as shown in the table below:
15
+
16
+
17
+ | Session Type | Use Case |
18
+ |-------------------------------------------------|------------------------------------------|
19
+ | [FetcherSession](../fetching/static.md) | Fast HTTP requests, no JavaScript |
20
+ | [AsyncDynamicSession](../fetching/dynamic.md) | Browser automation, JavaScript rendering |
21
+ | [AsyncStealthySession](../fetching/stealthy.md) | Anti-bot bypass, Cloudflare, etc. |
22
+
23
+
24
+ ## Configuring Sessions
25
+
26
+ Override `configure_sessions()` on your spider to set up sessions. The `manager` parameter is a `SessionManager` instance — use `manager.add()` to register sessions:
27
+
28
+ ```python
29
+ from scrapling.spiders import Spider, Response
30
+ from scrapling.fetchers import FetcherSession
31
+
32
+ class MySpider(Spider):
33
+ name = "my_spider"
34
+ start_urls = ["https://example.com"]
35
+
36
+ def configure_sessions(self, manager):
37
+ manager.add("default", FetcherSession())
38
+
39
+ async def parse(self, response: Response):
40
+ yield {"title": response.css("title::text").get("")}
41
+ ```
42
+
43
+ The `manager.add()` method takes:
44
+
45
+ | Argument | Type | Default | Description |
46
+ |--------------|-----------|------------|----------------------------------------------|
47
+ | `session_id` | `str` | *required* | A name to reference this session in requests |
48
+ | `session` | `Session` | *required* | The session instance |
49
+ | `default` | `bool` | `False` | Make this the default session |
50
+ | `lazy` | `bool` | `False` | Start the session only when first used |
51
+
52
+ !!! note "Notes:"
53
+
54
+ 1. In all requests, if you don't specify which session to use, the default session is used. The default session is determined in one of two ways:
55
+ 1. The first session you add to the manager becomes the default automatically.
56
+ 2. The session that gets `default=True` while added to the manager.
57
+ 2. The session instances you pass don't have to be started already; the spider checks all registered sessions and starts any that aren't running.
58
+ 3. If you want a specific session to start when used only, then use the `lazy` argument while adding that session to the manager. Example: start the browser only when you need it, not with the spider start.
59
+
60
+ ## Multi-Session Spider
61
+
62
+ Here's a practical example: use a fast HTTP session for listing pages and a stealth browser for detail pages that have bot protection:
63
+
64
+ ```python
65
+ from scrapling.spiders import Spider, Response
66
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
67
+
68
+ class ProductSpider(Spider):
69
+ name = "products"
70
+ start_urls = ["https://shop.example.com/products"]
71
+
72
+ def configure_sessions(self, manager):
73
+ # Fast HTTP for listing pages (default)
74
+ manager.add("http", FetcherSession())
75
+
76
+ # Stealth browser for protected product pages
77
+ manager.add("stealth", AsyncStealthySession(
78
+ headless=True,
79
+ network_idle=True,
80
+ ))
81
+
82
+ async def parse(self, response: Response):
83
+ for link in response.css("a.product::attr(href)").getall():
84
+ # Route product pages through the stealth session
85
+ yield response.follow(link, sid="stealth", callback=self.parse_product)
86
+
87
+ next_page = response.css("a.next::attr(href)").get()
88
+ if next_page:
89
+ yield response.follow(next_page)
90
+
91
+ async def parse_product(self, response: Response):
92
+ yield {
93
+ "name": response.css("h1::text").get(""),
94
+ "price": response.css(".price::text").get(""),
95
+ }
96
+ ```
97
+
98
+ The key is the `sid` parameter — it tells the spider which session to use for each request. When you call `response.follow()` without `sid`, the session ID from the original request is inherited.
99
+
100
+ Note that sessions don't have to be of different classes; you can also register multiple instances of the same session class with different configurations, as in the example below:
101
+
102
+ ```python
103
+ from scrapling.spiders import Spider, Response
104
+ from scrapling.fetchers import FetcherSession
105
+
106
+ class ProductSpider(Spider):
107
+ name = "products"
108
+ start_urls = ["https://shop.example.com/products"]
109
+
110
+ def configure_sessions(self, manager):
111
+ chrome_requests = FetcherSession(impersonate="chrome")
112
+ firefox_requests = FetcherSession(impersonate="firefox")
113
+
114
+ manager.add("chrome", chrome_requests)
115
+ manager.add("firefox", firefox_requests)
116
+
117
+ async def parse(self, response: Response):
118
+ for link in response.css("a.product::attr(href)").getall():
119
+ yield response.follow(link, callback=self.parse_product)
120
+
121
+ next_page = response.css("a.next::attr(href)").get()
122
+ if next_page:
123
+ yield response.follow(next_page, sid="firefox")
124
+
125
+ async def parse_product(self, response: Response):
126
+ yield {
127
+ "name": response.css("h1::text").get(""),
128
+ "price": response.css(".price::text").get(""),
129
+ }
130
+ ```
131
+
132
+ Or you can separate concerns and keep a session with its cookies/state for specific requests, etc...
133
+
134
+ ## Session Arguments
135
+
136
+ Extra keyword arguments passed to a `Request` (or through `response.follow(**kwargs)`) are forwarded to the session's fetch method. This lets you customize individual requests without changing the session configuration:
137
+
138
+ ```python
139
+ async def parse(self, response: Response):
140
+ # Pass extra headers for this specific request
141
+ yield Request(
142
+ "https://api.example.com/data",
143
+ headers={"Authorization": "Bearer token123"},
144
+ callback=self.parse_api,
145
+ )
146
+
147
+ # Use a different HTTP method
148
+ yield Request(
149
+ "https://example.com/submit",
150
+ method="POST",
151
+ data={"field": "value"},
152
+ sid="firefox",
153
+ callback=self.parse_result,
154
+ )
155
+ ```
156
+
157
+ !!! warning
158
+
159
+ Normally, when you use `FetcherSession`, `Fetcher`, or `AsyncFetcher`, you specify the HTTP method to use with the corresponding method like `.get()` and `.post()`. But while using `FetcherSession` in spiders, you can't do this. By default, the request is an _HTTP GET_ request; if you want to use another HTTP method, you have to pass it to the `method` argument, as in the above example. The reason for this is to unify the `Request` interface across all session types.
160
+
161
+ For browser sessions (`AsyncDynamicSession`, `AsyncStealthySession`), you can pass browser-specific arguments like `wait_selector`, `page_action`, or `extra_headers`:
162
+
163
+ ```python
164
+ async def parse(self, response: Response):
165
+ # Use Cloudflare solver with the `AsyncStealthySession` we configured above
166
+ yield Request(
167
+ "https://nopecha.com/demo/cloudflare",
168
+ sid="stealth",
169
+ callback=self.parse_result,
170
+ solve_cloudflare=True,
171
+ block_webrtc=True,
172
+ hide_canvas=True,
173
+ google_search=True,
174
+ )
175
+
176
+ yield response.follow(
177
+ "/dynamic-page",
178
+ sid="browser",
179
+ callback=self.parse_dynamic,
180
+ wait_selector="div.loaded",
181
+ network_idle=True,
182
+ )
183
+ ```
184
+
185
+ !!! warning
186
+
187
+ Session arguments (`**kwargs`) passed from the original request are inherited by `response.follow()`. New kwargs take precedence over inherited ones.
188
+
189
+ ```python
190
+ from scrapling.spiders import Spider, Response, Request
191
+ from scrapling.fetchers import FetcherSession
192
+
193
+ class ProductSpider(Spider):
194
+ name = "products"
195
+ start_urls = ["https://shop.example.com/products"]
196
+
197
+ def configure_sessions(self, manager):
198
+ manager.add("http", FetcherSession(impersonate='chrome'))
199
+
200
+ async def parse(self, response: Response):
201
+ # I don't want the follow request to impersonate a desktop Chrome like the previous request, but a mobile one
202
+ # so I override it like this
203
+ for link in response.css("a.product::attr(href)").getall():
204
+ yield response.follow(link, impersonate="chrome131_android", callback=self.parse_product)
205
+
206
+ next_page = response.css("a.next::attr(href)").get()
207
+ if next_page:
208
+ yield Request(next_page)
209
+
210
+ async def parse_product(self, response: Response):
211
+ yield {
212
+ "name": response.css("h1::text").get(""),
213
+ "price": response.css(".price::text").get(""),
214
+ }
215
+ ```
216
+ !!! info
217
+
218
+ Upon spider closure, the manager automatically checks whether any sessions are still running and closes them before shutting down.
docs/tutorials/migrating_from_beautifulsoup.md CHANGED
@@ -18,10 +18,10 @@ You will notice that some shortcuts in BeautifulSoup are missing in Scrapling, w
18
  | Finding a single element (Example 4) | `element = soup.find(lambda e: len(list(e.children)) > 0)` | `element = page.find(lambda e: len(e.children) > 0)` |
19
  | Finding a single element (Example 5) | `element = soup.find(["a", "b"])` | `element = page.find(["a", "b"])` |
20
  | Find element by its text content | `element = soup.find(text="some text")` | `element = page.find_by_text("some text", partial=False)` |
21
- | Using CSS selectors to find the first matching element | `elements = soup.select_one('div.example')` | `elements = page.css_first('div.example')` |
22
  | Using CSS selectors to find all matching element | `elements = soup.select('div.example')` | `elements = page.css('div.example')` |
23
  | Get a prettified version of the page/element source | `prettified = soup.prettify()` | `prettified = page.prettify()` |
24
- | Get a Non-pretty version of the page/element source | `source = str(soup)` | `source = page.body` |
25
  | Get tag name of an element | `name = element.name` | `name = element.tag` |
26
  | Extracting text content of an element | `string = element.string` | `string = element.text` |
27
  | Extracting all the text in a document or beneath a tag | `text = soup.get_text(strip=True)` | `text = page.get_all_text(strip=True)` |
@@ -36,14 +36,16 @@ You will notice that some shortcuts in BeautifulSoup are missing in Scrapling, w
36
  | Searching for elements in the siblings of an element | `target_sibling = element.find_next_siblings("a")`<br/>`target_sibling = element.find_previous_siblings("a")` | `target_sibling = element.siblings.filter(lambda s: s.tag == 'a')` |
37
  | Searching for an element in the next elements of an element | `target_parent = element.find_next("a")` | `target_parent = element.below_elements.search(lambda p: p.tag == 'a')` |
38
  | Searching for elements in the next elements of an element | `target_parent = element.find_all_next("a")` | `target_parent = element.below_elements.filter(lambda p: p.tag == 'a')` |
39
- | Searching for an element in the previous elements of an element | `target_parent = element.find_previous("a")` | `target_parent = element.path.search(lambda p: p.tag == 'a')` |
40
- | Searching for elements in the previous elements of an element | `target_parent = element.find_all_previous("a")` | `target_parent = element.path.filter(lambda p: p.tag == 'a')` |
41
  | Get previous sibling of an element | `prev_element = element.previous_sibling` | `prev_element = element.previous` |
42
  | Navigating to children | `children = list(element.children)` | `children = element.children` |
43
  | Get all descendants of an element | `children = list(element.descendants)` | `children = element.below_elements` |
44
  | Filtering a group of elements that satisfies a condition | `group = soup.find('p', 'story').css.filter('a')` | `group = page.find_all('p', 'story').filter(lambda p: p.tag == 'a')` |
45
 
46
 
 
 
47
  **One key point to remember**: BeautifulSoup offers features for modifying and manipulating the page after it has been parsed. Scrapling focuses more on scraping the page faster for you, and then you can do what you want with the extracted information. So, two different tools can be used in Web Scraping, but one of them specializes in Web Scraping :)
48
 
49
  ### Putting It All Together
@@ -80,12 +82,12 @@ for link in links:
80
 
81
  As you can see, Scrapling simplifies the process by combining fetching and parsing into a single step, making your code cleaner and more efficient.
82
 
83
- **Additional Notes:**
84
 
85
- - **Different parsers**: BeautifulSoup allows you to set the parser engine to use, and one of them is `lxml`. Scrapling doesn't do that and uses the `lxml` library by default for performance reasons.
86
- - **Element Types**: In BeautifulSoup, elements are `Tag` objects; in Scrapling, they are `Selector` objects. However, they provide similar methods and properties for navigation and data extraction.
87
- - **Error Handling**: Both libraries return `None` when an element is not found (e.g., `soup.find()` or `page.css_first()`). To avoid errors, check for `None` before accessing properties.
88
- - **Text Extraction**: Scrapling provides additional methods for handling text through `TextHandler`, such as `clean()`, which can help remove extra whitespace, consecutive spaces, or unwanted characters. Please check out the documentation for the complete list.
89
 
90
  The documentation provides more details on Scrapling's features and the complete list of arguments that can be passed to all methods.
91
 
 
18
  | Finding a single element (Example 4) | `element = soup.find(lambda e: len(list(e.children)) > 0)` | `element = page.find(lambda e: len(e.children) > 0)` |
19
  | Finding a single element (Example 5) | `element = soup.find(["a", "b"])` | `element = page.find(["a", "b"])` |
20
  | Find element by its text content | `element = soup.find(text="some text")` | `element = page.find_by_text("some text", partial=False)` |
21
+ | Using CSS selectors to find the first matching element | `elements = soup.select_one('div.example')` | `elements = page.css('div.example').first` |
22
  | Using CSS selectors to find all matching element | `elements = soup.select('div.example')` | `elements = page.css('div.example')` |
23
  | Get a prettified version of the page/element source | `prettified = soup.prettify()` | `prettified = page.prettify()` |
24
+ | Get a Non-pretty version of the page/element source | `source = str(soup)` | `source = page.html_content` |
25
  | Get tag name of an element | `name = element.name` | `name = element.tag` |
26
  | Extracting text content of an element | `string = element.string` | `string = element.text` |
27
  | Extracting all the text in a document or beneath a tag | `text = soup.get_text(strip=True)` | `text = page.get_all_text(strip=True)` |
 
36
  | Searching for elements in the siblings of an element | `target_sibling = element.find_next_siblings("a")`<br/>`target_sibling = element.find_previous_siblings("a")` | `target_sibling = element.siblings.filter(lambda s: s.tag == 'a')` |
37
  | Searching for an element in the next elements of an element | `target_parent = element.find_next("a")` | `target_parent = element.below_elements.search(lambda p: p.tag == 'a')` |
38
  | Searching for elements in the next elements of an element | `target_parent = element.find_all_next("a")` | `target_parent = element.below_elements.filter(lambda p: p.tag == 'a')` |
39
+ | Searching for an element in the ancestors of an element | `target_parent = element.find_previous("a")` ¹ | `target_parent = element.path.search(lambda p: p.tag == 'a')` |
40
+ | Searching for elements in the ancestors of an element | `target_parent = element.find_all_previous("a")` ¹ | `target_parent = element.path.filter(lambda p: p.tag == 'a')` |
41
  | Get previous sibling of an element | `prev_element = element.previous_sibling` | `prev_element = element.previous` |
42
  | Navigating to children | `children = list(element.children)` | `children = element.children` |
43
  | Get all descendants of an element | `children = list(element.descendants)` | `children = element.below_elements` |
44
  | Filtering a group of elements that satisfies a condition | `group = soup.find('p', 'story').css.filter('a')` | `group = page.find_all('p', 'story').filter(lambda p: p.tag == 'a')` |
45
 
46
 
47
+ ¹ **Note:** BS4's `find_previous`/`find_all_previous` searches all preceding elements in document order, while Scrapling's `path` only returns ancestors (the parent chain). These are not exact equivalents, but ancestor search covers the most common use case.
48
+
49
  **One key point to remember**: BeautifulSoup offers features for modifying and manipulating the page after it has been parsed. Scrapling focuses more on scraping the page faster for you, and then you can do what you want with the extracted information. So, two different tools can be used in Web Scraping, but one of them specializes in Web Scraping :)
50
 
51
  ### Putting It All Together
 
82
 
83
  As you can see, Scrapling simplifies the process by combining fetching and parsing into a single step, making your code cleaner and more efficient.
84
 
85
+ !!! abstract "**Additional Notes:**"
86
 
87
+ - **Different parsers**: BeautifulSoup allows you to set the parser engine to use, and one of them is `lxml`. Scrapling doesn't do that and uses the `lxml` library by default for performance reasons.
88
+ - **Element Types**: In BeautifulSoup, elements are `Tag` objects; in Scrapling, they are `Selector` objects. However, they provide similar methods and properties for navigation and data extraction.
89
+ - **Error Handling**: Both libraries return `None` when an element is not found (e.g., `soup.find()` or `page.find()`). In Scrapling, `page.css()` returns an empty `Selectors` list when no elements match, and you can use `page.css('.foo').first` to safely get the first match or `None`. To avoid errors, check for `None` or empty results before accessing properties.
90
+ - **Text Extraction**: Scrapling provides additional methods for handling text through `TextHandler`, such as `clean()`, which can help remove extra whitespace, consecutive spaces, or unwanted characters. Please check out the documentation for the complete list.
91
 
92
  The documentation provides more details on Scrapling's features and the complete list of arguments that can be passed to all methods.
93
 
mkdocs.yml DELETED
@@ -1,180 +0,0 @@
1
- site_name: Scrapling
2
- site_description: Scrapling - Easy, effortless Web Scraping as it should be!
3
- site_author: Karim Shoair
4
- repo_url: https://github.com/D4Vinci/Scrapling
5
- site_url: https://scrapling.readthedocs.io/en/latest/
6
- repo_name: D4Vinci/Scrapling
7
- copyright: Copyright &copy; 2025 Karim Shoair - <a href="#__consent">Change cookie settings</a>
8
-
9
- theme:
10
- name: material
11
- language: en
12
- logo: assets/logo.png
13
- favicon: assets/favicon.ico
14
- palette:
15
- scheme: slate
16
- primary: black
17
- accent: deep purple
18
- font:
19
- text: Open Sans
20
- code: JetBrains Mono
21
- icon:
22
- repo: fontawesome/brands/github-alt
23
- features:
24
- - announce.dismiss
25
- - navigation.top
26
- - navigation.footer
27
- - navigation.instant
28
- - navigation.indexes
29
- - navigation.sections
30
- - navigation.tracking
31
- - navigation.instant
32
- - navigation.instant.prefetch
33
- - navigation.instant.progress
34
- # - navigation.tabs
35
- # - navigation.expand
36
- # - toc.integrate
37
- - search.share
38
- - search.suggest
39
- - search.highlight
40
- - content.tabs.link
41
- - content.width.full
42
- - content.action.view
43
- - content.action.edit
44
- - content.code.copy
45
- - content.code.select
46
- - content.code.annotate
47
- - content.code.annotation
48
-
49
- nav:
50
- - Introduction: index.md
51
- - Overview: overview.md
52
- - What's New in v0.3: 'https://github.com/D4Vinci/Scrapling/releases/tag/v0.3'
53
- - Performance Benchmarks: benchmarks.md
54
- - User Guide:
55
- - Parsing:
56
- - Querying elements: parsing/selection.md
57
- - Main classes: parsing/main_classes.md
58
- - Adaptive scraping: parsing/adaptive.md
59
- - Fetching:
60
- - Fetchers basics: fetching/choosing.md
61
- - HTTP requests: fetching/static.md
62
- - Dynamic websites: fetching/dynamic.md
63
- - Dynamic websites with hard protections: fetching/stealthy.md
64
- - Command Line Interface:
65
- - Overview: cli/overview.md
66
- - Interactive shell: cli/interactive-shell.md
67
- - Extract commands: cli/extract-commands.md
68
- - Integrations:
69
- - AI MCP server: ai/mcp-server.md
70
- - Tutorials:
71
- - A Free Alternative to AI for Robust Web Scraping: tutorials/replacing_ai.md
72
- - Migrating from BeautifulSoup: tutorials/migrating_from_beautifulsoup.md
73
- - Using Scrapeless browser: tutorials/external.md
74
- # - Migrating from AutoScraper: tutorials/migrating_from_autoscraper.md
75
- - Development:
76
- - API Reference:
77
- - Selector: api-reference/selector.md
78
- - Fetchers: api-reference/fetchers.md
79
- - MCP Server: api-reference/mcp-server.md
80
- - Custom Types: api-reference/custom-types.md
81
- - Writing your retrieval system: development/adaptive_storage_system.md
82
- - Using Scrapling's custom types: development/scrapling_custom_types.md
83
- - Support and Advertisement: donate.md
84
- - Contributing: 'https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md'
85
- - Changelog: 'https://github.com/D4Vinci/Scrapling/releases'
86
-
87
- markdown_extensions:
88
- - admonition
89
- - abbr
90
- # - mkautodoc
91
- - pymdownx.emoji
92
- - pymdownx.details
93
- - pymdownx.superfences
94
- - pymdownx.highlight:
95
- anchor_linenums: true
96
- - pymdownx.inlinehilite
97
- - pymdownx.snippets
98
- - pymdownx.tabbed:
99
- alternate_style: true
100
- - tables
101
- - codehilite:
102
- css_class: highlight
103
- - toc:
104
- permalink: true
105
-
106
- plugins:
107
- - search
108
- - privacy:
109
- links: false
110
- - optimize
111
- - social:
112
- cards_layout_options:
113
- background_color: "#1f1f1f"
114
- font_family: Roboto
115
- - mkdocstrings:
116
- handlers:
117
- python:
118
- paths: [scrapling]
119
- options:
120
- docstring_style: sphinx
121
- show_source: true
122
- show_root_heading: true
123
- show_if_no_docstring: true
124
- inherited_members: true
125
- members_order: source
126
- separate_signature: true
127
- unwrap_annotated: true
128
- filters:
129
- - '!^_'
130
- - "^__"
131
- merge_init_into_class: true
132
- docstring_section_style: spacy
133
- signature_crossrefs: true
134
- show_symbol_type_heading: true
135
- show_symbol_type_toc: true
136
- show_inheritance_diagram: true
137
- modernize_annotations: true
138
- extensions:
139
- - griffe_runtime_objects
140
- - griffe_sphinx
141
- - griffe_inherited_docstrings:
142
- merge: true
143
-
144
- extra:
145
- homepage: https://scrapling.readthedocs.io/en/latest/
146
- social:
147
- - icon: fontawesome/brands/github
148
- link: https://github.com/D4Vinci/Scrapling
149
- - icon: fontawesome/brands/python
150
- link: https://pypi.org/project/scrapling/
151
- - icon: fontawesome/brands/x-twitter
152
- link: https://x.com/Scrapling_dev
153
- - icon: fontawesome/brands/discord
154
- link: https://discord.gg/EMgGbDceNQ
155
- analytics:
156
- provider: google
157
- property: G-CS3DKLY73Z
158
- feedback:
159
- title: Was this page helpful?
160
- ratings:
161
- - icon: material/emoticon-happy-outline
162
- name: This page was helpful
163
- data: 1
164
- note: >-
165
- Thanks for your feedback!
166
- - icon: material/emoticon-sad-outline
167
- name: This page could be improved
168
- data: 0
169
- note: >-
170
- Thanks for your feedback!
171
- consent:
172
- title: Cookie consent
173
- description: >-
174
- We use cookies to recognize your repeated visits and preferences, as well
175
- as to measure the effectiveness of our documentation and whether users
176
- find what they're searching for. With your consent, you're helping us to
177
- make our documentation better.
178
-
179
- extra_css:
180
- - stylesheets/extra.css
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pyproject.toml CHANGED
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
5
  [project]
6
  name = "scrapling"
7
  # Static version instead of a dynamic version so we can get better layer caching while building docker, check the docker file to understand
8
- version = "0.3.14"
9
  description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!"
10
  readme = {file = "docs/README.md", content-type = "text/markdown"}
11
  license = {file = "LICENSE"}
@@ -28,6 +28,9 @@ keywords = [
28
  "web-crawler",
29
  "browser",
30
  "crawling",
 
 
 
31
  ]
32
  requires-python = ">=3.10"
33
  classifiers = [
@@ -46,6 +49,7 @@ classifiers = [
46
  "Topic :: Text Processing :: Markup :: HTML",
47
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
48
  "Topic :: Software Development :: Libraries",
 
49
  "Topic :: Software Development :: Libraries :: Python Modules",
50
  "Programming Language :: Python :: 3",
51
  "Programming Language :: Python :: 3 :: Only",
@@ -58,9 +62,11 @@ classifiers = [
58
  ]
59
  dependencies = [
60
  "lxml>=6.0.2",
61
- "cssselect>=1.3.0",
62
- "orjson>=3.11.5",
63
- "tldextract>=5.3.1",
 
 
64
  ]
65
 
66
  [project.optional-dependencies]
@@ -69,8 +75,9 @@ fetchers = [
69
  "curl_cffi>=0.14.0",
70
  "playwright==1.56.0",
71
  "patchright==1.56.0",
72
- "browserforge>=1.2.3",
73
  "msgspec>=0.20.0",
 
74
  ]
75
  ai = [
76
  "mcp>=1.24.0",
@@ -92,6 +99,8 @@ Changelog = "https://github.com/D4Vinci/Scrapling/releases"
92
  Documentation = "https://scrapling.readthedocs.io/en/latest/"
93
  Repository = "https://github.com/D4Vinci/Scrapling"
94
  "Bug Tracker" = "https://github.com/D4Vinci/Scrapling/issues"
 
 
95
 
96
  [project.scripts]
97
  scrapling = "scrapling.cli:main"
@@ -102,4 +111,16 @@ include-package-data = true
102
 
103
  [tool.setuptools.packages.find]
104
  where = ["."]
105
- include = ["scrapling*"]
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  [project]
6
  name = "scrapling"
7
  # Static version instead of a dynamic version so we can get better layer caching while building docker, check the docker file to understand
8
+ version = "0.4"
9
  description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!"
10
  readme = {file = "docs/README.md", content-type = "text/markdown"}
11
  license = {file = "LICENSE"}
 
28
  "web-crawler",
29
  "browser",
30
  "crawling",
31
+ "headless",
32
+ "scraper",
33
+ "chrome",
34
  ]
35
  requires-python = ">=3.10"
36
  classifiers = [
 
49
  "Topic :: Text Processing :: Markup :: HTML",
50
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
51
  "Topic :: Software Development :: Libraries",
52
+ "Topic :: Software Development :: Libraries :: Application Frameworks",
53
  "Topic :: Software Development :: Libraries :: Python Modules",
54
  "Programming Language :: Python :: 3",
55
  "Programming Language :: Python :: 3 :: Only",
 
62
  ]
63
  dependencies = [
64
  "lxml>=6.0.2",
65
+ "cssselect>=1.4.0",
66
+ "orjson>=3.11.7",
67
+ "tld>=0.13.1",
68
+ "w3lib>=2.4.0",
69
+ "typing_extensions",
70
  ]
71
 
72
  [project.optional-dependencies]
 
75
  "curl_cffi>=0.14.0",
76
  "playwright==1.56.0",
77
  "patchright==1.56.0",
78
+ "browserforge>=1.2.4",
79
  "msgspec>=0.20.0",
80
+ "anyio>=4.12.1"
81
  ]
82
  ai = [
83
  "mcp>=1.24.0",
 
99
  Documentation = "https://scrapling.readthedocs.io/en/latest/"
100
  Repository = "https://github.com/D4Vinci/Scrapling"
101
  "Bug Tracker" = "https://github.com/D4Vinci/Scrapling/issues"
102
+ "Discord" = "https://discord.gg/EMgGbDceNQ"
103
+ "Release Notes" = "https://github.com/D4Vinci/Scrapling/releases"
104
 
105
  [project.scripts]
106
  scrapling = "scrapling.cli:main"
 
111
 
112
  [tool.setuptools.packages.find]
113
  where = ["."]
114
+ include = ["scrapling*"]
115
+
116
+ [tool.mypy]
117
+ python_version = "3.10"
118
+ warn_unused_configs = true
119
+ ignore_missing_imports = true
120
+ check_untyped_defs = true
121
+
122
+ [tool.pyright]
123
+ pythonVersion = "3.10"
124
+ typeCheckingMode = "basic"
125
+ include = ["scrapling"]
126
+ ignore = ["tests", "benchmarks.py"]
scrapling/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
2
- __version__ = "0.3.14"
3
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
4
 
5
  from typing import Any, TYPE_CHECKING
 
1
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
2
+ __version__ = "0.4"
3
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
4
 
5
  from typing import Any, TYPE_CHECKING
scrapling/cli.py CHANGED
@@ -128,6 +128,9 @@ def install(force): # pragma: no cover
128
  ],
129
  "Playwright dependencies",
130
  )
 
 
 
131
  # if no errors raised by the above commands, then we add the below file
132
  __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch()
133
  else:
 
128
  ],
129
  "Playwright dependencies",
130
  )
131
+ from tld.utils import update_tld_names
132
+
133
+ update_tld_names(fail_silently=True)
134
  # if no errors raised by the above commands, then we add the below file
135
  __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch()
136
  else:
scrapling/core/_html_utils.py DELETED
@@ -1,342 +0,0 @@
1
- """
2
- This file is mostly copied from the submodule `w3lib.html` source code to stop downloading the whole library to use a small part of it.
3
- So the goal of doing this is to minimize the memory footprint and keep the library size relatively smaller.
4
- Repo source code: https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
5
- """
6
-
7
- from re import compile as _re_compile, IGNORECASE
8
-
9
- from scrapling.core._types import Iterable, Optional, Match, StrOrBytes
10
-
11
- _ent_re = _re_compile(
12
- r"&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)",
13
- IGNORECASE,
14
- )
15
- # maps HTML4 entity name to the Unicode code point
16
- name2codepoint = {
17
- "AElig": 0x00C6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
18
- "Aacute": 0x00C1, # latin capital letter A with acute, U+00C1 ISOlat1
19
- "Acirc": 0x00C2, # latin capital letter A with circumflex, U+00C2 ISOlat1
20
- "Agrave": 0x00C0, # latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1
21
- "Alpha": 0x0391, # greek capital letter alpha, U+0391
22
- "Aring": 0x00C5, # latin capital letter A with the ring above = latin capital letter A ring, U+00C5 ISOlat1
23
- "Atilde": 0x00C3, # latin capital letter A with tilde, U+00C3 ISOlat1
24
- "Auml": 0x00C4, # latin capital letter A with diaeresis, U+00C4 ISOlat1
25
- "Beta": 0x0392, # greek capital letter beta, U+0392
26
- "Ccedil": 0x00C7, # latin capital letter C with cedilla, U+00C7 ISOlat1
27
- "Chi": 0x03A7, # greek capital letter chi, U+03A7
28
- "Dagger": 0x2021, # double dagger, U+2021 ISOpub
29
- "Delta": 0x0394, # greek capital letter delta, U+0394 ISOgrk3
30
- "ETH": 0x00D0, # latin capital letter ETH, U+00D0 ISOlat1
31
- "Eacute": 0x00C9, # latin capital letter E with acute, U+00C9 ISOlat1
32
- "Ecirc": 0x00CA, # latin capital letter E with circumflex, U+00CA ISOlat1
33
- "Egrave": 0x00C8, # latin capital letter E with grave, U+00C8 ISOlat1
34
- "Epsilon": 0x0395, # greek capital letter epsilon, U+0395
35
- "Eta": 0x0397, # greek capital letter eta, U+0397
36
- "Euml": 0x00CB, # latin capital letter E with diaeresis, U+00CB ISOlat1
37
- "Gamma": 0x0393, # greek capital letter gamma, U+0393 ISOgrk3
38
- "Iacute": 0x00CD, # latin capital letter I with acute, U+00CD ISOlat1
39
- "Icirc": 0x00CE, # latin capital letter I with circumflex, U+00CE ISOlat1
40
- "Igrave": 0x00CC, # latin capital letter I with grave, U+00CC ISOlat1
41
- "Iota": 0x0399, # greek capital letter iota, U+0399
42
- "Iuml": 0x00CF, # latin capital letter I with diaeresis, U+00CF ISOlat1
43
- "Kappa": 0x039A, # greek capital letter kappa, U+039A
44
- "Lambda": 0x039B, # greek capital letter lambda, U+039B ISOgrk3
45
- "Mu": 0x039C, # greek capital letter mu, U+039C
46
- "Ntilde": 0x00D1, # latin capital letter N with tilde, U+00D1 ISOlat1
47
- "Nu": 0x039D, # greek capital letter nu, U+039D
48
- "OElig": 0x0152, # latin capital ligature OE, U+0152 ISOlat2
49
- "Oacute": 0x00D3, # latin capital letter O with acute, U+00D3 ISOlat1
50
- "Ocirc": 0x00D4, # latin capital letter O with circumflex, U+00D4 ISOlat1
51
- "Ograve": 0x00D2, # latin capital letter O with grave, U+00D2 ISOlat1
52
- "Omega": 0x03A9, # greek capital letter omega, U+03A9 ISOgrk3
53
- "Omicron": 0x039F, # greek capital letter omicron, U+039F
54
- "Oslash": 0x00D8, # latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1
55
- "Otilde": 0x00D5, # latin capital letter O with tilde, U+00D5 ISOlat1
56
- "Ouml": 0x00D6, # latin capital letter O with diaeresis, U+00D6 ISOlat1
57
- "Phi": 0x03A6, # greek capital letter phi, U+03A6 ISOgrk3
58
- "Pi": 0x03A0, # greek capital letter pi, U+03A0 ISOgrk3
59
- "Prime": 0x2033, # double prime = seconds = inches, U+2033 ISOtech
60
- "Psi": 0x03A8, # greek capital letter psi, U+03A8 ISOgrk3
61
- "Rho": 0x03A1, # greek capital letter rho, U+03A1
62
- "Scaron": 0x0160, # latin capital letter S with caron, U+0160 ISOlat2
63
- "Sigma": 0x03A3, # greek capital letter sigma, U+03A3 ISOgrk3
64
- "THORN": 0x00DE, # latin capital letter THORN, U+00DE ISOlat1
65
- "Tau": 0x03A4, # greek capital letter tau, U+03A4
66
- "Theta": 0x0398, # greek capital letter theta, U+0398 ISOgrk3
67
- "Uacute": 0x00DA, # latin capital letter U with acute, U+00DA ISOlat1
68
- "Ucirc": 0x00DB, # latin capital letter U with circumflex, U+00DB ISOlat1
69
- "Ugrave": 0x00D9, # latin capital letter U with grave, U+00D9 ISOlat1
70
- "Upsilon": 0x03A5, # greek capital letter upsilon, U+03A5 ISOgrk3
71
- "Uuml": 0x00DC, # latin capital letter U with diaeresis, U+00DC ISOlat1
72
- "Xi": 0x039E, # greek capital letter xi, U+039E ISOgrk3
73
- "Yacute": 0x00DD, # latin capital letter Y with acute, U+00DD ISOlat1
74
- "Yuml": 0x0178, # latin capital letter Y with diaeresis, U+0178 ISOlat2
75
- "Zeta": 0x0396, # greek capital letter zeta, U+0396
76
- "aacute": 0x00E1, # latin small letter a with acute, U+00E1 ISOlat1
77
- "acirc": 0x00E2, # latin small letter a with circumflex, U+00E2 ISOlat1
78
- "acute": 0x00B4, # acute accent = spacing acute, U+00B4 ISOdia
79
- "aelig": 0x00E6, # latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
80
- "agrave": 0x00E0, # latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
81
- "alefsym": 0x2135, # alef symbol = first transfinite cardinal, U+2135 NEW
82
- "alpha": 0x03B1, # greek small letter alpha, U+03B1 ISOgrk3
83
- "amp": 0x0026, # ampersand, U+0026 ISOnum
84
- "and": 0x2227, # logical and = wedge, U+2227 ISOtech
85
- "ang": 0x2220, # angle, U+2220 ISOamso
86
- "aring": 0x00E5, # latin small letter a with the ring above = latin small letter a ring, U+00E5 ISOlat1
87
- "asymp": 0x2248, # almost equal to = asymptotic to, U+2248 ISOamsr
88
- "atilde": 0x00E3, # latin small letter a with tilde, U+00E3 ISOlat1
89
- "auml": 0x00E4, # latin small letter a with diaeresis, U+00E4 ISOlat1
90
- "bdquo": 0x201E, # double low-9 quotation mark, U+201E NEW
91
- "beta": 0x03B2, # greek small letter beta, U+03B2 ISOgrk3
92
- "brvbar": 0x00A6, # broken bar = broken vertical bar, U+00A6 ISOnum
93
- "bull": 0x2022, # bullet = black small circle, U+2022 ISOpub
94
- "cap": 0x2229, # intersection = cap, U+2229 ISOtech
95
- "ccedil": 0x00E7, # latin small letter c with cedilla, U+00E7 ISOlat1
96
- "cedil": 0x00B8, # cedilla = spacing cedilla, U+00B8 ISOdia
97
- "cent": 0x00A2, # cent sign, U+00A2 ISOnum
98
- "chi": 0x03C7, # greek small letter chi, U+03C7 ISOgrk3
99
- "circ": 0x02C6, # modifier letter circumflex accent, U+02C6 ISOpub
100
- "clubs": 0x2663, # black club suit = shamrock, U+2663 ISOpub
101
- "cong": 0x2245, # approximately equal to, U+2245 ISOtech
102
- "copy": 0x00A9, # copyright sign, U+00A9 ISOnum
103
- "crarr": 0x21B5, # downwards arrow with corner leftwards = carriage return, U+21B5 NEW
104
- "cup": 0x222A, # union = cup, U+222A ISOtech
105
- "curren": 0x00A4, # currency sign, U+00A4 ISOnum
106
- "dArr": 0x21D3, # downwards double arrow, U+21D3 ISOamsa
107
- "dagger": 0x2020, # dagger, U+2020 ISOpub
108
- "darr": 0x2193, # downwards arrow, U+2193 ISOnum
109
- "deg": 0x00B0, # degree sign, U+00B0 ISOnum
110
- "delta": 0x03B4, # greek small letter delta, U+03B4 ISOgrk3
111
- "diams": 0x2666, # black diamond suit, U+2666 ISOpub
112
- "divide": 0x00F7, # division sign, U+00F7 ISOnum
113
- "eacute": 0x00E9, # latin small letter e with acute, U+00E9 ISOlat1
114
- "ecirc": 0x00EA, # latin small letter e with circumflex, U+00EA ISOlat1
115
- "egrave": 0x00E8, # latin small letter e with grave, U+00E8 ISOlat1
116
- "empty": 0x2205, # empty set = null set = diameter, U+2205 ISOamso
117
- "emsp": 0x2003, # em space, U+2003 ISOpub
118
- "ensp": 0x2002, # en space, U+2002 ISOpub
119
- "epsilon": 0x03B5, # greek small letter epsilon, U+03B5 ISOgrk3
120
- "equiv": 0x2261, # identical to, U+2261 ISOtech
121
- "eta": 0x03B7, # greek small letter eta, U+03B7 ISOgrk3
122
- "eth": 0x00F0, # latin small letter eth, U+00F0 ISOlat1
123
- "euml": 0x00EB, # latin small letter e with diaeresis, U+00EB ISOlat1
124
- "euro": 0x20AC, # euro sign, U+20AC NEW
125
- "exist": 0x2203, # there exists, U+2203 ISOtech
126
- "fnof": 0x0192, # latin small f with hook = function = florin, U+0192 ISOtech
127
- "forall": 0x2200, # for all, U+2200 ISOtech
128
- "frac12": 0x00BD, # vulgar fraction one half = fraction one half, U+00BD ISOnum
129
- "frac14": 0x00BC, # vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
130
- "frac34": 0x00BE, # vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
131
- "frasl": 0x2044, # fraction slash, U+2044 NEW
132
- "gamma": 0x03B3, # greek small letter gamma, U+03B3 ISOgrk3
133
- "ge": 0x2265, # greater-than or equal to, U+2265 ISOtech
134
- "gt": 0x003E, # greater-than sign, U+003E ISOnum
135
- "hArr": 0x21D4, # left right double arrow, U+21D4 ISOamsa
136
- "harr": 0x2194, # left right arrow, U+2194 ISOamsa
137
- "hearts": 0x2665, # black heart suit = valentine, U+2665 ISOpub
138
- "hellip": 0x2026, # horizontal ellipsis = three dot leader, U+2026 ISOpub
139
- "iacute": 0x00ED, # latin small letter i with acute, U+00ED ISOlat1
140
- "icirc": 0x00EE, # latin small letter i with circumflex, U+00EE ISOlat1
141
- "iexcl": 0x00A1, # inverted exclamation mark, U+00A1 ISOnum
142
- "igrave": 0x00EC, # latin small letter i with grave, U+00EC ISOlat1
143
- "image": 0x2111, # blackletter capital I = imaginary part, U+2111 ISOamso
144
- "infin": 0x221E, # infinity, U+221E ISOtech
145
- "int": 0x222B, # integral, U+222B ISOtech
146
- "iota": 0x03B9, # greek small letter iota, U+03B9 ISOgrk3
147
- "iquest": 0x00BF, # inverted question mark = turned question mark, U+00BF ISOnum
148
- "isin": 0x2208, # element of, U+2208 ISOtech
149
- "iuml": 0x00EF, # latin small letter i with diaeresis, U+00EF ISOlat1
150
- "kappa": 0x03BA, # greek small letter kappa, U+03BA ISOgrk3
151
- "lArr": 0x21D0, # leftwards double arrow, U+21D0 ISOtech
152
- "lambda": 0x03BB, # greek small letter lambda, U+03BB ISOgrk3
153
- "lang": 0x2329, # left-pointing angle bracket = bra, U+2329 ISOtech
154
- "laquo": 0x00AB, # left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
155
- "larr": 0x2190, # leftwards arrow, U+2190 ISOnum
156
- "lceil": 0x2308, # left ceiling = apl upstile, U+2308 ISOamsc
157
- "ldquo": 0x201C, # left double quotation mark, U+201C ISOnum
158
- "le": 0x2264, # less-than or equal to, U+2264 ISOtech
159
- "lfloor": 0x230A, # left floor = apl downstile, U+230A ISOamsc
160
- "lowast": 0x2217, # asterisk operator, U+2217 ISOtech
161
- "loz": 0x25CA, # lozenge, U+25CA ISOpub
162
- "lrm": 0x200E, # left-to-right mark, U+200E NEW RFC 2070
163
- "lsaquo": 0x2039, # single left-pointing angle quotation mark, U+2039 ISO proposed
164
- "lsquo": 0x2018, # left single quotation mark, U+2018 ISOnum
165
- "lt": 0x003C, # less-than sign, U+003C ISOnum
166
- "macr": 0x00AF, # macron = spacing macron = overline = APL overbar, U+00AF ISOdia
167
- "mdash": 0x2014, # em dash, U+2014 ISOpub
168
- "micro": 0x00B5, # micro sign, U+00B5 ISOnum
169
- "middot": 0x00B7, # middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
170
- "minus": 0x2212, # minus sign, U+2212 ISOtech
171
- "mu": 0x03BC, # greek small letter mu, U+03BC ISOgrk3
172
- "nabla": 0x2207, # nabla = backward difference, U+2207 ISOtech
173
- "nbsp": 0x00A0, # no-break space = non-breaking space, U+00A0 ISOnum
174
- "ndash": 0x2013, # en dash, U+2013 ISOpub
175
- "ne": 0x2260, # not equal to, U+2260 ISOtech
176
- "ni": 0x220B, # contains as member, U+220B ISOtech
177
- "not": 0x00AC, # not sign, U+00AC ISOnum
178
- "notin": 0x2209, # not an element of, U+2209 ISOtech
179
- "nsub": 0x2284, # not a subset of, U+2284 ISOamsn
180
- "ntilde": 0x00F1, # latin small letter n with tilde, U+00F1 ISOlat1
181
- "nu": 0x03BD, # greek small letter nu, U+03BD ISOgrk3
182
- "oacute": 0x00F3, # latin small letter o with acute, U+00F3 ISOlat1
183
- "ocirc": 0x00F4, # latin small letter o with circumflex, U+00F4 ISOlat1
184
- "oelig": 0x0153, # latin small ligature oe, U+0153 ISOlat2
185
- "ograve": 0x00F2, # latin small letter o with grave, U+00F2 ISOlat1
186
- "oline": 0x203E, # overline = spacing overscore, U+203E NEW
187
- "omega": 0x03C9, # greek small letter omega, U+03C9 ISOgrk3
188
- "omicron": 0x03BF, # greek small letter omicron, U+03BF NEW
189
- "oplus": 0x2295, # circled plus = direct sum, U+2295 ISOamsb
190
- "or": 0x2228, # logical or = vee, U+2228 ISOtech
191
- "ordf": 0x00AA, # feminine ordinal indicator, U+00AA ISOnum
192
- "ordm": 0x00BA, # masculine ordinal indicator, U+00BA ISOnum
193
- "oslash": 0x00F8, # latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1
194
- "otilde": 0x00F5, # latin small letter o with tilde, U+00F5 ISOlat1
195
- "otimes": 0x2297, # circled times = vector product, U+2297 ISOamsb
196
- "ouml": 0x00F6, # latin small letter o with diaeresis, U+00F6 ISOlat1
197
- "para": 0x00B6, # pilcrow sign = paragraph sign, U+00B6 ISOnum
198
- "part": 0x2202, # partial differential, U+2202 ISOtech
199
- "permil": 0x2030, # per mille sign, U+2030 ISOtech
200
- "perp": 0x22A5, # up tack = orthogonal to = perpendicular, U+22A5 ISOtech
201
- "phi": 0x03C6, # greek small letter phi, U+03C6 ISOgrk3
202
- "pi": 0x03C0, # greek small letter pi, U+03C0 ISOgrk3
203
- "piv": 0x03D6, # greek pi symbol, U+03D6 ISOgrk3
204
- "plusmn": 0x00B1, # plus-minus sign = plus-or-minus sign, U+00B1 ISOnum
205
- "pound": 0x00A3, # pound sign, U+00A3 ISOnum
206
- "prime": 0x2032, # prime = minutes = feet, U+2032 ISOtech
207
- "prod": 0x220F, # n-ary product = product sign, U+220F ISOamsb
208
- "prop": 0x221D, # proportional to, U+221D ISOtech
209
- "psi": 0x03C8, # greek small letter psi, U+03C8 ISOgrk3
210
- "quot": 0x0022, # quotation mark = APL quote, U+0022 ISOnum
211
- "rArr": 0x21D2, # rightwards double arrow, U+21D2 ISOtech
212
- "radic": 0x221A, # square root = radical sign, U+221A ISOtech
213
- "rang": 0x232A, # right-pointing angle bracket = ket, U+232A ISOtech
214
- "raquo": 0x00BB, # right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
215
- "rarr": 0x2192, # rightwards arrow, U+2192 ISOnum
216
- "rceil": 0x2309, # right ceiling, U+2309 ISOamsc
217
- "rdquo": 0x201D, # right double quotation mark, U+201D ISOnum
218
- "real": 0x211C, # blackletter capital R = real part symbol, U+211C ISOamso
219
- "reg": 0x00AE, # registered sign = registered trade mark sign, U+00AE ISOnum
220
- "rfloor": 0x230B, # right floor, U+230B ISOamsc
221
- "rho": 0x03C1, # greek small letter rho, U+03C1 ISOgrk3
222
- "rlm": 0x200F, # right-to-left mark, U+200F NEW RFC 2070
223
- "rsaquo": 0x203A, # single right-pointing angle quotation mark, U+203A ISO proposed
224
- "rsquo": 0x2019, # right single quotation mark, U+2019 ISOnum
225
- "sbquo": 0x201A, # single low-9 quotation mark, U+201A NEW
226
- "scaron": 0x0161, # latin small letter s with caron, U+0161 ISOlat2
227
- "sdot": 0x22C5, # dot operator, U+22C5 ISOamsb
228
- "sect": 0x00A7, # section sign, U+00A7 ISOnum
229
- "shy": 0x00AD, # soft hyphen = discretionary hyphen, U+00AD ISOnum
230
- "sigma": 0x03C3, # greek small letter sigma, U+03C3 ISOgrk3
231
- "sigmaf": 0x03C2, # greek small letter final sigma, U+03C2 ISOgrk3
232
- "sim": 0x223C, # tilde operator = varies with = similar to, U+223C ISOtech
233
- "spades": 0x2660, # black spade suit, U+2660 ISOpub
234
- "sub": 0x2282, # subset of, U+2282 ISOtech
235
- "sube": 0x2286, # subset of or equal to, U+2286 ISOtech
236
- "sum": 0x2211, # n-ary summation, U+2211 ISOamsb
237
- "sup": 0x2283, # superset of, U+2283 ISOtech
238
- "sup1": 0x00B9, # superscript one = superscript digit one, U+00B9 ISOnum
239
- "sup2": 0x00B2, # superscript two = superscript digit two = squared, U+00B2 ISOnum
240
- "sup3": 0x00B3, # superscript three = superscript digit three = cubed, U+00B3 ISOnum
241
- "supe": 0x2287, # superset of or equal to, U+2287 ISOtech
242
- "szlig": 0x00DF, # latin small letter sharp s = ess-zed, U+00DF ISOlat1
243
- "tau": 0x03C4, # greek small letter tau, U+03C4 ISOgrk3
244
- "there4": 0x2234, # therefore, U+2234 ISOtech
245
- "theta": 0x03B8, # greek small letter theta, U+03B8 ISOgrk3
246
- "thetasym": 0x03D1, # greek small letter theta symbol, U+03D1 NEW
247
- "thinsp": 0x2009, # thin space, U+2009 ISOpub
248
- "thorn": 0x00FE, # latin small letter thorn with, U+00FE ISOlat1
249
- "tilde": 0x02DC, # small tilde, U+02DC ISOdia
250
- "times": 0x00D7, # multiplication sign, U+00D7 ISOnum
251
- "trade": 0x2122, # trade mark sign, U+2122 ISOnum
252
- "uArr": 0x21D1, # upwards double arrow, U+21D1 ISOamsa
253
- "uacute": 0x00FA, # latin small letter u with acute, U+00FA ISOlat1
254
- "uarr": 0x2191, # upwards arrow, U+2191 ISOnum
255
- "ucirc": 0x00FB, # latin small letter u with circumflex, U+00FB ISOlat1
256
- "ugrave": 0x00F9, # latin small letter u with grave, U+00F9 ISOlat1
257
- "uml": 0x00A8, # diaeresis = spacing diaeresis, U+00A8 ISOdia
258
- "upsih": 0x03D2, # greek upsilon with hook symbol, U+03D2 NEW
259
- "upsilon": 0x03C5, # greek small letter upsilon, U+03C5 ISOgrk3
260
- "uuml": 0x00FC, # latin small letter u with diaeresis, U+00FC ISOlat1
261
- "weierp": 0x2118, # script capital P = power set = Weierstrass p, U+2118 ISOamso
262
- "xi": 0x03BE, # greek small letter xi, U+03BE ISOgrk3
263
- "yacute": 0x00FD, # latin small letter y with acute, U+00FD ISOlat1
264
- "yen": 0x00A5, # yen sign = yuan sign, U+00A5 ISOnum
265
- "yuml": 0x00FF, # latin small letter y with diaeresis, U+00FF ISOlat1
266
- "zeta": 0x03B6, # greek small letter zeta, U+03B6 ISOgrk3
267
- "zwj": 0x200D, # zero width joiner, U+200D NEW RFC 2070
268
- "zwnj": 0x200C, # zero width non-joiner, U+200C NEW RFC 2070
269
- }
270
-
271
-
272
- def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict") -> str:
273
- """Return the Unicode representation of a bytes object `text`. If `text`
274
- is already a Unicode object, return it as-is."""
275
- if isinstance(text, str):
276
- return text
277
- if not isinstance(text, (bytes, str)):
278
- raise TypeError(f"to_unicode must receive bytes or str, got {type(text).__name__}")
279
- if encoding is None:
280
- encoding = "utf-8"
281
- return text.decode(encoding, errors)
282
-
283
-
284
- def _replace_entities(
285
- text: StrOrBytes,
286
- keep: Iterable[str] = (),
287
- remove_illegal: bool = True,
288
- encoding: str = "utf-8",
289
- ) -> str:
290
- """Remove entities from the given `text` by converting them to their
291
- corresponding Unicode character.
292
-
293
- `text` can be a Unicode string or a byte string encoded in the given
294
- `encoding` (which defaults to 'utf-8').
295
-
296
- If `keep` is passed (with a list of entity names), those entities will
297
- be kept (they won't be removed).
298
-
299
- It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
300
- and named entities (such as ``&nbsp;`` or ``&gt;``).
301
-
302
- If `remove_illegal` is ``True``, entities that can't be converted are removed.
303
- If `remove_illegal` is ``False``, entities that can't be converted are kept "as
304
- is". For more information, see the tests.
305
-
306
- Always returns a Unicode string (with the entities removed).
307
-
308
- >>> _replace_entities(b'Price: &pound;100')
309
- 'Price: \\xa3100'
310
- >>> print(_replace_entities(b'Price: &pound;100'))
311
- Price: £100
312
- >>>
313
-
314
- """
315
-
316
- def convert_entity(m: Match[str]) -> str:
317
- groups = m.groupdict()
318
- number = None
319
- if groups.get("dec"):
320
- number = int(groups["dec"], 10)
321
- elif groups.get("hex"):
322
- number = int(groups["hex"], 16)
323
- elif groups.get("named"):
324
- entity_name = groups["named"]
325
- if entity_name.lower() in keep:
326
- return m.group(0)
327
- number = name2codepoint.get(entity_name) or name2codepoint.get(entity_name.lower())
328
- if number is not None:
329
- # Browsers typically
330
- # interpret numeric character references in the 80-9F range as representing the characters mapped
331
- # to bytes 80-9F in the Windows-1252 encoding. For more info
332
- # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
333
- try:
334
- if 0x80 <= number <= 0x9F:
335
- return bytes((number,)).decode("cp1252")
336
- return chr(number)
337
- except (ValueError, OverflowError): # pragma: no cover
338
- pass
339
-
340
- return "" if remove_illegal and groups.get("semicolon") else m.group(0)
341
-
342
- return _ent_re.sub(convert_entity, to_unicode(text, encoding))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapling/core/_types.py CHANGED
@@ -12,12 +12,14 @@ from typing import (
12
  Callable,
13
  Dict,
14
  Generator,
 
15
  Generic,
16
  Iterable,
17
  List,
18
  Set,
19
  Literal,
20
  Optional,
 
21
  Pattern,
22
  Sequence,
23
  Tuple,
@@ -30,34 +32,16 @@ from typing import (
30
  Coroutine,
31
  SupportsIndex,
32
  )
 
33
 
 
 
34
  SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
35
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
36
  PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
37
  extraction_types = Literal["text", "html", "markdown"]
38
  StrOrBytes = Union[str, bytes]
39
 
40
- if TYPE_CHECKING: # pragma: no cover
41
- from typing_extensions import Unpack
42
- else: # pragma: no cover
43
-
44
- class _Unpack:
45
- @staticmethod
46
- def __getitem__(*args, **kwargs):
47
- pass
48
-
49
- Unpack = _Unpack()
50
-
51
-
52
- try:
53
- # Python 3.11+
54
- from typing import Self # novermin
55
- except ImportError: # pragma: no cover
56
- try:
57
- from typing_extensions import Self # Backport
58
- except ImportError:
59
- Self = object
60
-
61
 
62
  # Copied from `playwright._impl._api_structures.SetCookieParam`
63
  class SetCookieParam(TypedDict, total=False):
 
12
  Callable,
13
  Dict,
14
  Generator,
15
+ AsyncGenerator,
16
  Generic,
17
  Iterable,
18
  List,
19
  Set,
20
  Literal,
21
  Optional,
22
+ Iterator,
23
  Pattern,
24
  Sequence,
25
  Tuple,
 
32
  Coroutine,
33
  SupportsIndex,
34
  )
35
+ from typing_extensions import Self, Unpack
36
 
37
+ # Proxy can be a string URL or a dict (Playwright format: {"server": "...", "username": "...", "password": "..."})
38
+ ProxyType = Union[str, Dict[str, str]]
39
  SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
40
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
41
  PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
42
  extraction_types = Literal["text", "html", "markdown"]
43
  StrOrBytes = Union[str, bytes]
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  # Copied from `playwright._impl._api_structures.SetCookieParam`
47
  class SetCookieParam(TypedDict, total=False):
scrapling/core/ai.py CHANGED
@@ -213,7 +213,7 @@ class ScraplingMCPServer:
213
  extraction_type: extraction_types = "markdown",
214
  css_selector: Optional[str] = None,
215
  main_content_only: bool = True,
216
- headless: bool = False,
217
  google_search: bool = True,
218
  real_chrome: bool = False,
219
  wait: int | float = 0,
@@ -295,7 +295,7 @@ class ScraplingMCPServer:
295
  extraction_type: extraction_types = "markdown",
296
  css_selector: Optional[str] = None,
297
  main_content_only: bool = True,
298
- headless: bool = False,
299
  google_search: bool = True,
300
  real_chrome: bool = False,
301
  wait: int | float = 0,
 
213
  extraction_type: extraction_types = "markdown",
214
  css_selector: Optional[str] = None,
215
  main_content_only: bool = True,
216
+ headless: bool = True, # noqa: F821
217
  google_search: bool = True,
218
  real_chrome: bool = False,
219
  wait: int | float = 0,
 
295
  extraction_type: extraction_types = "markdown",
296
  css_selector: Optional[str] = None,
297
  main_content_only: bool = True,
298
+ headless: bool = True, # noqa: F821
299
  google_search: bool = True,
300
  real_chrome: bool = False,
301
  wait: int | float = 0,
scrapling/core/custom_types.py CHANGED
@@ -3,6 +3,7 @@ from types import MappingProxyType
3
  from re import compile as re_compile, UNICODE, IGNORECASE
4
 
5
  from orjson import dumps, loads
 
6
 
7
  from scrapling.core._types import (
8
  Any,
@@ -19,7 +20,6 @@ from scrapling.core._types import (
19
  SupportsIndex,
20
  )
21
  from scrapling.core.utils import _is_iterable, flatten, __CONSECUTIVE_SPACES_REGEX__
22
- from scrapling.core._html_utils import _replace_entities
23
 
24
  # Define type variable for AttributeHandler value type
25
  _TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
@@ -35,9 +35,7 @@ class TextHandler(str):
35
  lst = super().__getitem__(key)
36
  return TextHandler(lst)
37
 
38
- def split(
39
- self, sep: str | None = None, maxsplit: SupportsIndex = -1
40
- ) -> Union[List, "TextHandlers"]: # pragma: no cover
41
  return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
42
 
43
  def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
@@ -61,7 +59,7 @@ class TextHandler(str):
61
  def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover
62
  return TextHandler(super().expandtabs(tabsize))
63
 
64
- def format(self, *args: object, **kwargs: str) -> Union[str, "TextHandler"]: # pragma: no cover
65
  return TextHandler(super().format(*args, **kwargs))
66
 
67
  def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
@@ -291,7 +289,7 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
291
 
292
  __slots__ = ("_data",)
293
 
294
- def __init__(self, mapping=None, **kwargs):
295
  mapping = (
296
  {key: TextHandler(value) if isinstance(value, str) else value for key, value in mapping.items()}
297
  if mapping is not None
@@ -324,8 +322,8 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
324
  yield AttributesHandler({key: value})
325
 
326
  @property
327
- def json_string(self):
328
- """Convert current attributes to JSON string if the attributes are JSON serializable otherwise throws error"""
329
  return dumps(dict(self._data))
330
 
331
  def __getitem__(self, key: str) -> _TextHandlerType:
 
3
  from re import compile as re_compile, UNICODE, IGNORECASE
4
 
5
  from orjson import dumps, loads
6
+ from w3lib.html import replace_entities as _replace_entities
7
 
8
  from scrapling.core._types import (
9
  Any,
 
20
  SupportsIndex,
21
  )
22
  from scrapling.core.utils import _is_iterable, flatten, __CONSECUTIVE_SPACES_REGEX__
 
23
 
24
  # Define type variable for AttributeHandler value type
25
  _TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
 
35
  lst = super().__getitem__(key)
36
  return TextHandler(lst)
37
 
38
+ def split(self, sep: str | None = None, maxsplit: SupportsIndex = -1) -> list[Any]: # pragma: no cover
 
 
39
  return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
40
 
41
  def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
 
59
  def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover
60
  return TextHandler(super().expandtabs(tabsize))
61
 
62
+ def format(self, *args: object, **kwargs: object) -> Union[str, "TextHandler"]: # pragma: no cover
63
  return TextHandler(super().format(*args, **kwargs))
64
 
65
  def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
 
289
 
290
  __slots__ = ("_data",)
291
 
292
+ def __init__(self, mapping: Any = None, **kwargs: Any) -> None:
293
  mapping = (
294
  {key: TextHandler(value) if isinstance(value, str) else value for key, value in mapping.items()}
295
  if mapping is not None
 
322
  yield AttributesHandler({key: value})
323
 
324
  @property
325
+ def json_string(self) -> bytes:
326
+ """Convert current attributes to JSON bytes if the attributes are JSON serializable otherwise throws error"""
327
  return dumps(dict(self._data))
328
 
329
  def __getitem__(self, key: str) -> _TextHandlerType:
scrapling/core/mixins.py CHANGED
@@ -1,7 +1,4 @@
1
- from scrapling.core._types import TYPE_CHECKING
2
-
3
- if TYPE_CHECKING:
4
- from scrapling.parser import Selector
5
 
6
 
7
  class SelectorsGeneration:
@@ -11,10 +8,17 @@ class SelectorsGeneration:
11
  Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
12
  """
13
 
14
- def _general_selection(self: "Selector", selection: str = "css", full_path: bool = False) -> str: # type: ignore[name-defined]
 
 
 
 
15
  """Generate a selector for the current element.
16
  :return: A string of the generated selector.
17
  """
 
 
 
18
  selectorPath = []
19
  target = self
20
  css = selection.lower() == "css"
@@ -33,7 +37,7 @@ class SelectorsGeneration:
33
  # if classes and css:
34
  # part += f".{'.'.join(classes)}"
35
  # else:
36
- counter = {}
37
  for child in target.parent.children:
38
  counter.setdefault(child.tag, 0)
39
  counter[child.tag] += 1
@@ -53,28 +57,28 @@ class SelectorsGeneration:
53
  return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
54
 
55
  @property
56
- def generate_css_selector(self: "Selector") -> str: # type: ignore[name-defined]
57
  """Generate a CSS selector for the current element
58
  :return: A string of the generated selector.
59
  """
60
  return self._general_selection()
61
 
62
  @property
63
- def generate_full_css_selector(self: "Selector") -> str: # type: ignore[name-defined]
64
  """Generate a complete CSS selector for the current element
65
  :return: A string of the generated selector.
66
  """
67
  return self._general_selection(full_path=True)
68
 
69
  @property
70
- def generate_xpath_selector(self: "Selector") -> str: # type: ignore[name-defined]
71
  """Generate an XPath selector for the current element
72
  :return: A string of the generated selector.
73
  """
74
  return self._general_selection("xpath")
75
 
76
  @property
77
- def generate_full_xpath_selector(self: "Selector") -> str: # type: ignore[name-defined]
78
  """Generate a complete XPath selector for the current element
79
  :return: A string of the generated selector.
80
  """
 
1
+ from scrapling.core._types import Any, Dict
 
 
 
2
 
3
 
4
  class SelectorsGeneration:
 
8
  Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
9
  """
10
 
11
+ # Note: This is a mixin class meant to be used with Selector.
12
+ # The methods access Selector attributes (._root, .parent, .attrib, .tag, etc.)
13
+ # through self, which will be a Selector instance at runtime.
14
+
15
+ def _general_selection(self: Any, selection: str = "css", full_path: bool = False) -> str:
16
  """Generate a selector for the current element.
17
  :return: A string of the generated selector.
18
  """
19
+ if self._is_text_node(self._root):
20
+ return ""
21
+
22
  selectorPath = []
23
  target = self
24
  css = selection.lower() == "css"
 
37
  # if classes and css:
38
  # part += f".{'.'.join(classes)}"
39
  # else:
40
+ counter: Dict[str, int] = {}
41
  for child in target.parent.children:
42
  counter.setdefault(child.tag, 0)
43
  counter[child.tag] += 1
 
57
  return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
58
 
59
  @property
60
+ def generate_css_selector(self: Any) -> str:
61
  """Generate a CSS selector for the current element
62
  :return: A string of the generated selector.
63
  """
64
  return self._general_selection()
65
 
66
  @property
67
+ def generate_full_css_selector(self: Any) -> str:
68
  """Generate a complete CSS selector for the current element
69
  :return: A string of the generated selector.
70
  """
71
  return self._general_selection(full_path=True)
72
 
73
  @property
74
+ def generate_xpath_selector(self: Any) -> str:
75
  """Generate an XPath selector for the current element
76
  :return: A string of the generated selector.
77
  """
78
  return self._general_selection("xpath")
79
 
80
  @property
81
+ def generate_full_xpath_selector(self: Any) -> str:
82
  """Generate a complete XPath selector for the current element
83
  :return: A string of the generated selector.
84
  """
scrapling/core/shell.py CHANGED
@@ -30,6 +30,7 @@ from scrapling.core.custom_types import TextHandler
30
  from scrapling.engines.toolbelt.custom import Response
31
  from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
32
  from scrapling.core._types import (
 
33
  Dict,
34
  Any,
35
  cast,
@@ -82,7 +83,7 @@ class NoExitArgumentParser(ArgumentParser): # pragma: no cover
82
  class CurlParser:
83
  """Builds the argument parser for relevant curl flags from DevTools."""
84
 
85
- def __init__(self):
86
  from scrapling.fetchers import Fetcher as __Fetcher
87
 
88
  self.__fetcher = __Fetcher
@@ -467,19 +468,21 @@ Type 'exit' or press Ctrl+D to exit.
467
 
468
  return result
469
 
470
- def create_wrapper(self, func, get_signature=True, signature_name=None):
 
 
471
  """Create a wrapper that preserves function signature but updates page"""
472
 
473
  @wraps(func)
474
- def wrapper(*args, **kwargs):
475
  result = func(*args, **kwargs)
476
  return self.update_page(result)
477
 
478
  if get_signature:
479
  # Explicitly preserve and unpack signature for IPython introspection and autocompletion
480
- wrapper.__signature__ = _unpack_signature(func, signature_name) # pyright: ignore
481
  else:
482
- wrapper.__signature__ = signature(func) # pyright: ignore
483
 
484
  return wrapper
485
 
@@ -583,7 +586,7 @@ class Convertor:
583
  raise ValueError(f"Unknown extraction type: {extraction_type}")
584
  else:
585
  if main_content_only:
586
- page = cast(Selector, page.css_first("body")) or page
587
 
588
  pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
589
  for page in pages:
@@ -601,7 +604,7 @@ class Convertor:
601
  " ",
602
  ):
603
  # Remove consecutive white-spaces
604
- txt_content = re_sub(f"[{s}]+", s, txt_content)
605
  yield txt_content
606
  yield ""
607
 
 
30
  from scrapling.engines.toolbelt.custom import Response
31
  from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
32
  from scrapling.core._types import (
33
+ Callable,
34
  Dict,
35
  Any,
36
  cast,
 
83
  class CurlParser:
84
  """Builds the argument parser for relevant curl flags from DevTools."""
85
 
86
+ def __init__(self) -> None:
87
  from scrapling.fetchers import Fetcher as __Fetcher
88
 
89
  self.__fetcher = __Fetcher
 
468
 
469
  return result
470
 
471
+ def create_wrapper(
472
+ self, func: Callable, get_signature: bool = True, signature_name: Optional[str] = None
473
+ ) -> Callable:
474
  """Create a wrapper that preserves function signature but updates page"""
475
 
476
  @wraps(func)
477
+ def wrapper(*args: Any, **kwargs: Any) -> Any:
478
  result = func(*args, **kwargs)
479
  return self.update_page(result)
480
 
481
  if get_signature:
482
  # Explicitly preserve and unpack signature for IPython introspection and autocompletion
483
+ setattr(wrapper, "__signature__", _unpack_signature(func, signature_name))
484
  else:
485
+ setattr(wrapper, "__signature__", signature(func))
486
 
487
  return wrapper
488
 
 
586
  raise ValueError(f"Unknown extraction type: {extraction_type}")
587
  else:
588
  if main_content_only:
589
+ page = cast(Selector, page.css("body").first) or page
590
 
591
  pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
592
  for page in pages:
 
604
  " ",
605
  ):
606
  # Remove consecutive white-spaces
607
+ txt_content = TextHandler(re_sub(f"[{s}]+", s, txt_content))
608
  yield txt_content
609
  yield ""
610