v0.4
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .bandit.yml +3 -1
- .github/ISSUE_TEMPLATE/04-docs_issue.yml +40 -0
- .github/workflows/code-quality.yml +35 -1
- .gitignore +6 -0
- .readthedocs.yaml +11 -15
- docs/README.md → README.md +149 -85
- benchmarks.py +1 -1
- docs/README_AR.md +210 -138
- docs/README_CN.md +210 -138
- docs/README_DE.md +206 -134
- docs/README_ES.md +205 -133
- docs/README_JP.md +217 -145
- docs/README_RU.md +219 -147
- docs/ai/mcp-server.md +2 -2
- docs/api-reference/mcp-server.md +1 -1
- docs/api-reference/proxy-rotation.md +18 -0
- docs/api-reference/response.md +18 -0
- docs/api-reference/spiders.md +42 -0
- docs/benchmarks.md +14 -13
- docs/cli/extract-commands.md +8 -9
- docs/cli/interactive-shell.md +9 -9
- docs/development/adaptive_storage_system.md +3 -1
- docs/development/scrapling_custom_types.md +2 -0
- docs/fetching/choosing.md +14 -6
- docs/fetching/dynamic.md +60 -22
- docs/fetching/static.md +65 -31
- docs/fetching/stealthy.md +32 -27
- docs/index.md +71 -24
- docs/overview.md +27 -13
- docs/parsing/adaptive.md +14 -13
- docs/parsing/main_classes.md +75 -37
- docs/parsing/selection.md +50 -50
- docs/requirements.txt +4 -4
- docs/spiders/advanced.md +313 -0
- docs/spiders/architecture.md +98 -0
- docs/spiders/getting-started.md +159 -0
- docs/spiders/proxy-blocking.md +244 -0
- docs/spiders/requests-responses.md +202 -0
- docs/spiders/sessions.md +218 -0
- docs/tutorials/migrating_from_beautifulsoup.md +11 -9
- mkdocs.yml +0 -180
- pyproject.toml +27 -6
- scrapling/__init__.py +1 -1
- scrapling/cli.py +3 -0
- scrapling/core/_html_utils.py +0 -342
- scrapling/core/_types.py +5 -21
- scrapling/core/ai.py +2 -2
- scrapling/core/custom_types.py +6 -8
- scrapling/core/mixins.py +14 -10
- scrapling/core/shell.py +10 -7
.bandit.yml
CHANGED
|
@@ -6,4 +6,6 @@ skips:
|
|
| 6 |
- B404 # Using subprocess library
|
| 7 |
- B602 # subprocess call with shell=True identified
|
| 8 |
- B110 # Try, Except, Pass detected.
|
| 9 |
-
- B104 # Possible binding to all interfaces.
|
|
|
|
|
|
|
|
|
| 6 |
- B404 # Using subprocess library
|
| 7 |
- B602 # subprocess call with shell=True identified
|
| 8 |
- B110 # Try, Except, Pass detected.
|
| 9 |
+
- B104 # Possible binding to all interfaces.
|
| 10 |
+
- B301 # Pickle and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue.
|
| 11 |
+
- B108 # Probable insecure usage of temp file/directory.
|
.github/ISSUE_TEMPLATE/04-docs_issue.yml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Documentation issue
|
| 2 |
+
description: Report incorrect, unclear, or missing documentation.
|
| 3 |
+
labels: [documentation]
|
| 4 |
+
body:
|
| 5 |
+
- type: checkboxes
|
| 6 |
+
attributes:
|
| 7 |
+
label: Have you searched if there an existing issue for this?
|
| 8 |
+
description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/documentation).
|
| 9 |
+
options:
|
| 10 |
+
- label: I have searched the existing issues
|
| 11 |
+
required: true
|
| 12 |
+
|
| 13 |
+
- type: input
|
| 14 |
+
attributes:
|
| 15 |
+
label: "Page URL"
|
| 16 |
+
description: "Link to the documentation page with the issue."
|
| 17 |
+
placeholder: "https://scrapling.readthedocs.io/en/latest/..."
|
| 18 |
+
validations:
|
| 19 |
+
required: true
|
| 20 |
+
|
| 21 |
+
- type: dropdown
|
| 22 |
+
attributes:
|
| 23 |
+
label: "Type of issue"
|
| 24 |
+
options:
|
| 25 |
+
- Incorrect information
|
| 26 |
+
- Unclear or confusing
|
| 27 |
+
- Missing information
|
| 28 |
+
- Typo or formatting
|
| 29 |
+
- Broken link
|
| 30 |
+
- Other
|
| 31 |
+
default: 0
|
| 32 |
+
validations:
|
| 33 |
+
required: true
|
| 34 |
+
|
| 35 |
+
- type: textarea
|
| 36 |
+
attributes:
|
| 37 |
+
label: "Description"
|
| 38 |
+
description: "Describe what's wrong and what you expected to find."
|
| 39 |
+
validations:
|
| 40 |
+
required: true
|
.github/workflows/code-quality.yml
CHANGED
|
@@ -50,7 +50,9 @@ jobs:
|
|
| 50 |
- name: Install dependencies
|
| 51 |
run: |
|
| 52 |
python -m pip install --upgrade pip
|
| 53 |
-
pip install bandit[toml] ruff vermin
|
|
|
|
|
|
|
| 54 |
|
| 55 |
- name: Run Bandit (Security Linter)
|
| 56 |
id: bandit
|
|
@@ -85,6 +87,22 @@ jobs:
|
|
| 85 |
vermin -t=3.10- --violations --eval-annotations --no-tips scrapling/
|
| 86 |
echo "::endgroup::"
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
- name: Check results and create summary
|
| 89 |
if: always()
|
| 90 |
run: |
|
|
@@ -126,6 +144,22 @@ jobs:
|
|
| 126 |
all_passed=false
|
| 127 |
fi
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
echo "" >> $GITHUB_STEP_SUMMARY
|
| 130 |
|
| 131 |
if [ "$all_passed" == "true" ]; then
|
|
|
|
| 50 |
- name: Install dependencies
|
| 51 |
run: |
|
| 52 |
python -m pip install --upgrade pip
|
| 53 |
+
pip install bandit[toml] ruff vermin mypy pyright
|
| 54 |
+
pip install -e ".[all]"
|
| 55 |
+
pip install lxml-stubs
|
| 56 |
|
| 57 |
- name: Run Bandit (Security Linter)
|
| 58 |
id: bandit
|
|
|
|
| 87 |
vermin -t=3.10- --violations --eval-annotations --no-tips scrapling/
|
| 88 |
echo "::endgroup::"
|
| 89 |
|
| 90 |
+
- name: Run Mypy (Static Type Checker)
|
| 91 |
+
id: mypy
|
| 92 |
+
continue-on-error: true
|
| 93 |
+
run: |
|
| 94 |
+
echo "::group::Mypy - Static Type Checker"
|
| 95 |
+
mypy scrapling/
|
| 96 |
+
echo "::endgroup::"
|
| 97 |
+
|
| 98 |
+
- name: Run Pyright (Static Type Checker)
|
| 99 |
+
id: pyright
|
| 100 |
+
continue-on-error: true
|
| 101 |
+
run: |
|
| 102 |
+
echo "::group::Pyright - Static Type Checker"
|
| 103 |
+
pyright scrapling/
|
| 104 |
+
echo "::endgroup::"
|
| 105 |
+
|
| 106 |
- name: Check results and create summary
|
| 107 |
if: always()
|
| 108 |
run: |
|
|
|
|
| 144 |
all_passed=false
|
| 145 |
fi
|
| 146 |
|
| 147 |
+
# Check Mypy
|
| 148 |
+
if [ "${{ steps.mypy.outcome }}" == "success" ]; then
|
| 149 |
+
echo "✅ **Mypy (Type Checker)**: Passed" >> $GITHUB_STEP_SUMMARY
|
| 150 |
+
else
|
| 151 |
+
echo "❌ **Mypy (Type Checker)**: Failed" >> $GITHUB_STEP_SUMMARY
|
| 152 |
+
all_passed=false
|
| 153 |
+
fi
|
| 154 |
+
|
| 155 |
+
# Check Pyright
|
| 156 |
+
if [ "${{ steps.pyright.outcome }}" == "success" ]; then
|
| 157 |
+
echo "✅ **Pyright (Type Checker)**: Passed" >> $GITHUB_STEP_SUMMARY
|
| 158 |
+
else
|
| 159 |
+
echo "❌ **Pyright (Type Checker)**: Failed" >> $GITHUB_STEP_SUMMARY
|
| 160 |
+
all_passed=false
|
| 161 |
+
fi
|
| 162 |
+
|
| 163 |
echo "" >> $GITHUB_STEP_SUMMARY
|
| 164 |
|
| 165 |
if [ "$all_passed" == "true" ]; then
|
.gitignore
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# cached files
|
| 2 |
__pycache__/
|
| 3 |
*.py[cod]
|
|
|
|
| 1 |
+
site/*
|
| 2 |
+
|
| 3 |
+
# AI related files
|
| 4 |
+
.claude/*
|
| 5 |
+
CLAUDE.md
|
| 6 |
+
|
| 7 |
# cached files
|
| 8 |
__pycache__/
|
| 9 |
*.py[cod]
|
.readthedocs.yaml
CHANGED
|
@@ -1,25 +1,21 @@
|
|
| 1 |
-
#
|
| 2 |
-
#
|
| 3 |
|
| 4 |
-
# Required
|
| 5 |
version: 2
|
| 6 |
|
| 7 |
-
# Set the OS, Python version, and other tools you might need
|
| 8 |
build:
|
| 9 |
os: ubuntu-24.04
|
| 10 |
apt_packages:
|
| 11 |
- pngquant
|
| 12 |
tools:
|
| 13 |
python: "3.13"
|
| 14 |
-
|
| 15 |
-
# Build documentation with Mkdocs
|
| 16 |
-
mkdocs:
|
| 17 |
-
configuration: mkdocs.yml
|
| 18 |
-
|
| 19 |
-
# Optionally, but recommended,
|
| 20 |
-
# declare the Python requirements required to build your documentation
|
| 21 |
-
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
|
| 22 |
-
python:
|
| 23 |
install:
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# See https://docs.readthedocs.com/platform/stable/intro/zensical.html for details
|
| 2 |
+
# Example: https://github.com/readthedocs/test-builds/tree/zensical
|
| 3 |
|
|
|
|
| 4 |
version: 2
|
| 5 |
|
|
|
|
| 6 |
build:
|
| 7 |
os: ubuntu-24.04
|
| 8 |
apt_packages:
|
| 9 |
- pngquant
|
| 10 |
tools:
|
| 11 |
python: "3.13"
|
| 12 |
+
jobs:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
install:
|
| 14 |
+
- pip install -r docs/requirements.txt
|
| 15 |
+
- pip install ".[all]"
|
| 16 |
+
build:
|
| 17 |
+
html:
|
| 18 |
+
- zensical build
|
| 19 |
+
post_build:
|
| 20 |
+
- mkdir -p $READTHEDOCS_OUTPUT/html/
|
| 21 |
+
- cp --recursive site/* $READTHEDOCS_OUTPUT/html/
|
docs/README.md → README.md
RENAMED
|
@@ -1,13 +1,17 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
<p align=center>
|
| 5 |
-
<br>
|
| 6 |
-
<a href="https://scrapling.readthedocs.io/en/latest/" target="_blank"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png" style="width: 50%; height: 100%;" alt="main poster"/></a>
|
| 7 |
-
<br>
|
| 8 |
-
<i><code>Easy, effortless Web Scraping as it should be!</code></i>
|
| 9 |
-
</p>
|
| 10 |
<p align="center">
|
|
|
|
|
|
|
| 11 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 12 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
| 13 |
<a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
|
|
@@ -27,44 +31,45 @@ Automated translations: [العربيه](https://github.com/D4Vinci/Scrapling/bl
|
|
| 27 |
</p>
|
| 28 |
|
| 29 |
<p align="center">
|
| 30 |
-
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
|
| 31 |
-
|
| 32 |
-
</a>
|
| 33 |
-
|
| 34 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 35 |
-
|
| 36 |
-
</a>
|
| 37 |
-
|
| 38 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 39 |
-
CLI
|
| 40 |
-
</a>
|
| 41 |
-
·
|
| 42 |
-
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
|
| 43 |
-
MCP mode
|
| 44 |
-
</a>
|
| 45 |
-
·
|
| 46 |
-
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
|
| 47 |
-
Migrating from Beautifulsoup
|
| 48 |
-
</a>
|
| 49 |
</p>
|
| 50 |
|
| 51 |
-
|
| 52 |
|
| 53 |
-
|
| 54 |
|
| 55 |
-
|
| 56 |
|
| 57 |
```python
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
200
|
| 64 |
-
>> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
|
| 65 |
-
>> # Later, if the website structure changes, pass `adaptive=True`
|
| 66 |
-
>> products = page.css('.product', adaptive=True) # and Scrapling still finds them!
|
| 67 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
# Sponsors
|
| 70 |
|
|
@@ -90,16 +95,27 @@ Built for the modern Web, Scrapling features **its own rapid parsing engine** an
|
|
| 90 |
|
| 91 |
## Key Features
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
### Advanced Websites Fetching with Session Support
|
| 94 |
-
- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use
|
| 95 |
- **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium and Google's Chrome.
|
| 96 |
- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can easily bypass all types of Cloudflare's Turnstile/Interstitial with automation.
|
| 97 |
- **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
|
|
|
|
|
|
|
| 98 |
- **Async Support**: Complete async support across all fetchers and dedicated async session classes.
|
| 99 |
|
| 100 |
### Adaptive Scraping & AI Integration
|
| 101 |
- 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
|
| 102 |
-
- 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
|
| 103 |
- 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
|
| 104 |
- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 105 |
|
|
@@ -111,51 +127,107 @@ Built for the modern Web, Scrapling features **its own rapid parsing engine** an
|
|
| 111 |
|
| 112 |
### Developer/Web Scraper Friendly Experience
|
| 113 |
- 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping scripts development, like converting curl requests to Scrapling requests and viewing requests results in your browser.
|
| 114 |
-
- 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single code!
|
| 115 |
- 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
|
| 116 |
- 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.
|
| 117 |
- 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
|
| 118 |
- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
|
| 119 |
-
- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion.
|
| 120 |
- 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
|
| 121 |
|
| 122 |
## Getting Started
|
| 123 |
|
|
|
|
|
|
|
| 124 |
### Basic Usage
|
|
|
|
| 125 |
```python
|
| 126 |
-
from scrapling.fetchers import Fetcher,
|
| 127 |
-
from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
|
| 128 |
|
| 129 |
-
# HTTP requests with session support
|
| 130 |
with FetcherSession(impersonate='chrome') as session: # Use latest version of Chrome's TLS fingerprint
|
| 131 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 132 |
-
quotes = page.css('.quote .text::text')
|
| 133 |
|
| 134 |
# Or use one-off requests
|
| 135 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 136 |
-
quotes = page.css('.quote .text::text')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
-
|
| 139 |
-
with StealthySession(headless=True, solve_cloudflare=True) as session:
|
| 140 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 141 |
-
data = page.css('#padded_content a')
|
| 142 |
|
| 143 |
# Or use one-off request style, it opens the browser for this request, then closes it after finishing
|
| 144 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 145 |
-
data = page.css('#padded_content a')
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
| 149 |
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 150 |
-
data = page.xpath('//span[@class="text"]/text()') # XPath selector if you prefer it
|
| 151 |
|
| 152 |
# Or use one-off request style, it opens the browser for this request, then closes it after finishing
|
| 153 |
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 154 |
-
data = page.css('.quote .text::text')
|
| 155 |
```
|
| 156 |
|
| 157 |
-
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
### Advanced Parsing & Navigation
|
| 161 |
```python
|
|
@@ -176,10 +248,9 @@ quotes = page.find_all(class_='quote') # and so on...
|
|
| 176 |
quotes = page.find_by_text('quote', tag='div')
|
| 177 |
|
| 178 |
# Advanced navigation
|
| 179 |
-
|
| 180 |
-
quote_text =
|
| 181 |
-
|
| 182 |
-
quote_text = page.css_first('.quote .text').text # Using `css_first` is faster than `css` if you want the first element
|
| 183 |
author = first_quote.next_sibling.css('.author::text')
|
| 184 |
parent_container = first_quote.parent
|
| 185 |
|
|
@@ -220,7 +291,7 @@ async with AsyncStealthySession(max_pages=2) as session:
|
|
| 220 |
|
| 221 |
## CLI & Interactive Shell
|
| 222 |
|
| 223 |
-
Scrapling
|
| 224 |
|
| 225 |
[](https://asciinema.org/a/736339)
|
| 226 |
|
|
@@ -237,34 +308,34 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
|
|
| 237 |
```
|
| 238 |
|
| 239 |
> [!NOTE]
|
| 240 |
-
> There are many additional features, but we want to keep this page concise,
|
| 241 |
|
| 242 |
## Performance Benchmarks
|
| 243 |
|
| 244 |
-
Scrapling isn't just powerful—it's also blazing fast
|
| 245 |
|
| 246 |
### Text Extraction Speed Test (5000 nested elements)
|
| 247 |
|
| 248 |
| # | Library | Time (ms) | vs Scrapling |
|
| 249 |
|---|:-----------------:|:---------:|:------------:|
|
| 250 |
-
| 1 | Scrapling |
|
| 251 |
-
| 2 | Parsel/Scrapy | 2.
|
| 252 |
-
| 3 | Raw Lxml |
|
| 253 |
-
| 4 | PyQuery |
|
| 254 |
-
| 5 | Selectolax |
|
| 255 |
-
| 6 |
|
| 256 |
-
| 7 |
|
| 257 |
-
| 8 | BS4 with html5lib |
|
| 258 |
|
| 259 |
|
| 260 |
### Element Similarity & Text Search Performance
|
| 261 |
|
| 262 |
Scrapling's adaptive element finding capabilities significantly outperform alternatives:
|
| 263 |
|
| 264 |
-
|
|
| 265 |
|-------------|:---------:|:------------:|
|
| 266 |
-
| Scrapling | 2.
|
| 267 |
-
| AutoScraper |
|
| 268 |
|
| 269 |
|
| 270 |
> All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
|
|
@@ -277,7 +348,7 @@ Scrapling requires Python 3.10 or higher:
|
|
| 277 |
pip install scrapling
|
| 278 |
```
|
| 279 |
|
| 280 |
-
|
| 281 |
|
| 282 |
### Optional Dependencies
|
| 283 |
|
|
@@ -334,12 +405,5 @@ This work is licensed under the BSD-3-Clause License.
|
|
| 334 |
This project includes code adapted from:
|
| 335 |
- Parsel (BSD License)—Used for [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule
|
| 336 |
|
| 337 |
-
## Thanks and References
|
| 338 |
-
|
| 339 |
-
- [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
|
| 340 |
-
- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
|
| 341 |
-
- [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
|
| 342 |
-
- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
|
| 343 |
-
|
| 344 |
---
|
| 345 |
<div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
|
|
|
|
| 1 |
+
<h1 align="center">
|
| 2 |
+
<a href="https://scrapling.readthedocs.io">
|
| 3 |
+
<picture>
|
| 4 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 5 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 6 |
+
</picture>
|
| 7 |
+
</a>
|
| 8 |
+
<br>
|
| 9 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 10 |
+
</h1>
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
<p align="center">
|
| 13 |
+
<a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_AR.md">العربيه</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_ES.md">Español</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_DE.md">Deutsch</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_CN.md">简体中文</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_JP.md">日本語</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_RU.md">Русский</a>
|
| 14 |
+
<br/>
|
| 15 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 16 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
| 17 |
<a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
|
|
|
|
| 31 |
</p>
|
| 32 |
|
| 33 |
<p align="center">
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Selection methods</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Choosing a fetcher</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCP mode</strong></a>
|
| 41 |
+
·
|
| 42 |
+
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>Migrating from Beautifulsoup</strong></a>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
</p>
|
| 44 |
|
| 45 |
+
Scrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl.
|
| 46 |
|
| 47 |
+
Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation — all in a few lines of Python. One library, zero compromises.
|
| 48 |
|
| 49 |
+
Blazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
|
| 50 |
|
| 51 |
```python
|
| 52 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 53 |
+
StealthyFetcher.adaptive = True
|
| 54 |
+
page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Fetch website under the radar!
|
| 55 |
+
products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
|
| 56 |
+
products = page.css('.product', adaptive=True) # Later, if the website structure changes, pass `adaptive=True` to find them!
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
```
|
| 58 |
+
Or scale up to full crawls
|
| 59 |
+
```python
|
| 60 |
+
from scrapling.spiders import Spider, Response
|
| 61 |
+
|
| 62 |
+
class MySpider(Spider):
|
| 63 |
+
name = "demo"
|
| 64 |
+
start_urls = ["https://example.com/"]
|
| 65 |
+
|
| 66 |
+
async def parse(self, response: Response):
|
| 67 |
+
for item in response.css('.product'):
|
| 68 |
+
yield {"title": item.css('h2::text').get()}
|
| 69 |
+
|
| 70 |
+
MySpider().start()
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
|
| 74 |
# Sponsors
|
| 75 |
|
|
|
|
| 95 |
|
| 96 |
## Key Features
|
| 97 |
|
| 98 |
+
### Spiders — A Full Crawling Framework
|
| 99 |
+
- 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects.
|
| 100 |
+
- ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays.
|
| 101 |
+
- 🔄 **Multi-Session Support**: Unified interface for HTTP requests, and stealthy headless browsers in a single spider — route requests to different sessions by ID.
|
| 102 |
+
- 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off.
|
| 103 |
+
- 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats — ideal for UI, pipelines, and long-running crawls.
|
| 104 |
+
- 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic.
|
| 105 |
+
- 📦 **Built-in Export**: Export results through hooks and your own pipeline or the built-in JSON/JSONL with `result.items.to_json()` / `result.items.to_jsonl()` respectively.
|
| 106 |
+
|
| 107 |
### Advanced Websites Fetching with Session Support
|
| 108 |
+
- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP/3.
|
| 109 |
- **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium and Google's Chrome.
|
| 110 |
- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can easily bypass all types of Cloudflare's Turnstile/Interstitial with automation.
|
| 111 |
- **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
|
| 112 |
+
- **Proxy Rotation**: Built-in `ProxyRotator` with cyclic or custom rotation strategies across all session types, plus per-request proxy overrides.
|
| 113 |
+
- **Domain Blocking**: Block requests to specific domains (and their subdomains) in browser-based fetchers.
|
| 114 |
- **Async Support**: Complete async support across all fetchers and dedicated async session classes.
|
| 115 |
|
| 116 |
### Adaptive Scraping & AI Integration
|
| 117 |
- 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
|
| 118 |
+
- 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
|
| 119 |
- 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
|
| 120 |
- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 121 |
|
|
|
|
| 127 |
|
| 128 |
### Developer/Web Scraper Friendly Experience
|
| 129 |
- 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping scripts development, like converting curl requests to Scrapling requests and viewing requests results in your browser.
|
| 130 |
+
- 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code!
|
| 131 |
- 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
|
| 132 |
- 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.
|
| 133 |
- 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
|
| 134 |
- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
|
| 135 |
+
- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. The entire codebase is automatically scanned with **PyRight** and **MyPy** with each change.
|
| 136 |
- 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
|
| 137 |
|
| 138 |
## Getting Started
|
| 139 |
|
| 140 |
+
Let's give you a quick glimpse of what Scrapling can do without deep diving.
|
| 141 |
+
|
| 142 |
### Basic Usage
|
| 143 |
+
HTTP requests with session support
|
| 144 |
```python
|
| 145 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
|
|
|
| 146 |
|
|
|
|
| 147 |
with FetcherSession(impersonate='chrome') as session: # Use latest version of Chrome's TLS fingerprint
|
| 148 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 149 |
+
quotes = page.css('.quote .text::text').getall()
|
| 150 |
|
| 151 |
# Or use one-off requests
|
| 152 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 153 |
+
quotes = page.css('.quote .text::text').getall()
|
| 154 |
+
```
|
| 155 |
+
Advanced stealth mode
|
| 156 |
+
```python
|
| 157 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 158 |
|
| 159 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # Keep the browser open until you finish
|
|
|
|
| 160 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 161 |
+
data = page.css('#padded_content a').getall()
|
| 162 |
|
| 163 |
# Or use one-off request style, it opens the browser for this request, then closes it after finishing
|
| 164 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 165 |
+
data = page.css('#padded_content a').getall()
|
| 166 |
+
```
|
| 167 |
+
Full browser automation
|
| 168 |
+
```python
|
| 169 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 170 |
+
|
| 171 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Keep the browser open until you finish
|
| 172 |
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 173 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # XPath selector if you prefer it
|
| 174 |
|
| 175 |
# Or use one-off request style, it opens the browser for this request, then closes it after finishing
|
| 176 |
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 177 |
+
data = page.css('.quote .text::text').getall()
|
| 178 |
```
|
| 179 |
|
| 180 |
+
### Spiders
|
| 181 |
+
Build full crawlers with concurrent requests, multiple session types, and pause/resume:
|
| 182 |
+
```python
|
| 183 |
+
from scrapling.spiders import Spider, Request, Response
|
| 184 |
+
|
| 185 |
+
class QuotesSpider(Spider):
|
| 186 |
+
name = "quotes"
|
| 187 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 188 |
+
concurrent_requests = 10
|
| 189 |
+
|
| 190 |
+
async def parse(self, response: Response):
|
| 191 |
+
for quote in response.css('.quote'):
|
| 192 |
+
yield {
|
| 193 |
+
"text": quote.css('.text::text').get(),
|
| 194 |
+
"author": quote.css('.author::text').get(),
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
next_page = response.css('.next a')
|
| 198 |
+
if next_page:
|
| 199 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 200 |
+
|
| 201 |
+
result = QuotesSpider().start()
|
| 202 |
+
print(f"Scraped {len(result.items)} quotes")
|
| 203 |
+
result.items.to_json("quotes.json")
|
| 204 |
+
```
|
| 205 |
+
Use multiple session types in a single spider:
|
| 206 |
+
```python
|
| 207 |
+
from scrapling.spiders import Spider, Request, Response
|
| 208 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 209 |
+
|
| 210 |
+
class MultiSessionSpider(Spider):
|
| 211 |
+
name = "multi"
|
| 212 |
+
start_urls = ["https://example.com/"]
|
| 213 |
+
|
| 214 |
+
def configure_sessions(self, manager):
|
| 215 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 216 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 217 |
+
|
| 218 |
+
async def parse(self, response: Response):
|
| 219 |
+
for link in response.css('a::attr(href)').getall():
|
| 220 |
+
# Route protected pages through the stealth session
|
| 221 |
+
if "protected" in link:
|
| 222 |
+
yield Request(link, sid="stealth")
|
| 223 |
+
else:
|
| 224 |
+
yield Request(link, sid="fast", callback=self.parse) # explicit callback
|
| 225 |
+
```
|
| 226 |
+
Pause and resume long crawls with checkpoints by running the spider like this:
|
| 227 |
+
```python
|
| 228 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
| 229 |
+
```
|
| 230 |
+
Press Ctrl+C to pause gracefully — progress is saved automatically. Later, when you start the spider again, pass the same `crawldir`, and it will resume from where it stopped.
|
| 231 |
|
| 232 |
### Advanced Parsing & Navigation
|
| 233 |
```python
|
|
|
|
| 248 |
quotes = page.find_by_text('quote', tag='div')
|
| 249 |
|
| 250 |
# Advanced navigation
|
| 251 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 252 |
+
quote_text = page.css('.quote').css('.text::text').getall() # Chained selectors
|
| 253 |
+
first_quote = page.css('.quote')[0]
|
|
|
|
| 254 |
author = first_quote.next_sibling.css('.author::text')
|
| 255 |
parent_container = first_quote.parent
|
| 256 |
|
|
|
|
| 291 |
|
| 292 |
## CLI & Interactive Shell
|
| 293 |
|
| 294 |
+
Scrapling includes a powerful command-line interface:
|
| 295 |
|
| 296 |
[](https://asciinema.org/a/736339)
|
| 297 |
|
|
|
|
| 308 |
```
|
| 309 |
|
| 310 |
> [!NOTE]
|
| 311 |
+
> There are many additional features, but we want to keep this page concise, including the MCP server and the interactive Web Scraping Shell. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
|
| 312 |
|
| 313 |
## Performance Benchmarks
|
| 314 |
|
| 315 |
+
Scrapling isn't just powerful—it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries.
|
| 316 |
|
| 317 |
### Text Extraction Speed Test (5000 nested elements)
|
| 318 |
|
| 319 |
| # | Library | Time (ms) | vs Scrapling |
|
| 320 |
|---|:-----------------:|:---------:|:------------:|
|
| 321 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 322 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01x |
|
| 323 |
+
| 3 | Raw Lxml | 2.54 | 1.257x |
|
| 324 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 325 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 326 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 327 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 328 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 329 |
|
| 330 |
|
| 331 |
### Element Similarity & Text Search Performance
|
| 332 |
|
| 333 |
Scrapling's adaptive element finding capabilities significantly outperform alternatives:
|
| 334 |
|
| 335 |
+
| Library | Time (ms) | vs Scrapling |
|
| 336 |
|-------------|:---------:|:------------:|
|
| 337 |
+
| Scrapling | 2.39 | 1.0x |
|
| 338 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 339 |
|
| 340 |
|
| 341 |
> All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
|
|
|
|
| 348 |
pip install scrapling
|
| 349 |
```
|
| 350 |
|
| 351 |
+
This installation only includes the parser engine and its dependencies, without any fetchers or command-line dependencies.
|
| 352 |
|
| 353 |
### Optional Dependencies
|
| 354 |
|
|
|
|
| 405 |
This project includes code adapted from:
|
| 406 |
- Parsel (BSD License)—Used for [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule
|
| 407 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
---
|
| 409 |
<div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
|
benchmarks.py
CHANGED
|
@@ -75,7 +75,7 @@ def test_scrapling():
|
|
| 75 |
# No need to do `.extract()` like parsel to extract text
|
| 76 |
# Also, this is faster than `[t.text for t in Selector(large_html, adaptive=False).css('.item')]`
|
| 77 |
# for obvious reasons, of course.
|
| 78 |
-
return ScraplingSelector(large_html, adaptive=False).css(".item::text")
|
| 79 |
|
| 80 |
|
| 81 |
@benchmark
|
|
|
|
| 75 |
# No need to do `.extract()` like parsel to extract text
|
| 76 |
# Also, this is faster than `[t.text for t in Selector(large_html, adaptive=False).css('.item')]`
|
| 77 |
# for obvious reasons, of course.
|
| 78 |
+
return ScraplingSelector(large_html, adaptive=False).css(".item::text").getall()
|
| 79 |
|
| 80 |
|
| 81 |
@benchmark
|
docs/README_AR.md
CHANGED
|
@@ -1,9 +1,14 @@
|
|
| 1 |
-
<
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
</
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
<p align="center">
|
| 8 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 9 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
@@ -24,46 +29,47 @@
|
|
| 24 |
</p>
|
| 25 |
|
| 26 |
<p align="center">
|
| 27 |
-
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
|
| 28 |
-
|
| 29 |
-
</a>
|
| 30 |
-
|
| 31 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 32 |
-
|
| 33 |
-
</a>
|
| 34 |
-
|
| 35 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 36 |
-
واجهة سطر الأوامر
|
| 37 |
-
</a>
|
| 38 |
-
·
|
| 39 |
-
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
|
| 40 |
-
وضع MCP
|
| 41 |
-
</a>
|
| 42 |
-
·
|
| 43 |
-
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
|
| 44 |
-
الانتقال من Beautifulsoup
|
| 45 |
-
</a>
|
| 46 |
</p>
|
| 47 |
|
| 48 |
-
|
| 49 |
|
| 50 |
-
|
| 51 |
|
| 52 |
-
|
| 53 |
|
| 54 |
```python
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
# احصل على
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
200
|
| 61 |
-
>> products = page.css('.product', auto_save=True) # استخرج البيانات التي تنجو من تغييرات تصميم الموقع!
|
| 62 |
-
>> # لاحقاً، إذا تغيرت بنية الموقع، مرر `adaptive=True`
|
| 63 |
-
>> products = page.css('.product', adaptive=True) # و Scrapling لا يزال يجدها!
|
| 64 |
```
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
<!-- sponsors -->
|
| 69 |
|
|
@@ -87,138 +93,211 @@ Scrapling ليست مجرد مكتبة أخرى لاستخراج بيانات ا
|
|
| 87 |
|
| 88 |
## الميزات الرئيسية
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
### جلب متقدم للمواقع مع دعم الجلسات
|
| 91 |
-
- **طلبات HTTP**: طلبات HTTP سريعة وخفية مع فئة `Fetcher`. يمكنها تقليد بصمة TLS للمتصفح والرؤوس واستخدام
|
| 92 |
- **التحميل الديناميكي**: جلب المواقع الديناميكية مع أتمتة كاملة للمتصفح من خلال فئة `DynamicFetcher` التي تدعم Chromium من Playwright و Google Chrome.
|
| 93 |
-
- **تجاوز مكافحة الروبوتات**: قدرات تخفي متقدمة مع `StealthyFetcher` وانتحال
|
| 94 |
- **إدارة الجلسات**: دعم الجلسات المستمرة مع فئات `FetcherSession` و`StealthySession` و`DynamicSession` لإدارة ملفات تعريف الارتباط والحالة عبر الطلبات.
|
|
|
|
|
|
|
| 95 |
- **دعم Async**: دعم async كامل عبر جميع الجوالب وفئات الجلسات async المخصصة.
|
| 96 |
|
| 97 |
### الاستخراج التكيفي والتكامل مع الذكاء الاصطناعي
|
| 98 |
- 🔄 **تتبع العناصر الذكي**: إعادة تحديد موقع العناصر بعد تغييرات الموقع باستخدام خوارزميات التشابه الذكية.
|
| 99 |
- 🎯 **الاختيار المرن الذكي**: محددات CSS، محددات XPath، البحث القائم على الفلاتر، البحث النصي، البحث بالتعبيرات العادية والمزيد.
|
| 100 |
- 🔍 **البحث عن عناصر مشابهة**: تحديد العناصر المشابهة للعناصر الموجودة تلقائياً.
|
| 101 |
-
- 🤖 **خادم MCP للاستخدام مع الذكاء الاصطناعي**: خادم MCP مدمج ل
|
| 102 |
|
| 103 |
-
### بنية عالية الأداء ومختبرة
|
| 104 |
-
- 🚀 **سريع كالبرق**: أداء محسّن يتفوق على معظم مكتبات
|
| 105 |
- 🔋 **فعال في استخدام الذاكرة**: هياكل بيانات محسّنة وتحميل كسول لأقل استخدام للذاكرة.
|
| 106 |
- ⚡ **تسلسل JSON سريع**: أسرع 10 مرات من المكتبة القياسية.
|
| 107 |
-
- 🏗️ **مُختبر
|
| 108 |
|
| 109 |
### تجربة صديقة للمطورين/مستخرجي الويب
|
| 110 |
-
- 🎯 **
|
| 111 |
- 🚀 **استخدمه مباشرة من الطرفية**: اختيارياً، يمكنك استخدام Scrapling لاستخراج عنوان URL دون كتابة سطر واحد من الكود!
|
| 112 |
-
- 🛠️ **واجهة
|
| 113 |
-
- 🧬 **معالجة نصوص محسّنة**: تعبيرات عادية مدمجة وطرق تنظيف وعمليات
|
| 114 |
-
- 📝 **إنشاء محدد تلقائي**: إنشاء محددات CSS/XPath قوية لأي عنصر.
|
| 115 |
-
- 🔌 **واجهة
|
| 116 |
-
- 📘 **تغطية كاملة للأنواع**: تلميحات نوع كاملة لدعم IDE ممتاز وإكمال الكود.
|
| 117 |
- 🔋 **صورة Docker جاهزة**: مع كل إصدار، يتم بناء ودفع صورة Docker تحتوي على جميع المتصفحات تلقائياً.
|
| 118 |
|
| 119 |
## البدء
|
| 120 |
|
|
|
|
|
|
|
| 121 |
### الاستخدام الأساسي
|
|
|
|
| 122 |
```python
|
| 123 |
-
from scrapling.fetchers import Fetcher,
|
| 124 |
-
from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
|
| 125 |
|
| 126 |
-
# طلبات HTTP مع دعم الجلسات
|
| 127 |
with FetcherSession(impersonate='chrome') as session: # استخدم أحدث إصدار من بصمة TLS لـ Chrome
|
| 128 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 129 |
-
quotes = page.css('.quote .text::text')
|
| 130 |
|
| 131 |
# أو استخدم طلبات لمرة واحدة
|
| 132 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 133 |
-
quotes = page.css('.quote .text::text')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
|
| 136 |
-
with StealthySession(headless=True, solve_cloudflare=True) as session:
|
| 137 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 138 |
-
data = page.css('#padded_content a')
|
| 139 |
|
| 140 |
# أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء
|
| 141 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 142 |
-
data = page.css('#padded_content a')
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
| 152 |
```
|
| 153 |
|
| 154 |
-
###
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
```python
|
| 156 |
-
|
| 157 |
-
page.css('a::text') # استخراج النص
|
| 158 |
-
page.css('a::attr(href)') # استخراج السمات
|
| 159 |
-
page.css('a', recursive=False) # العناصر المباشرة فقط
|
| 160 |
-
page.css('a', auto_save=True) # حفظ مواضع العناصر تلقائياً
|
| 161 |
-
|
| 162 |
-
# XPath
|
| 163 |
-
page.xpath('//a/text()')
|
| 164 |
-
|
| 165 |
-
# بحث مرن
|
| 166 |
-
page.find_by_text('Python', first_match=True) # البحث بالنص
|
| 167 |
-
page.find_by_regex(r'\d{4}') # البحث بنمط التعبير العادي
|
| 168 |
-
page.find('div', {'class': 'container'}) # البحث بالسمات
|
| 169 |
-
|
| 170 |
-
# التنقل
|
| 171 |
-
element.parent # الحصول على العنصر الوالد
|
| 172 |
-
element.next_sibling # الحصول على الشقيق التالي
|
| 173 |
-
element.children # الحصول على الأطفال
|
| 174 |
-
|
| 175 |
-
# عناصر مشابهة
|
| 176 |
-
similar = page.get_similar(element) # البحث عن عناصر مشابهة
|
| 177 |
-
|
| 178 |
-
# الاستخراج التكيفي
|
| 179 |
-
saved_elements = page.css('.product', auto_save=True)
|
| 180 |
-
# لاحقاً، عندما يتغير الموقع:
|
| 181 |
-
page.css('.product', adaptive=True) # البحث عن العناصر باستخدام المواضع المحفوظة
|
| 182 |
```
|
|
|
|
| 183 |
|
| 184 |
-
### ا
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
```python
|
| 186 |
-
from scrapling.
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 196 |
|
| 197 |
# استخدام جلسة async
|
| 198 |
async with AsyncStealthySession(max_pages=2) as session:
|
| 199 |
tasks = []
|
| 200 |
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 201 |
-
|
| 202 |
for url in urls:
|
| 203 |
task = session.fetch(url)
|
| 204 |
tasks.append(task)
|
| 205 |
-
|
| 206 |
print(session.get_pool_stats()) # اختياري - حالة مجموعة علامات تبويب المتصفح (مشغول/حر/خطأ)
|
| 207 |
results = await asyncio.gather(*tasks)
|
| 208 |
print(session.get_pool_stats())
|
| 209 |
```
|
| 210 |
|
| 211 |
-
## واجهة سطر الأوامر وال
|
| 212 |
|
| 213 |
-
يتضمن Scrapling
|
| 214 |
|
| 215 |
[](https://asciinema.org/a/736339)
|
| 216 |
|
| 217 |
-
تشغيل
|
| 218 |
```bash
|
| 219 |
scrapling shell
|
| 220 |
```
|
| 221 |
-
استخر
|
| 222 |
```bash
|
| 223 |
scrapling extract get 'https://example.com' content.md
|
| 224 |
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # جميع العناصر المطابقة لمحدد CSS '#fromSkipToProducts'
|
|
@@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
|
|
| 227 |
```
|
| 228 |
|
| 229 |
> [!NOTE]
|
| 230 |
-
> هناك العديد من الميزات الإضافية، لكننا نريد إبقاء هذه الصفحة موجزة، م
|
| 231 |
|
| 232 |
## معايير الأداء
|
| 233 |
|
| 234 |
-
Scrapling ليس قوياً ف
|
| 235 |
|
| 236 |
### اختبار سرعة استخراج النص (5000 عنصر متداخل)
|
| 237 |
|
| 238 |
-
| # | المكتبة | الوقت (ms) | vs Scrapling |
|
| 239 |
|---|:-----------------:|:----------:|:------------:|
|
| 240 |
-
| 1 | Scrapling |
|
| 241 |
-
| 2 | Parsel/Scrapy | 2.
|
| 242 |
-
| 3 | Raw Lxml | 2.
|
| 243 |
-
| 4 | PyQuery |
|
| 244 |
-
| 5 | Selectolax |
|
| 245 |
-
| 6 |
|
| 246 |
-
| 7 |
|
| 247 |
-
| 8 | BS4 with html5lib |
|
| 248 |
|
| 249 |
|
| 250 |
### أداء تشابه العناصر والبحث النصي
|
|
@@ -253,39 +332,39 @@ Scrapling ليس قوياً فقط - إنه أيضاً سريع بشكل مذه
|
|
| 253 |
|
| 254 |
| المكتبة | الوقت (ms) | vs Scrapling |
|
| 255 |
|-------------|:----------:|:------------:|
|
| 256 |
-
| Scrapling | 2.
|
| 257 |
-
| AutoScraper |
|
| 258 |
|
| 259 |
|
| 260 |
> تمثل جميع المعايير متوسطات أكثر من 100 تشغيل. انظر [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) للمنهجية.
|
| 261 |
|
| 262 |
## التثبيت
|
| 263 |
|
| 264 |
-
يتطلب Scrapling Python 3.10 أو أعلى:
|
| 265 |
|
| 266 |
```bash
|
| 267 |
pip install scrapling
|
| 268 |
```
|
| 269 |
|
| 270 |
-
|
| 271 |
|
| 272 |
### التبعيات الاختيارية
|
| 273 |
|
| 274 |
1. إذا كنت ستستخدم أياً من الميزات الإضافية أدناه، أو الجوالب، أو فئاتها، فستحتاج إلى تثبيت تبعيات الجوالب وتبعيات المتصفح الخاصة بها على النحو التالي:
|
| 275 |
```bash
|
| 276 |
pip install "scrapling[fetchers]"
|
| 277 |
-
|
| 278 |
scrapling install
|
| 279 |
```
|
| 280 |
|
| 281 |
-
يقوم هذا بتنزيل جميع المتصفحات، إلى جانب تبعيات النظام وتبعيات معالجة
|
| 282 |
|
| 283 |
2. ميزات إضافية:
|
| 284 |
- تثبيت ميزة خادم MCP:
|
| 285 |
```bash
|
| 286 |
pip install "scrapling[ai]"
|
| 287 |
```
|
| 288 |
-
- تثبيت ميزات
|
| 289 |
```bash
|
| 290 |
pip install "scrapling[shell]"
|
| 291 |
```
|
|
@@ -322,14 +401,7 @@ docker pull ghcr.io/d4vinci/scrapling:latest
|
|
| 322 |
## الشكر والتقدير
|
| 323 |
|
| 324 |
يتضمن هذا المشروع كوداً معدلاً من:
|
| 325 |
-
- Parsel (ترخيص BSD)
|
| 326 |
-
|
| 327 |
-
## الشكر والمراجع
|
| 328 |
-
|
| 329 |
-
- العمل الرائع لـ [Daijro](https://github.com/daijro) على [BrowserForge](https://github.com/daijro/browserforge) و[Camoufox](https://github.com/daijro/camoufox)
|
| 330 |
-
- العمل الرائع لـ [Vinyzu](https://github.com/Vinyzu) على [Botright](https://github.com/Vinyzu/Botright) و[PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
|
| 331 |
-
- [brotector](https://github.com/kaliiiiiiiiii/brotector) لتقنيات تجاوز اكتشاف المتصفح
|
| 332 |
-
- [fakebrowser](https://github.com/kkoooqq/fakebrowser) و[BotBrowser](https://github.com/botswin/BotBrowser) لأبحاث البصمات
|
| 333 |
|
| 334 |
---
|
| 335 |
-
<div align="center"><small>مصمم ومصنوع بـ ❤️ بواسطة كريم شعير.</small></div><br>
|
|
|
|
| 1 |
+
<h1 align="center">
|
| 2 |
+
<a href="https://scrapling.readthedocs.io">
|
| 3 |
+
<picture>
|
| 4 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 5 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 6 |
+
</picture>
|
| 7 |
+
</a>
|
| 8 |
+
<br>
|
| 9 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 10 |
+
</h1>
|
| 11 |
+
|
| 12 |
<p align="center">
|
| 13 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 14 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
|
|
| 29 |
</p>
|
| 30 |
|
| 31 |
<p align="center">
|
| 32 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>طرق الاختيار</strong></a>
|
| 33 |
+
·
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>اختيار Fetcher</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>واجهة سطر الأوامر</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>وضع MCP</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>الانتقال من Beautifulsoup</strong></a>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
</p>
|
| 42 |
|
| 43 |
+
Scrapling هو إطار عمل تكيفي لـ Web Scraping يتعامل مع كل شيء من طلب واحد إلى زحف كامل النطاق.
|
| 44 |
|
| 45 |
+
محلله يتعلم من تغييرات المواقع ويعيد تحديد موقع عناصرك تلقائياً عند تحديث الصفحات. جوالبه تتجاوز أنظمة مكافحة الروبوتات مثل Cloudflare Turnstile مباشرةً. وإطار عمل Spider الخاص به يتيح لك التوسع إلى عمليات زحف متزامنة ومتعددة الجلسات مع إيقاف/استئناف وتدوير تلقائي لـ Proxy - كل ذلك في بضعة أسطر من Python. مكتبة واحدة، بدون تنازلات.
|
| 46 |
|
| 47 |
+
زحف سريع للغاية مع إحصائيات فورية و Streaming. مبني بواسطة مستخرجي الويب لمستخرجي الويب والمستخدمين العاديين، هناك شيء للجميع.
|
| 48 |
|
| 49 |
```python
|
| 50 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 51 |
+
StealthyFetcher.adaptive = True
|
| 52 |
+
page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # احصل على الموقع بشكل خفي!
|
| 53 |
+
products = page.css('.product', auto_save=True) # استخرج بيانات تنجو من تغييرات تصميم الموقع!
|
| 54 |
+
products = page.css('.product', adaptive=True) # لاحقاً، إذا تغيرت بنية الموقع، مرر `adaptive=True` للعثور عليها!
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
```
|
| 56 |
+
أو توسع إلى عمليات زحف كاملة
|
| 57 |
+
```python
|
| 58 |
+
from scrapling.spiders import Spider, Response
|
| 59 |
|
| 60 |
+
class MySpider(Spider):
|
| 61 |
+
name = "demo"
|
| 62 |
+
start_urls = ["https://example.com/"]
|
| 63 |
+
|
| 64 |
+
async def parse(self, response: Response):
|
| 65 |
+
for item in response.css('.product'):
|
| 66 |
+
yield {"title": item.css('h2::text').get()}
|
| 67 |
+
|
| 68 |
+
MySpider().start()
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# الرعاة
|
| 73 |
|
| 74 |
<!-- sponsors -->
|
| 75 |
|
|
|
|
| 93 |
|
| 94 |
## الميزات الرئيسية
|
| 95 |
|
| 96 |
+
### Spiders — إطار عمل زحف كامل
|
| 97 |
+
- 🕷️ **واجهة Spider شبيهة بـ Scrapy**: عرّف Spiders مع `start_urls`، و async `parse` callbacks، وكائنات `Request`/`Response`.
|
| 98 |
+
- ⚡ **زحف متزامن**: حدود تزامن قابلة للتكوين، وتحكم بالسرعة حسب النطاق، وتأخيرات التنزيل.
|
| 99 |
+
- 🔄 **دعم الجلسات المتعددة**: واجهة موحدة لطلبات HTTP، ومتصفحات خفية بدون واجهة في Spider واحد — وجّه الطلبات إلى جلسات مختلفة بالمعرّف.
|
| 100 |
+
- 💾 **إيقاف واستئناف**: استمرارية الزحف القائمة على Checkpoint. اضغط Ctrl+C للإيقاف بسلاسة؛ أعد التشغيل للاستئناف من حيث توقفت.
|
| 101 |
+
- 📡 **وضع Streaming**: بث العناصر المستخرجة فور وصولها عبر `async for item in spider.stream()` مع إحصائيات فورية — مثالي لواجهات المستخدم وخطوط الأنابيب وعمليات الزحف الطويلة.
|
| 102 |
+
- 🛡️ **كشف الطلبات المحظورة**: كشف تلقائي وإعادة محاولة للطلبات المحظورة مع منطق قابل للتخصيص.
|
| 103 |
+
- 📦 **تصدير مدمج**: صدّر النتائج عبر الخطافات وخط الأنابيب الخاص بك أو JSON/JSONL المدمج مع `result.items.to_json()` / `result.items.to_jsonl()` على التوالي.
|
| 104 |
+
|
| 105 |
### جلب متقدم للمواقع مع دعم الجلسات
|
| 106 |
+
- **طلبات HTTP**: طلبات HTTP سريعة وخفية مع فئة `Fetcher`. يمكنها تقليد بصمة TLS للمتصفح والرؤوس واستخدام HTTP/3.
|
| 107 |
- **التحميل الديناميكي**: جلب المواقع الديناميكية مع أتمتة كاملة للمتصفح من خلال فئة `DynamicFetcher` التي تدعم Chromium من Playwright و Google Chrome.
|
| 108 |
+
- **تجاوز مكافحة الروبوتات**: قدرات تخفي متقدمة مع `StealthyFetcher` وانتحال fingerprint. يمكنه تجاوز جميع أنواع Turnstile/Interstitial من Cloudflare بسهولة بالأتمتة.
|
| 109 |
- **إدارة الجلسات**: دعم الجلسات المستمرة مع فئات `FetcherSession` و`StealthySession` و`DynamicSession` لإدارة ملفات تعريف الارتباط والحالة عبر الطلبات.
|
| 110 |
+
- **تدوير Proxy**: `ProxyRotator` مدمج مع استراتيجيات التدوير الدوري أو المخصصة عبر جميع أنواع الجلسات، بالإضافة إلى تجاوزات Proxy لكل طلب.
|
| 111 |
+
- **حظر النطاقات**: حظر الطلبات إلى نطاقات محددة (ونطاقاتها الفرعية) في الجوالب المعتمدة على المتصفح.
|
| 112 |
- **دعم Async**: دعم async كامل عبر جميع الجوالب وفئات الجلسات async المخصصة.
|
| 113 |
|
| 114 |
### الاستخراج التكيفي والتكامل مع الذكاء الاصطناعي
|
| 115 |
- 🔄 **تتبع العناصر الذكي**: إعادة تحديد موقع العناصر بعد تغييرات الموقع باستخدام خوارزميات التشابه الذكية.
|
| 116 |
- 🎯 **الاختيار المرن الذكي**: محددات CSS، محددات XPath، البحث القائم على الفلاتر، البحث النصي، البحث بالتعبيرات العادية والمزيد.
|
| 117 |
- 🔍 **البحث عن عناصر مشابهة**: تحديد العناصر المشابهة للعناصر الموجودة تلقائياً.
|
| 118 |
+
- 🤖 **خادم MCP للاستخدام مع الذكاء الاصطناعي**: خادم MCP مدمج لـ Web Scraping بمساعدة الذكاء الاصطناعي واستخراج البيانات. يتميز خادم MCP بقدرات قوية مخصصة تستفيد من Scrapling لاستخراج المحتوى المستهدف قبل تمريره إلى الذكاء الاصطناعي (Claude/Cursor/إلخ)، وبالتالي تسريع العمليات وتقليل التكاليف عن طريق تقليل استخدام الرموز. ([فيديو توضيحي](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 119 |
|
| 120 |
+
### بنية عالية الأداء ومختبرة ميدانياً
|
| 121 |
+
- 🚀 **سريع كالبرق**: أداء محسّن يتفوق على معظم مكتبات Web Scraping في Python.
|
| 122 |
- 🔋 **فعال في استخدام الذاكرة**: هياكل بيانات محسّنة وتحميل كسول لأقل استخدام للذاكرة.
|
| 123 |
- ⚡ **تسلسل JSON سريع**: أسرع 10 مرات من المكتبة القياسية.
|
| 124 |
+
- 🏗️ **مُختبر ميدانياً**: لا يمتلك Scrapling فقط تغطية اختبار بنسبة 92٪ وتغطية كاملة لتلميحات الأنواع، بل تم استخدامه يومياً من قبل مئات مستخرجي الويب خلال العام الماضي.
|
| 125 |
|
| 126 |
### تجربة صديقة للمطورين/مستخرجي الويب
|
| 127 |
+
- 🎯 **Shell تفاعلي لـ Web Scraping**: Shell IPython مدمج اختياري مع تكامل Scrapling، واختصارات، وأدوات جديدة لتسريع تطوير سكريبتات Web Scraping، مثل تحويل طلبات curl إلى طلبات Scrapling وعرض نتائج الطلبات في متصفحك.
|
| 128 |
- 🚀 **استخدمه مباشرة من الطرفية**: اختيارياً، يمكنك استخدام Scrapling لاستخراج عنوان URL دون كتابة سطر واحد من الكود!
|
| 129 |
+
- 🛠️ **واجهة تنقل غنية**: اجتياز DOM متقدم مع طرق التنقل بين العناصر الوالدية والشقيقة والفرعية.
|
| 130 |
+
- 🧬 **معالجة نصوص محسّنة**: تعبيرات عادية مدمجة وطرق تنظيف وعمليات نصية محسّنة.
|
| 131 |
+
- 📝 **إنشاء محددات تلقائي**: إنشاء محددات CSS/XPath قوية لأي عنصر.
|
| 132 |
+
- 🔌 **واجهة مألوفة**: مشابه لـ Scrapy/BeautifulSoup مع نفس العناصر الزائفة المستخدمة في Scrapy/Parsel.
|
| 133 |
+
- 📘 **تغطية كاملة للأنواع**: تلميحات نوع كاملة لدعم IDE ممتاز وإكمال الكود. يتم فحص قاعدة الكود بالكامل تلقائياً بواسطة **PyRight** و**MyPy** مع كل تغيير.
|
| 134 |
- 🔋 **صورة Docker جاهزة**: مع كل إصدار، يتم بناء ودفع صورة Docker تحتوي على جميع المتصفحات تلقائياً.
|
| 135 |
|
| 136 |
## البدء
|
| 137 |
|
| 138 |
+
لنلقِ نظرة سريعة على ما يمكن لـ Scrapling فعله دون التعمق.
|
| 139 |
+
|
| 140 |
### الاستخدام الأساسي
|
| 141 |
+
طلبات HTTP مع دعم الجلسات
|
| 142 |
```python
|
| 143 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
|
|
|
| 144 |
|
|
|
|
| 145 |
with FetcherSession(impersonate='chrome') as session: # استخدم أحدث إصدار من بصمة TLS لـ Chrome
|
| 146 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 147 |
+
quotes = page.css('.quote .text::text').getall()
|
| 148 |
|
| 149 |
# أو استخدم طلبات لمرة واحدة
|
| 150 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 151 |
+
quotes = page.css('.quote .text::text').getall()
|
| 152 |
+
```
|
| 153 |
+
وضع التخفي المتقدم
|
| 154 |
+
```python
|
| 155 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 156 |
|
| 157 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # أبقِ المتصفح مفتوحاً حتى تنتهي
|
|
|
|
| 158 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 159 |
+
data = page.css('#padded_content a').getall()
|
| 160 |
|
| 161 |
# أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء
|
| 162 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 163 |
+
data = page.css('#padded_content a').getall()
|
| 164 |
+
```
|
| 165 |
+
أتمتة المتصفح الكاملة
|
| 166 |
+
```python
|
| 167 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 168 |
+
|
| 169 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # أبقِ المتصفح مفتوحاً حتى تنتهي
|
| 170 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 171 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # محدد XPath إذا كنت تفضله
|
| 172 |
+
|
| 173 |
+
# أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء
|
| 174 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 175 |
+
data = page.css('.quote .text::text').getall()
|
| 176 |
```
|
| 177 |
|
| 178 |
+
### Spiders
|
| 179 |
+
ابنِ زواحف كاملة مع طلبات متزامنة وأنواع جلسات متعددة وإيقاف/استئناف:
|
| 180 |
+
```python
|
| 181 |
+
from scrapling.spiders import Spider, Request, Response
|
| 182 |
+
|
| 183 |
+
class QuotesSpider(Spider):
|
| 184 |
+
name = "quotes"
|
| 185 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 186 |
+
concurrent_requests = 10
|
| 187 |
+
|
| 188 |
+
async def parse(self, response: Response):
|
| 189 |
+
for quote in response.css('.quote'):
|
| 190 |
+
yield {
|
| 191 |
+
"text": quote.css('.text::text').get(),
|
| 192 |
+
"author": quote.css('.author::text').get(),
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
next_page = response.css('.next a')
|
| 196 |
+
if next_page:
|
| 197 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 198 |
+
|
| 199 |
+
result = QuotesSpider().start()
|
| 200 |
+
print(f"Scraped {len(result.items)} quotes")
|
| 201 |
+
result.items.to_json("quotes.json")
|
| 202 |
+
```
|
| 203 |
+
استخدم أنواع جلسات متعددة في Spider واحد:
|
| 204 |
+
```python
|
| 205 |
+
from scrapling.spiders import Spider, Request, Response
|
| 206 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 207 |
+
|
| 208 |
+
class MultiSessionSpider(Spider):
|
| 209 |
+
name = "multi"
|
| 210 |
+
start_urls = ["https://example.com/"]
|
| 211 |
+
|
| 212 |
+
def configure_sessions(self, manager):
|
| 213 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 214 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 215 |
+
|
| 216 |
+
async def parse(self, response: Response):
|
| 217 |
+
for link in response.css('a::attr(href)').getall():
|
| 218 |
+
# وجّه الصفحات المحمية عبر جلسة التخفي
|
| 219 |
+
if "protected" in link:
|
| 220 |
+
yield Request(link, sid="stealth")
|
| 221 |
+
else:
|
| 222 |
+
yield Request(link, sid="fast", callback=self.parse) # callback صريح
|
| 223 |
+
```
|
| 224 |
+
أوقف واستأنف عمليات الزحف الطويلة مع Checkpoints بتشغيل Spider هكذا:
|
| 225 |
```python
|
| 226 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
```
|
| 228 |
+
اضغط Ctrl+C للإيقاف بسلاسة — يتم حفظ التقدم تلقائياً. لاحقاً، عند تشغيل Spider مرة أخرى، مرر نفس `crawldir`، وسيستأنف من حيث توقف.
|
| 229 |
|
| 230 |
+
### التحليل المتقدم والتنقل
|
| 231 |
+
```python
|
| 232 |
+
from scrapling.fetchers import Fetcher
|
| 233 |
+
|
| 234 |
+
# اختيار عناصر غني وتنقل
|
| 235 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 236 |
+
|
| 237 |
+
# احصل على الاقتباسات بطرق اختيار متعددة
|
| 238 |
+
quotes = page.css('.quote') # محدد CSS
|
| 239 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 240 |
+
quotes = page.find_all('div', {'class': 'quote'}) # بأسلوب BeautifulSoup
|
| 241 |
+
# نفس الشيء مثل
|
| 242 |
+
quotes = page.find_all('div', class_='quote')
|
| 243 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 244 |
+
quotes = page.find_all(class_='quote') # وهكذا...
|
| 245 |
+
# البحث عن عنصر بمحتوى النص
|
| 246 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 247 |
+
|
| 248 |
+
# التنقل المتقدم
|
| 249 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 250 |
+
quote_text = page.css('.quote').css('.text::text').getall() # محددات متسلسلة
|
| 251 |
+
first_quote = page.css('.quote')[0]
|
| 252 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 253 |
+
parent_container = first_quote.parent
|
| 254 |
+
|
| 255 |
+
# علاقات العناصر والتشابه
|
| 256 |
+
similar_elements = first_quote.find_similar()
|
| 257 |
+
below_elements = first_quote.below_elements()
|
| 258 |
+
```
|
| 259 |
+
يمكنك استخدام المحلل مباشرة إذا كنت لا تريد جلب المواقع كما يلي:
|
| 260 |
```python
|
| 261 |
+
from scrapling.parser import Selector
|
| 262 |
+
|
| 263 |
+
page = Selector("<html>...</html>")
|
| 264 |
+
```
|
| 265 |
+
وهو يعمل بنفس الطريقة تماماً!
|
| 266 |
+
|
| 267 |
+
### أمثلة إدارة الجلسات بشكل Async
|
| 268 |
+
```python
|
| 269 |
+
import asyncio
|
| 270 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 271 |
+
|
| 272 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession` واعٍ بالسياق ويعمل في كلا النمطين المتزامن/async
|
| 273 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
| 274 |
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 275 |
|
| 276 |
# استخدام جلسة async
|
| 277 |
async with AsyncStealthySession(max_pages=2) as session:
|
| 278 |
tasks = []
|
| 279 |
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 280 |
+
|
| 281 |
for url in urls:
|
| 282 |
task = session.fetch(url)
|
| 283 |
tasks.append(task)
|
| 284 |
+
|
| 285 |
print(session.get_pool_stats()) # اختياري - حالة مجموعة علامات تبويب المتصفح (مشغول/حر/خطأ)
|
| 286 |
results = await asyncio.gather(*tasks)
|
| 287 |
print(session.get_pool_stats())
|
| 288 |
```
|
| 289 |
|
| 290 |
+
## واجهة سطر الأوامر والـ Shell التفاعلي
|
| 291 |
|
| 292 |
+
يتضمن Scrapling واجهة سطر أوامر قوية:
|
| 293 |
|
| 294 |
[](https://asciinema.org/a/736339)
|
| 295 |
|
| 296 |
+
تشغيل Shell الـ Web Scraping التفاعلي
|
| 297 |
```bash
|
| 298 |
scrapling shell
|
| 299 |
```
|
| 300 |
+
استخرج الصفحات إلى ملف مباشرة دون برمجة (يستخرج المحتوى داخل وسم `body` افتراضياً). إذا انتهى ملف الإخراج بـ `.txt`، فسيتم استخراج محتوى النص للهدف. إذا انتهى بـ `.md`، فسيكون تمثيل Markdown لمحتوى HTML؛ إذا انتهى بـ `.html`، فسيكون محتوى HTML نفسه.
|
| 301 |
```bash
|
| 302 |
scrapling extract get 'https://example.com' content.md
|
| 303 |
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # جميع العناصر المطابقة لمحدد CSS '#fromSkipToProducts'
|
|
|
|
| 306 |
```
|
| 307 |
|
| 308 |
> [!NOTE]
|
| 309 |
+
> هناك العديد من الميزات الإضافية، لكننا نريد إبقاء هذه الصفحة موجزة، بما في ذلك خادم MCP والـ Shell التفاعلي لـ Web Scraping. تحقق من الوثائق الكاملة [هنا](https://scrapling.readthedocs.io/en/latest/)
|
| 310 |
|
| 311 |
## معايير الأداء
|
| 312 |
|
| 313 |
+
Scrapling ليس قوياً فحسب — بل هو أيضاً سريع بشكل مذهل. تقارن المعايير التالية محلل Scrapling مع أحدث إصدارات المكتبات الشائعة الأخرى.
|
| 314 |
|
| 315 |
### اختبار سرعة استخراج النص (5000 عنصر متداخل)
|
| 316 |
|
| 317 |
+
| # | المكتبة | الوقت (ms) | vs Scrapling |
|
| 318 |
|---|:-----------------:|:----------:|:------------:|
|
| 319 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 320 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01 |
|
| 321 |
+
| 3 | Raw Lxml | 2.54 | 1.257 |
|
| 322 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 323 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 324 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 325 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 326 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 327 |
|
| 328 |
|
| 329 |
### أداء تشابه العناصر والبحث النصي
|
|
|
|
| 332 |
|
| 333 |
| المكتبة | الوقت (ms) | vs Scrapling |
|
| 334 |
|-------------|:----------:|:------------:|
|
| 335 |
+
| Scrapling | 2.39 | 1.0x |
|
| 336 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 337 |
|
| 338 |
|
| 339 |
> تمثل جميع المعايير متوسطات أكثر من 100 تشغيل. انظر [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) للمنهجية.
|
| 340 |
|
| 341 |
## التثبيت
|
| 342 |
|
| 343 |
+
يتطلب Scrapling إصدار Python 3.10 أو أعلى:
|
| 344 |
|
| 345 |
```bash
|
| 346 |
pip install scrapling
|
| 347 |
```
|
| 348 |
|
| 349 |
+
يتضمن هذا التثبيت فقط محرك المحلل وتبعياته، بدون أي جوالب أو تبعيات سطر الأوامر.
|
| 350 |
|
| 351 |
### التبعيات الاختيارية
|
| 352 |
|
| 353 |
1. إذا كنت ستستخدم أياً من الميزات الإضافية أدناه، أو الجوالب، أو فئاتها، فستحتاج إلى تثبيت تبعيات الجوالب وتبعيات المتصفح الخاصة بها على النحو التالي:
|
| 354 |
```bash
|
| 355 |
pip install "scrapling[fetchers]"
|
| 356 |
+
|
| 357 |
scrapling install
|
| 358 |
```
|
| 359 |
|
| 360 |
+
يقوم هذا بتنزيل جميع المتصفحات، إلى جانب تبعيات النظام وتبعيات معالجة fingerprint الخاصة بها.
|
| 361 |
|
| 362 |
2. ميزات إضافية:
|
| 363 |
- تثبيت ميزة خادم MCP:
|
| 364 |
```bash
|
| 365 |
pip install "scrapling[ai]"
|
| 366 |
```
|
| 367 |
+
- تثبيت ميزات Shell (Shell الـ Web Scraping وأمر `extract`):
|
| 368 |
```bash
|
| 369 |
pip install "scrapling[shell]"
|
| 370 |
```
|
|
|
|
| 401 |
## الشكر والتقدير
|
| 402 |
|
| 403 |
يتضمن هذا المشروع كوداً معدلاً من:
|
| 404 |
+
- Parsel (ترخيص BSD) — يُستخدم للوحدة الفرعية [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
|
| 406 |
---
|
| 407 |
+
<div align="center"><small>مصمم ومصنوع بـ ❤️ بواسطة كريم شعير.</small></div><br>
|
docs/README_CN.md
CHANGED
|
@@ -1,9 +1,14 @@
|
|
| 1 |
-
<
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
</
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
<p align="center">
|
| 8 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 9 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
@@ -24,46 +29,47 @@
|
|
| 24 |
</p>
|
| 25 |
|
| 26 |
<p align="center">
|
| 27 |
-
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
|
| 28 |
-
|
| 29 |
-
</a>
|
| 30 |
-
|
| 31 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 32 |
-
|
| 33 |
-
</a>
|
| 34 |
-
|
| 35 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 36 |
-
命令行界面
|
| 37 |
-
</a>
|
| 38 |
-
·
|
| 39 |
-
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
|
| 40 |
-
MCP模式
|
| 41 |
-
</a>
|
| 42 |
-
·
|
| 43 |
-
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
|
| 44 |
-
从Beautifulsoup迁移
|
| 45 |
-
</a>
|
| 46 |
</p>
|
| 47 |
|
| 48 |
-
|
| 49 |
|
| 50 |
-
|
| 51 |
|
| 52 |
-
|
| 53 |
|
| 54 |
```python
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
# 隐秘地获取网站
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
200
|
| 61 |
-
>> products = page.css('.product', auto_save=True) # 抓取在网站设计变更后仍能存活的数据!
|
| 62 |
-
>> # 之后,如果网站结构改变,传递 `adaptive=True`
|
| 63 |
-
>> products = page.css('.product', adaptive=True) # Scrapling仍然能找到它们!
|
| 64 |
```
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
<!-- sponsors -->
|
| 69 |
|
|
@@ -87,122 +93,195 @@ Scrapling不仅仅是另一个网页抓取库。它是第一个**自适应**抓
|
|
| 87 |
|
| 88 |
## 主要特性
|
| 89 |
|
| 90 |
-
###
|
| 91 |
-
- **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
- **动态加载**:通过`DynamicFetcher`类使用完整的浏览器自动化获取动态网站,支持Playwright的Chromium和Google Chrome。
|
| 93 |
-
- **反机器人绕过**:使用`StealthyFetcher`的高级隐秘功能和
|
| 94 |
-
- **
|
| 95 |
-
- **
|
|
|
|
|
|
|
| 96 |
|
| 97 |
### 自适应抓取和AI集成
|
| 98 |
- 🔄 **智能元素跟踪**:使用智能相似性算法在网站更改后重新定位元素。
|
| 99 |
- 🎯 **智能灵活选择**:CSS选择器、XPath选择器、基于过滤器的搜索、文本搜索、正则表达式搜索等。
|
| 100 |
-
- 🔍 **查找相似元素**:自动定位与找到
|
| 101 |
-
- 🤖 **与AI一起使用的MCP服务器**:内置MCP服务器用于AI辅助
|
| 102 |
|
| 103 |
### 高性能和经过实战测试的架构
|
| 104 |
- 🚀 **闪电般快速**:优化性能超越大多数Python抓取库。
|
| 105 |
- 🔋 **内存高效**:优化的数据结构和延迟加载,最小内存占用。
|
| 106 |
- ⚡ **快速JSON序列化**:比标准库快10倍。
|
| 107 |
-
- 🏗️ **经过实战测试**:Scrapling不仅拥有92%的测试覆盖率和完整的类型提示覆盖率,而且在过去一年中每天被数百名
|
| 108 |
|
| 109 |
-
### 对开发者/
|
| 110 |
-
- 🎯 **交互式
|
| 111 |
- 🚀 **直接从终端使用**:可选地,您可以使用Scrapling抓取URL而无需编写任何代码!
|
| 112 |
- 🛠️ **丰富的导航API**:使用父级、兄弟级和子级导航方法进行高级DOM遍历。
|
| 113 |
- 🧬 **增强的文本处理**:内置正则表达式、清理方法和优化的字符串操作。
|
| 114 |
- 📝 **自动选择器生成**:为任何元素生成强大的CSS/XPath选择器。
|
| 115 |
- 🔌 **熟悉的API**:类似于Scrapy/BeautifulSoup,使用与Scrapy/Parsel相同的伪元素。
|
| 116 |
-
- 📘 **完整的类型覆盖**:完整的类型提示,出色的IDE支持和代码补全。
|
| 117 |
- 🔋 **现成的Docker镜像**:每次发布时,包含所有浏览器的Docker镜像会自动构建和推送。
|
| 118 |
|
| 119 |
## 入门
|
| 120 |
|
|
|
|
|
|
|
| 121 |
### 基本用法
|
|
|
|
| 122 |
```python
|
| 123 |
-
from scrapling.fetchers import Fetcher,
|
| 124 |
-
from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
|
| 125 |
|
| 126 |
-
#
|
| 127 |
-
with FetcherSession(impersonate='chrome') as session: # 使用Chrome的最新版本TLS指纹
|
| 128 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 129 |
-
quotes = page.css('.quote .text::text')
|
| 130 |
|
| 131 |
# 或使用一次性请求
|
| 132 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 133 |
-
quotes = page.css('.quote .text::text')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
#
|
| 136 |
-
with StealthySession(headless=True, solve_cloudflare=True) as session:
|
| 137 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 138 |
-
data = page.css('#padded_content a')
|
| 139 |
|
| 140 |
# 或使用一次性请求样式,为此请求打开浏览器,完成后关闭
|
| 141 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 142 |
-
data = page.css('#padded_content a')
|
| 143 |
-
|
| 144 |
-
# 完整的浏览器自动化(保持浏览器打开直到完成)
|
| 145 |
-
with DynamicSession(headless=True) as session:
|
| 146 |
-
page = session.fetch('https://quotes.toscrape.com/', network_idle=True)
|
| 147 |
-
quotes = page.css('.quote .text::text')
|
| 148 |
-
|
| 149 |
-
# 或使用一次性请求样式
|
| 150 |
-
page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True)
|
| 151 |
-
quotes = page.css('.quote .text::text')
|
| 152 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
```python
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
page
|
| 159 |
-
page.css('a', recursive=False) # 仅直接元素
|
| 160 |
-
page.css('a', auto_save=True) # 自动保存元素位置
|
| 161 |
-
|
| 162 |
-
# XPath
|
| 163 |
-
page.xpath('//a/text()')
|
| 164 |
-
|
| 165 |
-
# 灵活搜索
|
| 166 |
-
page.find_by_text('Python', first_match=True) # 按文本查找
|
| 167 |
-
page.find_by_regex(r'\d{4}') # 按正则表达式模式查找
|
| 168 |
-
page.find('div', {'class': 'container'}) # 按属性查找
|
| 169 |
-
|
| 170 |
-
# 导航
|
| 171 |
-
element.parent # 获取父元素
|
| 172 |
-
element.next_sibling # 获取下一个兄弟元素
|
| 173 |
-
element.children # 获取子元素
|
| 174 |
-
|
| 175 |
-
# 相似元素
|
| 176 |
-
similar = page.get_similar(element) # 查找相似元素
|
| 177 |
-
|
| 178 |
-
# 自适应抓取
|
| 179 |
-
saved_elements = page.css('.product', auto_save=True)
|
| 180 |
-
# 之后,当网站更改时:
|
| 181 |
-
page.css('.product', adaptive=True) # 使用保存的位置查找元素
|
| 182 |
```
|
|
|
|
| 183 |
|
| 184 |
-
###
|
| 185 |
```python
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
with FetcherSession() as session:
|
| 190 |
-
|
| 191 |
-
page1 = session.get('https://quotes.toscrape.com/login')
|
| 192 |
-
page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'})
|
| 193 |
-
|
| 194 |
-
# 如需要,切换浏览器指纹
|
| 195 |
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 196 |
|
| 197 |
-
#
|
| 198 |
async with AsyncStealthySession(max_pages=2) as session:
|
| 199 |
tasks = []
|
| 200 |
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 201 |
-
|
| 202 |
for url in urls:
|
| 203 |
task = session.fetch(url)
|
| 204 |
tasks.append(task)
|
| 205 |
-
|
| 206 |
print(session.get_pool_stats()) # 可选 - 浏览器标签池的状态(忙/空闲/错误)
|
| 207 |
results = await asyncio.gather(*tasks)
|
| 208 |
print(session.get_pool_stats())
|
|
@@ -210,11 +289,11 @@ async with AsyncStealthySession(max_pages=2) as session:
|
|
| 210 |
|
| 211 |
## CLI和交互式Shell
|
| 212 |
|
| 213 |
-
Scrapling
|
| 214 |
|
| 215 |
[](https://asciinema.org/a/736339)
|
| 216 |
|
| 217 |
-
启动交互式
|
| 218 |
```bash
|
| 219 |
scrapling shell
|
| 220 |
```
|
|
@@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
|
|
| 227 |
```
|
| 228 |
|
| 229 |
> [!NOTE]
|
| 230 |
-
> 还有许多其他功能,但我们希望保持此页面简洁,
|
| 231 |
|
| 232 |
## 性能基准
|
| 233 |
|
| 234 |
-
Scrapling不仅功能强大——它还速度极快
|
| 235 |
|
| 236 |
### 文本提取速度测试(5000个嵌套元素)
|
| 237 |
|
| 238 |
-
| # | 库 | 时间(ms) | vs Scrapling |
|
| 239 |
-
|---|:-----------------:|:-------:|:------------:|
|
| 240 |
-
| 1 | Scrapling |
|
| 241 |
-
| 2 | Parsel/Scrapy |
|
| 242 |
-
| 3 | Raw Lxml | 2.
|
| 243 |
-
| 4 | PyQuery |
|
| 244 |
-
| 5 | Selectolax |
|
| 245 |
-
| 6 |
|
| 246 |
-
| 7 |
|
| 247 |
-
| 8 | BS4 with html5lib |
|
| 248 |
|
| 249 |
|
| 250 |
### 元素相似性和文本搜索性能
|
|
@@ -252,9 +331,9 @@ Scrapling不仅功能强大——它还速度极快,自0.3版本以来的更
|
|
| 252 |
Scrapling的自适应元素查找功能明显优于替代方案:
|
| 253 |
|
| 254 |
| 库 | 时间(ms) | vs Scrapling |
|
| 255 |
-
|-------------|:------:|:------------:|
|
| 256 |
-
| Scrapling |
|
| 257 |
-
| AutoScraper |
|
| 258 |
|
| 259 |
|
| 260 |
> 所有基准测试代表100+次运行的平均值。请参阅[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)了解方法。
|
|
@@ -267,25 +346,25 @@ Scrapling需要Python 3.10或更高版本:
|
|
| 267 |
pip install scrapling
|
| 268 |
```
|
| 269 |
|
| 270 |
-
|
| 271 |
|
| 272 |
### 可选依赖项
|
| 273 |
|
| 274 |
-
1. 如果您要使用以下任何额外功能、
|
| 275 |
```bash
|
| 276 |
pip install "scrapling[fetchers]"
|
| 277 |
-
|
| 278 |
scrapling install
|
| 279 |
```
|
| 280 |
|
| 281 |
-
这会下载所有浏览器,以及它们的系统依赖项和
|
| 282 |
|
| 283 |
2. 额外功能:
|
| 284 |
- 安装MCP服务器功能:
|
| 285 |
```bash
|
| 286 |
pip install "scrapling[ai]"
|
| 287 |
```
|
| 288 |
-
- 安装
|
| 289 |
```bash
|
| 290 |
pip install "scrapling[shell]"
|
| 291 |
```
|
|
@@ -324,12 +403,5 @@ docker pull ghcr.io/d4vinci/scrapling:latest
|
|
| 324 |
此项目包含改编自以下内容的代码:
|
| 325 |
- Parsel(BSD许可证)——用于[translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)子模块
|
| 326 |
|
| 327 |
-
## 感谢和参考
|
| 328 |
-
|
| 329 |
-
- [Daijro](https://github.com/daijro)在[BrowserForge](https://github.com/daijro/browserforge)和[Camoufox](https://github.com/daijro/camoufox)上的出色工作
|
| 330 |
-
- [Vinyzu](https://github.com/Vinyzu)在[Botright](https://github.com/Vinyzu/Botright)和[PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)上的出色工作
|
| 331 |
-
- [brotector](https://github.com/kaliiiiiiiiii/brotector)提供的浏览器检测绕过技术
|
| 332 |
-
- [fakebrowser](https://github.com/kkoooqq/fakebrowser)和[BotBrowser](https://github.com/botswin/BotBrowser)提供的指纹识别研究
|
| 333 |
-
|
| 334 |
---
|
| 335 |
-
<div align="center"><small>由Karim Shoair用❤️设计和制作。</small></div><br>
|
|
|
|
| 1 |
+
<h1 align="center">
|
| 2 |
+
<a href="https://scrapling.readthedocs.io">
|
| 3 |
+
<picture>
|
| 4 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 5 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 6 |
+
</picture>
|
| 7 |
+
</a>
|
| 8 |
+
<br>
|
| 9 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 10 |
+
</h1>
|
| 11 |
+
|
| 12 |
<p align="center">
|
| 13 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 14 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
|
|
| 29 |
</p>
|
| 30 |
|
| 31 |
<p align="center">
|
| 32 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>选择方法</strong></a>
|
| 33 |
+
·
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>选择Fetcher</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCP模式</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>从Beautifulsoup迁移</strong></a>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
</p>
|
| 42 |
|
| 43 |
+
Scrapling是一个自适应Web Scraping框架,能处理从单个请求到大规模爬取的一切需求。
|
| 44 |
|
| 45 |
+
它的解析器能够从网站变化中学习,并在页面更新时自动重新定位您的元素。它的Fetcher能够开箱即用地绕过Cloudflare Turnstile等反机器人系统。它的Spider框架让您可以扩展到并发、多Session爬取,支持暂停/恢复和自动Proxy轮换——只需几行Python代码。一个库,零妥协。
|
| 46 |
|
| 47 |
+
极速爬取,实时统计和Streaming。由Web Scraper为Web Scraper和普通用户而构建,每个人都能找到适合自己的功能。
|
| 48 |
|
| 49 |
```python
|
| 50 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 51 |
+
StealthyFetcher.adaptive = True
|
| 52 |
+
page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # 隐秘地获取网站!
|
| 53 |
+
products = page.css('.product', auto_save=True) # 抓取在网站设计变更后仍能存活的数据!
|
| 54 |
+
products = page.css('.product', adaptive=True) # 之后,如果网站结构改变,传递 `adaptive=True` 来找到它们!
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
```
|
| 56 |
+
或扩展为完整爬取
|
| 57 |
+
```python
|
| 58 |
+
from scrapling.spiders import Spider, Response
|
| 59 |
|
| 60 |
+
class MySpider(Spider):
|
| 61 |
+
name = "demo"
|
| 62 |
+
start_urls = ["https://example.com/"]
|
| 63 |
+
|
| 64 |
+
async def parse(self, response: Response):
|
| 65 |
+
for item in response.css('.product'):
|
| 66 |
+
yield {"title": item.css('h2::text').get()}
|
| 67 |
+
|
| 68 |
+
MySpider().start()
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# 赞助商
|
| 73 |
|
| 74 |
<!-- sponsors -->
|
| 75 |
|
|
|
|
| 93 |
|
| 94 |
## 主要特性
|
| 95 |
|
| 96 |
+
### Spider — 完整的爬取框架
|
| 97 |
+
- 🕷️ **类Scrapy的Spider API**:使用`start_urls`、async `parse` callback和`Request`/`Response`对象定义Spider。
|
| 98 |
+
- ⚡ **并发爬取**:可配置的并发限制、按域名节流和下载延迟。
|
| 99 |
+
- 🔄 **多Session支持**:统一接口,支持HTTP请求和隐秘无头浏览器在同一个Spider中使用——通过ID将请求路由到不同的Session。
|
| 100 |
+
- 💾 **暂停与恢复**:基于Checkpoint的爬取持久化。按Ctrl+C优雅关闭;重启后从上次停止的地方继续。
|
| 101 |
+
- 📡 **Streaming模式**:通过`async for item in spider.stream()`以实时统计Streaming抓取的数据——非常适合UI、管道和长时间运行的爬取。
|
| 102 |
+
- 🛡️ **被阻止请求检测**:自动检测并重试被阻止的请求,支持自定义逻辑。
|
| 103 |
+
- 📦 **内置导出**:通过钩子和您自己的管道导出结果,或使用内置的JSON/JSONL,分别通过`result.items.to_json()`/`result.items.to_jsonl()`。
|
| 104 |
+
|
| 105 |
+
### 支持Session的高级网站获取
|
| 106 |
+
- **HTTP请求**:使用`Fetcher`类进行快速和隐秘的HTTP请求。可以模拟浏览器的TLS fingerprint、标头并使用HTTP/3。
|
| 107 |
- **动态加载**:通过`DynamicFetcher`类使用完整的浏览器自动化获取动态网站,支持Playwright的Chromium和Google Chrome。
|
| 108 |
+
- **反机器人绕过**:使用`StealthyFetcher`的高级隐秘功能和fingerprint伪装。可以轻松自动绕过所有类型的Cloudflare Turnstile/Interstitial。
|
| 109 |
+
- **Session管理**:使用`FetcherSession`、`StealthySession`和`DynamicSession`类实现持久化Session支持,用于跨请求的cookie和状态管理。
|
| 110 |
+
- **Proxy轮换**:内置`ProxyRotator`,支持轮询或自定义策略,适用于所有Session类型,并支持按请求覆盖Proxy。
|
| 111 |
+
- **域名屏蔽**:在基于浏览器的Fetcher中屏蔽对特定域名(及其子域名)的请求。
|
| 112 |
+
- **Async支持**:所有Fetcher和专用async Session类的完整async支持。
|
| 113 |
|
| 114 |
### 自适应抓取和AI集成
|
| 115 |
- 🔄 **智能元素跟踪**:使用智能相似性算法在网站更改后重新定位元素。
|
| 116 |
- 🎯 **智能灵活选择**:CSS选择器、XPath选择器、基于过滤器的搜索、文本搜索、正则表达式搜索等。
|
| 117 |
+
- 🔍 **查找相似元素**:自动定位与已找到元素相似的元素。
|
| 118 |
+
- 🤖 **与AI一起使用的MCP服务器**:内置MCP服务器用于AI辅助Web Scraping和数据提取。MCP服务器具有强大的自定义功能,利用Scrapling在将内容传递给AI(Claude/Cursor等)之前提取目标内容,从而加快操作并通过最小化token使用来降低成本。([演示视频](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 119 |
|
| 120 |
### 高性能和经过实战测试的架构
|
| 121 |
- 🚀 **闪电般快速**:优化性能超越大多数Python抓取库。
|
| 122 |
- 🔋 **内存高效**:优化的数据结构和延迟加载,最小内存占用。
|
| 123 |
- ⚡ **快速JSON序列化**:比标准库快10倍。
|
| 124 |
+
- 🏗️ **经过实战测试**:Scrapling不仅拥有92%的测试覆盖率和完整的类型提示覆盖率,而且在过去一年中每天被数百名Web Scraper使用。
|
| 125 |
|
| 126 |
+
### 对开发者/Web Scraper友好的体验
|
| 127 |
+
- 🎯 **交互式Web Scraping Shell**:可选的内置IPython Shell,具有Scrapling集成、快捷方式和新工具,可加快Web Scraping脚本开发,例如将curl请求转换为Scrapling请求并在浏览器中查看请求结果。
|
| 128 |
- 🚀 **直接从终端使用**:可选地,您可以使用Scrapling抓取URL而无需编写任何代码!
|
| 129 |
- 🛠️ **丰富的导航API**:使用父级、兄弟级和子级导航方法进行高级DOM遍历。
|
| 130 |
- 🧬 **增强的文本处理**:内置正则表达式、清理方法和优化的字符串操作。
|
| 131 |
- 📝 **自动选择器生成**:为任何元素生成强大的CSS/XPath选择器。
|
| 132 |
- 🔌 **熟悉的API**:类似于Scrapy/BeautifulSoup,使用与Scrapy/Parsel相同的伪元素。
|
| 133 |
+
- 📘 **完整的类型覆盖**:完整的类型提示,出色的IDE支持和代码补全。整个代码库在每次更改时都会自动使用**PyRight**和**MyPy**扫描。
|
| 134 |
- 🔋 **现成的Docker镜像**:每次发布时,包含所有浏览器的Docker镜像会自动构建和推送。
|
| 135 |
|
| 136 |
## 入门
|
| 137 |
|
| 138 |
+
让我们快速展示Scrapling的功能,无需深入了解。
|
| 139 |
+
|
| 140 |
### 基本用法
|
| 141 |
+
支持Session的HTTP请求
|
| 142 |
```python
|
| 143 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
|
|
|
| 144 |
|
| 145 |
+
with FetcherSession(impersonate='chrome') as session: # 使用Chrome的最新版本TLS fingerprint
|
|
|
|
| 146 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 147 |
+
quotes = page.css('.quote .text::text').getall()
|
| 148 |
|
| 149 |
# 或使用一次性请求
|
| 150 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 151 |
+
quotes = page.css('.quote .text::text').getall()
|
| 152 |
+
```
|
| 153 |
+
高级隐秘模式
|
| 154 |
+
```python
|
| 155 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 156 |
|
| 157 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # 保持浏览器打开直到完成
|
|
|
|
| 158 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 159 |
+
data = page.css('#padded_content a').getall()
|
| 160 |
|
| 161 |
# 或使用一次性请求样式,为此请求打开浏览器,完成后关闭
|
| 162 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 163 |
+
data = page.css('#padded_content a').getall()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
```
|
| 165 |
+
完整的浏览器自动化
|
| 166 |
+
```python
|
| 167 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 168 |
+
|
| 169 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # 保持浏览器打开直到完成
|
| 170 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 171 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # 如果您偏好XPath选择器
|
| 172 |
|
| 173 |
+
# 或使用一次性请求样式,为此请求打开浏览器,完成后关闭
|
| 174 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 175 |
+
data = page.css('.quote .text::text').getall()
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
### Spider
|
| 179 |
+
构建具有并发请求、多种Session类型和暂停/恢复功能的完整爬虫:
|
| 180 |
+
```python
|
| 181 |
+
from scrapling.spiders import Spider, Request, Response
|
| 182 |
+
|
| 183 |
+
class QuotesSpider(Spider):
|
| 184 |
+
name = "quotes"
|
| 185 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 186 |
+
concurrent_requests = 10
|
| 187 |
+
|
| 188 |
+
async def parse(self, response: Response):
|
| 189 |
+
for quote in response.css('.quote'):
|
| 190 |
+
yield {
|
| 191 |
+
"text": quote.css('.text::text').get(),
|
| 192 |
+
"author": quote.css('.author::text').get(),
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
next_page = response.css('.next a')
|
| 196 |
+
if next_page:
|
| 197 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 198 |
+
|
| 199 |
+
result = QuotesSpider().start()
|
| 200 |
+
print(f"抓取了 {len(result.items)} 条引用")
|
| 201 |
+
result.items.to_json("quotes.json")
|
| 202 |
+
```
|
| 203 |
+
在单个Spider中使用多种Session类型:
|
| 204 |
+
```python
|
| 205 |
+
from scrapling.spiders import Spider, Request, Response
|
| 206 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 207 |
+
|
| 208 |
+
class MultiSessionSpider(Spider):
|
| 209 |
+
name = "multi"
|
| 210 |
+
start_urls = ["https://example.com/"]
|
| 211 |
+
|
| 212 |
+
def configure_sessions(self, manager):
|
| 213 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 214 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 215 |
+
|
| 216 |
+
async def parse(self, response: Response):
|
| 217 |
+
for link in response.css('a::attr(href)').getall():
|
| 218 |
+
# 将受保护的页面路由到隐秘Session
|
| 219 |
+
if "protected" in link:
|
| 220 |
+
yield Request(link, sid="stealth")
|
| 221 |
+
else:
|
| 222 |
+
yield Request(link, sid="fast", callback=self.parse) # 显式callback
|
| 223 |
+
```
|
| 224 |
+
通过如下方式运行Spider来暂停和恢复长时间爬取,使用Checkpoint:
|
| 225 |
+
```python
|
| 226 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
| 227 |
+
```
|
| 228 |
+
按Ctrl+C优雅暂停——进度会自动保存。之后,当您再次启动Spider时,传递相同的`crawldir`,它将从上次停止的地方继续。
|
| 229 |
+
|
| 230 |
+
### 高级解析与导航
|
| 231 |
+
```python
|
| 232 |
+
from scrapling.fetchers import Fetcher
|
| 233 |
+
|
| 234 |
+
# 丰富的元素选择和导航
|
| 235 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 236 |
+
|
| 237 |
+
# 使用多种选择方法获取引用
|
| 238 |
+
quotes = page.css('.quote') # CSS选择器
|
| 239 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 240 |
+
quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup风格
|
| 241 |
+
# 等同于
|
| 242 |
+
quotes = page.find_all('div', class_='quote')
|
| 243 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 244 |
+
quotes = page.find_all(class_='quote') # 等等...
|
| 245 |
+
# 按文本内容查找元素
|
| 246 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 247 |
+
|
| 248 |
+
# 高级导航
|
| 249 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 250 |
+
quote_text = page.css('.quote').css('.text::text').getall() # 链式选择器
|
| 251 |
+
first_quote = page.css('.quote')[0]
|
| 252 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 253 |
+
parent_container = first_quote.parent
|
| 254 |
+
|
| 255 |
+
# 元素关系和相似性
|
| 256 |
+
similar_elements = first_quote.find_similar()
|
| 257 |
+
below_elements = first_quote.below_elements()
|
| 258 |
+
```
|
| 259 |
+
如果您不想获取网站,可以直接使用解析器,如下所示:
|
| 260 |
```python
|
| 261 |
+
from scrapling.parser import Selector
|
| 262 |
+
|
| 263 |
+
page = Selector("<html>...</html>")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
```
|
| 265 |
+
用法完全相同!
|
| 266 |
|
| 267 |
+
### Async Session管理示例
|
| 268 |
```python
|
| 269 |
+
import asyncio
|
| 270 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 271 |
+
|
| 272 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession`是上下文感知的,可以在sync/async模式下工作
|
| 273 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 275 |
|
| 276 |
+
# Async Session用法
|
| 277 |
async with AsyncStealthySession(max_pages=2) as session:
|
| 278 |
tasks = []
|
| 279 |
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 280 |
+
|
| 281 |
for url in urls:
|
| 282 |
task = session.fetch(url)
|
| 283 |
tasks.append(task)
|
| 284 |
+
|
| 285 |
print(session.get_pool_stats()) # 可选 - 浏览器标签池的状态(忙/空闲/错误)
|
| 286 |
results = await asyncio.gather(*tasks)
|
| 287 |
print(session.get_pool_stats())
|
|
|
|
| 289 |
|
| 290 |
## CLI和交互式Shell
|
| 291 |
|
| 292 |
+
Scrapling包含强大的命令行界面:
|
| 293 |
|
| 294 |
[](https://asciinema.org/a/736339)
|
| 295 |
|
| 296 |
+
启动交互式Web Scraping Shell
|
| 297 |
```bash
|
| 298 |
scrapling shell
|
| 299 |
```
|
|
|
|
| 306 |
```
|
| 307 |
|
| 308 |
> [!NOTE]
|
| 309 |
+
> 还有许多其他功能,但我们希望保持此页面简洁,包括MCP服务器和交互式Web Scraping Shell。查看完整文档[这里](https://scrapling.readthedocs.io/en/latest/)
|
| 310 |
|
| 311 |
## 性能基准
|
| 312 |
|
| 313 |
+
Scrapling不仅功能强大——它还速度极快。以下基准测试将Scrapling的解析器与其他流行库的最新版本进行了比较。
|
| 314 |
|
| 315 |
### 文本提取速度测试(5000个嵌套元素)
|
| 316 |
|
| 317 |
+
| # | 库 | 时间(ms) | vs Scrapling |
|
| 318 |
+
|---|:-----------------:|:---------:|:------------:|
|
| 319 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 320 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01 |
|
| 321 |
+
| 3 | Raw Lxml | 2.54 | 1.257 |
|
| 322 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 323 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 324 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 325 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 326 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 327 |
|
| 328 |
|
| 329 |
### 元素相似性和文本搜索性能
|
|
|
|
| 331 |
Scrapling的自适应元素查找功能明显优于替代方案:
|
| 332 |
|
| 333 |
| 库 | 时间(ms) | vs Scrapling |
|
| 334 |
+
|-------------|:---------:|:------------:|
|
| 335 |
+
| Scrapling | 2.39 | 1.0x |
|
| 336 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 337 |
|
| 338 |
|
| 339 |
> 所有基准测试代表100+次运行的平均值。请参阅[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)了解方法。
|
|
|
|
| 346 |
pip install scrapling
|
| 347 |
```
|
| 348 |
|
| 349 |
+
此安装仅包括解析器引擎及其依赖项,没有任何Fetcher或命令行依赖项。
|
| 350 |
|
| 351 |
### 可选依赖项
|
| 352 |
|
| 353 |
+
1. 如果您要使用以下任何额外功能、Fetcher或它们的类,您将需要安装Fetcher的依赖项和它们的浏览器依赖项,如下所示:
|
| 354 |
```bash
|
| 355 |
pip install "scrapling[fetchers]"
|
| 356 |
+
|
| 357 |
scrapling install
|
| 358 |
```
|
| 359 |
|
| 360 |
+
这会下载所有浏览器,以及它们的系统依赖项和fingerprint操作依赖项。
|
| 361 |
|
| 362 |
2. 额外功能:
|
| 363 |
- 安装MCP服务器功能:
|
| 364 |
```bash
|
| 365 |
pip install "scrapling[ai]"
|
| 366 |
```
|
| 367 |
+
- 安装Shell功能(Web Scraping Shell和`extract`命令):
|
| 368 |
```bash
|
| 369 |
pip install "scrapling[shell]"
|
| 370 |
```
|
|
|
|
| 403 |
此项目包含改编自以下内容的代码:
|
| 404 |
- Parsel(BSD许可证)——用于[translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)子模块
|
| 405 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
---
|
| 407 |
+
<div align="center"><small>由Karim Shoair用❤️设计和制作。</small></div><br>
|
docs/README_DE.md
CHANGED
|
@@ -1,9 +1,14 @@
|
|
| 1 |
-
<
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
</
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
<p align="center">
|
| 8 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 9 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
@@ -24,46 +29,47 @@
|
|
| 24 |
</p>
|
| 25 |
|
| 26 |
<p align="center">
|
| 27 |
-
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
|
| 28 |
-
|
| 29 |
-
</a>
|
| 30 |
-
|
| 31 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 32 |
-
|
| 33 |
-
</a>
|
| 34 |
-
|
| 35 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 36 |
-
CLI
|
| 37 |
-
</a>
|
| 38 |
-
·
|
| 39 |
-
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
|
| 40 |
-
MCP-Modus
|
| 41 |
-
</a>
|
| 42 |
-
·
|
| 43 |
-
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
|
| 44 |
-
Migration von Beautifulsoup
|
| 45 |
-
</a>
|
| 46 |
</p>
|
| 47 |
|
| 48 |
-
|
| 49 |
|
| 50 |
-
|
| 51 |
|
| 52 |
-
|
| 53 |
|
| 54 |
```python
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
```
|
| 65 |
|
| 66 |
-
|
|
|
|
| 67 |
|
| 68 |
<!-- sponsors -->
|
| 69 |
|
|
@@ -87,12 +93,23 @@ Für das moderne Web entwickelt, bietet Scrapling **seine eigene schnelle Parsin
|
|
| 87 |
|
| 88 |
## Hauptmerkmale
|
| 89 |
|
| 90 |
-
###
|
| 91 |
-
- **
|
| 92 |
-
- **
|
| 93 |
-
- **
|
| 94 |
-
- **
|
| 95 |
-
- **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
### Adaptives Scraping & KI-Integration
|
| 98 |
- 🔄 **Intelligente Element-Verfolgung**: Elemente nach Website-Änderungen mit intelligenten Ähnlichkeitsalgorithmen neu lokalisieren.
|
|
@@ -106,103 +123,165 @@ Für das moderne Web entwickelt, bietet Scrapling **seine eigene schnelle Parsin
|
|
| 106 |
- ⚡ **Schnelle JSON-Serialisierung**: 10x schneller als die Standardbibliothek.
|
| 107 |
- 🏗️ **Praxiserprobt**: Scrapling hat nicht nur eine Testabdeckung von 92% und eine vollständige Type-Hints-Abdeckung, sondern wird seit dem letzten Jahr täglich von Hunderten von Web Scrapern verwendet.
|
| 108 |
|
| 109 |
-
### Entwickler/Web-Scraper-freundliche Erfahrung
|
| 110 |
- 🎯 **Interaktive Web-Scraping-Shell**: Optionale integrierte IPython-Shell mit Scrapling-Integration, Shortcuts und neuen Tools zur Beschleunigung der Web-Scraping-Skriptentwicklung, wie das Konvertieren von Curl-Anfragen in Scrapling-Anfragen und das Anzeigen von Anfrageergebnissen in Ihrem Browser.
|
| 111 |
- 🚀 **Direkt vom Terminal aus verwenden**: Optional können Sie Scrapling verwenden, um eine URL zu scrapen, ohne eine einzige Codezeile zu schreiben!
|
| 112 |
- 🛠️ **Umfangreiche Navigations-API**: Erweiterte DOM-Traversierung mit Eltern-, Geschwister- und Kind-Navigationsmethoden.
|
| 113 |
- 🧬 **Verbesserte Textverarbeitung**: Integrierte Regex, Bereinigungsmethoden und optimierte String-Operationen.
|
| 114 |
- 📝 **Automatische Selektorgenerierung**: Robuste CSS/XPath-Selektoren für jedes Element generieren.
|
| 115 |
- 🔌 **Vertraute API**: Ähnlich wie Scrapy/BeautifulSoup mit denselben Pseudo-Elementen, die in Scrapy/Parsel verwendet werden.
|
| 116 |
-
- 📘 **Vollständige Typabdeckung**: Vollständige Type Hints für hervorragende IDE-Unterstützung und Code-Vervollständigung.
|
| 117 |
- 🔋 **Fertiges Docker-Image**: Mit jeder Veröffentlichung wird automatisch ein Docker-Image erstellt und gepusht, das alle Browser enthält.
|
| 118 |
|
| 119 |
## Erste Schritte
|
| 120 |
|
|
|
|
|
|
|
| 121 |
### Grundlegende Verwendung
|
|
|
|
| 122 |
```python
|
| 123 |
-
from scrapling.fetchers import Fetcher,
|
| 124 |
-
from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
|
| 125 |
|
| 126 |
-
#
|
| 127 |
-
with FetcherSession(impersonate='chrome') as session: # Verwenden Sie die neueste Version von Chromes TLS-Fingerabdruck
|
| 128 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 129 |
-
quotes = page.css('.quote .text::text')
|
| 130 |
|
| 131 |
-
# Oder
|
| 132 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 133 |
-
quotes = page.css('.quote .text::text')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
|
| 136 |
-
with StealthySession(headless=True, solve_cloudflare=True) as session:
|
| 137 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 138 |
-
data = page.css('#padded_content a')
|
| 139 |
|
| 140 |
-
# Oder
|
| 141 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 142 |
-
data = page.css('#padded_content a')
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
```python
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
page
|
| 159 |
-
page.css('a', recursive=False) # Nur direkte Elemente
|
| 160 |
-
page.css('a', auto_save=True) # Elementpositionen automatisch speichern
|
| 161 |
-
|
| 162 |
-
# XPath
|
| 163 |
-
page.xpath('//a/text()')
|
| 164 |
-
|
| 165 |
-
# Flexible Suche
|
| 166 |
-
page.find_by_text('Python', first_match=True) # Nach Text suchen
|
| 167 |
-
page.find_by_regex(r'\d{4}') # Nach Regex-Muster suchen
|
| 168 |
-
page.find('div', {'class': 'container'}) # Nach Attributen suchen
|
| 169 |
-
|
| 170 |
-
# Navigation
|
| 171 |
-
element.parent # Elternelement abrufen
|
| 172 |
-
element.next_sibling # Nächstes Geschwister abrufen
|
| 173 |
-
element.children # Kindelemente abrufen
|
| 174 |
-
|
| 175 |
-
# Ähnliche Elemente
|
| 176 |
-
similar = page.get_similar(element) # Ähnliche Elemente finden
|
| 177 |
-
|
| 178 |
-
# Adaptives Scraping
|
| 179 |
-
saved_elements = page.css('.product', auto_save=True)
|
| 180 |
-
# Später, wenn sich die Website ändert:
|
| 181 |
-
page.css('.product', adaptive=True) # Elemente mithilfe gespeicherter Positionen finden
|
| 182 |
```
|
|
|
|
| 183 |
|
| 184 |
-
###
|
| 185 |
```python
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
with FetcherSession() as session:
|
| 190 |
-
|
| 191 |
-
page1 = session.get('https://quotes.toscrape.com/login')
|
| 192 |
-
page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'})
|
| 193 |
-
|
| 194 |
-
# Bei Bedarf Browser-Fingerabdruck wechseln
|
| 195 |
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 196 |
|
| 197 |
-
# Async-
|
| 198 |
async with AsyncStealthySession(max_pages=2) as session:
|
| 199 |
tasks = []
|
| 200 |
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 201 |
-
|
| 202 |
for url in urls:
|
| 203 |
task = session.fetch(url)
|
| 204 |
tasks.append(task)
|
| 205 |
-
|
| 206 |
print(session.get_pool_stats()) # Optional - Der Status des Browser-Tab-Pools (beschäftigt/frei/Fehler)
|
| 207 |
results = await asyncio.gather(*tasks)
|
| 208 |
print(session.get_pool_stats())
|
|
@@ -210,7 +289,7 @@ async with AsyncStealthySession(max_pages=2) as session:
|
|
| 210 |
|
| 211 |
## CLI & Interaktive Shell
|
| 212 |
|
| 213 |
-
Scrapling
|
| 214 |
|
| 215 |
[](https://asciinema.org/a/736339)
|
| 216 |
|
|
@@ -218,7 +297,7 @@ Interaktive Web-Scraping-Shell starten
|
|
| 218 |
```bash
|
| 219 |
scrapling shell
|
| 220 |
```
|
| 221 |
-
Seiten direkt ohne Programmierung in eine Datei extrahieren (
|
| 222 |
```bash
|
| 223 |
scrapling extract get 'https://example.com' content.md
|
| 224 |
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Alle Elemente, die dem CSS-Selektor '#fromSkipToProducts' entsprechen
|
|
@@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
|
|
| 227 |
```
|
| 228 |
|
| 229 |
> [!NOTE]
|
| 230 |
-
> Es gibt viele zusätzliche Funktionen, aber wir möchten diese Seite prägnant halten,
|
| 231 |
|
| 232 |
## Leistungsbenchmarks
|
| 233 |
|
| 234 |
-
Scrapling ist nicht nur leistungsstark
|
| 235 |
|
| 236 |
### Textextraktions-Geschwindigkeitstest (5000 verschachtelte Elemente)
|
| 237 |
|
| 238 |
-
| # | Bibliothek | Zeit (ms) | vs Scrapling |
|
| 239 |
|---|:-----------------:|:---------:|:------------:|
|
| 240 |
-
| 1 | Scrapling |
|
| 241 |
-
| 2 | Parsel/Scrapy | 2.
|
| 242 |
-
| 3 | Raw Lxml |
|
| 243 |
-
| 4 | PyQuery |
|
| 244 |
-
| 5 | Selectolax |
|
| 245 |
-
| 6 |
|
| 246 |
-
| 7 |
|
| 247 |
-
| 8 | BS4 with html5lib |
|
| 248 |
|
| 249 |
|
| 250 |
### Element-Ähnlichkeit & Textsuche-Leistung
|
|
@@ -253,8 +332,8 @@ Scraplings adaptive Element-Finding-Fähigkeiten übertreffen Alternativen deutl
|
|
| 253 |
|
| 254 |
| Bibliothek | Zeit (ms) | vs Scrapling |
|
| 255 |
|-------------|:---------:|:------------:|
|
| 256 |
-
| Scrapling | 2.
|
| 257 |
-
| AutoScraper |
|
| 258 |
|
| 259 |
|
| 260 |
> Alle Benchmarks stellen Durchschnittswerte von über 100 Durchläufen dar. Siehe [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) für die Methodik.
|
|
@@ -267,18 +346,18 @@ Scrapling erfordert Python 3.10 oder höher:
|
|
| 267 |
pip install scrapling
|
| 268 |
```
|
| 269 |
|
| 270 |
-
|
| 271 |
|
| 272 |
### Optionale Abhängigkeiten
|
| 273 |
|
| 274 |
1. Wenn Sie eine der folgenden zusätzlichen Funktionen, die Fetcher oder ihre Klassen verwenden möchten, müssen Sie die Abhängigkeiten der Fetcher und ihre Browser-Abhängigkeiten wie folgt installieren:
|
| 275 |
```bash
|
| 276 |
pip install "scrapling[fetchers]"
|
| 277 |
-
|
| 278 |
scrapling install
|
| 279 |
```
|
| 280 |
|
| 281 |
-
Dies lädt alle Browser zusammen mit ihren Systemabhängigkeiten und
|
| 282 |
|
| 283 |
2. Zusätzliche Funktionen:
|
| 284 |
- MCP-Server-Funktion installieren:
|
|
@@ -322,14 +401,7 @@ Diese Arbeit ist unter der BSD-3-Clause-Lizenz lizenziert.
|
|
| 322 |
## Danksagungen
|
| 323 |
|
| 324 |
Dieses Projekt enthält angepassten Code von:
|
| 325 |
-
- Parsel (BSD-Lizenz)
|
| 326 |
-
|
| 327 |
-
## Dank und Referenzen
|
| 328 |
-
|
| 329 |
-
- [Daijros](https://github.com/daijro) brillante Arbeit an [BrowserForge](https://github.com/daijro/browserforge) und [Camoufox](https://github.com/daijro/camoufox)
|
| 330 |
-
- [Vinyzus](https://github.com/Vinyzu) brillante Arbeit an [Botright](https://github.com/Vinyzu/Botright) und [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
|
| 331 |
-
- [brotector](https://github.com/kaliiiiiiiiii/brotector) für Browser-Erkennungs-Umgehungstechniken
|
| 332 |
-
- [fakebrowser](https://github.com/kkoooqq/fakebrowser) und [BotBrowser](https://github.com/botswin/BotBrowser) für Fingerprinting-Forschung
|
| 333 |
|
| 334 |
---
|
| 335 |
-
<div align="center"><small>Entworfen und hergestellt mit ❤️ von Karim Shoair.</small></div><br>
|
|
|
|
| 1 |
+
<h1 align="center">
|
| 2 |
+
<a href="https://scrapling.readthedocs.io">
|
| 3 |
+
<picture>
|
| 4 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 5 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 6 |
+
</picture>
|
| 7 |
+
</a>
|
| 8 |
+
<br>
|
| 9 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 10 |
+
</h1>
|
| 11 |
+
|
| 12 |
<p align="center">
|
| 13 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 14 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
|
|
| 29 |
</p>
|
| 30 |
|
| 31 |
<p align="center">
|
| 32 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Auswahlmethoden</strong></a>
|
| 33 |
+
·
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Einen Fetcher wählen</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCP-Modus</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>Migration von Beautifulsoup</strong></a>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
</p>
|
| 42 |
|
| 43 |
+
Scrapling ist ein adaptives Web-Scraping-Framework, das alles abdeckt -- von einer einzelnen Anfrage bis hin zu einem umfassenden Crawl.
|
| 44 |
|
| 45 |
+
Sein Parser lernt aus Website-Änderungen und lokalisiert Ihre Elemente automatisch neu, wenn sich Seiten aktualisieren. Seine Fetcher umgehen Anti-Bot-Systeme wie Cloudflare Turnstile direkt ab Werk. Und sein Spider-Framework ermöglicht es Ihnen, auf parallele Multi-Session-Crawls mit Pause & Resume und automatischer Proxy-Rotation hochzuskalieren -- alles in wenigen Zeilen Python. Eine Bibliothek, keine Kompromisse.
|
| 46 |
|
| 47 |
+
Blitzschnelle Crawls mit Echtzeit-Statistiken und Streaming. Von Web Scrapern für Web Scraper und normale Benutzer entwickelt, ist für jeden etwas dabei.
|
| 48 |
|
| 49 |
```python
|
| 50 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 51 |
+
StealthyFetcher.adaptive = True
|
| 52 |
+
page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Website unbemerkt abrufen!
|
| 53 |
+
products = page.css('.product', auto_save=True) # Daten scrapen, die Website-Designänderungen überleben!
|
| 54 |
+
products = page.css('.product', adaptive=True) # Später, wenn sich die Website-Struktur ändert, `adaptive=True` übergeben, um sie zu finden!
|
| 55 |
+
```
|
| 56 |
+
Oder auf vollständige Crawls hochskalieren
|
| 57 |
+
```python
|
| 58 |
+
from scrapling.spiders import Spider, Response
|
| 59 |
+
|
| 60 |
+
class MySpider(Spider):
|
| 61 |
+
name = "demo"
|
| 62 |
+
start_urls = ["https://example.com/"]
|
| 63 |
+
|
| 64 |
+
async def parse(self, response: Response):
|
| 65 |
+
for item in response.css('.product'):
|
| 66 |
+
yield {"title": item.css('h2::text').get()}
|
| 67 |
+
|
| 68 |
+
MySpider().start()
|
| 69 |
```
|
| 70 |
|
| 71 |
+
|
| 72 |
+
# Sponsoren
|
| 73 |
|
| 74 |
<!-- sponsors -->
|
| 75 |
|
|
|
|
| 93 |
|
| 94 |
## Hauptmerkmale
|
| 95 |
|
| 96 |
+
### Spiders -- Ein vollständiges Crawling-Framework
|
| 97 |
+
- 🕷️ **Scrapy-ähnliche Spider-API**: Definieren Sie Spiders mit `start_urls`, async `parse` Callbacks und `Request`/`Response`-Objekten.
|
| 98 |
+
- ⚡ **Paralleles Crawling**: Konfigurierbare Parallelitätslimits, domainbezogenes Throttling und Download-Verzögerungen.
|
| 99 |
+
- 🔄 **Multi-Session-Unterstützung**: Einheitliche Schnittstelle für HTTP-Anfragen und heimliche Headless-Browser in einem einzigen Spider -- leiten Sie Anfragen per ID an verschiedene Sessions weiter.
|
| 100 |
+
- 💾 **Pause & Resume**: Checkpoint-basierte Crawl-Persistenz. Drücken Sie Strg+C für ein kontrolliertes Herunterfahren; starten Sie neu, um dort fortzufahren, wo Sie aufgehört haben.
|
| 101 |
+
- 📡 **Streaming-Modus**: Gescrapte Elemente in Echtzeit streamen über `async for item in spider.stream()` mit Echtzeit-Statistiken -- ideal für UI, Pipelines und lang laufende Crawls.
|
| 102 |
+
- 🛡️ **Erkennung blockierter Anfragen**: Automatische Erkennung und Wiederholung blockierter Anfragen mit anpassbarer Logik.
|
| 103 |
+
- 📦 **Integrierter Export**: Ergebnisse über Hooks und Ihre eigene Pipeline oder den integrierten JSON/JSONL-Export mit `result.items.to_json()` / `result.items.to_jsonl()` exportieren.
|
| 104 |
+
|
| 105 |
+
### Erweitertes Website-Abrufen mit Session-Unterstützung
|
| 106 |
+
- **HTTP-Anfragen**: Schnelle und heimliche HTTP-Anfragen mit der `Fetcher`-Klasse. Kann Browser-TLS-Fingerprints und Header imitieren und HTTP/3 verwenden.
|
| 107 |
+
- **Dynamisches Laden**: Dynamische Websites mit vollständiger Browser-Automatisierung über die `DynamicFetcher`-Klasse abrufen, die Playwrights Chromium und Google Chrome unterstützt.
|
| 108 |
+
- **Anti-Bot-Umgehung**: Erweiterte Stealth-Fähigkeiten mit `StealthyFetcher` und Fingerprint-Spoofing. Kann alle Arten von Cloudflares Turnstile/Interstitial einfach mit Automatisierung umgehen.
|
| 109 |
+
- **Session-Verwaltung**: Persistente Session-Unterstützung mit den Klassen `FetcherSession`, `StealthySession` und `DynamicSession` für Cookie- und Zustandsverwaltung über Anfragen hinweg.
|
| 110 |
+
- **Proxy-Rotation**: Integrierter `ProxyRotator` mit zyklischen oder benutzerdefinierten Rotationsstrategien über alle Session-Typen hinweg, plus Proxy-Überschreibungen pro Anfrage.
|
| 111 |
+
- **Domain-Blockierung**: Anfragen an bestimmte Domains (und deren Subdomains) in browserbasierten Fetchern blockieren.
|
| 112 |
+
- **Async-Unterstützung**: Vollständige async-Unterstützung über alle Fetcher und dedizierte async Session-Klassen hinweg.
|
| 113 |
|
| 114 |
### Adaptives Scraping & KI-Integration
|
| 115 |
- 🔄 **Intelligente Element-Verfolgung**: Elemente nach Website-Änderungen mit intelligenten Ähnlichkeitsalgorithmen neu lokalisieren.
|
|
|
|
| 123 |
- ⚡ **Schnelle JSON-Serialisierung**: 10x schneller als die Standardbibliothek.
|
| 124 |
- 🏗️ **Praxiserprobt**: Scrapling hat nicht nur eine Testabdeckung von 92% und eine vollständige Type-Hints-Abdeckung, sondern wird seit dem letzten Jahr täglich von Hunderten von Web Scrapern verwendet.
|
| 125 |
|
| 126 |
+
### Entwickler-/Web-Scraper-freundliche Erfahrung
|
| 127 |
- 🎯 **Interaktive Web-Scraping-Shell**: Optionale integrierte IPython-Shell mit Scrapling-Integration, Shortcuts und neuen Tools zur Beschleunigung der Web-Scraping-Skriptentwicklung, wie das Konvertieren von Curl-Anfragen in Scrapling-Anfragen und das Anzeigen von Anfrageergebnissen in Ihrem Browser.
|
| 128 |
- 🚀 **Direkt vom Terminal aus verwenden**: Optional können Sie Scrapling verwenden, um eine URL zu scrapen, ohne eine einzige Codezeile zu schreiben!
|
| 129 |
- 🛠️ **Umfangreiche Navigations-API**: Erweiterte DOM-Traversierung mit Eltern-, Geschwister- und Kind-Navigationsmethoden.
|
| 130 |
- 🧬 **Verbesserte Textverarbeitung**: Integrierte Regex, Bereinigungsmethoden und optimierte String-Operationen.
|
| 131 |
- 📝 **Automatische Selektorgenerierung**: Robuste CSS/XPath-Selektoren für jedes Element generieren.
|
| 132 |
- 🔌 **Vertraute API**: Ähnlich wie Scrapy/BeautifulSoup mit denselben Pseudo-Elementen, die in Scrapy/Parsel verwendet werden.
|
| 133 |
+
- 📘 **Vollständige Typabdeckung**: Vollständige Type Hints für hervorragende IDE-Unterstützung und Code-Vervollständigung. Die gesamte Codebasis wird bei jeder Änderung automatisch mit **PyRight** und **MyPy** gescannt.
|
| 134 |
- 🔋 **Fertiges Docker-Image**: Mit jeder Veröffentlichung wird automatisch ein Docker-Image erstellt und gepusht, das alle Browser enthält.
|
| 135 |
|
| 136 |
## Erste Schritte
|
| 137 |
|
| 138 |
+
Hier ein kurzer Überblick über das, was Scrapling kann, ohne zu sehr ins Detail zu gehen.
|
| 139 |
+
|
| 140 |
### Grundlegende Verwendung
|
| 141 |
+
HTTP-Anfragen mit Session-Unterstützung
|
| 142 |
```python
|
| 143 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
|
|
|
| 144 |
|
| 145 |
+
with FetcherSession(impersonate='chrome') as session: # Neueste Version von Chromes TLS-Fingerprint verwenden
|
|
|
|
| 146 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 147 |
+
quotes = page.css('.quote .text::text').getall()
|
| 148 |
|
| 149 |
+
# Oder einmalige Anfragen verwenden
|
| 150 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 151 |
+
quotes = page.css('.quote .text::text').getall()
|
| 152 |
+
```
|
| 153 |
+
Erweiterter Stealth-Modus
|
| 154 |
+
```python
|
| 155 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 156 |
|
| 157 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # Browser offen halten, bis Sie fertig sind
|
|
|
|
| 158 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 159 |
+
data = page.css('#padded_content a').getall()
|
| 160 |
|
| 161 |
+
# Oder einmaligen Anfragenstil verwenden: öffnet den Browser für diese Anfrage und schließt ihn nach Abschluss
|
| 162 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 163 |
+
data = page.css('#padded_content a').getall()
|
| 164 |
+
```
|
| 165 |
+
Vollständige Browser-Automatisierung
|
| 166 |
+
```python
|
| 167 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 168 |
+
|
| 169 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Browser offen halten, bis Sie fertig sind
|
| 170 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 171 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # XPath-Selektor, falls bevorzugt
|
| 172 |
+
|
| 173 |
+
# Oder einmaligen Anfragenstil verwenden: öffnet den Browser für diese Anfrage und schließt ihn nach Abschluss
|
| 174 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 175 |
+
data = page.css('.quote .text::text').getall()
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
### Spiders
|
| 179 |
+
Vollständige Crawler mit parallelen Anfragen, mehreren Session-Typen und Pause & Resume erstellen:
|
| 180 |
+
```python
|
| 181 |
+
from scrapling.spiders import Spider, Request, Response
|
| 182 |
+
|
| 183 |
+
class QuotesSpider(Spider):
|
| 184 |
+
name = "quotes"
|
| 185 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 186 |
+
concurrent_requests = 10
|
| 187 |
+
|
| 188 |
+
async def parse(self, response: Response):
|
| 189 |
+
for quote in response.css('.quote'):
|
| 190 |
+
yield {
|
| 191 |
+
"text": quote.css('.text::text').get(),
|
| 192 |
+
"author": quote.css('.author::text').get(),
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
next_page = response.css('.next a')
|
| 196 |
+
if next_page:
|
| 197 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 198 |
+
|
| 199 |
+
result = QuotesSpider().start()
|
| 200 |
+
print(f"{len(result.items)} Zitate gescrapt")
|
| 201 |
+
result.items.to_json("quotes.json")
|
| 202 |
+
```
|
| 203 |
+
Mehrere Session-Typen in einem einzigen Spider verwenden:
|
| 204 |
+
```python
|
| 205 |
+
from scrapling.spiders import Spider, Request, Response
|
| 206 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 207 |
+
|
| 208 |
+
class MultiSessionSpider(Spider):
|
| 209 |
+
name = "multi"
|
| 210 |
+
start_urls = ["https://example.com/"]
|
| 211 |
+
|
| 212 |
+
def configure_sessions(self, manager):
|
| 213 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 214 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 215 |
+
|
| 216 |
+
async def parse(self, response: Response):
|
| 217 |
+
for link in response.css('a::attr(href)').getall():
|
| 218 |
+
# Geschützte Seiten über die Stealth-Session leiten
|
| 219 |
+
if "protected" in link:
|
| 220 |
+
yield Request(link, sid="stealth")
|
| 221 |
+
else:
|
| 222 |
+
yield Request(link, sid="fast", callback=self.parse) # Expliziter Callback
|
| 223 |
+
```
|
| 224 |
+
Lange Crawls mit Checkpoints pausieren und fortsetzen, indem Sie den Spider so starten:
|
| 225 |
+
```python
|
| 226 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
| 227 |
```
|
| 228 |
+
Drücken Sie Strg+C, um kontrolliert zu pausieren -- der Fortschritt wird automatisch gespeichert. Wenn Sie den Spider später erneut starten, übergeben Sie dasselbe `crawldir`, und er setzt dort fort, wo er aufgehört hat.
|
| 229 |
+
|
| 230 |
+
### Erweitertes Parsing & Navigation
|
| 231 |
+
```python
|
| 232 |
+
from scrapling.fetchers import Fetcher
|
| 233 |
|
| 234 |
+
# Umfangreiche Elementauswahl und Navigation
|
| 235 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 236 |
+
|
| 237 |
+
# Zitate mit verschiedenen Auswahlmethoden abrufen
|
| 238 |
+
quotes = page.css('.quote') # CSS-Selektor
|
| 239 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 240 |
+
quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup-Stil
|
| 241 |
+
# Gleich wie
|
| 242 |
+
quotes = page.find_all('div', class_='quote')
|
| 243 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 244 |
+
quotes = page.find_all(class_='quote') # und so weiter...
|
| 245 |
+
# Element nach Textinhalt finden
|
| 246 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 247 |
+
|
| 248 |
+
# Erweiterte Navigation
|
| 249 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 250 |
+
quote_text = page.css('.quote').css('.text::text').getall() # Verkettete Selektoren
|
| 251 |
+
first_quote = page.css('.quote')[0]
|
| 252 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 253 |
+
parent_container = first_quote.parent
|
| 254 |
+
|
| 255 |
+
# Elementbeziehungen und Ähnlichkeit
|
| 256 |
+
similar_elements = first_quote.find_similar()
|
| 257 |
+
below_elements = first_quote.below_elements()
|
| 258 |
+
```
|
| 259 |
+
Sie können den Parser direkt verwenden, wenn Sie keine Websites abrufen möchten, wie unten gezeigt:
|
| 260 |
```python
|
| 261 |
+
from scrapling.parser import Selector
|
| 262 |
+
|
| 263 |
+
page = Selector("<html>...</html>")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
```
|
| 265 |
+
Und es funktioniert genau auf die gleiche Weise!
|
| 266 |
|
| 267 |
+
### Beispiele für async Session-Verwaltung
|
| 268 |
```python
|
| 269 |
+
import asyncio
|
| 270 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 271 |
+
|
| 272 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession` ist kontextbewusst und kann sowohl in sync- als auch in async-Mustern arbeiten
|
| 273 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 275 |
|
| 276 |
+
# Async-Session-Verwendung
|
| 277 |
async with AsyncStealthySession(max_pages=2) as session:
|
| 278 |
tasks = []
|
| 279 |
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 280 |
+
|
| 281 |
for url in urls:
|
| 282 |
task = session.fetch(url)
|
| 283 |
tasks.append(task)
|
| 284 |
+
|
| 285 |
print(session.get_pool_stats()) # Optional - Der Status des Browser-Tab-Pools (beschäftigt/frei/Fehler)
|
| 286 |
results = await asyncio.gather(*tasks)
|
| 287 |
print(session.get_pool_stats())
|
|
|
|
| 289 |
|
| 290 |
## CLI & Interaktive Shell
|
| 291 |
|
| 292 |
+
Scrapling enthält eine leistungsstarke Befehlszeilenschnittstelle:
|
| 293 |
|
| 294 |
[](https://asciinema.org/a/736339)
|
| 295 |
|
|
|
|
| 297 |
```bash
|
| 298 |
scrapling shell
|
| 299 |
```
|
| 300 |
+
Seiten direkt ohne Programmierung in eine Datei extrahieren (extrahiert standardmäßig den Inhalt im `body`-Tag). Wenn die Ausgabedatei mit `.txt` endet, wird der Textinhalt des Ziels extrahiert. Wenn sie mit `.md` endet, ist es eine Markdown-Darstellung des HTML-Inhalts; wenn sie mit `.html` endet, ist es der HTML-Inhalt selbst.
|
| 301 |
```bash
|
| 302 |
scrapling extract get 'https://example.com' content.md
|
| 303 |
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Alle Elemente, die dem CSS-Selektor '#fromSkipToProducts' entsprechen
|
|
|
|
| 306 |
```
|
| 307 |
|
| 308 |
> [!NOTE]
|
| 309 |
+
> Es gibt viele zusätzliche Funktionen, aber wir möchten diese Seite prägnant halten, einschließlich des MCP-Servers und der interaktiven Web-Scraping-Shell. Schauen Sie sich die vollständige Dokumentation [hier](https://scrapling.readthedocs.io/en/latest/) an
|
| 310 |
|
| 311 |
## Leistungsbenchmarks
|
| 312 |
|
| 313 |
+
Scrapling ist nicht nur leistungsstark -- es ist auch blitzschnell. Die folgenden Benchmarks vergleichen Scraplings Parser mit den neuesten Versionen anderer beliebter Bibliotheken.
|
| 314 |
|
| 315 |
### Textextraktions-Geschwindigkeitstest (5000 verschachtelte Elemente)
|
| 316 |
|
| 317 |
+
| # | Bibliothek | Zeit (ms) | vs Scrapling |
|
| 318 |
|---|:-----------------:|:---------:|:------------:|
|
| 319 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 320 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01 |
|
| 321 |
+
| 3 | Raw Lxml | 2.54 | 1.257 |
|
| 322 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 323 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 324 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 325 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 326 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 327 |
|
| 328 |
|
| 329 |
### Element-Ähnlichkeit & Textsuche-Leistung
|
|
|
|
| 332 |
|
| 333 |
| Bibliothek | Zeit (ms) | vs Scrapling |
|
| 334 |
|-------------|:---------:|:------------:|
|
| 335 |
+
| Scrapling | 2.39 | 1.0x |
|
| 336 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 337 |
|
| 338 |
|
| 339 |
> Alle Benchmarks stellen Durchschnittswerte von über 100 Durchläufen dar. Siehe [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) für die Methodik.
|
|
|
|
| 346 |
pip install scrapling
|
| 347 |
```
|
| 348 |
|
| 349 |
+
Diese Installation enthält nur die Parser-Engine und ihre Abhängigkeiten, ohne Fetcher oder Kommandozeilenabhängigkeiten.
|
| 350 |
|
| 351 |
### Optionale Abhängigkeiten
|
| 352 |
|
| 353 |
1. Wenn Sie eine der folgenden zusätzlichen Funktionen, die Fetcher oder ihre Klassen verwenden möchten, müssen Sie die Abhängigkeiten der Fetcher und ihre Browser-Abhängigkeiten wie folgt installieren:
|
| 354 |
```bash
|
| 355 |
pip install "scrapling[fetchers]"
|
| 356 |
+
|
| 357 |
scrapling install
|
| 358 |
```
|
| 359 |
|
| 360 |
+
Dies lädt alle Browser zusammen mit ihren Systemabhängigkeiten und Fingerprint-Manipulationsabhängigkeiten herunter.
|
| 361 |
|
| 362 |
2. Zusätzliche Funktionen:
|
| 363 |
- MCP-Server-Funktion installieren:
|
|
|
|
| 401 |
## Danksagungen
|
| 402 |
|
| 403 |
Dieses Projekt enthält angepassten Code von:
|
| 404 |
+
- Parsel (BSD-Lizenz) -- Verwendet für das [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)-Submodul
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
|
| 406 |
---
|
| 407 |
+
<div align="center"><small>Entworfen und hergestellt mit ❤️ von Karim Shoair.</small></div><br>
|
docs/README_ES.md
CHANGED
|
@@ -1,9 +1,14 @@
|
|
| 1 |
-
<
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
</
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
<p align="center">
|
| 8 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 9 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
@@ -24,46 +29,47 @@
|
|
| 24 |
</p>
|
| 25 |
|
| 26 |
<p align="center">
|
| 27 |
-
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
|
| 28 |
-
|
| 29 |
-
</a>
|
| 30 |
-
|
| 31 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 32 |
-
|
| 33 |
-
</a>
|
| 34 |
-
|
| 35 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 36 |
-
CLI
|
| 37 |
-
</a>
|
| 38 |
-
·
|
| 39 |
-
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
|
| 40 |
-
Modo MCP
|
| 41 |
-
</a>
|
| 42 |
-
·
|
| 43 |
-
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
|
| 44 |
-
Migrar desde Beautifulsoup
|
| 45 |
-
</a>
|
| 46 |
</p>
|
| 47 |
|
| 48 |
-
|
| 49 |
|
| 50 |
-
|
| 51 |
|
| 52 |
-
|
| 53 |
|
| 54 |
```python
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
# ¡Obtén el
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
200
|
| 61 |
-
>> products = page.css('.product', auto_save=True) # ¡Extrae datos que sobreviven a cambios de diseño del sitio web!
|
| 62 |
-
>> # Más tarde, si la estructura del sitio web cambia, pasa `adaptive=True`
|
| 63 |
-
>> products = page.css('.product', adaptive=True) # ¡y Scrapling aún los encuentra!
|
| 64 |
```
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
<!-- sponsors -->
|
| 69 |
|
|
@@ -87,24 +93,35 @@ Construido para la Web moderna, Scrapling presenta **su propio motor de análisi
|
|
| 87 |
|
| 88 |
## Características Principales
|
| 89 |
|
| 90 |
-
###
|
| 91 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
- **Carga Dinámica**: Obtén sitios web dinámicos con automatización completa del navegador a través de la clase `DynamicFetcher` compatible con Chromium de Playwright y Google Chrome.
|
| 93 |
-
- **Evasión Anti-bot**: Capacidades de sigilo avanzadas con `StealthyFetcher` y falsificación de
|
| 94 |
-
- **Gestión de
|
|
|
|
|
|
|
| 95 |
- **Soporte Async**: Soporte async completo en todos los fetchers y clases de sesión async dedicadas.
|
| 96 |
|
| 97 |
### Scraping Adaptativo e Integración con IA
|
| 98 |
- 🔄 **Seguimiento Inteligente de Elementos**: Relocaliza elementos después de cambios en el sitio web usando algoritmos inteligentes de similitud.
|
| 99 |
- 🎯 **Selección Flexible Inteligente**: Selectores CSS, selectores XPath, búsqueda basada en filtros, búsqueda de texto, búsqueda regex y más.
|
| 100 |
- 🔍 **Encontrar Elementos Similares**: Localiza automáticamente elementos similares a los elementos encontrados.
|
| 101 |
-
- 🤖 **Servidor MCP para usar con IA**: Servidor MCP integrado para Web Scraping asistido por IA y extracción de datos. El servidor MCP presenta capacidades
|
| 102 |
|
| 103 |
### Arquitectura de Alto Rendimiento y Probada en Batalla
|
| 104 |
-
- 🚀 **Ultrarrápido**: Rendimiento optimizado que supera a la mayoría de las bibliotecas de
|
| 105 |
- 🔋 **Eficiente en Memoria**: Estructuras de datos optimizadas y carga diferida para una huella de memoria mínima.
|
| 106 |
- ⚡ **Serialización JSON Rápida**: 10 veces más rápido que la biblioteca estándar.
|
| 107 |
-
- 🏗️ **Probado en batalla**: Scrapling no solo tiene una cobertura de
|
| 108 |
|
| 109 |
### Experiencia Amigable para Desarrolladores/Web Scrapers
|
| 110 |
- 🎯 **Shell Interactivo de Web Scraping**: Shell IPython integrado opcional con integración de Scrapling, atajos y nuevas herramientas para acelerar el desarrollo de scripts de Web Scraping, como convertir solicitudes curl a solicitudes Scrapling y ver resultados de solicitudes en tu navegador.
|
|
@@ -113,96 +130,158 @@ Construido para la Web moderna, Scrapling presenta **su propio motor de análisi
|
|
| 113 |
- 🧬 **Procesamiento de Texto Mejorado**: Métodos integrados de regex, limpieza y operaciones de cadena optimizadas.
|
| 114 |
- 📝 **Generación Automática de Selectores**: Genera selectores CSS/XPath robustos para cualquier elemento.
|
| 115 |
- 🔌 **API Familiar**: Similar a Scrapy/BeautifulSoup con los mismos pseudo-elementos usados en Scrapy/Parsel.
|
| 116 |
-
- 📘 **Cobertura Completa de Tipos**: Type hints completos para excelente soporte de IDE y autocompletado de código.
|
| 117 |
- 🔋 **Imagen Docker Lista**: Con cada lanzamiento, se construye y publica automáticamente una imagen Docker que contiene todos los navegadores.
|
| 118 |
|
| 119 |
-
##
|
|
|
|
|
|
|
| 120 |
|
| 121 |
### Uso Básico
|
|
|
|
| 122 |
```python
|
| 123 |
-
from scrapling.fetchers import Fetcher,
|
| 124 |
-
from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
|
| 125 |
|
| 126 |
-
#
|
| 127 |
-
with FetcherSession(impersonate='chrome') as session: # Usa la última versión de la huella TLS de Chrome
|
| 128 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 129 |
-
quotes = page.css('.quote .text::text')
|
| 130 |
|
| 131 |
# O usa solicitudes de una sola vez
|
| 132 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 133 |
-
quotes = page.css('.quote .text::text')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
|
| 136 |
-
with StealthySession(headless=True, solve_cloudflare=True) as session:
|
| 137 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 138 |
-
data = page.css('#padded_content a')
|
| 139 |
|
| 140 |
# O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar
|
| 141 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 142 |
-
data = page.css('#padded_content a')
|
| 143 |
-
|
| 144 |
-
# Automatización completa del navegador (Mantén el navegador abierto hasta que termines)
|
| 145 |
-
with DynamicSession(headless=True) as session:
|
| 146 |
-
page = session.fetch('https://quotes.toscrape.com/', network_idle=True)
|
| 147 |
-
quotes = page.css('.quote .text::text')
|
| 148 |
-
|
| 149 |
-
# O usa el estilo de solicitud de una sola vez
|
| 150 |
-
page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True)
|
| 151 |
-
quotes = page.css('.quote .text::text')
|
| 152 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
```python
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
page
|
| 159 |
-
page.css('a', recursive=False) # Solo elementos directos
|
| 160 |
-
page.css('a', auto_save=True) # Guarda posiciones de los elementos automáticamente
|
| 161 |
-
|
| 162 |
-
# XPath
|
| 163 |
-
page.xpath('//a/text()')
|
| 164 |
-
|
| 165 |
-
# Búsqueda flexible
|
| 166 |
-
page.find_by_text('Python', first_match=True) # Encuentra por texto
|
| 167 |
-
page.find_by_regex(r'\d{4}') # Encuentra por patrón regex
|
| 168 |
-
page.find('div', {'class': 'container'}) # Encuentra por atributos
|
| 169 |
-
|
| 170 |
-
# Navegación
|
| 171 |
-
element.parent # Obtener elemento padre
|
| 172 |
-
element.next_sibling # Obtener siguiente hermano
|
| 173 |
-
element.children # Obtener hijos
|
| 174 |
-
|
| 175 |
-
# Elementos similares
|
| 176 |
-
similar = page.get_similar(element) # Encuentra elementos similares
|
| 177 |
-
|
| 178 |
-
# Scraping adaptativo
|
| 179 |
-
saved_elements = page.css('.product', auto_save=True)
|
| 180 |
-
# Más tarde, cuando el sitio web cambia:
|
| 181 |
-
page.css('.product', adaptive=True) # Encuentra elementos usando posiciones guardadas
|
| 182 |
```
|
|
|
|
| 183 |
|
| 184 |
-
###
|
| 185 |
```python
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
with FetcherSession() as session:
|
| 190 |
-
|
| 191 |
-
page1 = session.get('https://quotes.toscrape.com/login')
|
| 192 |
-
page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'})
|
| 193 |
-
|
| 194 |
-
# Cambiar fingerprint del navegador si es necesario
|
| 195 |
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 196 |
|
| 197 |
# Uso de sesión async
|
| 198 |
async with AsyncStealthySession(max_pages=2) as session:
|
| 199 |
tasks = []
|
| 200 |
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 201 |
-
|
| 202 |
for url in urls:
|
| 203 |
task = session.fetch(url)
|
| 204 |
tasks.append(task)
|
| 205 |
-
|
| 206 |
print(session.get_pool_stats()) # Opcional - El estado del pool de pestañas del navegador (ocupado/libre/error)
|
| 207 |
results = await asyncio.gather(*tasks)
|
| 208 |
print(session.get_pool_stats())
|
|
@@ -210,11 +289,11 @@ async with AsyncStealthySession(max_pages=2) as session:
|
|
| 210 |
|
| 211 |
## CLI y Shell Interactivo
|
| 212 |
|
| 213 |
-
Scrapling
|
| 214 |
|
| 215 |
[](https://asciinema.org/a/736339)
|
| 216 |
|
| 217 |
-
Lanzar
|
| 218 |
```bash
|
| 219 |
scrapling shell
|
| 220 |
```
|
|
@@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
|
|
| 227 |
```
|
| 228 |
|
| 229 |
> [!NOTE]
|
| 230 |
-
> Hay muchas características adicionales, pero queremos mantener esta página concisa,
|
| 231 |
|
| 232 |
## Benchmarks de Rendimiento
|
| 233 |
|
| 234 |
-
Scrapling no solo es
|
| 235 |
|
| 236 |
### Prueba de Velocidad de Extracción de Texto (5000 elementos anidados)
|
| 237 |
|
| 238 |
-
| # | Biblioteca | Tiempo (ms) | vs Scrapling |
|
| 239 |
|---|:-----------------:|:-----------:|:------------:|
|
| 240 |
-
| 1 | Scrapling |
|
| 241 |
-
| 2 | Parsel/Scrapy | 2.
|
| 242 |
-
| 3 | Raw Lxml |
|
| 243 |
-
| 4 | PyQuery |
|
| 244 |
-
| 5 | Selectolax |
|
| 245 |
-
| 6 |
|
| 246 |
-
| 7 |
|
| 247 |
-
| 8 | BS4 with html5lib |
|
| 248 |
|
| 249 |
|
| 250 |
### Rendimiento de Similitud de Elementos y Búsqueda de Texto
|
|
@@ -253,8 +332,8 @@ Las capacidades de búsqueda adaptativa de elementos de Scrapling superan signif
|
|
| 253 |
|
| 254 |
| Biblioteca | Tiempo (ms) | vs Scrapling |
|
| 255 |
|-------------|:-----------:|:------------:|
|
| 256 |
-
| Scrapling | 2.
|
| 257 |
-
| AutoScraper |
|
| 258 |
|
| 259 |
|
| 260 |
> Todos los benchmarks representan promedios de más de 100 ejecuciones. Ver [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) para la metodología.
|
|
@@ -267,29 +346,29 @@ Scrapling requiere Python 3.10 o superior:
|
|
| 267 |
pip install scrapling
|
| 268 |
```
|
| 269 |
|
| 270 |
-
|
| 271 |
|
| 272 |
### Dependencias Opcionales
|
| 273 |
|
| 274 |
1. Si vas a usar alguna de las características adicionales a continuación, los fetchers, o sus clases, necesitarás instalar las dependencias de los fetchers y sus dependencias del navegador de la siguiente manera:
|
| 275 |
```bash
|
| 276 |
pip install "scrapling[fetchers]"
|
| 277 |
-
|
| 278 |
scrapling install
|
| 279 |
```
|
| 280 |
|
| 281 |
-
Esto descarga todos los navegadores, junto con sus dependencias del sistema y dependencias de manipulación de
|
| 282 |
|
| 283 |
2. Características adicionales:
|
| 284 |
- Instalar la característica del servidor MCP:
|
| 285 |
```bash
|
| 286 |
pip install "scrapling[ai]"
|
| 287 |
```
|
| 288 |
-
- Instalar características del
|
| 289 |
```bash
|
| 290 |
pip install "scrapling[shell]"
|
| 291 |
```
|
| 292 |
-
- Instalar todo:
|
| 293 |
```bash
|
| 294 |
pip install "scrapling[all]"
|
| 295 |
```
|
|
@@ -324,12 +403,5 @@ Este trabajo está licenciado bajo la Licencia BSD-3-Clause.
|
|
| 324 |
Este proyecto incluye código adaptado de:
|
| 325 |
- Parsel (Licencia BSD)—Usado para el submódulo [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
|
| 326 |
|
| 327 |
-
## Agradecimientos y Referencias
|
| 328 |
-
|
| 329 |
-
- El brillante trabajo de [Daijro](https://github.com/daijro) en [BrowserForge](https://github.com/daijro/browserforge) y [Camoufox](https://github.com/daijro/camoufox)
|
| 330 |
-
- El brillante trabajo de [Vinyzu](https://github.com/Vinyzu) en [Botright](https://github.com/Vinyzu/Botright) y [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
|
| 331 |
-
- [brotector](https://github.com/kaliiiiiiiiii/brotector) por técnicas de evasión de detección de navegador
|
| 332 |
-
- [fakebrowser](https://github.com/kkoooqq/fakebrowser) y [BotBrowser](https://github.com/botswin/BotBrowser) por investigación de huellas digitales
|
| 333 |
-
|
| 334 |
---
|
| 335 |
-
<div align="center"><small>Diseñado y elaborado con ❤️ por Karim Shoair.</small></div><br>
|
|
|
|
| 1 |
+
<h1 align="center">
|
| 2 |
+
<a href="https://scrapling.readthedocs.io">
|
| 3 |
+
<picture>
|
| 4 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 5 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 6 |
+
</picture>
|
| 7 |
+
</a>
|
| 8 |
+
<br>
|
| 9 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 10 |
+
</h1>
|
| 11 |
+
|
| 12 |
<p align="center">
|
| 13 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 14 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
|
|
| 29 |
</p>
|
| 30 |
|
| 31 |
<p align="center">
|
| 32 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Métodos de selección</strong></a>
|
| 33 |
+
·
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Elegir un fetcher</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>Modo MCP</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>Migrar desde BeautifulSoup</strong></a>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
</p>
|
| 42 |
|
| 43 |
+
Scrapling es un framework de Web Scraping adaptativo que se encarga de todo, desde una sola solicitud hasta un rastreo a gran escala.
|
| 44 |
|
| 45 |
+
Su parser aprende de los cambios de los sitios web y relocaliza automáticamente tus elementos cuando las páginas se actualizan. Sus fetchers evaden sistemas anti-bot como Cloudflare Turnstile de forma nativa. Y su framework Spider te permite escalar a rastreos concurrentes con múltiples sesiones, con Pause & Resume y rotación automática de Proxy, todo en unas pocas líneas de Python. Una biblioteca, cero compromisos.
|
| 46 |
|
| 47 |
+
Rastreos ultrarrápidos con estadísticas en tiempo real y Streaming. Construido por Web Scrapers para Web Scrapers y usuarios regulares, hay algo para todos.
|
| 48 |
|
| 49 |
```python
|
| 50 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 51 |
+
StealthyFetcher.adaptive = True
|
| 52 |
+
page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # ¡Obtén el sitio web bajo el radar!
|
| 53 |
+
products = page.css('.product', auto_save=True) # ¡Extrae datos que sobreviven a cambios de diseño del sitio web!
|
| 54 |
+
products = page.css('.product', adaptive=True) # Más tarde, si la estructura del sitio web cambia, ¡pasa `adaptive=True` para encontrarlos!
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
```
|
| 56 |
+
O escala a rastreos completos
|
| 57 |
+
```python
|
| 58 |
+
from scrapling.spiders import Spider, Response
|
| 59 |
|
| 60 |
+
class MySpider(Spider):
|
| 61 |
+
name = "demo"
|
| 62 |
+
start_urls = ["https://example.com/"]
|
| 63 |
+
|
| 64 |
+
async def parse(self, response: Response):
|
| 65 |
+
for item in response.css('.product'):
|
| 66 |
+
yield {"title": item.css('h2::text').get()}
|
| 67 |
+
|
| 68 |
+
MySpider().start()
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# Patrocinadores
|
| 73 |
|
| 74 |
<!-- sponsors -->
|
| 75 |
|
|
|
|
| 93 |
|
| 94 |
## Características Principales
|
| 95 |
|
| 96 |
+
### Spiders — Un Framework Completo de Rastreo
|
| 97 |
+
- 🕷️ **API de Spider al estilo Scrapy**: Define spiders con `start_urls`, callbacks async `parse`, y objetos `Request`/`Response`.
|
| 98 |
+
- ⚡ **Rastreo Concurrente**: Límites de concurrencia configurables, limitación por dominio y retrasos de descarga.
|
| 99 |
+
- 🔄 **Soporte Multi-Session**: Interfaz unificada para solicitudes HTTP y navegadores headless sigilosos en un solo Spider — enruta solicitudes a diferentes sesiones por ID.
|
| 100 |
+
- 💾 **Pause & Resume**: Persistencia de rastreo basada en Checkpoint. Presiona Ctrl+C para un cierre ordenado; reinicia para continuar desde donde lo dejaste.
|
| 101 |
+
- 📡 **Modo Streaming**: Transmite elementos extraídos a medida que llegan con `async for item in spider.stream()` con estadísticas en tiempo real — ideal para UI, pipelines y rastreos de larga duración.
|
| 102 |
+
- 🛡️ **Detección de Solicitudes Bloqueadas**: Detección automática y reintento de solicitudes bloqueadas con lógica personalizable.
|
| 103 |
+
- 📦 **Exportación Integrada**: Exporta resultados a través de hooks y tu propio pipeline o el JSON/JSONL integrado con `result.items.to_json()` / `result.items.to_jsonl()` respectivamente.
|
| 104 |
+
|
| 105 |
+
### Obtención Avanzada de Sitios Web con Soporte de Session
|
| 106 |
+
- **Solicitudes HTTP**: Solicitudes HTTP rápidas y sigilosas con la clase `Fetcher`. Puede imitar el fingerprint TLS de los navegadores, encabezados y usar HTTP/3.
|
| 107 |
- **Carga Dinámica**: Obtén sitios web dinámicos con automatización completa del navegador a través de la clase `DynamicFetcher` compatible con Chromium de Playwright y Google Chrome.
|
| 108 |
+
- **Evasión Anti-bot**: Capacidades de sigilo avanzadas con `StealthyFetcher` y falsificación de fingerprint. Puede evadir fácilmente todos los tipos de Turnstile/Interstitial de Cloudflare con automatización.
|
| 109 |
+
- **Gestión de Session**: Soporte de sesión persistente con las clases `FetcherSession`, `StealthySession` y `DynamicSession` para la gestión de cookies y estado entre solicitudes.
|
| 110 |
+
- **Rotación de Proxy**: `ProxyRotator` integrado con estrategias de rotación cíclica o personalizadas en todos los tipos de sesión, además de sobrescrituras de Proxy por solicitud.
|
| 111 |
+
- **Bloqueo de Dominios**: Bloquea solicitudes a dominios específicos (y sus subdominios) en fetchers basados en navegador.
|
| 112 |
- **Soporte Async**: Soporte async completo en todos los fetchers y clases de sesión async dedicadas.
|
| 113 |
|
| 114 |
### Scraping Adaptativo e Integración con IA
|
| 115 |
- 🔄 **Seguimiento Inteligente de Elementos**: Relocaliza elementos después de cambios en el sitio web usando algoritmos inteligentes de similitud.
|
| 116 |
- 🎯 **Selección Flexible Inteligente**: Selectores CSS, selectores XPath, búsqueda basada en filtros, búsqueda de texto, búsqueda regex y más.
|
| 117 |
- 🔍 **Encontrar Elementos Similares**: Localiza automáticamente elementos similares a los elementos encontrados.
|
| 118 |
+
- 🤖 **Servidor MCP para usar con IA**: Servidor MCP integrado para Web Scraping asistido por IA y extracción de datos. El servidor MCP presenta capacidades potentes y personalizadas que aprovechan Scrapling para extraer contenido específico antes de pasarlo a la IA (Claude/Cursor/etc), acelerando así las operaciones y reduciendo costos al minimizar el uso de tokens. ([video demo](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 119 |
|
| 120 |
### Arquitectura de Alto Rendimiento y Probada en Batalla
|
| 121 |
+
- 🚀 **Ultrarrápido**: Rendimiento optimizado que supera a la mayoría de las bibliotecas de Web Scraping de Python.
|
| 122 |
- 🔋 **Eficiente en Memoria**: Estructuras de datos optimizadas y carga diferida para una huella de memoria mínima.
|
| 123 |
- ⚡ **Serialización JSON Rápida**: 10 veces más rápido que la biblioteca estándar.
|
| 124 |
+
- 🏗️ **Probado en batalla**: Scrapling no solo tiene una cobertura de pruebas del 92% y cobertura completa de type hints, sino que ha sido utilizado diariamente por cientos de Web Scrapers durante el último año.
|
| 125 |
|
| 126 |
### Experiencia Amigable para Desarrolladores/Web Scrapers
|
| 127 |
- 🎯 **Shell Interactivo de Web Scraping**: Shell IPython integrado opcional con integración de Scrapling, atajos y nuevas herramientas para acelerar el desarrollo de scripts de Web Scraping, como convertir solicitudes curl a solicitudes Scrapling y ver resultados de solicitudes en tu navegador.
|
|
|
|
| 130 |
- 🧬 **Procesamiento de Texto Mejorado**: Métodos integrados de regex, limpieza y operaciones de cadena optimizadas.
|
| 131 |
- 📝 **Generación Automática de Selectores**: Genera selectores CSS/XPath robustos para cualquier elemento.
|
| 132 |
- 🔌 **API Familiar**: Similar a Scrapy/BeautifulSoup con los mismos pseudo-elementos usados en Scrapy/Parsel.
|
| 133 |
+
- 📘 **Cobertura Completa de Tipos**: Type hints completos para excelente soporte de IDE y autocompletado de código. Todo el código fuente se escanea automáticamente con **PyRight** y **MyPy** en cada cambio.
|
| 134 |
- 🔋 **Imagen Docker Lista**: Con cada lanzamiento, se construye y publica automáticamente una imagen Docker que contiene todos los navegadores.
|
| 135 |
|
| 136 |
+
## Primeros Pasos
|
| 137 |
+
|
| 138 |
+
Aquí tienes un vistazo rápido de lo que Scrapling puede hacer sin entrar en profundidad.
|
| 139 |
|
| 140 |
### Uso Básico
|
| 141 |
+
Solicitudes HTTP con soporte de sesión
|
| 142 |
```python
|
| 143 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
|
|
|
| 144 |
|
| 145 |
+
with FetcherSession(impersonate='chrome') as session: # Usa la última versión del fingerprint TLS de Chrome
|
|
|
|
| 146 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 147 |
+
quotes = page.css('.quote .text::text').getall()
|
| 148 |
|
| 149 |
# O usa solicitudes de una sola vez
|
| 150 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 151 |
+
quotes = page.css('.quote .text::text').getall()
|
| 152 |
+
```
|
| 153 |
+
Modo sigiloso avanzado
|
| 154 |
+
```python
|
| 155 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 156 |
|
| 157 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # Mantén el navegador abierto hasta que termines
|
|
|
|
| 158 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 159 |
+
data = page.css('#padded_content a').getall()
|
| 160 |
|
| 161 |
# O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar
|
| 162 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 163 |
+
data = page.css('#padded_content a').getall()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
```
|
| 165 |
+
Automatización completa del navegador
|
| 166 |
+
```python
|
| 167 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 168 |
+
|
| 169 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Mantén el navegador abierto hasta que termines
|
| 170 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 171 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # Selector XPath si lo prefieres
|
| 172 |
|
| 173 |
+
# O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar
|
| 174 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 175 |
+
data = page.css('.quote .text::text').getall()
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
### Spiders
|
| 179 |
+
Construye rastreadores completos con solicitudes concurrentes, múltiples tipos de sesión y Pause & Resume:
|
| 180 |
+
```python
|
| 181 |
+
from scrapling.spiders import Spider, Request, Response
|
| 182 |
+
|
| 183 |
+
class QuotesSpider(Spider):
|
| 184 |
+
name = "quotes"
|
| 185 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 186 |
+
concurrent_requests = 10
|
| 187 |
+
|
| 188 |
+
async def parse(self, response: Response):
|
| 189 |
+
for quote in response.css('.quote'):
|
| 190 |
+
yield {
|
| 191 |
+
"text": quote.css('.text::text').get(),
|
| 192 |
+
"author": quote.css('.author::text').get(),
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
next_page = response.css('.next a')
|
| 196 |
+
if next_page:
|
| 197 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 198 |
+
|
| 199 |
+
result = QuotesSpider().start()
|
| 200 |
+
print(f"Se extrajeron {len(result.items)} citas")
|
| 201 |
+
result.items.to_json("quotes.json")
|
| 202 |
+
```
|
| 203 |
+
Usa múltiples tipos de sesión en un solo Spider:
|
| 204 |
+
```python
|
| 205 |
+
from scrapling.spiders import Spider, Request, Response
|
| 206 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 207 |
+
|
| 208 |
+
class MultiSessionSpider(Spider):
|
| 209 |
+
name = "multi"
|
| 210 |
+
start_urls = ["https://example.com/"]
|
| 211 |
+
|
| 212 |
+
def configure_sessions(self, manager):
|
| 213 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 214 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 215 |
+
|
| 216 |
+
async def parse(self, response: Response):
|
| 217 |
+
for link in response.css('a::attr(href)').getall():
|
| 218 |
+
# Enruta las páginas protegidas a través de la sesión sigilosa
|
| 219 |
+
if "protected" in link:
|
| 220 |
+
yield Request(link, sid="stealth")
|
| 221 |
+
else:
|
| 222 |
+
yield Request(link, sid="fast", callback=self.parse) # callback explícito
|
| 223 |
+
```
|
| 224 |
+
Pausa y reanuda rastreos largos con checkpoints ejecutando el Spider así:
|
| 225 |
+
```python
|
| 226 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
| 227 |
+
```
|
| 228 |
+
Presiona Ctrl+C para pausar de forma ordenada — el progreso se guarda automáticamente. Después, cuando inicies el Spider de nuevo, pasa el mismo `crawldir`, y continuará desde donde se detuvo.
|
| 229 |
+
|
| 230 |
+
### Análisis Avanzado y Navegación
|
| 231 |
+
```python
|
| 232 |
+
from scrapling.fetchers import Fetcher
|
| 233 |
+
|
| 234 |
+
# Selección rica de elementos y navegación
|
| 235 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 236 |
+
|
| 237 |
+
# Obtén citas con múltiples métodos de selección
|
| 238 |
+
quotes = page.css('.quote') # Selector CSS
|
| 239 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 240 |
+
quotes = page.find_all('div', {'class': 'quote'}) # Estilo BeautifulSoup
|
| 241 |
+
# Igual que
|
| 242 |
+
quotes = page.find_all('div', class_='quote')
|
| 243 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 244 |
+
quotes = page.find_all(class_='quote') # y así sucesivamente...
|
| 245 |
+
# Encuentra elementos por contenido de texto
|
| 246 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 247 |
+
|
| 248 |
+
# Navegación avanzada
|
| 249 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 250 |
+
quote_text = page.css('.quote').css('.text::text').getall() # Selectores encadenados
|
| 251 |
+
first_quote = page.css('.quote')[0]
|
| 252 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 253 |
+
parent_container = first_quote.parent
|
| 254 |
+
|
| 255 |
+
# Relaciones y similitud de elementos
|
| 256 |
+
similar_elements = first_quote.find_similar()
|
| 257 |
+
below_elements = first_quote.below_elements()
|
| 258 |
+
```
|
| 259 |
+
Puedes usar el parser directamente si no necesitas obtener sitios web, como se muestra a continuación:
|
| 260 |
```python
|
| 261 |
+
from scrapling.parser import Selector
|
| 262 |
+
|
| 263 |
+
page = Selector("<html>...</html>")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
```
|
| 265 |
+
¡Y funciona exactamente de la misma manera!
|
| 266 |
|
| 267 |
+
### Ejemplos de Gestión de Session Async
|
| 268 |
```python
|
| 269 |
+
import asyncio
|
| 270 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 271 |
+
|
| 272 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession` es consciente del contexto y puede funcionar tanto en patrones sync/async
|
| 273 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 275 |
|
| 276 |
# Uso de sesión async
|
| 277 |
async with AsyncStealthySession(max_pages=2) as session:
|
| 278 |
tasks = []
|
| 279 |
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 280 |
+
|
| 281 |
for url in urls:
|
| 282 |
task = session.fetch(url)
|
| 283 |
tasks.append(task)
|
| 284 |
+
|
| 285 |
print(session.get_pool_stats()) # Opcional - El estado del pool de pestañas del navegador (ocupado/libre/error)
|
| 286 |
results = await asyncio.gather(*tasks)
|
| 287 |
print(session.get_pool_stats())
|
|
|
|
| 289 |
|
| 290 |
## CLI y Shell Interactivo
|
| 291 |
|
| 292 |
+
Scrapling incluye una poderosa interfaz de línea de comandos:
|
| 293 |
|
| 294 |
[](https://asciinema.org/a/736339)
|
| 295 |
|
| 296 |
+
Lanzar el Shell interactivo de Web Scraping
|
| 297 |
```bash
|
| 298 |
scrapling shell
|
| 299 |
```
|
|
|
|
| 306 |
```
|
| 307 |
|
| 308 |
> [!NOTE]
|
| 309 |
+
> Hay muchas características adicionales, pero queremos mantener esta página concisa, incluyendo el servidor MCP y el Shell Interactivo de Web Scraping. Consulta la documentación completa [aquí](https://scrapling.readthedocs.io/en/latest/)
|
| 310 |
|
| 311 |
## Benchmarks de Rendimiento
|
| 312 |
|
| 313 |
+
Scrapling no solo es potente, también es ultrarrápido. Los siguientes benchmarks comparan el parser de Scrapling con las últimas versiones de otras bibliotecas populares.
|
| 314 |
|
| 315 |
### Prueba de Velocidad de Extracción de Texto (5000 elementos anidados)
|
| 316 |
|
| 317 |
+
| # | Biblioteca | Tiempo (ms) | vs Scrapling |
|
| 318 |
|---|:-----------------:|:-----------:|:------------:|
|
| 319 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 320 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01 |
|
| 321 |
+
| 3 | Raw Lxml | 2.54 | 1.257 |
|
| 322 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 323 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 324 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 325 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 326 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 327 |
|
| 328 |
|
| 329 |
### Rendimiento de Similitud de Elementos y Búsqueda de Texto
|
|
|
|
| 332 |
|
| 333 |
| Biblioteca | Tiempo (ms) | vs Scrapling |
|
| 334 |
|-------------|:-----------:|:------------:|
|
| 335 |
+
| Scrapling | 2.39 | 1.0x |
|
| 336 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 337 |
|
| 338 |
|
| 339 |
> Todos los benchmarks representan promedios de más de 100 ejecuciones. Ver [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) para la metodología.
|
|
|
|
| 346 |
pip install scrapling
|
| 347 |
```
|
| 348 |
|
| 349 |
+
Esta instalación solo incluye el motor de análisis y sus dependencias, sin ningún fetcher ni dependencias de línea de comandos.
|
| 350 |
|
| 351 |
### Dependencias Opcionales
|
| 352 |
|
| 353 |
1. Si vas a usar alguna de las características adicionales a continuación, los fetchers, o sus clases, necesitarás instalar las dependencias de los fetchers y sus dependencias del navegador de la siguiente manera:
|
| 354 |
```bash
|
| 355 |
pip install "scrapling[fetchers]"
|
| 356 |
+
|
| 357 |
scrapling install
|
| 358 |
```
|
| 359 |
|
| 360 |
+
Esto descarga todos los navegadores, junto con sus dependencias del sistema y dependencias de manipulación de fingerprint.
|
| 361 |
|
| 362 |
2. Características adicionales:
|
| 363 |
- Instalar la característica del servidor MCP:
|
| 364 |
```bash
|
| 365 |
pip install "scrapling[ai]"
|
| 366 |
```
|
| 367 |
+
- Instalar características del Shell (Shell de Web Scraping y el comando `extract`):
|
| 368 |
```bash
|
| 369 |
pip install "scrapling[shell]"
|
| 370 |
```
|
| 371 |
+
- Instalar todo:
|
| 372 |
```bash
|
| 373 |
pip install "scrapling[all]"
|
| 374 |
```
|
|
|
|
| 403 |
Este proyecto incluye código adaptado de:
|
| 404 |
- Parsel (Licencia BSD)—Usado para el submódulo [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
|
| 405 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
---
|
| 407 |
+
<div align="center"><small>Diseñado y elaborado con ❤️ por Karim Shoair.</small></div><br>
|
docs/README_JP.md
CHANGED
|
@@ -1,9 +1,14 @@
|
|
| 1 |
-
<
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
</
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
<p align="center">
|
| 8 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 9 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
@@ -24,46 +29,47 @@
|
|
| 24 |
</p>
|
| 25 |
|
| 26 |
<p align="center">
|
| 27 |
-
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
|
| 28 |
-
|
| 29 |
-
</a>
|
| 30 |
-
|
| 31 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 32 |
-
|
| 33 |
-
</a>
|
| 34 |
-
|
| 35 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 36 |
-
CLI
|
| 37 |
-
</a>
|
| 38 |
-
·
|
| 39 |
-
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
|
| 40 |
-
MCPモード
|
| 41 |
-
</a>
|
| 42 |
-
·
|
| 43 |
-
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
|
| 44 |
-
Beautifulsoupからの移行
|
| 45 |
-
</a>
|
| 46 |
</p>
|
| 47 |
|
| 48 |
-
|
| 49 |
|
| 50 |
-
|
| 51 |
|
| 52 |
-
|
| 53 |
|
| 54 |
```python
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
# レーダーの下でウェブサイト
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
```
|
| 65 |
|
| 66 |
-
|
|
|
|
| 67 |
|
| 68 |
<!-- sponsors -->
|
| 69 |
|
|
@@ -87,138 +93,211 @@ Scraplingは単なるウェブスクレイピングライブラリではあり
|
|
| 87 |
|
| 88 |
## 主な機能
|
| 89 |
|
| 90 |
-
###
|
| 91 |
-
- **
|
| 92 |
-
- **
|
| 93 |
-
- **
|
| 94 |
-
- **
|
| 95 |
-
- **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
### 適応型スクレイピングとAI統合
|
| 98 |
- 🔄 **スマート要素追跡**:インテリジェントな類似性アルゴリズムを使用してウェブサイトの変更後に要素を再配置。
|
| 99 |
- 🎯 **スマート柔軟選択**:CSSセレクタ、XPathセレクタ、フィルタベース検索、テキスト検索、正規表現検索など。
|
| 100 |
-
- 🔍 **類似要素
|
| 101 |
-
- 🤖 **AIと使用するMCPサーバー**:AI支援
|
| 102 |
|
| 103 |
### 高性能で実戦テスト済みのアーキテクチャ
|
| 104 |
-
- 🚀 **高速**:ほとんどのPythonスクレイピングライブラリを上回る最適化されたパフォーマンス。
|
| 105 |
- 🔋 **メモリ効率**:最小のメモリフットプリントのための最適化されたデータ構造と遅延読み込み。
|
| 106 |
- ⚡ **高速JSONシリアル化**:標準ライブラリの10倍の速度。
|
| 107 |
-
- 🏗️ **実戦テスト済み**:Scraplingは92%のテストカバレッジと完全な型ヒントカバレッジを備えているだけでなく、過去1年間に数百人の
|
| 108 |
|
| 109 |
-
### 開発者/
|
| 110 |
-
- 🎯 **インタラクティブ
|
| 111 |
- 🚀 **ターミナルから直接使用**:オプションで、コードを一行も書かずにScraplingを使用してURLをスクレイプできます!
|
| 112 |
- 🛠️ **豊富なナビゲーションAPI**:親、兄弟、子のナビゲーションメソッドによる高度なDOMトラバーサル。
|
| 113 |
- 🧬 **強化されたテキスト処理**:組み込みの正規表現、クリーニングメソッド、最適化された文字列操作。
|
| 114 |
- 📝 **自動セレクタ生成**:任意の要素に対して堅牢なCSS/XPathセレクタを生成。
|
| 115 |
-
- 🔌 **馴染みのあるAPI**:Scrapy/Parselで使用されている同じ疑似要素を持つScrapy/BeautifulSoupに似
|
| 116 |
-
- 📘 **完全な型カバレッジ**:優れたIDEサポートとコード補完のための完全な型ヒント。
|
| 117 |
- 🔋 **すぐに使えるDockerイメージ**:各リリースで、すべてのブラウザを含むDockerイメージが自動的にビルドおよびプッシュされます。
|
| 118 |
|
| 119 |
## はじめに
|
| 120 |
|
|
|
|
|
|
|
| 121 |
### 基本的な使い方
|
|
|
|
| 122 |
```python
|
| 123 |
-
from scrapling.fetchers import Fetcher,
|
| 124 |
-
from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
|
| 125 |
|
| 126 |
-
#
|
| 127 |
-
with FetcherSession(impersonate='chrome') as session: # ChromeのTLSフィンガープリントの最新バージョンを使用
|
| 128 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 129 |
-
quotes = page.css('.quote .text::text')
|
| 130 |
|
| 131 |
# または一回限りのリクエストを使用
|
| 132 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 133 |
-
quotes = page.css('.quote .text::text')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
#
|
| 136 |
-
with StealthySession(headless=True, solve_cloudflare=True) as session:
|
| 137 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 138 |
-
data = page.css('#padded_content a')
|
| 139 |
|
| 140 |
-
# または一回限りのリクエストスタイル
|
| 141 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 142 |
-
data = page.css('#padded_content a')
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
```python
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
page
|
| 159 |
-
page.css('a', recursive=False) # 直接の要素のみ
|
| 160 |
-
page.css('a', auto_save=True) # 要素の位置を自動保存
|
| 161 |
-
|
| 162 |
-
# XPath
|
| 163 |
-
page.xpath('//a/text()')
|
| 164 |
-
|
| 165 |
-
# 柔軟な検索
|
| 166 |
-
page.find_by_text('Python', first_match=True) # テキストで検索
|
| 167 |
-
page.find_by_regex(r'\d{4}') # 正規表現パターンで検索
|
| 168 |
-
page.find('div', {'class': 'container'}) # 属性で検索
|
| 169 |
-
|
| 170 |
-
# ナビゲーション
|
| 171 |
-
element.parent # 親要素を取得
|
| 172 |
-
element.next_sibling # 次の兄弟を取得
|
| 173 |
-
element.children # 子要素を取得
|
| 174 |
-
|
| 175 |
-
# 類似要素
|
| 176 |
-
similar = page.get_similar(element) # 類似要素を見つける
|
| 177 |
-
|
| 178 |
-
# 適応型スクレイピング
|
| 179 |
-
saved_elements = page.css('.product', auto_save=True)
|
| 180 |
-
# 後でウェブサイトが変更されたとき:
|
| 181 |
-
page.css('.product', adaptive=True) # 保存された位置を使用して要素を見つける
|
| 182 |
```
|
|
|
|
| 183 |
|
| 184 |
-
###
|
| 185 |
```python
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
with FetcherSession() as session:
|
| 190 |
-
|
| 191 |
-
page1 = session.get('https://quotes.toscrape.com/login')
|
| 192 |
-
page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'})
|
| 193 |
-
|
| 194 |
-
# 必要に応じてブラウザのフィンガープリントを切り替え
|
| 195 |
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 196 |
|
| 197 |
-
# 非同期
|
| 198 |
async with AsyncStealthySession(max_pages=2) as session:
|
| 199 |
tasks = []
|
| 200 |
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 201 |
-
|
| 202 |
for url in urls:
|
| 203 |
task = session.fetch(url)
|
| 204 |
tasks.append(task)
|
| 205 |
-
|
| 206 |
print(session.get_pool_stats()) # オプション - ブラウザタブプールのステータス(ビジー/フリー/エラー)
|
| 207 |
results = await asyncio.gather(*tasks)
|
| 208 |
print(session.get_pool_stats())
|
| 209 |
```
|
| 210 |
|
| 211 |
-
## CLIとインタラクティブ
|
| 212 |
|
| 213 |
-
Scrapling
|
| 214 |
|
| 215 |
[](https://asciinema.org/a/736339)
|
| 216 |
|
| 217 |
-
インタラクティブ
|
| 218 |
```bash
|
| 219 |
scrapling shell
|
| 220 |
```
|
| 221 |
-
プログラミングせずに直接ページをファイルに抽出(デフォルトで`body`タグ内のコンテンツを抽出)。出力ファイルが`.txt`で終わる場合、ターゲットのテキストコンテンツが抽出されます。`.md`で終わる場合、HTMLコンテンツのMarkdown表現になります
|
| 222 |
```bash
|
| 223 |
scrapling extract get 'https://example.com' content.md
|
| 224 |
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # CSSセレクタ'#fromSkipToProducts'に一致するすべての要素
|
|
@@ -227,34 +306,34 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
|
|
| 227 |
```
|
| 228 |
|
| 229 |
> [!NOTE]
|
| 230 |
-
> MCPサーバーやインタラクティブ
|
| 231 |
|
| 232 |
## パフォーマンスベンチマーク
|
| 233 |
|
| 234 |
-
Scraplingは強力であるだけでなく、
|
| 235 |
|
| 236 |
### テキスト抽出速度テスト(5000個のネストされた要素)
|
| 237 |
|
| 238 |
-
| # |
|
| 239 |
-
|---|:-----------------:|:-------:|:------------:|
|
| 240 |
-
| 1 | Scrapling |
|
| 241 |
-
| 2 | Parsel/Scrapy |
|
| 242 |
-
| 3 | Raw Lxml | 2.
|
| 243 |
-
| 4 | PyQuery |
|
| 244 |
-
| 5 | Selectolax |
|
| 245 |
-
| 6 |
|
| 246 |
-
| 7 |
|
| 247 |
-
| 8 | BS4 with html5lib |
|
| 248 |
|
| 249 |
|
| 250 |
### 要素類似性とテキスト検索のパフォーマンス
|
| 251 |
|
| 252 |
Scraplingの適応型要素検索機能は代替手段を大幅に上回ります:
|
| 253 |
|
| 254 |
-
| ライブラリ
|
| 255 |
-
|-------------|:------:|:------------:|
|
| 256 |
-
| Scrapling |
|
| 257 |
-
| AutoScraper |
|
| 258 |
|
| 259 |
|
| 260 |
> すべてのベンチマークは100回以上の実行の平均を表します。方法論については[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)を参照してください。
|
|
@@ -267,25 +346,25 @@ ScraplingにはPython 3.10以上が必要です:
|
|
| 267 |
pip install scrapling
|
| 268 |
```
|
| 269 |
|
| 270 |
-
|
| 271 |
|
| 272 |
### オプションの依存関係
|
| 273 |
|
| 274 |
-
1. 以下の追加機能、
|
| 275 |
```bash
|
| 276 |
pip install "scrapling[fetchers]"
|
| 277 |
-
|
| 278 |
scrapling install
|
| 279 |
```
|
| 280 |
|
| 281 |
-
これにより、すべてのブラウザ、およびそれらのシステム依存関係と
|
| 282 |
|
| 283 |
2. 追加機能:
|
| 284 |
- MCPサーバー機能をインストール:
|
| 285 |
```bash
|
| 286 |
pip install "scrapling[ai]"
|
| 287 |
```
|
| 288 |
-
-
|
| 289 |
```bash
|
| 290 |
pip install "scrapling[shell]"
|
| 291 |
```
|
|
@@ -324,12 +403,5 @@ docker pull ghcr.io/d4vinci/scrapling:latest
|
|
| 324 |
このプロジェクトには次から適応されたコードが含まれています:
|
| 325 |
- Parsel(BSDライセンス)— [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)サブモジュールに使用
|
| 326 |
|
| 327 |
-
## 感謝と参考文献
|
| 328 |
-
|
| 329 |
-
- [Daijro](https://github.com/daijro)の[BrowserForge](https://github.com/daijro/browserforge)と[Camoufox](https://github.com/daijro/camoufox)における素晴らしい仕事
|
| 330 |
-
- [Vinyzu](https://github.com/Vinyzu)の[Botright](https://github.com/Vinyzu/Botright)と[PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)における素晴らしい仕事
|
| 331 |
-
- ブラウザ検出回避技術を提供する[brotector](https://github.com/kaliiiiiiiiii/brotector)
|
| 332 |
-
- フィンガープリント研究を提供する[fakebrowser](https://github.com/kkoooqq/fakebrowser)と[BotBrowser](https://github.com/botswin/BotBrowser)
|
| 333 |
-
|
| 334 |
---
|
| 335 |
-
<div align="center"><small>Karim Shoairによって❤️でデザインおよび作成されました。</small></div><br>
|
|
|
|
| 1 |
+
<h1 align="center">
|
| 2 |
+
<a href="https://scrapling.readthedocs.io">
|
| 3 |
+
<picture>
|
| 4 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 5 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 6 |
+
</picture>
|
| 7 |
+
</a>
|
| 8 |
+
<br>
|
| 9 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 10 |
+
</h1>
|
| 11 |
+
|
| 12 |
<p align="center">
|
| 13 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 14 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
|
|
| 29 |
</p>
|
| 30 |
|
| 31 |
<p align="center">
|
| 32 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>選択メソッド</strong></a>
|
| 33 |
+
·
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Fetcherの選び方</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCPモード</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>Beautifulsoupからの移行</strong></a>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
</p>
|
| 42 |
|
| 43 |
+
Scraplingは、単一のリクエストから本格的なクロールまですべてを処理する適応型Web Scrapingフレームワークです。
|
| 44 |
|
| 45 |
+
そのパーサーはウェブサイトの変更から学習し、ページが更新されたときに要素を自動的に再配置します。Fetcherはすぐに使えるCloudflare Turnstileなどのアンチボットシステムを回避します。そしてSpiderフレームワークにより、Pause & Resumeや自動Proxy回転機能を備えた並行マルチSessionクロールへとスケールアップできます — すべてわずか数行のPythonで。1つのライブラリ、妥協なし。
|
| 46 |
|
| 47 |
+
リアルタイム統計とStreamingによる超高速クロール。Web Scraperによって、Web Scraperと一般ユーザーのために構築され、誰にでも何かがあります。
|
| 48 |
|
| 49 |
```python
|
| 50 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 51 |
+
StealthyFetcher.adaptive = True
|
| 52 |
+
page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # レーダーの下でウェブサイトを取得!
|
| 53 |
+
products = page.css('.product', auto_save=True) # ウェブサイトのデザイン変更に耐えるデータをスクレイプ!
|
| 54 |
+
products = page.css('.product', adaptive=True) # 後でウェブサイトの構造が変わったら、`adaptive=True`を渡して見つける!
|
| 55 |
+
```
|
| 56 |
+
または本格的なクロールへスケールアップ
|
| 57 |
+
```python
|
| 58 |
+
from scrapling.spiders import Spider, Response
|
| 59 |
+
|
| 60 |
+
class MySpider(Spider):
|
| 61 |
+
name = "demo"
|
| 62 |
+
start_urls = ["https://example.com/"]
|
| 63 |
+
|
| 64 |
+
async def parse(self, response: Response):
|
| 65 |
+
for item in response.css('.product'):
|
| 66 |
+
yield {"title": item.css('h2::text').get()}
|
| 67 |
+
|
| 68 |
+
MySpider().start()
|
| 69 |
```
|
| 70 |
|
| 71 |
+
|
| 72 |
+
# スポンサー
|
| 73 |
|
| 74 |
<!-- sponsors -->
|
| 75 |
|
|
|
|
| 93 |
|
| 94 |
## 主な機能
|
| 95 |
|
| 96 |
+
### Spider — 本格的なクロールフレームワーク
|
| 97 |
+
- 🕷️ **Scrapy風のSpider API**:`start_urls`、async `parse` callback、`Request`/`Response`オブジェクトでSpiderを定義。
|
| 98 |
+
- ⚡ **並行クロール**:設定可能な並行数制限、ドメインごとのスロットリング、ダウンロード遅延。
|
| 99 |
+
- 🔄 **マルチSessionサポート**:HTTPリクエストとステルスヘッドレスブラウザの統一インターフェース — IDによって異なるSessionにリクエストをルーティング。
|
| 100 |
+
- 💾 **Pause & Resume**:Checkpointベースのクロール永続化。Ctrl+Cで正常にシャットダウン;再起動すると中断したところから再開。
|
| 101 |
+
- 📡 **Streamingモード**:`async for item in spider.stream()`でリアルタイム統計とともにスクレイプされたアイテムをStreamingで受信 — UI、パイプライン、長時間実行クロールに最適。
|
| 102 |
+
- 🛡️ **ブロックされたリクエストの検出**:カスタマイズ可能なロジックによるブロックされたリクエストの自動検出とリトライ。
|
| 103 |
+
- 📦 **組み込みエクスポート**:フックや独自のパイプライン、または組み込みのJSON/JSONLで結果をエクスポート。それぞれ`result.items.to_json()` / `result.items.to_jsonl()`を使用。
|
| 104 |
+
|
| 105 |
+
### Sessionサポート付き高度なウェブサイト取得
|
| 106 |
+
- **HTTPリクエスト**:`Fetcher`クラスで高速かつステルスなHTTPリクエスト。ブラウザのTLS fingerprint、ヘッダーを模倣し、HTTP/3を使用可能。
|
| 107 |
+
- **動的読み込み**:PlaywrightのChromiumとGoogle Chromeをサポートする`DynamicFetcher`クラスによる完全なブラウザ自動化で動的ウェブサイトを取得。
|
| 108 |
+
- **アンチボット回避**:`StealthyFetcher`とfingerprint偽装による高度なステルス機能。自動化でCloudflareのTurnstile/Interstitialのすべてのタイプを簡単に回避。
|
| 109 |
+
- **Session管理**:リクエスト間でCookieと状態を管理するための`FetcherSession`、`StealthySession`、`DynamicSession`クラスによる永続的なSessionサポート。
|
| 110 |
+
- **Proxy回転**:すべてのSessionタイプに対応したラウンドロビンまたはカスタム戦略の組み込み`ProxyRotator`、さらにリクエストごとのProxyオーバーライド。
|
| 111 |
+
- **ドメインブロック**:ブラウザベースのFetcherで特定のドメイン(およびそのサブドメイン)へのリクエストをブロック。
|
| 112 |
+
- **asyncサポート**:すべてのFetcherおよび専用asyncSessionクラス全体での完全なasyncサポート。
|
| 113 |
|
| 114 |
### 適応型スクレイピングとAI統合
|
| 115 |
- 🔄 **スマート要素追跡**:インテリジェントな類似性アルゴリズムを使用してウェブサイトの変更後に要素を再配置。
|
| 116 |
- 🎯 **スマート柔軟選択**:CSSセレクタ、XPathセレクタ、フィルタベース検索、テキスト検索、正規表現検索など。
|
| 117 |
+
- 🔍 **類似要素の検出**:見つかった要素に類似した要素を自動的に特定。
|
| 118 |
+
- 🤖 **AIと使用するMCPサーバー**:AI支援Web Scrapingとデータ抽出のための組み込みMCPサーバー。MCPサーバーは、AI(Claude/Cursorなど)に渡す前にScraplingを活用してターゲットコンテンツを抽出する強力でカスタムな機能を備えており、操作を高速化し、トークン使用量を最小限に抑えることでコストを削減します。([デモ動画](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 119 |
|
| 120 |
### 高性能で実戦テスト済みのアーキテクチャ
|
| 121 |
+
- 🚀 **超高速**:ほとんどのPythonスクレイピングライブラリを上回る最適化されたパフォーマンス。
|
| 122 |
- 🔋 **メモリ効率**:最小のメモリフットプリントのための最適化されたデータ構造と遅延読み込み。
|
| 123 |
- ⚡ **高速JSONシリアル化**:標準ライブラリの10倍の速度。
|
| 124 |
+
- 🏗️ **実戦テスト済み**:Scraplingは92%のテストカバレッジと完全な型ヒントカバレッジを備えているだけでなく、過去1年間に数百人のWeb Scraperによって毎日使用されてきました。
|
| 125 |
|
| 126 |
+
### 開発者/Web Scraperにやさしい体験
|
| 127 |
+
- 🎯 **インタラクティブWeb Scraping Shell**:Scrapling統合、ショートカット、curlリクエストをScraplingリクエストに変換したり、ブラウザでリクエスト結果を表示したりするなどの新しいツールを備えたオプションの組み込みIPython Shellで、Web Scrapingスクリプトの開発を加速。
|
| 128 |
- 🚀 **ターミナルから直接使用**:オプションで、コードを一行も書かずにScraplingを使用してURLをスクレイプできます!
|
| 129 |
- 🛠️ **豊富なナビゲーションAPI**:親、兄弟、子のナビゲーションメソッドによる高度なDOMトラバーサル。
|
| 130 |
- 🧬 **強化されたテキスト処理**:組み込みの正規表現、クリーニングメソッド、最適化された文字列操作。
|
| 131 |
- 📝 **自動セレクタ生成**:任意の要素に対して堅牢なCSS/XPathセレクタを生成。
|
| 132 |
+
- 🔌 **馴染みのあるAPI**:Scrapy/Parselで使用されている同じ疑似要素を持つScrapy/BeautifulSoupに似た設計。
|
| 133 |
+
- 📘 **完全な型カバレッジ**:優れたIDEサポートとコード補完のための完全な型ヒント。コードベース全体が変更のたびに**PyRight**と**MyPy**で自動的にスキャンされます。
|
| 134 |
- 🔋 **すぐに使えるDockerイメージ**:各リリースで、すべてのブラウザを含むDockerイメージが自動的にビルドおよびプッシュされます。
|
| 135 |
|
| 136 |
## はじめに
|
| 137 |
|
| 138 |
+
深く掘り下げずに、Scraplingにできることの簡単な概要をお見せしましょう。
|
| 139 |
+
|
| 140 |
### 基本的な使い方
|
| 141 |
+
Sessionサポート付きHTTPリクエスト
|
| 142 |
```python
|
| 143 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
|
|
|
| 144 |
|
| 145 |
+
with FetcherSession(impersonate='chrome') as session: # ChromeのTLS fingerprintの最新バージョンを使用
|
|
|
|
| 146 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 147 |
+
quotes = page.css('.quote .text::text').getall()
|
| 148 |
|
| 149 |
# または一回限りのリクエストを使用
|
| 150 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 151 |
+
quotes = page.css('.quote .text::text').getall()
|
| 152 |
+
```
|
| 153 |
+
高度なステルスモード
|
| 154 |
+
```python
|
| 155 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 156 |
|
| 157 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # 完了するまでブラウザを開いたままにする
|
|
|
|
| 158 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 159 |
+
data = page.css('#padded_content a').getall()
|
| 160 |
|
| 161 |
+
# または一回限りのリクエストスタイル、このリクエストのためにブラウザを開き、完了後に閉じる
|
| 162 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 163 |
+
data = page.css('#padded_content a').getall()
|
| 164 |
+
```
|
| 165 |
+
完全なブラウザ自動化
|
| 166 |
+
```python
|
| 167 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 168 |
+
|
| 169 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # 完了するまでブラウザを開いたままにする
|
| 170 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 171 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # お好みであればXPathセレクタを使用
|
| 172 |
+
|
| 173 |
+
# または一回限りのリクエストスタイル、このリクエストのためにブラウザを開き、完了後に閉じる
|
| 174 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 175 |
+
data = page.css('.quote .text::text').getall()
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
### Spider
|
| 179 |
+
並行リクエスト、複数のSessionタイプ、Pause & Resumeを備えた本格的なクローラーを構築:
|
| 180 |
+
```python
|
| 181 |
+
from scrapling.spiders import Spider, Request, Response
|
| 182 |
+
|
| 183 |
+
class QuotesSpider(Spider):
|
| 184 |
+
name = "quotes"
|
| 185 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 186 |
+
concurrent_requests = 10
|
| 187 |
+
|
| 188 |
+
async def parse(self, response: Response):
|
| 189 |
+
for quote in response.css('.quote'):
|
| 190 |
+
yield {
|
| 191 |
+
"text": quote.css('.text::text').get(),
|
| 192 |
+
"author": quote.css('.author::text').get(),
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
next_page = response.css('.next a')
|
| 196 |
+
if next_page:
|
| 197 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 198 |
+
|
| 199 |
+
result = QuotesSpider().start()
|
| 200 |
+
print(f"{len(result.items)}件の引用をスクレイプしました")
|
| 201 |
+
result.items.to_json("quotes.json")
|
| 202 |
+
```
|
| 203 |
+
単一のSpiderで複数のSessionタイプを使用:
|
| 204 |
+
```python
|
| 205 |
+
from scrapling.spiders import Spider, Request, Response
|
| 206 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 207 |
+
|
| 208 |
+
class MultiSessionSpider(Spider):
|
| 209 |
+
name = "multi"
|
| 210 |
+
start_urls = ["https://example.com/"]
|
| 211 |
+
|
| 212 |
+
def configure_sessions(self, manager):
|
| 213 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 214 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 215 |
+
|
| 216 |
+
async def parse(self, response: Response):
|
| 217 |
+
for link in response.css('a::attr(href)').getall():
|
| 218 |
+
# 保護されたページはステルスSessionを通してルーティング
|
| 219 |
+
if "protected" in link:
|
| 220 |
+
yield Request(link, sid="stealth")
|
| 221 |
+
else:
|
| 222 |
+
yield Request(link, sid="fast", callback=self.parse) # 明示的なcallback
|
| 223 |
+
```
|
| 224 |
+
Checkpointを使用して長時間のクロールをPause & Resume:
|
| 225 |
+
```python
|
| 226 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
| 227 |
```
|
| 228 |
+
Ctrl+Cを押すと正常に一時停止し、進捗は自動的に保存されます。後でSpiderを再度起動する際に同じ`crawldir`を渡すと、中断したところから再開します。
|
| 229 |
+
|
| 230 |
+
### 高度なパースとナビゲーション
|
| 231 |
+
```python
|
| 232 |
+
from scrapling.fetchers import Fetcher
|
| 233 |
|
| 234 |
+
# 豊富な要素選択とナビゲーション
|
| 235 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 236 |
+
|
| 237 |
+
# 複数の選択メソッドで引用を取得
|
| 238 |
+
quotes = page.css('.quote') # CSSセレクタ
|
| 239 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 240 |
+
quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoupスタイル
|
| 241 |
+
# 以下と同じ
|
| 242 |
+
quotes = page.find_all('div', class_='quote')
|
| 243 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 244 |
+
quotes = page.find_all(class_='quote') # など...
|
| 245 |
+
# テキスト内容で要素を検索
|
| 246 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 247 |
+
|
| 248 |
+
# 高度なナビゲーション
|
| 249 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 250 |
+
quote_text = page.css('.quote').css('.text::text').getall() # チェーンセレクタ
|
| 251 |
+
first_quote = page.css('.quote')[0]
|
| 252 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 253 |
+
parent_container = first_quote.parent
|
| 254 |
+
|
| 255 |
+
# 要素の関連性と類似性
|
| 256 |
+
similar_elements = first_quote.find_similar()
|
| 257 |
+
below_elements = first_quote.below_elements()
|
| 258 |
+
```
|
| 259 |
+
ウェブサイトを取得せずにパーサーをすぐに使用することもできます:
|
| 260 |
```python
|
| 261 |
+
from scrapling.parser import Selector
|
| 262 |
+
|
| 263 |
+
page = Selector("<html>...</html>")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
```
|
| 265 |
+
まったく同じ方法で動作します!
|
| 266 |
|
| 267 |
+
### 非同期Session管理の例
|
| 268 |
```python
|
| 269 |
+
import asyncio
|
| 270 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 271 |
+
|
| 272 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession`はコンテキストアウェアで、同期/非同期両方のパターンで動作可能
|
| 273 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 275 |
|
| 276 |
+
# 非同期Sessionの使用
|
| 277 |
async with AsyncStealthySession(max_pages=2) as session:
|
| 278 |
tasks = []
|
| 279 |
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 280 |
+
|
| 281 |
for url in urls:
|
| 282 |
task = session.fetch(url)
|
| 283 |
tasks.append(task)
|
| 284 |
+
|
| 285 |
print(session.get_pool_stats()) # オプション - ブラウザタブプールのステータス(ビジー/フリー/エラー)
|
| 286 |
results = await asyncio.gather(*tasks)
|
| 287 |
print(session.get_pool_stats())
|
| 288 |
```
|
| 289 |
|
| 290 |
+
## CLIとインタラクティブShell
|
| 291 |
|
| 292 |
+
Scraplingには強力なコマンドラインインターフェースが含まれています:
|
| 293 |
|
| 294 |
[](https://asciinema.org/a/736339)
|
| 295 |
|
| 296 |
+
インタラクティブWeb Scraping Shellを起動
|
| 297 |
```bash
|
| 298 |
scrapling shell
|
| 299 |
```
|
| 300 |
+
プログラミングせずに直接ページをファイルに抽出(デフォルトで`body`タグ内のコンテンツを抽出)。出力ファイルが`.txt`で終わる場合、ターゲットのテキストコンテンツが抽出されます。`.md`で終わる場合、HTMLコンテンツのMarkdown表現になります。`.html`で終わる場合、HTMLコンテンツそのものになります。
|
| 301 |
```bash
|
| 302 |
scrapling extract get 'https://example.com' content.md
|
| 303 |
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # CSSセレクタ'#fromSkipToProducts'に一致するすべての要素
|
|
|
|
| 306 |
```
|
| 307 |
|
| 308 |
> [!NOTE]
|
| 309 |
+
> MCPサーバーやインタラクティブWeb Scraping Shellなど、他にも多くの追加機能がありますが、このページは簡潔に保ちたいと思います。完全なドキュメントは[こちら](https://scrapling.readthedocs.io/en/latest/)をご覧ください
|
| 310 |
|
| 311 |
## パフォーマンスベンチマーク
|
| 312 |
|
| 313 |
+
Scraplingは強力であるだけでなく、超高速です。以下のベンチマークは、Scraplingのパーサーを他の人気ライブラリの最新バージョンと比較しています。
|
| 314 |
|
| 315 |
### テキスト抽出速度テスト(5000個のネストされた要素)
|
| 316 |
|
| 317 |
+
| # | ライブラリ | 時間(ms) | vs Scrapling |
|
| 318 |
+
|---|:-----------------:|:---------:|:------------:|
|
| 319 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 320 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01 |
|
| 321 |
+
| 3 | Raw Lxml | 2.54 | 1.257 |
|
| 322 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 323 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 324 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 325 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 326 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 327 |
|
| 328 |
|
| 329 |
### 要素類似性とテキスト検索のパフォーマンス
|
| 330 |
|
| 331 |
Scraplingの適応型要素検索機能は代替手段を大幅に上回ります:
|
| 332 |
|
| 333 |
+
| ライブラリ | 時間(ms) | vs Scrapling |
|
| 334 |
+
|-------------|:---------:|:------------:|
|
| 335 |
+
| Scrapling | 2.39 | 1.0x |
|
| 336 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 337 |
|
| 338 |
|
| 339 |
> すべてのベンチマークは100回以上の実行の平均を表します。方法論については[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)を参照してください。
|
|
|
|
| 346 |
pip install scrapling
|
| 347 |
```
|
| 348 |
|
| 349 |
+
このインストールにはパーサーエンジンとその依存関係のみが含まれており、Fetcherやコマンドライン依存関係は含まれていません。
|
| 350 |
|
| 351 |
### オプションの依存関係
|
| 352 |
|
| 353 |
+
1. 以下の追加機能、Fetcher、またはそれらのクラスのいずれかを使用する場合は、Fetcherの依存関係とブラウザの依存関係を次のようにインストールする必要があります:
|
| 354 |
```bash
|
| 355 |
pip install "scrapling[fetchers]"
|
| 356 |
+
|
| 357 |
scrapling install
|
| 358 |
```
|
| 359 |
|
| 360 |
+
これにより、すべてのブラウザ、およびそれらのシステム依存関係とfingerprint操作依存関係がダウンロードされます。
|
| 361 |
|
| 362 |
2. 追加機能:
|
| 363 |
- MCPサーバー機能をインストール:
|
| 364 |
```bash
|
| 365 |
pip install "scrapling[ai]"
|
| 366 |
```
|
| 367 |
+
- Shell機能(Web Scraping Shellと`extract`コマンド)をインストール:
|
| 368 |
```bash
|
| 369 |
pip install "scrapling[shell]"
|
| 370 |
```
|
|
|
|
| 403 |
このプロジェクトには次から適応されたコードが含まれています:
|
| 404 |
- Parsel(BSDライセンス)— [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)サブモジュールに使用
|
| 405 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
---
|
| 407 |
+
<div align="center"><small>Karim Shoairによって❤️でデザインおよび作成されました。</small></div><br>
|
docs/README_RU.md
CHANGED
|
@@ -1,9 +1,14 @@
|
|
| 1 |
-
<
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
</
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
<p align="center">
|
| 8 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 9 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
@@ -24,46 +29,47 @@
|
|
| 24 |
</p>
|
| 25 |
|
| 26 |
<p align="center">
|
| 27 |
-
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
|
| 28 |
-
|
| 29 |
-
</a>
|
| 30 |
-
|
| 31 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 32 |
-
|
| 33 |
-
</a>
|
| 34 |
-
|
| 35 |
-
<a href="https://scrapling.readthedocs.io/en/latest/
|
| 36 |
-
CLI
|
| 37 |
-
</a>
|
| 38 |
-
·
|
| 39 |
-
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
|
| 40 |
-
Режим MCP
|
| 41 |
-
</a>
|
| 42 |
-
·
|
| 43 |
-
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
|
| 44 |
-
Миграция с Beautifulsoup
|
| 45 |
-
</a>
|
| 46 |
</p>
|
| 47 |
|
| 48 |
-
|
| 49 |
|
| 50 |
-
|
| 51 |
|
| 52 |
-
|
| 53 |
|
| 54 |
```python
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
#
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
```
|
| 65 |
|
| 66 |
-
|
|
|
|
| 67 |
|
| 68 |
<!-- sponsors -->
|
| 69 |
|
|
@@ -87,138 +93,211 @@ Scrapling - это не просто очередная библиотека д
|
|
| 87 |
|
| 88 |
## Ключевые особенности
|
| 89 |
|
| 90 |
-
###
|
| 91 |
-
- **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
- **Динамическая загрузка**: Загрузка динамических сайтов с полной автоматизацией браузера через класс `DynamicFetcher`, поддерживающий Chromium от Playwright и Google Chrome.
|
| 93 |
-
- **Обход анти-ботов**: Расширенные возможности скрытности с `StealthyFetcher` и подмену о
|
| 94 |
- **Управление сессиями**: Поддержка постоянных сессий с классами `FetcherSession`, `StealthySession` и `DynamicSession` для управления cookie и состоянием между запросами.
|
| 95 |
-
- **
|
|
|
|
|
|
|
| 96 |
|
| 97 |
### Адаптивный скрапинг и интеграция с ИИ
|
| 98 |
- 🔄 **Умное отслеживание элементов**: Перемещайте элементы после изменений сайта с помощью интеллектуальных алгоритмов подобия.
|
| 99 |
- 🎯 **Умный гибкий выбор**: CSS-селекторы, XPath-селекторы, поиск на основе фильтров, текстовый поиск, поиск по регулярным выражениям и многое другое.
|
| 100 |
-
- 🔍 **Поиск похожих элементов**: Автоматически находите элементы, похожие на найденные
|
| 101 |
-
- 🤖 **MCP-сервер для использования с ИИ**: Встроенный MCP-сервер для
|
| 102 |
|
| 103 |
### Высокопроизводительная и проверенная в боях архитектура
|
| 104 |
-
- 🚀 **Молниеносн
|
| 105 |
- 🔋 **Эффективное использование памяти**: Оптимизированные структуры данных и ленивая загрузка для минимального потребления памяти.
|
| 106 |
-
- ⚡ **Быстрая сериализация JSON**: В 10 раз быстрее
|
| 107 |
- 🏗️ **Проверено в боях**: Scrapling имеет не только 92% покрытия тестами и полное покрытие type hints, но и ежедневно использовался сотнями веб-скраперов в течение последнего года.
|
| 108 |
|
| 109 |
### Удобный для разработчиков/веб-скраперов опыт
|
| 110 |
-
- 🎯 **Интерактивная
|
| 111 |
- 🚀 **Используйте прямо из терминала**: При желании вы можете использовать Scrapling для скрапинга URL без написания ни одной строки кода!
|
| 112 |
- 🛠️ **Богатый API навигации**: Расширенный обход DOM с методами навигации по родителям, братьям и детям.
|
| 113 |
- 🧬 **Улучшенная обработка текста**: Встроенные регулярные выражения, методы очистки и оптимизированные операции со строками.
|
| 114 |
-
- 📝 **Автоматическая генерация селекторов**: Генерация над
|
| 115 |
- 🔌 **Знакомый API**: Похож на Scrapy/BeautifulSoup с теми же псевдоэлементами, используемыми в Scrapy/Parsel.
|
| 116 |
-
- 📘 **Полное покрытие типами**: Полные
|
| 117 |
-
- 🔋 **Готовый Docker-образ**: С каждым релизом автоматически созда
|
| 118 |
|
| 119 |
## Начало работы
|
| 120 |
|
|
|
|
|
|
|
| 121 |
### Базовое использование
|
|
|
|
| 122 |
```python
|
| 123 |
-
from scrapling.fetchers import Fetcher,
|
| 124 |
-
from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession
|
| 125 |
|
| 126 |
-
#
|
| 127 |
-
with FetcherSession(impersonate='chrome') as session: # Используйте последнюю версию TLS-отпечатка Chrome
|
| 128 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 129 |
-
quotes = page.css('.quote .text::text')
|
| 130 |
|
| 131 |
# Или используйте одноразовые запросы
|
| 132 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 133 |
-
quotes = page.css('.quote .text::text')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
|
| 136 |
-
with StealthySession(headless=True, solve_cloudflare=True) as session:
|
| 137 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 138 |
-
data = page.css('#padded_content a')
|
| 139 |
|
| 140 |
-
# Или используйте стиль одноразового запроса
|
| 141 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 142 |
-
data = page.css('#padded_content a')
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
```python
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
page
|
| 159 |
-
page.css('a', recursive=False) # Только прямые элементы
|
| 160 |
-
page.css('a', auto_save=True) # Автоматически сохранять позиции элементов
|
| 161 |
-
|
| 162 |
-
# XPath
|
| 163 |
-
page.xpath('//a/text()')
|
| 164 |
-
|
| 165 |
-
# Гибкий поиск
|
| 166 |
-
page.find_by_text('Python', first_match=True) # Найти по тексту
|
| 167 |
-
page.find_by_regex(r'\d{4}') # Найти по паттерну regex
|
| 168 |
-
page.find('div', {'class': 'container'}) # Найти по атрибутам
|
| 169 |
-
|
| 170 |
-
# Навигация
|
| 171 |
-
element.parent # Получить родительский элемент
|
| 172 |
-
element.next_sibling # Получить следующего брата
|
| 173 |
-
element.children # Получить дочерние элементы
|
| 174 |
-
|
| 175 |
-
# Похожие элементы
|
| 176 |
-
similar = page.get_similar(element) # Найти похожие элементы
|
| 177 |
-
|
| 178 |
-
# Адаптивный скрапинг
|
| 179 |
-
saved_elements = page.css('.product', auto_save=True)
|
| 180 |
-
# Позже, когда сайт изменится:
|
| 181 |
-
page.css('.product', adaptive=True) # Найти элементы используя сохраненные позиции
|
| 182 |
```
|
|
|
|
| 183 |
|
| 184 |
-
###
|
| 185 |
```python
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
with FetcherSession() as session:
|
| 190 |
-
|
| 191 |
-
page1 = session.get('https://quotes.toscrape.com/login')
|
| 192 |
-
page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'})
|
| 193 |
-
|
| 194 |
-
# При необходимости переключите отпечаток браузера
|
| 195 |
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 196 |
|
| 197 |
-
# Использование
|
| 198 |
async with AsyncStealthySession(max_pages=2) as session:
|
| 199 |
tasks = []
|
| 200 |
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 201 |
-
|
| 202 |
for url in urls:
|
| 203 |
task = session.fetch(url)
|
| 204 |
tasks.append(task)
|
| 205 |
-
|
| 206 |
-
print(session.get_pool_stats()) # Опционально
|
| 207 |
results = await asyncio.gather(*tasks)
|
| 208 |
print(session.get_pool_stats())
|
| 209 |
```
|
| 210 |
|
| 211 |
-
## CLI и интерактивная
|
| 212 |
|
| 213 |
-
Scrapling
|
| 214 |
|
| 215 |
[](https://asciinema.org/a/736339)
|
| 216 |
|
| 217 |
-
Запустить интерактивную
|
| 218 |
```bash
|
| 219 |
scrapling shell
|
| 220 |
```
|
| 221 |
-
Извлечь страницы в файл напрямую без программирования (
|
| 222 |
```bash
|
| 223 |
scrapling extract get 'https://example.com' content.md
|
| 224 |
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Все элементы, соответствующие CSS-селектору '#fromSkipToProducts'
|
|
@@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
|
|
| 227 |
```
|
| 228 |
|
| 229 |
> [!NOTE]
|
| 230 |
-
> Есть мно
|
| 231 |
|
| 232 |
## Тесты производительности
|
| 233 |
|
| 234 |
-
Scrapling не только мощный
|
| 235 |
|
| 236 |
### Тест скорости извлечения текста (5000 вложенных элементов)
|
| 237 |
|
| 238 |
-
| # | Библиотека | Время (мс) | vs Scrapling |
|
| 239 |
|---|:-----------------:|:----------:|:------------:|
|
| 240 |
-
| 1 | Scrapling |
|
| 241 |
-
| 2 | Parsel/Scrapy | 2.
|
| 242 |
-
| 3 | Raw Lxml | 2.
|
| 243 |
-
| 4 | PyQuery |
|
| 244 |
-
| 5 | Selectolax |
|
| 245 |
-
| 6 |
|
| 246 |
-
| 7 |
|
| 247 |
-
| 8 | BS4 with html5lib |
|
| 248 |
|
| 249 |
|
| 250 |
### Производительность подобия элементов и текстового поиска
|
|
@@ -253,8 +332,8 @@ Scrapling не только мощный - он также невероятно
|
|
| 253 |
|
| 254 |
| Библиотека | Время (мс) | vs Scrapling |
|
| 255 |
|-------------|:----------:|:------------:|
|
| 256 |
-
| Scrapling | 2.
|
| 257 |
-
| AutoScraper |
|
| 258 |
|
| 259 |
|
| 260 |
> Все тесты производительности представляют собой средние значения более 100 запусков. См. [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) для методологии.
|
|
@@ -267,33 +346,33 @@ Scrapling требует Python 3.10 или выше:
|
|
| 267 |
pip install scrapling
|
| 268 |
```
|
| 269 |
|
| 270 |
-
|
| 271 |
|
| 272 |
### Опциональные зависимости
|
| 273 |
|
| 274 |
-
1. Если вы собираетесь использовать какие-либо из дополнительных
|
| 275 |
```bash
|
| 276 |
pip install "scrapling[fetchers]"
|
| 277 |
-
|
| 278 |
scrapling install
|
| 279 |
```
|
| 280 |
|
| 281 |
-
Это загрузит все браузеры вместе с их системными зависимостями и зависимостями манипуляции
|
| 282 |
|
| 283 |
-
2. Дополнительные
|
| 284 |
- Установить функцию MCP-сервера:
|
| 285 |
```bash
|
| 286 |
pip install "scrapling[ai]"
|
| 287 |
```
|
| 288 |
-
- Установить функции
|
| 289 |
```bash
|
| 290 |
pip install "scrapling[shell]"
|
| 291 |
```
|
| 292 |
-
- Установить вс
|
| 293 |
```bash
|
| 294 |
pip install "scrapling[all]"
|
| 295 |
```
|
| 296 |
-
Помните, что вам нужно установить зависимости браузер
|
| 297 |
|
| 298 |
### Docker
|
| 299 |
Вы также можете установить Docker-образ со всеми дополнениями и браузерами с помощью следующей команды из DockerHub:
|
|
@@ -304,11 +383,11 @@ docker pull pyd4vinci/scrapling
|
|
| 304 |
```bash
|
| 305 |
docker pull ghcr.io/d4vinci/scrapling:latest
|
| 306 |
```
|
| 307 |
-
Этот образ автоматически созда
|
| 308 |
|
| 309 |
-
##
|
| 310 |
|
| 311 |
-
Мы приветствуем
|
| 312 |
|
| 313 |
## Отказ от ответственности
|
| 314 |
|
|
@@ -324,12 +403,5 @@ docker pull ghcr.io/d4vinci/scrapling:latest
|
|
| 324 |
Этот проект включает код, адаптированный из:
|
| 325 |
- Parsel (лицензия BSD) — Используется для подмодуля [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
|
| 326 |
|
| 327 |
-
## Благодарности и ссылки
|
| 328 |
-
|
| 329 |
-
- Блестящая работа [Daijro](https://github.com/daijro) над [BrowserForge](https://github.com/daijro/browserforge) и [Camoufox](https://github.com/daijro/camoufox)
|
| 330 |
-
- Блестящая работа [Vinyzu](https://github.com/Vinyzu) над [Botright](https://github.com/Vinyzu/Botright) и [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
|
| 331 |
-
- [brotector](https://github.com/kaliiiiiiiiii/brotector) за техники обхода обнаружения браузера
|
| 332 |
-
- [fakebrowser](https://github.com/kkoooqq/fakebrowser) и [BotBrowser](https://github.com/botswin/BotBrowser) за исследование отпечатков
|
| 333 |
-
|
| 334 |
---
|
| 335 |
-
<div align="center"><small>Разработано и создано с ❤️ Карим Шоаир.</small></div><br>
|
|
|
|
| 1 |
+
<h1 align="center">
|
| 2 |
+
<a href="https://scrapling.readthedocs.io">
|
| 3 |
+
<picture>
|
| 4 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 5 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 6 |
+
</picture>
|
| 7 |
+
</a>
|
| 8 |
+
<br>
|
| 9 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 10 |
+
</h1>
|
| 11 |
+
|
| 12 |
<p align="center">
|
| 13 |
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 14 |
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
|
|
| 29 |
</p>
|
| 30 |
|
| 31 |
<p align="center">
|
| 32 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Методы выбора</strong></a>
|
| 33 |
+
·
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Выбор Fetcher</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>Режим MCP</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/"><strong>Миграция с Beautifulsoup</strong></a>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
</p>
|
| 42 |
|
| 43 |
+
Scrapling — это адаптивный фреймворк для Web Scraping, который берёт на себя всё: от одного запроса до полномасштабного обхода сайтов.
|
| 44 |
|
| 45 |
+
Его парсер учится на изменениях сайтов и автоматически перемещает ваши элементы при обновлении страниц. Его Fetcher'ы обходят анти-бот системы вроде Cloudflare Turnstile прямо из коробки. А его Spider-фреймворк позволяет масштабироваться до параллельных, многосессионных обходов с Pause & Resume и автоматической ротацией Proxy — и всё это в нескольких строках Python. Одна библиотека, без компромиссов.
|
| 46 |
|
| 47 |
+
Молниеносно быстрые обходы с отслеживанием статистики в реальном времени и Streaming. Создано веб-скраперами для веб-скраперов и обычных пользователей — здесь есть что-то для каждого.
|
| 48 |
|
| 49 |
```python
|
| 50 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 51 |
+
StealthyFetcher.adaptive = True
|
| 52 |
+
page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Загрузите сайт незаметно!
|
| 53 |
+
products = page.css('.product', auto_save=True) # Скрапьте данные, которые переживут изменения дизайна сайта!
|
| 54 |
+
products = page.css('.product', adaptive=True) # Позже, если структура сайта изменится, передайте `adaptive=True`, чтобы найти их!
|
| 55 |
+
```
|
| 56 |
+
Или масштабируйте до полного обхода
|
| 57 |
+
```python
|
| 58 |
+
from scrapling.spiders import Spider, Response
|
| 59 |
+
|
| 60 |
+
class MySpider(Spider):
|
| 61 |
+
name = "demo"
|
| 62 |
+
start_urls = ["https://example.com/"]
|
| 63 |
+
|
| 64 |
+
async def parse(self, response: Response):
|
| 65 |
+
for item in response.css('.product'):
|
| 66 |
+
yield {"title": item.css('h2::text').get()}
|
| 67 |
+
|
| 68 |
+
MySpider().start()
|
| 69 |
```
|
| 70 |
|
| 71 |
+
|
| 72 |
+
# Спонсоры
|
| 73 |
|
| 74 |
<!-- sponsors -->
|
| 75 |
|
|
|
|
| 93 |
|
| 94 |
## Ключевые особенности
|
| 95 |
|
| 96 |
+
### Spider'ы — полноценный фреймворк для обхода сайтов
|
| 97 |
+
- 🕷️ **Scrapy-подобный Spider API**: Определяйте Spider'ов с `start_urls`, async `parse` callback'ами и объектами `Request`/`Response`.
|
| 98 |
+
- ⚡ **Параллельный обход**: Настраиваемые лимиты параллелизма, ограничение скорости по домену и задержки загрузки.
|
| 99 |
+
- 🔄 **Поддержка нескольких сессий**: Единый интерфейс для HTTP-запросов и скрытных headless-браузеров в одном Spider — маршрутизируйте запросы к разным сессиям по ID.
|
| 100 |
+
- 💾 **Pause & Resume**: Persistence обхода на основе Checkpoint'ов. Нажмите Ctrl+C для мягкой остановки; перезапустите, чтобы продолжить с того места, где вы остановились.
|
| 101 |
+
- 📡 **Режим Streaming**: Стримьте извлечённые элементы по мере их поступления через `async for item in spider.stream()` со статистикой в реальном времени — идеально для UI, конвейеров и длительных обходов.
|
| 102 |
+
- 🛡️ **Обнаружение заблокированных запросов**: Автоматическое обнаружение и повторная отправка заблокированных запросов с настраиваемой логикой.
|
| 103 |
+
- 📦 **Встроенный экспорт**: Экспортируйте результаты через хуки и собственный конвейер или встроенный JSON/JSONL с `result.items.to_json()` / `result.items.to_jsonl()` соответственно.
|
| 104 |
+
|
| 105 |
+
### Продвинутая загрузка сайтов с поддержкой Session
|
| 106 |
+
- **HTTP-запросы**: Быстрые и скрытные HTTP-запросы с классом `Fetcher`. Может имитировать TLS fingerprint браузера, заголовки и использовать HTTP/3.
|
| 107 |
- **Динамическая загрузка**: Загрузка динамических сайтов с полной автоматизацией браузера через класс `DynamicFetcher`, поддерживающий Chromium от Playwright и Google Chrome.
|
| 108 |
+
- **Обход анти-ботов**: Расширенные возможности скрытности с `StealthyFetcher` и подмену fingerprint'ов. Может легко обойти все типы Cloudflare Turnstile/Interstitial с помощью автоматизации.
|
| 109 |
- **Управление сессиями**: Поддержка постоянных сессий с классами `FetcherSession`, `StealthySession` и `DynamicSession` для управления cookie и состоянием между запросами.
|
| 110 |
+
- **Ротация Proxy**: Встроенный `ProxyRotator` с циклической или пользовательскими стратегиями для всех типов сессий, а также переопределение Proxy для каждого запроса.
|
| 111 |
+
- **Блокировка доменов**: Блокируйте запросы к определённым доменам (и их поддоменам) в браузерных Fetcher'ах.
|
| 112 |
+
- **Поддержка async**: Полная async-поддержка во всех Fetcher'ах и выделенных async-классах сессий.
|
| 113 |
|
| 114 |
### Адаптивный скрапинг и интеграция с ИИ
|
| 115 |
- 🔄 **Умное отслеживание элементов**: Перемещайте элементы после изменений сайта с помощью интеллектуальных алгоритмов подобия.
|
| 116 |
- 🎯 **Умный гибкий выбор**: CSS-селекторы, XPath-селекторы, поиск на основе фильтров, текстовый поиск, поиск по регулярным выражениям и многое другое.
|
| 117 |
+
- 🔍 **Поиск похожих элементов**: Автоматически находите элементы, похожие на найденные.
|
| 118 |
+
- 🤖 **MCP-сервер для использования с ИИ**: Встроенный MCP-сервер для Web Scraping с помощью ИИ и извлечения данных. MCP-сервер обладает мощными пользовательскими возможностями, которые используют Scrapling для извлечения целевого контента перед передачей его ИИ (Claude/Cursor/и т.д.), тем самым ускоряя операции и снижая затраты за счёт минимизации использования токенов. ([демо-видео](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 119 |
|
| 120 |
### Высокопроизводительная и проверенная в боях архитектура
|
| 121 |
+
- 🚀 **Молниеносная скорость**: Оптимизированная производительность, превосходящая большинство Python-библиотек для скрапинга.
|
| 122 |
- 🔋 **Эффективное использование памяти**: Оптимизированные структуры данных и ленивая загрузка для минимального потребления памяти.
|
| 123 |
+
- ⚡ **Быстрая сериализация JSON**: В 10 раз быстрее стандартной библиотеки.
|
| 124 |
- 🏗️ **Проверено в боях**: Scrapling имеет не только 92% покрытия тестами и полное покрытие type hints, но и ежедневно использовался сотнями веб-скраперов в течение последнего года.
|
| 125 |
|
| 126 |
### Удобный для разработчиков/веб-скраперов опыт
|
| 127 |
+
- 🎯 **Интерактивная Web Scraping Shell**: Опциональная встроенная IPython-оболочка с интеграцией Scrapling, ярлыками и новыми инструментами для ускорения разработки скриптов Web Scraping, такими как преобразование curl-запросов в запросы Scrapling и просмотр результатов запросов в браузере.
|
| 128 |
- 🚀 **Используйте прямо из терминала**: При желании вы можете использовать Scrapling для скрапинга URL без написания ни одной строки кода!
|
| 129 |
- 🛠️ **Богатый API навигации**: Расширенный обход DOM с методами навигации по родителям, братьям и детям.
|
| 130 |
- 🧬 **Улучшенная обработка текста**: Встроенные регулярные выражения, методы очистки и оптимизированные операции со строками.
|
| 131 |
+
- 📝 **Автоматическая генерация селекторов**: Генерация надёжных CSS/XPath-селекторов для любого элемента.
|
| 132 |
- 🔌 **Знакомый API**: Похож на Scrapy/BeautifulSoup с теми же псевдоэлементами, используемыми в Scrapy/Parsel.
|
| 133 |
+
- 📘 **Полное покрытие типами**: Полные type hints для отличной поддержки IDE и автодополнения кода. Вся кодовая база автоматически проверяется **PyRight** и **MyPy** при каждом изменении.
|
| 134 |
+
- 🔋 **Готовый Docker-образ**: С каждым релизом автоматически создаётся и публикуется Docker-образ, содержащий все браузеры.
|
| 135 |
|
| 136 |
## Начало работы
|
| 137 |
|
| 138 |
+
Давайте кратко покажем, на что способен Scrapling, без глубокого погружения.
|
| 139 |
+
|
| 140 |
### Базовое использование
|
| 141 |
+
HTTP-запросы с поддержкой Session
|
| 142 |
```python
|
| 143 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
|
|
|
| 144 |
|
| 145 |
+
with FetcherSession(impersonate='chrome') as session: # Используйте последнюю версию TLS fingerprint Chrome
|
|
|
|
| 146 |
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 147 |
+
quotes = page.css('.quote .text::text').getall()
|
| 148 |
|
| 149 |
# Или используйте одноразовые запросы
|
| 150 |
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 151 |
+
quotes = page.css('.quote .text::text').getall()
|
| 152 |
+
```
|
| 153 |
+
Расширенный режим скрытности
|
| 154 |
+
```python
|
| 155 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 156 |
|
| 157 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # Держите браузер открытым, пока не закончите
|
|
|
|
| 158 |
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 159 |
+
data = page.css('#padded_content a').getall()
|
| 160 |
|
| 161 |
+
# Или используйте стиль одноразового запроса — открывает браузер для этого запроса, затем закрывает его после завершения
|
| 162 |
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 163 |
+
data = page.css('#padded_content a').getall()
|
| 164 |
+
```
|
| 165 |
+
Полная автоматизация браузера
|
| 166 |
+
```python
|
| 167 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 168 |
+
|
| 169 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Держите браузер открытым, пока не закончите
|
| 170 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 171 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # XPath-селектор, если вы предпочитаете его
|
| 172 |
+
|
| 173 |
+
# Или используйте стиль одноразового запроса — открывает браузер для этого запроса, затем закрывает его после завершения
|
| 174 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 175 |
+
data = page.css('.quote .text::text').getall()
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
### Spider'ы
|
| 179 |
+
Создавайте полноценные обходчики с параллельными запросами, несколькими типами сессий и Pause & Resume:
|
| 180 |
+
```python
|
| 181 |
+
from scrapling.spiders import Spider, Request, Response
|
| 182 |
+
|
| 183 |
+
class QuotesSpider(Spider):
|
| 184 |
+
name = "quotes"
|
| 185 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 186 |
+
concurrent_requests = 10
|
| 187 |
+
|
| 188 |
+
async def parse(self, response: Response):
|
| 189 |
+
for quote in response.css('.quote'):
|
| 190 |
+
yield {
|
| 191 |
+
"text": quote.css('.text::text').get(),
|
| 192 |
+
"author": quote.css('.author::text').get(),
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
next_page = response.css('.next a')
|
| 196 |
+
if next_page:
|
| 197 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 198 |
+
|
| 199 |
+
result = QuotesSpider().start()
|
| 200 |
+
print(f"Извлечено {len(result.items)} цитат")
|
| 201 |
+
result.items.to_json("quotes.json")
|
| 202 |
+
```
|
| 203 |
+
Используйте несколько типов сессий в одном Spider:
|
| 204 |
+
```python
|
| 205 |
+
from scrapling.spiders import Spider, Request, Response
|
| 206 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 207 |
+
|
| 208 |
+
class MultiSessionSpider(Spider):
|
| 209 |
+
name = "multi"
|
| 210 |
+
start_urls = ["https://example.com/"]
|
| 211 |
+
|
| 212 |
+
def configure_sessions(self, manager):
|
| 213 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 214 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 215 |
+
|
| 216 |
+
async def parse(self, response: Response):
|
| 217 |
+
for link in response.css('a::attr(href)').getall():
|
| 218 |
+
# Направляйте защищённые страницы через stealth-сессию
|
| 219 |
+
if "protected" in link:
|
| 220 |
+
yield Request(link, sid="stealth")
|
| 221 |
+
else:
|
| 222 |
+
yield Request(link, sid="fast", callback=self.parse) # явный callback
|
| 223 |
+
```
|
| 224 |
+
Приостанавливайте и возобновляйте длительные обходы с помощью Checkpoint'ов, запуская Spider следующим образом:
|
| 225 |
+
```python
|
| 226 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
| 227 |
```
|
| 228 |
+
Нажмите Ctrl+C для мягкой остановки — прогресс сохраняется автоматически. Позже, когда вы снова запустите Spider, передайте тот же `crawldir`, и он продолжит с того места, где остановился.
|
| 229 |
+
|
| 230 |
+
### Продвинутый парсинг и навигация
|
| 231 |
+
```python
|
| 232 |
+
from scrapling.fetchers import Fetcher
|
| 233 |
|
| 234 |
+
# Богатый выбор элементов и навигация
|
| 235 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 236 |
+
|
| 237 |
+
# Получение цитат различными методами выбора
|
| 238 |
+
quotes = page.css('.quote') # CSS-селектор
|
| 239 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 240 |
+
quotes = page.find_all('div', {'class': 'quote'}) # В стиле BeautifulSoup
|
| 241 |
+
# То же самое, что
|
| 242 |
+
quotes = page.find_all('div', class_='quote')
|
| 243 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 244 |
+
quotes = page.find_all(class_='quote') # и так далее...
|
| 245 |
+
# Найти элемент по текстовому содержимому
|
| 246 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 247 |
+
|
| 248 |
+
# Продвинутая навигация
|
| 249 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 250 |
+
quote_text = page.css('.quote').css('.text::text').getall() # Цепочка селекторов
|
| 251 |
+
first_quote = page.css('.quote')[0]
|
| 252 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 253 |
+
parent_container = first_quote.parent
|
| 254 |
+
|
| 255 |
+
# Связи элементов и подобие
|
| 256 |
+
similar_elements = first_quote.find_similar()
|
| 257 |
+
below_elements = first_quote.below_elements()
|
| 258 |
+
```
|
| 259 |
+
Вы можете использовать парсер напрямую, если не хотите загружать сайты, как показано ниже:
|
| 260 |
```python
|
| 261 |
+
from scrapling.parser import Selector
|
| 262 |
+
|
| 263 |
+
page = Selector("<html>...</html>")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
```
|
| 265 |
+
И он работает точно так же!
|
| 266 |
|
| 267 |
+
### Примеры async Session
|
| 268 |
```python
|
| 269 |
+
import asyncio
|
| 270 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 271 |
+
|
| 272 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession` контекстно-осведомлён и может работать как в sync, так и в async-режимах
|
| 273 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 275 |
|
| 276 |
+
# Использование async-сессии
|
| 277 |
async with AsyncStealthySession(max_pages=2) as session:
|
| 278 |
tasks = []
|
| 279 |
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 280 |
+
|
| 281 |
for url in urls:
|
| 282 |
task = session.fetch(url)
|
| 283 |
tasks.append(task)
|
| 284 |
+
|
| 285 |
+
print(session.get_pool_stats()) # Опционально — статус пула вкладок браузера (занят/свободен/ошибка)
|
| 286 |
results = await asyncio.gather(*tasks)
|
| 287 |
print(session.get_pool_stats())
|
| 288 |
```
|
| 289 |
|
| 290 |
+
## CLI и интерактивная Shell
|
| 291 |
|
| 292 |
+
Scrapling включает мощный интерфейс командной строки:
|
| 293 |
|
| 294 |
[](https://asciinema.org/a/736339)
|
| 295 |
|
| 296 |
+
Запустить интерактивную Web Scraping Shell
|
| 297 |
```bash
|
| 298 |
scrapling shell
|
| 299 |
```
|
| 300 |
+
Извлечь страницы в файл напрямую без программирования (по умолчанию извлекает содержимое внутри тега `body`). Если выходной файл заканчивается на `.txt`, будет извлечено текстовое содержимое цели. Если заканчивается на `.md`, это будет Markdown-представление HTML-содержимого; если заканчивается на `.html`, это будет само HTML-содержимое.
|
| 301 |
```bash
|
| 302 |
scrapling extract get 'https://example.com' content.md
|
| 303 |
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Все элементы, соответствующие CSS-селектору '#fromSkipToProducts'
|
|
|
|
| 306 |
```
|
| 307 |
|
| 308 |
> [!NOTE]
|
| 309 |
+
> Есть множество дополнительных возможностей, но мы хотим сохранить эту страницу краткой, включая MCP-сервер и интерактивную Web Scraping Shell. Ознакомьтесь с полной документацией [здесь](https://scrapling.readthedocs.io/en/latest/)
|
| 310 |
|
| 311 |
## Тесты производительности
|
| 312 |
|
| 313 |
+
Scrapling не только мощный — он ещё и невероятно быстрый. Следующие тесты производительности сравнивают парсер Scrapling с последними версиями других популярных библиотек.
|
| 314 |
|
| 315 |
### Тест скорости извлечения текста (5000 вложенных элементов)
|
| 316 |
|
| 317 |
+
| # | Библиотека | Время (мс) | vs Scrapling |
|
| 318 |
|---|:-----------------:|:----------:|:------------:|
|
| 319 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 320 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01 |
|
| 321 |
+
| 3 | Raw Lxml | 2.54 | 1.257 |
|
| 322 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 323 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 324 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 325 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 326 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 327 |
|
| 328 |
|
| 329 |
### Производительность подобия элементов и текстового поиска
|
|
|
|
| 332 |
|
| 333 |
| Библиотека | Время (мс) | vs Scrapling |
|
| 334 |
|-------------|:----------:|:------------:|
|
| 335 |
+
| Scrapling | 2.39 | 1.0x |
|
| 336 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 337 |
|
| 338 |
|
| 339 |
> Все тесты производительности представляют собой средние значения более 100 запусков. См. [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) для методологии.
|
|
|
|
| 346 |
pip install scrapling
|
| 347 |
```
|
| 348 |
|
| 349 |
+
Эта установка включает только движок парсера и его зависимости, без каких-либо Fetcher'ов или зависимостей командной строки.
|
| 350 |
|
| 351 |
### Опциональные зависимости
|
| 352 |
|
| 353 |
+
1. Если вы собираетесь использовать какие-либо из дополнительных возможностей ниже, Fetcher'ы или их классы, вам необходимо установить зависимости Fetcher'ов и браузеров следующим образом:
|
| 354 |
```bash
|
| 355 |
pip install "scrapling[fetchers]"
|
| 356 |
+
|
| 357 |
scrapling install
|
| 358 |
```
|
| 359 |
|
| 360 |
+
Это загрузит все браузеры вместе с их системными зависимостями и зависимостями для манипуляции fingerprint'ами.
|
| 361 |
|
| 362 |
+
2. Дополнительные возможности:
|
| 363 |
- Установить функцию MCP-сервера:
|
| 364 |
```bash
|
| 365 |
pip install "scrapling[ai]"
|
| 366 |
```
|
| 367 |
+
- Установить функции Shell (Web Scraping Shell и команда `extract`):
|
| 368 |
```bash
|
| 369 |
pip install "scrapling[shell]"
|
| 370 |
```
|
| 371 |
+
- Установить всё:
|
| 372 |
```bash
|
| 373 |
pip install "scrapling[all]"
|
| 374 |
```
|
| 375 |
+
Помните, что вам нужно установить зависимости браузеров с помощью `scrapling install` после любого из этих дополнений (если вы ещё этого не сделали)
|
| 376 |
|
| 377 |
### Docker
|
| 378 |
Вы также можете установить Docker-образ со всеми дополнениями и браузерами с помощью следующей команды из DockerHub:
|
|
|
|
| 383 |
```bash
|
| 384 |
docker pull ghcr.io/d4vinci/scrapling:latest
|
| 385 |
```
|
| 386 |
+
Этот образ автоматически создаётся и публикуется с помощью GitHub Actions и основной ветки репозитория.
|
| 387 |
|
| 388 |
+
## Участие в разработке
|
| 389 |
|
| 390 |
+
Мы приветствуем участие! Пожалуйста, прочитайте наши [руководства по участию в разработке](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) перед началом работы.
|
| 391 |
|
| 392 |
## Отказ от ответственности
|
| 393 |
|
|
|
|
| 403 |
Этот проект включает код, адаптированный из:
|
| 404 |
- Parsel (лицензия BSD) — Используется для подмодуля [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
|
| 405 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
---
|
| 407 |
+
<div align="center"><small>Разработано и создано с ❤️ Карим Шоаир.</small></div><br>
|
docs/ai/mcp-server.md
CHANGED
|
@@ -179,7 +179,7 @@ We will gradually go from simple prompts to more complex ones. We will use Claud
|
|
| 179 |
```
|
| 180 |
Use regular requests to scrape the main content from https://example.com and convert it to markdown format.
|
| 181 |
```
|
| 182 |
-
This tells Claude which tool to use here, so it doesn't have to guess. Sometimes it will start using normal requests on its own, and at other times, it will assume browsers are better suited for this website without any apparent reason. As a
|
| 183 |
|
| 184 |
2. **Targeted Data Extraction**
|
| 185 |
|
|
@@ -189,7 +189,7 @@ We will gradually go from simple prompts to more complex ones. We will use Claud
|
|
| 189 |
Get all product titles from https://shop.example.com using the CSS selector '.product-title'. If the request fails, retry up to 5 times every 10 seconds.
|
| 190 |
```
|
| 191 |
|
| 192 |
-
The server will extract only the elements matching your selector and return them as a structured list. Notice I told it to set the tool to try
|
| 193 |
|
| 194 |
3. **E-commerce Data Collection**
|
| 195 |
|
|
|
|
| 179 |
```
|
| 180 |
Use regular requests to scrape the main content from https://example.com and convert it to markdown format.
|
| 181 |
```
|
| 182 |
+
This tells Claude which tool to use here, so it doesn't have to guess. Sometimes it will start using normal requests on its own, and at other times, it will assume browsers are better suited for this website without any apparent reason. As a rule of thumb, you should always tell Claude which tool to use to save time and money and get consistent results.
|
| 183 |
|
| 184 |
2. **Targeted Data Extraction**
|
| 185 |
|
|
|
|
| 189 |
Get all product titles from https://shop.example.com using the CSS selector '.product-title'. If the request fails, retry up to 5 times every 10 seconds.
|
| 190 |
```
|
| 191 |
|
| 192 |
+
The server will extract only the elements matching your selector and return them as a structured list. Notice I told it to set the tool to try up to 5 times in case the website has connection issues, but the default setting should be fine for most cases.
|
| 193 |
|
| 194 |
3. **E-commerce Data Collection**
|
| 195 |
|
docs/api-reference/mcp-server.md
CHANGED
|
@@ -19,7 +19,7 @@ Or import the server class directly:
|
|
| 19 |
from scrapling.core.ai import ScraplingMCPServer
|
| 20 |
|
| 21 |
server = ScraplingMCPServer()
|
| 22 |
-
server.serve()
|
| 23 |
```
|
| 24 |
|
| 25 |
## Response Model
|
|
|
|
| 19 |
from scrapling.core.ai import ScraplingMCPServer
|
| 20 |
|
| 21 |
server = ScraplingMCPServer()
|
| 22 |
+
server.serve(http=False, host="0.0.0.0", port=8000)
|
| 23 |
```
|
| 24 |
|
| 25 |
## Response Model
|
docs/api-reference/proxy-rotation.md
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
search:
|
| 3 |
+
exclude: true
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Proxy Rotation
|
| 7 |
+
|
| 8 |
+
The `ProxyRotator` class provides thread-safe proxy rotation for any fetcher or session.
|
| 9 |
+
|
| 10 |
+
You can import it directly like below:
|
| 11 |
+
|
| 12 |
+
```python
|
| 13 |
+
from scrapling.fetchers import ProxyRotator
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## ::: scrapling.engines.toolbelt.proxy_rotation.ProxyRotator
|
| 17 |
+
handler: python
|
| 18 |
+
:docstring:
|
docs/api-reference/response.md
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
search:
|
| 3 |
+
exclude: true
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Response Class
|
| 7 |
+
|
| 8 |
+
The `Response` class wraps HTTP responses returned by all fetchers, providing access to status, headers, body, cookies, and a `Selector` for parsing.
|
| 9 |
+
|
| 10 |
+
You can import the `Response` class like below:
|
| 11 |
+
|
| 12 |
+
```python
|
| 13 |
+
from scrapling.engines.toolbelt.custom import Response
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## ::: scrapling.engines.toolbelt.custom.Response
|
| 17 |
+
handler: python
|
| 18 |
+
:docstring:
|
docs/api-reference/spiders.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
search:
|
| 3 |
+
exclude: true
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Spider Classes
|
| 7 |
+
|
| 8 |
+
Here's the reference information for the spider framework classes' parameters, attributes, and methods.
|
| 9 |
+
|
| 10 |
+
You can import them directly like below:
|
| 11 |
+
|
| 12 |
+
```python
|
| 13 |
+
from scrapling.spiders import Spider, Request, CrawlResult, SessionManager, Response
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## ::: scrapling.spiders.Spider
|
| 17 |
+
handler: python
|
| 18 |
+
:docstring:
|
| 19 |
+
|
| 20 |
+
## ::: scrapling.spiders.Request
|
| 21 |
+
handler: python
|
| 22 |
+
:docstring:
|
| 23 |
+
|
| 24 |
+
## Result Classes
|
| 25 |
+
|
| 26 |
+
## ::: scrapling.spiders.result.CrawlResult
|
| 27 |
+
handler: python
|
| 28 |
+
:docstring:
|
| 29 |
+
|
| 30 |
+
## ::: scrapling.spiders.result.CrawlStats
|
| 31 |
+
handler: python
|
| 32 |
+
:docstring:
|
| 33 |
+
|
| 34 |
+
## ::: scrapling.spiders.result.ItemList
|
| 35 |
+
handler: python
|
| 36 |
+
:docstring:
|
| 37 |
+
|
| 38 |
+
## Session Management
|
| 39 |
+
|
| 40 |
+
## ::: scrapling.spiders.session.SessionManager
|
| 41 |
+
handler: python
|
| 42 |
+
:docstring:
|
docs/benchmarks.md
CHANGED
|
@@ -1,21 +1,20 @@
|
|
| 1 |
# Performance Benchmarks
|
| 2 |
|
| 3 |
-
Scrapling isn't just powerful—it's also blazing fast
|
| 4 |
-
|
| 5 |
-
## Benchmark Results
|
| 6 |
|
| 7 |
### Text Extraction Speed Test (5000 nested elements)
|
| 8 |
|
| 9 |
| # | Library | Time (ms) | vs Scrapling |
|
| 10 |
|---|:-----------------:|:---------:|:------------:|
|
| 11 |
-
| 1 | Scrapling |
|
| 12 |
-
| 2 | Parsel/Scrapy | 2.
|
| 13 |
-
| 3 | Raw Lxml |
|
| 14 |
-
| 4 | PyQuery |
|
| 15 |
-
| 5 | Selectolax |
|
| 16 |
-
| 6 |
|
| 17 |
-
| 7 |
|
| 18 |
-
| 8 | BS4 with html5lib |
|
|
|
|
| 19 |
|
| 20 |
### Element Similarity & Text Search Performance
|
| 21 |
|
|
@@ -23,5 +22,7 @@ Scrapling's adaptive element finding capabilities significantly outperform alter
|
|
| 23 |
|
| 24 |
| Library | Time (ms) | vs Scrapling |
|
| 25 |
|-------------|:---------:|:------------:|
|
| 26 |
-
| Scrapling | 2.
|
| 27 |
-
| AutoScraper |
|
|
|
|
|
|
|
|
|
| 1 |
# Performance Benchmarks
|
| 2 |
|
| 3 |
+
Scrapling isn't just powerful—it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries.
|
|
|
|
|
|
|
| 4 |
|
| 5 |
### Text Extraction Speed Test (5000 nested elements)
|
| 6 |
|
| 7 |
| # | Library | Time (ms) | vs Scrapling |
|
| 8 |
|---|:-----------------:|:---------:|:------------:|
|
| 9 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 10 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01 |
|
| 11 |
+
| 3 | Raw Lxml | 2.54 | 1.257 |
|
| 12 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 13 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 14 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 15 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 16 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 17 |
+
|
| 18 |
|
| 19 |
### Element Similarity & Text Search Performance
|
| 20 |
|
|
|
|
| 22 |
|
| 23 |
| Library | Time (ms) | vs Scrapling |
|
| 24 |
|-------------|:---------:|:------------:|
|
| 25 |
+
| Scrapling | 2.39 | 1.0x |
|
| 26 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 27 |
+
|
| 28 |
+
> All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
|
docs/cli/extract-commands.md
CHANGED
|
@@ -4,12 +4,12 @@
|
|
| 4 |
|
| 5 |
The `scrapling extract` command lets you download and extract content from websites directly from your terminal without writing any code. Ideal for beginners, researchers, and anyone requiring rapid web data extraction.
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
|
| 14 |
|
| 15 |
## What is the Extract Command group?
|
|
@@ -280,7 +280,7 @@ We will go through each command in detail below.
|
|
| 280 |
-s, --css-selector TEXT CSS selector to extract specific content from the page. It returns all matches.
|
| 281 |
--wait-selector TEXT CSS selector to wait for before proceeding
|
| 282 |
--locale TEXT Specify user locale. Defaults to the system default locale.
|
| 283 |
-
--
|
| 284 |
--proxy TEXT Proxy URL in format "http://username:password@host:port"
|
| 285 |
-H, --extra-headers TEXT Extra headers in format "Key: Value" (can be used multiple times)
|
| 286 |
--help Show this message and exit.
|
|
@@ -320,8 +320,7 @@ We will go through each command in detail below.
|
|
| 320 |
--solve-cloudflare / --no-solve-cloudflare Solve Cloudflare challenges (default: False)
|
| 321 |
--allow-webgl / --block-webgl Allow WebGL (default: True)
|
| 322 |
--network-idle / --no-network-idle Wait for network idle (default: False)
|
| 323 |
-
--
|
| 324 |
-
--hide-canvas/--show-canvas Add noise to canvas operations (default: False)
|
| 325 |
--timeout INTEGER Timeout in milliseconds (default: 30000)
|
| 326 |
--wait INTEGER Additional wait time in milliseconds after page load (default: 0)
|
| 327 |
-s, --css-selector TEXT CSS selector to extract specific content from the page. It returns all matches.
|
|
|
|
| 4 |
|
| 5 |
The `scrapling extract` command lets you download and extract content from websites directly from your terminal without writing any code. Ideal for beginners, researchers, and anyone requiring rapid web data extraction.
|
| 6 |
|
| 7 |
+
!!! success "Prerequisites"
|
| 8 |
+
|
| 9 |
+
1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
|
| 10 |
+
2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
|
| 11 |
+
3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
|
| 12 |
+
4. You've completed or read at least one page from the fetchers section to use here for requests: [HTTP requests](../fetching/static.md), [Dynamic websites](../fetching/dynamic.md), or [Dynamic websites with hard protections](../fetching/stealthy.md).
|
| 13 |
|
| 14 |
|
| 15 |
## What is the Extract Command group?
|
|
|
|
| 280 |
-s, --css-selector TEXT CSS selector to extract specific content from the page. It returns all matches.
|
| 281 |
--wait-selector TEXT CSS selector to wait for before proceeding
|
| 282 |
--locale TEXT Specify user locale. Defaults to the system default locale.
|
| 283 |
+
--real-chrome/--no-real-chrome If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)
|
| 284 |
--proxy TEXT Proxy URL in format "http://username:password@host:port"
|
| 285 |
-H, --extra-headers TEXT Extra headers in format "Key: Value" (can be used multiple times)
|
| 286 |
--help Show this message and exit.
|
|
|
|
| 320 |
--solve-cloudflare / --no-solve-cloudflare Solve Cloudflare challenges (default: False)
|
| 321 |
--allow-webgl / --block-webgl Allow WebGL (default: True)
|
| 322 |
--network-idle / --no-network-idle Wait for network idle (default: False)
|
| 323 |
+
--real-chrome/--no-real-chrome If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)
|
|
|
|
| 324 |
--timeout INTEGER Timeout in milliseconds (default: 30000)
|
| 325 |
--wait INTEGER Additional wait time in milliseconds after page load (default: 0)
|
| 326 |
-s, --css-selector TEXT CSS selector to extract specific content from the page. It returns all matches.
|
docs/cli/interactive-shell.md
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
# Scrapling Interactive Shell Guide
|
| 2 |
|
| 3 |
-
<script src="https://asciinema.org/a/736339.js" id="asciicast-736339" async data-autoplay="1" data-loop="1" data-cols="225" data-rows="40" data-start-at="00:06" data-speed="1.5"></script>
|
| 4 |
|
| 5 |
**Powerful Web Scraping REPL for Developers and Data Scientists**
|
| 6 |
|
| 7 |
The Scrapling Interactive Shell is an enhanced IPython-based environment designed specifically for Web Scraping tasks. It provides instant access to all Scrapling features, clever shortcuts, automatic page management, and advanced tools, such as conversion of the curl command.
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
|
| 16 |
|
| 17 |
## Why use the Interactive Shell?
|
|
@@ -133,7 +133,7 @@ The shell provides a few functions to help you convert curl commands from the br
|
|
| 133 |
|
| 134 |
First, you need to copy a request as a curl command like the following:
|
| 135 |
|
| 136 |
-
<img src="../
|
| 137 |
|
| 138 |
- **Convert Curl command to Request Object**
|
| 139 |
|
|
@@ -174,7 +174,7 @@ The shell inherits all IPython capabilities:
|
|
| 174 |
>>> %save filename.py 1-10 # Save commands 1-10 to file
|
| 175 |
|
| 176 |
>>> # Tab completion works everywhere
|
| 177 |
-
>>> page.c<TAB> # Shows: css,
|
| 178 |
>>> Fetcher.<TAB> # Shows all Fetcher methods
|
| 179 |
|
| 180 |
>>> # Object inspection
|
|
|
|
| 1 |
# Scrapling Interactive Shell Guide
|
| 2 |
|
| 3 |
+
<script src="https://asciinema.org/a/736339.js" id="asciicast-736339" async data-autoplay="1" data-loop="1" data-cols="225" data-rows="40" data-start-at="00:06" data-speed="1.5" data-theme="tango"></script>
|
| 4 |
|
| 5 |
**Powerful Web Scraping REPL for Developers and Data Scientists**
|
| 6 |
|
| 7 |
The Scrapling Interactive Shell is an enhanced IPython-based environment designed specifically for Web Scraping tasks. It provides instant access to all Scrapling features, clever shortcuts, automatic page management, and advanced tools, such as conversion of the curl command.
|
| 8 |
|
| 9 |
+
!!! success "Prerequisites"
|
| 10 |
+
|
| 11 |
+
1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
|
| 12 |
+
2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
|
| 13 |
+
3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
|
| 14 |
+
4. You've completed or read at least one page from the fetchers section to use here for requests: [HTTP requests](../fetching/static.md), [Dynamic websites](../fetching/dynamic.md), or [Dynamic websites with hard protections](../fetching/stealthy.md).
|
| 15 |
|
| 16 |
|
| 17 |
## Why use the Interactive Shell?
|
|
|
|
| 133 |
|
| 134 |
First, you need to copy a request as a curl command like the following:
|
| 135 |
|
| 136 |
+
<img src="../assets/scrapling_shell_curl.png" title="Copying a request as a curl command from Chrome" alt="Copying a request as a curl command from Chrome" style="width: 70%;"/>
|
| 137 |
|
| 138 |
- **Convert Curl command to Request Object**
|
| 139 |
|
|
|
|
| 174 |
>>> %save filename.py 1-10 # Save commands 1-10 to file
|
| 175 |
|
| 176 |
>>> # Tab completion works everywhere
|
| 177 |
+
>>> page.c<TAB> # Shows: css, cookies, headers, etc.
|
| 178 |
>>> Fetcher.<TAB> # Shows all Fetcher methods
|
| 179 |
|
| 180 |
>>> # Object inspection
|
docs/development/adaptive_storage_system.md
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
Scrapling uses SQLite by default, but this tutorial shows how to write your own storage system to store element properties for the `adaptive` feature.
|
| 2 |
|
| 3 |
You might want to use Firebase, for example, and share the database between multiple spiders on different machines. It's a great idea to use an online database like that because spiders can share adaptive data with each other.
|
|
@@ -54,7 +56,7 @@ class RedisStorage(StorageSystemMixin):
|
|
| 54 |
orjson.dumps(element_dict)
|
| 55 |
)
|
| 56 |
|
| 57 |
-
def retrieve(self, identifier: str) -> dict:
|
| 58 |
# Get data
|
| 59 |
key = f"scrapling:{self._get_base_url()}:{identifier}"
|
| 60 |
data = self.redis.get(key)
|
|
|
|
| 1 |
+
# Writing your retrieval system
|
| 2 |
+
|
| 3 |
Scrapling uses SQLite by default, but this tutorial shows how to write your own storage system to store element properties for the `adaptive` feature.
|
| 4 |
|
| 5 |
You might want to use Firebase, for example, and share the database between multiple spiders on different machines. It's a great idea to use an online database like that because spiders can share adaptive data with each other.
|
|
|
|
| 56 |
orjson.dumps(element_dict)
|
| 57 |
)
|
| 58 |
|
| 59 |
+
def retrieve(self, identifier: str) -> dict | None:
|
| 60 |
# Get data
|
| 61 |
key = f"scrapling:{self._get_base_url()}:{identifier}"
|
| 62 |
data = self.redis.get(key)
|
docs/development/scrapling_custom_types.md
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
> You can take advantage of the custom-made types for Scrapling and use them outside the library if you want. It's better than copying their code, after all :)
|
| 2 |
|
| 3 |
### All current types can be imported alone, like below
|
|
|
|
| 1 |
+
# Using Scrapling's custom types
|
| 2 |
+
|
| 3 |
> You can take advantage of the custom-made types for Scrapling and use them outside the library if you want. It's better than copying their code, after all :)
|
| 4 |
|
| 5 |
### All current types can be imported alone, like below
|
docs/fetching/choosing.md
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
## Introduction
|
| 2 |
Fetchers are classes that can do requests or fetch pages for you easily in a single-line fashion with many features and then return a [Response](#response-object) object. Starting with v0.3, all fetchers have separate classes to keep the session running, so for example, a fetcher that uses a browser will keep the browser open till you finish all your requests through it instead of opening multiple browsers. So it depends on your use case.
|
| 3 |
|
|
@@ -38,21 +40,22 @@ Then you use it right away without initializing like this, and it will use the d
|
|
| 38 |
If you want to configure the parser ([Selector class](../parsing/main_classes.md#selector)) that will be used on the response before returning it for you, then do this first:
|
| 39 |
```python
|
| 40 |
>>> from scrapling.fetchers import Fetcher
|
| 41 |
-
>>> Fetcher.configure(adaptive=True,
|
| 42 |
```
|
| 43 |
or
|
| 44 |
```python
|
| 45 |
>>> from scrapling.fetchers import Fetcher
|
| 46 |
>>> Fetcher.adaptive=True
|
| 47 |
-
>>> Fetcher.encoding="utf-8"
|
| 48 |
>>> Fetcher.keep_comments=False
|
| 49 |
>>> Fetcher.keep_cdata=False # and the rest
|
| 50 |
```
|
| 51 |
Then, continue your code as usual.
|
| 52 |
|
| 53 |
-
The available configuration arguments are: `adaptive`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the [Selector](../parsing/main_classes.md#selector) class. You can display the current configuration anytime by running `<fetcher_class>.display_config()`.
|
|
|
|
|
|
|
| 54 |
|
| 55 |
-
|
| 56 |
|
| 57 |
### Set parser config per request
|
| 58 |
As you probably understand, the logic above for setting the parser config will apply globally to all requests/fetches made through that class, and it's intended for simplicity.
|
|
@@ -71,7 +74,12 @@ The `Response` object is the same as the [Selector](../parsing/main_classes.md#s
|
|
| 71 |
>>> page.headers # Response headers
|
| 72 |
>>> page.request_headers # Request headers
|
| 73 |
>>> page.history # Response history of redirections, if any
|
| 74 |
-
>>> page.body # Raw response body
|
| 75 |
>>> page.encoding # Response encoding
|
|
|
|
| 76 |
```
|
| 77 |
-
All fetchers return the `Response` object.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Fetchers basics
|
| 2 |
+
|
| 3 |
## Introduction
|
| 4 |
Fetchers are classes that can do requests or fetch pages for you easily in a single-line fashion with many features and then return a [Response](#response-object) object. Starting with v0.3, all fetchers have separate classes to keep the session running, so for example, a fetcher that uses a browser will keep the browser open till you finish all your requests through it instead of opening multiple browsers. So it depends on your use case.
|
| 5 |
|
|
|
|
| 40 |
If you want to configure the parser ([Selector class](../parsing/main_classes.md#selector)) that will be used on the response before returning it for you, then do this first:
|
| 41 |
```python
|
| 42 |
>>> from scrapling.fetchers import Fetcher
|
| 43 |
+
>>> Fetcher.configure(adaptive=True, keep_comments=False, keep_cdata=False) # and the rest
|
| 44 |
```
|
| 45 |
or
|
| 46 |
```python
|
| 47 |
>>> from scrapling.fetchers import Fetcher
|
| 48 |
>>> Fetcher.adaptive=True
|
|
|
|
| 49 |
>>> Fetcher.keep_comments=False
|
| 50 |
>>> Fetcher.keep_cdata=False # and the rest
|
| 51 |
```
|
| 52 |
Then, continue your code as usual.
|
| 53 |
|
| 54 |
+
The available configuration arguments are: `adaptive`, `adaptive_domain`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the [Selector](../parsing/main_classes.md#selector) class. You can display the current configuration anytime by running `<fetcher_class>.display_config()`.
|
| 55 |
+
|
| 56 |
+
!!! info
|
| 57 |
|
| 58 |
+
The `adaptive` argument is disabled by default; you must enable it to use that feature.
|
| 59 |
|
| 60 |
### Set parser config per request
|
| 61 |
As you probably understand, the logic above for setting the parser config will apply globally to all requests/fetches made through that class, and it's intended for simplicity.
|
|
|
|
| 74 |
>>> page.headers # Response headers
|
| 75 |
>>> page.request_headers # Request headers
|
| 76 |
>>> page.history # Response history of redirections, if any
|
| 77 |
+
>>> page.body # Raw response body as bytes
|
| 78 |
>>> page.encoding # Response encoding
|
| 79 |
+
>>> page.meta # Response metadata dictionary (e.g., proxy used). Mainly helpful with the spiders system.
|
| 80 |
```
|
| 81 |
+
All fetchers return the `Response` object.
|
| 82 |
+
|
| 83 |
+
!!! note
|
| 84 |
+
|
| 85 |
+
Unlike the [Selector](../parsing/main_classes.md#selector) class, the `Response` class's body is always bytes since v0.4.
|
docs/fetching/dynamic.md
CHANGED
|
@@ -1,14 +1,14 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
Here, we will discuss the `DynamicFetcher` class (formerly `PlayWrightFetcher`). This class provides flexible browser automation with multiple configuration options and little under-the-hood stealth improvements.
|
| 4 |
|
| 5 |
As we will explain later, to automate the page, you need some knowledge of [Playwright's Page API](https://playwright.dev/python/docs/api/class-page).
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
|
| 13 |
## Basic Usage
|
| 14 |
You have one primary way to import this Fetcher, which is the same for all fetchers.
|
|
@@ -20,7 +20,9 @@ Check out how to configure the parsing options [here](choosing.md#parser-configu
|
|
| 20 |
|
| 21 |
Now, we will review most of the arguments one by one, using examples. If you want to jump to a table of all arguments for quick reference, [click here](#full-list-of-arguments)
|
| 22 |
|
| 23 |
-
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
This fetcher currently provides three main run options that can be combined as desired.
|
|
@@ -51,10 +53,10 @@ DynamicFetcher.fetch('https://example.com', cdp_url='ws://localhost:9222')
|
|
| 51 |
Instead of launching a browser locally (Chromium/Google Chrome), you can connect to a remote browser through the [Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/).
|
| 52 |
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
|
| 59 |
## Full list of arguments
|
| 60 |
Scrapling provides many options with this fetcher and its session classes. To make it as simple as possible, we will list the options here and give examples of how to use most of them.
|
|
@@ -85,15 +87,19 @@ Scrapling provides many options with this fetcher and its session classes. To ma
|
|
| 85 |
| extra_flags | A list of additional browser flags to pass to the browser on launch. | ✔️ |
|
| 86 |
| additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
|
| 87 |
| selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
-
In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, and `selector_config`.
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
|
| 98 |
|
| 99 |
## Examples
|
|
@@ -106,6 +112,13 @@ It's easier to understand with examples, so let's take a look.
|
|
| 106 |
page = DynamicFetcher.fetch('https://example.com', disable_resources=True) # Blocks fonts, images, media, etc.
|
| 107 |
```
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
### Network Control
|
| 110 |
|
| 111 |
```python
|
|
@@ -119,16 +132,41 @@ page = DynamicFetcher.fetch('https://example.com', timeout=30000) # 30 seconds
|
|
| 119 |
page = DynamicFetcher.fetch('https://example.com', proxy='http://username:password@host:port')
|
| 120 |
```
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
### Downloading Files
|
| 123 |
|
| 124 |
```python
|
| 125 |
-
page = DynamicFetcher.fetch('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/
|
| 126 |
|
| 127 |
-
with open(file='
|
| 128 |
f.write(page.body)
|
| 129 |
```
|
| 130 |
|
| 131 |
-
The `body` attribute of the `Response` object
|
| 132 |
|
| 133 |
### Browser Automation
|
| 134 |
This is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues.
|
|
@@ -206,7 +244,7 @@ def scrape_dynamic_content():
|
|
| 206 |
content = page.css('.content')
|
| 207 |
|
| 208 |
return {
|
| 209 |
-
'title': content.
|
| 210 |
'items': [
|
| 211 |
item.text for item in content.css('.item')
|
| 212 |
]
|
|
|
|
| 1 |
+
# Fetching dynamic websites
|
| 2 |
|
| 3 |
Here, we will discuss the `DynamicFetcher` class (formerly `PlayWrightFetcher`). This class provides flexible browser automation with multiple configuration options and little under-the-hood stealth improvements.
|
| 4 |
|
| 5 |
As we will explain later, to automate the page, you need some knowledge of [Playwright's Page API](https://playwright.dev/python/docs/api/class-page).
|
| 6 |
|
| 7 |
+
!!! success "Prerequisites"
|
| 8 |
+
|
| 9 |
+
1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
|
| 10 |
+
2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
|
| 11 |
+
3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
|
| 12 |
|
| 13 |
## Basic Usage
|
| 14 |
You have one primary way to import this Fetcher, which is the same for all fetchers.
|
|
|
|
| 20 |
|
| 21 |
Now, we will review most of the arguments one by one, using examples. If you want to jump to a table of all arguments for quick reference, [click here](#full-list-of-arguments)
|
| 22 |
|
| 23 |
+
!!! abstract
|
| 24 |
+
|
| 25 |
+
The async version of the `fetch` method is `async_fetch`, of course.
|
| 26 |
|
| 27 |
|
| 28 |
This fetcher currently provides three main run options that can be combined as desired.
|
|
|
|
| 53 |
Instead of launching a browser locally (Chromium/Google Chrome), you can connect to a remote browser through the [Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/).
|
| 54 |
|
| 55 |
|
| 56 |
+
!!! note "Notes:"
|
| 57 |
+
|
| 58 |
+
* There was a `stealth` option here, but it was moved to the `StealthyFetcher` class, as explained on the next page, with additional features since version 0.3.13.<br/>
|
| 59 |
+
* This makes it less confusing for new users, easier to maintain, and provides other benefits, as explained on the [StealthyFetcher page](../fetching/stealthy.md).
|
| 60 |
|
| 61 |
## Full list of arguments
|
| 62 |
Scrapling provides many options with this fetcher and its session classes. To make it as simple as possible, we will list the options here and give examples of how to use most of them.
|
|
|
|
| 87 |
| extra_flags | A list of additional browser flags to pass to the browser on launch. | ✔️ |
|
| 88 |
| additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
|
| 89 |
| selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
|
| 90 |
+
| blocked_domains | A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too). | ✔️ |
|
| 91 |
+
| proxy_rotator | A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy`. | ✔️ |
|
| 92 |
+
| retries | Number of retry attempts for failed requests. Defaults to 3. | ✔️ |
|
| 93 |
+
| retry_delay | Seconds to wait between retry attempts. Defaults to 1. | ✔️ |
|
| 94 |
|
| 95 |
+
In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `blocked_domains`, `proxy`, and `selector_config`.
|
| 96 |
|
| 97 |
+
!!! note "Notes:"
|
| 98 |
+
|
| 99 |
+
1. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading.
|
| 100 |
+
2. The `google_search` argument is enabled by default for all requests, making the request appear to come from a Google search page. So, a request for `https://example.com` will set the referer to `https://www.google.com/search?q=example`. Also, if used together, it takes priority over the referer set by the `extra_headers` argument.
|
| 101 |
+
3. Since version 0.3.13, the `stealth` option has been removed here in favor of the `StealthyFetcher` class, and the `hide_canvas` option has been moved to it. The `disable_webgl` argument has been moved to the `StealthyFetcher` class and renamed as `allow_webgl`.
|
| 102 |
+
4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
|
| 103 |
|
| 104 |
|
| 105 |
## Examples
|
|
|
|
| 112 |
page = DynamicFetcher.fetch('https://example.com', disable_resources=True) # Blocks fonts, images, media, etc.
|
| 113 |
```
|
| 114 |
|
| 115 |
+
### Domain Blocking
|
| 116 |
+
|
| 117 |
+
```python
|
| 118 |
+
# Block requests to specific domains (and their subdomains)
|
| 119 |
+
page = DynamicFetcher.fetch('https://example.com', blocked_domains={"ads.example.com", "tracker.net"})
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
### Network Control
|
| 123 |
|
| 124 |
```python
|
|
|
|
| 132 |
page = DynamicFetcher.fetch('https://example.com', proxy='http://username:password@host:port')
|
| 133 |
```
|
| 134 |
|
| 135 |
+
### Proxy Rotation
|
| 136 |
+
|
| 137 |
+
```python
|
| 138 |
+
from scrapling.fetchers import DynamicSession, ProxyRotator
|
| 139 |
+
|
| 140 |
+
# Set up proxy rotation
|
| 141 |
+
rotator = ProxyRotator([
|
| 142 |
+
"http://proxy1:8080",
|
| 143 |
+
"http://proxy2:8080",
|
| 144 |
+
"http://proxy3:8080",
|
| 145 |
+
])
|
| 146 |
+
|
| 147 |
+
# Use with session - rotates proxy automatically with each request
|
| 148 |
+
with DynamicSession(proxy_rotator=rotator, headless=True) as session:
|
| 149 |
+
page1 = session.fetch('https://example1.com')
|
| 150 |
+
page2 = session.fetch('https://example2.com')
|
| 151 |
+
|
| 152 |
+
# Override rotator for a specific request
|
| 153 |
+
page3 = session.fetch('https://example3.com', proxy='http://specific-proxy:8080')
|
| 154 |
+
```
|
| 155 |
+
|
| 156 |
+
!!! warning
|
| 157 |
+
|
| 158 |
+
Remember that by default, all browser-based fetchers and sessions use a persistent browser context with a pool of tabs. However, since browsers can't set a proxy per tab, when you use a `ProxyRotator`, the fetcher will automatically open a separate context for each proxy, with one tab per context. Once the tab's job is done, both the tab and its context are closed.
|
| 159 |
+
|
| 160 |
### Downloading Files
|
| 161 |
|
| 162 |
```python
|
| 163 |
+
page = DynamicFetcher.fetch('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/main_cover.png')
|
| 164 |
|
| 165 |
+
with open(file='main_cover.png', mode='wb') as f:
|
| 166 |
f.write(page.body)
|
| 167 |
```
|
| 168 |
|
| 169 |
+
The `body` attribute of the `Response` object always returns `bytes`.
|
| 170 |
|
| 171 |
### Browser Automation
|
| 172 |
This is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues.
|
|
|
|
| 244 |
content = page.css('.content')
|
| 245 |
|
| 246 |
return {
|
| 247 |
+
'title': content.css('h1::text').get(),
|
| 248 |
'items': [
|
| 249 |
item.text for item in content.css('.item')
|
| 250 |
]
|
docs/fetching/static.md
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
The `Fetcher` class provides rapid and lightweight HTTP requests using the high-performance `curl_cffi` library with a lot of stealth capabilities.
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
|
| 11 |
## Basic Usage
|
| 12 |
You have one primary way to import this Fetcher, which is the same for all fetchers.
|
|
@@ -31,18 +31,20 @@ All methods for making requests here share some arguments, so let's discuss them
|
|
| 31 |
- **proxy**: As the name implies, the proxy for this request is used to route all traffic (HTTP and HTTPS). The format accepted here is `http://username:password@localhost:8030`.
|
| 32 |
- **proxy_auth**: HTTP basic auth for proxy, tuple of (username, password).
|
| 33 |
- **proxies**: Dict of proxies to use. Format: `{"http": proxy_url, "https": proxy_url}`.
|
|
|
|
| 34 |
- **headers**: Headers to include in the request. Can override any header generated by the `stealthy_headers` argument
|
| 35 |
- **max_redirects**: Maximum number of redirects. **Defaults to 30**, use -1 for unlimited.
|
| 36 |
- **verify**: Whether to verify HTTPS certificates. **Defaults to True**.
|
| 37 |
- **cert**: Tuple of (cert, key) filenames for the client certificate.
|
| 38 |
- **selector_config**: A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class.
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
| 44 |
|
| 45 |
-
Other than this, for further customization, you can pass any arguments that `curl_cffi` supports for any method if that method doesn't already support
|
| 46 |
|
| 47 |
### HTTP Methods
|
| 48 |
There are additional arguments for each method, depending on the method, such as `params` for GET requests and `data`/`json` for POST/PUT/DELETE requests.
|
|
@@ -186,19 +188,50 @@ with FetcherSession(
|
|
| 186 |
page1 = session.get('https://scrapling.requestcatcher.com/get')
|
| 187 |
page2 = session.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'})
|
| 188 |
page3 = session.get('https://api.github.com/events')
|
| 189 |
-
|
| 190 |
# All requests share the same session and connection pool
|
| 191 |
```
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
And here's an async example
|
| 194 |
|
| 195 |
```python
|
| 196 |
async with FetcherSession(impersonate='firefox', http3=True) as session:
|
| 197 |
# All standard HTTP methods available
|
| 198 |
-
response =
|
| 199 |
-
response =
|
| 200 |
-
response =
|
| 201 |
-
response =
|
| 202 |
```
|
| 203 |
or better
|
| 204 |
```python
|
|
@@ -239,11 +272,11 @@ page = Fetcher.get('https://example.com')
|
|
| 239 |
# Check the status
|
| 240 |
if page.status == 200:
|
| 241 |
# Extract title
|
| 242 |
-
title = page.
|
| 243 |
print(f"Page title: {title}")
|
| 244 |
-
|
| 245 |
# Extract all links
|
| 246 |
-
links = page.css('a::attr(href)')
|
| 247 |
print(f"Found {len(links)} links")
|
| 248 |
```
|
| 249 |
|
|
@@ -261,9 +294,9 @@ def scrape_products():
|
|
| 261 |
results = []
|
| 262 |
for product in products:
|
| 263 |
results.append({
|
| 264 |
-
'title': product.
|
| 265 |
-
'price': product.
|
| 266 |
-
'description': product.
|
| 267 |
'in_stock': product.has_class('in-stock')
|
| 268 |
})
|
| 269 |
|
|
@@ -275,8 +308,8 @@ def scrape_products():
|
|
| 275 |
```python
|
| 276 |
from scrapling.fetchers import Fetcher
|
| 277 |
|
| 278 |
-
page = Fetcher.get('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/
|
| 279 |
-
with open(file='
|
| 280 |
f.write(page.body)
|
| 281 |
```
|
| 282 |
|
|
@@ -302,8 +335,8 @@ def scrape_all_pages():
|
|
| 302 |
# Process products
|
| 303 |
for product in products:
|
| 304 |
all_products.append({
|
| 305 |
-
'name': product.
|
| 306 |
-
'price': product.
|
| 307 |
})
|
| 308 |
|
| 309 |
# Next page
|
|
@@ -329,7 +362,7 @@ response = Fetcher.post(
|
|
| 329 |
# Check login success
|
| 330 |
if response.status == 200:
|
| 331 |
# Extract user info
|
| 332 |
-
user_name = response.
|
| 333 |
print(f"Logged in as: {user_name}")
|
| 334 |
```
|
| 335 |
|
|
@@ -342,7 +375,7 @@ def extract_table():
|
|
| 342 |
page = Fetcher.get('https://example.com/data')
|
| 343 |
|
| 344 |
# Find table
|
| 345 |
-
table = page.
|
| 346 |
|
| 347 |
# Extract headers
|
| 348 |
headers = [
|
|
@@ -367,12 +400,13 @@ def extract_menu():
|
|
| 367 |
page = Fetcher.get('https://example.com')
|
| 368 |
|
| 369 |
# Find navigation
|
| 370 |
-
nav = page.
|
| 371 |
|
| 372 |
menu = {}
|
| 373 |
for item in nav.css('li'):
|
| 374 |
-
|
| 375 |
-
if
|
|
|
|
| 376 |
menu[link.text] = {
|
| 377 |
'url': link['href'],
|
| 378 |
'has_submenu': bool(item.css('.submenu'))
|
|
|
|
| 1 |
+
# HTTP requests
|
| 2 |
|
| 3 |
The `Fetcher` class provides rapid and lightweight HTTP requests using the high-performance `curl_cffi` library with a lot of stealth capabilities.
|
| 4 |
|
| 5 |
+
!!! success "Prerequisites"
|
| 6 |
+
|
| 7 |
+
1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
|
| 8 |
+
2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
|
| 9 |
+
3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
|
| 10 |
|
| 11 |
## Basic Usage
|
| 12 |
You have one primary way to import this Fetcher, which is the same for all fetchers.
|
|
|
|
| 31 |
- **proxy**: As the name implies, the proxy for this request is used to route all traffic (HTTP and HTTPS). The format accepted here is `http://username:password@localhost:8030`.
|
| 32 |
- **proxy_auth**: HTTP basic auth for proxy, tuple of (username, password).
|
| 33 |
- **proxies**: Dict of proxies to use. Format: `{"http": proxy_url, "https": proxy_url}`.
|
| 34 |
+
- **proxy_rotator**: A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy` or `proxies`.
|
| 35 |
- **headers**: Headers to include in the request. Can override any header generated by the `stealthy_headers` argument
|
| 36 |
- **max_redirects**: Maximum number of redirects. **Defaults to 30**, use -1 for unlimited.
|
| 37 |
- **verify**: Whether to verify HTTPS certificates. **Defaults to True**.
|
| 38 |
- **cert**: Tuple of (cert, key) filenames for the client certificate.
|
| 39 |
- **selector_config**: A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class.
|
| 40 |
|
| 41 |
+
!!! note "Notes:"
|
| 42 |
+
|
| 43 |
+
1. The currently available browsers to impersonate are (`"edge"`, `"chrome"`, `"chrome_android"`, `"safari"`, `"safari_beta"`, `"safari_ios"`, `"safari_ios_beta"`, `"firefox"`, `"tor"`)<br/>
|
| 44 |
+
2. The available browsers to impersonate, along with their corresponding versions, are automatically displayed in the argument autocompletion and updated with each `curl_cffi` update.<br/>
|
| 45 |
+
3. If either of the `impersonate` or `stealthy_headers` arguments is enabled, the fetchers will automatically generate real browser headers that match the browser version used.
|
| 46 |
|
| 47 |
+
Other than this, for further customization, you can pass any argument that `curl_cffi` supports to any method, as long as that method doesn't already define it.
|
| 48 |
|
| 49 |
### HTTP Methods
|
| 50 |
There are additional arguments for each method, depending on the method, such as `params` for GET requests and `data`/`json` for POST/PUT/DELETE requests.
|
|
|
|
| 188 |
page1 = session.get('https://scrapling.requestcatcher.com/get')
|
| 189 |
page2 = session.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'})
|
| 190 |
page3 = session.get('https://api.github.com/events')
|
| 191 |
+
|
| 192 |
# All requests share the same session and connection pool
|
| 193 |
```
|
| 194 |
|
| 195 |
+
You can also use a `ProxyRotator` with `FetcherSession` for automatic proxy rotation across requests:
|
| 196 |
+
|
| 197 |
+
```python
|
| 198 |
+
from scrapling.fetchers import FetcherSession, ProxyRotator
|
| 199 |
+
|
| 200 |
+
rotator = ProxyRotator([
|
| 201 |
+
'http://proxy1:8080',
|
| 202 |
+
'http://proxy2:8080',
|
| 203 |
+
'http://proxy3:8080',
|
| 204 |
+
])
|
| 205 |
+
|
| 206 |
+
with FetcherSession(proxy_rotator=rotator, impersonate='chrome') as session:
|
| 207 |
+
# Each request automatically uses the next proxy in rotation
|
| 208 |
+
page1 = session.get('https://example.com/page1')
|
| 209 |
+
page2 = session.get('https://example.com/page2')
|
| 210 |
+
|
| 211 |
+
# You can check which proxy was used via the response metadata
|
| 212 |
+
print(page1.meta['proxy'])
|
| 213 |
+
```
|
| 214 |
+
|
| 215 |
+
You can also override the session proxy (or rotator) for a specific request by passing `proxy=` directly to the request method:
|
| 216 |
+
|
| 217 |
+
```python
|
| 218 |
+
with FetcherSession(proxy='http://default-proxy:8080') as session:
|
| 219 |
+
# Uses the session proxy
|
| 220 |
+
page1 = session.get('https://example.com/page1')
|
| 221 |
+
|
| 222 |
+
# Override the proxy for this specific request
|
| 223 |
+
page2 = session.get('https://example.com/page2', proxy='http://special-proxy:9090')
|
| 224 |
+
```
|
| 225 |
+
|
| 226 |
And here's an async example
|
| 227 |
|
| 228 |
```python
|
| 229 |
async with FetcherSession(impersonate='firefox', http3=True) as session:
|
| 230 |
# All standard HTTP methods available
|
| 231 |
+
response = await session.get('https://example.com')
|
| 232 |
+
response = await session.post('https://scrapling.requestcatcher.com/post', json={'data': 'value'})
|
| 233 |
+
response = await session.put('https://scrapling.requestcatcher.com/put', data={'update': 'info'})
|
| 234 |
+
response = await session.delete('https://scrapling.requestcatcher.com/delete')
|
| 235 |
```
|
| 236 |
or better
|
| 237 |
```python
|
|
|
|
| 272 |
# Check the status
|
| 273 |
if page.status == 200:
|
| 274 |
# Extract title
|
| 275 |
+
title = page.css('title::text').get()
|
| 276 |
print(f"Page title: {title}")
|
| 277 |
+
|
| 278 |
# Extract all links
|
| 279 |
+
links = page.css('a::attr(href)').getall()
|
| 280 |
print(f"Found {len(links)} links")
|
| 281 |
```
|
| 282 |
|
|
|
|
| 294 |
results = []
|
| 295 |
for product in products:
|
| 296 |
results.append({
|
| 297 |
+
'title': product.css('.title::text').get(),
|
| 298 |
+
'price': product.css('.price::text').re_first(r'\d+\.\d{2}'),
|
| 299 |
+
'description': product.css('.description::text').get(),
|
| 300 |
'in_stock': product.has_class('in-stock')
|
| 301 |
})
|
| 302 |
|
|
|
|
| 308 |
```python
|
| 309 |
from scrapling.fetchers import Fetcher
|
| 310 |
|
| 311 |
+
page = Fetcher.get('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/main_cover.png')
|
| 312 |
+
with open(file='main_cover.png', mode='wb') as f:
|
| 313 |
f.write(page.body)
|
| 314 |
```
|
| 315 |
|
|
|
|
| 335 |
# Process products
|
| 336 |
for product in products:
|
| 337 |
all_products.append({
|
| 338 |
+
'name': product.css('.name::text').get(),
|
| 339 |
+
'price': product.css('.price::text').get()
|
| 340 |
})
|
| 341 |
|
| 342 |
# Next page
|
|
|
|
| 362 |
# Check login success
|
| 363 |
if response.status == 200:
|
| 364 |
# Extract user info
|
| 365 |
+
user_name = response.css('.user-name::text').get()
|
| 366 |
print(f"Logged in as: {user_name}")
|
| 367 |
```
|
| 368 |
|
|
|
|
| 375 |
page = Fetcher.get('https://example.com/data')
|
| 376 |
|
| 377 |
# Find table
|
| 378 |
+
table = page.css('table')[0]
|
| 379 |
|
| 380 |
# Extract headers
|
| 381 |
headers = [
|
|
|
|
| 400 |
page = Fetcher.get('https://example.com')
|
| 401 |
|
| 402 |
# Find navigation
|
| 403 |
+
nav = page.css('nav')[0]
|
| 404 |
|
| 405 |
menu = {}
|
| 406 |
for item in nav.css('li'):
|
| 407 |
+
links = item.css('a')
|
| 408 |
+
if links:
|
| 409 |
+
link = links[0]
|
| 410 |
menu[link.text] = {
|
| 411 |
'url': link['href'],
|
| 412 |
'has_submenu': bool(item.css('.submenu'))
|
docs/fetching/stealthy.md
CHANGED
|
@@ -1,17 +1,15 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
Here, we will discuss the `StealthyFetcher` class. This class is very similar to the [DynamicFetcher](dynamic.md#introduction) class, including the browsers, the automation, and the use of [Playwright's API](https://playwright.dev/python/docs/intro). The main difference is that this class provides advanced anti-bot protection bypass capabilities; most of them are handled automatically under the hood, and the rest is up to you to enable.
|
| 4 |
|
| 5 |
As with [DynamicFetcher](dynamic.md#introduction), you will need some knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) to automate the page, as we will explain later.
|
| 6 |
|
| 7 |
-
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
> 3. You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
|
| 14 |
-
> 4. You’ve completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
|
| 15 |
|
| 16 |
## Basic Usage
|
| 17 |
You have one primary way to import this Fetcher, which is the same for all fetchers.
|
|
@@ -21,7 +19,9 @@ You have one primary way to import this Fetcher, which is the same for all fetch
|
|
| 21 |
```
|
| 22 |
Check out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers)
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
| 25 |
|
| 26 |
## What does it do?
|
| 27 |
|
|
@@ -69,15 +69,19 @@ Scrapling provides many options with this fetcher and its session classes. Befor
|
|
| 69 |
| allow_webgl | Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended, as many WAFs now check if WebGL is enabled. | ✔️ |
|
| 70 |
| additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
|
| 71 |
| selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
-
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
> 3. The `google_search` argument is enabled by default for all requests, making the request appear to come from a Google search page. So, a request for `https://example.com` will set the referer to `https://www.google.com/search?q=example`. Also, if used together, it takes priority over the referer set by the `extra_headers` argument.
|
| 80 |
-
> 4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
|
| 81 |
|
| 82 |
## Examples
|
| 83 |
It's easier to understand with examples, so we will now review most of the arguments individually. Since it's the same class as the [DynamicFetcher](dynamic.md#introduction), you can refer to that page for more examples, as we won't repeat all the examples from there.
|
|
@@ -108,11 +112,11 @@ The `solve_cloudflare` parameter enables automatic detection and solving all typ
|
|
| 108 |
|
| 109 |
And even solves the custom pages with embedded captcha.
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
|
| 117 |
### Browser Automation
|
| 118 |
This is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues.
|
|
@@ -172,14 +176,14 @@ def scrape_amazon_product(url):
|
|
| 172 |
|
| 173 |
# Extract product details
|
| 174 |
return {
|
| 175 |
-
'title': page.
|
| 176 |
-
'price': page.
|
| 177 |
-
'rating': page.
|
| 178 |
'reviews_count': page.css('#acrCustomerReviewText::text').re_first(r'[\d,]+'),
|
| 179 |
'features': [
|
| 180 |
-
li.clean() for li in page.css('#feature-bullets li span::text')
|
| 181 |
],
|
| 182 |
-
'availability': page.
|
| 183 |
'images': [
|
| 184 |
img.attrib['src'] for img in page.css('#altImages img')
|
| 185 |
]
|
|
@@ -248,7 +252,8 @@ In versions 0.3 and 0.3.1, the pool was reusing finished tabs to save more resou
|
|
| 248 |
- **Memory efficiency**: Better resource usage compared to launching new browsers with each fetch.
|
| 249 |
|
| 250 |
## Using Camoufox as an engine
|
| 251 |
-
|
|
|
|
| 252 |
|
| 253 |
First, you will need to install the Camoufox library, browser, and Firefox system dependencies if you didn't already:
|
| 254 |
```commandline
|
|
|
|
| 1 |
+
# Fetching dynamic websites with hard protections
|
| 2 |
|
| 3 |
Here, we will discuss the `StealthyFetcher` class. This class is very similar to the [DynamicFetcher](dynamic.md#introduction) class, including the browsers, the automation, and the use of [Playwright's API](https://playwright.dev/python/docs/intro). The main difference is that this class provides advanced anti-bot protection bypass capabilities; most of them are handled automatically under the hood, and the rest is up to you to enable.
|
| 4 |
|
| 5 |
As with [DynamicFetcher](dynamic.md#introduction), you will need some knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) to automate the page, as we will explain later.
|
| 6 |
|
| 7 |
+
!!! success "Prerequisites"
|
| 8 |
|
| 9 |
+
1. You've completed or read the [DynamicFetcher](dynamic.md#introduction) page since this class builds upon it, and we won't repeat the same information here for that reason.
|
| 10 |
+
2. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.
|
| 11 |
+
3. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.
|
| 12 |
+
4. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.
|
|
|
|
|
|
|
| 13 |
|
| 14 |
## Basic Usage
|
| 15 |
You have one primary way to import this Fetcher, which is the same for all fetchers.
|
|
|
|
| 19 |
```
|
| 20 |
Check out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers)
|
| 21 |
|
| 22 |
+
!!! abstract
|
| 23 |
+
|
| 24 |
+
The async version of the `fetch` method is `async_fetch`, of course.
|
| 25 |
|
| 26 |
## What does it do?
|
| 27 |
|
|
|
|
| 69 |
| allow_webgl | Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended, as many WAFs now check if WebGL is enabled. | ✔️ |
|
| 70 |
| additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |
|
| 71 |
| selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ |
|
| 72 |
+
| blocked_domains | A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too). | ✔️ |
|
| 73 |
+
| proxy_rotator | A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy`. | ✔️ |
|
| 74 |
+
| retries | Number of retry attempts for failed requests. Defaults to 3. | ✔️ |
|
| 75 |
+
| retry_delay | Seconds to wait between retry attempts. Defaults to 1. | ✔️ |
|
| 76 |
+
|
| 77 |
+
In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing any of the arguments that can be configured at the browser-tab level, such as: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `solve_cloudflare`, `blocked_domains`, `proxy`, and `selector_config`.
|
| 78 |
|
| 79 |
+
!!! note "Notes:"
|
| 80 |
|
| 81 |
+
1. These are basically the same arguments as the [DynamicFetcher](dynamic.md#introduction) class, but with these additional arguments: `solve_cloudflare`, `block_webrtc`, `hide_canvas`, and `allow_webgl`.
|
| 82 |
+
2. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading.
|
| 83 |
+
3. The `google_search` argument is enabled by default for all requests, making the request appear to come from a Google search page. So, a request for `https://example.com` will set the referer to `https://www.google.com/search?q=example`. Also, if used together, it takes priority over the referer set by the `extra_headers` argument.
|
| 84 |
+
4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.
|
|
|
|
|
|
|
| 85 |
|
| 86 |
## Examples
|
| 87 |
It's easier to understand with examples, so we will now review most of the arguments individually. Since it's the same class as the [DynamicFetcher](dynamic.md#introduction), you can refer to that page for more examples, as we won't repeat all the examples from there.
|
|
|
|
| 112 |
|
| 113 |
And even solves the custom pages with embedded captcha.
|
| 114 |
|
| 115 |
+
!!! note "**Important notes:**"
|
| 116 |
+
|
| 117 |
+
1. Sometimes, with websites that use custom implementations, you will need to use `wait_selector` to make sure Scrapling waits for the real website content to be loaded after solving the captcha. Some websites can be the real definition of an edge case while we are trying to make the solver as generic as possible.
|
| 118 |
+
2. The timeout should be at least 60 seconds when using the Cloudflare solver for sufficient challenge-solving time.
|
| 119 |
+
3. This feature works seamlessly with proxies and other stealth options.
|
| 120 |
|
| 121 |
### Browser Automation
|
| 122 |
This is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues.
|
|
|
|
| 176 |
|
| 177 |
# Extract product details
|
| 178 |
return {
|
| 179 |
+
'title': page.css('#productTitle::text').get().clean(),
|
| 180 |
+
'price': page.css('.a-price .a-offscreen::text').get(),
|
| 181 |
+
'rating': page.css('[data-feature-name="averageCustomerReviews"] .a-popover-trigger .a-color-base::text').get(),
|
| 182 |
'reviews_count': page.css('#acrCustomerReviewText::text').re_first(r'[\d,]+'),
|
| 183 |
'features': [
|
| 184 |
+
li.get().clean() for li in page.css('#feature-bullets li span::text')
|
| 185 |
],
|
| 186 |
+
'availability': page.css('#availability')[0].get_all_text(strip=True),
|
| 187 |
'images': [
|
| 188 |
img.attrib['src'] for img in page.css('#altImages img')
|
| 189 |
]
|
|
|
|
| 252 |
- **Memory efficiency**: Better resource usage compared to launching new browsers with each fetch.
|
| 253 |
|
| 254 |
## Using Camoufox as an engine
|
| 255 |
+
|
| 256 |
+
This fetcher used a custom version of [Camoufox](https://github.com/daijro/camoufox) as an engine before version 0.3.13, which was replaced by [patchright](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright) for many reasons. If you see that Camoufox is stable on your device, has no high memory issues, and you want to continue using it, then you can.
|
| 257 |
|
| 258 |
First, you will need to install the Camoufox library, browser, and Firefox system dependencies if you didn't already:
|
| 259 |
```commandline
|
docs/index.md
CHANGED
|
@@ -2,34 +2,46 @@
|
|
| 2 |
.md-typeset h1 {
|
| 3 |
display: none;
|
| 4 |
}
|
|
|
|
|
|
|
| 5 |
</style>
|
| 6 |
|
|
|
|
| 7 |
<div align="center">
|
| 8 |
<a href="https://scrapling.readthedocs.io/en/latest/" alt="poster">
|
| 9 |
-
<img alt="
|
|
|
|
|
|
|
| 10 |
</div>
|
| 11 |
|
| 12 |
-
<
|
| 13 |
-
<i><code>Easy, effortless Web Scraping as it should be!</code></i>
|
| 14 |
-
<br/><br/>
|
| 15 |
-
</div>
|
| 16 |
|
| 17 |
-
|
| 18 |
|
| 19 |
-
|
| 20 |
|
| 21 |
-
|
| 22 |
|
| 23 |
```python
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
```
|
| 34 |
|
| 35 |
## Top Sponsors
|
|
@@ -51,16 +63,27 @@ Built for the modern Web, Scrapling features **its own rapid parsing engine** an
|
|
| 51 |
|
| 52 |
## Key Features
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
### Advanced Websites Fetching with Session Support
|
| 55 |
- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP/3.
|
| 56 |
-
- **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium
|
| 57 |
-
- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can bypass all types of Cloudflare's Turnstile/Interstitial with automation
|
| 58 |
- **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
|
|
|
|
|
|
|
| 59 |
- **Async Support**: Complete async support across all fetchers and dedicated async session classes.
|
| 60 |
|
| 61 |
### Adaptive Scraping & AI Integration
|
| 62 |
- 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
|
| 63 |
-
- 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
|
| 64 |
- 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
|
| 65 |
- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 66 |
|
|
@@ -72,12 +95,12 @@ Built for the modern Web, Scrapling features **its own rapid parsing engine** an
|
|
| 72 |
|
| 73 |
### Developer/Web Scraper Friendly Experience
|
| 74 |
- 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping scripts development, like converting curl requests to Scrapling requests and viewing requests results in your browser.
|
| 75 |
-
- 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single code!
|
| 76 |
- 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
|
| 77 |
- 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.
|
| 78 |
- 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
|
| 79 |
- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
|
| 80 |
-
- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion.
|
| 81 |
- 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
|
| 82 |
|
| 83 |
|
|
@@ -86,10 +109,34 @@ Scrapling’s GitHub stars have grown steadily since its release (see chart belo
|
|
| 86 |
|
| 87 |
<div id="chartContainer">
|
| 88 |
<a href="https://github.com/D4Vinci/Scrapling">
|
| 89 |
-
<img id="chartImage" alt="Star History Chart" loading="lazy" src="https://api.star-history.com/svg?repos=D4Vinci/Scrapling&type=
|
| 90 |
</a>
|
| 91 |
</div>
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
## Installation
|
| 95 |
Scrapling requires Python 3.10 or higher:
|
|
@@ -98,7 +145,7 @@ Scrapling requires Python 3.10 or higher:
|
|
| 98 |
pip install scrapling
|
| 99 |
```
|
| 100 |
|
| 101 |
-
|
| 102 |
|
| 103 |
### Optional Dependencies
|
| 104 |
|
|
|
|
| 2 |
.md-typeset h1 {
|
| 3 |
display: none;
|
| 4 |
}
|
| 5 |
+
[data-md-color-scheme="default"] .only-dark { display: none; }
|
| 6 |
+
[data-md-color-scheme="slate"] .only-light { display: none; }
|
| 7 |
</style>
|
| 8 |
|
| 9 |
+
<br/>
|
| 10 |
<div align="center">
|
| 11 |
<a href="https://scrapling.readthedocs.io/en/latest/" alt="poster">
|
| 12 |
+
<img alt="Scrapling" src="assets/cover_light.svg" class="only-light">
|
| 13 |
+
<img alt="Scrapling" src="assets/cover_dark.svg" class="only-dark">
|
| 14 |
+
</a>
|
| 15 |
</div>
|
| 16 |
|
| 17 |
+
<h2 align="center"><i>Effortless Web Scraping for the Modern Web</i></h2><br>
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
Scrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl.
|
| 20 |
|
| 21 |
+
Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation — all in a few lines of Python. One library, zero compromises.
|
| 22 |
|
| 23 |
+
Blazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
|
| 24 |
|
| 25 |
```python
|
| 26 |
+
from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher
|
| 27 |
+
StealthyFetcher.adaptive = True
|
| 28 |
+
page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Fetch website under the radar!
|
| 29 |
+
products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
|
| 30 |
+
products = page.css('.product', adaptive=True) # Later, if the website structure changes, pass `adaptive=True` to find them!
|
| 31 |
+
```
|
| 32 |
+
Or scale up to full crawls
|
| 33 |
+
```python
|
| 34 |
+
from scrapling.spiders import Spider, Response
|
| 35 |
+
|
| 36 |
+
class MySpider(Spider):
|
| 37 |
+
name = "demo"
|
| 38 |
+
start_urls = ["https://example.com/"]
|
| 39 |
+
|
| 40 |
+
async def parse(self, response: Response):
|
| 41 |
+
for item in response.css('.product'):
|
| 42 |
+
yield {"title": item.css('h2::text').get()}
|
| 43 |
+
|
| 44 |
+
MySpider().start()
|
| 45 |
```
|
| 46 |
|
| 47 |
## Top Sponsors
|
|
|
|
| 63 |
|
| 64 |
## Key Features
|
| 65 |
|
| 66 |
+
### Spiders — A Full Crawling Framework
|
| 67 |
+
- 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects.
|
| 68 |
+
- ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays.
|
| 69 |
+
- 🔄 **Multi-Session Support**: Unified interface for HTTP requests, and stealthy headless browsers in a single spider — route requests to different sessions by ID.
|
| 70 |
+
- 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off.
|
| 71 |
+
- 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats — ideal for UI, pipelines, and long-running crawls.
|
| 72 |
+
- 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic.
|
| 73 |
+
- 📦 **Built-in Export**: Export results through hooks and your own pipeline or the built-in JSON/JSONL with `result.items.to_json()` / `result.items.to_jsonl()` respectively.
|
| 74 |
+
|
| 75 |
### Advanced Websites Fetching with Session Support
|
| 76 |
- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP/3.
|
| 77 |
+
- **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium and Google's Chrome.
|
| 78 |
+
- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can easily bypass all types of Cloudflare's Turnstile/Interstitial with automation.
|
| 79 |
- **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
|
| 80 |
+
- **Proxy Rotation**: Built-in `ProxyRotator` with cyclic or custom rotation strategies across all session types, plus per-request proxy overrides.
|
| 81 |
+
- **Domain Blocking**: Block requests to specific domains (and their subdomains) in browser-based fetchers.
|
| 82 |
- **Async Support**: Complete async support across all fetchers and dedicated async session classes.
|
| 83 |
|
| 84 |
### Adaptive Scraping & AI Integration
|
| 85 |
- 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
|
| 86 |
+
- 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
|
| 87 |
- 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
|
| 88 |
- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 89 |
|
|
|
|
| 95 |
|
| 96 |
### Developer/Web Scraper Friendly Experience
|
| 97 |
- 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping scripts development, like converting curl requests to Scrapling requests and viewing requests results in your browser.
|
| 98 |
+
- 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code!
|
| 99 |
- 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
|
| 100 |
- 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.
|
| 101 |
- 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
|
| 102 |
- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
|
| 103 |
+
- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. The entire codebase is automatically scanned with **PyRight** and **MyPy** with each change.
|
| 104 |
- 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
|
| 105 |
|
| 106 |
|
|
|
|
| 109 |
|
| 110 |
<div id="chartContainer">
|
| 111 |
<a href="https://github.com/D4Vinci/Scrapling">
|
| 112 |
+
<img id="chartImage" alt="Star History Chart" loading="lazy" src="https://api.star-history.com/svg?repos=D4Vinci/Scrapling&type=Date" height="400"/>
|
| 113 |
</a>
|
| 114 |
</div>
|
| 115 |
|
| 116 |
+
<script>
|
| 117 |
+
const observer = new MutationObserver((mutations) => {
|
| 118 |
+
mutations.forEach((mutation) => {
|
| 119 |
+
if (mutation.attributeName === 'data-md-color-media') {
|
| 120 |
+
const colorMedia = document.body.getAttribute('data-md-color-media');
|
| 121 |
+
const isDarkScheme = document.body.getAttribute('data-md-color-scheme') === 'slate';
|
| 122 |
+
const chartImg = document.querySelector('#chartImage');
|
| 123 |
+
const baseUrl = 'https://api.star-history.com/svg?repos=D4Vinci/Scrapling&type=Date';
|
| 124 |
+
|
| 125 |
+
if (colorMedia === '(prefers-color-scheme)' ? isDarkScheme : colorMedia.includes('dark')) {
|
| 126 |
+
chartImg.src = `${baseUrl}&theme=dark`;
|
| 127 |
+
} else {
|
| 128 |
+
chartImg.src = baseUrl;
|
| 129 |
+
}
|
| 130 |
+
}
|
| 131 |
+
});
|
| 132 |
+
});
|
| 133 |
+
|
| 134 |
+
observer.observe(document.body, {
|
| 135 |
+
attributes: true,
|
| 136 |
+
attributeFilter: ['data-md-color-media', 'data-md-color-scheme']
|
| 137 |
+
});
|
| 138 |
+
</script>
|
| 139 |
+
|
| 140 |
|
| 141 |
## Installation
|
| 142 |
Scrapling requires Python 3.10 or higher:
|
|
|
|
| 145 |
pip install scrapling
|
| 146 |
```
|
| 147 |
|
| 148 |
+
This installation only includes the parser engine and its dependencies, without any fetchers or command-line dependencies.
|
| 149 |
|
| 150 |
### Optional Dependencies
|
| 151 |
|
docs/overview.md
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
We will start by quickly reviewing the parsing capabilities. Then we will fetch websites using custom browsers, make requests, and parse the responses.
|
| 2 |
|
| 3 |
Here's an HTML document generated by ChatGPT that we will be using as an example throughout this page:
|
|
@@ -134,7 +148,7 @@ target_element.find_similar()
|
|
| 134 |
```
|
| 135 |
Find the first element that matches a CSS selector
|
| 136 |
```python
|
| 137 |
-
page.
|
| 138 |
# <data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>
|
| 139 |
```
|
| 140 |
Find all elements that match a CSS selector
|
|
@@ -144,7 +158,7 @@ page.css('.product-list article')
|
|
| 144 |
```
|
| 145 |
Find the first element that matches an XPath selector
|
| 146 |
```python
|
| 147 |
-
page.
|
| 148 |
# <data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>
|
| 149 |
```
|
| 150 |
Find all elements that match an XPath selector
|
|
@@ -220,14 +234,14 @@ Using the elements we found above
|
|
| 220 |
[<data='<section id="reviews"><h2>Customer Revie...' parent='<main><section id="products" schema='{"j...'>]
|
| 221 |
>>> section_element.next # gets the next element, the same logic applies to `quote.previous`.
|
| 222 |
<data='<section id="reviews"><h2>Customer Revie...' parent='<main><section id="products" schema='{"j...'>
|
| 223 |
-
>>> section_element.children.css('h2::text')
|
| 224 |
['Products']
|
| 225 |
-
>>> page.
|
| 226 |
True
|
| 227 |
```
|
| 228 |
If your case needs more than the element's parent, you can iterate over the whole ancestors' tree of any element, like the one below
|
| 229 |
```python
|
| 230 |
-
for ancestor in
|
| 231 |
# do something with it...
|
| 232 |
```
|
| 233 |
You can search for a specific ancestor of an element that satisfies a function; all you need to do is pass a function that takes a `Selector` object as an argument and returns `True` if the condition is satisfied or `False` otherwise, like below:
|
|
@@ -264,11 +278,11 @@ For Async requests, you will replace the import like below:
|
|
| 264 |
>>> page = await AsyncFetcher.delete('https://scrapling.requestcatcher.com/delete')
|
| 265 |
```
|
| 266 |
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
|
| 273 |
This is just the tip of the iceberg with this fetcher; check out the rest from [here](fetching/static.md)
|
| 274 |
|
|
@@ -279,11 +293,11 @@ The `DynamicFetcher` class (formerly `PlayWrightFetcher`) offers many options fo
|
|
| 279 |
```python
|
| 280 |
>>> from scrapling.fetchers import DynamicFetcher
|
| 281 |
>>> page = DynamicFetcher.fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
|
| 282 |
-
>>> page.
|
| 283 |
'https://github.com/D4Vinci/Scrapling'
|
| 284 |
>>> # The async version of fetch
|
| 285 |
>>> page = await DynamicFetcher.async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True)
|
| 286 |
-
>>> page.
|
| 287 |
'https://github.com/D4Vinci/Scrapling'
|
| 288 |
```
|
| 289 |
It's built on top of [Playwright](https://playwright.dev/python/), and it's currently providing two main run options that can be mixed as you want:
|
|
@@ -324,7 +338,7 @@ True
|
|
| 324 |
True
|
| 325 |
```
|
| 326 |
|
| 327 |
-
Again, this is just the tip of the iceberg with this fetcher. Check out the rest from [here](fetching/
|
| 328 |
|
| 329 |
---
|
| 330 |
|
|
|
|
| 1 |
+
## Pick Your Path
|
| 2 |
+
|
| 3 |
+
Not sure where to start? Pick the path that matches what you're trying to do:
|
| 4 |
+
|
| 5 |
+
| I want to... | Start here |
|
| 6 |
+
|:---|:---|
|
| 7 |
+
| **Parse HTML** I already have | [Querying elements](parsing/selection.md) — CSS, XPath, and text-based selection |
|
| 8 |
+
| **Quickly scrape a page** and prototype | Pick a [fetcher](fetching/choosing.md) and test right away, or launch the [interactive shell](cli/interactive-shell.md) |
|
| 9 |
+
| **Build a crawler** that scales | [Spiders](spiders/getting-started.md) — concurrent, multi-session crawls with pause/resume |
|
| 10 |
+
| **Scrape without writing code** | [CLI extract commands](cli/extract-commands.md) or hook up the [MCP server](ai/mcp-server.md) to your favourite AI tool |
|
| 11 |
+
| **Migrate** from another library | [From BeautifulSoup](tutorials/migrating_from_beautifulsoup.md) or [Scrapy comparison](spiders/architecture.md#comparison-with-scrapy) |
|
| 12 |
+
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
We will start by quickly reviewing the parsing capabilities. Then we will fetch websites using custom browsers, make requests, and parse the responses.
|
| 16 |
|
| 17 |
Here's an HTML document generated by ChatGPT that we will be using as an example throughout this page:
|
|
|
|
| 148 |
```
|
| 149 |
Find the first element that matches a CSS selector
|
| 150 |
```python
|
| 151 |
+
page.css('.product-list [data-id="1"]')[0]
|
| 152 |
# <data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>
|
| 153 |
```
|
| 154 |
Find all elements that match a CSS selector
|
|
|
|
| 158 |
```
|
| 159 |
Find the first element that matches an XPath selector
|
| 160 |
```python
|
| 161 |
+
page.xpath("//*[@id='products']/div/article")[0]
|
| 162 |
# <data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>
|
| 163 |
```
|
| 164 |
Find all elements that match an XPath selector
|
|
|
|
| 234 |
[<data='<section id="reviews"><h2>Customer Revie...' parent='<main><section id="products" schema='{"j...'>]
|
| 235 |
>>> section_element.next # gets the next element, the same logic applies to `quote.previous`.
|
| 236 |
<data='<section id="reviews"><h2>Customer Revie...' parent='<main><section id="products" schema='{"j...'>
|
| 237 |
+
>>> section_element.children.css('h2::text').getall()
|
| 238 |
['Products']
|
| 239 |
+
>>> page.css('[data-id="1"]')[0].has_class('product')
|
| 240 |
True
|
| 241 |
```
|
| 242 |
If your case needs more than the element's parent, you can iterate over the whole ancestors' tree of any element, like the one below
|
| 243 |
```python
|
| 244 |
+
for ancestor in section_element.iterancestors():
|
| 245 |
# do something with it...
|
| 246 |
```
|
| 247 |
You can search for a specific ancestor of an element that satisfies a function; all you need to do is pass a function that takes a `Selector` object as an argument and returns `True` if the condition is satisfied or `False` otherwise, like below:
|
|
|
|
| 278 |
>>> page = await AsyncFetcher.delete('https://scrapling.requestcatcher.com/delete')
|
| 279 |
```
|
| 280 |
|
| 281 |
+
!!! note "Notes:"
|
| 282 |
+
|
| 283 |
+
1. You have the `stealthy_headers` argument, which, when enabled, makes requests generate and use real browser headers, including a referer header, as if the request came from a Google search of this domain. It's enabled by default.
|
| 284 |
+
2. The `impersonate` argument lets you fake the TLS fingerprint for a specific browser version.
|
| 285 |
+
3. There's also the `http3` argument, which, when enabled, makes the fetcher use HTTP/3 for requests, making them more authentic.
|
| 286 |
|
| 287 |
This is just the tip of the iceberg with this fetcher; check out the rest from [here](fetching/static.md)
|
| 288 |
|
|
|
|
| 293 |
```python
|
| 294 |
>>> from scrapling.fetchers import DynamicFetcher
|
| 295 |
>>> page = DynamicFetcher.fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
|
| 296 |
+
>>> page.css("#search a::attr(href)").get()
|
| 297 |
'https://github.com/D4Vinci/Scrapling'
|
| 298 |
>>> # The async version of fetch
|
| 299 |
>>> page = await DynamicFetcher.async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True)
|
| 300 |
+
>>> page.css("#search a::attr(href)").get()
|
| 301 |
'https://github.com/D4Vinci/Scrapling'
|
| 302 |
```
|
| 303 |
It's built on top of [Playwright](https://playwright.dev/python/), and it's currently providing two main run options that can be mixed as you want:
|
|
|
|
| 338 |
True
|
| 339 |
```
|
| 340 |
|
| 341 |
+
Again, this is just the tip of the iceberg with this fetcher. Check out the rest from [here](fetching/stealthy.md) for all details and the complete list of arguments.
|
| 342 |
|
| 343 |
---
|
| 344 |
|
docs/parsing/adaptive.md
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
> <br><br>
|
| 8 |
|
| 9 |
Adaptive scraping (previously known as automatch) is one of Scrapling's most powerful features. It allows your scraper to survive website changes by intelligently tracking and relocating elements.
|
| 10 |
|
|
@@ -84,11 +83,11 @@ Now, let's test the same selector in both versions
|
|
| 84 |
>> Fetcher.configure(adaptive = True, adaptive_domain='stackoverflow.com')
|
| 85 |
>>
|
| 86 |
>> page = Fetcher.get(old_url, timeout=30)
|
| 87 |
-
>> element1 = page.
|
| 88 |
>>
|
| 89 |
>> # Same selector but used in the updated website
|
| 90 |
>> page = Fetcher.get(new_url)
|
| 91 |
-
>> element2 = page.
|
| 92 |
>>
|
| 93 |
>> if element1.text == element2.text:
|
| 94 |
... print('Scrapling found the same element in the old and new designs!')
|
|
@@ -100,7 +99,9 @@ The code will be the same in a real-world scenario, except it will use the same
|
|
| 100 |
|
| 101 |
Hence, in the two examples above, I used both the `Selector` and `Fetcher` classes to show that the adaptive logic is the same.
|
| 102 |
|
| 103 |
-
|
|
|
|
|
|
|
| 104 |
|
| 105 |
## How the adaptive scraping feature works
|
| 106 |
Adaptive scraping works in two phases:
|
|
@@ -144,7 +145,7 @@ Examples:
|
|
| 144 |
>>> page = Selector(html_doc, adaptive=True)
|
| 145 |
# OR
|
| 146 |
>>> Fetcher.adaptive = True
|
| 147 |
-
>>> page = Fetcher.
|
| 148 |
```
|
| 149 |
If you are using the [Selector](main_classes.md#selector) class, you need to pass the url of the website you are using with the argument `url` so Scrapling can separate the properties saved for each element by domain.
|
| 150 |
|
|
@@ -157,7 +158,7 @@ Now that you've enabled the `adaptive` feature globally, you have two main ways
|
|
| 157 |
### The CSS/XPath Selection way
|
| 158 |
As you have seen in the example above, first, you have to use the `auto_save` argument while selecting an element that exists on the page, like below
|
| 159 |
```python
|
| 160 |
-
element = page.css('#p1' auto_save=True)
|
| 161 |
```
|
| 162 |
And when the element doesn't exist, you can use the same selector and the `adaptive` argument, and the library will find it for you
|
| 163 |
```python
|
|
@@ -165,7 +166,7 @@ element = page.css('#p1', adaptive=True)
|
|
| 165 |
```
|
| 166 |
Pretty simple, eh?
|
| 167 |
|
| 168 |
-
Well, a lot happened under the hood here. Remember the identifier we mentioned before that you need to set to retrieve the element you want? Here, with the `css`/`
|
| 169 |
|
| 170 |
Additionally, for all these methods, you can pass the `identifier` argument to set it yourself. This is useful in some instances, or you can use it to save properties with the `auto_save` argument.
|
| 171 |
|
|
@@ -185,7 +186,7 @@ Now, later, when you want to retrieve it and relocate it inside the page with `a
|
|
| 185 |
>>> element_dict = page.retrieve('my_special_element')
|
| 186 |
>>> page.relocate(element_dict, selector_type=True)
|
| 187 |
[<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]
|
| 188 |
-
>>> page.relocate(element_dict, selector_type=True).css('::text')
|
| 189 |
['Tipping the Velvet']
|
| 190 |
```
|
| 191 |
Hence, the `retrieve` and `relocate` methods are used.
|
|
|
|
| 1 |
+
# Adaptive scraping
|
| 2 |
|
| 3 |
+
!!! success "Prerequisites"
|
| 4 |
+
|
| 5 |
+
1. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector) object.
|
| 6 |
+
2. You've completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) class.
|
|
|
|
| 7 |
|
| 8 |
Adaptive scraping (previously known as automatch) is one of Scrapling's most powerful features. It allows your scraper to survive website changes by intelligently tracking and relocating elements.
|
| 9 |
|
|
|
|
| 83 |
>> Fetcher.configure(adaptive = True, adaptive_domain='stackoverflow.com')
|
| 84 |
>>
|
| 85 |
>> page = Fetcher.get(old_url, timeout=30)
|
| 86 |
+
>> element1 = page.css(selector, auto_save=True)[0]
|
| 87 |
>>
|
| 88 |
>> # Same selector but used in the updated website
|
| 89 |
>> page = Fetcher.get(new_url)
|
| 90 |
+
>> element2 = page.css(selector, adaptive=True)[0]
|
| 91 |
>>
|
| 92 |
>> if element1.text == element2.text:
|
| 93 |
... print('Scrapling found the same element in the old and new designs!')
|
|
|
|
| 99 |
|
| 100 |
Hence, in the two examples above, I used both the `Selector` and `Fetcher` classes to show that the adaptive logic is the same.
|
| 101 |
|
| 102 |
+
!!! info
|
| 103 |
+
|
| 104 |
+
The main reason for creating the `adaptive_domain` argument was to handle if the website changed its URL while changing the design/structure. In that case, you can use it to continue using the previously stored adaptive data for the new URL. Otherwise, scrapling will consider it a new website and discard the old data.
|
| 105 |
|
| 106 |
## How the adaptive scraping feature works
|
| 107 |
Adaptive scraping works in two phases:
|
|
|
|
| 145 |
>>> page = Selector(html_doc, adaptive=True)
|
| 146 |
# OR
|
| 147 |
>>> Fetcher.adaptive = True
|
| 148 |
+
>>> page = Fetcher.get('https://example.com')
|
| 149 |
```
|
| 150 |
If you are using the [Selector](main_classes.md#selector) class, you need to pass the url of the website you are using with the argument `url` so Scrapling can separate the properties saved for each element by domain.
|
| 151 |
|
|
|
|
| 158 |
### The CSS/XPath Selection way
|
| 159 |
As you have seen in the example above, first, you have to use the `auto_save` argument while selecting an element that exists on the page, like below
|
| 160 |
```python
|
| 161 |
+
element = page.css('#p1', auto_save=True)
|
| 162 |
```
|
| 163 |
And when the element doesn't exist, you can use the same selector and the `adaptive` argument, and the library will find it for you
|
| 164 |
```python
|
|
|
|
| 166 |
```
|
| 167 |
Pretty simple, eh?
|
| 168 |
|
| 169 |
+
Well, a lot happened under the hood here. Remember the identifier we mentioned before that you need to set to retrieve the element you want? Here, with the `css`/`xpath` methods, the identifier is set automatically as the selector you passed here to make things easier :)
|
| 170 |
|
| 171 |
Additionally, for all these methods, you can pass the `identifier` argument to set it yourself. This is useful in some instances, or you can use it to save properties with the `auto_save` argument.
|
| 172 |
|
|
|
|
| 186 |
>>> element_dict = page.retrieve('my_special_element')
|
| 187 |
>>> page.relocate(element_dict, selector_type=True)
|
| 188 |
[<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]
|
| 189 |
+
>>> page.relocate(element_dict, selector_type=True).css('::text').getall()
|
| 190 |
['Tipping the Velvet']
|
| 191 |
```
|
| 192 |
Hence, the `retrieve` and `relocate` methods are used.
|
docs/parsing/main_classes.md
CHANGED
|
@@ -1,9 +1,8 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
> <br><br>
|
| 7 |
|
| 8 |
After exploring the various ways to select elements with Scrapling and its related features, let's take a step back and examine the [Selector](#selector) class in general, as well as other objects, to gain a better understanding of the parsing engine.
|
| 9 |
|
|
@@ -166,10 +165,10 @@ print(article.prettify())
|
|
| 166 |
<div class="hidden stock">In stock: 5</div>
|
| 167 |
</article>
|
| 168 |
```
|
| 169 |
-
Use the `.body` property to get the raw content of the page
|
| 170 |
```python
|
| 171 |
>>> page.body
|
| 172 |
-
'<html>\n <head>\n <title>Some page</title>\n </head>\n
|
| 173 |
```
|
| 174 |
To get all the ancestors in the DOM tree of this element
|
| 175 |
```python
|
|
@@ -234,7 +233,7 @@ This element returns the same result as the `children` property because its chil
|
|
| 234 |
|
| 235 |
Another example of using the element with the `product-list` class will clear the difference between the `children` property and the `below_elements` property
|
| 236 |
```python
|
| 237 |
-
>>> products_list = page.
|
| 238 |
>>> products_list.children
|
| 239 |
[<data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>,
|
| 240 |
<data='<article class="product" data-id="2"><h3...' parent='<div class="product-list"> <article clas...'>,
|
|
@@ -263,7 +262,7 @@ Get the next element of the current element
|
|
| 263 |
The same logic applies to the `previous` property
|
| 264 |
```python
|
| 265 |
>>> article.previous # It's the first child, so it doesn't have a previous element
|
| 266 |
-
>>> second_article = page.
|
| 267 |
>>> second_article.previous
|
| 268 |
<data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>
|
| 269 |
```
|
|
@@ -277,7 +276,7 @@ If your case needs more than the element's parent, you can iterate over the whol
|
|
| 277 |
for ancestor in article.iterancestors():
|
| 278 |
# do something with it...
|
| 279 |
```
|
| 280 |
-
You can search for a specific ancestor of an element that satisfies a search function; all you need to do is
|
| 281 |
```python
|
| 282 |
>>> article.find_ancestor(lambda ancestor: ancestor.has_class('product-list'))
|
| 283 |
<data='<div class="product-list"> <article clas...' parent='<body> <div class="product-list"> <artic...'>
|
|
@@ -288,33 +287,63 @@ You can search for a specific ancestor of an element that satisfies a search fun
|
|
| 288 |
## Selectors
|
| 289 |
The class `Selectors` is the "List" version of the [Selector](#selector) class. It inherits from the Python standard `List` type, so it shares all `List` properties and methods while adding more methods to make the operations you want to execute on the [Selector](#selector) instances within more straightforward.
|
| 290 |
|
| 291 |
-
In the [Selector](#selector) class, all methods/properties that should return a group of elements return them as a [Selectors](#selectors) class instance.
|
| 292 |
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
|
| 311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
### Properties
|
| 313 |
Apart from the standard operations on Python lists, such as iteration and slicing.
|
| 314 |
|
| 315 |
You can do the following:
|
| 316 |
|
| 317 |
-
Execute CSS and XPath selectors directly on the [Selector](#selector) instances it has, while the
|
| 318 |
```python
|
| 319 |
>>> page.css('.product_pod a')
|
| 320 |
[<data='<a href="catalogue/a-light-in-the-attic_...' parent='<div class="image_container"> <a href="c...'>,
|
|
@@ -370,6 +399,15 @@ You can use the `filter` method, too, which takes a function like the `search` m
|
|
| 370 |
<data='<article class="product_pod"><div class=...' parent='<li class="col-xs-6 col-sm-4 col-md-3 co...'>,
|
| 371 |
...]
|
| 372 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
If you are too lazy like me and want to know the number of [Selector](#selector) instances in a [Selectors](#selectors) instance. You can do this:
|
| 374 |
```python
|
| 375 |
page.css('.product_pod').length
|
|
@@ -441,14 +479,14 @@ First, we start with the `re` and `re_first` methods. These are the same methods
|
|
| 441 |
|
| 442 |
- You also have the `.json()` method, which tries to convert the content to a JSON object quickly if possible; otherwise, it throws an error
|
| 443 |
```python
|
| 444 |
-
>>> page.
|
| 445 |
'\n {\n "lastUpdated": "2024-09-22T10:30:00Z",\n "totalProducts": 3\n }\n '
|
| 446 |
-
>>> page.
|
| 447 |
{'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
|
| 448 |
```
|
| 449 |
Hence, if you didn't specify a text node while selecting an element (like the text content or an attribute text content), the text content will be selected automatically, like this
|
| 450 |
```python
|
| 451 |
-
>>> page.
|
| 452 |
{'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
|
| 453 |
```
|
| 454 |
The [Selector](#selector) class adds one thing here, too; let's say this is the page we are working with:
|
|
@@ -469,12 +507,12 @@ First, we start with the `re` and `re_first` methods. These are the same methods
|
|
| 469 |
The [Selector](#selector) class has the `get_all_text` method, which you should be aware of by now. This method returns a `TextHandler`, of course.<br/><br/>
|
| 470 |
So, as you know here, if you did something like this
|
| 471 |
```python
|
| 472 |
-
>>> page.
|
| 473 |
```
|
| 474 |
You will get an error because the `div` tag doesn't have any direct text content that can be serialized to JSON; it doesn't have any direct text content at all.<br/><br/>
|
| 475 |
In this case, the `get_all_text` method comes to the rescue, so you can do something like that
|
| 476 |
```python
|
| 477 |
-
>>> page.
|
| 478 |
{'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
|
| 479 |
```
|
| 480 |
I used the `ignore_tags` argument here because the default value of it is `('script', 'style',)`, as you are aware.<br/><br/>
|
|
@@ -493,7 +531,7 @@ First, we start with the `re` and `re_first` methods. These are the same methods
|
|
| 493 |
{'some_key': 'some_value'}
|
| 494 |
```
|
| 495 |
You might wonder how this happened, given that the `html` tag doesn't contain direct text.<br/>
|
| 496 |
-
Well, for cases like JSON responses, I made the [Selector](#selector) class keep a raw copy of the content it receives. This way, when you use the `.json()` method, it checks for that raw copy and then converts it to JSON. If the raw copy is
|
| 497 |
|
| 498 |
- Another handy method is `.clean()`, which will remove all white spaces and consecutive spaces for you and return a new `TextHandler` instance
|
| 499 |
```python
|
|
@@ -521,7 +559,7 @@ You probably guessed it: This class is similar to [Selectors](#selectors) and [S
|
|
| 521 |
The only difference is that the `re_first` method logic here runs `re` on each [TextHandler](#texthandler) and returns the first result, or `None`. Nothing new needs to be explained here, but new methods will be added over time.
|
| 522 |
|
| 523 |
## AttributesHandler
|
| 524 |
-
This is a read-only version of Python's standard dictionary, or `dict`, used solely to store the attributes of each element
|
| 525 |
```python
|
| 526 |
>>> print(page.find('script').attrib)
|
| 527 |
{'id': 'page-data', 'type': 'application/json'}
|
|
|
|
| 1 |
+
# Parsing main classes
|
| 2 |
|
| 3 |
+
!!! success "Prerequisites"
|
| 4 |
+
|
| 5 |
+
- You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector) object.
|
|
|
|
| 6 |
|
| 7 |
After exploring the various ways to select elements with Scrapling and its related features, let's take a step back and examine the [Selector](#selector) class in general, as well as other objects, to gain a better understanding of the parsing engine.
|
| 8 |
|
|
|
|
| 165 |
<div class="hidden stock">In stock: 5</div>
|
| 166 |
</article>
|
| 167 |
```
|
| 168 |
+
Use the `.body` property to get the raw content of the page. Starting from v0.4, when used on a `Response` object from fetchers, `.body` always returns `bytes`.
|
| 169 |
```python
|
| 170 |
>>> page.body
|
| 171 |
+
'<html>\n <head>\n <title>Some page</title>\n </head>\n ...'
|
| 172 |
```
|
| 173 |
To get all the ancestors in the DOM tree of this element
|
| 174 |
```python
|
|
|
|
| 233 |
|
| 234 |
Another example of using the element with the `product-list` class will clear the difference between the `children` property and the `below_elements` property
|
| 235 |
```python
|
| 236 |
+
>>> products_list = page.css('.product-list')[0]
|
| 237 |
>>> products_list.children
|
| 238 |
[<data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>,
|
| 239 |
<data='<article class="product" data-id="2"><h3...' parent='<div class="product-list"> <article clas...'>,
|
|
|
|
| 262 |
The same logic applies to the `previous` property
|
| 263 |
```python
|
| 264 |
>>> article.previous # It's the first child, so it doesn't have a previous element
|
| 265 |
+
>>> second_article = page.css('.product[data-id="2"]')[0]
|
| 266 |
>>> second_article.previous
|
| 267 |
<data='<article class="product" data-id="1"><h3...' parent='<div class="product-list"> <article clas...'>
|
| 268 |
```
|
|
|
|
| 276 |
for ancestor in article.iterancestors():
|
| 277 |
# do something with it...
|
| 278 |
```
|
| 279 |
+
You can search for a specific ancestor of an element that satisfies a search function; all you need to do is pass a function that takes a [Selector](#selector) object as an argument and returns `True` if the condition is satisfied or `False` otherwise, like below:
|
| 280 |
```python
|
| 281 |
>>> article.find_ancestor(lambda ancestor: ancestor.has_class('product-list'))
|
| 282 |
<data='<div class="product-list"> <article clas...' parent='<body> <div class="product-list"> <artic...'>
|
|
|
|
| 287 |
## Selectors
|
| 288 |
The class `Selectors` is the "List" version of the [Selector](#selector) class. It inherits from the Python standard `List` type, so it shares all `List` properties and methods while adding more methods to make the operations you want to execute on the [Selector](#selector) instances within more straightforward.
|
| 289 |
|
| 290 |
+
In the [Selector](#selector) class, all methods/properties that should return a group of elements return them as a [Selectors](#selectors) class instance.
|
| 291 |
|
| 292 |
+
Starting with v0.4, all selection methods consistently return [Selector](#selector)/[Selectors](#selectors) objects, even for text nodes and attribute values. Text nodes (selected via `::text`, `/text()`, `::attr()`, `/@attr`) are wrapped in [Selector](#selector) objects. These text node selectors have `tag` set to `"#text"`, and their `text` property returns the text value. You can still access the text value directly, and all other properties return empty/default values gracefully.
|
| 293 |
+
|
| 294 |
+
```python
|
| 295 |
+
>>> page.css('a::text') # -> Selectors (of text node Selectors)
|
| 296 |
+
>>> page.xpath('//a/text()') # -> Selectors
|
| 297 |
+
>>> page.css('a::text').get() # -> TextHandler (the first text value)
|
| 298 |
+
>>> page.css('a::text').getall() # -> TextHandlers (all text values)
|
| 299 |
+
>>> page.css('a::attr(href)') # -> Selectors
|
| 300 |
+
>>> page.xpath('//a/@href') # -> Selectors
|
| 301 |
+
>>> page.css('.price_color') # -> Selectors
|
| 302 |
+
```
|
| 303 |
+
|
| 304 |
+
### Data extraction methods
|
| 305 |
+
Starting with v0.4, [Selector](#selector) and [Selectors](#selectors) both provide `get()`, `getall()`, and their aliases `extract_first` and `extract` (following Scrapy conventions). The old `get_all()` method has been removed.
|
| 306 |
+
|
| 307 |
+
**On a [Selector](#selector) object:**
|
| 308 |
+
|
| 309 |
+
- `get()` returns a `TextHandler` — for text node selectors, it returns the text value; for HTML element selectors, it returns the serialized outer HTML.
|
| 310 |
+
- `getall()` returns a `TextHandlers` list containing the single serialized string.
|
| 311 |
+
- `extract_first` is an alias for `get()`, and `extract` is an alias for `getall()`.
|
| 312 |
+
|
| 313 |
+
```python
|
| 314 |
+
>>> page.css('h3')[0].get() # Outer HTML of the element
|
| 315 |
+
'<h3>Product 1</h3>'
|
| 316 |
|
| 317 |
+
>>> page.css('h3::text')[0].get() # Text value of the text node
|
| 318 |
+
'Product 1'
|
| 319 |
+
```
|
| 320 |
+
|
| 321 |
+
**On a [Selectors](#selectors) object:**
|
| 322 |
+
|
| 323 |
+
- `get(default=None)` returns the serialized string of the **first** element, or `default` if the list is empty.
|
| 324 |
+
- `getall()` serializes **all** elements and returns a `TextHandlers` list.
|
| 325 |
+
- `extract_first` is an alias for `get()`, and `extract` is an alias for `getall()`.
|
| 326 |
+
|
| 327 |
+
```python
|
| 328 |
+
>>> page.css('.price::text').get() # First price text
|
| 329 |
+
'$10.99'
|
| 330 |
+
|
| 331 |
+
>>> page.css('.price::text').getall() # All price texts
|
| 332 |
+
['$10.99', '$20.99', '$15.99']
|
| 333 |
+
|
| 334 |
+
>>> page.css('.price::text').get('') # With default value
|
| 335 |
+
'$10.99'
|
| 336 |
+
```
|
| 337 |
+
|
| 338 |
+
These methods work seamlessly with all selection types (CSS, XPath, `find`, etc.) and are the recommended way to extract text and attribute values in a Scrapy-compatible style.
|
| 339 |
+
|
| 340 |
+
Now, let's see what [Selectors](#selectors) class adds to the table with that out of the way.
|
| 341 |
### Properties
|
| 342 |
Apart from the standard operations on Python lists, such as iteration and slicing.
|
| 343 |
|
| 344 |
You can do the following:
|
| 345 |
|
| 346 |
+
Execute CSS and XPath selectors directly on the [Selector](#selector) instances it has, while the return types are the same as [Selector](#selector)'s `css` and `xpath` methods. The arguments are similar, except the `adaptive` argument is not available here. This, of course, makes chaining methods very straightforward.
|
| 347 |
```python
|
| 348 |
>>> page.css('.product_pod a')
|
| 349 |
[<data='<a href="catalogue/a-light-in-the-attic_...' parent='<div class="image_container"> <a href="c...'>,
|
|
|
|
| 399 |
<data='<article class="product_pod"><div class=...' parent='<li class="col-xs-6 col-sm-4 col-md-3 co...'>,
|
| 400 |
...]
|
| 401 |
```
|
| 402 |
+
You can safely access the first or last element without worrying about index errors:
|
| 403 |
+
```python
|
| 404 |
+
>>> page.css('.product').first # First Selector or None
|
| 405 |
+
<data='<article class="product" data-id="1"><h3...'>
|
| 406 |
+
>>> page.css('.product').last # Last Selector or None
|
| 407 |
+
<data='<article class="product" data-id="3"><h3...'>
|
| 408 |
+
>>> page.css('.nonexistent').first # Returns None instead of raising IndexError
|
| 409 |
+
```
|
| 410 |
+
|
| 411 |
If you are too lazy like me and want to know the number of [Selector](#selector) instances in a [Selectors](#selectors) instance. You can do this:
|
| 412 |
```python
|
| 413 |
page.css('.product_pod').length
|
|
|
|
| 479 |
|
| 480 |
- You also have the `.json()` method, which tries to convert the content to a JSON object quickly if possible; otherwise, it throws an error
|
| 481 |
```python
|
| 482 |
+
>>> page.css('#page-data::text').get()
|
| 483 |
'\n {\n "lastUpdated": "2024-09-22T10:30:00Z",\n "totalProducts": 3\n }\n '
|
| 484 |
+
>>> page.css('#page-data::text').get().json()
|
| 485 |
{'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
|
| 486 |
```
|
| 487 |
Hence, if you didn't specify a text node while selecting an element (like the text content or an attribute text content), the text content will be selected automatically, like this
|
| 488 |
```python
|
| 489 |
+
>>> page.css('#page-data')[0].json()
|
| 490 |
{'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
|
| 491 |
```
|
| 492 |
The [Selector](#selector) class adds one thing here, too; let's say this is the page we are working with:
|
|
|
|
| 507 |
The [Selector](#selector) class has the `get_all_text` method, which you should be aware of by now. This method returns a `TextHandler`, of course.<br/><br/>
|
| 508 |
So, as you know here, if you did something like this
|
| 509 |
```python
|
| 510 |
+
>>> page.css('div::text').get().json()
|
| 511 |
```
|
| 512 |
You will get an error because the `div` tag doesn't have any direct text content that can be serialized to JSON; it doesn't have any direct text content at all.<br/><br/>
|
| 513 |
In this case, the `get_all_text` method comes to the rescue, so you can do something like that
|
| 514 |
```python
|
| 515 |
+
>>> page.css('div')[0].get_all_text(ignore_tags=[]).json()
|
| 516 |
{'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}
|
| 517 |
```
|
| 518 |
I used the `ignore_tags` argument here because its default value is `('script', 'style',)`, as you are aware.<br/><br/>
|
|
|
|
| 531 |
{'some_key': 'some_value'}
|
| 532 |
```
|
| 533 |
You might wonder how this happened, given that the `html` tag doesn't contain direct text.<br/>
|
| 534 |
+
Well, for cases like JSON responses, I made the [Selector](#selector) class keep a raw copy of the content it receives. This way, when you use the `.json()` method, it checks for that raw copy and then converts it to JSON. If the raw copy is unavailable, as with the elements, it checks the current element's text content; otherwise, it uses the `get_all_text` method directly.<br/>
|
| 535 |
|
| 536 |
- Another handy method is `.clean()`, which will remove all white spaces and consecutive spaces for you and return a new `TextHandler` instance
|
| 537 |
```python
|
|
|
|
| 559 |
The only difference is that the `re_first` method logic here runs `re` on each [TextHandler](#texthandler) and returns the first result, or `None`. Nothing new needs to be explained here, but new methods will be added over time.
|
| 560 |
|
| 561 |
## AttributesHandler
|
| 562 |
+
This is a read-only version of Python's standard dictionary, or `dict`, used solely to store the attributes of each element/[Selector](#selector) instance.
|
| 563 |
```python
|
| 564 |
>>> print(page.find('script').attrib)
|
| 565 |
{'id': 'page-data', 'type': 'application/json'}
|
docs/parsing/selection.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
#
|
| 2 |
Scrapling currently supports parsing HTML pages exclusively, so it doesn't support XML feeds. This decision was made because the adaptive feature won't work with XML, but that might change soon, so stay tuned :)
|
| 3 |
|
| 4 |
In Scrapling, there are five main ways to find elements:
|
|
@@ -27,16 +27,16 @@ Also, Scrapling implements some non-standard pseudo-elements like:
|
|
| 27 |
|
| 28 |
In short, if you come from Scrapy/Parsel, you will find the same logic for selectors here to make it easier. No need to implement a stranger logic to the one that most of us are used to :)
|
| 29 |
|
| 30 |
-
To select elements with CSS selectors,
|
| 31 |
|
| 32 |
### What are XPath selectors?
|
| 33 |
[XPath](https://en.wikipedia.org/wiki/XPath) is a language for selecting nodes in XML documents, which can also be used with HTML. This [cheatsheet](https://devhints.io/xpath) is a good resource for learning about [XPath](https://en.wikipedia.org/wiki/XPath). Scrapling adds XPath selectors directly through [lxml](https://lxml.de/).
|
| 34 |
|
| 35 |
In short, it is the same situation as CSS Selectors; if you come from Scrapy/Parsel, you will find the same logic for selectors here. However, Scrapling doesn't implement the XPath extension function `has-class` as Scrapy/Parsel does. Instead, it provides the `has_class` method, which can be used on elements returned for the same purpose.
|
| 36 |
|
| 37 |
-
To select elements with XPath selectors, you have the `xpath`
|
| 38 |
|
| 39 |
-
> Note that each method of `css`
|
| 40 |
|
| 41 |
### Selectors examples
|
| 42 |
Let's see some shared examples of using CSS and XPath Selectors.
|
|
@@ -46,43 +46,40 @@ Select all elements with the class `product`.
|
|
| 46 |
products = page.css('.product')
|
| 47 |
products = page.xpath('//*[@class="product"]')
|
| 48 |
```
|
| 49 |
-
|
|
|
|
|
|
|
| 50 |
|
| 51 |
Select the first element with the class `product`.
|
| 52 |
```python
|
| 53 |
-
product = page.css_first('.product')
|
| 54 |
-
product = page.xpath_first('//*[@class="product"]')
|
| 55 |
-
```
|
| 56 |
-
Which would be the same as doing (but a bit slower)
|
| 57 |
-
```python
|
| 58 |
product = page.css('.product')[0]
|
| 59 |
product = page.xpath('//*[@class="product"]')[0]
|
| 60 |
```
|
| 61 |
Get the text of the first element with the `h1` tag name
|
| 62 |
```python
|
| 63 |
-
title = page.
|
| 64 |
-
title = page.
|
| 65 |
```
|
| 66 |
-
Which is
|
| 67 |
```python
|
| 68 |
-
title = page.
|
| 69 |
-
title = page.
|
| 70 |
```
|
| 71 |
-
Get the `href` attribute of the first element with the `a` tag name
|
| 72 |
```python
|
| 73 |
-
link = page.
|
| 74 |
-
link = page.
|
| 75 |
```
|
| 76 |
Select the text of the first element with the `h1` tag name, which contains `Phone`, and under an element with class `product`.
|
| 77 |
```python
|
| 78 |
-
title = page.
|
| 79 |
-
title = page.
|
| 80 |
```
|
| 81 |
You can nest and chain selectors as you want, given that they return results
|
| 82 |
```python
|
| 83 |
-
page.
|
| 84 |
-
page.
|
| 85 |
-
page.
|
| 86 |
```
|
| 87 |
Another example
|
| 88 |
|
|
@@ -91,7 +88,7 @@ All links that have 'image' in their 'href' attribute
|
|
| 91 |
links = page.css('a[href*="image"]')
|
| 92 |
links = page.xpath('//a[contains(@href, "image")]')
|
| 93 |
for index, link in enumerate(links):
|
| 94 |
-
link_value = link.attrib['href'] # Cleaner than link.css('::attr(href)')
|
| 95 |
link_text = link.text
|
| 96 |
print(f'Link number {index} points to this url {link_value} with text content as "{link_text}"')
|
| 97 |
```
|
|
@@ -114,7 +111,9 @@ By default, Scrapling searches for the exact matching of the text/pattern you pa
|
|
| 114 |
|
| 115 |
* **partial**: If enabled, `find_by_text` will return elements that contain the input text. So it's not an exact match anymore
|
| 116 |
|
| 117 |
-
|
|
|
|
|
|
|
| 118 |
|
| 119 |
### Finding Similar Elements
|
| 120 |
One of the most remarkable new features Scrapling puts on the table is the ability to tell Scrapling to find elements similar to the element at hand. This feature's inspiration came from the AutoScraper library, but in Scrapling, it can be used on elements found by any method. Most of its usage would likely occur after finding elements through text content, similar to how AutoScraper works, making it convenient to explain here.
|
|
@@ -239,9 +238,9 @@ To increase the complexity a little bit, let's say we want to get all the books'
|
|
| 239 |
```python
|
| 240 |
>>> for product in element.parent.parent.find_similar():
|
| 241 |
print({
|
| 242 |
-
"name": product.
|
| 243 |
-
"price": product.
|
| 244 |
-
"stock": product.css('.availability::text')[-1].clean()
|
| 245 |
})
|
| 246 |
{'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}
|
| 247 |
{'name': 'Soumission', 'price': '50.10', 'stock': 'In stock'}
|
|
@@ -264,10 +263,10 @@ def extract_product_grid(page):
|
|
| 264 |
|
| 265 |
return [
|
| 266 |
{
|
| 267 |
-
'name': p.
|
| 268 |
-
'price': p.
|
| 269 |
'stock': 'In stock' in p.text,
|
| 270 |
-
'rating': p.
|
| 271 |
}
|
| 272 |
for p in products
|
| 273 |
]
|
|
@@ -276,16 +275,16 @@ Table Row Extraction
|
|
| 276 |
```python
|
| 277 |
def extract_table_data(page):
|
| 278 |
# Find the first data row
|
| 279 |
-
first_row = page.
|
| 280 |
|
| 281 |
# Find similar rows
|
| 282 |
rows = first_row.find_similar()
|
| 283 |
|
| 284 |
return [
|
| 285 |
{
|
| 286 |
-
'column1': row.
|
| 287 |
-
'column2': row.
|
| 288 |
-
'column3': row.
|
| 289 |
}
|
| 290 |
for row in rows
|
| 291 |
]
|
|
@@ -294,7 +293,7 @@ Form Field Extraction
|
|
| 294 |
```python
|
| 295 |
def extract_form_fields(page):
|
| 296 |
# Find first form field container
|
| 297 |
-
first_field = page.
|
| 298 |
lambda e: e.has_class('form-field')
|
| 299 |
)
|
| 300 |
|
|
@@ -303,9 +302,9 @@ def extract_form_fields(page):
|
|
| 303 |
|
| 304 |
return [
|
| 305 |
{
|
| 306 |
-
'label': f.
|
| 307 |
-
'type': f.
|
| 308 |
-
'required': 'required' in f.
|
| 309 |
}
|
| 310 |
for f in fields
|
| 311 |
]
|
|
@@ -324,9 +323,9 @@ def extract_reviews(page):
|
|
| 324 |
|
| 325 |
return [
|
| 326 |
{
|
| 327 |
-
'text': r.
|
| 328 |
'rating': r.attrib.get('data-rating'),
|
| 329 |
-
'author': r.
|
| 330 |
}
|
| 331 |
for r in all_reviews
|
| 332 |
]
|
|
@@ -354,10 +353,10 @@ It filters all elements in the current page/element in the following order:
|
|
| 354 |
3. All elements that match all passed regex patterns are collected, or if previous filter(s) are used, then previously collected elements are filtered.
|
| 355 |
4. All elements that fulfill all passed function(s) are collected; if a previous filter(s) is used, then previously collected elements are filtered.
|
| 356 |
|
| 357 |
-
Notes:
|
| 358 |
|
| 359 |
-
1. As you probably understood, the filtering process always starts from the first filter it finds in the filtering order above. So, if no tag name(s) are passed but attributes are passed, the process starts from that step (number 2), and so on.
|
| 360 |
-
2. The order in which you pass the arguments doesn't matter. The only order considered is the one explained above.
|
| 361 |
|
| 362 |
Check examples to clear any confusion :)
|
| 363 |
|
|
@@ -396,10 +395,10 @@ Find all elements with a class that equals `quote`.
|
|
| 396 |
```
|
| 397 |
Find all div elements with a class that equals `quote` and contains the element `.text`, which contains the word 'world' in its content.
|
| 398 |
```python
|
| 399 |
-
>>> page.find_all('div', {'class': 'quote'}, lambda e: "world" in e.
|
| 400 |
[<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>]
|
| 401 |
```
|
| 402 |
-
Find all elements that
|
| 403 |
```python
|
| 404 |
>>> page.find_all(lambda element: len(element.children) > 0)
|
| 405 |
[<data='<html lang="en"><head><meta charset="UTF...'>,
|
|
@@ -427,7 +426,7 @@ Find all div and span elements with class 'quote' (No span elements like that, s
|
|
| 427 |
```
|
| 428 |
Mix things up
|
| 429 |
```python
|
| 430 |
-
>>> page.find_all({'itemtype':"http://schema.org/CreativeWork"}, 'div').css('.author::text')
|
| 431 |
['Albert Einstein',
|
| 432 |
'J.K. Rowling',
|
| 433 |
...]
|
|
@@ -473,15 +472,16 @@ Generate a full XPath selector for the `url_element` element from the start of t
|
|
| 473 |
>>> url_element.generate_full_xpath_selector
|
| 474 |
'//body/div/div[2]/div/div/span[2]/a'
|
| 475 |
```
|
| 476 |
-
|
| 477 |
-
|
|
|
|
| 478 |
|
| 479 |
## Using selectors with regular expressions
|
| 480 |
Similar to `parsel`/`scrapy`, `re` and `re_first` methods are available for extracting data using regular expressions. However, unlike the former libraries, these methods are in nearly all classes like `Selector`/`Selectors`/`TextHandler` and `TextHandlers`, which means you can use them directly on the element even if you didn't select a text node.
|
| 481 |
|
| 482 |
We will have a deep look at it while explaining the [TextHandler](main_classes.md#texthandler) class, but in general, it works like the examples below:
|
| 483 |
```python
|
| 484 |
-
>>> page.
|
| 485 |
'51.77'
|
| 486 |
|
| 487 |
>>> page.css('.price_color').re_first(r'[\d\.]+')
|
|
|
|
| 1 |
+
# Querying elements
|
| 2 |
Scrapling currently supports parsing HTML pages exclusively, so it doesn't support XML feeds. This decision was made because the adaptive feature won't work with XML, but that might change soon, so stay tuned :)
|
| 3 |
|
| 4 |
In Scrapling, there are five main ways to find elements:
|
|
|
|
| 27 |
|
| 28 |
In short, if you come from Scrapy/Parsel, you will find the same selector logic here to make things easier. There is no need to learn a logic different from the one most of us are already used to :)
|
| 29 |
|
| 30 |
+
To select elements with CSS selectors, use the `css` method, which returns `Selectors`. Use `[0]` to get the first element, or `.get()` / `.getall()` to extract text values from text/attribute pseudo-selectors.
|
| 31 |
|
| 32 |
### What are XPath selectors?
|
| 33 |
[XPath](https://en.wikipedia.org/wiki/XPath) is a language for selecting nodes in XML documents, which can also be used with HTML. This [cheatsheet](https://devhints.io/xpath) is a good resource for learning about [XPath](https://en.wikipedia.org/wiki/XPath). Scrapling adds XPath selectors directly through [lxml](https://lxml.de/).
|
| 34 |
|
| 35 |
In short, it is the same situation as CSS Selectors; if you come from Scrapy/Parsel, you will find the same logic for selectors here. However, Scrapling doesn't implement the XPath extension function `has-class` as Scrapy/Parsel does. Instead, it provides the `has_class` method, which can be used on elements returned for the same purpose.
|
| 36 |
|
| 37 |
+
To select elements with XPath selectors, you have the `xpath` method. Again, this method follows the same logic as the CSS selectors method above.
|
| 38 |
|
| 39 |
+
> Note that each method of `css` and `xpath` has additional arguments, but we didn't explain them here, as they are all about the adaptive feature. The adaptive feature will have its own page later to be described in detail.
|
| 40 |
|
| 41 |
### Selectors examples
|
| 42 |
Let's see some shared examples of using CSS and XPath Selectors.
|
|
|
|
| 46 |
products = page.css('.product')
|
| 47 |
products = page.xpath('//*[@class="product"]')
|
| 48 |
```
|
| 49 |
+
!!! info "Note:"
|
| 50 |
+
|
| 51 |
+
The XPath one won't be accurate if there's another class; **it's always better to rely on CSS for selecting by class**
|
| 52 |
|
| 53 |
Select the first element with the class `product`.
|
| 54 |
```python
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
product = page.css('.product')[0]
|
| 56 |
product = page.xpath('//*[@class="product"]')[0]
|
| 57 |
```
|
| 58 |
Get the text of the first element with the `h1` tag name
|
| 59 |
```python
|
| 60 |
+
title = page.css('h1::text').get()
|
| 61 |
+
title = page.xpath('//h1//text()').get()
|
| 62 |
```
|
| 63 |
+
Which is the same as doing
|
| 64 |
```python
|
| 65 |
+
title = page.css('h1')[0].text
|
| 66 |
+
title = page.xpath('//h1')[0].text
|
| 67 |
```
|
| 68 |
+
Get the `href` attribute of the first element with the `a` tag name
|
| 69 |
```python
|
| 70 |
+
link = page.css('a::attr(href)').get()
|
| 71 |
+
link = page.xpath('//a/@href').get()
|
| 72 |
```
|
| 73 |
Select the text of the first element with the `h1` tag name, which contains `Phone`, and under an element with class `product`.
|
| 74 |
```python
|
| 75 |
+
title = page.css('.product h1:contains("Phone")::text').get()
|
| 76 |
+
title = page.xpath('//*[@class="product"]//h1[contains(text(),"Phone")]/text()').get()
|
| 77 |
```
|
| 78 |
You can nest and chain selectors as you want, given that they return results
|
| 79 |
```python
|
| 80 |
+
page.css('.product')[0].css('h1:contains("Phone")::text').get()
|
| 81 |
+
page.xpath('//*[@class="product"]')[0].xpath('//h1[contains(text(),"Phone")]/text()').get()
|
| 82 |
+
page.xpath('//*[@class="product"]')[0].css('h1:contains("Phone")::text').get()
|
| 83 |
```
|
| 84 |
Another example
|
| 85 |
|
|
|
|
| 88 |
links = page.css('a[href*="image"]')
|
| 89 |
links = page.xpath('//a[contains(@href, "image")]')
|
| 90 |
for index, link in enumerate(links):
|
| 91 |
+
link_value = link.attrib['href'] # Cleaner than link.css('::attr(href)').get()
|
| 92 |
link_text = link.text
|
| 93 |
print(f'Link number {index} points to this url {link_value} with text content as "{link_text}"')
|
| 94 |
```
|
|
|
|
| 111 |
|
| 112 |
* **partial**: If enabled, `find_by_text` will return elements that contain the input text. So it's not an exact match anymore
|
| 113 |
|
| 114 |
+
!!! abstract "Note:"
|
| 115 |
+
|
| 116 |
+
The method `find_by_regex` can accept both regular strings and a compiled regex pattern as its first argument, as you will see in the upcoming examples.
|
| 117 |
|
| 118 |
### Finding Similar Elements
|
| 119 |
One of the most remarkable new features Scrapling puts on the table is the ability to tell Scrapling to find elements similar to the element at hand. This feature's inspiration came from the AutoScraper library, but in Scrapling, it can be used on elements found by any method. Most of its usage would likely occur after finding elements through text content, similar to how AutoScraper works, making it convenient to explain here.
|
|
|
|
| 238 |
```python
|
| 239 |
>>> for product in element.parent.parent.find_similar():
|
| 240 |
print({
|
| 241 |
+
"name": product.css('h3 a::text').get(),
|
| 242 |
+
"price": product.css('.price_color')[0].re_first(r'[\d\.]+'),
|
| 243 |
+
"stock": product.css('.availability::text').getall()[-1].clean()
|
| 244 |
})
|
| 245 |
{'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}
|
| 246 |
{'name': 'Soumission', 'price': '50.10', 'stock': 'In stock'}
|
|
|
|
| 263 |
|
| 264 |
return [
|
| 265 |
{
|
| 266 |
+
'name': p.css('h3::text').get(),
|
| 267 |
+
'price': p.css('.price::text').re_first(r'\d+\.\d{2}'),
|
| 268 |
'stock': 'In stock' in p.text,
|
| 269 |
+
'rating': p.css('.rating')[0].attrib.get('data-rating')
|
| 270 |
}
|
| 271 |
for p in products
|
| 272 |
]
|
|
|
|
| 275 |
```python
|
| 276 |
def extract_table_data(page):
|
| 277 |
# Find the first data row
|
| 278 |
+
first_row = page.css('table tbody tr')[0]
|
| 279 |
|
| 280 |
# Find similar rows
|
| 281 |
rows = first_row.find_similar()
|
| 282 |
|
| 283 |
return [
|
| 284 |
{
|
| 285 |
+
'column1': row.css('td:nth-child(1)::text').get(),
|
| 286 |
+
'column2': row.css('td:nth-child(2)::text').get(),
|
| 287 |
+
'column3': row.css('td:nth-child(3)::text').get()
|
| 288 |
}
|
| 289 |
for row in rows
|
| 290 |
]
|
|
|
|
| 293 |
```python
|
| 294 |
def extract_form_fields(page):
|
| 295 |
# Find first form field container
|
| 296 |
+
first_field = page.css('input')[0].find_ancestor(
|
| 297 |
lambda e: e.has_class('form-field')
|
| 298 |
)
|
| 299 |
|
|
|
|
| 302 |
|
| 303 |
return [
|
| 304 |
{
|
| 305 |
+
'label': f.css('label::text').get(),
|
| 306 |
+
'type': f.css('input')[0].attrib.get('type'),
|
| 307 |
+
'required': 'required' in f.css('input')[0].attrib
|
| 308 |
}
|
| 309 |
for f in fields
|
| 310 |
]
|
|
|
|
| 323 |
|
| 324 |
return [
|
| 325 |
{
|
| 326 |
+
'text': r.css('.review-text::text').get(),
|
| 327 |
'rating': r.attrib.get('data-rating'),
|
| 328 |
+
'author': r.css('.reviewer::text').get()
|
| 329 |
}
|
| 330 |
for r in all_reviews
|
| 331 |
]
|
|
|
|
| 353 |
3. All elements that match all passed regex patterns are collected, or if previous filter(s) are used, then previously collected elements are filtered.
|
| 354 |
4. All elements that fulfill all passed function(s) are collected; if a previous filter(s) is used, then previously collected elements are filtered.
|
| 355 |
|
| 356 |
+
!!! note "Notes:"
|
| 357 |
|
| 358 |
+
1. As you probably understood, the filtering process always starts from the first filter it finds in the filtering order above. So, if no tag name(s) are passed but attributes are passed, the process starts from that step (number 2), and so on.
|
| 359 |
+
2. The order in which you pass the arguments doesn't matter. The only order considered is the one explained above.
|
| 360 |
|
| 361 |
Check examples to clear any confusion :)
|
| 362 |
|
|
|
|
| 395 |
```
|
| 396 |
Find all div elements with a class that equals `quote` and contains the element `.text`, which contains the word 'world' in its content.
|
| 397 |
```python
|
| 398 |
+
>>> page.find_all('div', {'class': 'quote'}, lambda e: "world" in e.css('.text::text').get())
|
| 399 |
[<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>]
|
| 400 |
```
|
| 401 |
+
Find all elements that have children.
|
| 402 |
```python
|
| 403 |
>>> page.find_all(lambda element: len(element.children) > 0)
|
| 404 |
[<data='<html lang="en"><head><meta charset="UTF...'>,
|
|
|
|
| 426 |
```
|
| 427 |
Mix things up
|
| 428 |
```python
|
| 429 |
+
>>> page.find_all({'itemtype':"http://schema.org/CreativeWork"}, 'div').css('.author::text').getall()
|
| 430 |
['Albert Einstein',
|
| 431 |
'J.K. Rowling',
|
| 432 |
...]
|
|
|
|
| 472 |
>>> url_element.generate_full_xpath_selector
|
| 473 |
'//body/div/div[2]/div/div/span[2]/a'
|
| 474 |
```
|
| 475 |
+
!!! abstract "Note:"
|
| 476 |
+
|
| 477 |
+
When you tell Scrapling to create a short selector, it tries to find a unique element to use in generation as a stop point, like an element with an `id` attribute, but in our case, there wasn't any, so that's why the short and the full selector will be the same.
|
| 478 |
|
| 479 |
## Using selectors with regular expressions
|
| 480 |
Similar to `parsel`/`scrapy`, `re` and `re_first` methods are available for extracting data using regular expressions. However, unlike the former libraries, these methods are in nearly all classes like `Selector`/`Selectors`/`TextHandler` and `TextHandlers`, which means you can use them directly on the element even if you didn't select a text node.
|
| 481 |
|
| 482 |
We will have a deep look at it while explaining the [TextHandler](main_classes.md#texthandler) class, but in general, it works like the examples below:
|
| 483 |
```python
|
| 484 |
+
>>> page.css('.price_color')[0].re_first(r'[\d\.]+')
|
| 485 |
'51.77'
|
| 486 |
|
| 487 |
>>> page.css('.price_color').re_first(r'[\d\.]+')
|
docs/requirements.txt
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
-
|
| 2 |
-
mkdocstrings
|
|
|
|
| 3 |
griffe-inherited-docstrings
|
| 4 |
griffe-runtime-objects
|
| 5 |
griffe-sphinx
|
| 6 |
-
|
| 7 |
-
black>=25.12.0
|
| 8 |
pngquant
|
|
|
|
| 1 |
+
zensical>=0.0.23
|
| 2 |
+
mkdocstrings>=1.0.3
|
| 3 |
+
mkdocstrings-python>=2.0.2
|
| 4 |
griffe-inherited-docstrings
|
| 5 |
griffe-runtime-objects
|
| 6 |
griffe-sphinx
|
| 7 |
+
black>=26.1.0
|
|
|
|
| 8 |
pngquant
|
docs/spiders/advanced.md
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Advanced usages
|
| 2 |
+
|
| 3 |
+
## Introduction
|
| 4 |
+
|
| 5 |
+
!!! success "Prerequisites"
|
| 6 |
+
|
| 7 |
+
1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider.
|
| 8 |
+
|
| 9 |
+
This page covers the spider system's advanced features: concurrency control, pause/resume, streaming, lifecycle hooks, statistics, and logging.
|
| 10 |
+
|
| 11 |
+
## Concurrency Control
|
| 12 |
+
|
| 13 |
+
The spider system uses three class attributes to control how aggressively it crawls:
|
| 14 |
+
|
| 15 |
+
| Attribute | Default | Description |
|
| 16 |
+
|----------------------------------|---------|------------------------------------------------------------------|
|
| 17 |
+
| `concurrent_requests` | `4` | Maximum number of requests being processed at the same time |
|
| 18 |
+
| `concurrent_requests_per_domain` | `0` | Maximum concurrent requests per domain (0 = no per-domain limit) |
|
| 19 |
+
| `download_delay` | `0.0` | Seconds to wait before each request |
|
| 20 |
+
|
| 21 |
+
```python
|
| 22 |
+
class PoliteSpider(Spider):
|
| 23 |
+
name = "polite"
|
| 24 |
+
start_urls = ["https://example.com"]
|
| 25 |
+
|
| 26 |
+
# Be gentle with the server
|
| 27 |
+
concurrent_requests = 4
|
| 28 |
+
concurrent_requests_per_domain = 2
|
| 29 |
+
download_delay = 1.0 # Wait 1 second between requests
|
| 30 |
+
|
| 31 |
+
async def parse(self, response: Response):
|
| 32 |
+
yield {"title": response.css("title::text").get("")}
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
When `concurrent_requests_per_domain` is set, each domain gets its own concurrency limiter in addition to the global limit. This is useful when crawling multiple domains simultaneously — you can allow high global concurrency while being polite to each individual domain.
|
| 36 |
+
|
| 37 |
+
!!! tip
|
| 38 |
+
|
| 39 |
+
The `download_delay` parameter adds a fixed wait before every request, regardless of the domain. Use it for simple rate limiting.
|
| 40 |
+
|
| 41 |
+
### Using uvloop
|
| 42 |
+
|
| 43 |
+
The `start()` method accepts a `use_uvloop` parameter to use the faster [uvloop](https://github.com/MagicStack/uvloop)/[winloop](https://github.com/nicktimko/winloop) event loop implementation, if available:
|
| 44 |
+
|
| 45 |
+
```python
|
| 46 |
+
result = MySpider().start(use_uvloop=True)
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
This can improve throughput for I/O-heavy crawls. You'll need to install `uvloop` (Linux/macOS) or `winloop` (Windows) separately.
|
| 50 |
+
|
| 51 |
+
## Pause & Resume
|
| 52 |
+
|
| 53 |
+
The spider supports graceful pause-and-resume via checkpointing. To enable it, pass a `crawldir` directory to the spider constructor:
|
| 54 |
+
|
| 55 |
+
```python
|
| 56 |
+
spider = MySpider(crawldir="crawl_data/my_spider")
|
| 57 |
+
result = spider.start()
|
| 58 |
+
|
| 59 |
+
if result.paused:
|
| 60 |
+
print("Crawl was paused. Run again to resume.")
|
| 61 |
+
else:
|
| 62 |
+
print("Crawl completed!")
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### How It Works
|
| 66 |
+
|
| 67 |
+
1. **Pausing**: Press `Ctrl+C` during a crawl. The spider waits for all in-flight requests to finish, saves a checkpoint (pending requests + a set of seen request fingerprints), and then exits.
|
| 68 |
+
2. **Force stopping**: Press `Ctrl+C` a second time to stop immediately without waiting for active tasks.
|
| 69 |
+
3. **Resuming**: Run the spider again with the same `crawldir`. It detects the checkpoint, restores the queue and seen set, and continues from where it left off — skipping `start_requests()`.
|
| 70 |
+
4. **Cleanup**: When a crawl completes normally (not paused), the checkpoint files are deleted automatically.
|
| 71 |
+
|
| 72 |
+
**Checkpoints are also saved periodically during the crawl (every 5 minutes by default).**
|
| 73 |
+
|
| 74 |
+
You can change the interval as follows:
|
| 75 |
+
|
| 76 |
+
```python
|
| 77 |
+
# Save checkpoint every 2 minutes
|
| 78 |
+
spider = MySpider(crawldir="crawl_data/my_spider", interval=120.0)
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
Checkpoint writes to disk are atomic, so an interruption mid-write cannot corrupt the saved state — you will always resume from the last complete checkpoint.
|
| 82 |
+
|
| 83 |
+
!!! tip
|
| 84 |
+
|
| 85 |
+
Pressing `Ctrl+C` during a crawl always causes the spider to close gracefully, even if the checkpoint system is not enabled. Doing it again without waiting forces the spider to close immediately.
|
| 86 |
+
|
| 87 |
+
### Knowing If You're Resuming
|
| 88 |
+
|
| 89 |
+
The `on_start()` hook receives a `resuming` flag:
|
| 90 |
+
|
| 91 |
+
```python
|
| 92 |
+
async def on_start(self, resuming: bool = False):
|
| 93 |
+
if resuming:
|
| 94 |
+
self.logger.info("Resuming from checkpoint!")
|
| 95 |
+
else:
|
| 96 |
+
self.logger.info("Starting fresh crawl")
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
## Streaming
|
| 100 |
+
|
| 101 |
+
For long-running spiders or applications that need real-time access to scraped items, use the `stream()` method instead of `start()`:
|
| 102 |
+
|
| 103 |
+
```python
|
| 104 |
+
import anyio
|
| 105 |
+
|
| 106 |
+
async def main():
|
| 107 |
+
spider = MySpider()
|
| 108 |
+
async for item in spider.stream():
|
| 109 |
+
print(f"Got item: {item}")
|
| 110 |
+
# Access real-time stats
|
| 111 |
+
print(f"Items so far: {spider.stats.items_scraped}")
|
| 112 |
+
print(f"Requests made: {spider.stats.requests_count}")
|
| 113 |
+
|
| 114 |
+
anyio.run(main)
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
Key differences from `start()`:
|
| 118 |
+
|
| 119 |
+
- `stream()` must be called from an async context
|
| 120 |
+
- Items are yielded one by one as they're scraped, not collected into a list
|
| 121 |
+
- You can access `spider.stats` during iteration for real-time statistics
|
| 122 |
+
|
| 123 |
+
!!! abstract
|
| 124 |
+
|
| 125 |
+
The full list of all stats that can be accessed by `spider.stats` is explained below [here](#results--statistics)
|
| 126 |
+
|
| 127 |
+
You can use it with the checkpoint system too, so it's easy to build UI on top of spiders. UIs that have real-time data and can be paused/resumed.
|
| 128 |
+
|
| 129 |
+
```python
|
| 130 |
+
import anyio
|
| 131 |
+
|
| 132 |
+
async def main():
|
| 133 |
+
spider = MySpider(crawldir="crawl_data/my_spider")
|
| 134 |
+
async for item in spider.stream():
|
| 135 |
+
print(f"Got item: {item}")
|
| 136 |
+
# Access real-time stats
|
| 137 |
+
print(f"Items so far: {spider.stats.items_scraped}")
|
| 138 |
+
print(f"Requests made: {spider.stats.requests_count}")
|
| 139 |
+
|
| 140 |
+
anyio.run(main)
|
| 141 |
+
```
|
| 142 |
+
You can also call `spider.pause()` from inside the loop above to shut the spider down gracefully. If the checkpoint system is not enabled, calling it simply closes the crawl without saving any state to resume from.
|
| 143 |
+
|
| 144 |
+
## Lifecycle Hooks
|
| 145 |
+
|
| 146 |
+
The spider provides several hooks you can override to add custom behavior at different stages of the crawl:
|
| 147 |
+
|
| 148 |
+
### on_start
|
| 149 |
+
|
| 150 |
+
Called before crawling begins. Use it for setup tasks like loading data or initializing resources:
|
| 151 |
+
|
| 152 |
+
```python
|
| 153 |
+
async def on_start(self, resuming: bool = False):
|
| 154 |
+
self.logger.info("Spider starting up")
|
| 155 |
+
# Load seed URLs from a database, initialize counters, etc.
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
### on_close
|
| 159 |
+
|
| 160 |
+
Called after crawling finishes (whether completed or paused). Use it for cleanup:
|
| 161 |
+
|
| 162 |
+
```python
|
| 163 |
+
async def on_close(self):
|
| 164 |
+
self.logger.info("Spider shutting down")
|
| 165 |
+
# Close database connections, flush buffers, etc.
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
### on_error
|
| 169 |
+
|
| 170 |
+
Called when a request fails with an exception. Use it for error tracking or custom recovery logic:
|
| 171 |
+
|
| 172 |
+
```python
|
| 173 |
+
async def on_error(self, request: Request, error: Exception):
|
| 174 |
+
self.logger.error(f"Failed: {request.url} - {error}")
|
| 175 |
+
# Log to error tracker, save failed URL for later, etc.
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
### on_scraped_item
|
| 179 |
+
|
| 180 |
+
Called for every scraped item before it's added to the results. Return the item (modified or not) to keep it, or return `None` to drop it:
|
| 181 |
+
|
| 182 |
+
```python
|
| 183 |
+
async def on_scraped_item(self, item: dict) -> dict | None:
|
| 184 |
+
# Drop items without a title
|
| 185 |
+
if not item.get("title"):
|
| 186 |
+
return None
|
| 187 |
+
|
| 188 |
+
# Modify items (e.g., add timestamps)
|
| 189 |
+
item["scraped_at"] = "2026-01-01"
|
| 190 |
+
return item
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
!!! tip
|
| 194 |
+
|
| 195 |
+
This hook can also be used to direct items through your own pipelines and drop them from the spider.
|
| 196 |
+
|
| 197 |
+
### start_requests
|
| 198 |
+
|
| 199 |
+
Override `start_requests()` for custom initial request generation instead of using `start_urls`:
|
| 200 |
+
|
| 201 |
+
```python
|
| 202 |
+
async def start_requests(self):
|
| 203 |
+
# POST request to log in first
|
| 204 |
+
yield Request(
|
| 205 |
+
"https://example.com/login",
|
| 206 |
+
method="POST",
|
| 207 |
+
data={"user": "admin", "pass": "secret"},
|
| 208 |
+
callback=self.after_login,
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
async def after_login(self, response: Response):
|
| 212 |
+
# Now crawl the authenticated pages
|
| 213 |
+
yield response.follow("/dashboard", callback=self.parse)
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
## Results & Statistics
|
| 217 |
+
|
| 218 |
+
The `CrawlResult` returned by `start()` contains both the scraped items and detailed statistics:
|
| 219 |
+
|
| 220 |
+
```python
|
| 221 |
+
result = MySpider().start()
|
| 222 |
+
|
| 223 |
+
# Items
|
| 224 |
+
print(f"Total items: {len(result.items)}")
|
| 225 |
+
result.items.to_json("output.json", indent=True)
|
| 226 |
+
|
| 227 |
+
# Did the crawl complete?
|
| 228 |
+
print(f"Completed: {result.completed}")
|
| 229 |
+
print(f"Paused: {result.paused}")
|
| 230 |
+
|
| 231 |
+
# Statistics
|
| 232 |
+
stats = result.stats
|
| 233 |
+
print(f"Requests: {stats.requests_count}")
|
| 234 |
+
print(f"Failed: {stats.failed_requests_count}")
|
| 235 |
+
print(f"Blocked: {stats.blocked_requests_count}")
|
| 236 |
+
print(f"Offsite filtered: {stats.offsite_requests_count}")
|
| 237 |
+
print(f"Items scraped: {stats.items_scraped}")
|
| 238 |
+
print(f"Items dropped: {stats.items_dropped}")
|
| 239 |
+
print(f"Response bytes: {stats.response_bytes}")
|
| 240 |
+
print(f"Duration: {stats.elapsed_seconds:.1f}s")
|
| 241 |
+
print(f"Speed: {stats.requests_per_second:.1f} req/s")
|
| 242 |
+
```
|
| 243 |
+
|
| 244 |
+
### Detailed Stats
|
| 245 |
+
|
| 246 |
+
The `CrawlStats` object tracks granular information:
|
| 247 |
+
|
| 248 |
+
```python
|
| 249 |
+
stats = result.stats
|
| 250 |
+
|
| 251 |
+
# Status code distribution
|
| 252 |
+
print(stats.response_status_count)
|
| 253 |
+
# {'status_200': 150, 'status_404': 3, 'status_403': 1}
|
| 254 |
+
|
| 255 |
+
# Bytes downloaded per domain
|
| 256 |
+
print(stats.domains_response_bytes)
|
| 257 |
+
# {'example.com': 1234567, 'api.example.com': 45678}
|
| 258 |
+
|
| 259 |
+
# Requests per session
|
| 260 |
+
print(stats.sessions_requests_count)
|
| 261 |
+
# {'http': 120, 'stealth': 34}
|
| 262 |
+
|
| 263 |
+
# Proxies used during the crawl
|
| 264 |
+
print(stats.proxies)
|
| 265 |
+
# ['http://proxy1:8080', 'http://proxy2:8080']
|
| 266 |
+
|
| 267 |
+
# Log level counts
|
| 268 |
+
print(stats.log_levels_counter)
|
| 269 |
+
# {'debug': 200, 'info': 50, 'warning': 3, 'error': 1, 'critical': 0}
|
| 270 |
+
|
| 271 |
+
# Timing information
|
| 272 |
+
print(stats.start_time) # Unix timestamp when crawl started
|
| 273 |
+
print(stats.end_time) # Unix timestamp when crawl finished
|
| 274 |
+
print(stats.download_delay) # The download delay used (seconds)
|
| 275 |
+
|
| 276 |
+
# Concurrency settings used
|
| 277 |
+
print(stats.concurrent_requests) # Global concurrency limit
|
| 278 |
+
print(stats.concurrent_requests_per_domain) # Per-domain concurrency limit
|
| 279 |
+
|
| 280 |
+
# Custom stats (set by your spider code)
|
| 281 |
+
print(stats.custom_stats)
|
| 282 |
+
# {'login_attempts': 3, 'pages_with_errors': 5}
|
| 283 |
+
|
| 284 |
+
# Export everything as a dict
|
| 285 |
+
print(stats.to_dict())
|
| 286 |
+
```
|
| 287 |
+
|
| 288 |
+
## Logging
|
| 289 |
+
|
| 290 |
+
The spider has a built-in logger accessible via `self.logger`. It's pre-configured with the spider's name and supports several customization options:
|
| 291 |
+
|
| 292 |
+
| Attribute | Default | Description |
|
| 293 |
+
|-----------------------|--------------------------------------------------------------|----------------------------------------------------|
|
| 294 |
+
| `logging_level` | `logging.DEBUG` | Minimum log level |
|
| 295 |
+
| `logging_format` | `"[%(asctime)s]:({spider_name}) %(levelname)s: %(message)s"` | Log message format |
|
| 296 |
+
| `logging_date_format` | `"%Y-%m-%d %H:%M:%S"` | Date format in log messages |
|
| 297 |
+
| `log_file` | `None` | Path to a log file (in addition to console output) |
|
| 298 |
+
|
| 299 |
+
```python
|
| 300 |
+
import logging
|
| 301 |
+
|
| 302 |
+
class MySpider(Spider):
|
| 303 |
+
name = "my_spider"
|
| 304 |
+
start_urls = ["https://example.com"]
|
| 305 |
+
logging_level = logging.INFO
|
| 306 |
+
log_file = "logs/my_spider.log"
|
| 307 |
+
|
| 308 |
+
async def parse(self, response: Response):
|
| 309 |
+
self.logger.info(f"Processing {response.url}")
|
| 310 |
+
yield {"title": response.css("title::text").get("")}
|
| 311 |
+
```
|
| 312 |
+
|
| 313 |
+
The log file directory is created automatically if it doesn't exist. Both console and file output use the same format.
|
docs/spiders/architecture.md
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Spiders architecture
|
| 2 |
+
|
| 3 |
+
!!! success "Prerequisites"
|
| 4 |
+
|
| 5 |
+
1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand the different fetcher types and when to use each one.
|
| 6 |
+
2. You've completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) and [Response](../fetching/choosing.md#response-object) classes.
|
| 7 |
+
|
| 8 |
+
Scrapling's spider system is a Scrapy-inspired async crawling framework designed for concurrent, multi-session crawls with built-in pause/resume support. It brings together Scrapling's parsing engine and fetchers into a unified crawling API while adding scheduling, concurrency control, and checkpointing.
|
| 9 |
+
|
| 10 |
+
If you're familiar with Scrapy, you'll feel right at home. If not, don't worry — the system is designed to be straightforward.
|
| 11 |
+
|
| 12 |
+
## Data Flow
|
| 13 |
+
|
| 14 |
+
The diagram below shows how data flows through the spider system when a crawl is running:
|
| 15 |
+
|
| 16 |
+
<img src="../assets/spider_architecture.png" title="Spider architecture diagram by @TrueSkills" alt="Spider architecture diagram by @TrueSkills" style="width: 70%;"/>
|
| 17 |
+
|
| 18 |
+
Here's a step-by-step overview of what happens when you run a spider, leaving out the finer details:
|
| 19 |
+
|
| 20 |
+
1. The **Spider** produces the first batch of `Request` objects. By default, it creates one request for each URL in `start_urls`, but you can override `start_requests()` for custom logic.
|
| 21 |
+
2. The **Scheduler** receives requests and places them in a priority queue, and creates fingerprints for them. Higher-priority requests are dequeued first.
|
| 22 |
+
3. The **Crawler Engine** asks the **Scheduler** to dequeue the next request, respecting concurrency limits (global and per-domain) and download delays. Once the **Crawler Engine** receives the request, it passes it to the **Session Manager**, which routes it to the correct session based on the request's `sid` (session ID).
|
| 23 |
+
4. The **session** fetches the page and returns a [Response](../fetching/choosing.md#response-object) object to the **Crawler Engine**. The engine records statistics and checks for blocked responses. If the response is blocked, the engine retries the request up to `max_blocked_retries` times. Of course, the blocking detection and the retry logic for blocked requests can be customized.
|
| 24 |
+
5. The **Crawler Engine** passes the [Response](../fetching/choosing.md#response-object) to the request's callback. The callback either yields a dictionary, which gets treated as a scraped item, or a follow-up request, which gets sent to the scheduler for queuing.
|
| 25 |
+
6. The cycle repeats from step 2 until the scheduler is empty and no tasks are active, or the spider is paused.
|
| 26 |
+
7. If `crawldir` is set while starting the spider, the **Crawler Engine** periodically saves a checkpoint (pending requests + seen URLs set) to disk. On graceful shutdown (Ctrl+C), a final checkpoint is saved. The next time the spider runs with the same `crawldir`, it resumes from where it left off — skipping `start_requests()` and restoring the scheduler state.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
## Components
|
| 30 |
+
|
| 31 |
+
### Spider
|
| 32 |
+
|
| 33 |
+
The central class you interact with. You subclass `Spider`, define your `start_urls` and `parse()` method, and optionally configure sessions and override lifecycle hooks.
|
| 34 |
+
|
| 35 |
+
```python
|
| 36 |
+
from scrapling.spiders import Spider, Response, Request
|
| 37 |
+
|
| 38 |
+
class MySpider(Spider):
|
| 39 |
+
name = "my_spider"
|
| 40 |
+
start_urls = ["https://example.com"]
|
| 41 |
+
|
| 42 |
+
async def parse(self, response: Response):
|
| 43 |
+
for link in response.css("a::attr(href)").getall():
|
| 44 |
+
yield response.follow(link, callback=self.parse_page)
|
| 45 |
+
|
| 46 |
+
async def parse_page(self, response: Response):
|
| 47 |
+
yield {"title": response.css("h1::text").get("")}
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
### Crawler Engine
|
| 51 |
+
|
| 52 |
+
The engine orchestrates the entire crawl. It manages the main loop, enforces concurrency limits, dispatches requests through the Session Manager, and processes results from callbacks. You don't interact with it directly — the `Spider.start()` and `Spider.stream()` methods handle it for you.
|
| 53 |
+
|
| 54 |
+
### Scheduler
|
| 55 |
+
|
| 56 |
+
A priority queue with built-in URL deduplication. Requests are fingerprinted based on their URL, HTTP method, body, and session ID. The scheduler supports `snapshot()` and `restore()` for the checkpoint system, allowing the crawl state to be saved and resumed.
|
| 57 |
+
|
| 58 |
+
### Session Manager
|
| 59 |
+
|
| 60 |
+
Manages one or more named session instances. Each session is one of:
|
| 61 |
+
|
| 62 |
+
- [FetcherSession](../fetching/static.md)
|
| 63 |
+
- [AsyncDynamicSession](../fetching/dynamic.md)
|
| 64 |
+
- [AsyncStealthySession](../fetching/stealthy.md)
|
| 65 |
+
|
| 66 |
+
When a request comes in, the Session Manager routes it to the correct session based on the request's `sid` field. Sessions can be started with the spider start (default) or lazily (started on the first use).
|
| 67 |
+
|
| 68 |
+
### Checkpoint System
|
| 69 |
+
|
| 70 |
+
An optional system that, if enabled, saves the crawler's state (pending requests + seen URL fingerprints) to a pickle file on disk. Writes are atomic (temp file + rename) to prevent corruption. Checkpoints are saved periodically at a configurable interval and on graceful shutdown. Upon successful completion (not paused), checkpoint files are automatically cleaned up.
|
| 71 |
+
|
| 72 |
+
### Output
|
| 73 |
+
|
| 74 |
+
Scraped items are collected in an `ItemList` (a list subclass with `to_json()` and `to_jsonl()` export methods). Crawl statistics are tracked in a `CrawlStats` dataclass which contains a lot of useful info.
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
## Comparison with Scrapy
|
| 78 |
+
|
| 79 |
+
If you're coming from Scrapy, here's how Scrapling's spider system maps to the concepts you already know:
|
| 80 |
+
|
| 81 |
+
| Concept | Scrapy | Scrapling |
|
| 82 |
+
|--------------------|-------------------------------|-----------------------------------------------------------------|
|
| 83 |
+
| Spider definition | `scrapy.Spider` subclass | `scrapling.spiders.Spider` subclass |
|
| 84 |
+
| Initial requests | `start_requests()` | `async start_requests()` |
|
| 85 |
+
| Callbacks | `def parse(self, response)` | `async def parse(self, response)` |
|
| 86 |
+
| Following links | `response.follow(url)` | `response.follow(url)` |
|
| 87 |
+
| Item output | `yield dict` or `yield Item` | `yield dict` |
|
| 88 |
+
| Request scheduling | Scheduler + Dupefilter | Scheduler with built-in deduplication |
|
| 89 |
+
| Downloading | Downloader + Middlewares | Session Manager with multi-session support |
|
| 90 |
+
| Item processing | Item Pipelines | `on_scraped_item()` hook |
|
| 91 |
+
| Blocked detection | Through custom middlewares | Built-in `is_blocked()` + `retry_blocked_request()` hooks |
|
| 92 |
+
| Concurrency | `CONCURRENT_REQUESTS` setting | `concurrent_requests` class attribute |
|
| 93 |
+
| Domain filtering | `allowed_domains` | `allowed_domains` |
|
| 94 |
+
| Pause/Resume | `JOBDIR` setting | `crawldir` constructor argument |
|
| 95 |
+
| Export | Feed exports | `result.items.to_json()` / `to_jsonl()` or custom through hooks |
|
| 96 |
+
| Running | `scrapy crawl spider_name` | `MySpider().start()` |
|
| 97 |
+
| Streaming | N/A | `async for item in spider.stream()` |
|
| 98 |
+
| Multi-session | N/A | Multiple sessions with different types per spider |
|
docs/spiders/getting-started.md
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Getting started
|
| 2 |
+
|
| 3 |
+
## Introduction
|
| 4 |
+
|
| 5 |
+
!!! success "Prerequisites"
|
| 6 |
+
|
| 7 |
+
1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand the different fetcher types and when to use each one.
|
| 8 |
+
2. You've completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) and [Response](../fetching/choosing.md#response-object) classes.
|
| 9 |
+
3. You've read the [Architecture](architecture.md) page for a high-level overview of how the spider system works.
|
| 10 |
+
|
| 11 |
+
The spider system lets you build concurrent, multi-page crawlers in just a few lines of code. If you've used Scrapy before, the patterns will feel familiar. If not, this guide will walk you through everything you need to get started.
|
| 12 |
+
|
| 13 |
+
## Your First Spider
|
| 14 |
+
|
| 15 |
+
A spider is a class that defines how to crawl and extract data from websites. Here's the simplest possible spider:
|
| 16 |
+
|
| 17 |
+
```python
|
| 18 |
+
from scrapling.spiders import Spider, Response
|
| 19 |
+
|
| 20 |
+
class QuotesSpider(Spider):
|
| 21 |
+
name = "quotes"
|
| 22 |
+
start_urls = ["https://quotes.toscrape.com"]
|
| 23 |
+
|
| 24 |
+
async def parse(self, response: Response):
|
| 25 |
+
for quote in response.css("div.quote"):
|
| 26 |
+
yield {
|
| 27 |
+
"text": quote.css("span.text::text").get(""),
|
| 28 |
+
"author": quote.css("small.author::text").get(""),
|
| 29 |
+
}
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
Every spider needs three things:
|
| 33 |
+
|
| 34 |
+
1. **`name`** — A unique identifier for the spider.
|
| 35 |
+
2. **`start_urls`** — A list of URLs to start crawling from.
|
| 36 |
+
3. **`parse()`** — An async generator method that processes each response and yields results.
|
| 37 |
+
|
| 38 |
+
The `parse()` method is where the magic happens. You use the same selection methods you'd use with Scrapling's [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object), and `yield` dictionaries to output scraped items.
|
| 39 |
+
|
| 40 |
+
## Running the Spider
|
| 41 |
+
|
| 42 |
+
To run your spider, create an instance and call `start()`:
|
| 43 |
+
|
| 44 |
+
```python
|
| 45 |
+
result = QuotesSpider().start()
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
The `start()` method handles all the async machinery internally — no need to worry about event loops. While the spider is running, everything that happens is logged to the terminal, and at the end of the crawl, you get very detailed stats.
|
| 49 |
+
|
| 50 |
+
Those stats are in the returned `CrawlResult` object, which gives you everything you need:
|
| 51 |
+
|
| 52 |
+
```python
|
| 53 |
+
result = QuotesSpider().start()
|
| 54 |
+
|
| 55 |
+
# Access scraped items
|
| 56 |
+
for item in result.items:
|
| 57 |
+
print(item["text"], "-", item["author"])
|
| 58 |
+
|
| 59 |
+
# Check statistics
|
| 60 |
+
print(f"Scraped {result.stats.items_scraped} items")
|
| 61 |
+
print(f"Made {result.stats.requests_count} requests")
|
| 62 |
+
print(f"Took {result.stats.elapsed_seconds:.1f} seconds")
|
| 63 |
+
|
| 64 |
+
# Did the crawl finish or was it paused?
|
| 65 |
+
print(f"Completed: {result.completed}")
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
## Following Links
|
| 69 |
+
|
| 70 |
+
Most crawls need to follow links across multiple pages. Use `response.follow()` to create follow-up requests:
|
| 71 |
+
|
| 72 |
+
```python
|
| 73 |
+
from scrapling.spiders import Spider, Response
|
| 74 |
+
|
| 75 |
+
class QuotesSpider(Spider):
|
| 76 |
+
name = "quotes"
|
| 77 |
+
start_urls = ["https://quotes.toscrape.com"]
|
| 78 |
+
|
| 79 |
+
async def parse(self, response: Response):
|
| 80 |
+
# Extract items from the current page
|
| 81 |
+
for quote in response.css("div.quote"):
|
| 82 |
+
yield {
|
| 83 |
+
"text": quote.css("span.text::text").get(""),
|
| 84 |
+
"author": quote.css("small.author::text").get(""),
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
# Follow the "next page" link
|
| 88 |
+
next_page = response.css("li.next a::attr(href)").get()
|
| 89 |
+
if next_page:
|
| 90 |
+
yield response.follow(next_page, callback=self.parse)
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
`response.follow()` handles relative URLs automatically — it joins them with the current page's URL. It also sets the current page as the `Referer` header by default.
|
| 94 |
+
|
| 95 |
+
You can point follow-up requests at different callback methods for different page types:
|
| 96 |
+
|
| 97 |
+
```python
|
| 98 |
+
async def parse(self, response: Response):
|
| 99 |
+
for link in response.css("a.product-link::attr(href)").getall():
|
| 100 |
+
yield response.follow(link, callback=self.parse_product)
|
| 101 |
+
|
| 102 |
+
async def parse_product(self, response: Response):
|
| 103 |
+
yield {
|
| 104 |
+
"name": response.css("h1::text").get(""),
|
| 105 |
+
"price": response.css(".price::text").get(""),
|
| 106 |
+
}
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
!!! note
|
| 110 |
+
|
| 111 |
+
All callback methods must be async generators (using `async def` and `yield`).
|
| 112 |
+
|
| 113 |
+
## Exporting Data
|
| 114 |
+
|
| 115 |
+
The `ItemList` returned in `result.items` has built-in export methods:
|
| 116 |
+
|
| 117 |
+
```python
|
| 118 |
+
result = QuotesSpider().start()
|
| 119 |
+
|
| 120 |
+
# Export as JSON
|
| 121 |
+
result.items.to_json("quotes.json")
|
| 122 |
+
|
| 123 |
+
# Export as JSON with pretty-printing
|
| 124 |
+
result.items.to_json("quotes.json", indent=True)
|
| 125 |
+
|
| 126 |
+
# Export as JSON Lines (one JSON object per line)
|
| 127 |
+
result.items.to_jsonl("quotes.jsonl")
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
Both methods create parent directories automatically if they don't exist.
|
| 131 |
+
|
| 132 |
+
## Filtering Domains
|
| 133 |
+
|
| 134 |
+
Use `allowed_domains` to restrict the spider to specific domains. This prevents it from accidentally following links to external websites:
|
| 135 |
+
|
| 136 |
+
```python
|
| 137 |
+
class MySpider(Spider):
|
| 138 |
+
name = "my_spider"
|
| 139 |
+
start_urls = ["https://example.com"]
|
| 140 |
+
allowed_domains = {"example.com"}
|
| 141 |
+
|
| 142 |
+
async def parse(self, response: Response):
|
| 143 |
+
for link in response.css("a::attr(href)").getall():
|
| 144 |
+
# Links to other domains are silently dropped
|
| 145 |
+
yield response.follow(link, callback=self.parse)
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
Subdomains are matched automatically — setting `allowed_domains = {"example.com"}` also allows `sub.example.com`, `blog.example.com`, etc.
|
| 149 |
+
|
| 150 |
+
When a request is filtered out, it's counted in `stats.offsite_requests_count` so you can see how many were dropped.
|
| 151 |
+
|
| 152 |
+
## What's Next
|
| 153 |
+
|
| 154 |
+
Now that you have the basics, you can explore:
|
| 155 |
+
|
| 156 |
+
- [Requests & Responses](requests-responses.md) — learn about request priority, deduplication, metadata, and more.
|
| 157 |
+
- [Sessions](sessions.md) — use multiple fetcher types (HTTP, browser, stealth) in a single spider.
|
| 158 |
+
- [Proxy management & blocking](proxy-blocking.md) — rotate proxies across requests and how to handle blocking in the spider.
|
| 159 |
+
- [Advanced features](advanced.md) — concurrency control, pause/resume, streaming, lifecycle hooks, and logging.
|
docs/spiders/proxy-blocking.md
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Proxy Management and Handling Blocks
|
| 2 |
+
|
| 3 |
+
## Introduction
|
| 4 |
+
|
| 5 |
+
!!! success "Prerequisites"
|
| 6 |
+
|
| 7 |
+
1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider.
|
| 8 |
+
2. You've read the [Sessions](sessions.md) page and understand how to configure sessions.
|
| 9 |
+
|
| 10 |
+
When scraping at scale, you'll often need to rotate through multiple proxies to avoid rate limits and blocks. Scrapling's `ProxyRotator` makes this straightforward — it works with all session types and integrates with the spider's blocked request retry system.
|
| 11 |
+
|
| 12 |
+
If you don't know what a proxy is or how to choose a good one, [this guide can help](https://substack.thewebscraping.club/p/everything-about-proxies).
|
| 13 |
+
|
| 14 |
+
## ProxyRotator
|
| 15 |
+
|
| 16 |
+
The `ProxyRotator` class manages a list of proxies and rotates through them automatically. Pass it to any session type via the `proxy_rotator` parameter:
|
| 17 |
+
|
| 18 |
+
```python
|
| 19 |
+
from scrapling.spiders import Spider, Response
|
| 20 |
+
from scrapling.fetchers import FetcherSession, ProxyRotator
|
| 21 |
+
|
| 22 |
+
class MySpider(Spider):
|
| 23 |
+
name = "my_spider"
|
| 24 |
+
start_urls = ["https://example.com"]
|
| 25 |
+
|
| 26 |
+
def configure_sessions(self, manager):
|
| 27 |
+
rotator = ProxyRotator([
|
| 28 |
+
"http://proxy1:8080",
|
| 29 |
+
"http://proxy2:8080",
|
| 30 |
+
"http://user:pass@proxy3:8080",
|
| 31 |
+
])
|
| 32 |
+
manager.add("default", FetcherSession(proxy_rotator=rotator))
|
| 33 |
+
|
| 34 |
+
async def parse(self, response: Response):
|
| 35 |
+
# Check which proxy was used
|
| 36 |
+
print(f"Proxy used: {response.meta.get('proxy')}")
|
| 37 |
+
yield {"title": response.css("title::text").get("")}
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
Each request automatically gets the next proxy in the rotation. The proxy used is stored in `response.meta["proxy"]` so you can track which proxy fetched which page.
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
When you use it with browser sessions, you will need to make a few adjustments, as shown below:
|
| 44 |
+
|
| 45 |
+
```python
|
| 46 |
+
from scrapling.fetchers import AsyncDynamicSession, AsyncStealthySession, ProxyRotator
|
| 47 |
+
|
| 48 |
+
# String proxies work for all session types
|
| 49 |
+
rotator = ProxyRotator([
|
| 50 |
+
"http://proxy1:8080",
|
| 51 |
+
"http://proxy2:8080",
|
| 52 |
+
])
|
| 53 |
+
|
| 54 |
+
# Dict proxies (Playwright format) work for browser sessions
|
| 55 |
+
rotator = ProxyRotator([
|
| 56 |
+
{"server": "http://proxy1:8080", "username": "user", "password": "pass"},
|
| 57 |
+
{"server": "http://proxy2:8080"},
|
| 58 |
+
])
|
| 59 |
+
|
| 60 |
+
# Then inside the spider
|
| 61 |
+
def configure_sessions(self, manager):
|
| 62 |
+
rotator = ProxyRotator(["http://proxy1:8080", "http://proxy2:8080"])
|
| 63 |
+
manager.add("browser", AsyncStealthySession(proxy_rotator=rotator))
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
!!! info
|
| 67 |
+
|
| 68 |
+
    1. You cannot use the `proxy_rotator` argument together with the static `proxy` or `proxies` parameters on the same session. Pick one approach when configuring the session; you can still override it per request, as shown later on this page.
|
| 69 |
+
2. Remember that by default, all browser-based sessions use a persistent browser context with a pool of tabs. However, since browsers can't set a proxy per tab, when you use a `ProxyRotator`, the fetcher will automatically open a separate context for each proxy, with one tab per context. Once the tab's job is done, both the tab and its context are closed.
|
| 70 |
+
|
| 71 |
+
## Custom Rotation Strategies
|
| 72 |
+
|
| 73 |
+
By default, `ProxyRotator` uses cyclic rotation — it iterates through proxies sequentially, wrapping around at the end.
|
| 74 |
+
|
| 75 |
+
You can provide a custom strategy function to change this behavior, but it has to match the below signature:
|
| 76 |
+
|
| 77 |
+
```python
|
| 78 |
+
from scrapling.core._types import ProxyType
|
| 79 |
+
|
| 80 |
+
def my_strategy(proxies: list, current_index: int) -> tuple[ProxyType, int]:
|
| 81 |
+
...
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
It receives the list of proxies and the current index, and must return the chosen proxy and the next index.
|
| 85 |
+
|
| 86 |
+
Below are some examples of custom rotation strategies you can use.
|
| 87 |
+
|
| 88 |
+
### Random Rotation
|
| 89 |
+
|
| 90 |
+
```python
|
| 91 |
+
import random
|
| 92 |
+
from scrapling.fetchers import ProxyRotator
|
| 93 |
+
|
| 94 |
+
def random_strategy(proxies, current_index):
|
| 95 |
+
idx = random.randint(0, len(proxies) - 1)
|
| 96 |
+
return proxies[idx], idx
|
| 97 |
+
|
| 98 |
+
rotator = ProxyRotator(
|
| 99 |
+
["http://proxy1:8080", "http://proxy2:8080", "http://proxy3:8080"],
|
| 100 |
+
strategy=random_strategy,
|
| 101 |
+
)
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
### Weighted Rotation
|
| 105 |
+
|
| 106 |
+
```python
|
| 107 |
+
import random
|
| 108 |
+
|
| 109 |
+
def weighted_strategy(proxies, current_index):
|
| 110 |
+
# First proxy gets 60% of traffic, others split the rest
|
| 111 |
+
weights = [60] + [40 // (len(proxies) - 1)] * (len(proxies) - 1)
|
| 112 |
+
proxy = random.choices(proxies, weights=weights, k=1)[0]
|
| 113 |
+
return proxy, current_index # Index doesn't matter for weighted
|
| 114 |
+
|
| 115 |
+
rotator = ProxyRotator(proxies, strategy=weighted_strategy)
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
## Per-Request Proxy Override
|
| 120 |
+
|
| 121 |
+
You can override the rotator for individual requests by passing `proxy=` as a keyword argument:
|
| 122 |
+
|
| 123 |
+
```python
|
| 124 |
+
async def parse(self, response: Response):
|
| 125 |
+
# This request uses the rotator's next proxy
|
| 126 |
+
yield response.follow("/page1", callback=self.parse_page)
|
| 127 |
+
|
| 128 |
+
# This request uses a specific proxy, bypassing the rotator
|
| 129 |
+
yield response.follow(
|
| 130 |
+
"/special-page",
|
| 131 |
+
callback=self.parse_page,
|
| 132 |
+
proxy="http://special-proxy:8080",
|
| 133 |
+
)
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
This is useful when certain pages require a specific proxy (e.g., a geo-located proxy for region-specific content).
|
| 137 |
+
|
| 138 |
+
## Blocked Request Handling
|
| 139 |
+
|
| 140 |
+
The spider has built-in blocked request detection and retry. By default, it considers the following HTTP status codes blocked: `401`, `403`, `407`, `429`, `444`, `500`, `502`, `503`, `504`.
|
| 141 |
+
|
| 142 |
+
The retry system works like this:
|
| 143 |
+
|
| 144 |
+
1. After a response comes back, the spider calls the `is_blocked(response)` method.
|
| 145 |
+
2. If blocked, it copies the request and calls the `retry_blocked_request()` method so you can modify it before retrying.
|
| 146 |
+
3. The retried request is re-queued with `dont_filter=True` (bypassing deduplication) and lower priority, so it's not retried right away.
|
| 147 |
+
4. This repeats up to `max_blocked_retries` times (default: 3).
|
| 148 |
+
|
| 149 |
+
!!! tip
|
| 150 |
+
|
| 151 |
+
1. On retry, the previous `proxy`/`proxies` kwargs are cleared from the request automatically, so the rotator assigns a fresh proxy.
|
| 152 |
+
    2. The `max_blocked_retries` attribute is separate from the session-level retries and does not share their counter.
|
| 153 |
+
|
| 154 |
+
### Custom Block Detection
|
| 155 |
+
|
| 156 |
+
Override `is_blocked()` to add your own detection logic:
|
| 157 |
+
|
| 158 |
+
```python
|
| 159 |
+
class MySpider(Spider):
|
| 160 |
+
name = "my_spider"
|
| 161 |
+
start_urls = ["https://example.com"]
|
| 162 |
+
|
| 163 |
+
async def is_blocked(self, response: Response) -> bool:
|
| 164 |
+
# Check status codes (default behavior)
|
| 165 |
+
if response.status in {403, 429, 503}:
|
| 166 |
+
return True
|
| 167 |
+
|
| 168 |
+
# Check response content
|
| 169 |
+
body = response.body.decode("utf-8", errors="ignore")
|
| 170 |
+
if "access denied" in body.lower() or "rate limit" in body.lower():
|
| 171 |
+
return True
|
| 172 |
+
|
| 173 |
+
return False
|
| 174 |
+
|
| 175 |
+
async def parse(self, response: Response):
|
| 176 |
+
yield {"title": response.css("title::text").get("")}
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
### Customizing Retries
|
| 180 |
+
|
| 181 |
+
Override `retry_blocked_request()` to modify the request before retrying. The `max_blocked_retries` attribute controls how many times a blocked request is retried (default: 3):
|
| 182 |
+
|
| 183 |
+
```python
|
| 184 |
+
from scrapling.spiders import Spider, SessionManager, Request, Response
|
| 185 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
class MySpider(Spider):
|
| 189 |
+
name = "my_spider"
|
| 190 |
+
start_urls = ["https://example.com"]
|
| 191 |
+
max_blocked_retries = 5
|
| 192 |
+
|
| 193 |
+
def configure_sessions(self, manager: SessionManager) -> None:
|
| 194 |
+
manager.add('requests', FetcherSession(impersonate=['chrome', 'firefox', 'safari']))
|
| 195 |
+
manager.add('stealth', AsyncStealthySession(block_webrtc=True), lazy=True)
|
| 196 |
+
|
| 197 |
+
async def retry_blocked_request(self, request: Request, response: Response) -> Request:
|
| 198 |
+
request.sid = "stealth"
|
| 199 |
+
self.logger.info(f"Retrying blocked request: {request.url}")
|
| 200 |
+
return request
|
| 201 |
+
|
| 202 |
+
async def parse(self, response: Response):
|
| 203 |
+
yield {"title": response.css("title::text").get("")}
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
What happened above is that I left the blocking detection logic unchanged and had the spider mainly use requests until it got blocked, then switch to the stealthy browser.
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
Putting it all together:
|
| 210 |
+
|
| 211 |
+
```python
|
| 212 |
+
from scrapling.spiders import Spider, SessionManager, Request, Response
|
| 213 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, ProxyRotator
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
cheap_proxies = ProxyRotator(["http://proxy1:8080", "http://proxy2:8080"])
|
| 217 |
+
|
| 218 |
+
# A format acceptable by the browser
|
| 219 |
+
expensive_proxies = ProxyRotator([
|
| 220 |
+
{"server": "http://residential_proxy1:8080", "username": "user", "password": "pass"},
|
| 221 |
+
{"server": "http://residential_proxy2:8080", "username": "user", "password": "pass"},
|
| 222 |
+
{"server": "http://mobile_proxy1:8080", "username": "user", "password": "pass"},
|
| 223 |
+
{"server": "http://mobile_proxy2:8080", "username": "user", "password": "pass"},
|
| 224 |
+
])
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
class MySpider(Spider):
|
| 228 |
+
name = "my_spider"
|
| 229 |
+
start_urls = ["https://example.com"]
|
| 230 |
+
max_blocked_retries = 5
|
| 231 |
+
|
| 232 |
+
def configure_sessions(self, manager: SessionManager) -> None:
|
| 233 |
+
manager.add('requests', FetcherSession(impersonate=['chrome', 'firefox', 'safari'], proxy_rotator=cheap_proxies))
|
| 234 |
+
manager.add('stealth', AsyncStealthySession(block_webrtc=True, proxy_rotator=expensive_proxies), lazy=True)
|
| 235 |
+
|
| 236 |
+
async def retry_blocked_request(self, request: Request, response: Response) -> Request:
|
| 237 |
+
request.sid = "stealth"
|
| 238 |
+
self.logger.info(f"Retrying blocked request: {request.url}")
|
| 239 |
+
return request
|
| 240 |
+
|
| 241 |
+
async def parse(self, response: Response):
|
| 242 |
+
yield {"title": response.css("title::text").get("")}
|
| 243 |
+
```
|
| 244 |
+
The above logic is: requests are made with cheap proxies, such as datacenter proxies, until they are blocked, then retried with higher-quality proxies, such as residential or mobile proxies.
|
docs/spiders/requests-responses.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Requests & Responses
|
| 2 |
+
|
| 3 |
+
!!! success "Prerequisites"
|
| 4 |
+
|
| 5 |
+
1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider.
|
| 6 |
+
|
| 7 |
+
This page covers the `Request` object in detail — how to construct requests, pass data between callbacks, control priority and deduplication, and use `response.follow()` for link-following.
|
| 8 |
+
|
| 9 |
+
## The Request Object
|
| 10 |
+
|
| 11 |
+
A `Request` represents a URL to be fetched. You create requests either directly or via `response.follow()`:
|
| 12 |
+
|
| 13 |
+
```python
|
| 14 |
+
from scrapling.spiders import Request
|
| 15 |
+
|
| 16 |
+
# Direct construction
|
| 17 |
+
request = Request(
|
| 18 |
+
"https://example.com/page",
|
| 19 |
+
callback=self.parse_page,
|
| 20 |
+
priority=5,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
# Via response.follow (preferred in callbacks)
|
| 24 |
+
request = response.follow("/page", callback=self.parse_page)
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
Here are all the arguments you can pass to `Request`:
|
| 28 |
+
|
| 29 |
+
| Argument | Type | Default | Description |
|
| 30 |
+
|---------------|------------|------------|-------------------------------------------------------------------------------------------------------|
|
| 31 |
+
| `url` | `str` | *required* | The URL to fetch |
|
| 32 |
+
| `sid` | `str` | `""` | Session ID — routes the request to a specific session (see [Sessions](sessions.md)) |
|
| 33 |
+
| `callback` | `callable` | `None` | Async generator method to process the response. Defaults to `parse()` |
|
| 34 |
+
| `priority` | `int` | `0` | Higher values are processed first |
|
| 35 |
+
| `dont_filter` | `bool` | `False` | If `True`, skip deduplication (allow duplicate requests) |
|
| 36 |
+
| `meta` | `dict` | `{}` | Arbitrary metadata passed through to the response |
|
| 37 |
+
| `**kwargs` | | | Additional keyword arguments passed to the session's fetch method (e.g., `headers`, `method`, `data`) |
|
| 38 |
+
|
| 39 |
+
Any extra keyword arguments are forwarded directly to the underlying session. For example, to make a POST request:
|
| 40 |
+
|
| 41 |
+
```python
|
| 42 |
+
yield Request(
|
| 43 |
+
"https://example.com/api",
|
| 44 |
+
method="POST",
|
| 45 |
+
data={"key": "value"},
|
| 46 |
+
callback=self.parse_result,
|
| 47 |
+
)
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
## Response.follow()
|
| 51 |
+
|
| 52 |
+
`response.follow()` is the recommended way to create follow-up requests inside callbacks. It offers several advantages over constructing `Request` objects directly:
|
| 53 |
+
|
| 54 |
+
- **Relative URLs** are resolved automatically against the current page URL
|
| 55 |
+
- **Referer header** is set to the current page URL by default
|
| 56 |
+
- **Session kwargs** from the original request are inherited (headers, proxy settings, etc.)
|
| 57 |
+
- **Callback, session ID, and priority** are inherited from the original request if not specified
|
| 58 |
+
|
| 59 |
+
```python
|
| 60 |
+
async def parse(self, response: Response):
|
| 61 |
+
# Minimal — inherits callback, sid, priority from current request
|
| 62 |
+
yield response.follow("/next-page")
|
| 63 |
+
|
| 64 |
+
# Override specific fields
|
| 65 |
+
yield response.follow(
|
| 66 |
+
"/product/123",
|
| 67 |
+
callback=self.parse_product,
|
| 68 |
+
priority=10,
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
# Pass additional metadata to the callback
|
| 72 |
+
yield response.follow(
|
| 73 |
+
"/details",
|
| 74 |
+
callback=self.parse_details,
|
| 75 |
+
meta={"category": "electronics"},
|
| 76 |
+
)
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
| Argument | Type | Default | Description |
|
| 80 |
+
|--------------------|------------|------------|------------------------------------------------------------|
|
| 81 |
+
| `url` | `str` | *required* | URL to follow (absolute or relative) |
|
| 82 |
+
| `sid` | `str` | `""` | Session ID (inherits from original request if empty) |
|
| 83 |
+
| `callback` | `callable` | `None` | Callback method (inherits from original request if `None`) |
|
| 84 |
+
| `priority` | `int` | `None` | Priority (inherits from original request if `None`) |
|
| 85 |
+
| `dont_filter` | `bool` | `False` | Skip deduplication |
|
| 86 |
+
| `meta` | `dict` | `None` | Metadata (merged with existing response meta) |
|
| 87 |
+
| **`referer_flow`** | `bool` | `True` | Set current URL as Referer header |
|
| 88 |
+
| `**kwargs` | | | Merged with original request's session kwargs |
|
| 89 |
+
|
| 90 |
+
### Disabling Referer Flow
|
| 91 |
+
|
| 92 |
+
By default, `response.follow()` sets the `Referer` header to the current page URL. To disable this:
|
| 93 |
+
|
| 94 |
+
```python
|
| 95 |
+
yield response.follow("/page", referer_flow=False)
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
## Callbacks
|
| 99 |
+
|
| 100 |
+
Callbacks are async generator methods on your spider that process responses. They must `yield` one of three types:
|
| 101 |
+
|
| 102 |
+
- **`dict`** — A scraped item, added to the results
|
| 103 |
+
- **`Request`** — A follow-up request, added to the queue
|
| 104 |
+
- **`None`** — Silently ignored
|
| 105 |
+
|
| 106 |
+
```python
|
| 107 |
+
class MySpider(Spider):
|
| 108 |
+
name = "my_spider"
|
| 109 |
+
start_urls = ["https://example.com"]
|
| 110 |
+
|
| 111 |
+
async def parse(self, response: Response):
|
| 112 |
+
# Yield items (dicts)
|
| 113 |
+
yield {"url": response.url, "title": response.css("title::text").get("")}
|
| 114 |
+
|
| 115 |
+
# Yield follow-up requests
|
| 116 |
+
for link in response.css("a::attr(href)").getall():
|
| 117 |
+
yield response.follow(link, callback=self.parse_page)
|
| 118 |
+
|
| 119 |
+
async def parse_page(self, response: Response):
|
| 120 |
+
yield {"content": response.css("article::text").get("")}
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
!!! tip "Note:"
|
| 124 |
+
|
| 125 |
+
All callback methods must be `async def` and use `yield` (not `return`). Even if a callback only yields items with no follow-up requests, it must still be an async generator.
|
| 126 |
+
|
| 127 |
+
## Request Priority
|
| 128 |
+
|
| 129 |
+
Requests with higher priority values are processed first. This is useful when some pages are more important to process than others:
|
| 130 |
+
|
| 131 |
+
```python
|
| 132 |
+
async def parse(self, response: Response):
|
| 133 |
+
# High priority — process product pages first
|
| 134 |
+
for link in response.css("a.product::attr(href)").getall():
|
| 135 |
+
yield response.follow(link, callback=self.parse_product, priority=10)
|
| 136 |
+
|
| 137 |
+
# Low priority — pagination links processed after products
|
| 138 |
+
next_page = response.css("a.next::attr(href)").get()
|
| 139 |
+
if next_page:
|
| 140 |
+
yield response.follow(next_page, callback=self.parse, priority=0)
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
When using `response.follow()`, the priority is inherited from the original request unless you specify a new one.
|
| 144 |
+
|
| 145 |
+
## Deduplication
|
| 146 |
+
|
| 147 |
+
The spider automatically deduplicates requests based on a fingerprint computed from the URL, HTTP method, request body, and session ID. If two requests produce the same fingerprint, the second one is silently dropped.
|
| 148 |
+
|
| 149 |
+
To allow duplicate requests (e.g., re-visiting a page after login), set `dont_filter=True`:
|
| 150 |
+
|
| 151 |
+
```python
|
| 152 |
+
yield Request("https://example.com/dashboard", dont_filter=True, callback=self.parse_dashboard)
|
| 153 |
+
|
| 154 |
+
# Or with response.follow
|
| 155 |
+
yield response.follow("/dashboard", dont_filter=True, callback=self.parse_dashboard)
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
You can fine-tune what goes into the fingerprint using class attributes on your spider:
|
| 159 |
+
|
| 160 |
+
| Attribute | Default | Effect |
|
| 161 |
+
|----------------------|---------|-----------------------------------------------------------------------------------------------------------------|
|
| 162 |
+
| `fp_include_kwargs` | `False` | Include extra request kwargs (arguments you passed to the session fetch, like headers, etc.) in the fingerprint |
|
| 163 |
+
| `fp_keep_fragments` | `False` | Keep URL fragments (`#section`) when computing fingerprints |
|
| 164 |
+
| `fp_include_headers` | `False` | Include request headers in the fingerprint |
|
| 165 |
+
|
| 166 |
+
For example, if you need to treat `https://example.com/page#section1` and `https://example.com/page#section2` as different URLs:
|
| 167 |
+
|
| 168 |
+
```python
|
| 169 |
+
class MySpider(Spider):
|
| 170 |
+
name = "my_spider"
|
| 171 |
+
fp_keep_fragments = True
|
| 172 |
+
# ...
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
## Request Meta
|
| 176 |
+
|
| 177 |
+
The `meta` dictionary lets you pass arbitrary data between callbacks. This is useful when you need context from one page to process another:
|
| 178 |
+
|
| 179 |
+
```python
|
| 180 |
+
async def parse(self, response: Response):
|
| 181 |
+
for product in response.css("div.product"):
|
| 182 |
+
category = product.css("span.category::text").get("")
|
| 183 |
+
link = product.css("a::attr(href)").get()
|
| 184 |
+
if link:
|
| 185 |
+
yield response.follow(
|
| 186 |
+
link,
|
| 187 |
+
callback=self.parse_product,
|
| 188 |
+
meta={"category": category},
|
| 189 |
+
)
|
| 190 |
+
|
| 191 |
+
async def parse_product(self, response: Response):
|
| 192 |
+
yield {
|
| 193 |
+
"name": response.css("h1::text").get(""),
|
| 194 |
+
"price": response.css(".price::text").get(""),
|
| 195 |
+
# Access meta from the request
|
| 196 |
+
"category": response.meta.get("category", ""),
|
| 197 |
+
}
|
| 198 |
+
```
|
| 199 |
+
|
| 200 |
+
When using `response.follow()`, the meta from the current response is merged with the new meta you provide (new values take precedence).
|
| 201 |
+
|
| 202 |
+
The spider system also automatically stores some metadata. For example, the proxy used for a request is available as `response.meta["proxy"]` when proxy rotation is enabled.
|
docs/spiders/sessions.md
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Spiders sessions
|
| 2 |
+
|
| 3 |
+
!!! success "Prerequisites"
|
| 4 |
+
|
| 5 |
+
1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider.
|
| 6 |
+
2. You're familiar with [Fetchers basics](../fetching/choosing.md) and the differences between HTTP, Dynamic, and Stealthy sessions.
|
| 7 |
+
|
| 8 |
+
A spider can use multiple fetcher sessions simultaneously — for example, a fast HTTP session for simple pages and a stealth browser session for protected pages. This page shows you how to configure and use sessions.
|
| 9 |
+
|
| 10 |
+
## What are Sessions?
|
| 11 |
+
|
| 12 |
+
As you should already know, a session is a pre-configured fetcher instance that stays alive for the duration of the crawl. Instead of creating a new connection or browser for every request, the spider reuses sessions, which is faster and more resource-efficient.
|
| 13 |
+
|
| 14 |
+
By default, every spider creates a single [FetcherSession](../fetching/static.md). You can add more sessions or swap the default by overriding the `configure_sessions()` method, but you have to use the async version of each session only, as the table shows below:
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
| Session Type | Use Case |
|
| 18 |
+
|-------------------------------------------------|------------------------------------------|
|
| 19 |
+
| [FetcherSession](../fetching/static.md) | Fast HTTP requests, no JavaScript |
|
| 20 |
+
| [AsyncDynamicSession](../fetching/dynamic.md) | Browser automation, JavaScript rendering |
|
| 21 |
+
| [AsyncStealthySession](../fetching/stealthy.md) | Anti-bot bypass, Cloudflare, etc. |
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
## Configuring Sessions
|
| 25 |
+
|
| 26 |
+
Override `configure_sessions()` on your spider to set up sessions. The `manager` parameter is a `SessionManager` instance — use `manager.add()` to register sessions:
|
| 27 |
+
|
| 28 |
+
```python
|
| 29 |
+
from scrapling.spiders import Spider, Response
|
| 30 |
+
from scrapling.fetchers import FetcherSession
|
| 31 |
+
|
| 32 |
+
class MySpider(Spider):
|
| 33 |
+
name = "my_spider"
|
| 34 |
+
start_urls = ["https://example.com"]
|
| 35 |
+
|
| 36 |
+
def configure_sessions(self, manager):
|
| 37 |
+
manager.add("default", FetcherSession())
|
| 38 |
+
|
| 39 |
+
async def parse(self, response: Response):
|
| 40 |
+
yield {"title": response.css("title::text").get("")}
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
The `manager.add()` method takes:
|
| 44 |
+
|
| 45 |
+
| Argument | Type | Default | Description |
|
| 46 |
+
|--------------|-----------|------------|----------------------------------------------|
|
| 47 |
+
| `session_id` | `str` | *required* | A name to reference this session in requests |
|
| 48 |
+
| `session` | `Session` | *required* | The session instance |
|
| 49 |
+
| `default` | `bool` | `False` | Make this the default session |
|
| 50 |
+
| `lazy` | `bool` | `False` | Start the session only when first used |
|
| 51 |
+
|
| 52 |
+
!!! note "Notes:"
|
| 53 |
+
|
| 54 |
+
1. In all requests, if you don't specify which session to use, the default session is used. The default session is determined in one of two ways:
|
| 55 |
+
1. The first session you add to the manager becomes the default automatically.
|
| 56 |
+
2. The session that gets `default=True` while added to the manager.
|
| 57 |
+
2. The instances you pass of each session don't have to be already started by you; the spider checks on all sessions if they are not already started and starts them.
|
| 58 |
+
3. If you want a specific session to start when used only, then use the `lazy` argument while adding that session to the manager. Example: start the browser only when you need it, not with the spider start.
|
| 59 |
+
|
| 60 |
+
## Multi-Session Spider
|
| 61 |
+
|
| 62 |
+
Here's a practical example: use a fast HTTP session for listing pages and a stealth browser for detail pages that have bot protection:
|
| 63 |
+
|
| 64 |
+
```python
|
| 65 |
+
from scrapling.spiders import Spider, Response
|
| 66 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 67 |
+
|
| 68 |
+
class ProductSpider(Spider):
|
| 69 |
+
name = "products"
|
| 70 |
+
start_urls = ["https://shop.example.com/products"]
|
| 71 |
+
|
| 72 |
+
def configure_sessions(self, manager):
|
| 73 |
+
# Fast HTTP for listing pages (default)
|
| 74 |
+
manager.add("http", FetcherSession())
|
| 75 |
+
|
| 76 |
+
# Stealth browser for protected product pages
|
| 77 |
+
manager.add("stealth", AsyncStealthySession(
|
| 78 |
+
headless=True,
|
| 79 |
+
network_idle=True,
|
| 80 |
+
))
|
| 81 |
+
|
| 82 |
+
async def parse(self, response: Response):
|
| 83 |
+
for link in response.css("a.product::attr(href)").getall():
|
| 84 |
+
# Route product pages through the stealth session
|
| 85 |
+
yield response.follow(link, sid="stealth", callback=self.parse_product)
|
| 86 |
+
|
| 87 |
+
next_page = response.css("a.next::attr(href)").get()
|
| 88 |
+
if next_page:
|
| 89 |
+
yield response.follow(next_page)
|
| 90 |
+
|
| 91 |
+
async def parse_product(self, response: Response):
|
| 92 |
+
yield {
|
| 93 |
+
"name": response.css("h1::text").get(""),
|
| 94 |
+
"price": response.css(".price::text").get(""),
|
| 95 |
+
}
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
The key is the `sid` parameter — it tells the spider which session to use for each request. When you call `response.follow()` without `sid`, the session ID from the original request is inherited.
|
| 99 |
+
|
| 100 |
+
Note that the sessions don't all have to be of different classes; you can also add multiple instances of the same session class with different configurations, as shown below:
|
| 101 |
+
|
| 102 |
+
```python
|
| 103 |
+
from scrapling.spiders import Spider, Response
|
| 104 |
+
from scrapling.fetchers import FetcherSession
|
| 105 |
+
|
| 106 |
+
class ProductSpider(Spider):
|
| 107 |
+
name = "products"
|
| 108 |
+
start_urls = ["https://shop.example.com/products"]
|
| 109 |
+
|
| 110 |
+
def configure_sessions(self, manager):
|
| 111 |
+
chrome_requests = FetcherSession(impersonate="chrome")
|
| 112 |
+
firefox_requests = FetcherSession(impersonate="firefox")
|
| 113 |
+
|
| 114 |
+
manager.add("chrome", chrome_requests)
|
| 115 |
+
manager.add("firefox", firefox_requests)
|
| 116 |
+
|
| 117 |
+
async def parse(self, response: Response):
|
| 118 |
+
for link in response.css("a.product::attr(href)").getall():
|
| 119 |
+
yield response.follow(link, callback=self.parse_product)
|
| 120 |
+
|
| 121 |
+
next_page = response.css("a.next::attr(href)").get()
|
| 122 |
+
if next_page:
|
| 123 |
+
yield response.follow(next_page, sid="firefox")
|
| 124 |
+
|
| 125 |
+
async def parse_product(self, response: Response):
|
| 126 |
+
yield {
|
| 127 |
+
"name": response.css("h1::text").get(""),
|
| 128 |
+
"price": response.css(".price::text").get(""),
|
| 129 |
+
}
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
Or you can separate concerns and keep a session with its cookies/state for specific requests, etc...
|
| 133 |
+
|
| 134 |
+
## Session Arguments
|
| 135 |
+
|
| 136 |
+
Extra keyword arguments passed to a `Request` (or through `response.follow(**kwargs)`) are forwarded to the session's fetch method. This lets you customize individual requests without changing the session configuration:
|
| 137 |
+
|
| 138 |
+
```python
|
| 139 |
+
async def parse(self, response: Response):
|
| 140 |
+
# Pass extra headers for this specific request
|
| 141 |
+
yield Request(
|
| 142 |
+
"https://api.example.com/data",
|
| 143 |
+
headers={"Authorization": "Bearer token123"},
|
| 144 |
+
callback=self.parse_api,
|
| 145 |
+
)
|
| 146 |
+
|
| 147 |
+
# Use a different HTTP method
|
| 148 |
+
yield Request(
|
| 149 |
+
"https://example.com/submit",
|
| 150 |
+
method="POST",
|
| 151 |
+
data={"field": "value"},
|
| 152 |
+
sid="firefox",
|
| 153 |
+
callback=self.parse_result,
|
| 154 |
+
)
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
!!! warning
|
| 158 |
+
|
| 159 |
+
Normally, when you use `FetcherSession`, `Fetcher`, or `AsyncFetcher`, you specify the HTTP method to use with the corresponding method like `.get()` and `.post()`. But while using `FetcherSession` in spiders, you can't do this. By default, the request is an _HTTP GET_ request; if you want to use another HTTP method, you have to pass it to the `method` argument, as in the above example. The reason for this is to unify the `Request` interface across all session types.
|
| 160 |
+
|
| 161 |
+
For browser sessions (`AsyncDynamicSession`, `AsyncStealthySession`), you can pass browser-specific arguments like `wait_selector`, `page_action`, or `extra_headers`:
|
| 162 |
+
|
| 163 |
+
```python
|
| 164 |
+
async def parse(self, response: Response):
|
| 165 |
+
# Use Cloudflare solver with the `AsyncStealthySession` we configured above
|
| 166 |
+
yield Request(
|
| 167 |
+
"https://nopecha.com/demo/cloudflare",
|
| 168 |
+
sid="stealth",
|
| 169 |
+
callback=self.parse_result,
|
| 170 |
+
solve_cloudflare=True,
|
| 171 |
+
block_webrtc=True,
|
| 172 |
+
hide_canvas=True,
|
| 173 |
+
google_search=True,
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
yield response.follow(
|
| 177 |
+
"/dynamic-page",
|
| 178 |
+
sid="browser",
|
| 179 |
+
callback=self.parse_dynamic,
|
| 180 |
+
wait_selector="div.loaded",
|
| 181 |
+
network_idle=True,
|
| 182 |
+
)
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
!!! warning
|
| 186 |
+
|
| 187 |
+
Session arguments (`**kwargs`) passed from the original request are inherited by `response.follow()`. New kwargs take precedence over inherited ones.
|
| 188 |
+
|
| 189 |
+
```python
|
| 190 |
+
from scrapling.spiders import Spider, Response
|
| 191 |
+
from scrapling.fetchers import FetcherSession
|
| 192 |
+
|
| 193 |
+
class ProductSpider(Spider):
|
| 194 |
+
name = "products"
|
| 195 |
+
start_urls = ["https://shop.example.com/products"]
|
| 196 |
+
|
| 197 |
+
def configure_sessions(self, manager):
|
| 198 |
+
manager.add("http", FetcherSession(impersonate='chrome'))
|
| 199 |
+
|
| 200 |
+
async def parse(self, response: Response):
|
| 201 |
+
# I don't want the follow request to impersonate a desktop Chrome like the previous request, but a mobile one
|
| 202 |
+
# so I override it like this
|
| 203 |
+
for link in response.css("a.product::attr(href)").getall():
|
| 204 |
+
yield response.follow(link, impersonate="chrome131_android", callback=self.parse_product)
|
| 205 |
+
|
| 206 |
+
next_page = response.css("a.next::attr(href)").get()
|
| 207 |
+
if next_page:
|
| 208 |
+
yield Request(next_page)
|
| 209 |
+
|
| 210 |
+
async def parse_product(self, response: Response):
|
| 211 |
+
yield {
|
| 212 |
+
"name": response.css("h1::text").get(""),
|
| 213 |
+
"price": response.css(".price::text").get(""),
|
| 214 |
+
}
|
| 215 |
+
```
|
| 216 |
+
!!! info
|
| 217 |
+
|
| 218 |
+
Needless to say, upon spider closure, the manager automatically checks whether any sessions are still running and closes them before closing the spider.
|
docs/tutorials/migrating_from_beautifulsoup.md
CHANGED
|
@@ -18,10 +18,10 @@ You will notice that some shortcuts in BeautifulSoup are missing in Scrapling, w
|
|
| 18 |
| Finding a single element (Example 4) | `element = soup.find(lambda e: len(list(e.children)) > 0)` | `element = page.find(lambda e: len(e.children) > 0)` |
|
| 19 |
| Finding a single element (Example 5) | `element = soup.find(["a", "b"])` | `element = page.find(["a", "b"])` |
|
| 20 |
| Find element by its text content | `element = soup.find(text="some text")` | `element = page.find_by_text("some text", partial=False)` |
|
| 21 |
-
| Using CSS selectors to find the first matching element | `elements = soup.select_one('div.example')` | `elements = page.
|
| 22 |
| Using CSS selectors to find all matching element | `elements = soup.select('div.example')` | `elements = page.css('div.example')` |
|
| 23 |
| Get a prettified version of the page/element source | `prettified = soup.prettify()` | `prettified = page.prettify()` |
|
| 24 |
-
| Get a Non-pretty version of the page/element source | `source = str(soup)` | `source = page.
|
| 25 |
| Get tag name of an element | `name = element.name` | `name = element.tag` |
|
| 26 |
| Extracting text content of an element | `string = element.string` | `string = element.text` |
|
| 27 |
| Extracting all the text in a document or beneath a tag | `text = soup.get_text(strip=True)` | `text = page.get_all_text(strip=True)` |
|
|
@@ -36,14 +36,16 @@ You will notice that some shortcuts in BeautifulSoup are missing in Scrapling, w
|
|
| 36 |
| Searching for elements in the siblings of an element | `target_sibling = element.find_next_siblings("a")`<br/>`target_sibling = element.find_previous_siblings("a")` | `target_sibling = element.siblings.filter(lambda s: s.tag == 'a')` |
|
| 37 |
| Searching for an element in the next elements of an element | `target_parent = element.find_next("a")` | `target_parent = element.below_elements.search(lambda p: p.tag == 'a')` |
|
| 38 |
| Searching for elements in the next elements of an element | `target_parent = element.find_all_next("a")` | `target_parent = element.below_elements.filter(lambda p: p.tag == 'a')` |
|
| 39 |
-
| Searching for an element in the
|
| 40 |
-
| Searching for elements in the
|
| 41 |
| Get previous sibling of an element | `prev_element = element.previous_sibling` | `prev_element = element.previous` |
|
| 42 |
| Navigating to children | `children = list(element.children)` | `children = element.children` |
|
| 43 |
| Get all descendants of an element | `children = list(element.descendants)` | `children = element.below_elements` |
|
| 44 |
| Filtering a group of elements that satisfies a condition | `group = soup.find('p', 'story').css.filter('a')` | `group = page.find_all('p', 'story').filter(lambda p: p.tag == 'a')` |
|
| 45 |
|
| 46 |
|
|
|
|
|
|
|
| 47 |
**One key point to remember**: BeautifulSoup offers features for modifying and manipulating the page after it has been parsed. Scrapling focuses more on scraping the page faster for you, and then you can do what you want with the extracted information. So, two different tools can be used in Web Scraping, but one of them specializes in Web Scraping :)
|
| 48 |
|
| 49 |
### Putting It All Together
|
|
@@ -80,12 +82,12 @@ for link in links:
|
|
| 80 |
|
| 81 |
As you can see, Scrapling simplifies the process by combining fetching and parsing into a single step, making your code cleaner and more efficient.
|
| 82 |
|
| 83 |
-
**Additional Notes:**
|
| 84 |
|
| 85 |
-
- **Different parsers**: BeautifulSoup allows you to set the parser engine to use, and one of them is `lxml`. Scrapling doesn't do that and uses the `lxml` library by default for performance reasons.
|
| 86 |
-
- **Element Types**: In BeautifulSoup, elements are `Tag` objects; in Scrapling, they are `Selector` objects. However, they provide similar methods and properties for navigation and data extraction.
|
| 87 |
-
- **Error Handling**: Both libraries return `None` when an element is not found (e.g., `soup.find()` or `page.
|
| 88 |
-
- **Text Extraction**: Scrapling provides additional methods for handling text through `TextHandler`, such as `clean()`, which can help remove extra whitespace, consecutive spaces, or unwanted characters. Please check out the documentation for the complete list.
|
| 89 |
|
| 90 |
The documentation provides more details on Scrapling's features and the complete list of arguments that can be passed to all methods.
|
| 91 |
|
|
|
|
| 18 |
| Finding a single element (Example 4) | `element = soup.find(lambda e: len(list(e.children)) > 0)` | `element = page.find(lambda e: len(e.children) > 0)` |
|
| 19 |
| Finding a single element (Example 5) | `element = soup.find(["a", "b"])` | `element = page.find(["a", "b"])` |
|
| 20 |
| Find element by its text content | `element = soup.find(text="some text")` | `element = page.find_by_text("some text", partial=False)` |
|
| 21 |
+
| Using CSS selectors to find the first matching element | `elements = soup.select_one('div.example')` | `elements = page.css('div.example').first` |
|
| 22 |
| Using CSS selectors to find all matching element | `elements = soup.select('div.example')` | `elements = page.css('div.example')` |
|
| 23 |
| Get a prettified version of the page/element source | `prettified = soup.prettify()` | `prettified = page.prettify()` |
|
| 24 |
+
| Get a Non-pretty version of the page/element source | `source = str(soup)` | `source = page.html_content` |
|
| 25 |
| Get tag name of an element | `name = element.name` | `name = element.tag` |
|
| 26 |
| Extracting text content of an element | `string = element.string` | `string = element.text` |
|
| 27 |
| Extracting all the text in a document or beneath a tag | `text = soup.get_text(strip=True)` | `text = page.get_all_text(strip=True)` |
|
|
|
|
| 36 |
| Searching for elements in the siblings of an element | `target_sibling = element.find_next_siblings("a")`<br/>`target_sibling = element.find_previous_siblings("a")` | `target_sibling = element.siblings.filter(lambda s: s.tag == 'a')` |
|
| 37 |
| Searching for an element in the next elements of an element | `target_parent = element.find_next("a")` | `target_parent = element.below_elements.search(lambda p: p.tag == 'a')` |
|
| 38 |
| Searching for elements in the next elements of an element | `target_parent = element.find_all_next("a")` | `target_parent = element.below_elements.filter(lambda p: p.tag == 'a')` |
|
| 39 |
+
| Searching for an element in the ancestors of an element | `target_parent = element.find_previous("a")` ¹ | `target_parent = element.path.search(lambda p: p.tag == 'a')` |
|
| 40 |
+
| Searching for elements in the ancestors of an element | `target_parent = element.find_all_previous("a")` ¹ | `target_parent = element.path.filter(lambda p: p.tag == 'a')` |
|
| 41 |
| Get previous sibling of an element | `prev_element = element.previous_sibling` | `prev_element = element.previous` |
|
| 42 |
| Navigating to children | `children = list(element.children)` | `children = element.children` |
|
| 43 |
| Get all descendants of an element | `children = list(element.descendants)` | `children = element.below_elements` |
|
| 44 |
| Filtering a group of elements that satisfies a condition | `group = soup.find('p', 'story').css.filter('a')` | `group = page.find_all('p', 'story').filter(lambda p: p.tag == 'a')` |
|
| 45 |
|
| 46 |
|
| 47 |
+
¹ **Note:** BS4's `find_previous`/`find_all_previous` searches all preceding elements in document order, while Scrapling's `path` only returns ancestors (the parent chain). These are not exact equivalents, but ancestor search covers the most common use case.
|
| 48 |
+
|
| 49 |
**One key point to remember**: BeautifulSoup offers features for modifying and manipulating the page after it has been parsed. Scrapling focuses more on scraping the page faster for you, and then you can do what you want with the extracted information. So, two different tools can be used in Web Scraping, but one of them specializes in Web Scraping :)
|
| 50 |
|
| 51 |
### Putting It All Together
|
|
|
|
| 82 |
|
| 83 |
As you can see, Scrapling simplifies the process by combining fetching and parsing into a single step, making your code cleaner and more efficient.
|
| 84 |
|
| 85 |
+
!!! abstract "**Additional Notes:**"
|
| 86 |
|
| 87 |
+
- **Different parsers**: BeautifulSoup allows you to set the parser engine to use, and one of them is `lxml`. Scrapling doesn't do that and uses the `lxml` library by default for performance reasons.
|
| 88 |
+
- **Element Types**: In BeautifulSoup, elements are `Tag` objects; in Scrapling, they are `Selector` objects. However, they provide similar methods and properties for navigation and data extraction.
|
| 89 |
+
- **Error Handling**: Both libraries return `None` when an element is not found (e.g., `soup.find()` or `page.find()`). In Scrapling, `page.css()` returns an empty `Selectors` list when no elements match, and you can use `page.css('.foo').first` to safely get the first match or `None`. To avoid errors, check for `None` or empty results before accessing properties.
|
| 90 |
+
- **Text Extraction**: Scrapling provides additional methods for handling text through `TextHandler`, such as `clean()`, which can help remove extra whitespace, consecutive spaces, or unwanted characters. Please check out the documentation for the complete list.
|
| 91 |
|
| 92 |
The documentation provides more details on Scrapling's features and the complete list of arguments that can be passed to all methods.
|
| 93 |
|
mkdocs.yml
DELETED
|
@@ -1,180 +0,0 @@
|
|
| 1 |
-
site_name: Scrapling
|
| 2 |
-
site_description: Scrapling - Easy, effortless Web Scraping as it should be!
|
| 3 |
-
site_author: Karim Shoair
|
| 4 |
-
repo_url: https://github.com/D4Vinci/Scrapling
|
| 5 |
-
site_url: https://scrapling.readthedocs.io/en/latest/
|
| 6 |
-
repo_name: D4Vinci/Scrapling
|
| 7 |
-
copyright: Copyright © 2025 Karim Shoair - <a href="#__consent">Change cookie settings</a>
|
| 8 |
-
|
| 9 |
-
theme:
|
| 10 |
-
name: material
|
| 11 |
-
language: en
|
| 12 |
-
logo: assets/logo.png
|
| 13 |
-
favicon: assets/favicon.ico
|
| 14 |
-
palette:
|
| 15 |
-
scheme: slate
|
| 16 |
-
primary: black
|
| 17 |
-
accent: deep purple
|
| 18 |
-
font:
|
| 19 |
-
text: Open Sans
|
| 20 |
-
code: JetBrains Mono
|
| 21 |
-
icon:
|
| 22 |
-
repo: fontawesome/brands/github-alt
|
| 23 |
-
features:
|
| 24 |
-
- announce.dismiss
|
| 25 |
-
- navigation.top
|
| 26 |
-
- navigation.footer
|
| 27 |
-
- navigation.instant
|
| 28 |
-
- navigation.indexes
|
| 29 |
-
- navigation.sections
|
| 30 |
-
- navigation.tracking
|
| 31 |
-
- navigation.instant
|
| 32 |
-
- navigation.instant.prefetch
|
| 33 |
-
- navigation.instant.progress
|
| 34 |
-
# - navigation.tabs
|
| 35 |
-
# - navigation.expand
|
| 36 |
-
# - toc.integrate
|
| 37 |
-
- search.share
|
| 38 |
-
- search.suggest
|
| 39 |
-
- search.highlight
|
| 40 |
-
- content.tabs.link
|
| 41 |
-
- content.width.full
|
| 42 |
-
- content.action.view
|
| 43 |
-
- content.action.edit
|
| 44 |
-
- content.code.copy
|
| 45 |
-
- content.code.select
|
| 46 |
-
- content.code.annotate
|
| 47 |
-
- content.code.annotation
|
| 48 |
-
|
| 49 |
-
nav:
|
| 50 |
-
- Introduction: index.md
|
| 51 |
-
- Overview: overview.md
|
| 52 |
-
- What's New in v0.3: 'https://github.com/D4Vinci/Scrapling/releases/tag/v0.3'
|
| 53 |
-
- Performance Benchmarks: benchmarks.md
|
| 54 |
-
- User Guide:
|
| 55 |
-
- Parsing:
|
| 56 |
-
- Querying elements: parsing/selection.md
|
| 57 |
-
- Main classes: parsing/main_classes.md
|
| 58 |
-
- Adaptive scraping: parsing/adaptive.md
|
| 59 |
-
- Fetching:
|
| 60 |
-
- Fetchers basics: fetching/choosing.md
|
| 61 |
-
- HTTP requests: fetching/static.md
|
| 62 |
-
- Dynamic websites: fetching/dynamic.md
|
| 63 |
-
- Dynamic websites with hard protections: fetching/stealthy.md
|
| 64 |
-
- Command Line Interface:
|
| 65 |
-
- Overview: cli/overview.md
|
| 66 |
-
- Interactive shell: cli/interactive-shell.md
|
| 67 |
-
- Extract commands: cli/extract-commands.md
|
| 68 |
-
- Integrations:
|
| 69 |
-
- AI MCP server: ai/mcp-server.md
|
| 70 |
-
- Tutorials:
|
| 71 |
-
- A Free Alternative to AI for Robust Web Scraping: tutorials/replacing_ai.md
|
| 72 |
-
- Migrating from BeautifulSoup: tutorials/migrating_from_beautifulsoup.md
|
| 73 |
-
- Using Scrapeless browser: tutorials/external.md
|
| 74 |
-
# - Migrating from AutoScraper: tutorials/migrating_from_autoscraper.md
|
| 75 |
-
- Development:
|
| 76 |
-
- API Reference:
|
| 77 |
-
- Selector: api-reference/selector.md
|
| 78 |
-
- Fetchers: api-reference/fetchers.md
|
| 79 |
-
- MCP Server: api-reference/mcp-server.md
|
| 80 |
-
- Custom Types: api-reference/custom-types.md
|
| 81 |
-
- Writing your retrieval system: development/adaptive_storage_system.md
|
| 82 |
-
- Using Scrapling's custom types: development/scrapling_custom_types.md
|
| 83 |
-
- Support and Advertisement: donate.md
|
| 84 |
-
- Contributing: 'https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md'
|
| 85 |
-
- Changelog: 'https://github.com/D4Vinci/Scrapling/releases'
|
| 86 |
-
|
| 87 |
-
markdown_extensions:
|
| 88 |
-
- admonition
|
| 89 |
-
- abbr
|
| 90 |
-
# - mkautodoc
|
| 91 |
-
- pymdownx.emoji
|
| 92 |
-
- pymdownx.details
|
| 93 |
-
- pymdownx.superfences
|
| 94 |
-
- pymdownx.highlight:
|
| 95 |
-
anchor_linenums: true
|
| 96 |
-
- pymdownx.inlinehilite
|
| 97 |
-
- pymdownx.snippets
|
| 98 |
-
- pymdownx.tabbed:
|
| 99 |
-
alternate_style: true
|
| 100 |
-
- tables
|
| 101 |
-
- codehilite:
|
| 102 |
-
css_class: highlight
|
| 103 |
-
- toc:
|
| 104 |
-
permalink: true
|
| 105 |
-
|
| 106 |
-
plugins:
|
| 107 |
-
- search
|
| 108 |
-
- privacy:
|
| 109 |
-
links: false
|
| 110 |
-
- optimize
|
| 111 |
-
- social:
|
| 112 |
-
cards_layout_options:
|
| 113 |
-
background_color: "#1f1f1f"
|
| 114 |
-
font_family: Roboto
|
| 115 |
-
- mkdocstrings:
|
| 116 |
-
handlers:
|
| 117 |
-
python:
|
| 118 |
-
paths: [scrapling]
|
| 119 |
-
options:
|
| 120 |
-
docstring_style: sphinx
|
| 121 |
-
show_source: true
|
| 122 |
-
show_root_heading: true
|
| 123 |
-
show_if_no_docstring: true
|
| 124 |
-
inherited_members: true
|
| 125 |
-
members_order: source
|
| 126 |
-
separate_signature: true
|
| 127 |
-
unwrap_annotated: true
|
| 128 |
-
filters:
|
| 129 |
-
- '!^_'
|
| 130 |
-
- "^__"
|
| 131 |
-
merge_init_into_class: true
|
| 132 |
-
docstring_section_style: spacy
|
| 133 |
-
signature_crossrefs: true
|
| 134 |
-
show_symbol_type_heading: true
|
| 135 |
-
show_symbol_type_toc: true
|
| 136 |
-
show_inheritance_diagram: true
|
| 137 |
-
modernize_annotations: true
|
| 138 |
-
extensions:
|
| 139 |
-
- griffe_runtime_objects
|
| 140 |
-
- griffe_sphinx
|
| 141 |
-
- griffe_inherited_docstrings:
|
| 142 |
-
merge: true
|
| 143 |
-
|
| 144 |
-
extra:
|
| 145 |
-
homepage: https://scrapling.readthedocs.io/en/latest/
|
| 146 |
-
social:
|
| 147 |
-
- icon: fontawesome/brands/github
|
| 148 |
-
link: https://github.com/D4Vinci/Scrapling
|
| 149 |
-
- icon: fontawesome/brands/python
|
| 150 |
-
link: https://pypi.org/project/scrapling/
|
| 151 |
-
- icon: fontawesome/brands/x-twitter
|
| 152 |
-
link: https://x.com/Scrapling_dev
|
| 153 |
-
- icon: fontawesome/brands/discord
|
| 154 |
-
link: https://discord.gg/EMgGbDceNQ
|
| 155 |
-
analytics:
|
| 156 |
-
provider: google
|
| 157 |
-
property: G-CS3DKLY73Z
|
| 158 |
-
feedback:
|
| 159 |
-
title: Was this page helpful?
|
| 160 |
-
ratings:
|
| 161 |
-
- icon: material/emoticon-happy-outline
|
| 162 |
-
name: This page was helpful
|
| 163 |
-
data: 1
|
| 164 |
-
note: >-
|
| 165 |
-
Thanks for your feedback!
|
| 166 |
-
- icon: material/emoticon-sad-outline
|
| 167 |
-
name: This page could be improved
|
| 168 |
-
data: 0
|
| 169 |
-
note: >-
|
| 170 |
-
Thanks for your feedback!
|
| 171 |
-
consent:
|
| 172 |
-
title: Cookie consent
|
| 173 |
-
description: >-
|
| 174 |
-
We use cookies to recognize your repeated visits and preferences, as well
|
| 175 |
-
as to measure the effectiveness of our documentation and whether users
|
| 176 |
-
find what they're searching for. With your consent, you're helping us to
|
| 177 |
-
make our documentation better.
|
| 178 |
-
|
| 179 |
-
extra_css:
|
| 180 |
-
- stylesheets/extra.css
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pyproject.toml
CHANGED
|
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
|
|
| 5 |
[project]
|
| 6 |
name = "scrapling"
|
| 7 |
# Static version instead of a dynamic version so we can get better layer caching while building docker, check the docker file to understand
|
| 8 |
-
version = "0.
|
| 9 |
description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!"
|
| 10 |
readme = {file = "docs/README.md", content-type = "text/markdown"}
|
| 11 |
license = {file = "LICENSE"}
|
|
@@ -28,6 +28,9 @@ keywords = [
|
|
| 28 |
"web-crawler",
|
| 29 |
"browser",
|
| 30 |
"crawling",
|
|
|
|
|
|
|
|
|
|
| 31 |
]
|
| 32 |
requires-python = ">=3.10"
|
| 33 |
classifiers = [
|
|
@@ -46,6 +49,7 @@ classifiers = [
|
|
| 46 |
"Topic :: Text Processing :: Markup :: HTML",
|
| 47 |
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
| 48 |
"Topic :: Software Development :: Libraries",
|
|
|
|
| 49 |
"Topic :: Software Development :: Libraries :: Python Modules",
|
| 50 |
"Programming Language :: Python :: 3",
|
| 51 |
"Programming Language :: Python :: 3 :: Only",
|
|
@@ -58,9 +62,11 @@ classifiers = [
|
|
| 58 |
]
|
| 59 |
dependencies = [
|
| 60 |
"lxml>=6.0.2",
|
| 61 |
-
"cssselect>=1.
|
| 62 |
-
"orjson>=3.11.
|
| 63 |
-
"
|
|
|
|
|
|
|
| 64 |
]
|
| 65 |
|
| 66 |
[project.optional-dependencies]
|
|
@@ -69,8 +75,9 @@ fetchers = [
|
|
| 69 |
"curl_cffi>=0.14.0",
|
| 70 |
"playwright==1.56.0",
|
| 71 |
"patchright==1.56.0",
|
| 72 |
-
"browserforge>=1.2.
|
| 73 |
"msgspec>=0.20.0",
|
|
|
|
| 74 |
]
|
| 75 |
ai = [
|
| 76 |
"mcp>=1.24.0",
|
|
@@ -92,6 +99,8 @@ Changelog = "https://github.com/D4Vinci/Scrapling/releases"
|
|
| 92 |
Documentation = "https://scrapling.readthedocs.io/en/latest/"
|
| 93 |
Repository = "https://github.com/D4Vinci/Scrapling"
|
| 94 |
"Bug Tracker" = "https://github.com/D4Vinci/Scrapling/issues"
|
|
|
|
|
|
|
| 95 |
|
| 96 |
[project.scripts]
|
| 97 |
scrapling = "scrapling.cli:main"
|
|
@@ -102,4 +111,16 @@ include-package-data = true
|
|
| 102 |
|
| 103 |
[tool.setuptools.packages.find]
|
| 104 |
where = ["."]
|
| 105 |
-
include = ["scrapling*"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
[project]
|
| 6 |
name = "scrapling"
|
| 7 |
# Static version instead of a dynamic version so we can get better layer caching while building docker, check the docker file to understand
|
| 8 |
+
version = "0.4"
|
| 9 |
description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!"
|
| 10 |
readme = {file = "docs/README.md", content-type = "text/markdown"}
|
| 11 |
license = {file = "LICENSE"}
|
|
|
|
| 28 |
"web-crawler",
|
| 29 |
"browser",
|
| 30 |
"crawling",
|
| 31 |
+
"headless",
|
| 32 |
+
"scraper",
|
| 33 |
+
"chrome",
|
| 34 |
]
|
| 35 |
requires-python = ">=3.10"
|
| 36 |
classifiers = [
|
|
|
|
| 49 |
"Topic :: Text Processing :: Markup :: HTML",
|
| 50 |
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
| 51 |
"Topic :: Software Development :: Libraries",
|
| 52 |
+
"Topic :: Software Development :: Libraries :: Application Frameworks",
|
| 53 |
"Topic :: Software Development :: Libraries :: Python Modules",
|
| 54 |
"Programming Language :: Python :: 3",
|
| 55 |
"Programming Language :: Python :: 3 :: Only",
|
|
|
|
| 62 |
]
|
| 63 |
dependencies = [
|
| 64 |
"lxml>=6.0.2",
|
| 65 |
+
"cssselect>=1.4.0",
|
| 66 |
+
"orjson>=3.11.7",
|
| 67 |
+
"tld>=0.13.1",
|
| 68 |
+
"w3lib>=2.4.0",
|
| 69 |
+
"typing_extensions",
|
| 70 |
]
|
| 71 |
|
| 72 |
[project.optional-dependencies]
|
|
|
|
| 75 |
"curl_cffi>=0.14.0",
|
| 76 |
"playwright==1.56.0",
|
| 77 |
"patchright==1.56.0",
|
| 78 |
+
"browserforge>=1.2.4",
|
| 79 |
"msgspec>=0.20.0",
|
| 80 |
+
"anyio>=4.12.1"
|
| 81 |
]
|
| 82 |
ai = [
|
| 83 |
"mcp>=1.24.0",
|
|
|
|
| 99 |
Documentation = "https://scrapling.readthedocs.io/en/latest/"
|
| 100 |
Repository = "https://github.com/D4Vinci/Scrapling"
|
| 101 |
"Bug Tracker" = "https://github.com/D4Vinci/Scrapling/issues"
|
| 102 |
+
"Discord" = "https://discord.gg/EMgGbDceNQ"
|
| 103 |
+
"Release Notes" = "https://github.com/D4Vinci/Scrapling/releases"
|
| 104 |
|
| 105 |
[project.scripts]
|
| 106 |
scrapling = "scrapling.cli:main"
|
|
|
|
| 111 |
|
| 112 |
[tool.setuptools.packages.find]
|
| 113 |
where = ["."]
|
| 114 |
+
include = ["scrapling*"]
|
| 115 |
+
|
| 116 |
+
[tool.mypy]
|
| 117 |
+
python_version = "3.10"
|
| 118 |
+
warn_unused_configs = true
|
| 119 |
+
ignore_missing_imports = true
|
| 120 |
+
check_untyped_defs = true
|
| 121 |
+
|
| 122 |
+
[tool.pyright]
|
| 123 |
+
pythonVersion = "3.10"
|
| 124 |
+
typeCheckingMode = "basic"
|
| 125 |
+
include = ["scrapling"]
|
| 126 |
+
ignore = ["tests", "benchmarks.py"]
|
scrapling/__init__.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
| 2 |
-
__version__ = "0.
|
| 3 |
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
| 4 |
|
| 5 |
from typing import Any, TYPE_CHECKING
|
|
|
|
| 1 |
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
| 2 |
+
__version__ = "0.4"
|
| 3 |
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
| 4 |
|
| 5 |
from typing import Any, TYPE_CHECKING
|
scrapling/cli.py
CHANGED
|
@@ -128,6 +128,9 @@ def install(force): # pragma: no cover
|
|
| 128 |
],
|
| 129 |
"Playwright dependencies",
|
| 130 |
)
|
|
|
|
|
|
|
|
|
|
| 131 |
# if no errors raised by the above commands, then we add the below file
|
| 132 |
__PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch()
|
| 133 |
else:
|
|
|
|
| 128 |
],
|
| 129 |
"Playwright dependencies",
|
| 130 |
)
|
| 131 |
+
from tld.utils import update_tld_names
|
| 132 |
+
|
| 133 |
+
update_tld_names(fail_silently=True)
|
| 134 |
# if no errors raised by the above commands, then we add the below file
|
| 135 |
__PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch()
|
| 136 |
else:
|
scrapling/core/_html_utils.py
DELETED
|
@@ -1,342 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
This file is mostly copied from the submodule `w3lib.html` source code to stop downloading the whole library to use a small part of it.
|
| 3 |
-
So the goal of doing this is to minimize the memory footprint and keep the library size relatively smaller.
|
| 4 |
-
Repo source code: https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
from re import compile as _re_compile, IGNORECASE
|
| 8 |
-
|
| 9 |
-
from scrapling.core._types import Iterable, Optional, Match, StrOrBytes
|
| 10 |
-
|
| 11 |
-
_ent_re = _re_compile(
|
| 12 |
-
r"&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)",
|
| 13 |
-
IGNORECASE,
|
| 14 |
-
)
|
| 15 |
-
# maps HTML4 entity name to the Unicode code point
|
| 16 |
-
name2codepoint = {
|
| 17 |
-
"AElig": 0x00C6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
|
| 18 |
-
"Aacute": 0x00C1, # latin capital letter A with acute, U+00C1 ISOlat1
|
| 19 |
-
"Acirc": 0x00C2, # latin capital letter A with circumflex, U+00C2 ISOlat1
|
| 20 |
-
"Agrave": 0x00C0, # latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1
|
| 21 |
-
"Alpha": 0x0391, # greek capital letter alpha, U+0391
|
| 22 |
-
"Aring": 0x00C5, # latin capital letter A with the ring above = latin capital letter A ring, U+00C5 ISOlat1
|
| 23 |
-
"Atilde": 0x00C3, # latin capital letter A with tilde, U+00C3 ISOlat1
|
| 24 |
-
"Auml": 0x00C4, # latin capital letter A with diaeresis, U+00C4 ISOlat1
|
| 25 |
-
"Beta": 0x0392, # greek capital letter beta, U+0392
|
| 26 |
-
"Ccedil": 0x00C7, # latin capital letter C with cedilla, U+00C7 ISOlat1
|
| 27 |
-
"Chi": 0x03A7, # greek capital letter chi, U+03A7
|
| 28 |
-
"Dagger": 0x2021, # double dagger, U+2021 ISOpub
|
| 29 |
-
"Delta": 0x0394, # greek capital letter delta, U+0394 ISOgrk3
|
| 30 |
-
"ETH": 0x00D0, # latin capital letter ETH, U+00D0 ISOlat1
|
| 31 |
-
"Eacute": 0x00C9, # latin capital letter E with acute, U+00C9 ISOlat1
|
| 32 |
-
"Ecirc": 0x00CA, # latin capital letter E with circumflex, U+00CA ISOlat1
|
| 33 |
-
"Egrave": 0x00C8, # latin capital letter E with grave, U+00C8 ISOlat1
|
| 34 |
-
"Epsilon": 0x0395, # greek capital letter epsilon, U+0395
|
| 35 |
-
"Eta": 0x0397, # greek capital letter eta, U+0397
|
| 36 |
-
"Euml": 0x00CB, # latin capital letter E with diaeresis, U+00CB ISOlat1
|
| 37 |
-
"Gamma": 0x0393, # greek capital letter gamma, U+0393 ISOgrk3
|
| 38 |
-
"Iacute": 0x00CD, # latin capital letter I with acute, U+00CD ISOlat1
|
| 39 |
-
"Icirc": 0x00CE, # latin capital letter I with circumflex, U+00CE ISOlat1
|
| 40 |
-
"Igrave": 0x00CC, # latin capital letter I with grave, U+00CC ISOlat1
|
| 41 |
-
"Iota": 0x0399, # greek capital letter iota, U+0399
|
| 42 |
-
"Iuml": 0x00CF, # latin capital letter I with diaeresis, U+00CF ISOlat1
|
| 43 |
-
"Kappa": 0x039A, # greek capital letter kappa, U+039A
|
| 44 |
-
"Lambda": 0x039B, # greek capital letter lambda, U+039B ISOgrk3
|
| 45 |
-
"Mu": 0x039C, # greek capital letter mu, U+039C
|
| 46 |
-
"Ntilde": 0x00D1, # latin capital letter N with tilde, U+00D1 ISOlat1
|
| 47 |
-
"Nu": 0x039D, # greek capital letter nu, U+039D
|
| 48 |
-
"OElig": 0x0152, # latin capital ligature OE, U+0152 ISOlat2
|
| 49 |
-
"Oacute": 0x00D3, # latin capital letter O with acute, U+00D3 ISOlat1
|
| 50 |
-
"Ocirc": 0x00D4, # latin capital letter O with circumflex, U+00D4 ISOlat1
|
| 51 |
-
"Ograve": 0x00D2, # latin capital letter O with grave, U+00D2 ISOlat1
|
| 52 |
-
"Omega": 0x03A9, # greek capital letter omega, U+03A9 ISOgrk3
|
| 53 |
-
"Omicron": 0x039F, # greek capital letter omicron, U+039F
|
| 54 |
-
"Oslash": 0x00D8, # latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1
|
| 55 |
-
"Otilde": 0x00D5, # latin capital letter O with tilde, U+00D5 ISOlat1
|
| 56 |
-
"Ouml": 0x00D6, # latin capital letter O with diaeresis, U+00D6 ISOlat1
|
| 57 |
-
"Phi": 0x03A6, # greek capital letter phi, U+03A6 ISOgrk3
|
| 58 |
-
"Pi": 0x03A0, # greek capital letter pi, U+03A0 ISOgrk3
|
| 59 |
-
"Prime": 0x2033, # double prime = seconds = inches, U+2033 ISOtech
|
| 60 |
-
"Psi": 0x03A8, # greek capital letter psi, U+03A8 ISOgrk3
|
| 61 |
-
"Rho": 0x03A1, # greek capital letter rho, U+03A1
|
| 62 |
-
"Scaron": 0x0160, # latin capital letter S with caron, U+0160 ISOlat2
|
| 63 |
-
"Sigma": 0x03A3, # greek capital letter sigma, U+03A3 ISOgrk3
|
| 64 |
-
"THORN": 0x00DE, # latin capital letter THORN, U+00DE ISOlat1
|
| 65 |
-
"Tau": 0x03A4, # greek capital letter tau, U+03A4
|
| 66 |
-
"Theta": 0x0398, # greek capital letter theta, U+0398 ISOgrk3
|
| 67 |
-
"Uacute": 0x00DA, # latin capital letter U with acute, U+00DA ISOlat1
|
| 68 |
-
"Ucirc": 0x00DB, # latin capital letter U with circumflex, U+00DB ISOlat1
|
| 69 |
-
"Ugrave": 0x00D9, # latin capital letter U with grave, U+00D9 ISOlat1
|
| 70 |
-
"Upsilon": 0x03A5, # greek capital letter upsilon, U+03A5 ISOgrk3
|
| 71 |
-
"Uuml": 0x00DC, # latin capital letter U with diaeresis, U+00DC ISOlat1
|
| 72 |
-
"Xi": 0x039E, # greek capital letter xi, U+039E ISOgrk3
|
| 73 |
-
"Yacute": 0x00DD, # latin capital letter Y with acute, U+00DD ISOlat1
|
| 74 |
-
"Yuml": 0x0178, # latin capital letter Y with diaeresis, U+0178 ISOlat2
|
| 75 |
-
"Zeta": 0x0396, # greek capital letter zeta, U+0396
|
| 76 |
-
"aacute": 0x00E1, # latin small letter a with acute, U+00E1 ISOlat1
|
| 77 |
-
"acirc": 0x00E2, # latin small letter a with circumflex, U+00E2 ISOlat1
|
| 78 |
-
"acute": 0x00B4, # acute accent = spacing acute, U+00B4 ISOdia
|
| 79 |
-
"aelig": 0x00E6, # latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
|
| 80 |
-
"agrave": 0x00E0, # latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
|
| 81 |
-
"alefsym": 0x2135, # alef symbol = first transfinite cardinal, U+2135 NEW
|
| 82 |
-
"alpha": 0x03B1, # greek small letter alpha, U+03B1 ISOgrk3
|
| 83 |
-
"amp": 0x0026, # ampersand, U+0026 ISOnum
|
| 84 |
-
"and": 0x2227, # logical and = wedge, U+2227 ISOtech
|
| 85 |
-
"ang": 0x2220, # angle, U+2220 ISOamso
|
| 86 |
-
"aring": 0x00E5, # latin small letter a with the ring above = latin small letter a ring, U+00E5 ISOlat1
|
| 87 |
-
"asymp": 0x2248, # almost equal to = asymptotic to, U+2248 ISOamsr
|
| 88 |
-
"atilde": 0x00E3, # latin small letter a with tilde, U+00E3 ISOlat1
|
| 89 |
-
"auml": 0x00E4, # latin small letter a with diaeresis, U+00E4 ISOlat1
|
| 90 |
-
"bdquo": 0x201E, # double low-9 quotation mark, U+201E NEW
|
| 91 |
-
"beta": 0x03B2, # greek small letter beta, U+03B2 ISOgrk3
|
| 92 |
-
"brvbar": 0x00A6, # broken bar = broken vertical bar, U+00A6 ISOnum
|
| 93 |
-
"bull": 0x2022, # bullet = black small circle, U+2022 ISOpub
|
| 94 |
-
"cap": 0x2229, # intersection = cap, U+2229 ISOtech
|
| 95 |
-
"ccedil": 0x00E7, # latin small letter c with cedilla, U+00E7 ISOlat1
|
| 96 |
-
"cedil": 0x00B8, # cedilla = spacing cedilla, U+00B8 ISOdia
|
| 97 |
-
"cent": 0x00A2, # cent sign, U+00A2 ISOnum
|
| 98 |
-
"chi": 0x03C7, # greek small letter chi, U+03C7 ISOgrk3
|
| 99 |
-
"circ": 0x02C6, # modifier letter circumflex accent, U+02C6 ISOpub
|
| 100 |
-
"clubs": 0x2663, # black club suit = shamrock, U+2663 ISOpub
|
| 101 |
-
"cong": 0x2245, # approximately equal to, U+2245 ISOtech
|
| 102 |
-
"copy": 0x00A9, # copyright sign, U+00A9 ISOnum
|
| 103 |
-
"crarr": 0x21B5, # downwards arrow with corner leftwards = carriage return, U+21B5 NEW
|
| 104 |
-
"cup": 0x222A, # union = cup, U+222A ISOtech
|
| 105 |
-
"curren": 0x00A4, # currency sign, U+00A4 ISOnum
|
| 106 |
-
"dArr": 0x21D3, # downwards double arrow, U+21D3 ISOamsa
|
| 107 |
-
"dagger": 0x2020, # dagger, U+2020 ISOpub
|
| 108 |
-
"darr": 0x2193, # downwards arrow, U+2193 ISOnum
|
| 109 |
-
"deg": 0x00B0, # degree sign, U+00B0 ISOnum
|
| 110 |
-
"delta": 0x03B4, # greek small letter delta, U+03B4 ISOgrk3
|
| 111 |
-
"diams": 0x2666, # black diamond suit, U+2666 ISOpub
|
| 112 |
-
"divide": 0x00F7, # division sign, U+00F7 ISOnum
|
| 113 |
-
"eacute": 0x00E9, # latin small letter e with acute, U+00E9 ISOlat1
|
| 114 |
-
"ecirc": 0x00EA, # latin small letter e with circumflex, U+00EA ISOlat1
|
| 115 |
-
"egrave": 0x00E8, # latin small letter e with grave, U+00E8 ISOlat1
|
| 116 |
-
"empty": 0x2205, # empty set = null set = diameter, U+2205 ISOamso
|
| 117 |
-
"emsp": 0x2003, # em space, U+2003 ISOpub
|
| 118 |
-
"ensp": 0x2002, # en space, U+2002 ISOpub
|
| 119 |
-
"epsilon": 0x03B5, # greek small letter epsilon, U+03B5 ISOgrk3
|
| 120 |
-
"equiv": 0x2261, # identical to, U+2261 ISOtech
|
| 121 |
-
"eta": 0x03B7, # greek small letter eta, U+03B7 ISOgrk3
|
| 122 |
-
"eth": 0x00F0, # latin small letter eth, U+00F0 ISOlat1
|
| 123 |
-
"euml": 0x00EB, # latin small letter e with diaeresis, U+00EB ISOlat1
|
| 124 |
-
"euro": 0x20AC, # euro sign, U+20AC NEW
|
| 125 |
-
"exist": 0x2203, # there exists, U+2203 ISOtech
|
| 126 |
-
"fnof": 0x0192, # latin small f with hook = function = florin, U+0192 ISOtech
|
| 127 |
-
"forall": 0x2200, # for all, U+2200 ISOtech
|
| 128 |
-
"frac12": 0x00BD, # vulgar fraction one half = fraction one half, U+00BD ISOnum
|
| 129 |
-
"frac14": 0x00BC, # vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
|
| 130 |
-
"frac34": 0x00BE, # vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
|
| 131 |
-
"frasl": 0x2044, # fraction slash, U+2044 NEW
|
| 132 |
-
"gamma": 0x03B3, # greek small letter gamma, U+03B3 ISOgrk3
|
| 133 |
-
"ge": 0x2265, # greater-than or equal to, U+2265 ISOtech
|
| 134 |
-
"gt": 0x003E, # greater-than sign, U+003E ISOnum
|
| 135 |
-
"hArr": 0x21D4, # left right double arrow, U+21D4 ISOamsa
|
| 136 |
-
"harr": 0x2194, # left right arrow, U+2194 ISOamsa
|
| 137 |
-
"hearts": 0x2665, # black heart suit = valentine, U+2665 ISOpub
|
| 138 |
-
"hellip": 0x2026, # horizontal ellipsis = three dot leader, U+2026 ISOpub
|
| 139 |
-
"iacute": 0x00ED, # latin small letter i with acute, U+00ED ISOlat1
|
| 140 |
-
"icirc": 0x00EE, # latin small letter i with circumflex, U+00EE ISOlat1
|
| 141 |
-
"iexcl": 0x00A1, # inverted exclamation mark, U+00A1 ISOnum
|
| 142 |
-
"igrave": 0x00EC, # latin small letter i with grave, U+00EC ISOlat1
|
| 143 |
-
"image": 0x2111, # blackletter capital I = imaginary part, U+2111 ISOamso
|
| 144 |
-
"infin": 0x221E, # infinity, U+221E ISOtech
|
| 145 |
-
"int": 0x222B, # integral, U+222B ISOtech
|
| 146 |
-
"iota": 0x03B9, # greek small letter iota, U+03B9 ISOgrk3
|
| 147 |
-
"iquest": 0x00BF, # inverted question mark = turned question mark, U+00BF ISOnum
|
| 148 |
-
"isin": 0x2208, # element of, U+2208 ISOtech
|
| 149 |
-
"iuml": 0x00EF, # latin small letter i with diaeresis, U+00EF ISOlat1
|
| 150 |
-
"kappa": 0x03BA, # greek small letter kappa, U+03BA ISOgrk3
|
| 151 |
-
"lArr": 0x21D0, # leftwards double arrow, U+21D0 ISOtech
|
| 152 |
-
"lambda": 0x03BB, # greek small letter lambda, U+03BB ISOgrk3
|
| 153 |
-
"lang": 0x2329, # left-pointing angle bracket = bra, U+2329 ISOtech
|
| 154 |
-
"laquo": 0x00AB, # left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
|
| 155 |
-
"larr": 0x2190, # leftwards arrow, U+2190 ISOnum
|
| 156 |
-
"lceil": 0x2308, # left ceiling = apl upstile, U+2308 ISOamsc
|
| 157 |
-
"ldquo": 0x201C, # left double quotation mark, U+201C ISOnum
|
| 158 |
-
"le": 0x2264, # less-than or equal to, U+2264 ISOtech
|
| 159 |
-
"lfloor": 0x230A, # left floor = apl downstile, U+230A ISOamsc
|
| 160 |
-
"lowast": 0x2217, # asterisk operator, U+2217 ISOtech
|
| 161 |
-
"loz": 0x25CA, # lozenge, U+25CA ISOpub
|
| 162 |
-
"lrm": 0x200E, # left-to-right mark, U+200E NEW RFC 2070
|
| 163 |
-
"lsaquo": 0x2039, # single left-pointing angle quotation mark, U+2039 ISO proposed
|
| 164 |
-
"lsquo": 0x2018, # left single quotation mark, U+2018 ISOnum
|
| 165 |
-
"lt": 0x003C, # less-than sign, U+003C ISOnum
|
| 166 |
-
"macr": 0x00AF, # macron = spacing macron = overline = APL overbar, U+00AF ISOdia
|
| 167 |
-
"mdash": 0x2014, # em dash, U+2014 ISOpub
|
| 168 |
-
"micro": 0x00B5, # micro sign, U+00B5 ISOnum
|
| 169 |
-
"middot": 0x00B7, # middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
|
| 170 |
-
"minus": 0x2212, # minus sign, U+2212 ISOtech
|
| 171 |
-
"mu": 0x03BC, # greek small letter mu, U+03BC ISOgrk3
|
| 172 |
-
"nabla": 0x2207, # nabla = backward difference, U+2207 ISOtech
|
| 173 |
-
"nbsp": 0x00A0, # no-break space = non-breaking space, U+00A0 ISOnum
|
| 174 |
-
"ndash": 0x2013, # en dash, U+2013 ISOpub
|
| 175 |
-
"ne": 0x2260, # not equal to, U+2260 ISOtech
|
| 176 |
-
"ni": 0x220B, # contains as member, U+220B ISOtech
|
| 177 |
-
"not": 0x00AC, # not sign, U+00AC ISOnum
|
| 178 |
-
"notin": 0x2209, # not an element of, U+2209 ISOtech
|
| 179 |
-
"nsub": 0x2284, # not a subset of, U+2284 ISOamsn
|
| 180 |
-
"ntilde": 0x00F1, # latin small letter n with tilde, U+00F1 ISOlat1
|
| 181 |
-
"nu": 0x03BD, # greek small letter nu, U+03BD ISOgrk3
|
| 182 |
-
"oacute": 0x00F3, # latin small letter o with acute, U+00F3 ISOlat1
|
| 183 |
-
"ocirc": 0x00F4, # latin small letter o with circumflex, U+00F4 ISOlat1
|
| 184 |
-
"oelig": 0x0153, # latin small ligature oe, U+0153 ISOlat2
|
| 185 |
-
"ograve": 0x00F2, # latin small letter o with grave, U+00F2 ISOlat1
|
| 186 |
-
"oline": 0x203E, # overline = spacing overscore, U+203E NEW
|
| 187 |
-
"omega": 0x03C9, # greek small letter omega, U+03C9 ISOgrk3
|
| 188 |
-
"omicron": 0x03BF, # greek small letter omicron, U+03BF NEW
|
| 189 |
-
"oplus": 0x2295, # circled plus = direct sum, U+2295 ISOamsb
|
| 190 |
-
"or": 0x2228, # logical or = vee, U+2228 ISOtech
|
| 191 |
-
"ordf": 0x00AA, # feminine ordinal indicator, U+00AA ISOnum
|
| 192 |
-
"ordm": 0x00BA, # masculine ordinal indicator, U+00BA ISOnum
|
| 193 |
-
"oslash": 0x00F8, # latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1
|
| 194 |
-
"otilde": 0x00F5, # latin small letter o with tilde, U+00F5 ISOlat1
|
| 195 |
-
"otimes": 0x2297, # circled times = vector product, U+2297 ISOamsb
|
| 196 |
-
"ouml": 0x00F6, # latin small letter o with diaeresis, U+00F6 ISOlat1
|
| 197 |
-
"para": 0x00B6, # pilcrow sign = paragraph sign, U+00B6 ISOnum
|
| 198 |
-
"part": 0x2202, # partial differential, U+2202 ISOtech
|
| 199 |
-
"permil": 0x2030, # per mille sign, U+2030 ISOtech
|
| 200 |
-
"perp": 0x22A5, # up tack = orthogonal to = perpendicular, U+22A5 ISOtech
|
| 201 |
-
"phi": 0x03C6, # greek small letter phi, U+03C6 ISOgrk3
|
| 202 |
-
"pi": 0x03C0, # greek small letter pi, U+03C0 ISOgrk3
|
| 203 |
-
"piv": 0x03D6, # greek pi symbol, U+03D6 ISOgrk3
|
| 204 |
-
"plusmn": 0x00B1, # plus-minus sign = plus-or-minus sign, U+00B1 ISOnum
|
| 205 |
-
"pound": 0x00A3, # pound sign, U+00A3 ISOnum
|
| 206 |
-
"prime": 0x2032, # prime = minutes = feet, U+2032 ISOtech
|
| 207 |
-
"prod": 0x220F, # n-ary product = product sign, U+220F ISOamsb
|
| 208 |
-
"prop": 0x221D, # proportional to, U+221D ISOtech
|
| 209 |
-
"psi": 0x03C8, # greek small letter psi, U+03C8 ISOgrk3
|
| 210 |
-
"quot": 0x0022, # quotation mark = APL quote, U+0022 ISOnum
|
| 211 |
-
"rArr": 0x21D2, # rightwards double arrow, U+21D2 ISOtech
|
| 212 |
-
"radic": 0x221A, # square root = radical sign, U+221A ISOtech
|
| 213 |
-
"rang": 0x232A, # right-pointing angle bracket = ket, U+232A ISOtech
|
| 214 |
-
"raquo": 0x00BB, # right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
|
| 215 |
-
"rarr": 0x2192, # rightwards arrow, U+2192 ISOnum
|
| 216 |
-
"rceil": 0x2309, # right ceiling, U+2309 ISOamsc
|
| 217 |
-
"rdquo": 0x201D, # right double quotation mark, U+201D ISOnum
|
| 218 |
-
"real": 0x211C, # blackletter capital R = real part symbol, U+211C ISOamso
|
| 219 |
-
"reg": 0x00AE, # registered sign = registered trade mark sign, U+00AE ISOnum
|
| 220 |
-
"rfloor": 0x230B, # right floor, U+230B ISOamsc
|
| 221 |
-
"rho": 0x03C1, # greek small letter rho, U+03C1 ISOgrk3
|
| 222 |
-
"rlm": 0x200F, # right-to-left mark, U+200F NEW RFC 2070
|
| 223 |
-
"rsaquo": 0x203A, # single right-pointing angle quotation mark, U+203A ISO proposed
|
| 224 |
-
"rsquo": 0x2019, # right single quotation mark, U+2019 ISOnum
|
| 225 |
-
"sbquo": 0x201A, # single low-9 quotation mark, U+201A NEW
|
| 226 |
-
"scaron": 0x0161, # latin small letter s with caron, U+0161 ISOlat2
|
| 227 |
-
"sdot": 0x22C5, # dot operator, U+22C5 ISOamsb
|
| 228 |
-
"sect": 0x00A7, # section sign, U+00A7 ISOnum
|
| 229 |
-
"shy": 0x00AD, # soft hyphen = discretionary hyphen, U+00AD ISOnum
|
| 230 |
-
"sigma": 0x03C3, # greek small letter sigma, U+03C3 ISOgrk3
|
| 231 |
-
"sigmaf": 0x03C2, # greek small letter final sigma, U+03C2 ISOgrk3
|
| 232 |
-
"sim": 0x223C, # tilde operator = varies with = similar to, U+223C ISOtech
|
| 233 |
-
"spades": 0x2660, # black spade suit, U+2660 ISOpub
|
| 234 |
-
"sub": 0x2282, # subset of, U+2282 ISOtech
|
| 235 |
-
"sube": 0x2286, # subset of or equal to, U+2286 ISOtech
|
| 236 |
-
"sum": 0x2211, # n-ary summation, U+2211 ISOamsb
|
| 237 |
-
"sup": 0x2283, # superset of, U+2283 ISOtech
|
| 238 |
-
"sup1": 0x00B9, # superscript one = superscript digit one, U+00B9 ISOnum
|
| 239 |
-
"sup2": 0x00B2, # superscript two = superscript digit two = squared, U+00B2 ISOnum
|
| 240 |
-
"sup3": 0x00B3, # superscript three = superscript digit three = cubed, U+00B3 ISOnum
|
| 241 |
-
"supe": 0x2287, # superset of or equal to, U+2287 ISOtech
|
| 242 |
-
"szlig": 0x00DF, # latin small letter sharp s = ess-zed, U+00DF ISOlat1
|
| 243 |
-
"tau": 0x03C4, # greek small letter tau, U+03C4 ISOgrk3
|
| 244 |
-
"there4": 0x2234, # therefore, U+2234 ISOtech
|
| 245 |
-
"theta": 0x03B8, # greek small letter theta, U+03B8 ISOgrk3
|
| 246 |
-
"thetasym": 0x03D1, # greek small letter theta symbol, U+03D1 NEW
|
| 247 |
-
"thinsp": 0x2009, # thin space, U+2009 ISOpub
|
| 248 |
-
"thorn": 0x00FE, # latin small letter thorn with, U+00FE ISOlat1
|
| 249 |
-
"tilde": 0x02DC, # small tilde, U+02DC ISOdia
|
| 250 |
-
"times": 0x00D7, # multiplication sign, U+00D7 ISOnum
|
| 251 |
-
"trade": 0x2122, # trade mark sign, U+2122 ISOnum
|
| 252 |
-
"uArr": 0x21D1, # upwards double arrow, U+21D1 ISOamsa
|
| 253 |
-
"uacute": 0x00FA, # latin small letter u with acute, U+00FA ISOlat1
|
| 254 |
-
"uarr": 0x2191, # upwards arrow, U+2191 ISOnum
|
| 255 |
-
"ucirc": 0x00FB, # latin small letter u with circumflex, U+00FB ISOlat1
|
| 256 |
-
"ugrave": 0x00F9, # latin small letter u with grave, U+00F9 ISOlat1
|
| 257 |
-
"uml": 0x00A8, # diaeresis = spacing diaeresis, U+00A8 ISOdia
|
| 258 |
-
"upsih": 0x03D2, # greek upsilon with hook symbol, U+03D2 NEW
|
| 259 |
-
"upsilon": 0x03C5, # greek small letter upsilon, U+03C5 ISOgrk3
|
| 260 |
-
"uuml": 0x00FC, # latin small letter u with diaeresis, U+00FC ISOlat1
|
| 261 |
-
"weierp": 0x2118, # script capital P = power set = Weierstrass p, U+2118 ISOamso
|
| 262 |
-
"xi": 0x03BE, # greek small letter xi, U+03BE ISOgrk3
|
| 263 |
-
"yacute": 0x00FD, # latin small letter y with acute, U+00FD ISOlat1
|
| 264 |
-
"yen": 0x00A5, # yen sign = yuan sign, U+00A5 ISOnum
|
| 265 |
-
"yuml": 0x00FF, # latin small letter y with diaeresis, U+00FF ISOlat1
|
| 266 |
-
"zeta": 0x03B6, # greek small letter zeta, U+03B6 ISOgrk3
|
| 267 |
-
"zwj": 0x200D, # zero width joiner, U+200D NEW RFC 2070
|
| 268 |
-
"zwnj": 0x200C, # zero width non-joiner, U+200C NEW RFC 2070
|
| 269 |
-
}
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict") -> str:
|
| 273 |
-
"""Return the Unicode representation of a bytes object `text`. If `text`
|
| 274 |
-
is already a Unicode object, return it as-is."""
|
| 275 |
-
if isinstance(text, str):
|
| 276 |
-
return text
|
| 277 |
-
if not isinstance(text, (bytes, str)):
|
| 278 |
-
raise TypeError(f"to_unicode must receive bytes or str, got {type(text).__name__}")
|
| 279 |
-
if encoding is None:
|
| 280 |
-
encoding = "utf-8"
|
| 281 |
-
return text.decode(encoding, errors)
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
def _replace_entities(
|
| 285 |
-
text: StrOrBytes,
|
| 286 |
-
keep: Iterable[str] = (),
|
| 287 |
-
remove_illegal: bool = True,
|
| 288 |
-
encoding: str = "utf-8",
|
| 289 |
-
) -> str:
|
| 290 |
-
"""Remove entities from the given `text` by converting them to their
|
| 291 |
-
corresponding Unicode character.
|
| 292 |
-
|
| 293 |
-
`text` can be a Unicode string or a byte string encoded in the given
|
| 294 |
-
`encoding` (which defaults to 'utf-8').
|
| 295 |
-
|
| 296 |
-
If `keep` is passed (with a list of entity names), those entities will
|
| 297 |
-
be kept (they won't be removed).
|
| 298 |
-
|
| 299 |
-
It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
|
| 300 |
-
and named entities (such as `` `` or ``>``).
|
| 301 |
-
|
| 302 |
-
If `remove_illegal` is ``True``, entities that can't be converted are removed.
|
| 303 |
-
If `remove_illegal` is ``False``, entities that can't be converted are kept "as
|
| 304 |
-
is". For more information, see the tests.
|
| 305 |
-
|
| 306 |
-
Always returns a Unicode string (with the entities removed).
|
| 307 |
-
|
| 308 |
-
>>> _replace_entities(b'Price: £100')
|
| 309 |
-
'Price: \\xa3100'
|
| 310 |
-
>>> print(_replace_entities(b'Price: £100'))
|
| 311 |
-
Price: £100
|
| 312 |
-
>>>
|
| 313 |
-
|
| 314 |
-
"""
|
| 315 |
-
|
| 316 |
-
def convert_entity(m: Match[str]) -> str:
|
| 317 |
-
groups = m.groupdict()
|
| 318 |
-
number = None
|
| 319 |
-
if groups.get("dec"):
|
| 320 |
-
number = int(groups["dec"], 10)
|
| 321 |
-
elif groups.get("hex"):
|
| 322 |
-
number = int(groups["hex"], 16)
|
| 323 |
-
elif groups.get("named"):
|
| 324 |
-
entity_name = groups["named"]
|
| 325 |
-
if entity_name.lower() in keep:
|
| 326 |
-
return m.group(0)
|
| 327 |
-
number = name2codepoint.get(entity_name) or name2codepoint.get(entity_name.lower())
|
| 328 |
-
if number is not None:
|
| 329 |
-
# Browsers typically
|
| 330 |
-
# interpret numeric character references in the 80-9F range as representing the characters mapped
|
| 331 |
-
# to bytes 80-9F in the Windows-1252 encoding. For more info
|
| 332 |
-
# see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
|
| 333 |
-
try:
|
| 334 |
-
if 0x80 <= number <= 0x9F:
|
| 335 |
-
return bytes((number,)).decode("cp1252")
|
| 336 |
-
return chr(number)
|
| 337 |
-
except (ValueError, OverflowError): # pragma: no cover
|
| 338 |
-
pass
|
| 339 |
-
|
| 340 |
-
return "" if remove_illegal and groups.get("semicolon") else m.group(0)
|
| 341 |
-
|
| 342 |
-
return _ent_re.sub(convert_entity, to_unicode(text, encoding))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scrapling/core/_types.py
CHANGED
|
@@ -12,12 +12,14 @@ from typing import (
|
|
| 12 |
Callable,
|
| 13 |
Dict,
|
| 14 |
Generator,
|
|
|
|
| 15 |
Generic,
|
| 16 |
Iterable,
|
| 17 |
List,
|
| 18 |
Set,
|
| 19 |
Literal,
|
| 20 |
Optional,
|
|
|
|
| 21 |
Pattern,
|
| 22 |
Sequence,
|
| 23 |
Tuple,
|
|
@@ -30,34 +32,16 @@ from typing import (
|
|
| 30 |
Coroutine,
|
| 31 |
SupportsIndex,
|
| 32 |
)
|
|
|
|
| 33 |
|
|
|
|
|
|
|
| 34 |
SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
|
| 35 |
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
| 36 |
PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
|
| 37 |
extraction_types = Literal["text", "html", "markdown"]
|
| 38 |
StrOrBytes = Union[str, bytes]
|
| 39 |
|
| 40 |
-
if TYPE_CHECKING: # pragma: no cover
|
| 41 |
-
from typing_extensions import Unpack
|
| 42 |
-
else: # pragma: no cover
|
| 43 |
-
|
| 44 |
-
class _Unpack:
|
| 45 |
-
@staticmethod
|
| 46 |
-
def __getitem__(*args, **kwargs):
|
| 47 |
-
pass
|
| 48 |
-
|
| 49 |
-
Unpack = _Unpack()
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
try:
|
| 53 |
-
# Python 3.11+
|
| 54 |
-
from typing import Self # novermin
|
| 55 |
-
except ImportError: # pragma: no cover
|
| 56 |
-
try:
|
| 57 |
-
from typing_extensions import Self # Backport
|
| 58 |
-
except ImportError:
|
| 59 |
-
Self = object
|
| 60 |
-
|
| 61 |
|
| 62 |
# Copied from `playwright._impl._api_structures.SetCookieParam`
|
| 63 |
class SetCookieParam(TypedDict, total=False):
|
|
|
|
| 12 |
Callable,
|
| 13 |
Dict,
|
| 14 |
Generator,
|
| 15 |
+
AsyncGenerator,
|
| 16 |
Generic,
|
| 17 |
Iterable,
|
| 18 |
List,
|
| 19 |
Set,
|
| 20 |
Literal,
|
| 21 |
Optional,
|
| 22 |
+
Iterator,
|
| 23 |
Pattern,
|
| 24 |
Sequence,
|
| 25 |
Tuple,
|
|
|
|
| 32 |
Coroutine,
|
| 33 |
SupportsIndex,
|
| 34 |
)
|
| 35 |
+
from typing_extensions import Self, Unpack
|
| 36 |
|
| 37 |
+
# Proxy can be a string URL or a dict (Playwright format: {"server": "...", "username": "...", "password": "..."})
|
| 38 |
+
ProxyType = Union[str, Dict[str, str]]
|
| 39 |
SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
|
| 40 |
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
| 41 |
PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
|
| 42 |
extraction_types = Literal["text", "html", "markdown"]
|
| 43 |
StrOrBytes = Union[str, bytes]
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# Copied from `playwright._impl._api_structures.SetCookieParam`
|
| 47 |
class SetCookieParam(TypedDict, total=False):
|
scrapling/core/ai.py
CHANGED
|
@@ -213,7 +213,7 @@ class ScraplingMCPServer:
|
|
| 213 |
extraction_type: extraction_types = "markdown",
|
| 214 |
css_selector: Optional[str] = None,
|
| 215 |
main_content_only: bool = True,
|
| 216 |
-
headless: bool =
|
| 217 |
google_search: bool = True,
|
| 218 |
real_chrome: bool = False,
|
| 219 |
wait: int | float = 0,
|
|
@@ -295,7 +295,7 @@ class ScraplingMCPServer:
|
|
| 295 |
extraction_type: extraction_types = "markdown",
|
| 296 |
css_selector: Optional[str] = None,
|
| 297 |
main_content_only: bool = True,
|
| 298 |
-
headless: bool =
|
| 299 |
google_search: bool = True,
|
| 300 |
real_chrome: bool = False,
|
| 301 |
wait: int | float = 0,
|
|
|
|
| 213 |
extraction_type: extraction_types = "markdown",
|
| 214 |
css_selector: Optional[str] = None,
|
| 215 |
main_content_only: bool = True,
|
| 216 |
+
headless: bool = True, # noqa: F821
|
| 217 |
google_search: bool = True,
|
| 218 |
real_chrome: bool = False,
|
| 219 |
wait: int | float = 0,
|
|
|
|
| 295 |
extraction_type: extraction_types = "markdown",
|
| 296 |
css_selector: Optional[str] = None,
|
| 297 |
main_content_only: bool = True,
|
| 298 |
+
headless: bool = True, # noqa: F821
|
| 299 |
google_search: bool = True,
|
| 300 |
real_chrome: bool = False,
|
| 301 |
wait: int | float = 0,
|
scrapling/core/custom_types.py
CHANGED
|
@@ -3,6 +3,7 @@ from types import MappingProxyType
|
|
| 3 |
from re import compile as re_compile, UNICODE, IGNORECASE
|
| 4 |
|
| 5 |
from orjson import dumps, loads
|
|
|
|
| 6 |
|
| 7 |
from scrapling.core._types import (
|
| 8 |
Any,
|
|
@@ -19,7 +20,6 @@ from scrapling.core._types import (
|
|
| 19 |
SupportsIndex,
|
| 20 |
)
|
| 21 |
from scrapling.core.utils import _is_iterable, flatten, __CONSECUTIVE_SPACES_REGEX__
|
| 22 |
-
from scrapling.core._html_utils import _replace_entities
|
| 23 |
|
| 24 |
# Define type variable for AttributeHandler value type
|
| 25 |
_TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
|
|
@@ -35,9 +35,7 @@ class TextHandler(str):
|
|
| 35 |
lst = super().__getitem__(key)
|
| 36 |
return TextHandler(lst)
|
| 37 |
|
| 38 |
-
def split(
|
| 39 |
-
self, sep: str | None = None, maxsplit: SupportsIndex = -1
|
| 40 |
-
) -> Union[List, "TextHandlers"]: # pragma: no cover
|
| 41 |
return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
|
| 42 |
|
| 43 |
def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
|
|
@@ -61,7 +59,7 @@ class TextHandler(str):
|
|
| 61 |
def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 62 |
return TextHandler(super().expandtabs(tabsize))
|
| 63 |
|
| 64 |
-
def format(self, *args: object, **kwargs:
|
| 65 |
return TextHandler(super().format(*args, **kwargs))
|
| 66 |
|
| 67 |
def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
|
|
@@ -291,7 +289,7 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
|
|
| 291 |
|
| 292 |
__slots__ = ("_data",)
|
| 293 |
|
| 294 |
-
def __init__(self, mapping=None, **kwargs):
|
| 295 |
mapping = (
|
| 296 |
{key: TextHandler(value) if isinstance(value, str) else value for key, value in mapping.items()}
|
| 297 |
if mapping is not None
|
|
@@ -324,8 +322,8 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
|
|
| 324 |
yield AttributesHandler({key: value})
|
| 325 |
|
| 326 |
@property
|
| 327 |
-
def json_string(self):
|
| 328 |
-
"""Convert current attributes to JSON
|
| 329 |
return dumps(dict(self._data))
|
| 330 |
|
| 331 |
def __getitem__(self, key: str) -> _TextHandlerType:
|
|
|
|
| 3 |
from re import compile as re_compile, UNICODE, IGNORECASE
|
| 4 |
|
| 5 |
from orjson import dumps, loads
|
| 6 |
+
from w3lib.html import replace_entities as _replace_entities
|
| 7 |
|
| 8 |
from scrapling.core._types import (
|
| 9 |
Any,
|
|
|
|
| 20 |
SupportsIndex,
|
| 21 |
)
|
| 22 |
from scrapling.core.utils import _is_iterable, flatten, __CONSECUTIVE_SPACES_REGEX__
|
|
|
|
| 23 |
|
| 24 |
# Define type variable for AttributeHandler value type
|
| 25 |
_TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
|
|
|
|
| 35 |
lst = super().__getitem__(key)
|
| 36 |
return TextHandler(lst)
|
| 37 |
|
| 38 |
+
def split(self, sep: str | None = None, maxsplit: SupportsIndex = -1) -> list[Any]: # pragma: no cover
|
|
|
|
|
|
|
| 39 |
return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])
|
| 40 |
|
| 41 |
def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
|
|
|
|
| 59 |
def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 60 |
return TextHandler(super().expandtabs(tabsize))
|
| 61 |
|
| 62 |
+
def format(self, *args: object, **kwargs: object) -> Union[str, "TextHandler"]: # pragma: no cover
|
| 63 |
return TextHandler(super().format(*args, **kwargs))
|
| 64 |
|
| 65 |
def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
|
|
|
|
| 289 |
|
| 290 |
__slots__ = ("_data",)
|
| 291 |
|
| 292 |
+
def __init__(self, mapping: Any = None, **kwargs: Any) -> None:
|
| 293 |
mapping = (
|
| 294 |
{key: TextHandler(value) if isinstance(value, str) else value for key, value in mapping.items()}
|
| 295 |
if mapping is not None
|
|
|
|
| 322 |
yield AttributesHandler({key: value})
|
| 323 |
|
| 324 |
@property
|
| 325 |
+
def json_string(self) -> bytes:
|
| 326 |
+
"""Convert current attributes to JSON bytes if the attributes are JSON serializable otherwise throws error"""
|
| 327 |
return dumps(dict(self._data))
|
| 328 |
|
| 329 |
def __getitem__(self, key: str) -> _TextHandlerType:
|
scrapling/core/mixins.py
CHANGED
|
@@ -1,7 +1,4 @@
|
|
| 1 |
-
from scrapling.core._types import
|
| 2 |
-
|
| 3 |
-
if TYPE_CHECKING:
|
| 4 |
-
from scrapling.parser import Selector
|
| 5 |
|
| 6 |
|
| 7 |
class SelectorsGeneration:
|
|
@@ -11,10 +8,17 @@ class SelectorsGeneration:
|
|
| 11 |
Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
|
| 12 |
"""
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
"""Generate a selector for the current element.
|
| 16 |
:return: A string of the generated selector.
|
| 17 |
"""
|
|
|
|
|
|
|
|
|
|
| 18 |
selectorPath = []
|
| 19 |
target = self
|
| 20 |
css = selection.lower() == "css"
|
|
@@ -33,7 +37,7 @@ class SelectorsGeneration:
|
|
| 33 |
# if classes and css:
|
| 34 |
# part += f".{'.'.join(classes)}"
|
| 35 |
# else:
|
| 36 |
-
counter = {}
|
| 37 |
for child in target.parent.children:
|
| 38 |
counter.setdefault(child.tag, 0)
|
| 39 |
counter[child.tag] += 1
|
|
@@ -53,28 +57,28 @@ class SelectorsGeneration:
|
|
| 53 |
return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
|
| 54 |
|
| 55 |
@property
|
| 56 |
-
def generate_css_selector(self:
|
| 57 |
"""Generate a CSS selector for the current element
|
| 58 |
:return: A string of the generated selector.
|
| 59 |
"""
|
| 60 |
return self._general_selection()
|
| 61 |
|
| 62 |
@property
|
| 63 |
-
def generate_full_css_selector(self:
|
| 64 |
"""Generate a complete CSS selector for the current element
|
| 65 |
:return: A string of the generated selector.
|
| 66 |
"""
|
| 67 |
return self._general_selection(full_path=True)
|
| 68 |
|
| 69 |
@property
|
| 70 |
-
def generate_xpath_selector(self:
|
| 71 |
"""Generate an XPath selector for the current element
|
| 72 |
:return: A string of the generated selector.
|
| 73 |
"""
|
| 74 |
return self._general_selection("xpath")
|
| 75 |
|
| 76 |
@property
|
| 77 |
-
def generate_full_xpath_selector(self:
|
| 78 |
"""Generate a complete XPath selector for the current element
|
| 79 |
:return: A string of the generated selector.
|
| 80 |
"""
|
|
|
|
| 1 |
+
from scrapling.core._types import Any, Dict
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
class SelectorsGeneration:
|
|
|
|
| 8 |
Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
# Note: This is a mixin class meant to be used with Selector.
|
| 12 |
+
# The methods access Selector attributes (._root, .parent, .attrib, .tag, etc.)
|
| 13 |
+
# through self, which will be a Selector instance at runtime.
|
| 14 |
+
|
| 15 |
+
def _general_selection(self: Any, selection: str = "css", full_path: bool = False) -> str:
|
| 16 |
"""Generate a selector for the current element.
|
| 17 |
:return: A string of the generated selector.
|
| 18 |
"""
|
| 19 |
+
if self._is_text_node(self._root):
|
| 20 |
+
return ""
|
| 21 |
+
|
| 22 |
selectorPath = []
|
| 23 |
target = self
|
| 24 |
css = selection.lower() == "css"
|
|
|
|
| 37 |
# if classes and css:
|
| 38 |
# part += f".{'.'.join(classes)}"
|
| 39 |
# else:
|
| 40 |
+
counter: Dict[str, int] = {}
|
| 41 |
for child in target.parent.children:
|
| 42 |
counter.setdefault(child.tag, 0)
|
| 43 |
counter[child.tag] += 1
|
|
|
|
| 57 |
return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))
|
| 58 |
|
| 59 |
@property
|
| 60 |
+
def generate_css_selector(self: Any) -> str:
|
| 61 |
"""Generate a CSS selector for the current element
|
| 62 |
:return: A string of the generated selector.
|
| 63 |
"""
|
| 64 |
return self._general_selection()
|
| 65 |
|
| 66 |
@property
|
| 67 |
+
def generate_full_css_selector(self: Any) -> str:
|
| 68 |
"""Generate a complete CSS selector for the current element
|
| 69 |
:return: A string of the generated selector.
|
| 70 |
"""
|
| 71 |
return self._general_selection(full_path=True)
|
| 72 |
|
| 73 |
@property
|
| 74 |
+
def generate_xpath_selector(self: Any) -> str:
|
| 75 |
"""Generate an XPath selector for the current element
|
| 76 |
:return: A string of the generated selector.
|
| 77 |
"""
|
| 78 |
return self._general_selection("xpath")
|
| 79 |
|
| 80 |
@property
|
| 81 |
+
def generate_full_xpath_selector(self: Any) -> str:
|
| 82 |
"""Generate a complete XPath selector for the current element
|
| 83 |
:return: A string of the generated selector.
|
| 84 |
"""
|
scrapling/core/shell.py
CHANGED
|
@@ -30,6 +30,7 @@ from scrapling.core.custom_types import TextHandler
|
|
| 30 |
from scrapling.engines.toolbelt.custom import Response
|
| 31 |
from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
|
| 32 |
from scrapling.core._types import (
|
|
|
|
| 33 |
Dict,
|
| 34 |
Any,
|
| 35 |
cast,
|
|
@@ -82,7 +83,7 @@ class NoExitArgumentParser(ArgumentParser): # pragma: no cover
|
|
| 82 |
class CurlParser:
|
| 83 |
"""Builds the argument parser for relevant curl flags from DevTools."""
|
| 84 |
|
| 85 |
-
def __init__(self):
|
| 86 |
from scrapling.fetchers import Fetcher as __Fetcher
|
| 87 |
|
| 88 |
self.__fetcher = __Fetcher
|
|
@@ -467,19 +468,21 @@ Type 'exit' or press Ctrl+D to exit.
|
|
| 467 |
|
| 468 |
return result
|
| 469 |
|
| 470 |
-
def create_wrapper(
|
|
|
|
|
|
|
| 471 |
"""Create a wrapper that preserves function signature but updates page"""
|
| 472 |
|
| 473 |
@wraps(func)
|
| 474 |
-
def wrapper(*args, **kwargs):
|
| 475 |
result = func(*args, **kwargs)
|
| 476 |
return self.update_page(result)
|
| 477 |
|
| 478 |
if get_signature:
|
| 479 |
# Explicitly preserve and unpack signature for IPython introspection and autocompletion
|
| 480 |
-
wrapper
|
| 481 |
else:
|
| 482 |
-
wrapper
|
| 483 |
|
| 484 |
return wrapper
|
| 485 |
|
|
@@ -583,7 +586,7 @@ class Convertor:
|
|
| 583 |
raise ValueError(f"Unknown extraction type: {extraction_type}")
|
| 584 |
else:
|
| 585 |
if main_content_only:
|
| 586 |
-
page = cast(Selector, page.
|
| 587 |
|
| 588 |
pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
|
| 589 |
for page in pages:
|
|
@@ -601,7 +604,7 @@ class Convertor:
|
|
| 601 |
" ",
|
| 602 |
):
|
| 603 |
# Remove consecutive white-spaces
|
| 604 |
-
txt_content = re_sub(f"[{s}]+", s, txt_content)
|
| 605 |
yield txt_content
|
| 606 |
yield ""
|
| 607 |
|
|
|
|
| 30 |
from scrapling.engines.toolbelt.custom import Response
|
| 31 |
from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
|
| 32 |
from scrapling.core._types import (
|
| 33 |
+
Callable,
|
| 34 |
Dict,
|
| 35 |
Any,
|
| 36 |
cast,
|
|
|
|
| 83 |
class CurlParser:
|
| 84 |
"""Builds the argument parser for relevant curl flags from DevTools."""
|
| 85 |
|
| 86 |
+
def __init__(self) -> None:
|
| 87 |
from scrapling.fetchers import Fetcher as __Fetcher
|
| 88 |
|
| 89 |
self.__fetcher = __Fetcher
|
|
|
|
| 468 |
|
| 469 |
return result
|
| 470 |
|
| 471 |
+
def create_wrapper(
|
| 472 |
+
self, func: Callable, get_signature: bool = True, signature_name: Optional[str] = None
|
| 473 |
+
) -> Callable:
|
| 474 |
"""Create a wrapper that preserves function signature but updates page"""
|
| 475 |
|
| 476 |
@wraps(func)
|
| 477 |
+
def wrapper(*args: Any, **kwargs: Any) -> Any:
|
| 478 |
result = func(*args, **kwargs)
|
| 479 |
return self.update_page(result)
|
| 480 |
|
| 481 |
if get_signature:
|
| 482 |
# Explicitly preserve and unpack signature for IPython introspection and autocompletion
|
| 483 |
+
setattr(wrapper, "__signature__", _unpack_signature(func, signature_name))
|
| 484 |
else:
|
| 485 |
+
setattr(wrapper, "__signature__", signature(func))
|
| 486 |
|
| 487 |
return wrapper
|
| 488 |
|
|
|
|
| 586 |
raise ValueError(f"Unknown extraction type: {extraction_type}")
|
| 587 |
else:
|
| 588 |
if main_content_only:
|
| 589 |
+
page = cast(Selector, page.css("body").first) or page
|
| 590 |
|
| 591 |
pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
|
| 592 |
for page in pages:
|
|
|
|
| 604 |
" ",
|
| 605 |
):
|
| 606 |
# Remove consecutive white-spaces
|
| 607 |
+
txt_content = TextHandler(re_sub(f"[{s}]+", s, txt_content))
|
| 608 |
yield txt_content
|
| 609 |
yield ""
|
| 610 |
|