Spaces:
Running
Running
Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .bandit.yml +11 -0
- .dockerignore +108 -10
- .gitattributes +5 -0
- .github/FUNDING.yml +3 -3
- .github/ISSUE_TEMPLATE/01-bug_report.yml +82 -0
- .github/ISSUE_TEMPLATE/02-feature_request.yml +19 -0
- .github/ISSUE_TEMPLATE/03-other.yml +19 -0
- .github/ISSUE_TEMPLATE/04-docs_issue.yml +40 -0
- .github/ISSUE_TEMPLATE/config.yml +10 -0
- .github/PULL_REQUEST_TEMPLATE.md +51 -0
- .github/workflows/code-quality.yml +184 -0
- .github/workflows/docker-build.yml +86 -0
- .github/workflows/release-and-publish.yml +74 -0
- .github/workflows/tests.yml +109 -0
- .gitignore +92 -57
- .hfignore +21 -7
- .pre-commit-config.yaml +20 -0
- .readthedocs.yaml +21 -0
- CODE_OF_CONDUCT.md +1 -1
- CONTRIBUTING.md +84 -145
- Dockerfile +34 -88
- LICENSE +24 -17
- MANIFEST.in +12 -0
- README.md +360 -329
- ROADMAP.md +14 -0
- benchmarks.py +146 -0
- cleanup.py +42 -0
- docs/README_AR.md +426 -0
- docs/README_CN.md +426 -0
- docs/README_DE.md +426 -0
- docs/README_ES.md +426 -0
- docs/README_JP.md +426 -0
- docs/README_RU.md +426 -0
- docs/ai/mcp-server.md +294 -0
- docs/api-reference/custom-types.md +26 -0
- docs/api-reference/fetchers.md +63 -0
- docs/api-reference/mcp-server.md +39 -0
- docs/api-reference/proxy-rotation.md +18 -0
- docs/api-reference/response.md +18 -0
- docs/api-reference/selector.md +25 -0
- docs/api-reference/spiders.md +42 -0
- docs/assets/cover_dark.png +3 -0
- docs/assets/cover_dark.svg +0 -0
- docs/assets/cover_light.png +0 -0
- docs/assets/cover_light.svg +1 -0
- docs/assets/favicon.ico +3 -0
- docs/assets/logo.png +0 -0
- docs/assets/main_cover.png +3 -0
- docs/assets/scrapling_shell_curl.png +3 -0
- docs/assets/spider_architecture.png +3 -0
.bandit.yml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
skips:
|
| 2 |
+
- B101
|
| 3 |
+
- B311
|
| 4 |
+
- B113 # `Requests call without timeout` these requests are done in the benchmark and examples scripts only
|
| 5 |
+
- B403 # We are using pickle for tests only
|
| 6 |
+
- B404 # Using subprocess library
|
| 7 |
+
- B602 # subprocess call with shell=True identified
|
| 8 |
+
- B110 # Try, Except, Pass detected.
|
| 9 |
+
- B104 # Possible binding to all interfaces.
|
| 10 |
+
- B301 # Pickle and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue.
|
| 11 |
+
- B108 # Probable insecure usage of temp file/directory.
|
.dockerignore
CHANGED
|
@@ -1,12 +1,110 @@
|
|
| 1 |
-
|
| 2 |
-
.
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
.env
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
*.egg-info
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Github
|
| 2 |
+
.github/
|
| 3 |
+
|
| 4 |
+
# docs
|
| 5 |
+
docs/
|
| 6 |
+
images/
|
| 7 |
+
.cache/
|
| 8 |
+
.claude/
|
| 9 |
+
|
| 10 |
+
# cached files
|
| 11 |
+
__pycache__/
|
| 12 |
+
*.py[cod]
|
| 13 |
+
.cache
|
| 14 |
+
.DS_Store
|
| 15 |
+
*~
|
| 16 |
+
.*.sw[po]
|
| 17 |
+
.build
|
| 18 |
+
.ve
|
| 19 |
.env
|
| 20 |
+
.pytest
|
| 21 |
+
.benchmarks
|
| 22 |
+
.bootstrap
|
| 23 |
+
.appveyor.token
|
| 24 |
+
*.bak
|
| 25 |
+
*.db
|
| 26 |
+
*.db-*
|
| 27 |
+
|
| 28 |
+
# installation package
|
| 29 |
+
*.egg-info/
|
| 30 |
+
dist/
|
| 31 |
+
build/
|
| 32 |
+
|
| 33 |
+
# environments
|
| 34 |
+
.venv
|
| 35 |
+
env/
|
| 36 |
+
venv/
|
| 37 |
+
ENV/
|
| 38 |
+
env.bak/
|
| 39 |
+
venv.bak/
|
| 40 |
+
|
| 41 |
+
# C extensions
|
| 42 |
+
*.so
|
| 43 |
+
|
| 44 |
+
# pycharm
|
| 45 |
+
.idea/
|
| 46 |
+
|
| 47 |
+
# vscode
|
| 48 |
+
*.code-workspace
|
| 49 |
+
|
| 50 |
+
# Packages
|
| 51 |
+
*.egg
|
| 52 |
*.egg-info
|
| 53 |
+
dist
|
| 54 |
+
build
|
| 55 |
+
eggs
|
| 56 |
+
.eggs
|
| 57 |
+
parts
|
| 58 |
+
bin
|
| 59 |
+
var
|
| 60 |
+
sdist
|
| 61 |
+
wheelhouse
|
| 62 |
+
develop-eggs
|
| 63 |
+
.installed.cfg
|
| 64 |
+
lib
|
| 65 |
+
lib64
|
| 66 |
+
venv*/
|
| 67 |
+
.venv*/
|
| 68 |
+
pyvenv*/
|
| 69 |
+
pip-wheel-metadata/
|
| 70 |
+
poetry.lock
|
| 71 |
+
|
| 72 |
+
# Installer logs
|
| 73 |
+
pip-log.txt
|
| 74 |
+
|
| 75 |
+
# mypy
|
| 76 |
+
.mypy_cache/
|
| 77 |
+
.dmypy.json
|
| 78 |
+
dmypy.json
|
| 79 |
+
mypy.ini
|
| 80 |
+
|
| 81 |
+
# test caches
|
| 82 |
+
.tox/
|
| 83 |
+
.pytest_cache/
|
| 84 |
+
.coverage
|
| 85 |
+
htmlcov
|
| 86 |
+
report.xml
|
| 87 |
+
nosetests.xml
|
| 88 |
+
coverage.xml
|
| 89 |
+
|
| 90 |
+
# Translations
|
| 91 |
+
*.mo
|
| 92 |
+
|
| 93 |
+
# Buildout
|
| 94 |
+
.mr.developer.cfg
|
| 95 |
+
|
| 96 |
+
# IDE project files
|
| 97 |
+
.project
|
| 98 |
+
.pydevproject
|
| 99 |
+
.idea
|
| 100 |
+
*.iml
|
| 101 |
+
*.komodoproject
|
| 102 |
+
|
| 103 |
+
# Complexity
|
| 104 |
+
output/*.html
|
| 105 |
+
output/*/index.html
|
| 106 |
+
|
| 107 |
+
# Sphinx
|
| 108 |
+
docs/_build
|
| 109 |
+
public/
|
| 110 |
+
web/
|
.gitattributes
CHANGED
|
@@ -3,3 +3,8 @@ Scrapling/docs/assets/favicon.ico filter=lfs diff=lfs merge=lfs -text
|
|
| 3 |
Scrapling/docs/assets/main_cover.png filter=lfs diff=lfs merge=lfs -text
|
| 4 |
Scrapling/docs/assets/scrapling_shell_curl.png filter=lfs diff=lfs merge=lfs -text
|
| 5 |
Scrapling/docs/assets/spider_architecture.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
Scrapling/docs/assets/main_cover.png filter=lfs diff=lfs merge=lfs -text
|
| 4 |
Scrapling/docs/assets/scrapling_shell_curl.png filter=lfs diff=lfs merge=lfs -text
|
| 5 |
Scrapling/docs/assets/spider_architecture.png filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
docs/assets/cover_dark.png filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
docs/assets/favicon.ico filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
docs/assets/main_cover.png filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
docs/assets/scrapling_shell_curl.png filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
docs/assets/spider_architecture.png filter=lfs diff=lfs merge=lfs -text
|
.github/FUNDING.yml
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
| 1 |
+
github: D4Vinci
|
| 2 |
+
buy_me_a_coffee: d4vinci
|
| 3 |
+
ko_fi: d4vinci
|
.github/ISSUE_TEMPLATE/01-bug_report.yml
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Bug report
|
| 2 |
+
description: Create a bug report to help us address errors in the repository
|
| 3 |
+
labels: [bug]
|
| 4 |
+
body:
|
| 5 |
+
- type: checkboxes
|
| 6 |
+
attributes:
|
| 7 |
+
label: Have you searched if there an existing issue for this?
|
| 8 |
+
description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/bug).
|
| 9 |
+
options:
|
| 10 |
+
- label: I have searched the existing issues
|
| 11 |
+
required: true
|
| 12 |
+
|
| 13 |
+
- type: input
|
| 14 |
+
attributes:
|
| 15 |
+
label: "Python version (python --version)"
|
| 16 |
+
placeholder: "Python 3.8"
|
| 17 |
+
validations:
|
| 18 |
+
required: true
|
| 19 |
+
|
| 20 |
+
- type: input
|
| 21 |
+
attributes:
|
| 22 |
+
label: "Scrapling version (scrapling.__version__)"
|
| 23 |
+
placeholder: "0.1"
|
| 24 |
+
validations:
|
| 25 |
+
required: true
|
| 26 |
+
|
| 27 |
+
- type: textarea
|
| 28 |
+
attributes:
|
| 29 |
+
label: "Dependencies version (pip3 freeze)"
|
| 30 |
+
description: >
|
| 31 |
+
This is the output of the command `pip3 freeze --all`. Note that the
|
| 32 |
+
actual output might be different as compared to the placeholder text.
|
| 33 |
+
placeholder: |
|
| 34 |
+
cssselect==1.2.0
|
| 35 |
+
lxml==5.3.0
|
| 36 |
+
orjson==3.10.7
|
| 37 |
+
...
|
| 38 |
+
validations:
|
| 39 |
+
required: true
|
| 40 |
+
|
| 41 |
+
- type: input
|
| 42 |
+
attributes:
|
| 43 |
+
label: "What's your operating system?"
|
| 44 |
+
placeholder: "Windows 10"
|
| 45 |
+
validations:
|
| 46 |
+
required: true
|
| 47 |
+
|
| 48 |
+
- type: dropdown
|
| 49 |
+
attributes:
|
| 50 |
+
label: 'Are you using a separate virtual environment?'
|
| 51 |
+
description: "Please pay attention to this question"
|
| 52 |
+
options:
|
| 53 |
+
- 'No'
|
| 54 |
+
- 'Yes'
|
| 55 |
+
default: 0
|
| 56 |
+
validations:
|
| 57 |
+
required: true
|
| 58 |
+
|
| 59 |
+
- type: textarea
|
| 60 |
+
attributes:
|
| 61 |
+
label: "Expected behavior"
|
| 62 |
+
description: "Describe the behavior you expect. May include images or videos."
|
| 63 |
+
validations:
|
| 64 |
+
required: true
|
| 65 |
+
|
| 66 |
+
- type: textarea
|
| 67 |
+
attributes:
|
| 68 |
+
label: "Actual behavior"
|
| 69 |
+
validations:
|
| 70 |
+
required: true
|
| 71 |
+
|
| 72 |
+
- type: textarea
|
| 73 |
+
attributes:
|
| 74 |
+
label: Steps To Reproduce
|
| 75 |
+
description: Steps to reproduce the behavior.
|
| 76 |
+
placeholder: |
|
| 77 |
+
1. In this environment...
|
| 78 |
+
2. With this config...
|
| 79 |
+
3. Run '...'
|
| 80 |
+
4. See error...
|
| 81 |
+
validations:
|
| 82 |
+
required: false
|
.github/ISSUE_TEMPLATE/02-feature_request.yml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Feature request
|
| 2 |
+
description: Suggest features, propose improvements, discuss new ideas.
|
| 3 |
+
labels: [enhancement]
|
| 4 |
+
body:
|
| 5 |
+
- type: checkboxes
|
| 6 |
+
attributes:
|
| 7 |
+
label: Have you searched if there an existing feature request for this?
|
| 8 |
+
description: Please search [existing requests](https://github.com/D4Vinci/Scrapling/labels/enhancement).
|
| 9 |
+
options:
|
| 10 |
+
- label: I have searched the existing requests
|
| 11 |
+
required: true
|
| 12 |
+
|
| 13 |
+
- type: textarea
|
| 14 |
+
attributes:
|
| 15 |
+
label: "Feature description"
|
| 16 |
+
description: >
|
| 17 |
+
This could include new topics or improving any existing features/implementations.
|
| 18 |
+
validations:
|
| 19 |
+
required: true
|
.github/ISSUE_TEMPLATE/03-other.yml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Other
|
| 2 |
+
description: Use this for any other issues. PLEASE provide as much information as possible.
|
| 3 |
+
labels: ["awaiting triage"]
|
| 4 |
+
body:
|
| 5 |
+
- type: textarea
|
| 6 |
+
id: issuedescription
|
| 7 |
+
attributes:
|
| 8 |
+
label: What would you like to share?
|
| 9 |
+
description: Provide a clear and concise explanation of your issue.
|
| 10 |
+
validations:
|
| 11 |
+
required: true
|
| 12 |
+
|
| 13 |
+
- type: textarea
|
| 14 |
+
id: extrainfo
|
| 15 |
+
attributes:
|
| 16 |
+
label: Additional information
|
| 17 |
+
description: Is there anything else we should know about this issue?
|
| 18 |
+
validations:
|
| 19 |
+
required: false
|
.github/ISSUE_TEMPLATE/04-docs_issue.yml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Documentation issue
|
| 2 |
+
description: Report incorrect, unclear, or missing documentation.
|
| 3 |
+
labels: [documentation]
|
| 4 |
+
body:
|
| 5 |
+
- type: checkboxes
|
| 6 |
+
attributes:
|
| 7 |
+
label: Have you searched if there an existing issue for this?
|
| 8 |
+
description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/documentation).
|
| 9 |
+
options:
|
| 10 |
+
- label: I have searched the existing issues
|
| 11 |
+
required: true
|
| 12 |
+
|
| 13 |
+
- type: input
|
| 14 |
+
attributes:
|
| 15 |
+
label: "Page URL"
|
| 16 |
+
description: "Link to the documentation page with the issue."
|
| 17 |
+
placeholder: "https://scrapling.readthedocs.io/en/latest/..."
|
| 18 |
+
validations:
|
| 19 |
+
required: true
|
| 20 |
+
|
| 21 |
+
- type: dropdown
|
| 22 |
+
attributes:
|
| 23 |
+
label: "Type of issue"
|
| 24 |
+
options:
|
| 25 |
+
- Incorrect information
|
| 26 |
+
- Unclear or confusing
|
| 27 |
+
- Missing information
|
| 28 |
+
- Typo or formatting
|
| 29 |
+
- Broken link
|
| 30 |
+
- Other
|
| 31 |
+
default: 0
|
| 32 |
+
validations:
|
| 33 |
+
required: true
|
| 34 |
+
|
| 35 |
+
- type: textarea
|
| 36 |
+
attributes:
|
| 37 |
+
label: "Description"
|
| 38 |
+
description: "Describe what's wrong and what you expected to find."
|
| 39 |
+
validations:
|
| 40 |
+
required: true
|
.github/ISSUE_TEMPLATE/config.yml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
blank_issues_enabled: false
|
| 2 |
+
contact_links:
|
| 3 |
+
- name: Discussions
|
| 4 |
+
url: https://github.com/D4Vinci/Scrapling/discussions
|
| 5 |
+
about: >
|
| 6 |
+
The "Discussions" forum is where you want to start. 💖
|
| 7 |
+
- name: Ask on our discord server
|
| 8 |
+
url: https://discord.gg/EMgGbDceNQ
|
| 9 |
+
about: >
|
| 10 |
+
Our community chat forum.
|
.github/PULL_REQUEST_TEMPLATE.md
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!--
|
| 2 |
+
You are amazing! Thanks for contributing to Scrapling!
|
| 3 |
+
Please, DO NOT DELETE ANY TEXT from this template! (unless instructed).
|
| 4 |
+
-->
|
| 5 |
+
|
| 6 |
+
## Proposed change
|
| 7 |
+
<!--
|
| 8 |
+
Describe the big picture of your changes here to communicate to the maintainers why we should accept this pull request.
|
| 9 |
+
If it fixes a bug or resolves a feature request, be sure to link to that issue in the additional information section.
|
| 10 |
+
-->
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
### Type of change:
|
| 14 |
+
<!--
|
| 15 |
+
What type of change does your PR introduce to Scrapling?
|
| 16 |
+
NOTE: Please, check at least 1 box!
|
| 17 |
+
If your PR requires multiple boxes to be checked, you'll most likely need to
|
| 18 |
+
split it into multiple PRs. This makes things easier and faster to code review.
|
| 19 |
+
-->
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
- [ ] Dependency upgrade
|
| 24 |
+
- [ ] Bugfix (non-breaking change which fixes an issue)
|
| 25 |
+
- [ ] New integration (thank you!)
|
| 26 |
+
- [ ] New feature (which adds functionality to an existing integration)
|
| 27 |
+
- [ ] Deprecation (breaking change to happen in the future)
|
| 28 |
+
- [ ] Breaking change (fix/feature causing existing functionality to break)
|
| 29 |
+
- [ ] Code quality improvements to existing code or addition of tests
|
| 30 |
+
- [ ] Add or change doctests? -- Note: Please avoid changing both code and tests in a single pull request.
|
| 31 |
+
- [ ] Documentation change?
|
| 32 |
+
|
| 33 |
+
### Additional information
|
| 34 |
+
<!--
|
| 35 |
+
Details are important and help maintainers processing your PR.
|
| 36 |
+
Please be sure to fill out additional details, if applicable.
|
| 37 |
+
-->
|
| 38 |
+
|
| 39 |
+
- This PR fixes or closes an issue: fixes #
|
| 40 |
+
- This PR is related to an issue: #
|
| 41 |
+
- Link to documentation pull request: **
|
| 42 |
+
|
| 43 |
+
### Checklist:
|
| 44 |
+
* [ ] I have read [CONTRIBUTING.md](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md).
|
| 45 |
+
* [ ] This pull request is all my own work -- I have not plagiarized.
|
| 46 |
+
* [ ] I know that pull requests will not be merged if they fail the automated tests.
|
| 47 |
+
* [ ] All new Python files are placed inside an existing directory.
|
| 48 |
+
* [ ] All filenames are in all lowercase characters with no spaces or dashes.
|
| 49 |
+
* [ ] All functions and variable names follow Python naming conventions.
|
| 50 |
+
* [ ] All function parameters and return values are annotated with Python [type hints](https://docs.python.org/3/library/typing.html).
|
| 51 |
+
* [ ] All functions have doc-strings.
|
.github/workflows/code-quality.yml
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Code Quality
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches:
|
| 6 |
+
- main
|
| 7 |
+
- dev
|
| 8 |
+
paths-ignore:
|
| 9 |
+
- '*.md'
|
| 10 |
+
- '**/*.md'
|
| 11 |
+
- 'docs/**'
|
| 12 |
+
- 'images/**'
|
| 13 |
+
- '.github/**'
|
| 14 |
+
- '!.github/workflows/code-quality.yml' # Always run when this workflow changes
|
| 15 |
+
pull_request:
|
| 16 |
+
branches:
|
| 17 |
+
- main
|
| 18 |
+
- dev
|
| 19 |
+
paths-ignore:
|
| 20 |
+
- '*.md'
|
| 21 |
+
- '**/*.md'
|
| 22 |
+
- 'docs/**'
|
| 23 |
+
- 'images/**'
|
| 24 |
+
workflow_dispatch: # Allow manual triggering
|
| 25 |
+
|
| 26 |
+
concurrency:
|
| 27 |
+
group: ${{ github.workflow }}-${{ github.ref }}
|
| 28 |
+
cancel-in-progress: true
|
| 29 |
+
|
| 30 |
+
jobs:
|
| 31 |
+
code-quality:
|
| 32 |
+
name: Code Quality Checks
|
| 33 |
+
runs-on: ubuntu-latest
|
| 34 |
+
permissions:
|
| 35 |
+
contents: read
|
| 36 |
+
pull-requests: write # For PR annotations
|
| 37 |
+
|
| 38 |
+
steps:
|
| 39 |
+
- name: Checkout code
|
| 40 |
+
uses: actions/checkout@v6
|
| 41 |
+
with:
|
| 42 |
+
fetch-depth: 0 # Full history for better analysis
|
| 43 |
+
|
| 44 |
+
- name: Set up Python
|
| 45 |
+
uses: actions/setup-python@v6
|
| 46 |
+
with:
|
| 47 |
+
python-version: '3.10'
|
| 48 |
+
cache: 'pip'
|
| 49 |
+
|
| 50 |
+
- name: Install dependencies
|
| 51 |
+
run: |
|
| 52 |
+
python -m pip install --upgrade pip
|
| 53 |
+
pip install bandit[toml] ruff vermin mypy pyright
|
| 54 |
+
pip install -e ".[all]"
|
| 55 |
+
pip install lxml-stubs
|
| 56 |
+
|
| 57 |
+
- name: Run Bandit (Security Linter)
|
| 58 |
+
id: bandit
|
| 59 |
+
continue-on-error: true
|
| 60 |
+
run: |
|
| 61 |
+
echo "::group::Bandit - Security Linter"
|
| 62 |
+
bandit -r -c .bandit.yml scrapling/ -f json -o bandit-report.json
|
| 63 |
+
bandit -r -c .bandit.yml scrapling/
|
| 64 |
+
echo "::endgroup::"
|
| 65 |
+
|
| 66 |
+
- name: Run Ruff Linter
|
| 67 |
+
id: ruff-lint
|
| 68 |
+
continue-on-error: true
|
| 69 |
+
run: |
|
| 70 |
+
echo "::group::Ruff - Linter"
|
| 71 |
+
ruff check scrapling/ --output-format=github
|
| 72 |
+
echo "::endgroup::"
|
| 73 |
+
|
| 74 |
+
- name: Run Ruff Formatter Check
|
| 75 |
+
id: ruff-format
|
| 76 |
+
continue-on-error: true
|
| 77 |
+
run: |
|
| 78 |
+
echo "::group::Ruff - Formatter Check"
|
| 79 |
+
ruff format --check scrapling/ --diff
|
| 80 |
+
echo "::endgroup::"
|
| 81 |
+
|
| 82 |
+
- name: Run Vermin (Python Version Compatibility)
|
| 83 |
+
id: vermin
|
| 84 |
+
continue-on-error: true
|
| 85 |
+
run: |
|
| 86 |
+
echo "::group::Vermin - Python 3.10+ Compatibility Check"
|
| 87 |
+
vermin -t=3.10- --violations --eval-annotations --no-tips scrapling/
|
| 88 |
+
echo "::endgroup::"
|
| 89 |
+
|
| 90 |
+
- name: Run Mypy (Static Type Checker)
|
| 91 |
+
id: mypy
|
| 92 |
+
continue-on-error: true
|
| 93 |
+
run: |
|
| 94 |
+
echo "::group::Mypy - Static Type Checker"
|
| 95 |
+
mypy scrapling/
|
| 96 |
+
echo "::endgroup::"
|
| 97 |
+
|
| 98 |
+
- name: Run Pyright (Static Type Checker)
|
| 99 |
+
id: pyright
|
| 100 |
+
continue-on-error: true
|
| 101 |
+
run: |
|
| 102 |
+
echo "::group::Pyright - Static Type Checker"
|
| 103 |
+
pyright scrapling/
|
| 104 |
+
echo "::endgroup::"
|
| 105 |
+
|
| 106 |
+
- name: Check results and create summary
|
| 107 |
+
if: always()
|
| 108 |
+
run: |
|
| 109 |
+
echo "# Code Quality Check Results" >> $GITHUB_STEP_SUMMARY
|
| 110 |
+
echo "" >> $GITHUB_STEP_SUMMARY
|
| 111 |
+
|
| 112 |
+
# Initialize status
|
| 113 |
+
all_passed=true
|
| 114 |
+
|
| 115 |
+
# Check Bandit
|
| 116 |
+
if [ "${{ steps.bandit.outcome }}" == "success" ]; then
|
| 117 |
+
echo "✅ **Bandit (Security)**: Passed" >> $GITHUB_STEP_SUMMARY
|
| 118 |
+
else
|
| 119 |
+
echo "❌ **Bandit (Security)**: Failed" >> $GITHUB_STEP_SUMMARY
|
| 120 |
+
all_passed=false
|
| 121 |
+
fi
|
| 122 |
+
|
| 123 |
+
# Check Ruff Linter
|
| 124 |
+
if [ "${{ steps.ruff-lint.outcome }}" == "success" ]; then
|
| 125 |
+
echo "✅ **Ruff Linter**: Passed" >> $GITHUB_STEP_SUMMARY
|
| 126 |
+
else
|
| 127 |
+
echo "❌ **Ruff Linter**: Failed" >> $GITHUB_STEP_SUMMARY
|
| 128 |
+
all_passed=false
|
| 129 |
+
fi
|
| 130 |
+
|
| 131 |
+
# Check Ruff Formatter
|
| 132 |
+
if [ "${{ steps.ruff-format.outcome }}" == "success" ]; then
|
| 133 |
+
echo "✅ **Ruff Formatter**: Passed" >> $GITHUB_STEP_SUMMARY
|
| 134 |
+
else
|
| 135 |
+
echo "❌ **Ruff Formatter**: Failed" >> $GITHUB_STEP_SUMMARY
|
| 136 |
+
all_passed=false
|
| 137 |
+
fi
|
| 138 |
+
|
| 139 |
+
# Check Vermin
|
| 140 |
+
if [ "${{ steps.vermin.outcome }}" == "success" ]; then
|
| 141 |
+
echo "✅ **Vermin (Python 3.10+)**: Passed" >> $GITHUB_STEP_SUMMARY
|
| 142 |
+
else
|
| 143 |
+
echo "❌ **Vermin (Python 3.10+)**: Failed" >> $GITHUB_STEP_SUMMARY
|
| 144 |
+
all_passed=false
|
| 145 |
+
fi
|
| 146 |
+
|
| 147 |
+
# Check Mypy
|
| 148 |
+
if [ "${{ steps.mypy.outcome }}" == "success" ]; then
|
| 149 |
+
echo "✅ **Mypy (Type Checker)**: Passed" >> $GITHUB_STEP_SUMMARY
|
| 150 |
+
else
|
| 151 |
+
echo "❌ **Mypy (Type Checker)**: Failed" >> $GITHUB_STEP_SUMMARY
|
| 152 |
+
all_passed=false
|
| 153 |
+
fi
|
| 154 |
+
|
| 155 |
+
# Check Pyright
|
| 156 |
+
if [ "${{ steps.pyright.outcome }}" == "success" ]; then
|
| 157 |
+
echo "✅ **Pyright (Type Checker)**: Passed" >> $GITHUB_STEP_SUMMARY
|
| 158 |
+
else
|
| 159 |
+
echo "❌ **Pyright (Type Checker)**: Failed" >> $GITHUB_STEP_SUMMARY
|
| 160 |
+
all_passed=false
|
| 161 |
+
fi
|
| 162 |
+
|
| 163 |
+
echo "" >> $GITHUB_STEP_SUMMARY
|
| 164 |
+
|
| 165 |
+
if [ "$all_passed" == "true" ]; then
|
| 166 |
+
echo "### 🎉 All checks passed!" >> $GITHUB_STEP_SUMMARY
|
| 167 |
+
echo "" >> $GITHUB_STEP_SUMMARY
|
| 168 |
+
echo "Your code meets all quality standards." >> $GITHUB_STEP_SUMMARY
|
| 169 |
+
else
|
| 170 |
+
echo "### ⚠️ Some checks failed" >> $GITHUB_STEP_SUMMARY
|
| 171 |
+
echo "" >> $GITHUB_STEP_SUMMARY
|
| 172 |
+
echo "Please review the errors above and fix them." >> $GITHUB_STEP_SUMMARY
|
| 173 |
+
echo "" >> $GITHUB_STEP_SUMMARY
|
| 174 |
+
echo "**Tip**: Run \`pre-commit run --all-files\` locally to catch these issues before pushing." >> $GITHUB_STEP_SUMMARY
|
| 175 |
+
exit 1
|
| 176 |
+
fi
|
| 177 |
+
|
| 178 |
+
- name: Upload Bandit report
|
| 179 |
+
if: always() && steps.bandit.outcome != 'skipped'
|
| 180 |
+
uses: actions/upload-artifact@v6
|
| 181 |
+
with:
|
| 182 |
+
name: bandit-security-report
|
| 183 |
+
path: bandit-report.json
|
| 184 |
+
retention-days: 30
|
.github/workflows/docker-build.yml
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Build and Push Docker Image
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
pull_request:
|
| 5 |
+
types: [closed]
|
| 6 |
+
branches:
|
| 7 |
+
- main
|
| 8 |
+
workflow_dispatch:
|
| 9 |
+
inputs:
|
| 10 |
+
tag:
|
| 11 |
+
description: 'Docker image tag'
|
| 12 |
+
required: true
|
| 13 |
+
default: 'latest'
|
| 14 |
+
|
| 15 |
+
env:
|
| 16 |
+
DOCKERHUB_IMAGE: pyd4vinci/scrapling
|
| 17 |
+
GHCR_IMAGE: ghcr.io/${{ github.repository_owner }}/scrapling
|
| 18 |
+
|
| 19 |
+
jobs:
|
| 20 |
+
build-and-push:
|
| 21 |
+
runs-on: ubuntu-latest
|
| 22 |
+
permissions:
|
| 23 |
+
contents: read
|
| 24 |
+
packages: write
|
| 25 |
+
|
| 26 |
+
steps:
|
| 27 |
+
- name: Checkout repository
|
| 28 |
+
uses: actions/checkout@v6
|
| 29 |
+
|
| 30 |
+
- name: Set up Docker Buildx
|
| 31 |
+
uses: docker/setup-buildx-action@v3
|
| 32 |
+
with:
|
| 33 |
+
platforms: linux/amd64,linux/arm64
|
| 34 |
+
|
| 35 |
+
- name: Log in to Docker Hub
|
| 36 |
+
uses: docker/login-action@v3
|
| 37 |
+
with:
|
| 38 |
+
registry: docker.io
|
| 39 |
+
username: ${{ secrets.DOCKER_USERNAME }}
|
| 40 |
+
password: ${{ secrets.DOCKER_PASSWORD }}
|
| 41 |
+
|
| 42 |
+
- name: Log in to GitHub Container Registry
|
| 43 |
+
uses: docker/login-action@v3
|
| 44 |
+
with:
|
| 45 |
+
registry: ghcr.io
|
| 46 |
+
username: ${{ github.actor }}
|
| 47 |
+
password: ${{ secrets.CONTAINER_TOKEN }}
|
| 48 |
+
|
| 49 |
+
- name: Extract metadata
|
| 50 |
+
id: meta
|
| 51 |
+
uses: docker/metadata-action@v5
|
| 52 |
+
with:
|
| 53 |
+
images: |
|
| 54 |
+
${{ env.DOCKERHUB_IMAGE }}
|
| 55 |
+
${{ env.GHCR_IMAGE }}
|
| 56 |
+
tags: |
|
| 57 |
+
type=ref,event=branch
|
| 58 |
+
type=ref,event=pr
|
| 59 |
+
type=semver,pattern={{version}}
|
| 60 |
+
type=semver,pattern={{major}}.{{minor}}
|
| 61 |
+
type=semver,pattern={{major}}
|
| 62 |
+
type=raw,value=latest,enable={{is_default_branch}}
|
| 63 |
+
labels: |
|
| 64 |
+
org.opencontainers.image.title=Scrapling
|
| 65 |
+
org.opencontainers.image.description=An undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
|
| 66 |
+
org.opencontainers.image.vendor=D4Vinci
|
| 67 |
+
org.opencontainers.image.licenses=BSD
|
| 68 |
+
org.opencontainers.image.url=https://scrapling.readthedocs.io/en/latest/
|
| 69 |
+
org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
|
| 70 |
+
org.opencontainers.image.documentation=https://scrapling.readthedocs.io/en/latest/
|
| 71 |
+
|
| 72 |
+
- name: Build and push Docker image
|
| 73 |
+
uses: docker/build-push-action@v6
|
| 74 |
+
with:
|
| 75 |
+
context: .
|
| 76 |
+
platforms: linux/amd64,linux/arm64
|
| 77 |
+
push: true
|
| 78 |
+
tags: ${{ steps.meta.outputs.tags }}
|
| 79 |
+
labels: ${{ steps.meta.outputs.labels }}
|
| 80 |
+
cache-from: type=gha
|
| 81 |
+
cache-to: type=gha,mode=max
|
| 82 |
+
build-args: |
|
| 83 |
+
BUILDKIT_INLINE_CACHE=1
|
| 84 |
+
|
| 85 |
+
- name: Image digest
|
| 86 |
+
run: echo ${{ steps.build.outputs.digest }}
|
.github/workflows/release-and-publish.yml
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Create Release and Publish to PyPI
|
| 2 |
+
# Creates a GitHub release when a PR is merged to main (using PR title as version and body as release notes), then publishes to PyPI.
|
| 3 |
+
|
| 4 |
+
on:
|
| 5 |
+
pull_request:
|
| 6 |
+
types: [closed]
|
| 7 |
+
branches:
|
| 8 |
+
- main
|
| 9 |
+
|
| 10 |
+
jobs:
|
| 11 |
+
create-release-and-publish:
|
| 12 |
+
if: github.event.pull_request.merged == true
|
| 13 |
+
runs-on: ubuntu-latest
|
| 14 |
+
environment:
|
| 15 |
+
name: PyPI
|
| 16 |
+
url: https://pypi.org/p/scrapling
|
| 17 |
+
permissions:
|
| 18 |
+
contents: write
|
| 19 |
+
id-token: write
|
| 20 |
+
steps:
|
| 21 |
+
- uses: actions/checkout@v6
|
| 22 |
+
with:
|
| 23 |
+
fetch-depth: 0
|
| 24 |
+
|
| 25 |
+
- name: Get PR title
|
| 26 |
+
id: pr_title
|
| 27 |
+
run: echo "title=${{ github.event.pull_request.title }}" >> $GITHUB_OUTPUT
|
| 28 |
+
|
| 29 |
+
- name: Save PR body to file
|
| 30 |
+
uses: actions/github-script@v8
|
| 31 |
+
with:
|
| 32 |
+
script: |
|
| 33 |
+
const fs = require('fs');
|
| 34 |
+
fs.writeFileSync('pr_body.md', context.payload.pull_request.body || '');
|
| 35 |
+
|
| 36 |
+
- name: Extract version
|
| 37 |
+
id: extract_version
|
| 38 |
+
run: |
|
| 39 |
+
PR_TITLE="${{ steps.pr_title.outputs.title }}"
|
| 40 |
+
if [[ $PR_TITLE =~ ^v ]]; then
|
| 41 |
+
echo "version=$PR_TITLE" >> $GITHUB_OUTPUT
|
| 42 |
+
echo "Valid version format found in PR title: $PR_TITLE"
|
| 43 |
+
else
|
| 44 |
+
echo "Error: PR title '$PR_TITLE' must start with 'v' (e.g., 'v1.0.0') to create a release."
|
| 45 |
+
exit 1
|
| 46 |
+
fi
|
| 47 |
+
|
| 48 |
+
- name: Create Release
|
| 49 |
+
uses: softprops/action-gh-release@v2
|
| 50 |
+
with:
|
| 51 |
+
tag_name: ${{ steps.extract_version.outputs.version }}
|
| 52 |
+
name: Release ${{ steps.extract_version.outputs.version }}
|
| 53 |
+
body_path: pr_body.md
|
| 54 |
+
draft: false
|
| 55 |
+
prerelease: false
|
| 56 |
+
env:
|
| 57 |
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
| 58 |
+
|
| 59 |
+
- name: Set up Python
|
| 60 |
+
uses: actions/setup-python@v6
|
| 61 |
+
with:
|
| 62 |
+
python-version: 3.12
|
| 63 |
+
|
| 64 |
+
- name: Upgrade pip
|
| 65 |
+
run: python3 -m pip install --upgrade pip
|
| 66 |
+
|
| 67 |
+
- name: Install build
|
| 68 |
+
run: python3 -m pip install --upgrade build twine setuptools
|
| 69 |
+
|
| 70 |
+
- name: Build a binary wheel and a source tarball
|
| 71 |
+
run: python3 -m build --sdist --wheel --outdir dist/
|
| 72 |
+
|
| 73 |
+
- name: Publish distribution 📦 to PyPI
|
| 74 |
+
uses: pypa/gh-action-pypi-publish@release/v1
|
.github/workflows/tests.yml
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Tests
|
| 2 |
+
on:
|
| 3 |
+
push:
|
| 4 |
+
branches:
|
| 5 |
+
- main
|
| 6 |
+
- dev
|
| 7 |
+
paths-ignore:
|
| 8 |
+
- '*.md'
|
| 9 |
+
- '**/*.md'
|
| 10 |
+
- 'docs/*'
|
| 11 |
+
- 'images/*'
|
| 12 |
+
- '.github/*'
|
| 13 |
+
- '*.yml'
|
| 14 |
+
- '*.yaml'
|
| 15 |
+
- 'ruff.toml'
|
| 16 |
+
|
| 17 |
+
concurrency:
|
| 18 |
+
group: ${{github.workflow}}-${{ github.ref }}
|
| 19 |
+
cancel-in-progress: true
|
| 20 |
+
|
| 21 |
+
jobs:
|
| 22 |
+
tests:
|
| 23 |
+
timeout-minutes: 60
|
| 24 |
+
runs-on: ${{ matrix.os }}
|
| 25 |
+
strategy:
|
| 26 |
+
fail-fast: false
|
| 27 |
+
matrix:
|
| 28 |
+
include:
|
| 29 |
+
- python-version: "3.10"
|
| 30 |
+
os: macos-latest
|
| 31 |
+
env:
|
| 32 |
+
TOXENV: py310
|
| 33 |
+
- python-version: "3.11"
|
| 34 |
+
os: macos-latest
|
| 35 |
+
env:
|
| 36 |
+
TOXENV: py311
|
| 37 |
+
- python-version: "3.12"
|
| 38 |
+
os: macos-latest
|
| 39 |
+
env:
|
| 40 |
+
TOXENV: py312
|
| 41 |
+
- python-version: "3.13"
|
| 42 |
+
os: macos-latest
|
| 43 |
+
env:
|
| 44 |
+
TOXENV: py313
|
| 45 |
+
|
| 46 |
+
steps:
|
| 47 |
+
- uses: actions/checkout@v6
|
| 48 |
+
|
| 49 |
+
- name: Set up Python ${{ matrix.python-version }}
|
| 50 |
+
uses: actions/setup-python@v6
|
| 51 |
+
with:
|
| 52 |
+
python-version: ${{ matrix.python-version }}
|
| 53 |
+
cache: 'pip'
|
| 54 |
+
cache-dependency-path: |
|
| 55 |
+
pyproject.toml
|
| 56 |
+
tox.ini
|
| 57 |
+
|
| 58 |
+
- name: Install all browsers dependencies
|
| 59 |
+
run: |
|
| 60 |
+
python3 -m pip install --upgrade pip
|
| 61 |
+
python3 -m pip install playwright==1.56.0 patchright==1.56.0
|
| 62 |
+
|
| 63 |
+
- name: Get Playwright version
|
| 64 |
+
id: playwright-version
|
| 65 |
+
run: |
|
| 66 |
+
PLAYWRIGHT_VERSION=$(python3 -c "import importlib.metadata; print(importlib.metadata.version('playwright'))")
|
| 67 |
+
echo "version=$PLAYWRIGHT_VERSION" >> $GITHUB_OUTPUT
|
| 68 |
+
echo "Playwright version: $PLAYWRIGHT_VERSION"
|
| 69 |
+
|
| 70 |
+
- name: Retrieve Playwright browsers from cache if any
|
| 71 |
+
id: playwright-cache
|
| 72 |
+
uses: actions/cache@v5
|
| 73 |
+
with:
|
| 74 |
+
path: |
|
| 75 |
+
~/.cache/ms-playwright
|
| 76 |
+
~/Library/Caches/ms-playwright
|
| 77 |
+
~/.ms-playwright
|
| 78 |
+
key: ${{ runner.os }}-playwright-${{ steps.playwright-version.outputs.version }}-v1
|
| 79 |
+
restore-keys: |
|
| 80 |
+
${{ runner.os }}-playwright-${{ steps.playwright-version.outputs.version }}-
|
| 81 |
+
${{ runner.os }}-playwright-
|
| 82 |
+
|
| 83 |
+
- name: Install Playwright browsers
|
| 84 |
+
run: |
|
| 85 |
+
echo "Cache hit: ${{ steps.playwright-cache.outputs.cache-hit }}"
|
| 86 |
+
if [ "${{ steps.playwright-cache.outputs.cache-hit }}" != "true" ]; then
|
| 87 |
+
python3 -m playwright install chromium
|
| 88 |
+
else
|
| 89 |
+
echo "Skipping install - using cached Playwright browsers"
|
| 90 |
+
fi
|
| 91 |
+
python3 -m playwright install-deps chromium
|
| 92 |
+
|
| 93 |
+
# Cache tox environments
|
| 94 |
+
- name: Cache tox environments
|
| 95 |
+
uses: actions/cache@v5
|
| 96 |
+
with:
|
| 97 |
+
path: .tox
|
| 98 |
+
# Include python version and os in the cache key
|
| 99 |
+
key: tox-v1-${{ runner.os }}-py${{ matrix.python-version }}-${{ hashFiles('/Users/runner/work/Scrapling/pyproject.toml') }}
|
| 100 |
+
restore-keys: |
|
| 101 |
+
tox-v1-${{ runner.os }}-py${{ matrix.python-version }}-
|
| 102 |
+
tox-v1-${{ runner.os }}-
|
| 103 |
+
|
| 104 |
+
- name: Install tox
|
| 105 |
+
run: pip install -U tox
|
| 106 |
+
|
| 107 |
+
- name: Run tests
|
| 108 |
+
env: ${{ matrix.env }}
|
| 109 |
+
run: tox
|
.gitignore
CHANGED
|
@@ -1,75 +1,110 @@
|
|
| 1 |
-
#
|
| 2 |
-
|
| 3 |
-
*
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
# Virtual environment
|
| 7 |
-
venv/
|
| 8 |
-
|
| 9 |
-
# Streamlit cache
|
| 10 |
-
.streamlit/
|
| 11 |
-
|
| 12 |
-
# PyCharm files
|
| 13 |
-
.idea/
|
| 14 |
-
|
| 15 |
-
# VS Code files
|
| 16 |
-
.vscode/
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
.
|
|
|
|
| 20 |
|
| 21 |
-
#
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
.DS_Store
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
*.db
|
| 33 |
-
*.
|
| 34 |
|
| 35 |
-
#
|
| 36 |
-
*.
|
| 37 |
-
|
| 38 |
-
# Package directories
|
| 39 |
dist/
|
| 40 |
build/
|
| 41 |
-
*.egg-info/
|
| 42 |
|
| 43 |
-
#
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
#
|
| 48 |
-
|
| 49 |
-
.coverage
|
| 50 |
-
.coverage.*
|
| 51 |
-
coverage.xml
|
| 52 |
|
| 53 |
-
#
|
| 54 |
-
.
|
| 55 |
|
| 56 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
.mypy_cache/
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
#
|
| 60 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
-
#
|
| 63 |
-
|
| 64 |
|
| 65 |
-
#
|
| 66 |
-
|
| 67 |
|
| 68 |
-
#
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
#
|
| 72 |
-
|
|
|
|
| 73 |
|
| 74 |
-
#
|
| 75 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
# local files
|
| 2 |
+
site/*
|
| 3 |
+
local_tests/*
|
| 4 |
+
.mcpregistry_*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
# AI related files
|
| 7 |
+
.claude/*
|
| 8 |
+
CLAUDE.md
|
| 9 |
|
| 10 |
+
# cached files
|
| 11 |
+
__pycache__/
|
| 12 |
+
*.py[cod]
|
| 13 |
+
.cache
|
| 14 |
.DS_Store
|
| 15 |
+
*~
|
| 16 |
+
.*.sw[po]
|
| 17 |
+
.build
|
| 18 |
+
.ve
|
| 19 |
+
.env
|
| 20 |
+
.pytest
|
| 21 |
+
.benchmarks
|
| 22 |
+
.bootstrap
|
| 23 |
+
.appveyor.token
|
| 24 |
+
*.bak
|
| 25 |
*.db
|
| 26 |
+
*.db-*
|
| 27 |
|
| 28 |
+
# installation package
|
| 29 |
+
*.egg-info/
|
|
|
|
|
|
|
| 30 |
dist/
|
| 31 |
build/
|
|
|
|
| 32 |
|
| 33 |
+
# environments
|
| 34 |
+
.venv
|
| 35 |
+
env/
|
| 36 |
+
venv/
|
| 37 |
+
ENV/
|
| 38 |
+
env.bak/
|
| 39 |
+
venv.bak/
|
| 40 |
|
| 41 |
+
# C extensions
|
| 42 |
+
*.so
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
+
# pycharm
|
| 45 |
+
.idea/
|
| 46 |
|
| 47 |
+
# vscode
|
| 48 |
+
*.code-workspace
|
| 49 |
+
|
| 50 |
+
# Packages
|
| 51 |
+
*.egg
|
| 52 |
+
*.egg-info
|
| 53 |
+
dist
|
| 54 |
+
build
|
| 55 |
+
eggs
|
| 56 |
+
.eggs
|
| 57 |
+
parts
|
| 58 |
+
bin
|
| 59 |
+
var
|
| 60 |
+
sdist
|
| 61 |
+
wheelhouse
|
| 62 |
+
develop-eggs
|
| 63 |
+
.installed.cfg
|
| 64 |
+
lib
|
| 65 |
+
lib64
|
| 66 |
+
venv*/
|
| 67 |
+
.venv*/
|
| 68 |
+
pyvenv*/
|
| 69 |
+
pip-wheel-metadata/
|
| 70 |
+
poetry.lock
|
| 71 |
+
|
| 72 |
+
# Installer logs
|
| 73 |
+
pip-log.txt
|
| 74 |
+
|
| 75 |
+
# mypy
|
| 76 |
.mypy_cache/
|
| 77 |
+
.dmypy.json
|
| 78 |
+
dmypy.json
|
| 79 |
+
mypy.ini
|
| 80 |
|
| 81 |
+
# test caches
|
| 82 |
+
.tox/
|
| 83 |
+
.pytest_cache/
|
| 84 |
+
.coverage
|
| 85 |
+
htmlcov
|
| 86 |
+
report.xml
|
| 87 |
+
nosetests.xml
|
| 88 |
+
coverage.xml
|
| 89 |
|
| 90 |
+
# Translations
|
| 91 |
+
*.mo
|
| 92 |
|
| 93 |
+
# Buildout
|
| 94 |
+
.mr.developer.cfg
|
| 95 |
|
| 96 |
+
# IDE project files
|
| 97 |
+
.project
|
| 98 |
+
.pydevproject
|
| 99 |
+
.idea
|
| 100 |
+
*.iml
|
| 101 |
+
*.komodoproject
|
| 102 |
|
| 103 |
+
# Complexity
|
| 104 |
+
output/*.html
|
| 105 |
+
output/*/index.html
|
| 106 |
|
| 107 |
+
# Sphinx
|
| 108 |
+
docs/_build
|
| 109 |
+
public/
|
| 110 |
+
web/
|
.hfignore
CHANGED
|
@@ -1,9 +1,23 @@
|
|
| 1 |
-
.git
|
| 2 |
-
.github
|
| 3 |
-
venv
|
| 4 |
-
__pycache__
|
| 5 |
*.pyc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
.env
|
| 7 |
-
chat_history.json
|
| 8 |
-
test_patchright.py
|
| 9 |
-
client_secret.json
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.github
|
| 3 |
+
.venv
|
| 4 |
+
__pycache__
|
| 5 |
*.pyc
|
| 6 |
+
*.pyo
|
| 7 |
+
*.pyd
|
| 8 |
+
.DS_Store
|
| 9 |
+
tests/
|
| 10 |
+
docs/
|
| 11 |
+
images/
|
| 12 |
+
.coverage
|
| 13 |
+
htmlcov/
|
| 14 |
+
pytest_cache/
|
| 15 |
+
.mypy_cache/
|
| 16 |
+
.tox/
|
| 17 |
+
.pytest_cache/
|
| 18 |
+
.ruff_cache/
|
| 19 |
+
.uv/
|
| 20 |
+
dist/
|
| 21 |
+
build/
|
| 22 |
+
*.egg-info/
|
| 23 |
.env
|
|
|
|
|
|
|
|
|
.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
repos:
|
| 2 |
+
- repo: https://github.com/PyCQA/bandit
|
| 3 |
+
rev: 1.9.0
|
| 4 |
+
hooks:
|
| 5 |
+
- id: bandit
|
| 6 |
+
args: [-r, -c, .bandit.yml]
|
| 7 |
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
| 8 |
+
# Ruff version.
|
| 9 |
+
rev: v0.14.5
|
| 10 |
+
hooks:
|
| 11 |
+
# Run the linter.
|
| 12 |
+
- id: ruff
|
| 13 |
+
args: [ --fix ]
|
| 14 |
+
# Run the formatter.
|
| 15 |
+
- id: ruff-format
|
| 16 |
+
- repo: https://github.com/netromdk/vermin
|
| 17 |
+
rev: v1.7.0
|
| 18 |
+
hooks:
|
| 19 |
+
- id: vermin
|
| 20 |
+
args: ['-t=3.10-', '--violations', '--eval-annotations', '--no-tips']
|
.readthedocs.yaml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# See https://docs.readthedocs.com/platform/stable/intro/zensical.html for details
|
| 2 |
+
# Example: https://github.com/readthedocs/test-builds/tree/zensical
|
| 3 |
+
|
| 4 |
+
version: 2
|
| 5 |
+
|
| 6 |
+
build:
|
| 7 |
+
os: ubuntu-24.04
|
| 8 |
+
apt_packages:
|
| 9 |
+
- pngquant
|
| 10 |
+
tools:
|
| 11 |
+
python: "3.13"
|
| 12 |
+
jobs:
|
| 13 |
+
install:
|
| 14 |
+
- pip install -r docs/requirements.txt
|
| 15 |
+
- pip install ".[all]"
|
| 16 |
+
build:
|
| 17 |
+
html:
|
| 18 |
+
- zensical build
|
| 19 |
+
post_build:
|
| 20 |
+
- mkdir -p $READTHEDOCS_OUTPUT/html/
|
| 21 |
+
- cp --recursive site/* $READTHEDOCS_OUTPUT/html/
|
CODE_OF_CONDUCT.md
CHANGED
|
@@ -60,7 +60,7 @@ representative at an online or offline event.
|
|
| 60 |
|
| 61 |
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
| 62 |
reported to the community leaders responsible for enforcement at
|
| 63 |
-
|
| 64 |
All complaints will be reviewed and investigated promptly and fairly.
|
| 65 |
|
| 66 |
All community leaders are obligated to respect the privacy and security of the
|
|
|
|
| 60 |
|
| 61 |
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
| 62 |
reported to the community leaders responsible for enforcement at
|
| 63 |
+
karim.shoair@pm.me.
|
| 64 |
All complaints will be reviewed and investigated promptly and fairly.
|
| 65 |
|
| 66 |
All community leaders are obligated to respect the privacy and security of the
|
CONTRIBUTING.md
CHANGED
|
@@ -1,167 +1,106 @@
|
|
| 1 |
-
# Contributing to
|
| 2 |
|
| 3 |
-
|
| 4 |
|
| 5 |
-
|
| 6 |
|
| 7 |
-
|
| 8 |
|
| 9 |
-
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
### Setting Up Development Environment
|
| 14 |
|
| 15 |
-
|
| 16 |
-
2. Clone your fork:
|
| 17 |
-
```bash
|
| 18 |
-
git clone https://github.com/your-username/CyberScraper-2077.git
|
| 19 |
-
cd CyberScraper-2077
|
| 20 |
-
```
|
| 21 |
-
3. Create a virtual environment:
|
| 22 |
-
```bash
|
| 23 |
-
python -m venv venv
|
| 24 |
-
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 25 |
-
```
|
| 26 |
-
4. Install dependencies:
|
| 27 |
-
```bash
|
| 28 |
-
pip install -r requirements.txt
|
| 29 |
-
playwright install
|
| 30 |
-
```
|
| 31 |
|
| 32 |
-
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
```
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
| 47 |
```
|
| 48 |
-
6. Create a Pull Request
|
| 49 |
-
|
| 50 |
-
## 📝 Commit Message Guidelines
|
| 51 |
|
| 52 |
-
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
```
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
[optional footer]
|
| 60 |
```
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
- `test`: Adding missing tests
|
| 69 |
-
- `chore`: Changes to build process or auxiliary tools
|
| 70 |
-
|
| 71 |
-
Example:
|
| 72 |
-
```
|
| 73 |
-
feat(scraper): add support for dynamic loading websites
|
| 74 |
```
|
| 75 |
|
| 76 |
-
##
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
|
|
|
|
|
|
|
|
|
| 92 |
```
|
| 93 |
-
CyberScraper-2077/
|
| 94 |
-
├── app/
|
| 95 |
-
│ ├── scrapers/
|
| 96 |
-
│ ├── utils/
|
| 97 |
-
│ └── ui_components/
|
| 98 |
-
├── src/
|
| 99 |
-
│ └── models/
|
| 100 |
-
├── tests/
|
| 101 |
-
└── docs/
|
| 102 |
-
```
|
| 103 |
-
|
| 104 |
-
- Place new scraper implementations in `app/scrapers/`
|
| 105 |
-
- Add utility functions in `app/utils/`
|
| 106 |
-
- UI components go in `app/ui_components/`
|
| 107 |
-
- Model-related code goes in `src/models/`
|
| 108 |
-
|
| 109 |
-
## 🎯 Feature Requests
|
| 110 |
-
|
| 111 |
-
- Use GitHub Issues to propose new features
|
| 112 |
-
- Tag feature requests with `enhancement`
|
| 113 |
-
- Provide clear use cases
|
| 114 |
-
- Discuss implementation approach
|
| 115 |
-
|
| 116 |
-
## 🐛 Bug Reports
|
| 117 |
-
|
| 118 |
-
When reporting bugs, include:
|
| 119 |
-
- Detailed description of the issue
|
| 120 |
-
- Steps to reproduce
|
| 121 |
-
- Expected vs actual behavior
|
| 122 |
-
- Environment details (OS, Python version, etc.)
|
| 123 |
-
- Screenshots if applicable
|
| 124 |
-
|
| 125 |
-
## 🔍 Pull Request Process
|
| 126 |
-
|
| 127 |
-
1. Update documentation
|
| 128 |
-
2. Add/update tests
|
| 129 |
-
3. Ensure CI/CD pipeline passes
|
| 130 |
-
4. Get at least one code review
|
| 131 |
-
5. Squash commits if requested
|
| 132 |
-
6. Ensure branch is up to date with main
|
| 133 |
-
|
| 134 |
-
## ⚙️ Development Best Practices
|
| 135 |
-
|
| 136 |
-
1. Follow PEP 8 style guide
|
| 137 |
-
2. Use type hints
|
| 138 |
-
3. Keep functions/methods focused and small
|
| 139 |
-
4. Comment complex logic
|
| 140 |
-
5. Use meaningful variable/function names
|
| 141 |
-
6. Handle errors appropriately
|
| 142 |
-
7. Log important operations
|
| 143 |
-
|
| 144 |
-
## 🚫 What to Avoid
|
| 145 |
-
|
| 146 |
-
- Breaking existing functionality
|
| 147 |
-
- Introducing unnecessary dependencies
|
| 148 |
-
- Making large, unfocused PRs
|
| 149 |
-
- Ignoring code review feedback
|
| 150 |
-
- Modifying core functionality without discussion
|
| 151 |
-
|
| 152 |
-
## 🏆 Recognition
|
| 153 |
-
|
| 154 |
-
Contributors will be added to our README.md and CONTRIBUTORS.md files. We value and appreciate all contributions!
|
| 155 |
-
|
| 156 |
-
## 📞 Getting Help
|
| 157 |
-
|
| 158 |
-
- Create an issue for questions
|
| 159 |
-
- Join our Discord community
|
| 160 |
-
- Check existing documentation
|
| 161 |
-
- Look through closed issues
|
| 162 |
-
|
| 163 |
-
## 📜 License
|
| 164 |
|
| 165 |
-
|
|
|
|
| 166 |
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contributing to Scrapling
|
| 2 |
|
| 3 |
+
Thank you for your interest in contributing to Scrapling!
|
| 4 |
|
| 5 |
+
Everybody is invited and welcome to contribute to Scrapling.
|
| 6 |
|
| 7 |
+
Minor changes are more likely to be included promptly. Adding unit tests for new features or test cases for bugs you've fixed helps us ensure that the Pull Request (PR) is acceptable.
|
| 8 |
|
| 9 |
+
There are many ways to contribute to Scrapling. Here are some of them:
|
| 10 |
|
| 11 |
+
- Report bugs and request features using the [GitHub issues](https://github.com/D4Vinci/Scrapling/issues). Please follow the issue template to help us resolve your issue quickly.
|
| 12 |
+
- Blog about Scrapling. Tell the world how you’re using Scrapling. This will help newcomers with more examples and increase the Scrapling project's visibility.
|
| 13 |
+
- Join the [Discord community](https://discord.gg/EMgGbDceNQ) and share your ideas on how to improve Scrapling. We’re always open to suggestions.
|
| 14 |
+
- If you are not a developer, perhaps you would like to help with translating the [documentation](https://github.com/D4Vinci/Scrapling/tree/docs)?
|
| 15 |
|
|
|
|
| 16 |
|
| 17 |
+
## Finding work
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
If you have decided to make a contribution to Scrapling, but you do not know what to contribute, here are some ways to find pending work:
|
| 20 |
|
| 21 |
+
- Check out the [contribution](https://github.com/D4Vinci/Scrapling/contribute) GitHub page, which lists open issues tagged as `good first issue`. These issues provide a good starting point.
|
| 22 |
+
- There are also the [help wanted](https://github.com/D4Vinci/Scrapling/issues?q=is%3Aissue%20label%3A%22help%20wanted%22%20state%3Aopen) issues, but know that some may require familiarity with the Scrapling code base first. You can also target any other issue, provided it is not tagged as `invalid`, `wontfix`, or similar tags.
|
| 23 |
+
- If you enjoy writing automated tests, you can work on increasing our test coverage. Currently, the test coverage is around 90–92%.
|
| 24 |
+
- Join the [Discord community](https://discord.gg/EMgGbDceNQ) and ask questions in the `#help` channel.
|
| 25 |
+
|
| 26 |
+
## Coding style
|
| 27 |
+
Please follow these coding conventions as we do when writing code for Scrapling:
|
| 28 |
+
- We use [pre-commit](https://pre-commit.com/) to automatically address simple code issues before every commit, so please install it and run `pre-commit install` to set it up. This will install hooks to run [ruff](https://docs.astral.sh/ruff/), [bandit](https://github.com/PyCQA/bandit), and [vermin](https://github.com/netromdk/vermin) on every commit. We are currently using a workflow to automatically run these tools on every PR, so if your code doesn't pass these checks, the PR will be rejected.
|
| 29 |
+
- We use type hints for better code clarity and [pyright](https://github.com/microsoft/pyright) for static type checking, which depends on the type hints, of course.
|
| 30 |
+
- We use the conventional commit messages format as [here](https://gist.github.com/qoomon/5dfcdf8eec66a051ecd85625518cfd13#types), so for example, we use the following prefixes for commit messages:
|
| 31 |
+
|
| 32 |
+
| Prefix | When to use it |
|
| 33 |
+
|-------------|--------------------------|
|
| 34 |
+
| `feat:` | New feature added |
|
| 35 |
+
| `fix:` | Bug fix |
|
| 36 |
+
| `docs:` | Documentation change/add |
|
| 37 |
+
| `test:` | Tests |
|
| 38 |
+
| `refactor:` | Code refactoring |
|
| 39 |
+
| `chore:` | Maintenance tasks |
|
| 40 |
+
|
| 41 |
+
Then include the details of the change in the commit message body/description.
|
| 42 |
+
|
| 43 |
+
Example:
|
| 44 |
```
|
| 45 |
+
feat: add `adaptive` for similar elements
|
| 46 |
+
|
| 47 |
+
- Added find_similar() method
|
| 48 |
+
- Implemented pattern matching
|
| 49 |
+
- Added tests and documentation
|
| 50 |
```
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
> Please don’t put your name in the code you contribute; git provides enough metadata to identify the author of the code.
|
| 53 |
|
| 54 |
+
## Development
|
| 55 |
+
Setting the scrapling logging level to `debug` makes it easier to know what's happening in the background.
|
| 56 |
+
```python
|
| 57 |
+
import logging
|
| 58 |
+
logging.getLogger("scrapling").setLevel(logging.DEBUG)
|
| 59 |
```
|
| 60 |
+
Bonus: You can install the beta of the upcoming update from the dev branch as follows
|
| 61 |
+
```commandline
|
| 62 |
+
pip3 install git+https://github.com/D4Vinci/Scrapling.git@dev
|
|
|
|
|
|
|
| 63 |
```
|
| 64 |
|
| 65 |
+
## Building Documentation
|
| 66 |
+
Documentation is built using [MkDocs](https://www.mkdocs.org/). You can build it locally using the following commands:
|
| 67 |
+
```bash
|
| 68 |
+
pip install mkdocs-material
|
| 69 |
+
mkdocs serve # Local preview
|
| 70 |
+
mkdocs build # Build the static site
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
```
|
| 72 |
|
| 73 |
+
## Tests
|
| 74 |
+
Scrapling includes a comprehensive test suite that can be executed with pytest. However, first, you need to install all libraries and `pytest-plugins` listed in `tests/requirements.txt`. Then, running the tests will result in an output like this:
|
| 75 |
+
```bash
|
| 76 |
+
$ pytest tests -n auto
|
| 77 |
+
=============================== test session starts ===============================
|
| 78 |
+
platform darwin -- Python 3.13.8, pytest-8.4.2, pluggy-1.6.0 -- /Users/<redacted>/.venv/bin/python3.13
|
| 79 |
+
cachedir: .pytest_cache
|
| 80 |
+
rootdir: /Users/<redacted>/scrapling
|
| 81 |
+
configfile: pytest.ini
|
| 82 |
+
plugins: asyncio-1.2.0, anyio-4.11.0, xdist-3.8.0, httpbin-2.1.0, cov-7.0.0
|
| 83 |
+
asyncio: mode=Mode.AUTO, debug=False, asyncio_default_fixture_loop_scope=function, asyncio_default_test_loop_scope=function
|
| 84 |
+
10 workers [271 items]
|
| 85 |
+
scheduling tests via LoadScheduling
|
| 86 |
+
|
| 87 |
+
...<shortened>...
|
| 88 |
+
|
| 89 |
+
=============================== 271 passed in 52.68s ==============================
|
| 90 |
+
```
|
| 91 |
+
Hence, we used `-n auto` in the command above to run tests in threads to increase speed.
|
| 92 |
|
| 93 |
+
Bonus: You can also see the test coverage with the `pytest` plugin below
|
| 94 |
+
```bash
|
| 95 |
+
pytest --cov=scrapling tests/
|
| 96 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
+
## Making a Pull Request
|
| 99 |
+
To ensure that your PR gets accepted, please make sure that your PR is based on the latest changes from the dev branch and that it satisfies the following requirements:
|
| 100 |
|
| 101 |
+
- The PR should be made against the [**dev**](https://github.com/D4Vinci/Scrapling/tree/dev) branch of Scrapling. Any PR made against the main branch will be rejected.
|
| 102 |
+
- The code should be passing all available tests. We use tox with GitHub's CI to run the current tests on all supported Python versions for every code-related commit.
|
| 103 |
+
- The code should be passing all code quality checks we mentioned above. We are using GitHub's CI to enforce the code style checks performed by pre-commit. If you were using the pre-commit hooks we discussed above, you should not see any issues when committing your changes.
|
| 104 |
+
- Make your changes, keep the code clean with an explanation of any part that might be vague, and remember to create a separate virtual environment for this project.
|
| 105 |
+
- If you are adding a new feature, please add tests for it.
|
| 106 |
+
- If you are fixing a bug, please add code with the PR that reproduces the bug.
|
Dockerfile
CHANGED
|
@@ -1,102 +1,48 @@
|
|
| 1 |
-
|
| 2 |
-
FROM python:3.12-slim-bookworm
|
| 3 |
|
| 4 |
-
|
| 5 |
-
ENV PYTHONUNBUFFERED=1 \
|
| 6 |
-
PYTHONDONTWRITEBYTECODE=1 \
|
| 7 |
-
PORT=7860 \
|
| 8 |
-
UV_SYSTEM_PYTHON=1 \
|
| 9 |
-
HOME=/home/user \
|
| 10 |
-
STREAMLIT_BROWSER_GATHER_USAGE_STATS=false \
|
| 11 |
-
STREAMLIT_SERVER_HEADLESS=true \
|
| 12 |
-
STREAMLIT_SERVER_PORT=8501 \
|
| 13 |
-
STREAMLIT_SERVER_ADDRESS=0.0.0.0
|
| 14 |
-
|
| 15 |
-
# Install system dependencies
|
| 16 |
-
RUN apt-get update && apt-get install -y \
|
| 17 |
-
wget \
|
| 18 |
-
gnupg \
|
| 19 |
-
git \
|
| 20 |
-
tor \
|
| 21 |
-
tor-geoipdb \
|
| 22 |
-
netcat-traditional \
|
| 23 |
-
curl \
|
| 24 |
-
build-essential \
|
| 25 |
-
python3-dev \
|
| 26 |
-
libffi-dev \
|
| 27 |
-
procps \
|
| 28 |
-
nginx \
|
| 29 |
-
# Browser dependencies for Playwright/Patchright
|
| 30 |
-
libglib2.0-0 \
|
| 31 |
-
libnspr4 \
|
| 32 |
-
libnss3 \
|
| 33 |
-
libdbus-1-3 \
|
| 34 |
-
libatk1.0-0 \
|
| 35 |
-
libatk-bridge2.0-0 \
|
| 36 |
-
libcups2 \
|
| 37 |
-
libxkbcommon0 \
|
| 38 |
-
libatspi2.0-0 \
|
| 39 |
-
libxcomposite1 \
|
| 40 |
-
libxdamage1 \
|
| 41 |
-
libxfixes3 \
|
| 42 |
-
libxrandr2 \
|
| 43 |
-
libgbm1 \
|
| 44 |
-
libcairo2 \
|
| 45 |
-
libpango-1.0-0 \
|
| 46 |
-
libasound2 \
|
| 47 |
-
&& apt-get clean \
|
| 48 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 49 |
-
|
| 50 |
-
# Install uv
|
| 51 |
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
|
| 52 |
|
| 53 |
-
# Set
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
COPY requirements.txt .
|
| 58 |
-
RUN uv pip install --system -r requirements.txt
|
| 59 |
-
RUN uv pip install --system fastapi uvicorn
|
| 60 |
-
|
| 61 |
-
# Install patchright browser (Chromium)
|
| 62 |
-
RUN patchright install chromium
|
| 63 |
-
|
| 64 |
-
# Create a non-root user
|
| 65 |
-
RUN useradd -m -u 1000 user
|
| 66 |
|
| 67 |
-
|
| 68 |
-
RUN echo "SocksPort 9050" >> /etc/tor/torrc && \
|
| 69 |
-
echo "ControlPort 9051" >> /etc/tor/torrc && \
|
| 70 |
-
echo "CookieAuthentication 1" >> /etc/tor/torrc && \
|
| 71 |
-
echo "DataDirectory /var/lib/tor" >> /etc/tor/torrc
|
| 72 |
|
| 73 |
-
#
|
| 74 |
-
|
| 75 |
-
chown -R user:user /var/lib/tor && \
|
| 76 |
-
chmod 700 /var/lib/tor && \
|
| 77 |
-
chown -R user:user /app && \
|
| 78 |
-
mkdir -p /var/log/nginx /var/lib/nginx /tmp && \
|
| 79 |
-
chown -R user:user /var/log/nginx /var/lib/nginx /tmp
|
| 80 |
|
| 81 |
-
#
|
| 82 |
-
RUN
|
|
|
|
| 83 |
|
| 84 |
-
# Copy
|
| 85 |
-
COPY
|
| 86 |
|
| 87 |
-
# Install
|
| 88 |
-
RUN
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
-
#
|
| 92 |
-
RUN
|
|
|
|
| 93 |
|
| 94 |
-
# Switch to non-root user
|
| 95 |
USER user
|
| 96 |
-
ENV PATH="/home/user/.local/bin:$PATH"
|
| 97 |
|
| 98 |
-
# Expose port
|
| 99 |
EXPOSE 7860
|
| 100 |
|
| 101 |
-
# Set
|
| 102 |
-
ENTRYPOINT ["
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim-trixie
|
|
|
|
| 2 |
|
| 3 |
+
LABEL io.modelcontextprotocol.server.name="io.github.D4Vinci/Scrapling"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
|
| 5 |
|
| 6 |
+
# Set environment variables
|
| 7 |
+
ENV DEBIAN_FRONTEND=noninteractive \
|
| 8 |
+
PYTHONUNBUFFERED=1 \
|
| 9 |
+
PYTHONDONTWRITEBYTECODE=1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
+
WORKDIR /app
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
+
# Copy dependency file first for better layer caching
|
| 14 |
+
COPY pyproject.toml ./
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
# Install dependencies only
|
| 17 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 18 |
+
uv sync --no-install-project --all-extras --compile-bytecode
|
| 19 |
|
| 20 |
+
# Copy source code
|
| 21 |
+
COPY . .
|
| 22 |
|
| 23 |
+
# Install browsers and project in one optimized layer
|
| 24 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 25 |
+
--mount=type=cache,target=/var/cache/apt \
|
| 26 |
+
--mount=type=cache,target=/var/lib/apt \
|
| 27 |
+
apt-get update && \
|
| 28 |
+
uv run playwright install-deps chromium && \
|
| 29 |
+
uv run playwright install chromium && \
|
| 30 |
+
uv sync --all-extras --compile-bytecode && \
|
| 31 |
+
apt-get clean && \
|
| 32 |
+
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
| 33 |
|
| 34 |
+
# Create a non-root user
|
| 35 |
+
RUN useradd -m -u 1000 user && \
|
| 36 |
+
chown -R user:user /app
|
| 37 |
|
| 38 |
+
# Switch to the non-root user
|
| 39 |
USER user
|
|
|
|
| 40 |
|
| 41 |
+
# Expose port for MCP server HTTP transport
|
| 42 |
EXPOSE 7860
|
| 43 |
|
| 44 |
+
# Set entrypoint to run scrapling
|
| 45 |
+
ENTRYPOINT ["uv", "run", "scrapling"]
|
| 46 |
+
|
| 47 |
+
# Default command (can be overridden)
|
| 48 |
+
CMD ["mcp", "--http", "--port", "7860", "--host", "0.0.0.0"]
|
LICENSE
CHANGED
|
@@ -1,21 +1,28 @@
|
|
| 1 |
-
|
| 2 |
|
| 3 |
-
Copyright (c) 2024
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
in the Software without restriction, including without limitation the rights
|
| 8 |
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
-
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
-
furnished to do so, subject to the following conditions:
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
BSD 3-Clause License
|
| 2 |
|
| 3 |
+
Copyright (c) 2024, Karim shoair
|
| 4 |
|
| 5 |
+
Redistribution and use in source and binary forms, with or without
|
| 6 |
+
modification, are permitted provided that the following conditions are met:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
+
1. Redistributions of source code must retain the above copyright notice, this
|
| 9 |
+
list of conditions and the following disclaimer.
|
| 10 |
|
| 11 |
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
| 12 |
+
this list of conditions and the following disclaimer in the documentation
|
| 13 |
+
and/or other materials provided with the distribution.
|
| 14 |
+
|
| 15 |
+
3. Neither the name of the copyright holder nor the names of its
|
| 16 |
+
contributors may be used to endorse or promote products derived from
|
| 17 |
+
this software without specific prior written permission.
|
| 18 |
+
|
| 19 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
| 20 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 21 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
| 22 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
| 23 |
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
| 24 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
| 25 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
| 26 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
| 27 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 28 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
MANIFEST.in
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
include LICENSE
|
| 2 |
+
include *.db
|
| 3 |
+
include *.js
|
| 4 |
+
include scrapling/*.db
|
| 5 |
+
include scrapling/*.db*
|
| 6 |
+
include scrapling/*.db-*
|
| 7 |
+
include scrapling/py.typed
|
| 8 |
+
include scrapling/.scrapling_dependencies_installed
|
| 9 |
+
include .scrapling_dependencies_installed
|
| 10 |
+
|
| 11 |
+
recursive-exclude * __pycache__
|
| 12 |
+
recursive-exclude * *.py[co]
|
README.md
CHANGED
|
@@ -1,406 +1,437 @@
|
|
| 1 |
---
|
| 2 |
title: Scraper Hub
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
app_port: 7860
|
| 8 |
---
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
<p align="center">
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
</p>
|
| 15 |
|
| 16 |
<p align="center">
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
</p>
|
| 19 |
|
| 20 |
-
|
| 21 |
-
[](https://streamlit.io/)
|
| 22 |
-
[](https://opensource.org/licenses/MIT)
|
| 23 |
-
[](http://makeapullrequest.com)
|
| 24 |
-
|
| 25 |
-
> Rip data from the net, leaving no trace. Welcome to the future of web scraping.
|
| 26 |
-
|
| 27 |
-
## 🔍 About
|
| 28 |
-
|
| 29 |
-
CyberScraper 2077 is not just another web scraping tool – it's a glimpse into the future of data extraction. Born from the neon-lit streets of a cyberpunk world, this AI-powered scraper uses OpenAI, Gemini and LocalLLM Models to slice through the web's defenses, extracting the data you need with unparalleled precision and style.
|
| 30 |
-
|
| 31 |
-
Whether you're a corpo data analyst, a street-smart netrunner, or just someone looking to pull information from the digital realm, CyberScraper 2077 has got you covered.
|
| 32 |
-
|
| 33 |
-
<p align="center">
|
| 34 |
-
<img src="https://i.postimg.cc/3NHb15wq/20240821-074556.gif">
|
| 35 |
-
</p>
|
| 36 |
-
|
| 37 |
-
## ✨ Features
|
| 38 |
-
|
| 39 |
-
- **AI-Powered Extraction**: Utilizes cutting-edge AI models to understand and parse web content intelligently.
|
| 40 |
-
- **Sleek Streamlit Interface**: User-friendly GUI that even a chrome-armed street samurai could navigate.
|
| 41 |
-
- **Multi-Format Support**: Export your data in JSON, CSV, HTML, SQL or Excel – whatever fits your cyberdeck.
|
| 42 |
-
- **Tor Network Support**: Safely scrape .onion sites through the Tor network with automatic routing and security features.
|
| 43 |
-
- **Stealth Mode**: Implemented stealth mode parameters that help avoid detection as a bot.
|
| 44 |
-
- **Ollama Support**: Use a huge library of open source LLMs.
|
| 45 |
-
- **Async Operations**: Lightning-fast scraping that would make a Trauma Team jealous.
|
| 46 |
-
- **Smart Parsing**: Structures scraped content as if it was extracted straight from the engram of a master netrunner.
|
| 47 |
-
- **Caching**: Implemented content-based and query-based caching using LRU cache and a custom dictionary to reduce redundant API calls.
|
| 48 |
-
- **Upload to Google Sheets**: Now you can easily upload your extracted CSV data to Google Sheets with one click.
|
| 49 |
-
- **Bypass Captcha**: Bypass captcha by using the -captcha at the end of the URL. (Currently only works natively, doesn't work on Docker)
|
| 50 |
-
- **Current Browser**: The current browser feature uses your local browser instance which will help you bypass 99% of bot detections. (Only use when necessary)
|
| 51 |
-
- **Navigate through the Pages (BETA)**: Navigate through the webpage and scrape data from different pages.
|
| 52 |
-
|
| 53 |
-
## 🪟 For Windows Users
|
| 54 |
|
| 55 |
-
|
| 56 |
|
| 57 |
-
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
```
|
| 66 |
-
|
| 67 |
-
2. Create and activate a virtual environment:
|
| 68 |
-
```bash
|
| 69 |
-
virtualenv venv
|
| 70 |
-
source venv/bin/activate # Optional
|
| 71 |
-
```
|
| 72 |
-
|
| 73 |
-
3. Install the required packages:
|
| 74 |
-
```bash
|
| 75 |
-
pip install -r requirements.txt
|
| 76 |
-
```
|
| 77 |
-
|
| 78 |
-
4. Install the playwright:
|
| 79 |
-
```bash
|
| 80 |
-
playwright install
|
| 81 |
-
```
|
| 82 |
-
|
| 83 |
-
5. Set OpenAI & Gemini Key in your environment:
|
| 84 |
-
|
| 85 |
-
Linux/Mac:
|
| 86 |
-
```bash
|
| 87 |
-
export OPENAI_API_KEY="your-api-key-here"
|
| 88 |
-
export GOOGLE_API_KEY="your-api-key-here"
|
| 89 |
-
```
|
| 90 |
-
|
| 91 |
-
### Using Ollama
|
| 92 |
-
|
| 93 |
-
Note: I only recommend using OpenAI and Gemini API as these models are really good at following instructions. If you are using open-source LLMs, make sure you have a good system as the speed of the data generation/presentation depends on how well your system can run the LLM. You may also have to fine-tune the prompt and add some additional filters yourself.
|
| 94 |
-
|
| 95 |
-
```bash
|
| 96 |
-
1. Setup Ollama using `pip install ollama`
|
| 97 |
-
2. Download Ollama from the official website: https://ollama.com/download
|
| 98 |
-
3. Now type: ollama pull llama3.1 or whatever LLM you want to use.
|
| 99 |
-
4. Now follow the rest of the steps below.
|
| 100 |
```
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
1. Ensure you have Docker installed on your system.
|
| 107 |
-
|
| 108 |
-
2. Clone this repository:
|
| 109 |
-
```bash
|
| 110 |
-
git clone https://github.com/itsOwen/CyberScraper-2077.git
|
| 111 |
-
cd CyberScraper-2077
|
| 112 |
-
```
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
```
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
docker run -p 8501:8501 -e OPENAI_API_KEY="your-actual-api-key" -e GOOGLE_API_KEY="your-actual-api-key" cyberscraper-2077
|
| 122 |
-
```
|
| 123 |
|
| 124 |
-
#
|
| 125 |
|
| 126 |
-
|
|
|
|
| 127 |
|
| 128 |
-
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
3. Find your host machine's IP address:
|
| 136 |
-
- On Linux/Mac: `ifconfig` or `ip addr show`
|
| 137 |
-
- On Windows: `ipconfig`
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
```
|
| 143 |
|
| 144 |
-
|
| 145 |
|
| 146 |
-
|
| 147 |
-
```bash
|
| 148 |
-
docker run -e OLLAMA_BASE_URL=http://<your-host-ip>:11434 -p 8501:8501 cyberscraper-2077
|
| 149 |
-
```
|
| 150 |
-
Replace `<your-host-ip>` with your actual host machine IP address.
|
| 151 |
|
| 152 |
-
|
| 153 |
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
-
#
|
|
|
|
|
|
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
-
|
|
|
|
|
|
|
| 164 |
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
-
|
|
|
|
|
|
|
| 168 |
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
```
|
| 173 |
-
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
```
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
> **Note**: The multi-page scraping feature is currently in beta. While functional, you may encounter occasional issues or unexpected behavior. We appreciate your feedback and patience as we continue to improve this feature.
|
| 180 |
-
|
| 181 |
-
CyberScraper 2077 now supports multi-page scraping, allowing you to extract data from multiple pages of a website in one go. This feature is perfect for scraping paginated content, search results, or any site with data spread across multiple pages.
|
| 182 |
-
|
| 183 |
-
### How to Use Multi-Page Scraping
|
| 184 |
-
|
| 185 |
-
I suggest you enter the URL structure every time if you want to scrape multiple pages so it can detect the URL structure easily. It detects nearly all URL types.
|
| 186 |
-
|
| 187 |
-
1. **Basic Usage**:
|
| 188 |
-
To scrape multiple pages, use the following format when entering the URL:
|
| 189 |
-
```
|
| 190 |
-
https://example.com/page 1-5
|
| 191 |
-
https://example.com/p/ 1-6
|
| 192 |
-
https://example.com/xample/something-something-1279?p=1 1-3
|
| 193 |
-
```
|
| 194 |
-
This will scrape pages 1 through 5 of the website.
|
| 195 |
-
|
| 196 |
-
2. **Custom Page Ranges**:
|
| 197 |
-
You can specify custom page ranges:
|
| 198 |
-
```
|
| 199 |
-
https://example.com/p/ 1-5,7,9-12
|
| 200 |
-
https://example.com/xample/something-something-1279?p=1 1,7,8,9
|
| 201 |
-
```
|
| 202 |
-
This will scrape pages 1 to 5, page 7, and pages 9 to 12.
|
| 203 |
-
|
| 204 |
-
3. **URL Patterns**:
|
| 205 |
-
For websites with different URL structures, you can specify a pattern:
|
| 206 |
-
```
|
| 207 |
-
https://example.com/search?q=cyberpunk&page={page} 1-5
|
| 208 |
-
```
|
| 209 |
-
Replace `{page}` with where the page number should be in the URL.
|
| 210 |
-
|
| 211 |
-
4. **Automatic Pattern Detection**:
|
| 212 |
-
If you don't specify a pattern, CyberScraper 2077 will attempt to detect the URL pattern automatically. However, for best results, specifying the pattern is recommended.
|
| 213 |
-
|
| 214 |
-
### Tips for Effective Multi-Page Scraping
|
| 215 |
-
|
| 216 |
-
- Start with a small range of pages to test before scraping a large number.
|
| 217 |
-
- Be mindful of the website's load and your scraping speed to avoid overloading servers.
|
| 218 |
-
- Use the `simulate_human` option for more natural scraping behavior on sites with anti-bot measures.
|
| 219 |
-
- Regularly check the website's `robots.txt` file and terms of service to ensure compliance.
|
| 220 |
-
|
| 221 |
-
### Example
|
| 222 |
-
|
| 223 |
-
```bash
|
| 224 |
-
URL Example : "https://news.ycombinator.com/?p=1 1-3 or 1,2,3,4"
|
| 225 |
```
|
|
|
|
| 226 |
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
#
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
```
|
| 260 |
-
http://example123abc.onion
|
| 261 |
-
```
|
| 262 |
-
|
| 263 |
-
2. **Safety Features**:
|
| 264 |
-
- Automatic .onion URL detection
|
| 265 |
-
- Built-in connection verification
|
| 266 |
-
- Tor Browser-like request headers
|
| 267 |
-
- Automatic circuit isolation
|
| 268 |
|
| 269 |
-
|
|
|
|
|
|
|
| 270 |
|
| 271 |
-
|
| 272 |
```python
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
```
|
| 280 |
|
| 281 |
-
##
|
| 282 |
|
| 283 |
-
|
| 284 |
-
- Use a VPN in addition to Tor for extra security
|
| 285 |
-
- Be patient as Tor connections can be slower than regular web scraping
|
| 286 |
-
- Avoid sending personal or identifying information through Tor
|
| 287 |
-
- Some .onion sites may be offline or unreachable
|
| 288 |
|
| 289 |
-
|
| 290 |
|
| 291 |
-
|
| 292 |
```bash
|
| 293 |
-
|
| 294 |
-
--network="host" \
|
| 295 |
-
-e OPENAI_API_KEY="your-api-key" \
|
| 296 |
-
cyberscraper-2077
|
| 297 |
```
|
| 298 |
-
|
| 299 |
-
### Example Usage
|
| 300 |
-
|
| 301 |
-
<p align="center">
|
| 302 |
-
<img src="https://i.postimg.cc/3JvhgtMP/cyberscraper-onion.png" alt="CyberScraper 2077 Onion Scrape">
|
| 303 |
-
</p>
|
| 304 |
-
|
| 305 |
-
## 🔐 Setup Google Sheets Authentication
|
| 306 |
-
|
| 307 |
-
1. Go to the Google Cloud Console (https://console.cloud.google.com/).
|
| 308 |
-
2. Select your project.
|
| 309 |
-
3. Navigate to "APIs & Services" > "Credentials".
|
| 310 |
-
4. Find your existing OAuth 2.0 Client ID and delete it.
|
| 311 |
-
5. Click "Create Credentials" > "OAuth client ID".
|
| 312 |
-
6. Choose "Web application" as the application type.
|
| 313 |
-
7. Name your client (e.g., "CyberScraper 2077 Web Client").
|
| 314 |
-
8. Under "Authorized JavaScript origins", add:
|
| 315 |
-
- http://localhost:8501
|
| 316 |
-
- http://localhost:8502
|
| 317 |
-
- http://127.0.0.1:8501
|
| 318 |
-
- http://127.0.0.1:8502
|
| 319 |
-
9. Under "Authorized redirect URIs", add:
|
| 320 |
-
- http://localhost:8501/
|
| 321 |
-
- http://127.0.0.1:8501/
|
| 322 |
-
- http://localhost:8502/
|
| 323 |
-
- http://127.0.0.1:8502/
|
| 324 |
-
10. Click "Create" to generate the new client ID.
|
| 325 |
-
11. Download the new client configuration JSON file and rename it to `client_secret.json`.
|
| 326 |
-
|
| 327 |
-
## ⚙️ Adjusting PlaywrightScraper Settings (optional)
|
| 328 |
-
|
| 329 |
-
Customize the `PlaywrightScraper` settings to fit your scraping needs. If some websites are giving you issues, you might want to check the behavior of the website:
|
| 330 |
-
|
| 331 |
```bash
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
bypass_cloudflare: bool = True:
|
| 337 |
```
|
| 338 |
|
| 339 |
-
|
|
|
|
| 340 |
|
| 341 |
-
|
| 342 |
|
| 343 |
-
|
| 344 |
|
| 345 |
-
|
| 346 |
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
|
| 353 |
-
## 🔧 Troubleshooting
|
| 354 |
|
| 355 |
-
|
| 356 |
|
| 357 |
-
|
| 358 |
|
| 359 |
-
|
| 360 |
-
|
|
|
|
|
|
|
| 361 |
|
| 362 |
-
**Q: Can I use this for commercial purposes?**
|
| 363 |
-
A: Yes, under the terms of the MIT License.
|
| 364 |
|
| 365 |
-
|
| 366 |
|
| 367 |
-
|
| 368 |
|
| 369 |
-
|
| 370 |
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
- Website: [owen.sh](https://owen.sh)
|
| 375 |
-
|
| 376 |
-
## 🚨 Disclaimer
|
| 377 |
-
|
| 378 |
-
Listen up, choombas! Before you jack into this code, you better understand the risks:
|
| 379 |
|
| 380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
| 382 |
-
|
| 383 |
|
| 384 |
-
|
| 385 |
|
| 386 |
-
|
| 387 |
|
| 388 |
-
|
|
|
|
| 389 |
|
| 390 |
-
|
| 391 |
|
| 392 |
-
|
| 393 |
|
| 394 |
-
|
| 395 |
|
| 396 |
-
|
|
|
|
| 397 |
|
| 398 |
---
|
| 399 |
-
|
| 400 |
-
<p align="center">
|
| 401 |
-
<strong>CyberScraper 2077 – Because in 2077, what makes someone a criminal? Getting caught.</strong>
|
| 402 |
-
</p>
|
| 403 |
-
|
| 404 |
-
<p align="center">
|
| 405 |
-
Built with love and chrome by the streets of Night City | © 2077 Owen Singh
|
| 406 |
-
</p>
|
|
|
|
| 1 |
---
|
| 2 |
title: Scraper Hub
|
| 3 |
+
emoji: 🕷️
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: docker
|
| 7 |
app_port: 7860
|
| 8 |
---
|
| 9 |
+
<!-- mcp-name: io.github.D4Vinci/Scrapling -->
|
| 10 |
+
|
| 11 |
+
<h1 align="center">
|
| 12 |
+
<a href="https://scrapling.readthedocs.io">
|
| 13 |
+
<picture>
|
| 14 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 15 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 16 |
+
</picture>
|
| 17 |
+
</a>
|
| 18 |
+
<br>
|
| 19 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 20 |
+
</h1>
|
| 21 |
|
| 22 |
<p align="center">
|
| 23 |
+
<a href="https://trendshift.io/repositories/14244" target="_blank"><img src="https://trendshift.io/api/badge/repositories/14244" alt="D4Vinci%2FScrapling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
| 24 |
+
<br/>
|
| 25 |
+
<a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_AR.md">العربيه</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_ES.md">Español</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_DE.md">Deutsch</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_CN.md">简体中文</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_JP.md">日本語</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_RU.md">Русский</a>
|
| 26 |
+
<br/>
|
| 27 |
+
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 28 |
+
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
| 29 |
+
<a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
|
| 30 |
+
<img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
|
| 31 |
+
<a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
|
| 32 |
+
<img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
|
| 33 |
+
<br/>
|
| 34 |
+
<a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
|
| 35 |
+
<img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
|
| 36 |
+
</a>
|
| 37 |
+
<a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
|
| 38 |
+
<img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
|
| 39 |
+
</a>
|
| 40 |
+
<br/>
|
| 41 |
+
<a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
|
| 42 |
+
<img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
|
| 43 |
</p>
|
| 44 |
|
| 45 |
<p align="center">
|
| 46 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Selection methods</strong></a>
|
| 47 |
+
·
|
| 48 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Fetchers</strong></a>
|
| 49 |
+
·
|
| 50 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>Spiders</strong></a>
|
| 51 |
+
·
|
| 52 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>Proxy Rotation</strong></a>
|
| 53 |
+
·
|
| 54 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
|
| 55 |
+
·
|
| 56 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCP</strong></a>
|
| 57 |
</p>
|
| 58 |
|
| 59 |
+
Scrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
+
Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation — all in a few lines of Python. One library, zero compromises.
|
| 62 |
|
| 63 |
+
Blazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
|
| 64 |
|
| 65 |
+
```python
|
| 66 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 67 |
+
StealthyFetcher.adaptive = True
|
| 68 |
+
p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Fetch website under the radar!
|
| 69 |
+
products = p.css('.product', auto_save=True) # Scrape data that survives website design changes!
|
| 70 |
+
products = p.css('.product', adaptive=True) # Later, if the website structure changes, pass `adaptive=True` to find them!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
```
|
| 72 |
+
Or scale up to full crawls
|
| 73 |
+
```python
|
| 74 |
+
from scrapling.spiders import Spider, Response
|
| 75 |
|
| 76 |
+
class MySpider(Spider):
|
| 77 |
+
name = "demo"
|
| 78 |
+
start_urls = ["https://example.com/"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
async def parse(self, response: Response):
|
| 81 |
+
for item in response.css('.product'):
|
| 82 |
+
yield {"title": item.css('h2::text').get()}
|
|
|
|
| 83 |
|
| 84 |
+
MySpider().start()
|
| 85 |
+
```
|
|
|
|
|
|
|
| 86 |
|
| 87 |
+
# Platinum Sponsors
|
| 88 |
|
| 89 |
+
<i><sub>Do you want to be the first company to show up here? Click [here](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>
|
| 90 |
+
# Sponsors
|
| 91 |
|
| 92 |
+
<!-- sponsors -->
|
| 93 |
|
| 94 |
+
<a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
|
| 95 |
+
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 96 |
+
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
|
| 97 |
+
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 98 |
+
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 99 |
+
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
| 100 |
+
<a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
|
| 101 |
+
<a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
|
| 102 |
+
<a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
|
| 103 |
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
+
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
| 106 |
+
<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
|
| 107 |
+
<a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
|
|
|
|
| 108 |
|
| 109 |
+
<!-- /sponsors -->
|
| 110 |
|
| 111 |
+
<i><sub>Do you want to show your ad here? Click [here](https://github.com/sponsors/D4Vinci) and choose the tier that suits you!</sub></i>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
+
---
|
| 114 |
|
| 115 |
+
## Key Features
|
| 116 |
+
|
| 117 |
+
### Spiders — A Full Crawling Framework
|
| 118 |
+
- 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects.
|
| 119 |
+
- ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays.
|
| 120 |
+
- 🔄 **Multi-Session Support**: Unified interface for HTTP requests, and stealthy headless browsers in a single spider — route requests to different sessions by ID.
|
| 121 |
+
- 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off.
|
| 122 |
+
- 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats — ideal for UI, pipelines, and long-running crawls.
|
| 123 |
+
- 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic.
|
| 124 |
+
- 📦 **Built-in Export**: Export results through hooks and your own pipeline or the built-in JSON/JSONL with `result.items.to_json()` / `result.items.to_jsonl()` respectively.
|
| 125 |
+
|
| 126 |
+
### Advanced Websites Fetching with Session Support
|
| 127 |
+
- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP/3.
|
| 128 |
+
- **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium and Google's Chrome.
|
| 129 |
+
- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can easily bypass all types of Cloudflare's Turnstile/Interstitial with automation.
|
| 130 |
+
- **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
|
| 131 |
+
- **Proxy Rotation**: Built-in `ProxyRotator` with cyclic or custom rotation strategies across all session types, plus per-request proxy overrides.
|
| 132 |
+
- **Domain Blocking**: Block requests to specific domains (and their subdomains) in browser-based fetchers.
|
| 133 |
+
- **Async Support**: Complete async support across all fetchers and dedicated async session classes.
|
| 134 |
+
|
| 135 |
+
### Adaptive Scraping & AI Integration
|
| 136 |
+
- 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
|
| 137 |
+
- 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
|
| 138 |
+
- 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
|
| 139 |
+
- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 140 |
+
|
| 141 |
+
### High-Performance & battle-tested Architecture
|
| 142 |
+
- 🚀 **Lightning Fast**: Optimized performance outperforming most Python scraping libraries.
|
| 143 |
+
- 🔋 **Memory Efficient**: Optimized data structures and lazy loading for a minimal memory footprint.
|
| 144 |
+
- ⚡ **Fast JSON Serialization**: 10x faster than the standard library.
|
| 145 |
+
- 🏗️ **Battle tested**: Not only does Scrapling have 92% test coverage and full type hints coverage, but it has been used daily by hundreds of Web Scrapers over the past year.
|
| 146 |
+
|
| 147 |
+
### Developer/Web Scraper Friendly Experience
|
| 148 |
+
- 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping scripts development, like converting curl requests to Scrapling requests and viewing requests results in your browser.
|
| 149 |
+
- 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code!
|
| 150 |
+
- 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
|
| 151 |
+
- 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.
|
| 152 |
+
- 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
|
| 153 |
+
- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
|
| 154 |
+
- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. The entire codebase is automatically scanned with **PyRight** and **MyPy** with each change.
|
| 155 |
+
- 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
|
| 156 |
+
|
| 157 |
+
## Getting Started
|
| 158 |
+
|
| 159 |
+
Let's give you a quick glimpse of what Scrapling can do without deep diving.
|
| 160 |
+
|
| 161 |
+
### Basic Usage
|
| 162 |
+
HTTP requests with session support
|
| 163 |
+
```python
|
| 164 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
| 165 |
|
| 166 |
+
with FetcherSession(impersonate='chrome') as session: # Use latest version of Chrome's TLS fingerprint
|
| 167 |
+
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 168 |
+
quotes = page.css('.quote .text::text').getall()
|
| 169 |
|
| 170 |
+
# Or use one-off requests
|
| 171 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 172 |
+
quotes = page.css('.quote .text::text').getall()
|
| 173 |
+
```
|
| 174 |
+
Advanced stealth mode
|
| 175 |
+
```python
|
| 176 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 177 |
|
| 178 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # Keep the browser open until you finish
|
| 179 |
+
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 180 |
+
data = page.css('#padded_content a').getall()
|
| 181 |
|
| 182 |
+
# Or use one-off request style, it opens the browser for this request, then closes it after finishing
|
| 183 |
+
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 184 |
+
data = page.css('#padded_content a').getall()
|
| 185 |
+
```
|
| 186 |
+
Full browser automation
|
| 187 |
+
```python
|
| 188 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 189 |
|
| 190 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Keep the browser open until you finish
|
| 191 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 192 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # XPath selector if you prefer it
|
| 193 |
|
| 194 |
+
# Or use one-off request style, it opens the browser for this request, then closes it after finishing
|
| 195 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 196 |
+
data = page.css('.quote .text::text').getall()
|
| 197 |
+
```
|
| 198 |
|
| 199 |
+
### Spiders
|
| 200 |
+
Build full crawlers with concurrent requests, multiple session types, and pause/resume:
|
| 201 |
+
```python
|
| 202 |
+
from scrapling.spiders import Spider, Request, Response
|
| 203 |
+
|
| 204 |
+
class QuotesSpider(Spider):
|
| 205 |
+
name = "quotes"
|
| 206 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 207 |
+
concurrent_requests = 10
|
| 208 |
+
|
| 209 |
+
async def parse(self, response: Response):
|
| 210 |
+
for quote in response.css('.quote'):
|
| 211 |
+
yield {
|
| 212 |
+
"text": quote.css('.text::text').get(),
|
| 213 |
+
"author": quote.css('.author::text').get(),
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
next_page = response.css('.next a')
|
| 217 |
+
if next_page:
|
| 218 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 219 |
+
|
| 220 |
+
result = QuotesSpider().start()
|
| 221 |
+
print(f"Scraped {len(result.items)} quotes")
|
| 222 |
+
result.items.to_json("quotes.json")
|
| 223 |
```
|
| 224 |
+
Use multiple session types in a single spider:
|
| 225 |
+
```python
|
| 226 |
+
from scrapling.spiders import Spider, Request, Response
|
| 227 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 228 |
+
|
| 229 |
+
class MultiSessionSpider(Spider):
|
| 230 |
+
name = "multi"
|
| 231 |
+
start_urls = ["https://example.com/"]
|
| 232 |
+
|
| 233 |
+
def configure_sessions(self, manager):
|
| 234 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 235 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 236 |
+
|
| 237 |
+
async def parse(self, response: Response):
|
| 238 |
+
for link in response.css('a::attr(href)').getall():
|
| 239 |
+
# Route protected pages through the stealth session
|
| 240 |
+
if "protected" in link:
|
| 241 |
+
yield Request(link, sid="stealth")
|
| 242 |
+
else:
|
| 243 |
+
yield Request(link, sid="fast", callback=self.parse) # explicit callback
|
| 244 |
```
|
| 245 |
+
Pause and resume long crawls with checkpoints by running the spider like this:
|
| 246 |
+
```python
|
| 247 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
```
|
| 249 |
+
Press Ctrl+C to pause gracefully — progress is saved automatically. Later, when you start the spider again, pass the same `crawldir`, and it will resume from where it stopped.
|
| 250 |
|
| 251 |
+
### Advanced Parsing & Navigation
|
| 252 |
+
```python
|
| 253 |
+
from scrapling.fetchers import Fetcher
|
| 254 |
+
|
| 255 |
+
# Rich element selection and navigation
|
| 256 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 257 |
+
|
| 258 |
+
# Get quotes with multiple selection methods
|
| 259 |
+
quotes = page.css('.quote') # CSS selector
|
| 260 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 261 |
+
quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup-style
|
| 262 |
+
# Same as
|
| 263 |
+
quotes = page.find_all('div', class_='quote')
|
| 264 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 265 |
+
quotes = page.find_all(class_='quote') # and so on...
|
| 266 |
+
# Find element by text content
|
| 267 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 268 |
+
|
| 269 |
+
# Advanced navigation
|
| 270 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 271 |
+
quote_text = page.css('.quote').css('.text::text').getall() # Chained selectors
|
| 272 |
+
first_quote = page.css('.quote')[0]
|
| 273 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 274 |
+
parent_container = first_quote.parent
|
| 275 |
+
|
| 276 |
+
# Element relationships and similarity
|
| 277 |
+
similar_elements = first_quote.find_similar()
|
| 278 |
+
below_elements = first_quote.below_elements()
|
| 279 |
+
```
|
| 280 |
+
You can use the parser right away if you don't want to fetch websites like below:
|
| 281 |
+
```python
|
| 282 |
+
from scrapling.parser import Selector
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
|
| 284 |
+
page = Selector("<html>...</html>")
|
| 285 |
+
```
|
| 286 |
+
And it works precisely the same way!
|
| 287 |
|
| 288 |
+
### Async Session Management Examples
|
| 289 |
```python
|
| 290 |
+
import asyncio
|
| 291 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 292 |
+
|
| 293 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession` is context-aware and can work in both sync/async patterns
|
| 294 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
| 295 |
+
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 296 |
+
|
| 297 |
+
# Async session usage
|
| 298 |
+
async with AsyncStealthySession(max_pages=2) as session:
|
| 299 |
+
tasks = []
|
| 300 |
+
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 301 |
+
|
| 302 |
+
for url in urls:
|
| 303 |
+
task = session.fetch(url)
|
| 304 |
+
tasks.append(task)
|
| 305 |
+
|
| 306 |
+
print(session.get_pool_stats()) # Optional - The status of the browser tabs pool (busy/free/error)
|
| 307 |
+
results = await asyncio.gather(*tasks)
|
| 308 |
+
print(session.get_pool_stats())
|
| 309 |
```
|
| 310 |
|
| 311 |
+
## CLI & Interactive Shell
|
| 312 |
|
| 313 |
+
Scrapling includes a powerful command-line interface:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
+
[](https://asciinema.org/a/736339)
|
| 316 |
|
| 317 |
+
Launch the interactive Web Scraping shell
|
| 318 |
```bash
|
| 319 |
+
scrapling shell
|
|
|
|
|
|
|
|
|
|
| 320 |
```
|
| 321 |
+
Extract pages to a file directly without programming (Extracts the content inside the `body` tag by default). If the output file ends with `.txt`, then the text content of the target will be extracted. If it ends in `.md`, it will be a Markdown representation of the HTML content; if it ends in `.html`, it will be the HTML content itself.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
```bash
|
| 323 |
+
scrapling extract get 'https://example.com' content.md
|
| 324 |
+
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # All elements matching the CSS selector '#fromSkipToProducts'
|
| 325 |
+
scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
|
| 326 |
+
scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
|
|
|
|
| 327 |
```
|
| 328 |
|
| 329 |
+
> [!NOTE]
|
| 330 |
+
> There are many additional features, but we want to keep this page concise, including the MCP server and the interactive Web Scraping Shell. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
|
| 331 |
|
| 332 |
+
## Performance Benchmarks
|
| 333 |
|
| 334 |
+
Scrapling isn't just powerful—it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries.
|
| 335 |
|
| 336 |
+
### Text Extraction Speed Test (5000 nested elements)
|
| 337 |
|
| 338 |
+
| # | Library | Time (ms) | vs Scrapling |
|
| 339 |
+
|---|:-----------------:|:---------:|:------------:|
|
| 340 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 341 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01x |
|
| 342 |
+
| 3 | Raw Lxml | 2.54 | 1.257x |
|
| 343 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 344 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 345 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 346 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 347 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 348 |
|
|
|
|
| 349 |
|
| 350 |
+
### Element Similarity & Text Search Performance
|
| 351 |
|
| 352 |
+
Scrapling's adaptive element finding capabilities significantly outperform alternatives:
|
| 353 |
|
| 354 |
+
| Library | Time (ms) | vs Scrapling |
|
| 355 |
+
|-------------|:---------:|:------------:|
|
| 356 |
+
| Scrapling | 2.39 | 1.0x |
|
| 357 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 358 |
|
|
|
|
|
|
|
| 359 |
|
| 360 |
+
> All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
|
| 361 |
|
| 362 |
+
## Installation
|
| 363 |
|
| 364 |
+
Scrapling requires Python 3.10 or higher:
|
| 365 |
|
| 366 |
+
```bash
|
| 367 |
+
pip install scrapling
|
| 368 |
+
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
|
| 370 |
+
This installation only includes the parser engine and its dependencies, without any fetchers or command-line dependencies.
|
| 371 |
+
|
| 372 |
+
### Optional Dependencies
|
| 373 |
+
|
| 374 |
+
1. If you are going to use any of the extra features below, the fetchers, or their classes, you will need to install fetchers' dependencies and their browser dependencies as follows:
|
| 375 |
+
```bash
|
| 376 |
+
pip install "scrapling[fetchers]"
|
| 377 |
+
|
| 378 |
+
scrapling install # normal install
|
| 379 |
+
scrapling install --force # force reinstall
|
| 380 |
+
```
|
| 381 |
+
|
| 382 |
+
This downloads all browsers, along with their system dependencies and fingerprint manipulation dependencies.
|
| 383 |
+
|
| 384 |
+
Or you can install them from the code instead of running a command like this:
|
| 385 |
+
```python
|
| 386 |
+
from scrapling.cli import install
|
| 387 |
+
|
| 388 |
+
install([], standalone_mode=False) # normal install
|
| 389 |
+
install(["--force"], standalone_mode=False) # force reinstall
|
| 390 |
+
```
|
| 391 |
+
|
| 392 |
+
2. Extra features:
|
| 393 |
+
- Install the MCP server feature:
|
| 394 |
+
```bash
|
| 395 |
+
pip install "scrapling[ai]"
|
| 396 |
+
```
|
| 397 |
+
- Install shell features (Web Scraping shell and the `extract` command):
|
| 398 |
+
```bash
|
| 399 |
+
pip install "scrapling[shell]"
|
| 400 |
+
```
|
| 401 |
+
- Install everything:
|
| 402 |
+
```bash
|
| 403 |
+
pip install "scrapling[all]"
|
| 404 |
+
```
|
| 405 |
+
Remember that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already)
|
| 406 |
+
|
| 407 |
+
### Docker
|
| 408 |
+
You can also install a Docker image with all extras and browsers with the following command from DockerHub:
|
| 409 |
+
```bash
|
| 410 |
+
docker pull pyd4vinci/scrapling
|
| 411 |
+
```
|
| 412 |
+
Or download it from the GitHub registry:
|
| 413 |
+
```bash
|
| 414 |
+
docker pull ghcr.io/d4vinci/scrapling:latest
|
| 415 |
+
```
|
| 416 |
+
This image is automatically built and pushed using GitHub Actions and the repository's main branch.
|
| 417 |
|
| 418 |
+
## Contributing
|
| 419 |
|
| 420 |
+
We welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started.
|
| 421 |
|
| 422 |
+
## Disclaimer
|
| 423 |
|
| 424 |
+
> [!CAUTION]
|
| 425 |
+
> This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect the terms of service of websites and robots.txt files.
|
| 426 |
|
| 427 |
+
## License
|
| 428 |
|
| 429 |
+
This work is licensed under the BSD-3-Clause License.
|
| 430 |
|
| 431 |
+
## Acknowledgments
|
| 432 |
|
| 433 |
+
This project includes code adapted from:
|
| 434 |
+
- Parsel (BSD License)—Used for [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule
|
| 435 |
|
| 436 |
---
|
| 437 |
+
<div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ROADMAP.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## TODOs
|
| 2 |
+
- [x] Add more tests and increase the code coverage.
|
| 3 |
+
- [x] Structure the tests folder in a better way.
|
| 4 |
+
- [x] Add more documentation.
|
| 5 |
+
- [x] Add the browsing ability.
|
| 6 |
+
- [x] Create detailed documentation for the 'readthedocs' website, preferably add GitHub action for deploying it.
|
| 7 |
+
- [ ] Create a Scrapy plugin/decorator to make it replace parsel in the response argument when needed.
|
| 8 |
+
- [x] Need to add more functionality to `AttributesHandler` and more navigation functions to `Selector` object (ex: functions similar to map, filter, and reduce functions but here pass it to the element and the function is executed on children, siblings, next elements, etc...)
|
| 9 |
+
- [x] Add `.filter` method to `Selectors` object and other similar methods.
|
| 10 |
+
- [ ] Add functionality to automatically detect pagination URLs
|
| 11 |
+
- [ ] Add the ability to auto-detect schemas in pages and manipulate them.
|
| 12 |
+
- [ ] Add `analyzer` ability that tries to learn about the page through meta-elements and return what it learned
|
| 13 |
+
- [ ] Add the ability to generate a regex from a group of elements (Like for all href attributes)
|
| 14 |
+
-
|
benchmarks.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import functools
|
| 2 |
+
import time
|
| 3 |
+
import timeit
|
| 4 |
+
from statistics import mean
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
from autoscraper import AutoScraper
|
| 8 |
+
from bs4 import BeautifulSoup
|
| 9 |
+
from lxml import etree, html
|
| 10 |
+
from mechanicalsoup import StatefulBrowser
|
| 11 |
+
from parsel import Selector
|
| 12 |
+
from pyquery import PyQuery as pq
|
| 13 |
+
from selectolax.parser import HTMLParser
|
| 14 |
+
|
| 15 |
+
from scrapling import Selector as ScraplingSelector
|
| 16 |
+
|
| 17 |
+
# Synthetic benchmark fixture: an HTML document containing 5000 nested
# `<div class="item">` elements. All the parsing benchmarks below run
# against this same string so their timings are directly comparable.
large_html = (
    "<html><body>" + '<div class="item">' * 5000 + "</div>" * 5000 + "</body></html>"
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def benchmark(func):
|
| 23 |
+
@functools.wraps(func)
|
| 24 |
+
def wrapper(*args, **kwargs):
|
| 25 |
+
benchmark_name = func.__name__.replace("test_", "").replace("_", " ")
|
| 26 |
+
print(f"-> {benchmark_name}", end=" ", flush=True)
|
| 27 |
+
# Warm-up phase
|
| 28 |
+
timeit.repeat(
|
| 29 |
+
lambda: func(*args, **kwargs), number=2, repeat=2, globals=globals()
|
| 30 |
+
)
|
| 31 |
+
# Measure time (1 run, repeat 100 times, take average)
|
| 32 |
+
times = timeit.repeat(
|
| 33 |
+
lambda: func(*args, **kwargs),
|
| 34 |
+
number=1,
|
| 35 |
+
repeat=100,
|
| 36 |
+
globals=globals(),
|
| 37 |
+
timer=time.process_time,
|
| 38 |
+
)
|
| 39 |
+
min_time = round(mean(times) * 1000, 2) # Convert to milliseconds
|
| 40 |
+
print(f"average execution time: {min_time} ms")
|
| 41 |
+
return min_time
|
| 42 |
+
|
| 43 |
+
return wrapper
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@benchmark
def test_lxml():
    """Benchmark raw lxml: parse the document and collect `.item` text nodes."""
    # Scrapling and Parsel use this same parser internally, so matching the
    # parser configuration keeps the comparison fair.
    fair_parser = html.HTMLParser(recover=True, huge_tree=True)
    tree = etree.fromstring(large_html, parser=fair_parser)
    return [element.text for element in tree.cssselect(".item")]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@benchmark
def test_bs4_lxml():
    """Benchmark BeautifulSoup backed by the lxml parser."""
    soup = BeautifulSoup(large_html, "lxml")
    return [tag.text for tag in soup.select(".item")]
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@benchmark
def test_bs4_html5lib():
    """Benchmark BeautifulSoup backed by the (pure-Python) html5lib parser."""
    soup = BeautifulSoup(large_html, "html5lib")
    return [tag.text for tag in soup.select(".item")]
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@benchmark
def test_pyquery():
    """Benchmark PyQuery (a jQuery-like API on top of lxml)."""
    document = pq(large_html)
    return [element.text() for element in document(".item").items()]
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
@benchmark
def test_scrapling():
    """Benchmark Scrapling's selector with adaptive matching disabled."""
    # The `::text` pseudo-element extracts text directly (no `.extract()`
    # step as in parsel), and is faster than iterating over
    # `Selector(large_html, adaptive=False).css('.item')` and reading
    # each element's `.text` attribute.
    selector = ScraplingSelector(large_html, adaptive=False)
    return selector.css(".item::text").getall()
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
@benchmark
def test_parsel():
    """Benchmark parsel (the selector library used by Scrapy)."""
    selector = Selector(text=large_html)
    return selector.css(".item::text").extract()
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
@benchmark
def test_mechanicalsoup():
    """Benchmark MechanicalSoup (BeautifulSoup under the hood)."""
    browser = StatefulBrowser()
    browser.open_fake_page(large_html)
    return [element.text for element in browser.page.select(".item")]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@benchmark
def test_selectolax():
    """Benchmark selectolax's HTML parser."""
    tree = HTMLParser(large_html)
    return [node.text() for node in tree.css(".item")]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def display(results):
    """Print benchmark results ranked from fastest to slowest.

    Each row shows the library name, its average runtime in milliseconds,
    and how many times slower it is than Scrapling (the baseline entry,
    which must be present in *results* under the key "Scrapling").
    """
    baseline = results["Scrapling"]
    ranked = sorted(results.items(), key=lambda pair: pair[1])  # fastest first
    print("\nRanked Results (fastest to slowest):")
    print(f" i. {'Library tested':<18} | {'avg. time (ms)':<15} | vs Scrapling")
    print("-" * 50)
    for rank, (library, elapsed) in enumerate(ranked, 1):
        ratio = round(elapsed / baseline, 3)
        print(f" {rank}. {library:<18} | {str(elapsed):<15} | {ratio}")
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
@benchmark
def test_scrapling_text(request_html):
    """Benchmark Scrapling: locate one element by its text, then collect similar elements."""
    selector = ScraplingSelector(request_html, adaptive=False)
    anchor = selector.find_by_text("Tipping the Velvet", first_match=True, clean_match=False)
    return anchor.find_similar(ignore_attributes=["title"])
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
@benchmark
def test_autoscraper(request_html):
    """Benchmark AutoScraper model building."""
    # autoscraper by default returns elements text
    scraper = AutoScraper()
    return scraper.build(html=request_html, wanted_list=["Tipping the Velvet"])
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
if __name__ == "__main__":
    print(
        " Benchmark: Speed of parsing and retrieving the text content of 5000 nested elements \n"
    )
    # All parsing benchmarks run against the shared `large_html` document.
    parsing_results = {
        "Raw Lxml": test_lxml(),
        "Parsel/Scrapy": test_parsel(),
        "Scrapling": test_scrapling(),
        "Selectolax": test_selectolax(),
        "PyQuery": test_pyquery(),
        "BS4 with Lxml": test_bs4_lxml(),
        "MechanicalSoup": test_mechanicalsoup(),
        "BS4 with html5lib": test_bs4_html5lib(),
    }

    display(parsing_results)
    print("\n" + "=" * 25)
    # The text-search benchmarks run against a real page, fetched once here.
    page = requests.get("https://books.toscrape.com/index.html")
    print(
        " Benchmark: Speed of searching for an element by text content, and retrieving the text of similar elements\n"
    )
    search_results = {
        "Scrapling": test_scrapling_text(page.text),
        "AutoScraper": test_autoscraper(page.text),
    }
    display(search_results)
|
cleanup.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import shutil
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# Clean up after installing for local development
|
| 6 |
+
def clean():
    """Remove build artifacts left over after installing for local development.

    Deletes top-level build directories/files (``build``, ``dist``,
    ``*.egg-info``, ...) under the current working directory, then removes
    stray compiled bytecode files (``*.pyc``/``*.pyo``) anywhere beneath it.
    Failures are reported but never raised.
    """
    # Everything is resolved relative to the current working directory.
    root = Path.cwd()

    # Top-level directories and glob patterns to clean.
    artifact_patterns = (
        "build",
        "dist",
        "*.egg-info",
        "__pycache__",
        ".eggs",
        ".pytest_cache",
    )

    for artifact_pattern in artifact_patterns:
        for target in root.glob(artifact_pattern):
            try:
                if target.is_dir():
                    shutil.rmtree(target)
                else:
                    target.unlink()
                print(f"Removed: {target}")
            except Exception as error:
                print(f"Could not remove {target}: {error}")

    # Delete compiled Python files anywhere under the project tree.
    for compiled_file in root.rglob("*.py[co]"):
        try:
            compiled_file.unlink()
            print(f"Removed compiled file: {compiled_file}")
        except Exception as error:
            print(f"Could not remove {compiled_file}: {error}")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Script entry point: run the cleanup when executed directly
# (e.g. `python cleanup.py`), but not when imported as a module.
if __name__ == "__main__":
    clean()
|
docs/README_AR.md
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- mcp-name: io.github.D4Vinci/Scrapling -->
|
| 2 |
+
|
| 3 |
+
<h1 align="center">
|
| 4 |
+
<a href="https://scrapling.readthedocs.io">
|
| 5 |
+
<picture>
|
| 6 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 7 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 8 |
+
</picture>
|
| 9 |
+
</a>
|
| 10 |
+
<br>
|
| 11 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 12 |
+
</h1>
|
| 13 |
+
|
| 14 |
+
<p align="center">
|
| 15 |
+
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 16 |
+
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
| 17 |
+
<a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
|
| 18 |
+
<img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
|
| 19 |
+
<a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
|
| 20 |
+
<img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
|
| 21 |
+
<br/>
|
| 22 |
+
<a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
|
| 23 |
+
<img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
|
| 24 |
+
</a>
|
| 25 |
+
<a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
|
| 26 |
+
<img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
|
| 27 |
+
</a>
|
| 28 |
+
<br/>
|
| 29 |
+
<a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
|
| 30 |
+
<img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
|
| 31 |
+
</p>
|
| 32 |
+
|
| 33 |
+
<p align="center">
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>طرق الاختيار</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>اختيار Fetcher</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>العناكب</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>تدوير البروكسي</strong></a>
|
| 41 |
+
·
|
| 42 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>واجهة سطر الأوامر</strong></a>
|
| 43 |
+
·
|
| 44 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>وضع MCP</strong></a>
|
| 45 |
+
</p>
|
| 46 |
+
|
| 47 |
+
Scrapling هو إطار عمل تكيفي لـ Web Scraping يتعامل مع كل شيء من طلب واحد إلى زحف كامل النطاق.
|
| 48 |
+
|
| 49 |
+
محلله يتعلم من تغييرات المواقع ويعيد تحديد موقع عناصرك تلقائياً عند تحديث الصفحات. جوالبه تتجاوز أنظمة مكافحة الروبوتات مثل Cloudflare Turnstile مباشرةً. وإطار عمل Spider الخاص به يتيح لك التوسع إلى عمليات زحف متزامنة ومتعددة الجلسات مع إيقاف/استئناف وتدوير تلقائي لـ Proxy - كل ذلك في بضعة أسطر من Python. مكتبة واحدة، بدون تنازلات.
|
| 50 |
+
|
| 51 |
+
زحف سريع للغاية مع إحصائيات فورية و Streaming. مبني بواسطة مستخرجي الويب لمستخرجي الويب والمستخدمين العاديين، هناك شيء للجميع.
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 55 |
+
StealthyFetcher.adaptive = True
|
| 56 |
+
p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # احصل على الموقع بشكل خفي!
|
| 57 |
+
products = p.css('.product', auto_save=True) # استخرج بيانات تنجو من تغييرات تصميم الموقع!
|
| 58 |
+
products = p.css('.product', adaptive=True) # لاحقاً، إذا تغيرت بنية الموقع، مرر `adaptive=True` للعثور عليها!
|
| 59 |
+
```
|
| 60 |
+
أو توسع إلى عمليات زحف كاملة
|
| 61 |
+
```python
|
| 62 |
+
from scrapling.spiders import Spider, Response
|
| 63 |
+
|
| 64 |
+
class MySpider(Spider):
|
| 65 |
+
name = "demo"
|
| 66 |
+
start_urls = ["https://example.com/"]
|
| 67 |
+
|
| 68 |
+
async def parse(self, response: Response):
|
| 69 |
+
for item in response.css('.product'):
|
| 70 |
+
yield {"title": item.css('h2::text').get()}
|
| 71 |
+
|
| 72 |
+
MySpider().start()
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# الرعاة البلاتينيون
|
| 77 |
+
|
| 78 |
+
<i><sub>هل تريد أن تكون أول شركة تظهر هنا؟ انقر [هنا](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>
|
| 79 |
+
# الرعاة
|
| 80 |
+
|
| 81 |
+
<!-- sponsors -->
|
| 82 |
+
|
| 83 |
+
<a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
|
| 84 |
+
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 85 |
+
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
|
| 86 |
+
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 87 |
+
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 88 |
+
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
| 89 |
+
<a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
|
| 90 |
+
<a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
|
| 91 |
+
<a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
| 95 |
+
<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
|
| 96 |
+
<a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
|
| 97 |
+
|
| 98 |
+
<!-- /sponsors -->
|
| 99 |
+
|
| 100 |
+
<i><sub>هل تريد عرض إعلانك هنا؟ انقر [هنا](https://github.com/sponsors/D4Vinci) واختر المستوى الذي يناسبك!</sub></i>
|
| 101 |
+
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
## الميزات الرئيسية
|
| 105 |
+
|
| 106 |
+
### Spiders — إطار عمل زحف كامل
|
| 107 |
+
- 🕷️ **واجهة Spider شبيهة بـ Scrapy**: عرّف Spiders مع `start_urls`، و async `parse` callbacks، وكائنات `Request`/`Response`.
|
| 108 |
+
- ⚡ **زحف متزامن**: حدود تزامن قابلة للتكوين، وتحكم بالسرعة حسب النطاق، وتأخيرات التنزيل.
|
| 109 |
+
- 🔄 **دعم الجلسات المتعددة**: واجهة موحدة لطلبات HTTP، ومتصفحات خفية بدون واجهة في Spider واحد — وجّه الطلبات إلى جلسات مختلفة بالمعرّف.
|
| 110 |
+
- 💾 **إيقاف واستئناف**: استمرارية الزحف القائمة على Checkpoint. اضغط Ctrl+C للإيقاف بسلاسة؛ أعد التشغيل للاستئناف من حيث توقفت.
|
| 111 |
+
- 📡 **وضع Streaming**: بث العناصر المستخرجة فور وصولها عبر `async for item in spider.stream()` مع إحصائيات فورية — مثالي لواجهات المستخدم وخطوط الأنابيب وعمليات الزحف الطويلة.
|
| 112 |
+
- 🛡️ **كشف الطلبات المحظورة**: كشف تلقائي وإعادة محاولة للطلبات المحظورة مع منطق قابل للتخصيص.
|
| 113 |
+
- 📦 **تصدير مدمج**: صدّر النتائج عبر الخطافات وخط الأنابيب الخاص بك أو JSON/JSONL المدمج مع `result.items.to_json()` / `result.items.to_jsonl()` على التوالي.
|
| 114 |
+
|
| 115 |
+
### جلب متقدم للمواقع مع دعم الجلسات
|
| 116 |
+
- **طلبات HTTP**: طلبات HTTP سريعة وخفية مع فئة `Fetcher`. يمكنها تقليد بصمة TLS للمتصفح والرؤوس واستخدام HTTP/3.
|
| 117 |
+
- **التحميل الديناميكي**: جلب المواقع الديناميكية مع أتمتة كاملة للمتصفح من خلال فئة `DynamicFetcher` التي تدعم Chromium من Playwright و Google Chrome.
|
| 118 |
+
- **تجاوز مكافحة الروبوتات**: قدرات تخفي متقدمة مع `StealthyFetcher` وانتحال fingerprint. يمكنه تجاوز جميع أنواع Turnstile/Interstitial من Cloudflare بسهولة بالأتمتة.
|
| 119 |
+
- **إدارة الجلسات**: دعم الجلسات المستمرة مع فئات `FetcherSession` و`StealthySession` و`DynamicSession` لإدارة ملفات تعريف الارتباط والحالة عبر الطلبات.
|
| 120 |
+
- **تدوير Proxy**: `ProxyRotator` مدمج مع استراتيجيات التدوير الدوري أو المخصصة عبر جميع أنواع الجلسات، بالإضافة إلى تجاوزات Proxy لكل طلب.
|
| 121 |
+
- **حظر النطاقات**: حظر الطلبات إلى نطاقات محددة (ونطاقاتها الفرعية) في الجوالب المعتمدة على المتصفح.
|
| 122 |
+
- **دعم Async**: دعم async كامل عبر جميع الجوالب وفئات الجلسات async المخصصة.
|
| 123 |
+
|
| 124 |
+
### الاستخراج التكيفي والتكامل مع الذكاء الاصطناعي
|
| 125 |
+
- 🔄 **تتبع العناصر الذكي**: إعادة تحديد موقع العناصر بعد تغييرات الموقع باستخدام خوارزميات التشابه الذكية.
|
| 126 |
+
- 🎯 **الاختيار المرن الذكي**: محددات CSS، محددات XPath، البحث القائم على الفلاتر، البحث النصي، البحث بالتعبيرات العادية والمزيد.
|
| 127 |
+
- 🔍 **البحث عن عناصر مشابهة**: تحديد العناصر المشابهة للعناصر الموجودة تلقائياً.
|
| 128 |
+
- 🤖 **خادم MCP للاستخدام مع الذكاء الاصطناعي**: خادم MCP مدمج لـ Web Scraping بمساعدة الذكاء الاصطناعي واستخراج البيانات. يتميز خادم MCP بقدرات قوية مخصصة تستفيد من Scrapling لاستخراج المحتوى المستهدف قبل تمريره إلى الذكاء الاصطناعي (Claude/Cursor/إلخ)، وبالتالي تسريع العمليات وتقليل التكاليف عن طريق تقليل استخدام الرموز. ([فيديو توضيحي](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 129 |
+
|
| 130 |
+
### بنية عالية الأداء ومختبرة ميدانياً
|
| 131 |
+
- 🚀 **سريع كالبرق**: أداء محسّن يتفوق على معظم مكتبات Web Scraping في Python.
|
| 132 |
+
- 🔋 **فعال في استخدام الذاكرة**: هياكل بيانات محسّنة وتحميل كسول لأقل استخدام للذاكرة.
|
| 133 |
+
- ⚡ **تسلسل JSON سريع**: أسرع 10 مرات من المكتبة القياسية.
|
| 134 |
+
- 🏗️ **مُختبر ميدانياً**: لا يمتلك Scrapling فقط تغطية اختبار بنسبة 92٪ وتغطية كاملة لتلميحات الأنواع، بل تم استخدامه يومياً من قبل مئات مستخرجي الويب خلال العام الماضي.
|
| 135 |
+
|
| 136 |
+
### تجربة صديقة للمطورين/مستخرجي الويب
|
| 137 |
+
- 🎯 **Shell تفاعلي لـ Web Scraping**: Shell IPython مدمج اختياري مع تكامل Scrapling، واختصارات، وأدوات جديدة لتسريع تطوير سكريبتات Web Scraping، مثل تحويل طلبات curl إلى طلبات Scrapling وعرض نتائج الطلبات في متصفحك.
|
| 138 |
+
- 🚀 **استخدمه مباشرة من الطرفية**: اختيارياً، يمكنك استخدام Scrapling لاستخراج عنوان URL دون كتابة سطر واحد من الكود!
|
| 139 |
+
- 🛠️ **واجهة تنقل غنية**: اجتياز DOM متقدم مع طرق التنقل بين العناصر الوالدية والشقيقة والفرعية.
|
| 140 |
+
- 🧬 **معالجة نصوص محسّنة**: تعبيرات عادية مدمجة وطرق تنظيف وعمليات نصية محسّنة.
|
| 141 |
+
- 📝 **إنشاء محددات تلقائي**: إنشاء محددات CSS/XPath قوية لأي عنصر.
|
| 142 |
+
- 🔌 **واجهة مألوفة**: مشابه لـ Scrapy/BeautifulSoup مع نفس العناصر الزائفة المستخدمة في Scrapy/Parsel.
|
| 143 |
+
- 📘 **تغطية كاملة للأنواع**: تلميحات نوع كاملة لدعم IDE ممتاز وإكمال الكود. يتم فحص قاعدة الكود بالكامل تلقائياً بواسطة **PyRight** و**MyPy** مع كل تغيير.
|
| 144 |
+
- 🔋 **صورة Docker جاهزة**: مع كل إصدار، يتم بناء ودفع صورة Docker تحتوي على جميع المتصفحات تلقائياً.
|
| 145 |
+
|
| 146 |
+
## البدء
|
| 147 |
+
|
| 148 |
+
لنلقِ نظرة سريعة على ما يمكن لـ Scrapling فعله دون التعمق.
|
| 149 |
+
|
| 150 |
+
### الاستخدام الأساسي
|
| 151 |
+
طلبات HTTP مع دعم الجلسات
|
| 152 |
+
```python
|
| 153 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
| 154 |
+
|
| 155 |
+
with FetcherSession(impersonate='chrome') as session: # استخدم أحدث إصدار من بصمة TLS لـ Chrome
|
| 156 |
+
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 157 |
+
quotes = page.css('.quote .text::text').getall()
|
| 158 |
+
|
| 159 |
+
# أو استخدم طلبات لمرة واحدة
|
| 160 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 161 |
+
quotes = page.css('.quote .text::text').getall()
|
| 162 |
+
```
|
| 163 |
+
وضع التخفي المتقدم
|
| 164 |
+
```python
|
| 165 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 166 |
+
|
| 167 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # أبقِ المتصفح مفتوحاً حتى تنتهي
|
| 168 |
+
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 169 |
+
data = page.css('#padded_content a').getall()
|
| 170 |
+
|
| 171 |
+
# أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء
|
| 172 |
+
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 173 |
+
data = page.css('#padded_content a').getall()
|
| 174 |
+
```
|
| 175 |
+
أتمتة المتصفح الكاملة
|
| 176 |
+
```python
|
| 177 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 178 |
+
|
| 179 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # أبقِ المتصفح مفتوحاً حتى تنتهي
|
| 180 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 181 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # محدد XPath إذا كنت تفضله
|
| 182 |
+
|
| 183 |
+
# أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء
|
| 184 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 185 |
+
data = page.css('.quote .text::text').getall()
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
### Spiders
|
| 189 |
+
ابنِ زواحف كاملة مع طلبات متزامنة وأنواع جلسات متعددة وإيقاف/استئناف:
|
| 190 |
+
```python
|
| 191 |
+
from scrapling.spiders import Spider, Request, Response
|
| 192 |
+
|
| 193 |
+
class QuotesSpider(Spider):
|
| 194 |
+
name = "quotes"
|
| 195 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 196 |
+
concurrent_requests = 10
|
| 197 |
+
|
| 198 |
+
async def parse(self, response: Response):
|
| 199 |
+
for quote in response.css('.quote'):
|
| 200 |
+
yield {
|
| 201 |
+
"text": quote.css('.text::text').get(),
|
| 202 |
+
"author": quote.css('.author::text').get(),
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
next_page = response.css('.next a')
|
| 206 |
+
if next_page:
|
| 207 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 208 |
+
|
| 209 |
+
result = QuotesSpider().start()
|
| 210 |
+
print(f"Scraped {len(result.items)} quotes")
|
| 211 |
+
result.items.to_json("quotes.json")
|
| 212 |
+
```
|
| 213 |
+
استخدم أنواع جلسات متعددة في Spider واحد:
|
| 214 |
+
```python
|
| 215 |
+
from scrapling.spiders import Spider, Request, Response
|
| 216 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 217 |
+
|
| 218 |
+
class MultiSessionSpider(Spider):
|
| 219 |
+
name = "multi"
|
| 220 |
+
start_urls = ["https://example.com/"]
|
| 221 |
+
|
| 222 |
+
def configure_sessions(self, manager):
|
| 223 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 224 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 225 |
+
|
| 226 |
+
async def parse(self, response: Response):
|
| 227 |
+
for link in response.css('a::attr(href)').getall():
|
| 228 |
+
# وجّه الصفحات المحمية عبر جلسة التخفي
|
| 229 |
+
if "protected" in link:
|
| 230 |
+
yield Request(link, sid="stealth")
|
| 231 |
+
else:
|
| 232 |
+
yield Request(link, sid="fast", callback=self.parse) # callback صريح
|
| 233 |
+
```
|
| 234 |
+
أوقف واستأنف عمليات الزحف الطويلة مع Checkpoints بتشغيل Spider هكذا:
|
| 235 |
+
```python
|
| 236 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
| 237 |
+
```
|
| 238 |
+
اضغط Ctrl+C للإيقاف بسلاسة — يتم حفظ التقدم تلقائياً. لاحقاً، عند تشغيل Spider مرة أخرى، مرر نفس `crawldir`، وسيستأنف من حيث توقف.
|
| 239 |
+
|
| 240 |
+
### التحليل المتقدم والتنقل
|
| 241 |
+
```python
|
| 242 |
+
from scrapling.fetchers import Fetcher
|
| 243 |
+
|
| 244 |
+
# اختيار عناصر غني وتنقل
|
| 245 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 246 |
+
|
| 247 |
+
# احصل على الاقتباسات بطرق اختيار متعددة
|
| 248 |
+
quotes = page.css('.quote') # محدد CSS
|
| 249 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 250 |
+
quotes = page.find_all('div', {'class': 'quote'}) # بأسلوب BeautifulSoup
|
| 251 |
+
# نفس الشيء مثل
|
| 252 |
+
quotes = page.find_all('div', class_='quote')
|
| 253 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 254 |
+
quotes = page.find_all(class_='quote') # وهكذا...
|
| 255 |
+
# البحث عن عنصر بمحتوى النص
|
| 256 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 257 |
+
|
| 258 |
+
# التنقل المتقدم
|
| 259 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 260 |
+
quote_text = page.css('.quote').css('.text::text').getall() # محددات متسلسلة
|
| 261 |
+
first_quote = page.css('.quote')[0]
|
| 262 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 263 |
+
parent_container = first_quote.parent
|
| 264 |
+
|
| 265 |
+
# علاقات العناصر والتشابه
|
| 266 |
+
similar_elements = first_quote.find_similar()
|
| 267 |
+
below_elements = first_quote.below_elements()
|
| 268 |
+
```
|
| 269 |
+
يمكنك استخدام المحلل مباشرة إذا كنت لا تريد جلب المواقع كما يلي:
|
| 270 |
+
```python
|
| 271 |
+
from scrapling.parser import Selector
|
| 272 |
+
|
| 273 |
+
page = Selector("<html>...</html>")
|
| 274 |
+
```
|
| 275 |
+
وهو يعمل بنفس الطريقة تماماً!
|
| 276 |
+
|
| 277 |
+
### أمثلة إدارة الجلسات بشكل Async
|
| 278 |
+
```python
|
| 279 |
+
import asyncio
|
| 280 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 281 |
+
|
| 282 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession` واعٍ بالسياق ويعمل في كلا النمطين المتزامن/async
|
| 283 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
| 284 |
+
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 285 |
+
|
| 286 |
+
# استخدام جلسة async
|
| 287 |
+
async with AsyncStealthySession(max_pages=2) as session:
|
| 288 |
+
tasks = []
|
| 289 |
+
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 290 |
+
|
| 291 |
+
for url in urls:
|
| 292 |
+
task = session.fetch(url)
|
| 293 |
+
tasks.append(task)
|
| 294 |
+
|
| 295 |
+
print(session.get_pool_stats()) # اختياري - حالة مجموعة علامات تبويب المتصفح (مشغول/حر/خطأ)
|
| 296 |
+
results = await asyncio.gather(*tasks)
|
| 297 |
+
print(session.get_pool_stats())
|
| 298 |
+
```
|
| 299 |
+
|
| 300 |
+
## واجهة سطر الأوامر والـ Shell التفاعلي
|
| 301 |
+
|
| 302 |
+
يتضمن Scrapling واجهة سطر أوامر قوية:
|
| 303 |
+
|
| 304 |
+
[](https://asciinema.org/a/736339)
|
| 305 |
+
|
| 306 |
+
تشغيل Shell الـ Web Scraping التفاعلي
|
| 307 |
+
```bash
|
| 308 |
+
scrapling shell
|
| 309 |
+
```
|
| 310 |
+
استخرج الصفحات إلى ملف مباشرة دون برمجة (يستخرج المحتوى داخل وسم `body` افتراضياً). إذا انتهى ملف الإخراج بـ `.txt`، فسيتم استخراج محتوى النص للهدف. إذا انتهى بـ `.md`، فسيكون تمثيل Markdown لمحتوى HTML؛ إذا انتهى بـ `.html`، فسيكون محتوى HTML نفسه.
|
| 311 |
+
```bash
|
| 312 |
+
scrapling extract get 'https://example.com' content.md
|
| 313 |
+
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # جميع العناصر المطابقة لمحدد CSS '#fromSkipToProducts'
|
| 314 |
+
scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
|
| 315 |
+
scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
|
| 316 |
+
```
|
| 317 |
+
|
| 318 |
+
> [!NOTE]
|
| 319 |
+
> هناك العديد من الميزات الإضافية، لكننا نريد إبقاء هذه الصفحة موجزة، بما في ذلك خادم MCP والـ Shell التفاعلي لـ Web Scraping. تحقق من الوثائق الكاملة [هنا](https://scrapling.readthedocs.io/en/latest/)
|
| 320 |
+
|
| 321 |
+
## معايير الأداء
|
| 322 |
+
|
| 323 |
+
Scrapling ليس قوياً فحسب — بل هو أيضاً سريع بشكل مذهل. تقارن المعايير التالية محلل Scrapling مع أحدث إصدارات المكتبات الشائعة الأخرى.
|
| 324 |
+
|
| 325 |
+
### اختبار سرعة استخراج النص (5000 عنصر متداخل)
|
| 326 |
+
|
| 327 |
+
| # | المكتبة | الوقت (ms) | vs Scrapling |
|
| 328 |
+
|---|:-----------------:|:----------:|:------------:|
|
| 329 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 330 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01 |
|
| 331 |
+
| 3 | Raw Lxml | 2.54 | 1.257 |
|
| 332 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 333 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 334 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 335 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 336 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
### أداء تشابه العناصر والبحث النصي
|
| 340 |
+
|
| 341 |
+
قدرات العثور على العناصر التكيفية لـ Scrapling تتفوق بشكل كبير على البدائل:
|
| 342 |
+
|
| 343 |
+
| المكتبة | الوقت (ms) | vs Scrapling |
|
| 344 |
+
|-------------|:----------:|:------------:|
|
| 345 |
+
| Scrapling | 2.39 | 1.0x |
|
| 346 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
> تمثل جميع المعايير متوسطات أكثر من 100 تشغيل. انظر [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) للمنهجية.
|
| 350 |
+
|
| 351 |
+
## التثبيت
|
| 352 |
+
|
| 353 |
+
يتطلب Scrapling إصدار Python 3.10 أو أعلى:
|
| 354 |
+
|
| 355 |
+
```bash
|
| 356 |
+
pip install scrapling
|
| 357 |
+
```
|
| 358 |
+
|
| 359 |
+
يتضمن هذا التثبيت فقط محرك المحلل وتبعياته، بدون أي جوالب أو تبعيات سطر الأوامر.
|
| 360 |
+
|
| 361 |
+
### التبعيات الاختيارية
|
| 362 |
+
|
| 363 |
+
1. إذا كنت ستستخدم أياً من الميزات الإضافية أدناه، أو الجوالب، أو فئاتها، فستحتاج إلى تثبيت تبعيات الجوالب وتبعيات المتصفح الخاصة بها على النحو التالي:
|
| 364 |
+
```bash
|
| 365 |
+
pip install "scrapling[fetchers]"
|
| 366 |
+
|
| 367 |
+
scrapling install # normal install
|
| 368 |
+
scrapling install --force # force reinstall
|
| 369 |
+
```
|
| 370 |
+
|
| 371 |
+
يقوم هذا بتنزيل جميع المتصفحات، إلى جانب تبعيات النظام وتبعيات معالجة fingerprint الخاصة بها.
|
| 372 |
+
|
| 373 |
+
أو يمكنك تثبيتها من الكود بدلاً من تشغيل أمر كالتالي:
|
| 374 |
+
```python
|
| 375 |
+
from scrapling.cli import install
|
| 376 |
+
|
| 377 |
+
install([], standalone_mode=False) # normal install
|
| 378 |
+
install(["--force"], standalone_mode=False) # force reinstall
|
| 379 |
+
```
|
| 380 |
+
|
| 381 |
+
2. ميزات إضافية:
|
| 382 |
+
- تثبيت ميزة خادم MCP:
|
| 383 |
+
```bash
|
| 384 |
+
pip install "scrapling[ai]"
|
| 385 |
+
```
|
| 386 |
+
- تثبيت ميزات Shell (Shell الـ Web Scraping وأمر `extract`):
|
| 387 |
+
```bash
|
| 388 |
+
pip install "scrapling[shell]"
|
| 389 |
+
```
|
| 390 |
+
- تثبيت كل شيء:
|
| 391 |
+
```bash
|
| 392 |
+
pip install "scrapling[all]"
|
| 393 |
+
```
|
| 394 |
+
تذكر أنك تحتاج إلى تثبيت تبعيات المتصفح مع `scrapling install` بعد أي من هذه الإضافات (إذا لم تكن قد فعلت ذلك بالفعل)
|
| 395 |
+
|
| 396 |
+
### Docker
|
| 397 |
+
يمكنك أيضاً تثبيت صورة Docker مع جميع الإضافات والمتصفحات باستخدام الأمر التالي من DockerHub:
|
| 398 |
+
```bash
|
| 399 |
+
docker pull pyd4vinci/scrapling
|
| 400 |
+
```
|
| 401 |
+
أو تنزيلها من سجل GitHub:
|
| 402 |
+
```bash
|
| 403 |
+
docker pull ghcr.io/d4vinci/scrapling:latest
|
| 404 |
+
```
|
| 405 |
+
يتم بناء هذه الصورة ودفعها تلقائياً باستخدام GitHub Actions والفرع الرئيسي للمستودع.
|
| 406 |
+
|
| 407 |
+
## المساهمة
|
| 408 |
+
|
| 409 |
+
نرحب بالمساهمات! يرجى قراءة [إرشادات المساهمة](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) قبل البدء.
|
| 410 |
+
|
| 411 |
+
## إخلاء المسؤولية
|
| 412 |
+
|
| 413 |
+
> [!CAUTION]
|
| 414 |
+
> يتم توفير هذه المكتبة للأغراض التعليمية والبحثية فقط. باستخدام هذه المكتبة، فإنك توافق على الامتثال لقوانين استخراج البيانات والخصوصية المحلية والدولية. المؤلفون والمساهمون غير مسؤولين عن أي إساءة استخدام لهذا البرنامج. احترم دائماً شروط خدمة المواقع وملفات robots.txt.
|
| 415 |
+
|
| 416 |
+
## الترخيص
|
| 417 |
+
|
| 418 |
+
هذا العمل مرخص بموجب ترخيص BSD-3-Clause.
|
| 419 |
+
|
| 420 |
+
## الشكر والتقدير
|
| 421 |
+
|
| 422 |
+
يتضمن هذا المشروع كوداً معدلاً من:
|
| 423 |
+
- Parsel (ترخيص BSD) — يُستخدم للوحدة الفرعية [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
|
| 424 |
+
|
| 425 |
+
---
|
| 426 |
+
<div align="center"><small>مصمم ومصنوع بـ ❤️ بواسطة كريم شعير.</small></div><br>
|
docs/README_CN.md
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- mcp-name: io.github.D4Vinci/Scrapling -->
|
| 2 |
+
|
| 3 |
+
<h1 align="center">
|
| 4 |
+
<a href="https://scrapling.readthedocs.io">
|
| 5 |
+
<picture>
|
| 6 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 7 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 8 |
+
</picture>
|
| 9 |
+
</a>
|
| 10 |
+
<br>
|
| 11 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 12 |
+
</h1>
|
| 13 |
+
|
| 14 |
+
<p align="center">
|
| 15 |
+
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 16 |
+
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
| 17 |
+
<a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
|
| 18 |
+
<img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
|
| 19 |
+
<a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
|
| 20 |
+
<img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
|
| 21 |
+
<br/>
|
| 22 |
+
<a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
|
| 23 |
+
<img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
|
| 24 |
+
</a>
|
| 25 |
+
<a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
|
| 26 |
+
<img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
|
| 27 |
+
</a>
|
| 28 |
+
<br/>
|
| 29 |
+
<a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
|
| 30 |
+
<img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
|
| 31 |
+
</p>
|
| 32 |
+
|
| 33 |
+
<p align="center">
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>选择方法</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>选择Fetcher</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>爬虫</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>代理轮换</strong></a>
|
| 41 |
+
·
|
| 42 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
|
| 43 |
+
·
|
| 44 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCP模式</strong></a>
|
| 45 |
+
</p>
|
| 46 |
+
|
| 47 |
+
Scrapling是一个自适应Web Scraping框架,能处理从单个请求到大规模爬取的一切需求。
|
| 48 |
+
|
| 49 |
+
它的解析器能够从网站变化中学习,并在页面更新时自动重新定位您的元素。它的Fetcher能够开箱即用地绕过Cloudflare Turnstile等反机器人系统。它的Spider框架让您可以扩展到并发、多Session爬取,支持暂停/恢复和自动Proxy轮换——只需几行Python代码。一个库,零妥协。
|
| 50 |
+
|
| 51 |
+
极速爬取,实时统计和Streaming。由Web Scraper为Web Scraper和普通用户而构建,每个人都能找到适合自己的功能。
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 55 |
+
StealthyFetcher.adaptive = True
|
| 56 |
+
p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # 隐秘地获取网站!
|
| 57 |
+
products = p.css('.product', auto_save=True) # 抓取在网站设计变更后仍能存活的数据!
|
| 58 |
+
products = p.css('.product', adaptive=True) # 之后,如果网站结构改变,传递 `adaptive=True` 来找到它们!
|
| 59 |
+
```
|
| 60 |
+
或扩展为完整爬取
|
| 61 |
+
```python
|
| 62 |
+
from scrapling.spiders import Spider, Response
|
| 63 |
+
|
| 64 |
+
class MySpider(Spider):
|
| 65 |
+
name = "demo"
|
| 66 |
+
start_urls = ["https://example.com/"]
|
| 67 |
+
|
| 68 |
+
async def parse(self, response: Response):
|
| 69 |
+
for item in response.css('.product'):
|
| 70 |
+
yield {"title": item.css('h2::text').get()}
|
| 71 |
+
|
| 72 |
+
MySpider().start()
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# 铂金赞助商
|
| 77 |
+
|
| 78 |
+
<i><sub>想成为第一个出现在这里的公司吗?点击[这里](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>
|
| 79 |
+
# 赞助商
|
| 80 |
+
|
| 81 |
+
<!-- sponsors -->
|
| 82 |
+
|
| 83 |
+
<a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
|
| 84 |
+
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 85 |
+
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
|
| 86 |
+
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 87 |
+
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 88 |
+
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
| 89 |
+
<a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
|
| 90 |
+
<a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
|
| 91 |
+
<a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
| 95 |
+
<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
|
| 96 |
+
<a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
|
| 97 |
+
|
| 98 |
+
<!-- /sponsors -->
|
| 99 |
+
|
| 100 |
+
<i><sub>想在这里展示您的广告吗?点击[这里](https://github.com/sponsors/D4Vinci)并选择适合您的级别!</sub></i>
|
| 101 |
+
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
## 主要特性
|
| 105 |
+
|
| 106 |
+
### Spider — 完整的爬取框架
|
| 107 |
+
- 🕷️ **类Scrapy的Spider API**:使用`start_urls`、async `parse` callback和`Request`/`Response`对象定义Spider。
|
| 108 |
+
- ⚡ **并发爬取**:可配置的并发限制、按域名节流和下载延迟。
|
| 109 |
+
- 🔄 **多Session支持**:统一接口,支持HTTP请求和隐秘无头浏览器在同一个Spider中使用——通过ID将请求路由到不同的Session。
|
| 110 |
+
- 💾 **暂停与恢复**:基于Checkpoint的爬取持久化。按Ctrl+C优雅关闭;重启后从上次停止的地方继续。
|
| 111 |
+
- 📡 **Streaming模式**:通过`async for item in spider.stream()`以实时统计Streaming抓取的数据——非常适合UI、管道和长时间运行的爬取。
|
| 112 |
+
- 🛡️ **被阻止请求检测**:自动检测并重试被阻止的请求,支持自定义逻辑。
|
| 113 |
+
- 📦 **内置导出**:通过钩子和您自己的管道导出结果,或使用内置的JSON/JSONL,分别通过`result.items.to_json()`/`result.items.to_jsonl()`。
|
| 114 |
+
|
| 115 |
+
### 支持Session的高级网站获取
|
| 116 |
+
- **HTTP请求**:使用`Fetcher`类进行快速和隐秘的HTTP请求。可以模拟浏览器的TLS fingerprint、标头并使用HTTP/3。
|
| 117 |
+
- **动态加载**:通过`DynamicFetcher`类使用完整的浏览器自动化获取动态网站,支持Playwright的Chromium和Google Chrome。
|
| 118 |
+
- **反机器人绕过**:使用`StealthyFetcher`的高级隐秘功能和fingerprint伪装。可以轻松自动绕过所有类型的Cloudflare Turnstile/Interstitial。
|
| 119 |
+
- **Session管理**:使用`FetcherSession`、`StealthySession`和`DynamicSession`类实现持久化Session支持,用于跨请求的cookie和状态管理。
|
| 120 |
+
- **Proxy轮换**:内置`ProxyRotator`,支持轮询或自定义策略,适用于所有Session类型,并支持按请求覆盖Proxy。
|
| 121 |
+
- **域名屏蔽**:在基于浏览器的Fetcher中屏蔽对特定域名(及其子域名)的请求。
|
| 122 |
+
- **Async支持**:所有Fetcher和专用async Session类的完整async支持。
|
| 123 |
+
|
| 124 |
+
### 自适应抓取和AI集成
|
| 125 |
+
- 🔄 **智能元素跟踪**:使用智能相似性算法在网站更改后重新定位元素。
|
| 126 |
+
- 🎯 **智能灵活选择**:CSS选择器、XPath选择器、基于过滤器的搜索、文本搜索、正则表达式搜索等。
|
| 127 |
+
- 🔍 **查找相似元素**:自动定位与已找到元素相似的元素。
|
| 128 |
+
- 🤖 **与AI一起使用的MCP服务器**:内置MCP服务器用于AI辅助Web Scraping和数据提取。MCP服务器具有强大的自定义功能,利用Scrapling在将内容传递给AI(Claude/Cursor等)之前提取目标内容,从而加快操作并通过最小化token使用来降低成本。([演示视频](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 129 |
+
|
| 130 |
+
### 高性能和经过实战测试的架构
|
| 131 |
+
- 🚀 **闪电般快速**:优化性能超越大多数Python抓取库。
|
| 132 |
+
- 🔋 **内存高效**:优化的数据结构和延迟加载,最小内存占用。
|
| 133 |
+
- ⚡ **快速JSON序列化**:比标准库快10倍。
|
| 134 |
+
- 🏗️ **经过实战测试**:Scrapling不仅拥有92%的测试覆盖率和完整的类型提示覆盖率,而且在过去一年中每天被数百名Web Scraper使用。
|
| 135 |
+
|
| 136 |
+
### 对开发者/Web Scraper友好的体验
|
| 137 |
+
- 🎯 **交互式Web Scraping Shell**:可选的内置IPython Shell,具有Scrapling集成、快捷方式和新工具,可加快Web Scraping脚本开发,例如将curl请求转换为Scrapling请求并在浏览器中查看请求结果。
|
| 138 |
+
- 🚀 **直接从终端使用**:可选地,您可以使用Scrapling抓取URL而无需编写任何代码!
|
| 139 |
+
- 🛠️ **丰富的导航API**:使用父级、兄弟级和子级导航方法进行高级DOM遍历。
|
| 140 |
+
- 🧬 **增强的文本处理**:内置正则表达式、清理方法和优化的字符串操作。
|
| 141 |
+
- 📝 **自动选择器生成**:为任何元素生成强大的CSS/XPath选择器。
|
| 142 |
+
- 🔌 **熟悉的API**:类似于Scrapy/BeautifulSoup,使用与Scrapy/Parsel相同的伪元素。
|
| 143 |
+
- 📘 **完整的类型覆盖**:完整的类型提示,出色的IDE支持和代码补全。整个代码库在每次更改时都会自动使用**PyRight**和**MyPy**扫描。
|
| 144 |
+
- 🔋 **现成的Docker镜像**:每次发布时,包含所有浏览器的Docker镜像会自动构建和推送。
|
| 145 |
+
|
| 146 |
+
## 入门
|
| 147 |
+
|
| 148 |
+
让我们快速展示Scrapling的功能,无需深入了解。
|
| 149 |
+
|
| 150 |
+
### 基本用法
|
| 151 |
+
支持Session的HTTP请求
|
| 152 |
+
```python
|
| 153 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
| 154 |
+
|
| 155 |
+
with FetcherSession(impersonate='chrome') as session: # 使用Chrome的最新版本TLS fingerprint
|
| 156 |
+
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 157 |
+
quotes = page.css('.quote .text::text').getall()
|
| 158 |
+
|
| 159 |
+
# 或使用一次性请求
|
| 160 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 161 |
+
quotes = page.css('.quote .text::text').getall()
|
| 162 |
+
```
|
| 163 |
+
高级隐秘模式
|
| 164 |
+
```python
|
| 165 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 166 |
+
|
| 167 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # 保持浏览器打开直到完成
|
| 168 |
+
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 169 |
+
data = page.css('#padded_content a').getall()
|
| 170 |
+
|
| 171 |
+
# 或使用一次性请求样式,为此请求打开浏览器,完成后关闭
|
| 172 |
+
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 173 |
+
data = page.css('#padded_content a').getall()
|
| 174 |
+
```
|
| 175 |
+
完整的浏览器自动化
|
| 176 |
+
```python
|
| 177 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 178 |
+
|
| 179 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # 保持浏览器打开直到完成
|
| 180 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 181 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # 如果您偏好XPath选择器
|
| 182 |
+
|
| 183 |
+
# 或使用一次性请求样式,为此请求打开浏览器,完成后关闭
|
| 184 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 185 |
+
data = page.css('.quote .text::text').getall()
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
### Spider
|
| 189 |
+
构建具有并发请求、多种Session类型和暂停/恢复功能的完整爬虫:
|
| 190 |
+
```python
|
| 191 |
+
from scrapling.spiders import Spider, Request, Response
|
| 192 |
+
|
| 193 |
+
class QuotesSpider(Spider):
|
| 194 |
+
name = "quotes"
|
| 195 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 196 |
+
concurrent_requests = 10
|
| 197 |
+
|
| 198 |
+
async def parse(self, response: Response):
|
| 199 |
+
for quote in response.css('.quote'):
|
| 200 |
+
yield {
|
| 201 |
+
"text": quote.css('.text::text').get(),
|
| 202 |
+
"author": quote.css('.author::text').get(),
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
next_page = response.css('.next a')
|
| 206 |
+
if next_page:
|
| 207 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 208 |
+
|
| 209 |
+
result = QuotesSpider().start()
|
| 210 |
+
print(f"抓取了 {len(result.items)} 条引用")
|
| 211 |
+
result.items.to_json("quotes.json")
|
| 212 |
+
```
|
| 213 |
+
在单个Spider中使用多种Session类型:
|
| 214 |
+
```python
|
| 215 |
+
from scrapling.spiders import Spider, Request, Response
|
| 216 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 217 |
+
|
| 218 |
+
class MultiSessionSpider(Spider):
|
| 219 |
+
name = "multi"
|
| 220 |
+
start_urls = ["https://example.com/"]
|
| 221 |
+
|
| 222 |
+
def configure_sessions(self, manager):
|
| 223 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 224 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 225 |
+
|
| 226 |
+
async def parse(self, response: Response):
|
| 227 |
+
for link in response.css('a::attr(href)').getall():
|
| 228 |
+
# 将受保护的页面路由到隐秘Session
|
| 229 |
+
if "protected" in link:
|
| 230 |
+
yield Request(link, sid="stealth")
|
| 231 |
+
else:
|
| 232 |
+
yield Request(link, sid="fast", callback=self.parse) # 显式callback
|
| 233 |
+
```
|
| 234 |
+
通过如下方式运行Spider来暂停和恢复长时间爬取,使用Checkpoint:
|
| 235 |
+
```python
|
| 236 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
| 237 |
+
```
|
| 238 |
+
按Ctrl+C优雅暂停——进度会自动保存。之后,当您再次启动Spider时,传递相同的`crawldir`,它将从上次停止的地方继续。
|
| 239 |
+
|
| 240 |
+
### 高级解析与导航
|
| 241 |
+
```python
|
| 242 |
+
from scrapling.fetchers import Fetcher
|
| 243 |
+
|
| 244 |
+
# 丰富的元素选择和导航
|
| 245 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 246 |
+
|
| 247 |
+
# 使用多种选择方法获取引用
|
| 248 |
+
quotes = page.css('.quote') # CSS选择器
|
| 249 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 250 |
+
quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup风格
|
| 251 |
+
# 等同于
|
| 252 |
+
quotes = page.find_all('div', class_='quote')
|
| 253 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 254 |
+
quotes = page.find_all(class_='quote') # 等等...
|
| 255 |
+
# 按文本内容查找元素
|
| 256 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 257 |
+
|
| 258 |
+
# 高级导航
|
| 259 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 260 |
+
quote_text = page.css('.quote').css('.text::text').getall() # 链式选择器
|
| 261 |
+
first_quote = page.css('.quote')[0]
|
| 262 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 263 |
+
parent_container = first_quote.parent
|
| 264 |
+
|
| 265 |
+
# 元素关系和相似性
|
| 266 |
+
similar_elements = first_quote.find_similar()
|
| 267 |
+
below_elements = first_quote.below_elements()
|
| 268 |
+
```
|
| 269 |
+
如果您不想获取网站,可以直接使用解析器,如下所示:
|
| 270 |
+
```python
|
| 271 |
+
from scrapling.parser import Selector
|
| 272 |
+
|
| 273 |
+
page = Selector("<html>...</html>")
|
| 274 |
+
```
|
| 275 |
+
用法完全相同!
|
| 276 |
+
|
| 277 |
+
### Async Session管理示例
|
| 278 |
+
```python
|
| 279 |
+
import asyncio
|
| 280 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 281 |
+
|
| 282 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession`是上下文感知的,可以在sync/async模式下工作
|
| 283 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
| 284 |
+
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 285 |
+
|
| 286 |
+
# Async Session用法
|
| 287 |
+
async with AsyncStealthySession(max_pages=2) as session:
|
| 288 |
+
tasks = []
|
| 289 |
+
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 290 |
+
|
| 291 |
+
for url in urls:
|
| 292 |
+
task = session.fetch(url)
|
| 293 |
+
tasks.append(task)
|
| 294 |
+
|
| 295 |
+
print(session.get_pool_stats()) # 可选 - 浏览器标签池的状态(忙/空闲/错误)
|
| 296 |
+
results = await asyncio.gather(*tasks)
|
| 297 |
+
print(session.get_pool_stats())
|
| 298 |
+
```
|
| 299 |
+
|
| 300 |
+
## CLI和交互式Shell
|
| 301 |
+
|
| 302 |
+
Scrapling包含强大的命令行界面:
|
| 303 |
+
|
| 304 |
+
[](https://asciinema.org/a/736339)
|
| 305 |
+
|
| 306 |
+
启动交互式Web Scraping Shell
|
| 307 |
+
```bash
|
| 308 |
+
scrapling shell
|
| 309 |
+
```
|
| 310 |
+
直接将页面提取到文件而无需编程(默认提取`body`标签内的内容)。如果输出文件以`.txt`结尾,则将提取目标的文本内容。如果以`.md`结尾,它将是HTML内容的Markdown表示;如果以`.html`结尾,它将是HTML内容本身。
|
| 311 |
+
```bash
|
| 312 |
+
scrapling extract get 'https://example.com' content.md
|
| 313 |
+
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # 所有匹配CSS选择器'#fromSkipToProducts'的元素
|
| 314 |
+
scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
|
| 315 |
+
scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
|
| 316 |
+
```
|
| 317 |
+
|
| 318 |
+
> [!NOTE]
|
| 319 |
+
> 还有许多其他功能,但我们希望保持此页面简洁,包括MCP服务器和交互式Web Scraping Shell。查看完整文档[这里](https://scrapling.readthedocs.io/en/latest/)
|
| 320 |
+
|
| 321 |
+
## 性能基准
|
| 322 |
+
|
| 323 |
+
Scrapling不仅功能强大——它还速度极快。以下基准测试将Scrapling的解析器与其他流行库的最新版本进行了比较。
|
| 324 |
+
|
| 325 |
+
### 文本提取速度测试(5000个嵌套元素)
|
| 326 |
+
|
| 327 |
+
| # | 库 | 时间(ms) | vs Scrapling |
|
| 328 |
+
|---|:-----------------:|:---------:|:------------:|
|
| 329 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 330 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01x |
|
| 331 |
+
| 3 | Raw Lxml | 2.54 | 1.257x |
|
| 332 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 333 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 334 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 335 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 336 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
### 元素相似性和文本搜索性能
|
| 340 |
+
|
| 341 |
+
Scrapling的自适应元素查找功能明显优于替代方案:
|
| 342 |
+
|
| 343 |
+
| 库 | 时间(ms) | vs Scrapling |
|
| 344 |
+
|-------------|:---------:|:------------:|
|
| 345 |
+
| Scrapling | 2.39 | 1.0x |
|
| 346 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
> 所有基准测试代表100+次运行的平均值。请参阅[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)了解方法。
|
| 350 |
+
|
| 351 |
+
## 安装
|
| 352 |
+
|
| 353 |
+
Scrapling需要Python 3.10或更高版本:
|
| 354 |
+
|
| 355 |
+
```bash
|
| 356 |
+
pip install scrapling
|
| 357 |
+
```
|
| 358 |
+
|
| 359 |
+
此安装仅包括解析器引擎及其依赖项,没有任何Fetcher或命令行依赖项。
|
| 360 |
+
|
| 361 |
+
### 可选依赖项
|
| 362 |
+
|
| 363 |
+
1. 如果您要使用以下任何额外功能、Fetcher或它们的类,您将需要安装Fetcher的依赖项和它们的浏览器依赖项,如下所示:
|
| 364 |
+
```bash
|
| 365 |
+
pip install "scrapling[fetchers]"
|
| 366 |
+
|
| 367 |
+
scrapling install # normal install
|
| 368 |
+
scrapling install --force # force reinstall
|
| 369 |
+
```
|
| 370 |
+
|
| 371 |
+
这会下载所有浏览器,以及它们的系统依赖项和fingerprint操作依赖项。
|
| 372 |
+
|
| 373 |
+
或者你可以从代码中安装,而不是运行命令:
|
| 374 |
+
```python
|
| 375 |
+
from scrapling.cli import install
|
| 376 |
+
|
| 377 |
+
install([], standalone_mode=False) # normal install
|
| 378 |
+
install(["--force"], standalone_mode=False) # force reinstall
|
| 379 |
+
```
|
| 380 |
+
|
| 381 |
+
2. 额外功能:
|
| 382 |
+
- 安装MCP服务器功能:
|
| 383 |
+
```bash
|
| 384 |
+
pip install "scrapling[ai]"
|
| 385 |
+
```
|
| 386 |
+
- 安装Shell功能(Web Scraping Shell和`extract`命令):
|
| 387 |
+
```bash
|
| 388 |
+
pip install "scrapling[shell]"
|
| 389 |
+
```
|
| 390 |
+
- 安装所有内容:
|
| 391 |
+
```bash
|
| 392 |
+
pip install "scrapling[all]"
|
| 393 |
+
```
|
| 394 |
+
请记住,在安装任何这些额外功能后(如果您还没有安装),您需要使用`scrapling install`安装浏览器依赖项
|
| 395 |
+
|
| 396 |
+
### Docker
|
| 397 |
+
您还可以使用以下命令从DockerHub安装包含所有额外功能和浏览器的Docker镜像:
|
| 398 |
+
```bash
|
| 399 |
+
docker pull pyd4vinci/scrapling
|
| 400 |
+
```
|
| 401 |
+
或从GitHub注册表下载:
|
| 402 |
+
```bash
|
| 403 |
+
docker pull ghcr.io/d4vinci/scrapling:latest
|
| 404 |
+
```
|
| 405 |
+
此镜像使用GitHub Actions和仓库主分支自动构建和推送。
|
| 406 |
+
|
| 407 |
+
## 贡献
|
| 408 |
+
|
| 409 |
+
我们欢迎贡献!在开始之前,请阅读我们的[贡献指南](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md)。
|
| 410 |
+
|
| 411 |
+
## 免责声明
|
| 412 |
+
|
| 413 |
+
> [!CAUTION]
|
| 414 |
+
> 此库仅用于教育和研究目的。使用此库即表示您同意遵守本地和国际数据抓取和隐私法律。作者和贡献者对本软件的任何滥用不承担责任。始终尊重网站的服务条款和robots.txt文件。
|
| 415 |
+
|
| 416 |
+
## 许可证
|
| 417 |
+
|
| 418 |
+
本作品根据BSD-3-Clause许可证授权。
|
| 419 |
+
|
| 420 |
+
## 致谢
|
| 421 |
+
|
| 422 |
+
此项目包含改编自以下内容的代码:
|
| 423 |
+
- Parsel(BSD许可证)——用于[translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)子模块
|
| 424 |
+
|
| 425 |
+
---
|
| 426 |
+
<div align="center"><small>由Karim Shoair用❤️设计和制作。</small></div><br>
|
docs/README_DE.md
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- mcp-name: io.github.D4Vinci/Scrapling -->
|
| 2 |
+
|
| 3 |
+
<h1 align="center">
|
| 4 |
+
<a href="https://scrapling.readthedocs.io">
|
| 5 |
+
<picture>
|
| 6 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 7 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 8 |
+
</picture>
|
| 9 |
+
</a>
|
| 10 |
+
<br>
|
| 11 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 12 |
+
</h1>
|
| 13 |
+
|
| 14 |
+
<p align="center">
|
| 15 |
+
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 16 |
+
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
| 17 |
+
<a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
|
| 18 |
+
<img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
|
| 19 |
+
<a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
|
| 20 |
+
<img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
|
| 21 |
+
<br/>
|
| 22 |
+
<a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
|
| 23 |
+
<img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
|
| 24 |
+
</a>
|
| 25 |
+
<a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
|
| 26 |
+
<img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
|
| 27 |
+
</a>
|
| 28 |
+
<br/>
|
| 29 |
+
<a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
|
| 30 |
+
<img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
|
| 31 |
+
</p>
|
| 32 |
+
|
| 33 |
+
<p align="center">
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Auswahlmethoden</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Einen Fetcher wählen</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>Spiders</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>Proxy-Rotation</strong></a>
|
| 41 |
+
·
|
| 42 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
|
| 43 |
+
·
|
| 44 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCP-Modus</strong></a>
|
| 45 |
+
</p>
|
| 46 |
+
|
| 47 |
+
Scrapling ist ein adaptives Web-Scraping-Framework, das alles abdeckt -- von einer einzelnen Anfrage bis hin zu einem umfassenden Crawl.
|
| 48 |
+
|
| 49 |
+
Sein Parser lernt aus Website-Änderungen und lokalisiert Ihre Elemente automatisch neu, wenn sich Seiten aktualisieren. Seine Fetcher umgehen Anti-Bot-Systeme wie Cloudflare Turnstile direkt ab Werk. Und sein Spider-Framework ermöglicht es Ihnen, auf parallele Multi-Session-Crawls mit Pause & Resume und automatischer Proxy-Rotation hochzuskalieren -- alles in wenigen Zeilen Python. Eine Bibliothek, keine Kompromisse.
|
| 50 |
+
|
| 51 |
+
Blitzschnelle Crawls mit Echtzeit-Statistiken und Streaming. Von Web Scrapern für Web Scraper und normale Benutzer entwickelt, ist für jeden etwas dabei.
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 55 |
+
StealthyFetcher.adaptive = True
|
| 56 |
+
p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Website unbemerkt abrufen!
|
| 57 |
+
products = p.css('.product', auto_save=True) # Daten scrapen, die Website-Designänderungen überleben!
|
| 58 |
+
products = p.css('.product', adaptive=True) # Später, wenn sich die Website-Struktur ändert, `adaptive=True` übergeben, um sie zu finden!
|
| 59 |
+
```
|
| 60 |
+
Oder auf vollständige Crawls hochskalieren
|
| 61 |
+
```python
|
| 62 |
+
from scrapling.spiders import Spider, Response
|
| 63 |
+
|
| 64 |
+
class MySpider(Spider):
|
| 65 |
+
name = "demo"
|
| 66 |
+
start_urls = ["https://example.com/"]
|
| 67 |
+
|
| 68 |
+
async def parse(self, response: Response):
|
| 69 |
+
for item in response.css('.product'):
|
| 70 |
+
yield {"title": item.css('h2::text').get()}
|
| 71 |
+
|
| 72 |
+
MySpider().start()
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# Platin-Sponsoren
|
| 77 |
+
|
| 78 |
+
<i><sub>Möchten Sie das erste Unternehmen sein, das hier erscheint? Klicken Sie [hier](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>
|
| 79 |
+
# Sponsoren
|
| 80 |
+
|
| 81 |
+
<!-- sponsors -->
|
| 82 |
+
|
| 83 |
+
<a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
|
| 84 |
+
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 85 |
+
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
|
| 86 |
+
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 87 |
+
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 88 |
+
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
| 89 |
+
<a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
|
| 90 |
+
<a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
|
| 91 |
+
<a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
| 95 |
+
<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
|
| 96 |
+
<a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
|
| 97 |
+
|
| 98 |
+
<!-- /sponsors -->
|
| 99 |
+
|
| 100 |
+
<i><sub>Möchten Sie Ihre Anzeige hier zeigen? Klicken Sie [hier](https://github.com/sponsors/D4Vinci) und wählen Sie die Stufe, die zu Ihnen passt!</sub></i>
|
| 101 |
+
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
## Hauptmerkmale
|
| 105 |
+
|
| 106 |
+
### Spiders -- Ein vollständiges Crawling-Framework
|
| 107 |
+
- 🕷️ **Scrapy-ähnliche Spider-API**: Definieren Sie Spiders mit `start_urls`, async `parse` Callbacks und `Request`/`Response`-Objekten.
|
| 108 |
+
- ⚡ **Paralleles Crawling**: Konfigurierbare Parallelitätslimits, domainbezogenes Throttling und Download-Verzögerungen.
|
| 109 |
+
- 🔄 **Multi-Session-Unterstützung**: Einheitliche Schnittstelle für HTTP-Anfragen und heimliche Headless-Browser in einem einzigen Spider -- leiten Sie Anfragen per ID an verschiedene Sessions weiter.
|
| 110 |
+
- 💾 **Pause & Resume**: Checkpoint-basierte Crawl-Persistenz. Drücken Sie Strg+C für ein kontrolliertes Herunterfahren; starten Sie neu, um dort fortzufahren, wo Sie aufgehört haben.
|
| 111 |
+
- 📡 **Streaming-Modus**: Gescrapte Elemente in Echtzeit streamen über `async for item in spider.stream()` mit Echtzeit-Statistiken -- ideal für UI, Pipelines und lang laufende Crawls.
|
| 112 |
+
- 🛡️ **Erkennung blockierter Anfragen**: Automatische Erkennung und Wiederholung blockierter Anfragen mit anpassbarer Logik.
|
| 113 |
+
- 📦 **Integrierter Export**: Ergebnisse über Hooks und Ihre eigene Pipeline oder den integrierten JSON/JSONL-Export mit `result.items.to_json()` / `result.items.to_jsonl()` exportieren.
|
| 114 |
+
|
| 115 |
+
### Erweitertes Website-Abrufen mit Session-Unterstützung
|
| 116 |
+
- **HTTP-Anfragen**: Schnelle und heimliche HTTP-Anfragen mit der `Fetcher`-Klasse. Kann Browser-TLS-Fingerprints und Header imitieren und HTTP/3 verwenden.
|
| 117 |
+
- **Dynamisches Laden**: Dynamische Websites mit vollständiger Browser-Automatisierung über die `DynamicFetcher`-Klasse abrufen, die Playwrights Chromium und Google Chrome unterstützt.
|
| 118 |
+
- **Anti-Bot-Umgehung**: Erweiterte Stealth-Fähigkeiten mit `StealthyFetcher` und Fingerprint-Spoofing. Kann alle Arten von Cloudflares Turnstile/Interstitial einfach mit Automatisierung umgehen.
|
| 119 |
+
- **Session-Verwaltung**: Persistente Session-Unterstützung mit den Klassen `FetcherSession`, `StealthySession` und `DynamicSession` für Cookie- und Zustandsverwaltung über Anfragen hinweg.
|
| 120 |
+
- **Proxy-Rotation**: Integrierter `ProxyRotator` mit zyklischen oder benutzerdefinierten Rotationsstrategien über alle Session-Typen hinweg, plus Proxy-Überschreibungen pro Anfrage.
|
| 121 |
+
- **Domain-Blockierung**: Anfragen an bestimmte Domains (und deren Subdomains) in browserbasierten Fetchern blockieren.
|
| 122 |
+
- **Async-Unterstützung**: Vollständige async-Unterstützung über alle Fetcher und dedizierte async Session-Klassen hinweg.
|
| 123 |
+
|
| 124 |
+
### Adaptives Scraping & KI-Integration
|
| 125 |
+
- 🔄 **Intelligente Element-Verfolgung**: Elemente nach Website-Änderungen mit intelligenten Ähnlichkeitsalgorithmen neu lokalisieren.
|
| 126 |
+
- 🎯 **Intelligente flexible Auswahl**: CSS-Selektoren, XPath-Selektoren, filterbasierte Suche, Textsuche, Regex-Suche und mehr.
|
| 127 |
+
- 🔍 **Ähnliche Elemente finden**: Elemente, die gefundenen Elementen ähnlich sind, automatisch lokalisieren.
|
| 128 |
+
- 🤖 **MCP-Server für die Verwendung mit KI**: Integrierter MCP-Server für KI-unterstütztes Web Scraping und Datenextraktion. Der MCP-Server verfügt über leistungsstarke, benutzerdefinierte Funktionen, die Scrapling nutzen, um gezielten Inhalt zu extrahieren, bevor er an die KI (Claude/Cursor/etc.) übergeben wird, wodurch Vorgänge beschleunigt und Kosten durch Minimierung der Token-Nutzung gesenkt werden. ([Demo-Video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 129 |
+
|
| 130 |
+
### Hochleistungs- und praxiserprobte Architektur
|
| 131 |
+
- 🚀 **Blitzschnell**: Optimierte Leistung, die die meisten Python-Scraping-Bibliotheken übertrifft.
|
| 132 |
+
- 🔋 **Speichereffizient**: Optimierte Datenstrukturen und Lazy Loading für einen minimalen Speicher-Footprint.
|
| 133 |
+
- ⚡ **Schnelle JSON-Serialisierung**: 10x schneller als die Standardbibliothek.
|
| 134 |
+
- 🏗️ **Praxiserprobt**: Scrapling hat nicht nur eine Testabdeckung von 92% und eine vollständige Type-Hints-Abdeckung, sondern wird seit dem letzten Jahr täglich von Hunderten von Web Scrapern verwendet.
|
| 135 |
+
|
| 136 |
+
### Entwickler-/Web-Scraper-freundliche Erfahrung
|
| 137 |
+
- 🎯 **Interaktive Web-Scraping-Shell**: Optionale integrierte IPython-Shell mit Scrapling-Integration, Shortcuts und neuen Tools zur Beschleunigung der Web-Scraping-Skriptentwicklung, wie das Konvertieren von Curl-Anfragen in Scrapling-Anfragen und das Anzeigen von Anfrageergebnissen in Ihrem Browser.
|
| 138 |
+
- 🚀 **Direkt vom Terminal aus verwenden**: Optional können Sie Scrapling verwenden, um eine URL zu scrapen, ohne eine einzige Codezeile zu schreiben!
|
| 139 |
+
- 🛠️ **Umfangreiche Navigations-API**: Erweiterte DOM-Traversierung mit Eltern-, Geschwister- und Kind-Navigationsmethoden.
|
| 140 |
+
- 🧬 **Verbesserte Textverarbeitung**: Integrierte Regex, Bereinigungsmethoden und optimierte String-Operationen.
|
| 141 |
+
- 📝 **Automatische Selektorgenerierung**: Robuste CSS/XPath-Selektoren für jedes Element generieren.
|
| 142 |
+
- 🔌 **Vertraute API**: Ähnlich wie Scrapy/BeautifulSoup mit denselben Pseudo-Elementen, die in Scrapy/Parsel verwendet werden.
|
| 143 |
+
- 📘 **Vollständige Typabdeckung**: Vollständige Type Hints für hervorragende IDE-Unterstützung und Code-Vervollständigung. Die gesamte Codebasis wird bei jeder Änderung automatisch mit **PyRight** und **MyPy** gescannt.
|
| 144 |
+
- 🔋 **Fertiges Docker-Image**: Mit jeder Veröffentlichung wird automatisch ein Docker-Image erstellt und gepusht, das alle Browser enthält.
|
| 145 |
+
|
| 146 |
+
## Erste Schritte
|
| 147 |
+
|
| 148 |
+
Hier ein kurzer Überblick über das, was Scrapling kann, ohne zu sehr ins Detail zu gehen.
|
| 149 |
+
|
| 150 |
+
### Grundlegende Verwendung
|
| 151 |
+
HTTP-Anfragen mit Session-Unterstützung
|
| 152 |
+
```python
|
| 153 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
| 154 |
+
|
| 155 |
+
with FetcherSession(impersonate='chrome') as session: # Neueste Version von Chromes TLS-Fingerprint verwenden
|
| 156 |
+
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 157 |
+
quotes = page.css('.quote .text::text').getall()
|
| 158 |
+
|
| 159 |
+
# Oder einmalige Anfragen verwenden
|
| 160 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 161 |
+
quotes = page.css('.quote .text::text').getall()
|
| 162 |
+
```
|
| 163 |
+
Erweiterter Stealth-Modus
|
| 164 |
+
```python
|
| 165 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 166 |
+
|
| 167 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # Browser offen halten, bis Sie fertig sind
|
| 168 |
+
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 169 |
+
data = page.css('#padded_content a').getall()
|
| 170 |
+
|
| 171 |
+
# Oder einmaligen Anfragenstil verwenden: öffnet den Browser für diese Anfrage und schließt ihn nach Abschluss
|
| 172 |
+
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 173 |
+
data = page.css('#padded_content a').getall()
|
| 174 |
+
```
|
| 175 |
+
Vollständige Browser-Automatisierung
|
| 176 |
+
```python
|
| 177 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 178 |
+
|
| 179 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Browser offen halten, bis Sie fertig sind
|
| 180 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 181 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # XPath-Selektor, falls bevorzugt
|
| 182 |
+
|
| 183 |
+
# Oder einmaligen Anfragenstil verwenden: öffnet den Browser für diese Anfrage und schließt ihn nach Abschluss
|
| 184 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 185 |
+
data = page.css('.quote .text::text').getall()
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
### Spiders
|
| 189 |
+
Vollständige Crawler mit parallelen Anfragen, mehreren Session-Typen und Pause & Resume erstellen:
|
| 190 |
+
```python
|
| 191 |
+
from scrapling.spiders import Spider, Request, Response
|
| 192 |
+
|
| 193 |
+
class QuotesSpider(Spider):
|
| 194 |
+
name = "quotes"
|
| 195 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 196 |
+
concurrent_requests = 10
|
| 197 |
+
|
| 198 |
+
async def parse(self, response: Response):
|
| 199 |
+
for quote in response.css('.quote'):
|
| 200 |
+
yield {
|
| 201 |
+
"text": quote.css('.text::text').get(),
|
| 202 |
+
"author": quote.css('.author::text').get(),
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
next_page = response.css('.next a')
|
| 206 |
+
if next_page:
|
| 207 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 208 |
+
|
| 209 |
+
result = QuotesSpider().start()
|
| 210 |
+
print(f"{len(result.items)} Zitate gescrapt")
|
| 211 |
+
result.items.to_json("quotes.json")
|
| 212 |
+
```
|
| 213 |
+
Mehrere Session-Typen in einem einzigen Spider verwenden:
|
| 214 |
+
```python
|
| 215 |
+
from scrapling.spiders import Spider, Request, Response
|
| 216 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 217 |
+
|
| 218 |
+
class MultiSessionSpider(Spider):
|
| 219 |
+
name = "multi"
|
| 220 |
+
start_urls = ["https://example.com/"]
|
| 221 |
+
|
| 222 |
+
def configure_sessions(self, manager):
|
| 223 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 224 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 225 |
+
|
| 226 |
+
async def parse(self, response: Response):
|
| 227 |
+
for link in response.css('a::attr(href)').getall():
|
| 228 |
+
# Geschützte Seiten über die Stealth-Session leiten
|
| 229 |
+
if "protected" in link:
|
| 230 |
+
yield Request(link, sid="stealth")
|
| 231 |
+
else:
|
| 232 |
+
yield Request(link, sid="fast", callback=self.parse) # Expliziter Callback
|
| 233 |
+
```
|
| 234 |
+
Lange Crawls mit Checkpoints pausieren und fortsetzen, indem Sie den Spider so starten:
|
| 235 |
+
```python
|
| 236 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
| 237 |
+
```
|
| 238 |
+
Drücken Sie Strg+C, um kontrolliert zu pausieren -- der Fortschritt wird automatisch gespeichert. Wenn Sie den Spider später erneut starten, übergeben Sie dasselbe `crawldir`, und er setzt dort fort, wo er aufgehört hat.
|
| 239 |
+
|
| 240 |
+
### Erweitertes Parsing & Navigation
|
| 241 |
+
```python
|
| 242 |
+
from scrapling.fetchers import Fetcher
|
| 243 |
+
|
| 244 |
+
# Umfangreiche Elementauswahl und Navigation
|
| 245 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 246 |
+
|
| 247 |
+
# Zitate mit verschiedenen Auswahlmethoden abrufen
|
| 248 |
+
quotes = page.css('.quote') # CSS-Selektor
|
| 249 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 250 |
+
quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup-Stil
|
| 251 |
+
# Gleich wie
|
| 252 |
+
quotes = page.find_all('div', class_='quote')
|
| 253 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 254 |
+
quotes = page.find_all(class_='quote') # und so weiter...
|
| 255 |
+
# Element nach Textinhalt finden
|
| 256 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 257 |
+
|
| 258 |
+
# Erweiterte Navigation
|
| 259 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 260 |
+
quote_text = page.css('.quote').css('.text::text').getall() # Verkettete Selektoren
|
| 261 |
+
first_quote = page.css('.quote')[0]
|
| 262 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 263 |
+
parent_container = first_quote.parent
|
| 264 |
+
|
| 265 |
+
# Elementbeziehungen und Ähnlichkeit
|
| 266 |
+
similar_elements = first_quote.find_similar()
|
| 267 |
+
below_elements = first_quote.below_elements()
|
| 268 |
+
```
|
| 269 |
+
Sie können den Parser direkt verwenden, wenn Sie keine Websites abrufen möchten, wie unten gezeigt:
|
| 270 |
+
```python
|
| 271 |
+
from scrapling.parser import Selector
|
| 272 |
+
|
| 273 |
+
page = Selector("<html>...</html>")
|
| 274 |
+
```
|
| 275 |
+
Und es funktioniert genau auf die gleiche Weise!
|
| 276 |
+
|
| 277 |
+
### Beispiele für async Session-Verwaltung
|
| 278 |
+
```python
|
| 279 |
+
import asyncio
|
| 280 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 281 |
+
|
| 282 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession` ist kontextbewusst und kann sowohl in sync- als auch in async-Mustern arbeiten
|
| 283 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
| 284 |
+
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 285 |
+
|
| 286 |
+
# Async-Session-Verwendung
|
| 287 |
+
async with AsyncStealthySession(max_pages=2) as session:
|
| 288 |
+
tasks = []
|
| 289 |
+
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 290 |
+
|
| 291 |
+
for url in urls:
|
| 292 |
+
task = session.fetch(url)
|
| 293 |
+
tasks.append(task)
|
| 294 |
+
|
| 295 |
+
print(session.get_pool_stats()) # Optional - Der Status des Browser-Tab-Pools (beschäftigt/frei/Fehler)
|
| 296 |
+
results = await asyncio.gather(*tasks)
|
| 297 |
+
print(session.get_pool_stats())
|
| 298 |
+
```
|
| 299 |
+
|
| 300 |
+
## CLI & Interaktive Shell
|
| 301 |
+
|
| 302 |
+
Scrapling enthält eine leistungsstarke Befehlszeilenschnittstelle:
|
| 303 |
+
|
| 304 |
+
[](https://asciinema.org/a/736339)
|
| 305 |
+
|
| 306 |
+
Interaktive Web-Scraping-Shell starten
|
| 307 |
+
```bash
|
| 308 |
+
scrapling shell
|
| 309 |
+
```
|
| 310 |
+
Seiten direkt ohne Programmierung in eine Datei extrahieren (extrahiert standardmäßig den Inhalt im `body`-Tag). Wenn die Ausgabedatei mit `.txt` endet, wird der Textinhalt des Ziels extrahiert. Wenn sie mit `.md` endet, ist es eine Markdown-Darstellung des HTML-Inhalts; wenn sie mit `.html` endet, ist es der HTML-Inhalt selbst.
|
| 311 |
+
```bash
|
| 312 |
+
scrapling extract get 'https://example.com' content.md
|
| 313 |
+
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Alle Elemente, die dem CSS-Selektor '#fromSkipToProducts' entsprechen
|
| 314 |
+
scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
|
| 315 |
+
scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
|
| 316 |
+
```
|
| 317 |
+
|
| 318 |
+
> [!NOTE]
|
| 319 |
+
> Es gibt viele zusätzliche Funktionen, aber wir möchten diese Seite prägnant halten, einschließlich des MCP-Servers und der interaktiven Web-Scraping-Shell. Schauen Sie sich die vollständige Dokumentation [hier](https://scrapling.readthedocs.io/en/latest/) an
|
| 320 |
+
|
| 321 |
+
## Leistungsbenchmarks
|
| 322 |
+
|
| 323 |
+
Scrapling ist nicht nur leistungsstark -- es ist auch blitzschnell. Die folgenden Benchmarks vergleichen Scraplings Parser mit den neuesten Versionen anderer beliebter Bibliotheken.
|
| 324 |
+
|
| 325 |
+
### Textextraktions-Geschwindigkeitstest (5000 verschachtelte Elemente)
|
| 326 |
+
|
| 327 |
+
| # | Bibliothek | Zeit (ms) | vs Scrapling |
|
| 328 |
+
|---|:-----------------:|:---------:|:------------:|
|
| 329 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 330 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01x |
|
| 331 |
+
| 3 | Raw Lxml | 2.54 | 1.257x |
|
| 332 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 333 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 334 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 335 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 336 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
### Element-Ähnlichkeit & Textsuche-Leistung
|
| 340 |
+
|
| 341 |
+
Scraplings adaptive Element-Finding-Fähigkeiten übertreffen Alternativen deutlich:
|
| 342 |
+
|
| 343 |
+
| Bibliothek | Zeit (ms) | vs Scrapling |
|
| 344 |
+
|-------------|:---------:|:------------:|
|
| 345 |
+
| Scrapling | 2.39 | 1.0x |
|
| 346 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
> Alle Benchmarks stellen Durchschnittswerte von über 100 Durchläufen dar. Siehe [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) für die Methodik.
|
| 350 |
+
|
| 351 |
+
## Installation
|
| 352 |
+
|
| 353 |
+
Scrapling erfordert Python 3.10 oder höher:
|
| 354 |
+
|
| 355 |
+
```bash
|
| 356 |
+
pip install scrapling
|
| 357 |
+
```
|
| 358 |
+
|
| 359 |
+
Diese Installation enthält nur die Parser-Engine und ihre Abhängigkeiten, ohne Fetcher oder Kommandozeilenabhängigkeiten.
|
| 360 |
+
|
| 361 |
+
### Optionale Abhängigkeiten
|
| 362 |
+
|
| 363 |
+
1. Wenn Sie eine der folgenden zusätzlichen Funktionen verwenden möchten, die die Fetcher oder ihre Klassen nutzen, müssen Sie die Abhängigkeiten der Fetcher und deren Browser-Abhängigkeiten wie folgt installieren:
|
| 364 |
+
```bash
|
| 365 |
+
pip install "scrapling[fetchers]"
|
| 366 |
+
|
| 367 |
+
scrapling install # normal install
|
| 368 |
+
scrapling install --force # force reinstall
|
| 369 |
+
```
|
| 370 |
+
|
| 371 |
+
Dies lädt alle Browser zusammen mit ihren Systemabhängigkeiten und Fingerprint-Manipulationsabhängigkeiten herunter.
|
| 372 |
+
|
| 373 |
+
Oder Sie können sie aus dem Code heraus installieren, anstatt einen Befehl auszuführen:
|
| 374 |
+
```python
|
| 375 |
+
from scrapling.cli import install
|
| 376 |
+
|
| 377 |
+
install([], standalone_mode=False) # normal install
|
| 378 |
+
install(["--force"], standalone_mode=False) # force reinstall
|
| 379 |
+
```
|
| 380 |
+
|
| 381 |
+
2. Zusätzliche Funktionen:
|
| 382 |
+
- MCP-Server-Funktion installieren:
|
| 383 |
+
```bash
|
| 384 |
+
pip install "scrapling[ai]"
|
| 385 |
+
```
|
| 386 |
+
- Shell-Funktionen installieren (Web-Scraping-Shell und der `extract`-Befehl):
|
| 387 |
+
```bash
|
| 388 |
+
pip install "scrapling[shell]"
|
| 389 |
+
```
|
| 390 |
+
- Alles installieren:
|
| 391 |
+
```bash
|
| 392 |
+
pip install "scrapling[all]"
|
| 393 |
+
```
|
| 394 |
+
Denken Sie daran, dass Sie nach der Installation eines dieser Extras (falls noch nicht geschehen) die Browser-Abhängigkeiten mit `scrapling install` installieren müssen
|
| 395 |
+
|
| 396 |
+
### Docker
|
| 397 |
+
Sie können auch ein Docker-Image mit allen Extras und Browsern mit dem folgenden Befehl von DockerHub installieren:
|
| 398 |
+
```bash
|
| 399 |
+
docker pull pyd4vinci/scrapling
|
| 400 |
+
```
|
| 401 |
+
Oder laden Sie es aus der GitHub-Registry herunter:
|
| 402 |
+
```bash
|
| 403 |
+
docker pull ghcr.io/d4vinci/scrapling:latest
|
| 404 |
+
```
|
| 405 |
+
Dieses Image wird automatisch mit GitHub Actions und dem Hauptzweig des Repositorys erstellt und gepusht.
|
| 406 |
+
|
| 407 |
+
## Beitragen
|
| 408 |
+
|
| 409 |
+
Wir freuen uns über Beiträge! Bitte lesen Sie unsere [Beitragsrichtlinien](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md), bevor Sie beginnen.
|
| 410 |
+
|
| 411 |
+
## Haftungsausschluss
|
| 412 |
+
|
| 413 |
+
> [!CAUTION]
|
| 414 |
+
> Diese Bibliothek wird nur zu Bildungs- und Forschungszwecken bereitgestellt. Durch die Nutzung dieser Bibliothek erklären Sie sich damit einverstanden, lokale und internationale Gesetze zum Daten-Scraping und Datenschutz einzuhalten. Die Autoren und Mitwirkenden sind nicht verantwortlich für Missbrauch dieser Software. Respektieren Sie immer die Nutzungsbedingungen von Websites und robots.txt-Dateien.
|
| 415 |
+
|
| 416 |
+
## Lizenz
|
| 417 |
+
|
| 418 |
+
Diese Arbeit ist unter der BSD-3-Clause-Lizenz lizenziert.
|
| 419 |
+
|
| 420 |
+
## Danksagungen
|
| 421 |
+
|
| 422 |
+
Dieses Projekt enthält angepassten Code von:
|
| 423 |
+
- Parsel (BSD-Lizenz) -- Verwendet für das [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)-Submodul
|
| 424 |
+
|
| 425 |
+
---
|
| 426 |
+
<div align="center"><small>Entworfen und hergestellt mit ❤️ von Karim Shoair.</small></div><br>
|
docs/README_ES.md
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- mcp-name: io.github.D4Vinci/Scrapling -->
|
| 2 |
+
|
| 3 |
+
<h1 align="center">
|
| 4 |
+
<a href="https://scrapling.readthedocs.io">
|
| 5 |
+
<picture>
|
| 6 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 7 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 8 |
+
</picture>
|
| 9 |
+
</a>
|
| 10 |
+
<br>
|
| 11 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 12 |
+
</h1>
|
| 13 |
+
|
| 14 |
+
<p align="center">
|
| 15 |
+
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 16 |
+
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
| 17 |
+
<a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
|
| 18 |
+
<img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
|
| 19 |
+
<a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
|
| 20 |
+
<img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
|
| 21 |
+
<br/>
|
| 22 |
+
<a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
|
| 23 |
+
<img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
|
| 24 |
+
</a>
|
| 25 |
+
<a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
|
| 26 |
+
<img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
|
| 27 |
+
</a>
|
| 28 |
+
<br/>
|
| 29 |
+
<a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
|
| 30 |
+
<img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
|
| 31 |
+
</p>
|
| 32 |
+
|
| 33 |
+
<p align="center">
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Métodos de selección</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Elegir un fetcher</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>Spiders</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>Rotación de proxy</strong></a>
|
| 41 |
+
·
|
| 42 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
|
| 43 |
+
·
|
| 44 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>Modo MCP</strong></a>
|
| 45 |
+
</p>
|
| 46 |
+
|
| 47 |
+
Scrapling es un framework de Web Scraping adaptativo que se encarga de todo, desde una sola solicitud hasta un rastreo a gran escala.
|
| 48 |
+
|
| 49 |
+
Su parser aprende de los cambios de los sitios web y relocaliza automáticamente tus elementos cuando las páginas se actualizan. Sus fetchers evaden sistemas anti-bot como Cloudflare Turnstile de forma nativa. Y su framework Spider te permite escalar a rastreos concurrentes con múltiples sesiones, con Pause & Resume y rotación automática de Proxy, todo en unas pocas líneas de Python. Una biblioteca, cero compromisos.
|
| 50 |
+
|
| 51 |
+
Rastreos ultrarrápidos con estadísticas en tiempo real y Streaming. Construido por Web Scrapers para Web Scrapers y usuarios regulares, hay algo para todos.
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 55 |
+
StealthyFetcher.adaptive = True
|
| 56 |
+
p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # ¡Obtén el sitio web bajo el radar!
|
| 57 |
+
products = p.css('.product', auto_save=True) # ¡Extrae datos que sobreviven a cambios de diseño del sitio web!
|
| 58 |
+
products = p.css('.product', adaptive=True) # Más tarde, si la estructura del sitio web cambia, ¡pasa `adaptive=True` para encontrarlos!
|
| 59 |
+
```
|
| 60 |
+
O escala a rastreos completos
|
| 61 |
+
```python
|
| 62 |
+
from scrapling.spiders import Spider, Response
|
| 63 |
+
|
| 64 |
+
class MySpider(Spider):
|
| 65 |
+
name = "demo"
|
| 66 |
+
start_urls = ["https://example.com/"]
|
| 67 |
+
|
| 68 |
+
async def parse(self, response: Response):
|
| 69 |
+
for item in response.css('.product'):
|
| 70 |
+
yield {"title": item.css('h2::text').get()}
|
| 71 |
+
|
| 72 |
+
MySpider().start()
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# Patrocinadores Platino
|
| 77 |
+
|
| 78 |
+
<i><sub>¿Quieres ser la primera empresa en aparecer aquí? Haz clic [aquí](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>
|
| 79 |
+
# Patrocinadores
|
| 80 |
+
|
| 81 |
+
<!-- sponsors -->
|
| 82 |
+
|
| 83 |
+
<a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
|
| 84 |
+
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 85 |
+
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
|
| 86 |
+
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 87 |
+
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 88 |
+
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
| 89 |
+
<a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
|
| 90 |
+
<a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
|
| 91 |
+
<a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
| 95 |
+
<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
|
| 96 |
+
<a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
|
| 97 |
+
|
| 98 |
+
<!-- /sponsors -->
|
| 99 |
+
|
| 100 |
+
<i><sub>¿Quieres mostrar tu anuncio aquí? ¡Haz clic [aquí](https://github.com/sponsors/D4Vinci) y elige el nivel que te convenga!</sub></i>
|
| 101 |
+
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
## Características Principales
|
| 105 |
+
|
| 106 |
+
### Spiders — Un Framework Completo de Rastreo
|
| 107 |
+
- 🕷️ **API de Spider al estilo Scrapy**: Define spiders con `start_urls`, callbacks async `parse`, y objetos `Request`/`Response`.
|
| 108 |
+
- ⚡ **Rastreo Concurrente**: Límites de concurrencia configurables, limitación por dominio y retrasos de descarga.
|
| 109 |
+
- 🔄 **Soporte Multi-Session**: Interfaz unificada para solicitudes HTTP y navegadores headless sigilosos en un solo Spider — enruta solicitudes a diferentes sesiones por ID.
|
| 110 |
+
- 💾 **Pause & Resume**: Persistencia de rastreo basada en Checkpoint. Presiona Ctrl+C para un cierre ordenado; reinicia para continuar desde donde lo dejaste.
|
| 111 |
+
- 📡 **Modo Streaming**: Transmite elementos extraídos a medida que llegan con `async for item in spider.stream()` con estadísticas en tiempo real — ideal para UI, pipelines y rastreos de larga duración.
|
| 112 |
+
- 🛡️ **Detección de Solicitudes Bloqueadas**: Detección automática y reintento de solicitudes bloqueadas con lógica personalizable.
|
| 113 |
+
- 📦 **Exportación Integrada**: Exporta resultados a través de hooks y tu propio pipeline o el JSON/JSONL integrado con `result.items.to_json()` / `result.items.to_jsonl()` respectivamente.
|
| 114 |
+
|
| 115 |
+
### Obtención Avanzada de Sitios Web con Soporte de Session
|
| 116 |
+
- **Solicitudes HTTP**: Solicitudes HTTP rápidas y sigilosas con la clase `Fetcher`. Puede imitar el fingerprint TLS de los navegadores, encabezados y usar HTTP/3.
|
| 117 |
+
- **Carga Dinámica**: Obtén sitios web dinámicos con automatización completa del navegador a través de la clase `DynamicFetcher` compatible con Chromium de Playwright y Google Chrome.
|
| 118 |
+
- **Evasión Anti-bot**: Capacidades de sigilo avanzadas con `StealthyFetcher` y falsificación de fingerprint. Puede evadir fácilmente todos los tipos de Turnstile/Interstitial de Cloudflare con automatización.
|
| 119 |
+
- **Gestión de Session**: Soporte de sesión persistente con las clases `FetcherSession`, `StealthySession` y `DynamicSession` para la gestión de cookies y estado entre solicitudes.
|
| 120 |
+
- **Rotación de Proxy**: `ProxyRotator` integrado con estrategias de rotación cíclica o personalizadas en todos los tipos de sesión, además de sobrescrituras de Proxy por solicitud.
|
| 121 |
+
- **Bloqueo de Dominios**: Bloquea solicitudes a dominios específicos (y sus subdominios) en fetchers basados en navegador.
|
| 122 |
+
- **Soporte Async**: Soporte async completo en todos los fetchers y clases de sesión async dedicadas.
|
| 123 |
+
|
| 124 |
+
### Scraping Adaptativo e Integración con IA
|
| 125 |
+
- 🔄 **Seguimiento Inteligente de Elementos**: Relocaliza elementos después de cambios en el sitio web usando algoritmos inteligentes de similitud.
|
| 126 |
+
- 🎯 **Selección Flexible Inteligente**: Selectores CSS, selectores XPath, búsqueda basada en filtros, búsqueda de texto, búsqueda regex y más.
|
| 127 |
+
- 🔍 **Encontrar Elementos Similares**: Localiza automáticamente elementos similares a los elementos encontrados.
|
| 128 |
+
- 🤖 **Servidor MCP para usar con IA**: Servidor MCP integrado para Web Scraping asistido por IA y extracción de datos. El servidor MCP presenta capacidades potentes y personalizadas que aprovechan Scrapling para extraer contenido específico antes de pasarlo a la IA (Claude/Cursor/etc), acelerando así las operaciones y reduciendo costos al minimizar el uso de tokens. ([video demo](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 129 |
+
|
| 130 |
+
### Arquitectura de Alto Rendimiento y Probada en Batalla
|
| 131 |
+
- 🚀 **Ultrarrápido**: Rendimiento optimizado que supera a la mayoría de las bibliotecas de Web Scraping de Python.
|
| 132 |
+
- 🔋 **Eficiente en Memoria**: Estructuras de datos optimizadas y carga diferida para una huella de memoria mínima.
|
| 133 |
+
- ⚡ **Serialización JSON Rápida**: 10 veces más rápido que la biblioteca estándar.
|
| 134 |
+
- 🏗️ **Probado en batalla**: Scrapling no solo tiene una cobertura de pruebas del 92% y cobertura completa de type hints, sino que ha sido utilizado diariamente por cientos de Web Scrapers durante el último año.
|
| 135 |
+
|
| 136 |
+
### Experiencia Amigable para Desarrolladores/Web Scrapers
|
| 137 |
+
- 🎯 **Shell Interactivo de Web Scraping**: Shell IPython integrado opcional con integración de Scrapling, atajos y nuevas herramientas para acelerar el desarrollo de scripts de Web Scraping, como convertir solicitudes curl a solicitudes Scrapling y ver resultados de solicitudes en tu navegador.
|
| 138 |
+
- 🚀 **Úsalo directamente desde la Terminal**: Opcionalmente, ¡puedes usar Scrapling para hacer scraping de una URL sin escribir ni una sola línea de código!
|
| 139 |
+
- 🛠️ **API de Navegación Rica**: Recorrido avanzado del DOM con métodos de navegación de padres, hermanos e hijos.
|
| 140 |
+
- 🧬 **Procesamiento de Texto Mejorado**: Métodos integrados de regex, limpieza y operaciones de cadena optimizadas.
|
| 141 |
+
- 📝 **Generación Automática de Selectores**: Genera selectores CSS/XPath robustos para cualquier elemento.
|
| 142 |
+
- 🔌 **API Familiar**: Similar a Scrapy/BeautifulSoup con los mismos pseudo-elementos usados en Scrapy/Parsel.
|
| 143 |
+
- 📘 **Cobertura Completa de Tipos**: Type hints completos para excelente soporte de IDE y autocompletado de código. Todo el código fuente se escanea automáticamente con **PyRight** y **MyPy** en cada cambio.
|
| 144 |
+
- 🔋 **Imagen Docker Lista**: Con cada lanzamiento, se construye y publica automáticamente una imagen Docker que contiene todos los navegadores.
|
| 145 |
+
|
| 146 |
+
## Primeros Pasos
|
| 147 |
+
|
| 148 |
+
Aquí tienes un vistazo rápido de lo que Scrapling puede hacer sin entrar en profundidad.
|
| 149 |
+
|
| 150 |
+
### Uso Básico
|
| 151 |
+
Solicitudes HTTP con soporte de sesión
|
| 152 |
+
```python
|
| 153 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
| 154 |
+
|
| 155 |
+
with FetcherSession(impersonate='chrome') as session: # Usa la última versión del fingerprint TLS de Chrome
|
| 156 |
+
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 157 |
+
quotes = page.css('.quote .text::text').getall()
|
| 158 |
+
|
| 159 |
+
# O usa solicitudes de una sola vez
|
| 160 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 161 |
+
quotes = page.css('.quote .text::text').getall()
|
| 162 |
+
```
|
| 163 |
+
Modo sigiloso avanzado
|
| 164 |
+
```python
|
| 165 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 166 |
+
|
| 167 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # Mantén el navegador abierto hasta que termines
|
| 168 |
+
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 169 |
+
data = page.css('#padded_content a').getall()
|
| 170 |
+
|
| 171 |
+
# O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar
|
| 172 |
+
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 173 |
+
data = page.css('#padded_content a').getall()
|
| 174 |
+
```
|
| 175 |
+
Automatización completa del navegador
|
| 176 |
+
```python
|
| 177 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 178 |
+
|
| 179 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Mantén el navegador abierto hasta que termines
|
| 180 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 181 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # Selector XPath si lo prefieres
|
| 182 |
+
|
| 183 |
+
# O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar
|
| 184 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 185 |
+
data = page.css('.quote .text::text').getall()
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
### Spiders
|
| 189 |
+
Construye rastreadores completos con solicitudes concurrentes, múltiples tipos de sesión y Pause & Resume:
|
| 190 |
+
```python
|
| 191 |
+
from scrapling.spiders import Spider, Request, Response
|
| 192 |
+
|
| 193 |
+
class QuotesSpider(Spider):
|
| 194 |
+
name = "quotes"
|
| 195 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 196 |
+
concurrent_requests = 10
|
| 197 |
+
|
| 198 |
+
async def parse(self, response: Response):
|
| 199 |
+
for quote in response.css('.quote'):
|
| 200 |
+
yield {
|
| 201 |
+
"text": quote.css('.text::text').get(),
|
| 202 |
+
"author": quote.css('.author::text').get(),
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
next_page = response.css('.next a')
|
| 206 |
+
if next_page:
|
| 207 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 208 |
+
|
| 209 |
+
result = QuotesSpider().start()
|
| 210 |
+
print(f"Se extrajeron {len(result.items)} citas")
|
| 211 |
+
result.items.to_json("quotes.json")
|
| 212 |
+
```
|
| 213 |
+
Usa múltiples tipos de sesión en un solo Spider:
|
| 214 |
+
```python
|
| 215 |
+
from scrapling.spiders import Spider, Request, Response
|
| 216 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 217 |
+
|
| 218 |
+
class MultiSessionSpider(Spider):
|
| 219 |
+
name = "multi"
|
| 220 |
+
start_urls = ["https://example.com/"]
|
| 221 |
+
|
| 222 |
+
def configure_sessions(self, manager):
|
| 223 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 224 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 225 |
+
|
| 226 |
+
async def parse(self, response: Response):
|
| 227 |
+
for link in response.css('a::attr(href)').getall():
|
| 228 |
+
# Enruta las páginas protegidas a través de la sesión sigilosa
|
| 229 |
+
if "protected" in link:
|
| 230 |
+
yield Request(link, sid="stealth")
|
| 231 |
+
else:
|
| 232 |
+
yield Request(link, sid="fast", callback=self.parse) # callback explícito
|
| 233 |
+
```
|
| 234 |
+
Pausa y reanuda rastreos largos con checkpoints ejecutando el Spider así:
|
| 235 |
+
```python
|
| 236 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
| 237 |
+
```
|
| 238 |
+
Presiona Ctrl+C para pausar de forma ordenada — el progreso se guarda automáticamente. Después, cuando inicies el Spider de nuevo, pasa el mismo `crawldir`, y continuará desde donde se detuvo.
|
| 239 |
+
|
| 240 |
+
### Análisis Avanzado y Navegación
|
| 241 |
+
```python
|
| 242 |
+
from scrapling.fetchers import Fetcher
|
| 243 |
+
|
| 244 |
+
# Selección rica de elementos y navegación
|
| 245 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 246 |
+
|
| 247 |
+
# Obtén citas con múltiples métodos de selección
|
| 248 |
+
quotes = page.css('.quote') # Selector CSS
|
| 249 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 250 |
+
quotes = page.find_all('div', {'class': 'quote'}) # Estilo BeautifulSoup
|
| 251 |
+
# Igual que
|
| 252 |
+
quotes = page.find_all('div', class_='quote')
|
| 253 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 254 |
+
quotes = page.find_all(class_='quote') # y así sucesivamente...
|
| 255 |
+
# Encuentra elementos por contenido de texto
|
| 256 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 257 |
+
|
| 258 |
+
# Navegación avanzada
|
| 259 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 260 |
+
quote_text = page.css('.quote').css('.text::text').getall() # Selectores encadenados
|
| 261 |
+
first_quote = page.css('.quote')[0]
|
| 262 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 263 |
+
parent_container = first_quote.parent
|
| 264 |
+
|
| 265 |
+
# Relaciones y similitud de elementos
|
| 266 |
+
similar_elements = first_quote.find_similar()
|
| 267 |
+
below_elements = first_quote.below_elements()
|
| 268 |
+
```
|
| 269 |
+
Puedes usar el parser directamente si no necesitas obtener sitios web, como se muestra a continuación:
|
| 270 |
+
```python
|
| 271 |
+
from scrapling.parser import Selector
|
| 272 |
+
|
| 273 |
+
page = Selector("<html>...</html>")
|
| 274 |
+
```
|
| 275 |
+
¡Y funciona exactamente de la misma manera!
|
| 276 |
+
|
| 277 |
+
### Ejemplos de Gestión de Session Async
|
| 278 |
+
```python
|
| 279 |
+
import asyncio
|
| 280 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 281 |
+
|
| 282 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession` es consciente del contexto y puede funcionar tanto en patrones sync/async
|
| 283 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
| 284 |
+
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 285 |
+
|
| 286 |
+
# Uso de sesión async
|
| 287 |
+
async with AsyncStealthySession(max_pages=2) as session:
|
| 288 |
+
tasks = []
|
| 289 |
+
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 290 |
+
|
| 291 |
+
for url in urls:
|
| 292 |
+
task = session.fetch(url)
|
| 293 |
+
tasks.append(task)
|
| 294 |
+
|
| 295 |
+
print(session.get_pool_stats()) # Opcional - El estado del pool de pestañas del navegador (ocupado/libre/error)
|
| 296 |
+
results = await asyncio.gather(*tasks)
|
| 297 |
+
print(session.get_pool_stats())
|
| 298 |
+
```
|
| 299 |
+
|
| 300 |
+
## CLI y Shell Interactivo
|
| 301 |
+
|
| 302 |
+
Scrapling incluye una poderosa interfaz de línea de comandos:
|
| 303 |
+
|
| 304 |
+
[](https://asciinema.org/a/736339)
|
| 305 |
+
|
| 306 |
+
Lanzar el Shell interactivo de Web Scraping
|
| 307 |
+
```bash
|
| 308 |
+
scrapling shell
|
| 309 |
+
```
|
| 310 |
+
Extraer páginas a un archivo directamente sin programar (Extrae el contenido dentro de la etiqueta `body` por defecto). Si el archivo de salida termina con `.txt`, entonces se extraerá el contenido de texto del objetivo. Si termina con `.md`, será una representación Markdown del contenido HTML; si termina con `.html`, será el contenido HTML en sí mismo.
|
| 311 |
+
```bash
|
| 312 |
+
scrapling extract get 'https://example.com' content.md
|
| 313 |
+
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Todos los elementos que coinciden con el selector CSS '#fromSkipToProducts'
|
| 314 |
+
scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
|
| 315 |
+
scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
|
| 316 |
+
```
|
| 317 |
+
|
| 318 |
+
> [!NOTE]
|
| 319 |
+
> Hay muchas características adicionales, pero queremos mantener esta página concisa, incluyendo el servidor MCP y el Shell Interactivo de Web Scraping. Consulta la documentación completa [aquí](https://scrapling.readthedocs.io/en/latest/)
|
| 320 |
+
|
| 321 |
+
## Benchmarks de Rendimiento
|
| 322 |
+
|
| 323 |
+
Scrapling no solo es potente, también es ultrarrápido. Los siguientes benchmarks comparan el parser de Scrapling con las últimas versiones de otras bibliotecas populares.
|
| 324 |
+
|
| 325 |
+
### Prueba de Velocidad de Extracción de Texto (5000 elementos anidados)
|
| 326 |
+
|
| 327 |
+
| # | Biblioteca | Tiempo (ms) | vs Scrapling |
|
| 328 |
+
|---|:-----------------:|:-----------:|:------------:|
|
| 329 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 330 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01 |
|
| 331 |
+
| 3 | Raw Lxml | 2.54 | 1.257 |
|
| 332 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 333 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 334 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 335 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 336 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
### Rendimiento de Similitud de Elementos y Búsqueda de Texto
|
| 340 |
+
|
| 341 |
+
Las capacidades de búsqueda adaptativa de elementos de Scrapling superan significativamente a las alternativas:
|
| 342 |
+
|
| 343 |
+
| Biblioteca | Tiempo (ms) | vs Scrapling |
|
| 344 |
+
|-------------|:-----------:|:------------:|
|
| 345 |
+
| Scrapling | 2.39 | 1.0x |
|
| 346 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
> Todos los benchmarks representan promedios de más de 100 ejecuciones. Ver [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) para la metodología.
|
| 350 |
+
|
| 351 |
+
## Instalación
|
| 352 |
+
|
| 353 |
+
Scrapling requiere Python 3.10 o superior:
|
| 354 |
+
|
| 355 |
+
```bash
|
| 356 |
+
pip install scrapling
|
| 357 |
+
```
|
| 358 |
+
|
| 359 |
+
Esta instalación solo incluye el motor de análisis y sus dependencias, sin ningún fetcher ni dependencias de línea de comandos.
|
| 360 |
+
|
| 361 |
+
### Dependencias Opcionales
|
| 362 |
+
|
| 363 |
+
1. Si vas a usar alguna de las características adicionales a continuación, los fetchers, o sus clases, necesitarás instalar las dependencias de los fetchers y sus dependencias del navegador de la siguiente manera:
|
| 364 |
+
```bash
|
| 365 |
+
pip install "scrapling[fetchers]"
|
| 366 |
+
|
| 367 |
+
scrapling install # normal install
|
| 368 |
+
scrapling install --force # force reinstall
|
| 369 |
+
```
|
| 370 |
+
|
| 371 |
+
Esto descarga todos los navegadores, junto con sus dependencias del sistema y dependencias de manipulación de fingerprint.
|
| 372 |
+
|
| 373 |
+
O puedes instalarlos desde el código en lugar de ejecutar un comando:
|
| 374 |
+
```python
|
| 375 |
+
from scrapling.cli import install
|
| 376 |
+
|
| 377 |
+
install([], standalone_mode=False) # normal install
|
| 378 |
+
install(["--force"], standalone_mode=False) # force reinstall
|
| 379 |
+
```
|
| 380 |
+
|
| 381 |
+
2. Características adicionales:
|
| 382 |
+
- Instalar la característica del servidor MCP:
|
| 383 |
+
```bash
|
| 384 |
+
pip install "scrapling[ai]"
|
| 385 |
+
```
|
| 386 |
+
- Instalar características del Shell (Shell de Web Scraping y el comando `extract`):
|
| 387 |
+
```bash
|
| 388 |
+
pip install "scrapling[shell]"
|
| 389 |
+
```
|
| 390 |
+
- Instalar todo:
|
| 391 |
+
```bash
|
| 392 |
+
pip install "scrapling[all]"
|
| 393 |
+
```
|
| 394 |
+
Recuerda que necesitas instalar las dependencias del navegador con `scrapling install` después de cualquiera de estos extras (si no lo hiciste ya)
|
| 395 |
+
|
| 396 |
+
### Docker
|
| 397 |
+
También puedes instalar una imagen Docker con todos los extras y navegadores con el siguiente comando desde DockerHub:
|
| 398 |
+
```bash
|
| 399 |
+
docker pull pyd4vinci/scrapling
|
| 400 |
+
```
|
| 401 |
+
O descárgala desde el registro de GitHub:
|
| 402 |
+
```bash
|
| 403 |
+
docker pull ghcr.io/d4vinci/scrapling:latest
|
| 404 |
+
```
|
| 405 |
+
Esta imagen se construye y publica automáticamente usando GitHub Actions y la rama principal del repositorio.
|
| 406 |
+
|
| 407 |
+
## Contribuir
|
| 408 |
+
|
| 409 |
+
¡Damos la bienvenida a las contribuciones! Por favor lee nuestras [pautas de contribución](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) antes de comenzar.
|
| 410 |
+
|
| 411 |
+
## Descargo de Responsabilidad
|
| 412 |
+
|
| 413 |
+
> [!CAUTION]
|
| 414 |
+
> Esta biblioteca se proporciona solo con fines educativos y de investigación. Al usar esta biblioteca, aceptas cumplir con las leyes locales e internacionales de scraping de datos y privacidad. Los autores y contribuyentes no son responsables de ningún mal uso de este software. Respeta siempre los términos de servicio de los sitios web y los archivos robots.txt.
|
| 415 |
+
|
| 416 |
+
## Licencia
|
| 417 |
+
|
| 418 |
+
Este trabajo está licenciado bajo la Licencia BSD-3-Clause.
|
| 419 |
+
|
| 420 |
+
## Agradecimientos
|
| 421 |
+
|
| 422 |
+
Este proyecto incluye código adaptado de:
|
| 423 |
+
- Parsel (Licencia BSD)—Usado para el submódulo [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
|
| 424 |
+
|
| 425 |
+
---
|
| 426 |
+
<div align="center"><small>Diseñado y elaborado con ❤️ por Karim Shoair.</small></div><br>
|
docs/README_JP.md
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- mcp-name: io.github.D4Vinci/Scrapling -->
|
| 2 |
+
|
| 3 |
+
<h1 align="center">
|
| 4 |
+
<a href="https://scrapling.readthedocs.io">
|
| 5 |
+
<picture>
|
| 6 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 7 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 8 |
+
</picture>
|
| 9 |
+
</a>
|
| 10 |
+
<br>
|
| 11 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 12 |
+
</h1>
|
| 13 |
+
|
| 14 |
+
<p align="center">
|
| 15 |
+
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 16 |
+
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
| 17 |
+
<a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
|
| 18 |
+
<img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
|
| 19 |
+
<a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
|
| 20 |
+
<img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
|
| 21 |
+
<br/>
|
| 22 |
+
<a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
|
| 23 |
+
<img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
|
| 24 |
+
</a>
|
| 25 |
+
<a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
|
| 26 |
+
<img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
|
| 27 |
+
</a>
|
| 28 |
+
<br/>
|
| 29 |
+
<a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
|
| 30 |
+
<img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
|
| 31 |
+
</p>
|
| 32 |
+
|
| 33 |
+
<p align="center">
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>選択メソッド</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Fetcherの選び方</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>スパイダー</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>プロキシローテーション</strong></a>
|
| 41 |
+
·
|
| 42 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
|
| 43 |
+
·
|
| 44 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCPモード</strong></a>
|
| 45 |
+
</p>
|
| 46 |
+
|
| 47 |
+
Scraplingは、単一のリクエストから本格的なクロールまですべてを処理する適応型Web Scrapingフレームワークです。
|
| 48 |
+
|
| 49 |
+
そのパーサーはウェブサイトの変更から学習し、ページが更新されたときに要素を自動的に再配置します。Fetcherはすぐに使えるCloudflare Turnstileなどのアンチボットシステムを回避します。そしてSpiderフレームワークにより、Pause & Resumeや自動Proxy回転機能を備えた並行マルチSessionクロールへとスケールアップできます — すべてわずか数行のPythonで。1つのライブラリ、妥協なし。
|
| 50 |
+
|
| 51 |
+
リアルタイム統計とStreamingによる超高速クロール。Web Scraperによって、Web Scraperと一般ユーザーのために構築され、誰にでも何かがあります。
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 55 |
+
StealthyFetcher.adaptive = True
|
| 56 |
+
p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # レーダーの下でウェブサイトを取得!
|
| 57 |
+
products = p.css('.product', auto_save=True) # ウェブサイトのデザイン変更に耐えるデータをスクレイプ!
|
| 58 |
+
products = p.css('.product', adaptive=True) # 後でウェブサイトの構造が変わったら、`adaptive=True`を渡して見つける!
|
| 59 |
+
```
|
| 60 |
+
または本格的なクロールへスケールアップ
|
| 61 |
+
```python
|
| 62 |
+
from scrapling.spiders import Spider, Response
|
| 63 |
+
|
| 64 |
+
class MySpider(Spider):
|
| 65 |
+
name = "demo"
|
| 66 |
+
start_urls = ["https://example.com/"]
|
| 67 |
+
|
| 68 |
+
async def parse(self, response: Response):
|
| 69 |
+
for item in response.css('.product'):
|
| 70 |
+
yield {"title": item.css('h2::text').get()}
|
| 71 |
+
|
| 72 |
+
MySpider().start()
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# プラチナスポンサー
|
| 77 |
+
|
| 78 |
+
<i><sub>ここに最初に表示される企業になりませんか?[こちら](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)をクリック</sub></i>
|
| 79 |
+
# スポンサー
|
| 80 |
+
|
| 81 |
+
<!-- sponsors -->
|
| 82 |
+
|
| 83 |
+
<a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
|
| 84 |
+
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 85 |
+
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
|
| 86 |
+
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 87 |
+
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 88 |
+
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
| 89 |
+
<a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
|
| 90 |
+
<a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
|
| 91 |
+
<a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
| 95 |
+
<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
|
| 96 |
+
<a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
|
| 97 |
+
|
| 98 |
+
<!-- /sponsors -->
|
| 99 |
+
|
| 100 |
+
<i><sub>ここに広告を表示したいですか?[こちら](https://github.com/sponsors/D4Vinci)をクリックして、あなたに合ったティアを選択してください!</sub></i>
|
| 101 |
+
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
## 主な機能
|
| 105 |
+
|
| 106 |
+
### Spider — 本格的なクロールフレームワーク
|
| 107 |
+
- 🕷️ **Scrapy風のSpider API**:`start_urls`、async `parse` callback、`Request`/`Response`オブジェクトでSpiderを定義。
|
| 108 |
+
- ⚡ **並行クロール**:設定可能な並行数制限、ドメインごとのスロットリング、ダウンロード遅延。
|
| 109 |
+
- 🔄 **マルチSessionサポート**:HTTPリクエストとステルスヘッドレスブラウザの統一インターフェース — IDによって異なるSessionにリクエストをルーティング。
|
| 110 |
+
- 💾 **Pause & Resume**:Checkpointベースのクロール永続化。Ctrl+Cで正常にシャットダウン;再起動すると中断したところから再開。
|
| 111 |
+
- 📡 **Streamingモード**:`async for item in spider.stream()`でリアルタイム統計とともにスクレイプされたアイテムをStreamingで受信 — UI、パイプライン、長時間実行クロールに最適。
|
| 112 |
+
- 🛡️ **ブロックされたリクエストの検出**:カスタマイズ可能なロジックによるブロックされたリクエストの自動検出とリトライ。
|
| 113 |
+
- 📦 **組み込みエクスポート**:フックや独自のパイプライン、または組み込みのJSON/JSONLで結果をエクスポート。それぞれ`result.items.to_json()` / `result.items.to_jsonl()`を使用。
|
| 114 |
+
|
| 115 |
+
### Sessionサポート付き高度なウェブサイト取得
|
| 116 |
+
- **HTTPリクエスト**:`Fetcher`クラスで高速かつステルスなHTTPリクエスト。ブラウザのTLS fingerprint、ヘッダーを模倣し、HTTP/3を使用可能。
|
| 117 |
+
- **動的読み込み**:PlaywrightのChromiumとGoogle Chromeをサポートする`DynamicFetcher`クラスによる完全なブラウザ自動化で動的ウェブサイトを取得。
|
| 118 |
+
- **アンチボット回避**:`StealthyFetcher`とfingerprint偽装による高度なステルス機能。自動化でCloudflareのTurnstile/Interstitialのすべてのタイプを簡単に回避。
|
| 119 |
+
- **Session管理**:リクエスト間でCookieと状態を管理するための`FetcherSession`、`StealthySession`、`DynamicSession`クラスによる永続的なSessionサポート。
|
| 120 |
+
- **Proxy回転**:すべてのSessionタイプに対応したラウンドロビンまたはカスタム戦略の組み込み`ProxyRotator`、さらにリクエストごとのProxyオーバーライド。
|
| 121 |
+
- **ドメインブロック**:ブラウザベースのFetcherで特定のドメイン(およびそのサブドメイン)へのリクエストをブロック。
|
| 122 |
+
- **asyncサポート**:すべてのFetcherおよび専用asyncSessionクラス全体での完全なasyncサポート。
|
| 123 |
+
|
| 124 |
+
### 適応型スクレイピングとAI統合
|
| 125 |
+
- 🔄 **スマート要素追跡**:インテリジェントな類似性アルゴリズムを使用してウェブサイトの変更後に要素を再配置。
|
| 126 |
+
- 🎯 **スマート柔軟選択**:CSSセレクタ、XPathセレクタ、フィルタベース検索、テキスト検索、正規表現検索など。
|
| 127 |
+
- 🔍 **類似要素の検出**:見つかった要素に類似した要素を自動的に特定。
|
| 128 |
+
- 🤖 **AIと使用するMCPサーバー**:AI支援Web Scrapingとデータ抽出のための組み込みMCPサーバー。MCPサーバーは、AI(Claude/Cursorなど)に渡す前にScraplingを活用してターゲットコンテンツを抽出する強力でカスタムな機能を備えており、操作を高速化し、トークン使用量を最小限に抑えることでコストを削減します。([デモ動画](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 129 |
+
|
| 130 |
+
### 高性能で実戦テスト済みのアーキテクチャ
|
| 131 |
+
- 🚀 **超高速**:ほとんどのPythonスクレイピングライブラリを上回る最適化されたパフォーマンス。
|
| 132 |
+
- 🔋 **メモリ効率**:最小のメモリフットプリントのための最適化されたデータ構造と遅延読み込み。
|
| 133 |
+
- ⚡ **高速JSONシリアル化**:標準ライブラリの10倍の速度。
|
| 134 |
+
- 🏗️ **実戦テスト済み**:Scraplingは92%のテストカバレッジと完全な型ヒントカバレッジを備えているだけでなく、過去1年間に数百人のWeb Scraperによって毎日使用されてきました。
|
| 135 |
+
|
| 136 |
+
### 開発者/Web Scraperにやさしい体験
|
| 137 |
+
- 🎯 **インタラクティブWeb Scraping Shell**:Scrapling統合、ショートカット、curlリクエストをScraplingリクエストに変換したり、ブラウザでリクエスト結果を表示したりするなどの新しいツールを備えたオプションの組み込みIPython Shellで、Web Scrapingスクリプトの開発を加速。
|
| 138 |
+
- 🚀 **ターミナルから直接使用**:オプションで、コードを一行も書かずにScraplingを使用してURLをスクレイプできます!
|
| 139 |
+
- 🛠️ **豊富なナビゲーションAPI**:親、兄弟、子のナビゲーションメソッドによる高度なDOMトラバーサル。
|
| 140 |
+
- 🧬 **強化されたテキスト処理**:組み込みの正規表現、クリーニングメソッド、最適化された文字列操作。
|
| 141 |
+
- 📝 **自動セレクタ生成**:任意の要素に対して堅牢なCSS/XPathセレクタを生成。
|
| 142 |
+
- 🔌 **馴染みのあるAPI**:Scrapy/Parselで使用されている同じ疑似要素を持つScrapy/BeautifulSoupに似た設計。
|
| 143 |
+
- 📘 **完全な型カバレッジ**:優れたIDEサポートとコード補完のための完全な型ヒント。コードベース全体が変更のたびに**PyRight**と**MyPy**で自動的にスキャンされます。
|
| 144 |
+
- 🔋 **すぐに使えるDockerイメージ**:各リリースで、すべてのブラウザを含むDockerイメージが自動的にビルドおよびプッシュされます。
|
| 145 |
+
|
| 146 |
+
## はじめに
|
| 147 |
+
|
| 148 |
+
深く掘り下げずに、Scraplingにできることの簡単な概要をお見せしましょう。
|
| 149 |
+
|
| 150 |
+
### 基本的な使い方
|
| 151 |
+
Sessionサポート付きHTTPリクエスト
|
| 152 |
+
```python
|
| 153 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
| 154 |
+
|
| 155 |
+
with FetcherSession(impersonate='chrome') as session: # ChromeのTLS fingerprintの最新バージョンを使用
|
| 156 |
+
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 157 |
+
quotes = page.css('.quote .text::text').getall()
|
| 158 |
+
|
| 159 |
+
# または一回限りのリクエストを使用
|
| 160 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 161 |
+
quotes = page.css('.quote .text::text').getall()
|
| 162 |
+
```
|
| 163 |
+
高度なステルスモード
|
| 164 |
+
```python
|
| 165 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 166 |
+
|
| 167 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # 完了するまでブラウザを開いたままにする
|
| 168 |
+
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 169 |
+
data = page.css('#padded_content a').getall()
|
| 170 |
+
|
| 171 |
+
# または一回限りのリクエストスタイル、このリクエストのためにブラウザを開き、完了後に閉じる
|
| 172 |
+
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 173 |
+
data = page.css('#padded_content a').getall()
|
| 174 |
+
```
|
| 175 |
+
完全なブラウザ自動化
|
| 176 |
+
```python
|
| 177 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 178 |
+
|
| 179 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # 完了するまでブラウザを開いたままにする
|
| 180 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 181 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # お好みであればXPathセレクタを使用
|
| 182 |
+
|
| 183 |
+
# または一回限りのリクエストスタイル、このリクエストのためにブラウザを開き、完了後に閉じる
|
| 184 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 185 |
+
data = page.css('.quote .text::text').getall()
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
### Spider
|
| 189 |
+
並行リクエスト、複数のSessionタイプ、Pause & Resumeを備えた本格的なクローラーを構築:
|
| 190 |
+
```python
|
| 191 |
+
from scrapling.spiders import Spider, Request, Response
|
| 192 |
+
|
| 193 |
+
class QuotesSpider(Spider):
|
| 194 |
+
name = "quotes"
|
| 195 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 196 |
+
concurrent_requests = 10
|
| 197 |
+
|
| 198 |
+
async def parse(self, response: Response):
|
| 199 |
+
for quote in response.css('.quote'):
|
| 200 |
+
yield {
|
| 201 |
+
"text": quote.css('.text::text').get(),
|
| 202 |
+
"author": quote.css('.author::text').get(),
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
next_page = response.css('.next a')
|
| 206 |
+
if next_page:
|
| 207 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 208 |
+
|
| 209 |
+
result = QuotesSpider().start()
|
| 210 |
+
print(f"{len(result.items)}件の引用をスクレイプしました")
|
| 211 |
+
result.items.to_json("quotes.json")
|
| 212 |
+
```
|
| 213 |
+
単一のSpiderで複数のSessionタイプを使用:
|
| 214 |
+
```python
|
| 215 |
+
from scrapling.spiders import Spider, Request, Response
|
| 216 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 217 |
+
|
| 218 |
+
class MultiSessionSpider(Spider):
|
| 219 |
+
name = "multi"
|
| 220 |
+
start_urls = ["https://example.com/"]
|
| 221 |
+
|
| 222 |
+
def configure_sessions(self, manager):
|
| 223 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 224 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 225 |
+
|
| 226 |
+
async def parse(self, response: Response):
|
| 227 |
+
for link in response.css('a::attr(href)').getall():
|
| 228 |
+
# 保護されたページはステルスSessionを通してルーティング
|
| 229 |
+
if "protected" in link:
|
| 230 |
+
yield Request(link, sid="stealth")
|
| 231 |
+
else:
|
| 232 |
+
yield Request(link, sid="fast", callback=self.parse) # 明示的なcallback
|
| 233 |
+
```
|
| 234 |
+
Checkpointを使用して長時間のクロールをPause & Resume:
|
| 235 |
+
```python
|
| 236 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
| 237 |
+
```
|
| 238 |
+
Ctrl+Cを押すと正常に一時停止し、進捗は自動的に保存されます。後でSpiderを再度起動する際に同じ`crawldir`を渡すと、中断したところから再開します。
|
| 239 |
+
|
| 240 |
+
### 高度なパースとナビゲーション
|
| 241 |
+
```python
|
| 242 |
+
from scrapling.fetchers import Fetcher
|
| 243 |
+
|
| 244 |
+
# 豊富な要素選択とナビゲーション
|
| 245 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 246 |
+
|
| 247 |
+
# 複数の選択メソッドで引用を取得
|
| 248 |
+
quotes = page.css('.quote') # CSSセレクタ
|
| 249 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 250 |
+
quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoupスタイル
|
| 251 |
+
# 以下と同じ
|
| 252 |
+
quotes = page.find_all('div', class_='quote')
|
| 253 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 254 |
+
quotes = page.find_all(class_='quote') # など...
|
| 255 |
+
# テキスト内容で要素を検索
|
| 256 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 257 |
+
|
| 258 |
+
# 高度なナビゲーション
|
| 259 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 260 |
+
quote_text = page.css('.quote').css('.text::text').getall() # チェーンセレクタ
|
| 261 |
+
first_quote = page.css('.quote')[0]
|
| 262 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 263 |
+
parent_container = first_quote.parent
|
| 264 |
+
|
| 265 |
+
# 要素の関連性と類似性
|
| 266 |
+
similar_elements = first_quote.find_similar()
|
| 267 |
+
below_elements = first_quote.below_elements()
|
| 268 |
+
```
|
| 269 |
+
ウェブサイトを取得せずにパーサーをすぐに使用することもできます:
|
| 270 |
+
```python
|
| 271 |
+
from scrapling.parser import Selector
|
| 272 |
+
|
| 273 |
+
page = Selector("<html>...</html>")
|
| 274 |
+
```
|
| 275 |
+
まったく同じ方法で動作します!
|
| 276 |
+
|
| 277 |
+
### 非同期Session管理の例
|
| 278 |
+
```python
|
| 279 |
+
import asyncio
|
| 280 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 281 |
+
|
| 282 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession`はコンテキストアウェアで、同期/非同期両方のパターンで動作可能
|
| 283 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
| 284 |
+
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 285 |
+
|
| 286 |
+
# 非同期Sessionの使用
|
| 287 |
+
async with AsyncStealthySession(max_pages=2) as session:
|
| 288 |
+
tasks = []
|
| 289 |
+
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 290 |
+
|
| 291 |
+
for url in urls:
|
| 292 |
+
task = session.fetch(url)
|
| 293 |
+
tasks.append(task)
|
| 294 |
+
|
| 295 |
+
print(session.get_pool_stats()) # オプション - ブラウザタブプールのステータス(ビジー/フリー/エラー)
|
| 296 |
+
results = await asyncio.gather(*tasks)
|
| 297 |
+
print(session.get_pool_stats())
|
| 298 |
+
```
|
| 299 |
+
|
| 300 |
+
## CLIとインタラクティブShell
|
| 301 |
+
|
| 302 |
+
Scraplingには強力なコマンドラインインターフェースが含まれています:
|
| 303 |
+
|
| 304 |
+
[](https://asciinema.org/a/736339)
|
| 305 |
+
|
| 306 |
+
インタラクティブWeb Scraping Shellを起動
|
| 307 |
+
```bash
|
| 308 |
+
scrapling shell
|
| 309 |
+
```
|
| 310 |
+
プログラミングせずに直接ページをファイルに抽出(デフォルトで`body`タグ内のコンテンツを抽出)。出力ファイルが`.txt`で終わる場合、ターゲットのテキストコンテンツが抽出されます。`.md`で終わる場合、HTMLコンテンツのMarkdown表現になります。`.html`で終わる場合、HTMLコンテンツそのものになります。
|
| 311 |
+
```bash
|
| 312 |
+
scrapling extract get 'https://example.com' content.md
|
| 313 |
+
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # CSSセレクタ'#fromSkipToProducts'に一致するすべての要素
|
| 314 |
+
scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
|
| 315 |
+
scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
|
| 316 |
+
```
|
| 317 |
+
|
| 318 |
+
> [!NOTE]
|
| 319 |
+
> MCPサーバーやインタラクティブWeb Scraping Shellなど、他にも多くの追加機能がありますが、このページは簡潔に保ちたいと思います。完全なドキュメントは[こちら](https://scrapling.readthedocs.io/en/latest/)をご覧ください
|
| 320 |
+
|
| 321 |
+
## パフォーマンスベンチマーク
|
| 322 |
+
|
| 323 |
+
Scraplingは強力であるだけでなく、超高速です。以下のベンチマークは、Scraplingのパーサーを他の人気ライブラリの最新バージョンと比較しています。
|
| 324 |
+
|
| 325 |
+
### テキスト抽出速度テスト(5000個のネストされた要素)
|
| 326 |
+
|
| 327 |
+
| # | ライブラリ | 時間(ms) | vs Scrapling |
|
| 328 |
+
|---|:-----------------:|:---------:|:------------:|
|
| 329 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 330 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01 |
|
| 331 |
+
| 3 | Raw Lxml | 2.54 | 1.257 |
|
| 332 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 333 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 334 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 335 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 336 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
### 要素類似性とテキスト検索のパフォーマンス
|
| 340 |
+
|
| 341 |
+
Scraplingの適応型要素検索機能は代替手段を大幅に上回ります:
|
| 342 |
+
|
| 343 |
+
| ライブラリ | 時間(ms) | vs Scrapling |
|
| 344 |
+
|-------------|:---------:|:------------:|
|
| 345 |
+
| Scrapling | 2.39 | 1.0x |
|
| 346 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
> すべてのベンチマークは100回以上の実行の平均を表します。方法論については[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)を参照してください。
|
| 350 |
+
|
| 351 |
+
## インストール
|
| 352 |
+
|
| 353 |
+
ScraplingにはPython 3.10以上が必要です:
|
| 354 |
+
|
| 355 |
+
```bash
|
| 356 |
+
pip install scrapling
|
| 357 |
+
```
|
| 358 |
+
|
| 359 |
+
このインストールにはパーサーエンジンとその依存関係のみが含まれており、Fetcherやコマンドライン依存関係は含まれていません。
|
| 360 |
+
|
| 361 |
+
### オプションの依存関係
|
| 362 |
+
|
| 363 |
+
1. 以下の追加機能、Fetcher、またはそれらのクラスのいずれかを使用する場合は、Fetcherの依存関係とブラウザの依存関係を次のようにインストールする必要があります:
|
| 364 |
+
```bash
|
| 365 |
+
pip install "scrapling[fetchers]"
|
| 366 |
+
|
| 367 |
+
scrapling install # normal install
|
| 368 |
+
scrapling install --force # force reinstall
|
| 369 |
+
```
|
| 370 |
+
|
| 371 |
+
これにより、すべてのブラウザ、およびそれらのシステム依存関係とfingerprint操作依存関係がダウンロードされます。
|
| 372 |
+
|
| 373 |
+
または、コマンドを実行する代わりにコードからインストールすることもできます:
|
| 374 |
+
```python
|
| 375 |
+
from scrapling.cli import install
|
| 376 |
+
|
| 377 |
+
install([], standalone_mode=False) # normal install
|
| 378 |
+
install(["--force"], standalone_mode=False) # force reinstall
|
| 379 |
+
```
|
| 380 |
+
|
| 381 |
+
2. 追加機能:
|
| 382 |
+
- MCPサーバー機能をインストール:
|
| 383 |
+
```bash
|
| 384 |
+
pip install "scrapling[ai]"
|
| 385 |
+
```
|
| 386 |
+
- Shell機能(Web Scraping Shellと`extract`コマンド)をインストール:
|
| 387 |
+
```bash
|
| 388 |
+
pip install "scrapling[shell]"
|
| 389 |
+
```
|
| 390 |
+
- すべてをインストール:
|
| 391 |
+
```bash
|
| 392 |
+
pip install "scrapling[all]"
|
| 393 |
+
```
|
| 394 |
+
これらの追加機能のいずれかの後(まだインストールしていない場合)、`scrapling install`でブラウザの依存関係をインストールする必要があることを忘れないでください
|
| 395 |
+
|
| 396 |
+
### Docker
|
| 397 |
+
DockerHubから次のコマンドですべての追加機能とブラウザを含むDockerイメージをインストールすることもできます:
|
| 398 |
+
```bash
|
| 399 |
+
docker pull pyd4vinci/scrapling
|
| 400 |
+
```
|
| 401 |
+
またはGitHubレジストリからダウンロード:
|
| 402 |
+
```bash
|
| 403 |
+
docker pull ghcr.io/d4vinci/scrapling:latest
|
| 404 |
+
```
|
| 405 |
+
このイメージは、GitHub Actionsとリポジトリのメインブランチを使用して自動的にビルドおよびプッシュされます。
|
| 406 |
+
|
| 407 |
+
## 貢献
|
| 408 |
+
|
| 409 |
+
貢献を歓迎します!始める前に[貢献ガイドライン](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md)をお読みください。
|
| 410 |
+
|
| 411 |
+
## 免責事項
|
| 412 |
+
|
| 413 |
+
> [!CAUTION]
|
| 414 |
+
> このライブラリは教育および研究目的のみで提供されています。このライブラリを使用することにより、地域および国際的なデータスクレイピングおよびプライバシー法に準拠することに同意したものとみなされます。著者および貢献者は、このソフトウェアの誤用について責任を負いません。常にウェブサイトの利用規約とrobots.txtファイルを尊重してください。
|
| 415 |
+
|
| 416 |
+
## ライセンス
|
| 417 |
+
|
| 418 |
+
この作品はBSD-3-Clauseライセンスの下でライセンスされています。
|
| 419 |
+
|
| 420 |
+
## 謝辞
|
| 421 |
+
|
| 422 |
+
このプロジェクトには次から適応されたコードが含まれています:
|
| 423 |
+
- Parsel(BSDライセンス)— [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)サブモジュールに使用
|
| 424 |
+
|
| 425 |
+
---
|
| 426 |
+
<div align="center"><small>Karim Shoairによって❤️でデザインおよび作成されました。</small></div><br>
|
docs/README_RU.md
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- mcp-name: io.github.D4Vinci/Scrapling -->
|
| 2 |
+
|
| 3 |
+
<h1 align="center">
|
| 4 |
+
<a href="https://scrapling.readthedocs.io">
|
| 5 |
+
<picture>
|
| 6 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
|
| 7 |
+
<img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
|
| 8 |
+
</picture>
|
| 9 |
+
</a>
|
| 10 |
+
<br>
|
| 11 |
+
<small>Effortless Web Scraping for the Modern Web</small>
|
| 12 |
+
</h1>
|
| 13 |
+
|
| 14 |
+
<p align="center">
|
| 15 |
+
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
| 16 |
+
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
| 17 |
+
<a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
|
| 18 |
+
<img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
|
| 19 |
+
<a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
|
| 20 |
+
<img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
|
| 21 |
+
<br/>
|
| 22 |
+
<a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
|
| 23 |
+
<img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
|
| 24 |
+
</a>
|
| 25 |
+
<a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
|
| 26 |
+
<img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
|
| 27 |
+
</a>
|
| 28 |
+
<br/>
|
| 29 |
+
<a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
|
| 30 |
+
<img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
|
| 31 |
+
</p>
|
| 32 |
+
|
| 33 |
+
<p align="center">
|
| 34 |
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Методы выбора</strong></a>
|
| 35 |
+
·
|
| 36 |
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Выбор Fetcher</strong></a>
|
| 37 |
+
·
|
| 38 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>Пауки</strong></a>
|
| 39 |
+
·
|
| 40 |
+
<a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>Ротация прокси</strong></a>
|
| 41 |
+
·
|
| 42 |
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
|
| 43 |
+
·
|
| 44 |
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>Режим MCP</strong></a>
|
| 45 |
+
</p>
|
| 46 |
+
|
| 47 |
+
Scrapling — это адаптивный фреймворк для Web Scraping, который берёт на себя всё: от одного запроса до полномасштабного обхода сайтов.
|
| 48 |
+
|
| 49 |
+
Его парсер учится на изменениях сайтов и автоматически перемещает ваши элементы при обновлении страниц. Его Fetcher'ы обходят анти-бот системы вроде Cloudflare Turnstile прямо из коробки. А его Spider-фреймворк позволяет масштабироваться до параллельных, многосессионных обходов с Pause & Resume и автоматической ротацией Proxy — и всё это в нескольких строках Python. Одна библиотека, без компромиссов.
|
| 50 |
+
|
| 51 |
+
Молниеносно быстрые обходы с отслеживанием статистики в реальном времени и Streaming. Создано веб-скраперами для веб-скраперов и обычных пользователей — здесь есть что-то для каждого.
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
| 55 |
+
StealthyFetcher.adaptive = True
|
| 56 |
+
p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Загрузите сайт незаметно!
|
| 57 |
+
products = p.css('.product', auto_save=True) # Скрапьте данные, которые переживут изменения дизайна сайта!
|
| 58 |
+
products = p.css('.product', adaptive=True) # Позже, если структура сайта изменится, передайте `adaptive=True`, чтобы найти их!
|
| 59 |
+
```
|
| 60 |
+
Или масштабируйте до полного обхода
|
| 61 |
+
```python
|
| 62 |
+
from scrapling.spiders import Spider, Response
|
| 63 |
+
|
| 64 |
+
class MySpider(Spider):
|
| 65 |
+
name = "demo"
|
| 66 |
+
start_urls = ["https://example.com/"]
|
| 67 |
+
|
| 68 |
+
async def parse(self, response: Response):
|
| 69 |
+
for item in response.css('.product'):
|
| 70 |
+
yield {"title": item.css('h2::text').get()}
|
| 71 |
+
|
| 72 |
+
MySpider().start()
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# Платиновые спонсоры
|
| 77 |
+
|
| 78 |
+
<i><sub>Хотите стать первой компанией, которая появится здесь? Нажмите [здесь](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>
|
| 79 |
+
# Спонсоры
|
| 80 |
+
|
| 81 |
+
<!-- sponsors -->
|
| 82 |
+
|
| 83 |
+
<a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
|
| 84 |
+
<a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
| 85 |
+
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
|
| 86 |
+
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
| 87 |
+
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
| 88 |
+
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
| 89 |
+
<a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
|
| 90 |
+
<a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
|
| 91 |
+
<a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
| 95 |
+
<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
|
| 96 |
+
<a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
|
| 97 |
+
|
| 98 |
+
<!-- /sponsors -->
|
| 99 |
+
|
| 100 |
+
<i><sub>Хотите показать здесь свою рекламу? Нажмите [здесь](https://github.com/sponsors/D4Vinci) и выберите подходящий вам уровень!</sub></i>
|
| 101 |
+
|
| 102 |
+
---
|
| 103 |
+
|
| 104 |
+
## Ключевые особенности
|
| 105 |
+
|
| 106 |
+
### Spider'ы — полноценный фреймворк для обхода сайтов
|
| 107 |
+
- 🕷️ **Scrapy-подобный Spider API**: Определяйте Spider'ов с `start_urls`, async `parse` callback'ами и объектами `Request`/`Response`.
|
| 108 |
+
- ⚡ **Параллельный обход**: Настраиваемые лимиты параллелизма, ограничение скорости по домену и задержки загрузки.
|
| 109 |
+
- 🔄 **Поддержка нескольких сессий**: Единый интерфейс для HTTP-запросов и скрытных headless-браузеров в одном Spider — маршрутизируйте запросы к разным сессиям по ID.
|
| 110 |
+
- 💾 **Pause & Resume**: Persistence обхода на основе Checkpoint'ов. Нажмите Ctrl+C для мягкой остановки; перезапустите, чтобы продолжить с того места, где вы остановились.
|
| 111 |
+
- 📡 **Режим Streaming**: Стримьте извлечённые элементы по мере их поступления через `async for item in spider.stream()` со статистикой в реальном времени — идеально для UI, конвейеров и длительных обходов.
|
| 112 |
+
- 🛡️ **Обнаружение заблокированных запросов**: Автоматическое обнаружение и повторная отправка заблокированных запросов с настраиваемой логикой.
|
| 113 |
+
- 📦 **Встроенный экспорт**: Экспортируйте результаты через хуки и собственный конвейер или встроенный JSON/JSONL с `result.items.to_json()` / `result.items.to_jsonl()` соответственно.
|
| 114 |
+
|
| 115 |
+
### Продвинутая загрузка сайтов с поддержкой Session
|
| 116 |
+
- **HTTP-запросы**: Быстрые и скрытные HTTP-запросы с классом `Fetcher`. Может имитировать TLS fingerprint браузера, заголовки и использовать HTTP/3.
|
| 117 |
+
- **Динамическая загрузка**: Загрузка динамических сайтов с полной автоматизацией браузера через класс `DynamicFetcher`, поддерживающий Chromium от Playwright и Google Chrome.
|
| 118 |
+
- **Обход анти-ботов**: Расширенные возможности скрытности с `StealthyFetcher` и подмену fingerprint'ов. Может легко обойти все типы Cloudflare Turnstile/Interstitial с помощью автоматизации.
|
| 119 |
+
- **Управление сессиями**: Поддержка постоянных сессий с классами `FetcherSession`, `StealthySession` и `DynamicSession` для управления cookie и состоянием между запросами.
|
| 120 |
+
- **Ротация Proxy**: Встроенный `ProxyRotator` с циклической или пользовательскими стратегиями для всех типов сессий, а также переопределение Proxy для каждого запроса.
|
| 121 |
+
- **Блокировка доменов**: Блокируйте запросы к определённым доменам (и их поддоменам) в браузерных Fetcher'ах.
|
| 122 |
+
- **Поддержка async**: Полная async-поддержка во всех Fetcher'ах и выделенных async-классах сессий.
|
| 123 |
+
|
| 124 |
+
### Адаптивный скрапинг и интеграция с ИИ
|
| 125 |
+
- 🔄 **Умное отслеживание элементов**: Перемещайте элементы после изменений сайта с помощью интеллектуальных алгоритмов подобия.
|
| 126 |
+
- 🎯 **Умный гибкий выбор**: CSS-селекторы, XPath-селекторы, поиск на основе фильтров, текстовый поиск, поиск по регулярным выражениям и многое другое.
|
| 127 |
+
- 🔍 **Поиск похожих элементов**: Автоматически находите элементы, похожие на найденные.
|
| 128 |
+
- 🤖 **MCP-сервер для использования с ИИ**: Встроенный MCP-сервер для Web Scraping с помощью ИИ и извлечения данных. MCP-сервер обладает мощными пользовательскими возможностями, которые используют Scrapling для извлечения целевого контента перед передачей его ИИ (Claude/Cursor/и т.д.), тем самым ускоряя операции и снижая затраты за счёт минимизации использования токенов. ([демо-видео](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
|
| 129 |
+
|
| 130 |
+
### Высокопроизводительная и проверенная в боях архитектура
|
| 131 |
+
- 🚀 **Молниеносная скорость**: Оптимизированная производительность, превосходящая большинство Python-библиотек для скрапинга.
|
| 132 |
+
- 🔋 **Эффективное использование памяти**: Оптимизированные структуры данных и ленивая загрузка для минимального потребления памяти.
|
| 133 |
+
- ⚡ **Быстрая сериализация JSON**: В 10 раз быстрее стандартной библиотеки.
|
| 134 |
+
- 🏗️ **Проверено в боях**: Scrapling имеет не только 92% покрытия тестами и полное покрытие type hints, но и ежедневно использовался сотнями веб-скраперов в течение последнего года.
|
| 135 |
+
|
| 136 |
+
### Удобный для разработчиков/веб-скраперов опыт
|
| 137 |
+
- 🎯 **Интерактивная Web Scraping Shell**: Опциональная встроенная IPython-оболочка с интеграцией Scrapling, ярлыками и новыми инструментами для ускорения разработки скриптов Web Scraping, такими как преобразование curl-запросов в запросы Scrapling и просмотр результатов запросов в браузере.
|
| 138 |
+
- 🚀 **Используйте прямо из терминала**: При желании вы можете использовать Scrapling для скрапинга URL без написания ни одной строки кода!
|
| 139 |
+
- 🛠️ **Богатый API навигации**: Расширенный обход DOM с методами навигации по родителям, братьям и детям.
|
| 140 |
+
- 🧬 **Улучшенная обработка текста**: Встроенные регулярные выражения, методы очистки и оптимизированные операции со строками.
|
| 141 |
+
- 📝 **Автоматическая генерация селекторов**: Генерация надёжных CSS/XPath-селекторов для любого элемента.
|
| 142 |
+
- 🔌 **Знакомый API**: Похож на Scrapy/BeautifulSoup с теми же псевдоэлементами, используемыми в Scrapy/Parsel.
|
| 143 |
+
- 📘 **Полное покрытие типами**: Полные type hints для отличной поддержки IDE и автодополнения кода. Вся кодовая база автоматически проверяется **PyRight** и **MyPy** при каждом изменении.
|
| 144 |
+
- 🔋 **Готовый Docker-образ**: С каждым релизом автоматически создаётся и публикуется Docker-образ, содержащий все браузеры.
|
| 145 |
+
|
| 146 |
+
## Начало работы
|
| 147 |
+
|
| 148 |
+
Давайте кратко покажем, на что способен Scrapling, без глубокого погружения.
|
| 149 |
+
|
| 150 |
+
### Базовое использование
|
| 151 |
+
HTTP-запросы с поддержкой Session
|
| 152 |
+
```python
|
| 153 |
+
from scrapling.fetchers import Fetcher, FetcherSession
|
| 154 |
+
|
| 155 |
+
with FetcherSession(impersonate='chrome') as session: # Используйте последнюю версию TLS fingerprint Chrome
|
| 156 |
+
page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
|
| 157 |
+
quotes = page.css('.quote .text::text').getall()
|
| 158 |
+
|
| 159 |
+
# Или используйте одноразовые запросы
|
| 160 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 161 |
+
quotes = page.css('.quote .text::text').getall()
|
| 162 |
+
```
|
| 163 |
+
Расширенный режим скрытности
|
| 164 |
+
```python
|
| 165 |
+
from scrapling.fetchers import StealthyFetcher, StealthySession
|
| 166 |
+
|
| 167 |
+
with StealthySession(headless=True, solve_cloudflare=True) as session: # Держите браузер открытым, пока не закончите
|
| 168 |
+
page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
|
| 169 |
+
data = page.css('#padded_content a').getall()
|
| 170 |
+
|
| 171 |
+
# Или используйте стиль одноразового запроса — открывает браузер для этого запроса, затем закрывает его после завершения
|
| 172 |
+
page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
|
| 173 |
+
data = page.css('#padded_content a').getall()
|
| 174 |
+
```
|
| 175 |
+
Полная автоматизация браузера
|
| 176 |
+
```python
|
| 177 |
+
from scrapling.fetchers import DynamicFetcher, DynamicSession
|
| 178 |
+
|
| 179 |
+
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Держите браузер открытым, пока не закончите
|
| 180 |
+
page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
|
| 181 |
+
data = page.xpath('//span[@class="text"]/text()').getall() # XPath-селектор, если вы предпочитаете его
|
| 182 |
+
|
| 183 |
+
# Или используйте стиль одноразового запроса — открывает браузер для этого запроса, затем закрывает его после завершения
|
| 184 |
+
page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
|
| 185 |
+
data = page.css('.quote .text::text').getall()
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
### Spider'ы
|
| 189 |
+
Создавайте полноценные обходчики с параллельными запросами, несколькими типами сессий и Pause & Resume:
|
| 190 |
+
```python
|
| 191 |
+
from scrapling.spiders import Spider, Request, Response
|
| 192 |
+
|
| 193 |
+
class QuotesSpider(Spider):
|
| 194 |
+
name = "quotes"
|
| 195 |
+
start_urls = ["https://quotes.toscrape.com/"]
|
| 196 |
+
concurrent_requests = 10
|
| 197 |
+
|
| 198 |
+
async def parse(self, response: Response):
|
| 199 |
+
for quote in response.css('.quote'):
|
| 200 |
+
yield {
|
| 201 |
+
"text": quote.css('.text::text').get(),
|
| 202 |
+
"author": quote.css('.author::text').get(),
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
next_page = response.css('.next a')
|
| 206 |
+
if next_page:
|
| 207 |
+
yield response.follow(next_page[0].attrib['href'])
|
| 208 |
+
|
| 209 |
+
result = QuotesSpider().start()
|
| 210 |
+
print(f"Извлечено {len(result.items)} цитат")
|
| 211 |
+
result.items.to_json("quotes.json")
|
| 212 |
+
```
|
| 213 |
+
Используйте несколько типов сессий в одном Spider:
|
| 214 |
+
```python
|
| 215 |
+
from scrapling.spiders import Spider, Request, Response
|
| 216 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession
|
| 217 |
+
|
| 218 |
+
class MultiSessionSpider(Spider):
|
| 219 |
+
name = "multi"
|
| 220 |
+
start_urls = ["https://example.com/"]
|
| 221 |
+
|
| 222 |
+
def configure_sessions(self, manager):
|
| 223 |
+
manager.add("fast", FetcherSession(impersonate="chrome"))
|
| 224 |
+
manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
|
| 225 |
+
|
| 226 |
+
async def parse(self, response: Response):
|
| 227 |
+
for link in response.css('a::attr(href)').getall():
|
| 228 |
+
# Направляйте защищённые страницы через stealth-сессию
|
| 229 |
+
if "protected" in link:
|
| 230 |
+
yield Request(link, sid="stealth")
|
| 231 |
+
else:
|
| 232 |
+
yield Request(link, sid="fast", callback=self.parse) # явный callback
|
| 233 |
+
```
|
| 234 |
+
Приостанавливайте и возобновляйте длительные обходы с помощью Checkpoint'ов, запуская Spider следующим образом:
|
| 235 |
+
```python
|
| 236 |
+
QuotesSpider(crawldir="./crawl_data").start()
|
| 237 |
+
```
|
| 238 |
+
Нажмите Ctrl+C для мягкой остановки — прогресс сохраняется автоматически. Позже, когда вы снова запустите Spider, передайте тот же `crawldir`, и он продолжит с того места, где остановился.
|
| 239 |
+
|
| 240 |
+
### Продвинутый парсинг и навигация
|
| 241 |
+
```python
|
| 242 |
+
from scrapling.fetchers import Fetcher
|
| 243 |
+
|
| 244 |
+
# Богатый выбор элементов и навигация
|
| 245 |
+
page = Fetcher.get('https://quotes.toscrape.com/')
|
| 246 |
+
|
| 247 |
+
# Получение цитат различными методами выбора
|
| 248 |
+
quotes = page.css('.quote') # CSS-селектор
|
| 249 |
+
quotes = page.xpath('//div[@class="quote"]') # XPath
|
| 250 |
+
quotes = page.find_all('div', {'class': 'quote'}) # В стиле BeautifulSoup
|
| 251 |
+
# То же самое, что
|
| 252 |
+
quotes = page.find_all('div', class_='quote')
|
| 253 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 254 |
+
quotes = page.find_all(class_='quote') # и так далее...
|
| 255 |
+
# Найти элемент по текстовому содержимому
|
| 256 |
+
quotes = page.find_by_text('quote', tag='div')
|
| 257 |
+
|
| 258 |
+
# Продвинутая навигация
|
| 259 |
+
quote_text = page.css('.quote')[0].css('.text::text').get()
|
| 260 |
+
quote_text = page.css('.quote').css('.text::text').getall() # Цепочка селекторов
|
| 261 |
+
first_quote = page.css('.quote')[0]
|
| 262 |
+
author = first_quote.next_sibling.css('.author::text')
|
| 263 |
+
parent_container = first_quote.parent
|
| 264 |
+
|
| 265 |
+
# Связи элементов и подобие
|
| 266 |
+
similar_elements = first_quote.find_similar()
|
| 267 |
+
below_elements = first_quote.below_elements()
|
| 268 |
+
```
|
| 269 |
+
Вы можете использовать парсер напрямую, если не хотите загружать сайты, как показано ниже:
|
| 270 |
+
```python
|
| 271 |
+
from scrapling.parser import Selector
|
| 272 |
+
|
| 273 |
+
page = Selector("<html>...</html>")
|
| 274 |
+
```
|
| 275 |
+
И он работает точно так же!
|
| 276 |
+
|
| 277 |
+
### Примеры async Session
|
| 278 |
+
```python
|
| 279 |
+
import asyncio
|
| 280 |
+
from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
|
| 281 |
+
|
| 282 |
+
async with FetcherSession(http3=True) as session: # `FetcherSession` контекстно-осведомлён и может работать как в sync, так и в async-режимах
|
| 283 |
+
page1 = session.get('https://quotes.toscrape.com/')
|
| 284 |
+
page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
|
| 285 |
+
|
| 286 |
+
# Использование async-сессии
|
| 287 |
+
async with AsyncStealthySession(max_pages=2) as session:
|
| 288 |
+
tasks = []
|
| 289 |
+
urls = ['https://example.com/page1', 'https://example.com/page2']
|
| 290 |
+
|
| 291 |
+
for url in urls:
|
| 292 |
+
task = session.fetch(url)
|
| 293 |
+
tasks.append(task)
|
| 294 |
+
|
| 295 |
+
print(session.get_pool_stats()) # Опционально — статус пула вкладок браузера (занят/свободен/ошибка)
|
| 296 |
+
results = await asyncio.gather(*tasks)
|
| 297 |
+
print(session.get_pool_stats())
|
| 298 |
+
```
|
| 299 |
+
|
| 300 |
+
## CLI и интерактивная Shell
|
| 301 |
+
|
| 302 |
+
Scrapling включает мощный интерфейс командной строки:
|
| 303 |
+
|
| 304 |
+
[](https://asciinema.org/a/736339)
|
| 305 |
+
|
| 306 |
+
Запустить интерактивную Web Scraping Shell
|
| 307 |
+
```bash
|
| 308 |
+
scrapling shell
|
| 309 |
+
```
|
| 310 |
+
Извлечь страницы в файл напрямую без программирования (по умолчанию извлекает содержимое внутри тега `body`). Если выходной файл заканчивается на `.txt`, будет извлечено текстовое содержимое цели. Если заканчивается на `.md`, это будет Markdown-представление HTML-содержимого; если заканчивается на `.html`, это будет само HTML-содержимое.
|
| 311 |
+
```bash
|
| 312 |
+
scrapling extract get 'https://example.com' content.md
|
| 313 |
+
scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Все элементы, соответствующие CSS-селектору '#fromSkipToProducts'
|
| 314 |
+
scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
|
| 315 |
+
scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
|
| 316 |
+
```
|
| 317 |
+
|
| 318 |
+
> [!NOTE]
|
| 319 |
+
> Есть множество дополнительных возможностей, но мы хотим сохранить эту страницу краткой, включая MCP-сервер и интерактивную Web Scraping Shell. Ознакомьтесь с полной документацией [здесь](https://scrapling.readthedocs.io/en/latest/)
|
| 320 |
+
|
| 321 |
+
## Тесты производительности
|
| 322 |
+
|
| 323 |
+
Scrapling не только мощный — он ещё и невероятно быстрый. Следующие тесты производительности сравнивают парсер Scrapling с последними версиями других популярных библиотек.
|
| 324 |
+
|
| 325 |
+
### Тест скорости извлечения текста (5000 вложенных элементов)
|
| 326 |
+
|
| 327 |
+
| # | Библиотека | Время (мс) | vs Scrapling |
|
| 328 |
+
|---|:-----------------:|:----------:|:------------:|
|
| 329 |
+
| 1 | Scrapling | 2.02 | 1.0x |
|
| 330 |
+
| 2 | Parsel/Scrapy | 2.04 | 1.01 |
|
| 331 |
+
| 3 | Raw Lxml | 2.54 | 1.257 |
|
| 332 |
+
| 4 | PyQuery | 24.17 | ~12x |
|
| 333 |
+
| 5 | Selectolax | 82.63 | ~41x |
|
| 334 |
+
| 6 | MechanicalSoup | 1549.71 | ~767.1x |
|
| 335 |
+
| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
|
| 336 |
+
| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
### Производительность подобия элементов и текстового поиска
|
| 340 |
+
|
| 341 |
+
Возможности адаптивного поиска элементов Scrapling значительно превосходят альтернативы:
|
| 342 |
+
|
| 343 |
+
| Библиотека | Время (мс) | vs Scrapling |
|
| 344 |
+
|-------------|:----------:|:------------:|
|
| 345 |
+
| Scrapling | 2.39 | 1.0x |
|
| 346 |
+
| AutoScraper | 12.45 | 5.209x |
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
> Все тесты производительности представляют собой средние значения более 100 запусков. См. [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) для методологии.
|
| 350 |
+
|
| 351 |
+
## Установка
|
| 352 |
+
|
| 353 |
+
Scrapling требует Python 3.10 или выше:
|
| 354 |
+
|
| 355 |
+
```bash
|
| 356 |
+
pip install scrapling
|
| 357 |
+
```
|
| 358 |
+
|
| 359 |
+
Эта установка включает только движок парсера и его зависимости, без каких-либо Fetcher'ов или зависимостей командной строки.
|
| 360 |
+
|
| 361 |
+
### Опциональные зависимости
|
| 362 |
+
|
| 363 |
+
1. Если вы собираетесь использовать какие-либо из дополнительных возможностей ниже, Fetcher'ы или их классы, вам необходимо установить зависимости Fetcher'ов и браузеров следующим образом:
|
| 364 |
+
```bash
|
| 365 |
+
pip install "scrapling[fetchers]"
|
| 366 |
+
|
| 367 |
+
scrapling install # normal install
|
| 368 |
+
scrapling install --force # force reinstall
|
| 369 |
+
```
|
| 370 |
+
|
| 371 |
+
Это загрузит все браузеры вместе с их системными зависимостями и зависимостями для манипуляции fingerprint'ами.
|
| 372 |
+
|
| 373 |
+
Или вы можете установить их из кода вместо выполнения команды:
|
| 374 |
+
```python
|
| 375 |
+
from scrapling.cli import install
|
| 376 |
+
|
| 377 |
+
install([], standalone_mode=False) # normal install
|
| 378 |
+
install(["--force"], standalone_mode=False) # force reinstall
|
| 379 |
+
```
|
| 380 |
+
|
| 381 |
+
2. Дополнительные возможности:
|
| 382 |
+
- Установить функцию MCP-сервера:
|
| 383 |
+
```bash
|
| 384 |
+
pip install "scrapling[ai]"
|
| 385 |
+
```
|
| 386 |
+
- Установить функции Shell (Web Scraping Shell и команда `extract`):
|
| 387 |
+
```bash
|
| 388 |
+
pip install "scrapling[shell]"
|
| 389 |
+
```
|
| 390 |
+
- Установить всё:
|
| 391 |
+
```bash
|
| 392 |
+
pip install "scrapling[all]"
|
| 393 |
+
```
|
| 394 |
+
Помните, что вам нужно установить зависимости браузеров с помощью `scrapling install` после любого из этих дополнений (если вы ещё этого не сделали)
|
| 395 |
+
|
| 396 |
+
### Docker
|
| 397 |
+
Вы также можете установить Docker-образ со всеми дополнениями и браузерами с помощью следующей команды из DockerHub:
|
| 398 |
+
```bash
|
| 399 |
+
docker pull pyd4vinci/scrapling
|
| 400 |
+
```
|
| 401 |
+
Или скачайте его из реестра GitHub:
|
| 402 |
+
```bash
|
| 403 |
+
docker pull ghcr.io/d4vinci/scrapling:latest
|
| 404 |
+
```
|
| 405 |
+
Этот образ автоматически создаётся и публикуется с помощью GitHub Actions и основной ветки репозитория.
|
| 406 |
+
|
| 407 |
+
## Участие в разработке
|
| 408 |
+
|
| 409 |
+
Мы приветствуем участие! Пожалуйста, прочитайте наши [руководства по участию в разработке](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) перед началом работы.
|
| 410 |
+
|
| 411 |
+
## Отказ от ответственности
|
| 412 |
+
|
| 413 |
+
> [!CAUTION]
|
| 414 |
+
> Эта библиотека предоставляется только в образовательных и исследовательских целях. Используя эту библиотеку, вы соглашаетесь соблюдать местные и международные законы о скрапинге данных и конфиденциальности. Авторы и участники не несут ответственности за любое неправомерное использование этого программного обеспечения. Всегда уважайте условия обслуживания веб-сайтов и файлы robots.txt.
|
| 415 |
+
|
| 416 |
+
## Лицензия
|
| 417 |
+
|
| 418 |
+
Эта работа лицензирована по лицензии BSD-3-Clause.
|
| 419 |
+
|
| 420 |
+
## Благодарности
|
| 421 |
+
|
| 422 |
+
Этот проект включает код, адаптированный из:
|
| 423 |
+
- Parsel (лицензия BSD) — Используется для подмодуля [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
|
| 424 |
+
|
| 425 |
+
---
|
| 426 |
+
<div align="center"><small>Разработано и создано с ❤️ Каримом Шоаиром.</small></div><br>
|
docs/ai/mcp-server.md
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Scrapling MCP Server Guide
|
| 2 |
+
|
| 3 |
+
<iframe width="560" height="315" src="https://www.youtube.com/embed/qyFk3ZNwOxE?si=3FHzgcYCb66iJ6e3" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
|
| 4 |
+
|
| 5 |
+
The **Scrapling MCP Server** is a new feature that brings Scrapling's powerful Web Scraping capabilities directly to your favorite AI chatbot or AI agent. This integration allows you to scrape websites, extract data, and bypass anti-bot protections conversationally through Claude's AI interface or any interface that supports MCP.
|
| 6 |
+
|
| 7 |
+
## Features
|
| 8 |
+
|
| 9 |
+
The Scrapling MCP Server provides six powerful tools for web scraping:
|
| 10 |
+
|
| 11 |
+
### 🚀 Basic HTTP Scraping
|
| 12 |
+
- **`get`**: Fast HTTP requests with browser fingerprint impersonation, generating real browser headers matching the TLS version, HTTP/3, and more!
|
| 13 |
+
- **`bulk_get`**: An async version of the above tool that allows scraping of multiple URLs at the same time!
|
| 14 |
+
|
| 15 |
+
### 🌐 Dynamic Content Scraping
|
| 16 |
+
- **`fetch`**: Rapidly fetch dynamic content with Chromium/Chrome browser with complete control over the request/browser, and more!
|
| 17 |
+
- **`bulk_fetch`**: An async version of the above tool that allows scraping of multiple URLs in different browser tabs at the same time!
|
| 18 |
+
|
| 19 |
+
### 🔒 Stealth Scraping
|
| 20 |
+
- **`stealthy_fetch`**: Uses our Stealthy browser to bypass Cloudflare Turnstile/Interstitial and other anti-bot systems with complete control over the request/browser!
|
| 21 |
+
- **`bulk_stealthy_fetch`**: An async version of the above tool that allows stealth scraping of multiple URLs in different browser tabs at the same time!
|
| 22 |
+
|
| 23 |
+
### Key Capabilities
|
| 24 |
+
- **Smart Content Extraction**: Convert web pages/elements to Markdown, HTML, or extract a clean version of the text content
|
| 25 |
+
- **CSS Selector Support**: Use the Scrapling engine to target specific elements with precision before handing the content to the AI
|
| 26 |
+
- **Anti-Bot Bypass**: Handle Cloudflare Turnstile, Interstitial, and other protections
|
| 27 |
+
- **Proxy Support**: Use proxies for anonymity and geo-targeting
|
| 28 |
+
- **Browser Impersonation**: Mimic real browsers with TLS fingerprinting, real browser headers matching that version, and more
|
| 29 |
+
- **Parallel Processing**: Scrape multiple URLs concurrently for efficiency
|
| 30 |
+
|
| 31 |
+
#### But why use Scrapling MCP Server instead of other available tools?
|
| 32 |
+
|
| 33 |
+
Aside from its stealth capabilities and ability to bypass Cloudflare Turnstile/Interstitial, Scrapling's server is the only one that lets you select specific elements to pass to the AI, saving a lot of time and tokens!
|
| 34 |
+
|
| 35 |
+
The way other servers work is that they extract the content, then pass it all to the AI to extract the fields you want. This causes the AI to consume far more tokens than needed (from irrelevant content). Scrapling solves this problem by allowing you to pass a CSS selector to narrow down the content you want before passing it to the AI, which makes the whole process much faster and more efficient.
|
| 36 |
+
|
| 37 |
+
If you don't know how to write/use CSS selectors, don't worry. You can tell the AI in the prompt to write selectors to match possible fields for you and watch it try different combinations until it finds the right one, as we will show in the examples section.
|
| 38 |
+
|
| 39 |
+
## Installation
|
| 40 |
+
|
| 41 |
+
Install Scrapling with MCP Support, then double-check that the browser dependencies are installed.
|
| 42 |
+
|
| 43 |
+
```bash
|
| 44 |
+
# Install Scrapling with MCP server dependencies
|
| 45 |
+
pip install "scrapling[ai]"
|
| 46 |
+
|
| 47 |
+
# Install browser dependencies
|
| 48 |
+
scrapling install
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
Or use the Docker image directly from the Docker registry:
|
| 52 |
+
```bash
|
| 53 |
+
docker pull pyd4vinci/scrapling
|
| 54 |
+
```
|
| 55 |
+
Or download it from the GitHub registry:
|
| 56 |
+
```bash
|
| 57 |
+
docker pull ghcr.io/d4vinci/scrapling:latest
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
## Setting up the MCP Server
|
| 61 |
+
|
| 62 |
+
Here we will explain how to add Scrapling MCP Server to [Claude Desktop](https://claude.ai/download) and [Claude Code](https://www.anthropic.com/claude-code), but the same logic applies to any other chatbot that supports MCP:
|
| 63 |
+
|
| 64 |
+
### Claude Desktop
|
| 65 |
+
|
| 66 |
+
1. Open Claude Desktop
|
| 67 |
+
2. Click the hamburger menu (☰) at the top left → Settings → Developer → Edit Config
|
| 68 |
+
3. Add the Scrapling MCP server configuration:
|
| 69 |
+
```json
|
| 70 |
+
"ScraplingServer": {
|
| 71 |
+
"command": "scrapling",
|
| 72 |
+
"args": [
|
| 73 |
+
"mcp"
|
| 74 |
+
]
|
| 75 |
+
}
|
| 76 |
+
```
|
| 77 |
+
If that's the first MCP server you're adding, set the content of the file to this:
|
| 78 |
+
```json
|
| 79 |
+
{
|
| 80 |
+
"mcpServers": {
|
| 81 |
+
"ScraplingServer": {
|
| 82 |
+
"command": "scrapling",
|
| 83 |
+
"args": [
|
| 84 |
+
"mcp"
|
| 85 |
+
]
|
| 86 |
+
}
|
| 87 |
+
}
|
| 88 |
+
}
|
| 89 |
+
```
|
| 90 |
+
As per the [official article](https://modelcontextprotocol.io/quickstart/user), this action either creates a new configuration file if none exists or opens your existing configuration. The file is located at
|
| 91 |
+
|
| 92 |
+
1. **MacOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
|
| 93 |
+
2. **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
|
| 94 |
+
|
| 95 |
+
To ensure it's working, use the full path to the `scrapling` executable. Open the terminal and execute the following command:
|
| 96 |
+
|
| 97 |
+
1. **MacOS**: `which scrapling`
|
| 98 |
+
2. **Windows**: `where scrapling`
|
| 99 |
+
|
| 100 |
+
For me, on my Mac, it returned `/Users/<MyUsername>/.venv/bin/scrapling`, so the config I used in the end is:
|
| 101 |
+
```json
|
| 102 |
+
{
|
| 103 |
+
"mcpServers": {
|
| 104 |
+
"ScraplingServer": {
|
| 105 |
+
"command": "/Users/<MyUsername>/.venv/bin/scrapling",
|
| 106 |
+
"args": [
|
| 107 |
+
"mcp"
|
| 108 |
+
]
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
}
|
| 112 |
+
```
|
| 113 |
+
#### Docker
|
| 114 |
+
If you are using the Docker image, then it would be something like
|
| 115 |
+
```json
|
| 116 |
+
{
|
| 117 |
+
"mcpServers": {
|
| 118 |
+
"ScraplingServer": {
|
| 119 |
+
"command": "docker",
|
| 120 |
+
"args": [
|
| 121 |
+
"run", "-i", "--rm", "scrapling", "mcp"
|
| 122 |
+
]
|
| 123 |
+
}
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
The same logic applies to [Cursor](https://cursor.com/docs/context/mcp), [WindSurf](https://windsurf.com/university/tutorials/configuring-first-mcp-server), and others.
|
| 129 |
+
|
| 130 |
+
### Claude Code
|
| 131 |
+
Here it's much simpler to do. If you have [Claude Code](https://www.anthropic.com/claude-code) installed, open the terminal and execute the following command:
|
| 132 |
+
|
| 133 |
+
```bash
|
| 134 |
+
claude mcp add ScraplingServer "/Users/<MyUsername>/.venv/bin/scrapling" mcp
|
| 135 |
+
```
|
| 136 |
+
Same as above, to get Scrapling's executable path, open the terminal and execute the following command:
|
| 137 |
+
|
| 138 |
+
1. **MacOS**: `which scrapling`
|
| 139 |
+
2. **Windows**: `where scrapling`
|
| 140 |
+
|
| 141 |
+
Here's the main article from Anthropic on [how to add MCP servers to Claude code](https://docs.anthropic.com/en/docs/claude-code/mcp#option-1%3A-add-a-local-stdio-server) for further details.
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
Then, after you've added the server, you need to completely quit and restart the app you used above. In Claude Desktop, you should see an MCP server indicator (🔧) in the bottom-right corner of the chat input or see `ScraplingServer` in the `Search and tools` dropdown in the chat input box.
|
| 145 |
+
|
| 146 |
+
### Streamable HTTP
|
| 147 |
+
As of version 0.3.6, we have added the ability to make the MCP server use the 'Streamable HTTP' transport mode instead of the traditional 'stdio' transport.
|
| 148 |
+
|
| 149 |
+
So instead of using the following command (the 'stdio' one):
|
| 150 |
+
```bash
|
| 151 |
+
scrapling mcp
|
| 152 |
+
```
|
| 153 |
+
Use the following to enable 'Streamable HTTP' transport mode:
|
| 154 |
+
```bash
|
| 155 |
+
scrapling mcp --http
|
| 156 |
+
```
|
| 157 |
+
By default, the server listens on host '0.0.0.0' and port 8000; both can be configured as shown below:
|
| 158 |
+
```bash
|
| 159 |
+
scrapling mcp --http --host '127.0.0.1' --port 8000
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
## Examples
|
| 163 |
+
|
| 164 |
+
Now we will show you some examples of prompts we used while testing the MCP server, but you are probably more creative and better at prompt engineering than we are :)
|
| 165 |
+
|
| 166 |
+
We will gradually go from simple prompts to more complex ones. We will use Claude Desktop for the examples, but the same logic applies to the rest, of course.
|
| 167 |
+
|
| 168 |
+
1. **Basic Web Scraping**
|
| 169 |
+
|
| 170 |
+
Extract the main content from a webpage as Markdown:
|
| 171 |
+
|
| 172 |
+
```
|
| 173 |
+
Scrape the main content from https://example.com and convert it to markdown format.
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
Claude will use the `get` tool to fetch the page and return clean, readable content. If it fails, it will continue retrying every second for 3 attempts, unless you instruct it otherwise. If it fails to retrieve content for any reason, such as protection or if it's a dynamic website, it will automatically try the other tools. If Claude didn't do that automatically for some reason, you can add that to the prompt.
|
| 177 |
+
|
| 178 |
+
A more optimized version of the same prompt would be:
|
| 179 |
+
```
|
| 180 |
+
Use regular requests to scrape the main content from https://example.com and convert it to markdown format.
|
| 181 |
+
```
|
| 182 |
+
This tells Claude which tool to use here, so it doesn't have to guess. Sometimes it will start using normal requests on its own, and at other times, it will assume browsers are better suited for this website without any apparent reason. As a rule of thumb, you should always tell Claude which tool to use to save time and money and get consistent results.
|
| 183 |
+
|
| 184 |
+
2. **Targeted Data Extraction**
|
| 185 |
+
|
| 186 |
+
Extract specific elements using CSS selectors:
|
| 187 |
+
|
| 188 |
+
```
|
| 189 |
+
Get all product titles from https://shop.example.com using the CSS selector '.product-title'. If the request fails, retry up to 5 times every 10 seconds.
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
The server will extract only the elements matching your selector and return them as a structured list. Notice I told it to set the tool to try up to 5 times in case the website has connection issues, but the default setting should be fine for most cases.
|
| 193 |
+
|
| 194 |
+
3. **E-commerce Data Collection**
|
| 195 |
+
|
| 196 |
+
Another example of a bit more complex prompt:
|
| 197 |
+
```
|
| 198 |
+
Extract product information from these e-commerce URLs using bulk browser fetches:
|
| 199 |
+
- https://shop1.com/product-a
|
| 200 |
+
- https://shop2.com/product-b
|
| 201 |
+
- https://shop3.com/product-c
|
| 202 |
+
|
| 203 |
+
Get the product names, prices, and descriptions from each page.
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
Claude will use `bulk_fetch` to concurrently scrape all URLs, then analyze the extracted data.
|
| 207 |
+
|
| 208 |
+
4. **More advanced workflow**
|
| 209 |
+
|
| 210 |
+
Let's say I want to get all the action games available on PlayStation's store first page right now. I can use the following prompt to do that:
|
| 211 |
+
```
|
| 212 |
+
Extract the URLs of all games in this page, then do a bulk request to them and return a list of all action games: https://store.playstation.com/en-us/pages/browse
|
| 213 |
+
```
|
| 214 |
+
Note that I instructed it to use a bulk request for all the URLs collected. If I hadn't mentioned it, sometimes it works as intended, and other times it makes a separate request to each URL, which takes significantly longer. This prompt takes approximately one minute to complete.
|
| 215 |
+
|
| 216 |
+
However, because I wasn't specific enough, it actually used the `stealthy_fetch` here and the `bulk_stealthy_fetch` in the second step, which unnecessarily consumed a large number of tokens. A better prompt would be:
|
| 217 |
+
```
|
| 218 |
+
Use normal requests to extract the URLs of all games in this page, then do a bulk request to them and return a list of all action games: https://store.playstation.com/en-us/pages/browse
|
| 219 |
+
```
|
| 220 |
+
And if you know how to write CSS selectors, you can instruct Claude to apply the selectors to the elements you want, and it will nearly complete the task immediately.
|
| 221 |
+
```
|
| 222 |
+
Use normal requests to extract the URLs of all games on the page below, then perform a bulk request to them and return a list of all action games.
|
| 223 |
+
The selector for games in the first page is `[href*="/concept/"]` and the selector for the genre in the second request is `[data-qa="gameInfo#releaseInformation#genre-value"]`.
|
| 224 |
+
|
| 225 |
+
URL: https://store.playstation.com/en-us/pages/browse
|
| 226 |
+
```
|
| 227 |
+
|
| 228 |
+
5. **Get data from a website with Cloudflare protection**
|
| 229 |
+
|
| 230 |
+
If you think the website you are targeting has Cloudflare protection, tell Claude instead of letting it discover it on its own.
|
| 231 |
+
```
|
| 232 |
+
What's the price of this product? Be cautious, as it utilizes Cloudflare's Turnstile protection. Make the browser visible while you work.
|
| 233 |
+
|
| 234 |
+
https://ao.com/product/oo101uk-ninja-woodfire-outdoor-pizza-oven-brown-99357-685.aspx
|
| 235 |
+
```
|
| 236 |
+
|
| 237 |
+
6. **Long workflow**
|
| 238 |
+
|
| 239 |
+
You can, for example, use a prompt like this:
|
| 240 |
+
```
|
| 241 |
+
Extract all product URLs for the following category, then return the prices and details for the first 3 products.
|
| 242 |
+
|
| 243 |
+
https://www.arnotts.ie/furniture/bedroom/bed-frames/
|
| 244 |
+
```
|
| 245 |
+
But a better prompt would be:
|
| 246 |
+
```
|
| 247 |
+
Go to the following category URL and extract all product URLs using the CSS selector "a". Then, fetch the first 3 product pages in parallel and extract each product’s price and details.
|
| 248 |
+
|
| 249 |
+
Keep the output in markdown format to reduce irrelevant content.
|
| 250 |
+
|
| 251 |
+
Category URL:
|
| 252 |
+
https://www.arnotts.ie/furniture/bedroom/bed-frames/
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
And so on, you get the idea. Your creativity is the key here.
|
| 256 |
+
|
| 257 |
+
## Best Practices
|
| 258 |
+
|
| 259 |
+
Here is some technical advice for you.
|
| 260 |
+
|
| 261 |
+
### 1. Choose the Right Tool
|
| 262 |
+
- **`get`**: Fast, simple websites
|
| 263 |
+
- **`fetch`**: Sites with JavaScript/dynamic content
|
| 264 |
+
- **`stealthy_fetch`**: Protected sites, Cloudflare, anti-bot systems
|
| 265 |
+
|
| 266 |
+
### 2. Optimize Performance
|
| 267 |
+
- Use bulk tools for multiple URLs
|
| 268 |
+
- Disable unnecessary resources
|
| 269 |
+
- Set appropriate timeouts
|
| 270 |
+
- Use CSS selectors for targeted extraction
|
| 271 |
+
|
| 272 |
+
### 3. Handle Dynamic Content
|
| 273 |
+
- Use `network_idle` for SPAs
|
| 274 |
+
- Set `wait_selector` for specific elements
|
| 275 |
+
- Increase timeout for slow-loading sites
|
| 276 |
+
|
| 277 |
+
### 4. Data Quality
|
| 278 |
+
- Use `main_content_only=true` to avoid navigation/ads
|
| 279 |
+
- Choose an appropriate `extraction_type` for your use case
|
| 280 |
+
|
| 281 |
+
## Legal and Ethical Considerations
|
| 282 |
+
|
| 283 |
+
⚠️ **Important Guidelines:**
|
| 284 |
+
|
| 285 |
+
- **Check robots.txt**: Visit `https://website.com/robots.txt` to see scraping rules
|
| 286 |
+
- **Respect rate limits**: Don't overwhelm servers with requests
|
| 287 |
+
- **Terms of Service**: Read and comply with website terms
|
| 288 |
+
- **Copyright**: Respect intellectual property rights
|
| 289 |
+
- **Privacy**: Be mindful of personal data protection laws
|
| 290 |
+
- **Commercial use**: Ensure you have permission for business purposes
|
| 291 |
+
|
| 292 |
+
---
|
| 293 |
+
|
| 294 |
+
*Built with ❤️ by the Scrapling team. Happy scraping!*
|
docs/api-reference/custom-types.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
search:
|
| 3 |
+
exclude: true
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Custom Types API Reference
|
| 7 |
+
|
| 8 |
+
Here's the reference information for all custom types of classes Scrapling implemented, with all their parameters, attributes, and methods.
|
| 9 |
+
|
| 10 |
+
You can import all of them directly like below:
|
| 11 |
+
|
| 12 |
+
```python
|
| 13 |
+
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## ::: scrapling.core.custom_types.TextHandler
|
| 17 |
+
handler: python
|
| 18 |
+
:docstring:
|
| 19 |
+
|
| 20 |
+
## ::: scrapling.core.custom_types.TextHandlers
|
| 21 |
+
handler: python
|
| 22 |
+
:docstring:
|
| 23 |
+
|
| 24 |
+
## ::: scrapling.core.custom_types.AttributesHandler
|
| 25 |
+
handler: python
|
| 26 |
+
:docstring:
|
docs/api-reference/fetchers.md
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
search:
|
| 3 |
+
exclude: true
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Fetchers Classes
|
| 7 |
+
|
| 8 |
+
Here's the reference information for all fetcher-type classes' parameters, attributes, and methods.
|
| 9 |
+
|
| 10 |
+
You can import all of them directly like below:
|
| 11 |
+
|
| 12 |
+
```python
|
| 13 |
+
from scrapling.fetchers import (
|
| 14 |
+
Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher,
|
| 15 |
+
FetcherSession, AsyncStealthySession, StealthySession, DynamicSession, AsyncDynamicSession
|
| 16 |
+
)
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
## ::: scrapling.fetchers.Fetcher
|
| 20 |
+
handler: python
|
| 21 |
+
:docstring:
|
| 22 |
+
|
| 23 |
+
## ::: scrapling.fetchers.AsyncFetcher
|
| 24 |
+
handler: python
|
| 25 |
+
:docstring:
|
| 26 |
+
|
| 27 |
+
## ::: scrapling.fetchers.DynamicFetcher
|
| 28 |
+
handler: python
|
| 29 |
+
:docstring:
|
| 30 |
+
|
| 31 |
+
## ::: scrapling.fetchers.StealthyFetcher
|
| 32 |
+
handler: python
|
| 33 |
+
:docstring:
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
## Session Classes
|
| 37 |
+
|
| 38 |
+
### HTTP Sessions
|
| 39 |
+
|
| 40 |
+
## ::: scrapling.fetchers.FetcherSession
|
| 41 |
+
handler: python
|
| 42 |
+
:docstring:
|
| 43 |
+
|
| 44 |
+
### Stealth Sessions
|
| 45 |
+
|
| 46 |
+
## ::: scrapling.fetchers.StealthySession
|
| 47 |
+
handler: python
|
| 48 |
+
:docstring:
|
| 49 |
+
|
| 50 |
+
## ::: scrapling.fetchers.AsyncStealthySession
|
| 51 |
+
handler: python
|
| 52 |
+
:docstring:
|
| 53 |
+
|
| 54 |
+
### Dynamic Sessions
|
| 55 |
+
|
| 56 |
+
## ::: scrapling.fetchers.DynamicSession
|
| 57 |
+
handler: python
|
| 58 |
+
:docstring:
|
| 59 |
+
|
| 60 |
+
## ::: scrapling.fetchers.AsyncDynamicSession
|
| 61 |
+
handler: python
|
| 62 |
+
:docstring:
|
| 63 |
+
|
docs/api-reference/mcp-server.md
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
search:
|
| 3 |
+
exclude: true
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# MCP Server API Reference
|
| 7 |
+
|
| 8 |
+
The **Scrapling MCP Server** provides six powerful tools for web scraping through the Model Context Protocol (MCP). This server integrates Scrapling's capabilities directly into AI chatbots and agents, allowing conversational web scraping with advanced anti-bot bypass features.
|
| 9 |
+
|
| 10 |
+
You can start the MCP server by running:
|
| 11 |
+
|
| 12 |
+
```bash
|
| 13 |
+
scrapling mcp
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
Or import the server class directly:
|
| 17 |
+
|
| 18 |
+
```python
|
| 19 |
+
from scrapling.core.ai import ScraplingMCPServer
|
| 20 |
+
|
| 21 |
+
server = ScraplingMCPServer()
|
| 22 |
+
server.serve(http=False, host="0.0.0.0", port=8000)
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
## Response Model
|
| 26 |
+
|
| 27 |
+
The standardized response structure that's returned by all MCP server tools:
|
| 28 |
+
|
| 29 |
+
## ::: scrapling.core.ai.ResponseModel
|
| 30 |
+
handler: python
|
| 31 |
+
:docstring:
|
| 32 |
+
|
| 33 |
+
## MCP Server Class
|
| 34 |
+
|
| 35 |
+
The main MCP server class that provides all web scraping tools:
|
| 36 |
+
|
| 37 |
+
## ::: scrapling.core.ai.ScraplingMCPServer
|
| 38 |
+
handler: python
|
| 39 |
+
:docstring:
|
docs/api-reference/proxy-rotation.md
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
search:
|
| 3 |
+
exclude: true
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Proxy Rotation
|
| 7 |
+
|
| 8 |
+
The `ProxyRotator` class provides thread-safe proxy rotation for any fetcher or session.
|
| 9 |
+
|
| 10 |
+
You can import it directly like below:
|
| 11 |
+
|
| 12 |
+
```python
|
| 13 |
+
from scrapling.fetchers import ProxyRotator
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## ::: scrapling.engines.toolbelt.proxy_rotation.ProxyRotator
|
| 17 |
+
handler: python
|
| 18 |
+
:docstring:
|
docs/api-reference/response.md
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
search:
|
| 3 |
+
exclude: true
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Response Class
|
| 7 |
+
|
| 8 |
+
The `Response` class wraps HTTP responses returned by all fetchers, providing access to status, headers, body, cookies, and a `Selector` for parsing.
|
| 9 |
+
|
| 10 |
+
You can import the `Response` class like below:
|
| 11 |
+
|
| 12 |
+
```python
|
| 13 |
+
from scrapling.engines.toolbelt.custom import Response
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## ::: scrapling.engines.toolbelt.custom.Response
|
| 17 |
+
handler: python
|
| 18 |
+
:docstring:
|
docs/api-reference/selector.md
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
search:
|
| 3 |
+
exclude: true
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Selector Class
|
| 7 |
+
|
| 8 |
+
The `Selector` class is the core parsing engine in Scrapling that provides HTML parsing and element selection capabilities.
|
| 9 |
+
|
| 10 |
+
Here's the reference information for the `Selector` class, with all its parameters, attributes, and methods.
|
| 11 |
+
|
| 12 |
+
You can import the `Selector` class directly from `scrapling`:
|
| 13 |
+
|
| 14 |
+
```python
|
| 15 |
+
from scrapling.parser import Selector
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
## ::: scrapling.parser.Selector
|
| 19 |
+
handler: python
|
| 20 |
+
:docstring:
|
| 21 |
+
|
| 22 |
+
## ::: scrapling.parser.Selectors
|
| 23 |
+
handler: python
|
| 24 |
+
:docstring:
|
| 25 |
+
|
docs/api-reference/spiders.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
search:
|
| 3 |
+
exclude: true
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Spider Classes
|
| 7 |
+
|
| 8 |
+
Here's the reference information for the spider framework classes' parameters, attributes, and methods.
|
| 9 |
+
|
| 10 |
+
You can import them directly like below:
|
| 11 |
+
|
| 12 |
+
```python
|
| 13 |
+
from scrapling.spiders import Spider, Request, CrawlResult, SessionManager, Response
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## ::: scrapling.spiders.Spider
|
| 17 |
+
handler: python
|
| 18 |
+
:docstring:
|
| 19 |
+
|
| 20 |
+
## ::: scrapling.spiders.Request
|
| 21 |
+
handler: python
|
| 22 |
+
:docstring:
|
| 23 |
+
|
| 24 |
+
## Result Classes
|
| 25 |
+
|
| 26 |
+
## ::: scrapling.spiders.result.CrawlResult
|
| 27 |
+
handler: python
|
| 28 |
+
:docstring:
|
| 29 |
+
|
| 30 |
+
## ::: scrapling.spiders.result.CrawlStats
|
| 31 |
+
handler: python
|
| 32 |
+
:docstring:
|
| 33 |
+
|
| 34 |
+
## ::: scrapling.spiders.result.ItemList
|
| 35 |
+
handler: python
|
| 36 |
+
:docstring:
|
| 37 |
+
|
| 38 |
+
## Session Management
|
| 39 |
+
|
| 40 |
+
## ::: scrapling.spiders.session.SessionManager
|
| 41 |
+
handler: python
|
| 42 |
+
:docstring:
|
docs/assets/cover_dark.png
ADDED
|
Git LFS Details
|
docs/assets/cover_dark.svg
ADDED
|
|
docs/assets/cover_light.png
ADDED
|
docs/assets/cover_light.svg
ADDED
|
|
docs/assets/favicon.ico
ADDED
|
|
Git LFS Details
|
docs/assets/logo.png
ADDED
|
docs/assets/main_cover.png
ADDED
|
Git LFS Details
|
docs/assets/scrapling_shell_curl.png
ADDED
|
Git LFS Details
|
docs/assets/spider_architecture.png
ADDED
|
Git LFS Details
|