AUXteam committed on
Commit 94ec243 · verified · 1 Parent(s): 7734ef5

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.

Files changed (50):
  1. .bandit.yml +11 -0
  2. .dockerignore +108 -10
  3. .gitattributes +5 -0
  4. .github/FUNDING.yml +3 -3
  5. .github/ISSUE_TEMPLATE/01-bug_report.yml +82 -0
  6. .github/ISSUE_TEMPLATE/02-feature_request.yml +19 -0
  7. .github/ISSUE_TEMPLATE/03-other.yml +19 -0
  8. .github/ISSUE_TEMPLATE/04-docs_issue.yml +40 -0
  9. .github/ISSUE_TEMPLATE/config.yml +10 -0
  10. .github/PULL_REQUEST_TEMPLATE.md +51 -0
  11. .github/workflows/code-quality.yml +184 -0
  12. .github/workflows/docker-build.yml +86 -0
  13. .github/workflows/release-and-publish.yml +74 -0
  14. .github/workflows/tests.yml +109 -0
  15. .gitignore +92 -57
  16. .hfignore +21 -7
  17. .pre-commit-config.yaml +20 -0
  18. .readthedocs.yaml +21 -0
  19. CODE_OF_CONDUCT.md +1 -1
  20. CONTRIBUTING.md +84 -145
  21. Dockerfile +34 -88
  22. LICENSE +24 -17
  23. MANIFEST.in +12 -0
  24. README.md +360 -329
  25. ROADMAP.md +14 -0
  26. benchmarks.py +146 -0
  27. cleanup.py +42 -0
  28. docs/README_AR.md +426 -0
  29. docs/README_CN.md +426 -0
  30. docs/README_DE.md +426 -0
  31. docs/README_ES.md +426 -0
  32. docs/README_JP.md +426 -0
  33. docs/README_RU.md +426 -0
  34. docs/ai/mcp-server.md +294 -0
  35. docs/api-reference/custom-types.md +26 -0
  36. docs/api-reference/fetchers.md +63 -0
  37. docs/api-reference/mcp-server.md +39 -0
  38. docs/api-reference/proxy-rotation.md +18 -0
  39. docs/api-reference/response.md +18 -0
  40. docs/api-reference/selector.md +25 -0
  41. docs/api-reference/spiders.md +42 -0
  42. docs/assets/cover_dark.png +3 -0
  43. docs/assets/cover_dark.svg +0 -0
  44. docs/assets/cover_light.png +0 -0
  45. docs/assets/cover_light.svg +1 -0
  46. docs/assets/favicon.ico +3 -0
  47. docs/assets/logo.png +0 -0
  48. docs/assets/main_cover.png +3 -0
  49. docs/assets/scrapling_shell_curl.png +3 -0
  50. docs/assets/spider_architecture.png +3 -0
.bandit.yml ADDED
@@ -0,0 +1,11 @@
+ skips:
+ - B101
+ - B311
+ - B113 # `Requests call without timeout` these requests are done in the benchmark and examples scripts only
+ - B403 # We are using pickle for tests only
+ - B404 # Using subprocess library
+ - B602 # subprocess call with shell=True identified
+ - B110 # Try, Except, Pass detected.
+ - B104 # Possible binding to all interfaces.
+ - B301 # Pickle and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue.
+ - B108 # Probable insecure usage of temp file/directory.
.dockerignore CHANGED
@@ -1,12 +1,110 @@
- .git
- .gitignore
- __pycache__
- *.pyc
- *.pyo
- venv
- .venv
  .env
  *.egg-info
- .DS_Store
- chat_history.json
- client_secret.json
+ # Github
+ .github/
+
+ # docs
+ docs/
+ images/
+ .cache/
+ .claude/
+
+ # cached files
+ __pycache__/
+ *.py[cod]
+ .cache
+ .DS_Store
+ *~
+ .*.sw[po]
+ .build
+ .ve
  .env
+ .pytest
+ .benchmarks
+ .bootstrap
+ .appveyor.token
+ *.bak
+ *.db
+ *.db-*
+
+ # installation package
+ *.egg-info/
+ dist/
+ build/
+
+ # environments
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # C extensions
+ *.so
+
+ # pycharm
+ .idea/
+
+ # vscode
+ *.code-workspace
+
+ # Packages
+ *.egg
  *.egg-info
+ dist
+ build
+ eggs
+ .eggs
+ parts
+ bin
+ var
+ sdist
+ wheelhouse
+ develop-eggs
+ .installed.cfg
+ lib
+ lib64
+ venv*/
+ .venv*/
+ pyvenv*/
+ pip-wheel-metadata/
+ poetry.lock
+
+ # Installer logs
+ pip-log.txt
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+ mypy.ini
+
+ # test caches
+ .tox/
+ .pytest_cache/
+ .coverage
+ htmlcov
+ report.xml
+ nosetests.xml
+ coverage.xml
+
+ # Translations
+ *.mo
+
+ # Buildout
+ .mr.developer.cfg
+
+ # IDE project files
+ .project
+ .pydevproject
+ .idea
+ *.iml
+ *.komodoproject
+
+ # Complexity
+ output/*.html
+ output/*/index.html
+
+ # Sphinx
+ docs/_build
+ public/
+ web/
.gitattributes CHANGED
@@ -3,3 +3,8 @@ Scrapling/docs/assets/favicon.ico filter=lfs diff=lfs merge=lfs -text
  Scrapling/docs/assets/main_cover.png filter=lfs diff=lfs merge=lfs -text
  Scrapling/docs/assets/scrapling_shell_curl.png filter=lfs diff=lfs merge=lfs -text
  Scrapling/docs/assets/spider_architecture.png filter=lfs diff=lfs merge=lfs -text
+ docs/assets/cover_dark.png filter=lfs diff=lfs merge=lfs -text
+ docs/assets/favicon.ico filter=lfs diff=lfs merge=lfs -text
+ docs/assets/main_cover.png filter=lfs diff=lfs merge=lfs -text
+ docs/assets/scrapling_shell_curl.png filter=lfs diff=lfs merge=lfs -text
+ docs/assets/spider_architecture.png filter=lfs diff=lfs merge=lfs -text
.github/FUNDING.yml CHANGED
@@ -1,3 +1,3 @@
- # These are supported funding model platforms
-
- github: itsOwen
+ github: D4Vinci
+ buy_me_a_coffee: d4vinci
+ ko_fi: d4vinci
.github/ISSUE_TEMPLATE/01-bug_report.yml ADDED
@@ -0,0 +1,82 @@
+ name: Bug report
+ description: Create a bug report to help us address errors in the repository
+ labels: [bug]
+ body:
+ - type: checkboxes
+ attributes:
+ label: Have you searched if there an existing issue for this?
+ description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/bug).
+ options:
+ - label: I have searched the existing issues
+ required: true
+
+ - type: input
+ attributes:
+ label: "Python version (python --version)"
+ placeholder: "Python 3.8"
+ validations:
+ required: true
+
+ - type: input
+ attributes:
+ label: "Scrapling version (scrapling.__version__)"
+ placeholder: "0.1"
+ validations:
+ required: true
+
+ - type: textarea
+ attributes:
+ label: "Dependencies version (pip3 freeze)"
+ description: >
+ This is the output of the command `pip3 freeze --all`. Note that the
+ actual output might be different as compared to the placeholder text.
+ placeholder: |
+ cssselect==1.2.0
+ lxml==5.3.0
+ orjson==3.10.7
+ ...
+ validations:
+ required: true
+
+ - type: input
+ attributes:
+ label: "What's your operating system?"
+ placeholder: "Windows 10"
+ validations:
+ required: true
+
+ - type: dropdown
+ attributes:
+ label: 'Are you using a separate virtual environment?'
+ description: "Please pay attention to this question"
+ options:
+ - 'No'
+ - 'Yes'
+ default: 0
+ validations:
+ required: true
+
+ - type: textarea
+ attributes:
+ label: "Expected behavior"
+ description: "Describe the behavior you expect. May include images or videos."
+ validations:
+ required: true
+
+ - type: textarea
+ attributes:
+ label: "Actual behavior"
+ validations:
+ required: true
+
+ - type: textarea
+ attributes:
+ label: Steps To Reproduce
+ description: Steps to reproduce the behavior.
+ placeholder: |
+ 1. In this environment...
+ 2. With this config...
+ 3. Run '...'
+ 4. See error...
+ validations:
+ required: false
.github/ISSUE_TEMPLATE/02-feature_request.yml ADDED
@@ -0,0 +1,19 @@
+ name: Feature request
+ description: Suggest features, propose improvements, discuss new ideas.
+ labels: [enhancement]
+ body:
+ - type: checkboxes
+ attributes:
+ label: Have you searched if there an existing feature request for this?
+ description: Please search [existing requests](https://github.com/D4Vinci/Scrapling/labels/enhancement).
+ options:
+ - label: I have searched the existing requests
+ required: true
+
+ - type: textarea
+ attributes:
+ label: "Feature description"
+ description: >
+ This could include new topics or improving any existing features/implementations.
+ validations:
+ required: true
.github/ISSUE_TEMPLATE/03-other.yml ADDED
@@ -0,0 +1,19 @@
+ name: Other
+ description: Use this for any other issues. PLEASE provide as much information as possible.
+ labels: ["awaiting triage"]
+ body:
+ - type: textarea
+ id: issuedescription
+ attributes:
+ label: What would you like to share?
+ description: Provide a clear and concise explanation of your issue.
+ validations:
+ required: true
+
+ - type: textarea
+ id: extrainfo
+ attributes:
+ label: Additional information
+ description: Is there anything else we should know about this issue?
+ validations:
+ required: false
.github/ISSUE_TEMPLATE/04-docs_issue.yml ADDED
@@ -0,0 +1,40 @@
+ name: Documentation issue
+ description: Report incorrect, unclear, or missing documentation.
+ labels: [documentation]
+ body:
+ - type: checkboxes
+ attributes:
+ label: Have you searched if there an existing issue for this?
+ description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/documentation).
+ options:
+ - label: I have searched the existing issues
+ required: true
+
+ - type: input
+ attributes:
+ label: "Page URL"
+ description: "Link to the documentation page with the issue."
+ placeholder: "https://scrapling.readthedocs.io/en/latest/..."
+ validations:
+ required: true
+
+ - type: dropdown
+ attributes:
+ label: "Type of issue"
+ options:
+ - Incorrect information
+ - Unclear or confusing
+ - Missing information
+ - Typo or formatting
+ - Broken link
+ - Other
+ default: 0
+ validations:
+ required: true
+
+ - type: textarea
+ attributes:
+ label: "Description"
+ description: "Describe what's wrong and what you expected to find."
+ validations:
+ required: true
.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,10 @@
+ blank_issues_enabled: false
+ contact_links:
+ - name: Discussions
+ url: https://github.com/D4Vinci/Scrapling/discussions
+ about: >
+ The "Discussions" forum is where you want to start. 💖
+ - name: Ask on our discord server
+ url: https://discord.gg/EMgGbDceNQ
+ about: >
+ Our community chat forum.
.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,51 @@
+ <!--
+ You are amazing! Thanks for contributing to Scrapling!
+ Please, DO NOT DELETE ANY TEXT from this template! (unless instructed).
+ -->
+
+ ## Proposed change
+ <!--
+ Describe the big picture of your changes here to communicate to the maintainers why we should accept this pull request.
+ If it fixes a bug or resolves a feature request, be sure to link to that issue in the additional information section.
+ -->
+
+ ### Type of change:
+ <!--
+ What type of change does your PR introduce to Scrapling?
+ NOTE: Please, check at least 1 box!
+ If your PR requires multiple boxes to be checked, you'll most likely need to
+ split it into multiple PRs. This makes things easier and faster to code review.
+ -->
+
+ - [ ] Dependency upgrade
+ - [ ] Bugfix (non-breaking change which fixes an issue)
+ - [ ] New integration (thank you!)
+ - [ ] New feature (which adds functionality to an existing integration)
+ - [ ] Deprecation (breaking change to happen in the future)
+ - [ ] Breaking change (fix/feature causing existing functionality to break)
+ - [ ] Code quality improvements to existing code or addition of tests
+ - [ ] Add or change doctests? -- Note: Please avoid changing both code and tests in a single pull request.
+ - [ ] Documentation change?
+
+ ### Additional information
+ <!--
+ Details are important and help maintainers processing your PR.
+ Please be sure to fill out additional details, if applicable.
+ -->
+
+ - This PR fixes or closes an issue: fixes #
+ - This PR is related to an issue: #
+ - Link to documentation pull request: **
+
+ ### Checklist:
+ * [ ] I have read [CONTRIBUTING.md](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md).
+ * [ ] This pull request is all my own work -- I have not plagiarized.
+ * [ ] I know that pull requests will not be merged if they fail the automated tests.
+ * [ ] All new Python files are placed inside an existing directory.
+ * [ ] All filenames are in all lowercase characters with no spaces or dashes.
+ * [ ] All functions and variable names follow Python naming conventions.
+ * [ ] All function parameters and return values are annotated with Python [type hints](https://docs.python.org/3/library/typing.html).
+ * [ ] All functions have doc-strings.
.github/workflows/code-quality.yml ADDED
@@ -0,0 +1,184 @@
+ name: Code Quality
+
+ on:
+ push:
+ branches:
+ - main
+ - dev
+ paths-ignore:
+ - '*.md'
+ - '**/*.md'
+ - 'docs/**'
+ - 'images/**'
+ - '.github/**'
+ - '!.github/workflows/code-quality.yml' # Always run when this workflow changes
+ pull_request:
+ branches:
+ - main
+ - dev
+ paths-ignore:
+ - '*.md'
+ - '**/*.md'
+ - 'docs/**'
+ - 'images/**'
+ workflow_dispatch: # Allow manual triggering
+
+ concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+ jobs:
+ code-quality:
+ name: Code Quality Checks
+ runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ pull-requests: write # For PR annotations
+
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v6
+ with:
+ fetch-depth: 0 # Full history for better analysis
+
+ - name: Set up Python
+ uses: actions/setup-python@v6
+ with:
+ python-version: '3.10'
+ cache: 'pip'
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install bandit[toml] ruff vermin mypy pyright
+ pip install -e ".[all]"
+ pip install lxml-stubs
+
+ - name: Run Bandit (Security Linter)
+ id: bandit
+ continue-on-error: true
+ run: |
+ echo "::group::Bandit - Security Linter"
+ bandit -r -c .bandit.yml scrapling/ -f json -o bandit-report.json
+ bandit -r -c .bandit.yml scrapling/
+ echo "::endgroup::"
+
+ - name: Run Ruff Linter
+ id: ruff-lint
+ continue-on-error: true
+ run: |
+ echo "::group::Ruff - Linter"
+ ruff check scrapling/ --output-format=github
+ echo "::endgroup::"
+
+ - name: Run Ruff Formatter Check
+ id: ruff-format
+ continue-on-error: true
+ run: |
+ echo "::group::Ruff - Formatter Check"
+ ruff format --check scrapling/ --diff
+ echo "::endgroup::"
+
+ - name: Run Vermin (Python Version Compatibility)
+ id: vermin
+ continue-on-error: true
+ run: |
+ echo "::group::Vermin - Python 3.10+ Compatibility Check"
+ vermin -t=3.10- --violations --eval-annotations --no-tips scrapling/
+ echo "::endgroup::"
+
+ - name: Run Mypy (Static Type Checker)
+ id: mypy
+ continue-on-error: true
+ run: |
+ echo "::group::Mypy - Static Type Checker"
+ mypy scrapling/
+ echo "::endgroup::"
+
+ - name: Run Pyright (Static Type Checker)
+ id: pyright
+ continue-on-error: true
+ run: |
+ echo "::group::Pyright - Static Type Checker"
+ pyright scrapling/
+ echo "::endgroup::"
+
+ - name: Check results and create summary
+ if: always()
+ run: |
+ echo "# Code Quality Check Results" >> $GITHUB_STEP_SUMMARY
+ echo "" >> $GITHUB_STEP_SUMMARY
+
+ # Initialize status
+ all_passed=true
+
+ # Check Bandit
+ if [ "${{ steps.bandit.outcome }}" == "success" ]; then
+ echo "✅ **Bandit (Security)**: Passed" >> $GITHUB_STEP_SUMMARY
+ else
+ echo "❌ **Bandit (Security)**: Failed" >> $GITHUB_STEP_SUMMARY
+ all_passed=false
+ fi
+
+ # Check Ruff Linter
+ if [ "${{ steps.ruff-lint.outcome }}" == "success" ]; then
+ echo "✅ **Ruff Linter**: Passed" >> $GITHUB_STEP_SUMMARY
+ else
+ echo "❌ **Ruff Linter**: Failed" >> $GITHUB_STEP_SUMMARY
+ all_passed=false
+ fi
+
+ # Check Ruff Formatter
+ if [ "${{ steps.ruff-format.outcome }}" == "success" ]; then
+ echo "✅ **Ruff Formatter**: Passed" >> $GITHUB_STEP_SUMMARY
+ else
+ echo "❌ **Ruff Formatter**: Failed" >> $GITHUB_STEP_SUMMARY
+ all_passed=false
+ fi
+
+ # Check Vermin
+ if [ "${{ steps.vermin.outcome }}" == "success" ]; then
+ echo "✅ **Vermin (Python 3.10+)**: Passed" >> $GITHUB_STEP_SUMMARY
+ else
+ echo "❌ **Vermin (Python 3.10+)**: Failed" >> $GITHUB_STEP_SUMMARY
+ all_passed=false
+ fi
+
+ # Check Mypy
+ if [ "${{ steps.mypy.outcome }}" == "success" ]; then
+ echo "✅ **Mypy (Type Checker)**: Passed" >> $GITHUB_STEP_SUMMARY
+ else
+ echo "❌ **Mypy (Type Checker)**: Failed" >> $GITHUB_STEP_SUMMARY
+ all_passed=false
+ fi
+
+ # Check Pyright
+ if [ "${{ steps.pyright.outcome }}" == "success" ]; then
+ echo "✅ **Pyright (Type Checker)**: Passed" >> $GITHUB_STEP_SUMMARY
+ else
+ echo "❌ **Pyright (Type Checker)**: Failed" >> $GITHUB_STEP_SUMMARY
+ all_passed=false
+ fi
+
+ echo "" >> $GITHUB_STEP_SUMMARY
+
+ if [ "$all_passed" == "true" ]; then
+ echo "### 🎉 All checks passed!" >> $GITHUB_STEP_SUMMARY
+ echo "" >> $GITHUB_STEP_SUMMARY
+ echo "Your code meets all quality standards." >> $GITHUB_STEP_SUMMARY
+ else
+ echo "### ⚠️ Some checks failed" >> $GITHUB_STEP_SUMMARY
+ echo "" >> $GITHUB_STEP_SUMMARY
+ echo "Please review the errors above and fix them." >> $GITHUB_STEP_SUMMARY
+ echo "" >> $GITHUB_STEP_SUMMARY
+ echo "**Tip**: Run \`pre-commit run --all-files\` locally to catch these issues before pushing." >> $GITHUB_STEP_SUMMARY
+ exit 1
+ fi
+
+ - name: Upload Bandit report
+ if: always() && steps.bandit.outcome != 'skipped'
+ uses: actions/upload-artifact@v6
+ with:
+ name: bandit-security-report
+ path: bandit-report.json
+ retention-days: 30
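The checks in this workflow can be mirrored locally before pushing; a minimal sketch using the same commands (tool list and the `scrapling/` path are taken from the workflow above, versions are not pinned here):

```bash
pip install "bandit[toml]" ruff vermin mypy pyright
pip install -e ".[all]"                                        # project plus extras, as in CI
bandit -r -c .bandit.yml scrapling/                            # security linting
ruff check scrapling/ && ruff format --check scrapling/        # lint + formatting check
vermin -t=3.10- --violations --eval-annotations --no-tips scrapling/
mypy scrapling/
pyright scrapling/
```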
.github/workflows/docker-build.yml ADDED
@@ -0,0 +1,86 @@
+ name: Build and Push Docker Image
+
+ on:
+ pull_request:
+ types: [closed]
+ branches:
+ - main
+ workflow_dispatch:
+ inputs:
+ tag:
+ description: 'Docker image tag'
+ required: true
+ default: 'latest'
+
+ env:
+ DOCKERHUB_IMAGE: pyd4vinci/scrapling
+ GHCR_IMAGE: ghcr.io/${{ github.repository_owner }}/scrapling
+
+ jobs:
+ build-and-push:
+ runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ packages: write
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v6
+
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3
+ with:
+ platforms: linux/amd64,linux/arm64
+
+ - name: Log in to Docker Hub
+ uses: docker/login-action@v3
+ with:
+ registry: docker.io
+ username: ${{ secrets.DOCKER_USERNAME }}
+ password: ${{ secrets.DOCKER_PASSWORD }}
+
+ - name: Log in to GitHub Container Registry
+ uses: docker/login-action@v3
+ with:
+ registry: ghcr.io
+ username: ${{ github.actor }}
+ password: ${{ secrets.CONTAINER_TOKEN }}
+
+ - name: Extract metadata
+ id: meta
+ uses: docker/metadata-action@v5
+ with:
+ images: |
+ ${{ env.DOCKERHUB_IMAGE }}
+ ${{ env.GHCR_IMAGE }}
+ tags: |
+ type=ref,event=branch
+ type=ref,event=pr
+ type=semver,pattern={{version}}
+ type=semver,pattern={{major}}.{{minor}}
+ type=semver,pattern={{major}}
+ type=raw,value=latest,enable={{is_default_branch}}
+ labels: |
+ org.opencontainers.image.title=Scrapling
+ org.opencontainers.image.description=An undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
+ org.opencontainers.image.vendor=D4Vinci
+ org.opencontainers.image.licenses=BSD
+ org.opencontainers.image.url=https://scrapling.readthedocs.io/en/latest/
+ org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
+ org.opencontainers.image.documentation=https://scrapling.readthedocs.io/en/latest/
+
+ - name: Build and push Docker image
+ uses: docker/build-push-action@v6
+ with:
+ context: .
+ platforms: linux/amd64,linux/arm64
+ push: true
+ tags: ${{ steps.meta.outputs.tags }}
+ labels: ${{ steps.meta.outputs.labels }}
+ cache-from: type=gha
+ cache-to: type=gha,mode=max
+ build-args: |
+ BUILDKIT_INLINE_CACHE=1
+
+ - name: Image digest
+ run: echo ${{ steps.build.outputs.digest }}
.github/workflows/release-and-publish.yml ADDED
@@ -0,0 +1,74 @@
+ name: Create Release and Publish to PyPI
+ # Creates a GitHub release when a PR is merged to main (using PR title as version and body as release notes), then publishes to PyPI.
+
+ on:
+ pull_request:
+ types: [closed]
+ branches:
+ - main
+
+ jobs:
+ create-release-and-publish:
+ if: github.event.pull_request.merged == true
+ runs-on: ubuntu-latest
+ environment:
+ name: PyPI
+ url: https://pypi.org/p/scrapling
+ permissions:
+ contents: write
+ id-token: write
+ steps:
+ - uses: actions/checkout@v6
+ with:
+ fetch-depth: 0
+
+ - name: Get PR title
+ id: pr_title
+ run: echo "title=${{ github.event.pull_request.title }}" >> $GITHUB_OUTPUT
+
+ - name: Save PR body to file
+ uses: actions/github-script@v8
+ with:
+ script: |
+ const fs = require('fs');
+ fs.writeFileSync('pr_body.md', context.payload.pull_request.body || '');
+
+ - name: Extract version
+ id: extract_version
+ run: |
+ PR_TITLE="${{ steps.pr_title.outputs.title }}"
+ if [[ $PR_TITLE =~ ^v ]]; then
+ echo "version=$PR_TITLE" >> $GITHUB_OUTPUT
+ echo "Valid version format found in PR title: $PR_TITLE"
+ else
+ echo "Error: PR title '$PR_TITLE' must start with 'v' (e.g., 'v1.0.0') to create a release."
+ exit 1
+ fi
+
+ - name: Create Release
+ uses: softprops/action-gh-release@v2
+ with:
+ tag_name: ${{ steps.extract_version.outputs.version }}
+ name: Release ${{ steps.extract_version.outputs.version }}
+ body_path: pr_body.md
+ draft: false
+ prerelease: false
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Set up Python
+ uses: actions/setup-python@v6
+ with:
+ python-version: 3.12
+
+ - name: Upgrade pip
+ run: python3 -m pip install --upgrade pip
+
+ - name: Install build
+ run: python3 -m pip install --upgrade build twine setuptools
+
+ - name: Build a binary wheel and a source tarball
+ run: python3 -m build --sdist --wheel --outdir dist/
+
+ - name: Publish distribution 📦 to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
+ 
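For reference, the packaging part of this workflow reduces to the following local sketch (the `twine check` line is an optional extra sanity check, not a step in the workflow itself):

```bash
python3 -m pip install --upgrade build twine setuptools
python3 -m build --sdist --wheel --outdir dist/   # same build command as the workflow
twine check dist/*                                 # optional: validate the artifacts locally
```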
.github/workflows/tests.yml ADDED
@@ -0,0 +1,109 @@
+ name: Tests
+ on:
+ push:
+ branches:
+ - main
+ - dev
+ paths-ignore:
+ - '*.md'
+ - '**/*.md'
+ - 'docs/*'
+ - 'images/*'
+ - '.github/*'
+ - '*.yml'
+ - '*.yaml'
+ - 'ruff.toml'
+
+ concurrency:
+ group: ${{github.workflow}}-${{ github.ref }}
+ cancel-in-progress: true
+
+ jobs:
+ tests:
+ timeout-minutes: 60
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ - python-version: "3.10"
+ os: macos-latest
+ env:
+ TOXENV: py310
+ - python-version: "3.11"
+ os: macos-latest
+ env:
+ TOXENV: py311
+ - python-version: "3.12"
+ os: macos-latest
+ env:
+ TOXENV: py312
+ - python-version: "3.13"
+ os: macos-latest
+ env:
+ TOXENV: py313
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v6
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: |
+ pyproject.toml
+ tox.ini
+
+ - name: Install all browsers dependencies
+ run: |
+ python3 -m pip install --upgrade pip
+ python3 -m pip install playwright==1.56.0 patchright==1.56.0
+
+ - name: Get Playwright version
+ id: playwright-version
+ run: |
+ PLAYWRIGHT_VERSION=$(python3 -c "import importlib.metadata; print(importlib.metadata.version('playwright'))")
+ echo "version=$PLAYWRIGHT_VERSION" >> $GITHUB_OUTPUT
+ echo "Playwright version: $PLAYWRIGHT_VERSION"
+
+ - name: Retrieve Playwright browsers from cache if any
+ id: playwright-cache
+ uses: actions/cache@v5
+ with:
+ path: |
+ ~/.cache/ms-playwright
+ ~/Library/Caches/ms-playwright
+ ~/.ms-playwright
+ key: ${{ runner.os }}-playwright-${{ steps.playwright-version.outputs.version }}-v1
+ restore-keys: |
+ ${{ runner.os }}-playwright-${{ steps.playwright-version.outputs.version }}-
+ ${{ runner.os }}-playwright-
+
+ - name: Install Playwright browsers
+ run: |
+ echo "Cache hit: ${{ steps.playwright-cache.outputs.cache-hit }}"
+ if [ "${{ steps.playwright-cache.outputs.cache-hit }}" != "true" ]; then
+ python3 -m playwright install chromium
+ else
+ echo "Skipping install - using cached Playwright browsers"
+ fi
+ python3 -m playwright install-deps chromium
+
+ # Cache tox environments
+ - name: Cache tox environments
+ uses: actions/cache@v5
+ with:
+ path: .tox
+ # Include python version and os in the cache key
+ key: tox-v1-${{ runner.os }}-py${{ matrix.python-version }}-${{ hashFiles('/Users/runner/work/Scrapling/pyproject.toml') }}
+ restore-keys: |
+ tox-v1-${{ runner.os }}-py${{ matrix.python-version }}-
+ tox-v1-${{ runner.os }}-
+
+ - name: Install tox
+ run: pip install -U tox
+
+ - name: Run tests
+ env: ${{ matrix.env }}
+ run: tox
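A rough local equivalent of one matrix entry, assuming the Chromium browser is not already installed (package versions are copied from the workflow above):

```bash
python3 -m pip install --upgrade pip
python3 -m pip install playwright==1.56.0 patchright==1.56.0
python3 -m playwright install chromium            # skipped in CI when the browser cache hits
python3 -m playwright install-deps chromium
pip install -U tox
TOXENV=py312 tox                                  # pick the env matching your interpreter
```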
.gitignore CHANGED
@@ -1,75 +1,110 @@
1
- # Python cache files
2
- __pycache__/
3
- *.py[cod]
4
- *$py.class
5
-
6
- # Virtual environment
7
- venv/
8
-
9
- # Streamlit cache
10
- .streamlit/
11
-
12
- # PyCharm files
13
- .idea/
14
-
15
- # VS Code files
16
- .vscode/
17
 
18
- # Jupyter Notebook
19
- .ipynb_checkpoints
 
20
 
21
- # Environment variables
22
- .env
23
-
24
- # Operating system files
25
  .DS_Store
26
- Thumbs.db
27
-
28
- # Log files
29
- *.log
30
-
31
- # Database files
 
 
 
 
32
  *.db
33
- *.sqlite3
34
 
35
- # Compiled Python files
36
- *.pyc
37
-
38
- # Package directories
39
  dist/
40
  build/
41
- *.egg-info/
42
 
43
- # Backup files
44
- *~
45
- *.bak
 
 
 
 
46
 
47
- # Coverage reports
48
- htmlcov/
49
- .coverage
50
- .coverage.*
51
- coverage.xml
52
 
53
- # Pytest cache
54
- .pytest_cache/
55
 
56
- # mypy cache
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  .mypy_cache/
 
 
 
58
 
59
- # Scrapy stuff:
60
- .scrapy
 
 
 
 
 
 
61
 
62
- # Sphinx documentation
63
- docs/_build/
64
 
65
- # PyBuilder
66
- target/
67
 
68
- # Google Sheets authentication token
69
- token.json
 
 
 
 
70
 
71
- # Chat history
72
- chat_history.json
 
73
 
74
- # Google OAuth client secret
75
- client_secret.json
 
 
 
1
+ # local files
2
+ site/*
3
+ local_tests/*
4
+ .mcpregistry_*
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ # AI related files
7
+ .claude/*
8
+ CLAUDE.md
9
 
10
+ # cached files
11
+ __pycache__/
12
+ *.py[cod]
13
+ .cache
14
  .DS_Store
15
+ *~
16
+ .*.sw[po]
17
+ .build
18
+ .ve
19
+ .env
20
+ .pytest
21
+ .benchmarks
22
+ .bootstrap
23
+ .appveyor.token
24
+ *.bak
25
  *.db
26
+ *.db-*
27
 
28
+ # installation package
29
+ *.egg-info/
 
 
30
  dist/
31
  build/
 
32
 
33
+ # environments
34
+ .venv
35
+ env/
36
+ venv/
37
+ ENV/
38
+ env.bak/
39
+ venv.bak/
40
 
41
+ # C extensions
42
+ *.so
 
 
 
43
 
44
+ # pycharm
45
+ .idea/
46
 
47
+ # vscode
48
+ *.code-workspace
49
+
50
+ # Packages
51
+ *.egg
52
+ *.egg-info
53
+ dist
54
+ build
55
+ eggs
56
+ .eggs
57
+ parts
58
+ bin
59
+ var
60
+ sdist
61
+ wheelhouse
62
+ develop-eggs
63
+ .installed.cfg
64
+ lib
65
+ lib64
66
+ venv*/
67
+ .venv*/
68
+ pyvenv*/
69
+ pip-wheel-metadata/
70
+ poetry.lock
71
+
72
+ # Installer logs
73
+ pip-log.txt
74
+
75
+ # mypy
76
  .mypy_cache/
77
+ .dmypy.json
78
+ dmypy.json
79
+ mypy.ini
80
 
81
+ # test caches
82
+ .tox/
83
+ .pytest_cache/
84
+ .coverage
85
+ htmlcov
86
+ report.xml
87
+ nosetests.xml
88
+ coverage.xml
89
 
90
+ # Translations
91
+ *.mo
92
 
93
+ # Buildout
94
+ .mr.developer.cfg
95
 
96
+ # IDE project files
97
+ .project
98
+ .pydevproject
99
+ .idea
100
+ *.iml
101
+ *.komodoproject
102
 
103
+ # Complexity
104
+ output/*.html
105
+ output/*/index.html
106
 
107
+ # Sphinx
108
+ docs/_build
109
+ public/
110
+ web/
.hfignore CHANGED
@@ -1,9 +1,23 @@
- .git/
- .github/
- venv/
- __pycache__/
  *.pyc
  .env
- chat_history.json
- test_patchright.py
- client_secret.json
+ .git
+ .github
+ .venv
+ __pycache__
  *.pyc
+ *.pyo
+ *.pyd
+ .DS_Store
+ tests/
+ docs/
+ images/
+ .coverage
+ htmlcov/
+ pytest_cache/
+ .mypy_cache/
+ .tox/
+ .pytest_cache/
+ .ruff_cache/
+ .uv/
+ dist/
+ build/
+ *.egg-info/
  .env
.pre-commit-config.yaml ADDED
@@ -0,0 +1,20 @@
+ repos:
+ - repo: https://github.com/PyCQA/bandit
+ rev: 1.9.0
+ hooks:
+ - id: bandit
+ args: [-r, -c, .bandit.yml]
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ # Ruff version.
+ rev: v0.14.5
+ hooks:
+ # Run the linter.
+ - id: ruff
+ args: [ --fix ]
+ # Run the formatter.
+ - id: ruff-format
+ - repo: https://github.com/netromdk/vermin
+ rev: v1.7.0
+ hooks:
+ - id: vermin
+ args: ['-t=3.10-', '--violations', '--eval-annotations', '--no-tips']
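As the CONTRIBUTING guide below notes, these hooks are meant to run on every commit; a minimal local setup sketch:

```bash
pip install pre-commit
pre-commit install            # register the git hook for this clone
pre-commit run --all-files    # one-off run over the whole tree, as suggested in the CI tip
```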
.readthedocs.yaml ADDED
@@ -0,0 +1,21 @@
+ # See https://docs.readthedocs.com/platform/stable/intro/zensical.html for details
+ # Example: https://github.com/readthedocs/test-builds/tree/zensical
+
+ version: 2
+
+ build:
+ os: ubuntu-24.04
+ apt_packages:
+ - pngquant
+ tools:
+ python: "3.13"
+ jobs:
+ install:
+ - pip install -r docs/requirements.txt
+ - pip install ".[all]"
+ build:
+ html:
+ - zensical build
+ post_build:
+ - mkdir -p $READTHEDOCS_OUTPUT/html/
+ - cp --recursive site/* $READTHEDOCS_OUTPUT/html/
CODE_OF_CONDUCT.md CHANGED
@@ -60,7 +60,7 @@ representative at an online or offline event.
  
  Instances of abusive, harassing, or otherwise unacceptable behavior may be
  reported to the community leaders responsible for enforcement at
- owensingh72@gmail.com.
+ karim.shoair@pm.me.
  All complaints will be reviewed and investigated promptly and fairly.
  
  All community leaders are obligated to respect the privacy and security of the
CONTRIBUTING.md CHANGED
@@ -1,167 +1,106 @@
- # Contributing to CyberScraper 2077
-
- > "In 2077, what makes someone a contributor? Pushing code." - Johnny Silverhand
-
- Thanks for considering contributing to CyberScraper 2077! This document outlines the process and guidelines for contributing to make the experience smooth for everyone involved.
-
- ## 🤝 Code of Conduct
-
- By participating in this project, you agree to abide by our Code of Conduct. Please read it before contributing.
-
- ## 🚀 How to Contribute
-
- ### Setting Up Development Environment
-
- 1. Fork the repository
- 2. Clone your fork:
- ```bash
- git clone https://github.com/your-username/CyberScraper-2077.git
- cd CyberScraper-2077
- ```
- 3. Create a virtual environment:
- ```bash
- python -m venv venv
- source venv/bin/activate # On Windows: venv\Scripts\activate
- ```
- 4. Install dependencies:
- ```bash
- pip install -r requirements.txt
- playwright install
- ```
-
- ### Making Changes
-
- 1. Create a new branch:
- ```bash
- git checkout -b feature/your-feature-name
- ```
- 2. Make your changes
- 3. Test your changes thoroughly
- 4. Commit your changes:
- ```bash
- git commit -m "feat: add new feature"
- ```
- 5. Push to your fork:
- ```bash
- git push origin feature/your-feature-name
- ```
- 6. Create a Pull Request
-
- ## 📝 Commit Message Guidelines
-
- We follow [Conventional Commits](https://www.conventionalcommits.org/). Your commit messages should be structured as follows:
-
- ```
- <type>(<scope>): <description>
-
- [optional body]
-
- [optional footer]
- ```
-
- Types:
- - `feat`: New feature
- - `fix`: Bug fix
- - `docs`: Documentation changes
- - `style`: Code style changes (formatting, missing semi-colons, etc)
- - `refactor`: Code refactoring
- - `test`: Adding missing tests
- - `chore`: Changes to build process or auxiliary tools
-
- Example:
- ```
- feat(scraper): add support for dynamic loading websites
- ```
-
- ## 🧪 Testing Guidelines
-
- - Write tests for new features
- - Ensure all tests pass before submitting PR
- - Follow existing test patterns
- - Include both unit and integration tests when applicable
-
- ## 📚 Documentation Guidelines
-
- - Update README.md if adding new features
- - Add docstrings to new functions/classes
- - Include code examples when appropriate
- - Keep documentation clear and concise
-
- ## 🏗️ Project Structure
-
- ```
- CyberScraper-2077/
- ├── app/
- │ ├── scrapers/
- │ ├── utils/
- │ └── ui_components/
- ├── src/
- │ └── models/
- ├── tests/
- └── docs/
- ```
-
- - Place new scraper implementations in `app/scrapers/`
- - Add utility functions in `app/utils/`
- - UI components go in `app/ui_components/`
- - Model-related code goes in `src/models/`
-
- ## 🎯 Feature Requests
-
- - Use GitHub Issues to propose new features
- - Tag feature requests with `enhancement`
- - Provide clear use cases
- - Discuss implementation approach
-
- ## 🐛 Bug Reports
-
- When reporting bugs, include:
- - Detailed description of the issue
- - Steps to reproduce
- - Expected vs actual behavior
- - Environment details (OS, Python version, etc.)
- - Screenshots if applicable
-
- ## 🔍 Pull Request Process
-
- 1. Update documentation
- 2. Add/update tests
- 3. Ensure CI/CD pipeline passes
- 4. Get at least one code review
- 5. Squash commits if requested
- 6. Ensure branch is up to date with main
-
- ## ⚙️ Development Best Practices
-
- 1. Follow PEP 8 style guide
- 2. Use type hints
- 3. Keep functions/methods focused and small
- 4. Comment complex logic
- 5. Use meaningful variable/function names
- 6. Handle errors appropriately
- 7. Log important operations
-
- ## 🚫 What to Avoid
-
- - Breaking existing functionality
- - Introducing unnecessary dependencies
- - Making large, unfocused PRs
- - Ignoring code review feedback
- - Modifying core functionality without discussion
-
- ## 🏆 Recognition
-
- Contributors will be added to our README.md and CONTRIBUTORS.md files. We value and appreciate all contributions!
-
- ## 📞 Getting Help
-
- - Create an issue for questions
- - Join our Discord community
- - Check existing documentation
- - Look through closed issues
-
- ## 📜 License
-
- By contributing, you agree that your contributions will be licensed under the project's MIT License.
-
- Remember: In Night City - and in open source - style is everything, choom. Let's keep the code clean and the commits conventional.
+ # Contributing to Scrapling
+
+ Thank you for your interest in contributing to Scrapling!
+
+ Everybody is invited and welcome to contribute to Scrapling.
+
+ Minor changes are more likely to be included promptly. Adding unit tests for new features or test cases for bugs you've fixed helps us ensure that the Pull Request (PR) is acceptable.
+
+ There are many ways to contribute to Scrapling. Here are some of them:
+
+ - Report bugs and request features using the [GitHub issues](https://github.com/D4Vinci/Scrapling/issues). Please follow the issue template to help us resolve your issue quickly.
+ - Blog about Scrapling. Tell the world how you’re using Scrapling. This will help newcomers with more examples and increase the Scrapling project's visibility.
+ - Join the [Discord community](https://discord.gg/EMgGbDceNQ) and share your ideas on how to improve Scrapling. We’re always open to suggestions.
+ - If you are not a developer, perhaps you would like to help with translating the [documentation](https://github.com/D4Vinci/Scrapling/tree/docs)?
+
+ ## Finding work
+
+ If you have decided to make a contribution to Scrapling, but you do not know what to contribute, here are some ways to find pending work:
+
+ - Check out the [contribution](https://github.com/D4Vinci/Scrapling/contribute) GitHub page, which lists open issues tagged as `good first issue`. These issues provide a good starting point.
+ - There are also the [help wanted](https://github.com/D4Vinci/Scrapling/issues?q=is%3Aissue%20label%3A%22help%20wanted%22%20state%3Aopen) issues, but know that some may require familiarity with the Scrapling code base first. You can also target any other issue, provided it is not tagged as `invalid`, `wontfix`, or similar tags.
+ - If you enjoy writing automated tests, you can work on increasing our test coverage. Currently, the test coverage is around 90–92%.
+ - Join the [Discord community](https://discord.gg/EMgGbDceNQ) and ask questions in the `#help` channel.
+
+ ## Coding style
+ Please follow these coding conventions as we do when writing code for Scrapling:
+ - We use [pre-commit](https://pre-commit.com/) to automatically address simple code issues before every commit, so please install it and run `pre-commit install` to set it up. This will install hooks to run [ruff](https://docs.astral.sh/ruff/), [bandit](https://github.com/PyCQA/bandit), and [vermin](https://github.com/netromdk/vermin) on every commit. We are currently using a workflow to automatically run these tools on every PR, so if your code doesn't pass these checks, the PR will be rejected.
+ - We use type hints for better code clarity and [pyright](https://github.com/microsoft/pyright) for static type checking, which depends on the type hints, of course.
+ - We use the conventional commit messages format as [here](https://gist.github.com/qoomon/5dfcdf8eec66a051ecd85625518cfd13#types), so for example, we use the following prefixes for commit messages:
+
+ | Prefix | When to use it |
+ |-------------|--------------------------|
+ | `feat:` | New feature added |
+ | `fix:` | Bug fix |
+ | `docs:` | Documentation change/add |
+ | `test:` | Tests |
+ | `refactor:` | Code refactoring |
+ | `chore:` | Maintenance tasks |
+
+ Then include the details of the change in the commit message body/description.
+
+ Example:
+ ```
+ feat: add `adaptive` for similar elements
+
+ - Added find_similar() method
+ - Implemented pattern matching
+ - Added tests and documentation
+ ```
+
+ > Please don’t put your name in the code you contribute; git provides enough metadata to identify the author of the code.
+
+ ## Development
+ Setting the scrapling logging level to `debug` makes it easier to know what's happening in the background.
+ ```python
+ import logging
+ logging.getLogger("scrapling").setLevel(logging.DEBUG)
+ ```
+ Bonus: You can install the beta of the upcoming update from the dev branch as follows
+ ```commandline
+ pip3 install git+https://github.com/D4Vinci/Scrapling.git@dev
+ ```
+
+ ## Building Documentation
+ Documentation is built using [MkDocs](https://www.mkdocs.org/). You can build it locally using the following commands:
+ ```bash
+ pip install mkdocs-material
+ mkdocs serve # Local preview
+ mkdocs build # Build the static site
+ ```
+
+ ## Tests
+ Scrapling includes a comprehensive test suite that can be executed with pytest. However, first, you need to install all libraries and `pytest-plugins` listed in `tests/requirements.txt`. Then, running the tests will result in an output like this:
+ ```bash
+ $ pytest tests -n auto
+ =============================== test session starts ===============================
+ platform darwin -- Python 3.13.8, pytest-8.4.2, pluggy-1.6.0 -- /Users/<redacted>/.venv/bin/python3.13
+ cachedir: .pytest_cache
+ rootdir: /Users/<redacted>/scrapling
+ configfile: pytest.ini
+ plugins: asyncio-1.2.0, anyio-4.11.0, xdist-3.8.0, httpbin-2.1.0, cov-7.0.0
+ asyncio: mode=Mode.AUTO, debug=False, asyncio_default_fixture_loop_scope=function, asyncio_default_test_loop_scope=function
+ 10 workers [271 items]
+ scheduling tests via LoadScheduling
+
+ ...<shortened>...
+
+ =============================== 271 passed in 52.68s ==============================
+ ```
+ Hence, we used `-n auto` in the command above to run tests in threads to increase speed.
+
+ Bonus: You can also see the test coverage with the `pytest` plugin below
+ ```bash
+ pytest --cov=scrapling tests/
+ ```
+
+ ## Making a Pull Request
+ To ensure that your PR gets accepted, please make sure that your PR is based on the latest changes from the dev branch and that it satisfies the following requirements:
+
+ - The PR should be made against the [**dev**](https://github.com/D4Vinci/Scrapling/tree/dev) branch of Scrapling. Any PR made against the main branch will be rejected.
+ - The code should be passing all available tests. We use tox with GitHub's CI to run the current tests on all supported Python versions for every code-related commit.
+ - The code should be passing all code quality checks we mentioned above. We are using GitHub's CI to enforce the code style checks performed by pre-commit. If you were using the pre-commit hooks we discussed above, you should not see any issues when committing your changes.
+ - Make your changes, keep the code clean with an explanation of any part that might be vague, and remember to create a separate virtual environment for this project.
+ - If you are adding a new feature, please add tests for it.
+ - If you are fixing a bug, please add code with the PR that reproduces the bug.
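Pulling the guide's pieces together, a possible end-to-end setup might look like the sketch below. The `.[all]` extra and `tests/requirements.txt` path are taken from the CI configs and the Tests section above; the branch name and venv layout are illustrative, not mandated:

```bash
git clone https://github.com/D4Vinci/Scrapling.git && cd Scrapling
git checkout dev                                   # PRs target the dev branch
python -m venv .venv && source .venv/bin/activate  # separate virtual environment, as requested
pip install -e ".[all]"
pip install -r tests/requirements.txt
pre-commit install
pytest tests -n auto
```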
Dockerfile CHANGED
@@ -1,102 +1,48 @@
- # Use Python 3.12 for better performance and compatibility
- FROM python:3.12-slim-bookworm
-
- # Set environment variables
- ENV PYTHONUNBUFFERED=1 \
- PYTHONDONTWRITEBYTECODE=1 \
- PORT=7860 \
- UV_SYSTEM_PYTHON=1 \
- HOME=/home/user \
- STREAMLIT_BROWSER_GATHER_USAGE_STATS=false \
- STREAMLIT_SERVER_HEADLESS=true \
- STREAMLIT_SERVER_PORT=8501 \
- STREAMLIT_SERVER_ADDRESS=0.0.0.0
-
- # Install system dependencies
- RUN apt-get update && apt-get install -y \
- wget \
- gnupg \
- git \
- tor \
- tor-geoipdb \
- netcat-traditional \
- curl \
- build-essential \
- python3-dev \
- libffi-dev \
- procps \
- nginx \
- # Browser dependencies for Playwright/Patchright
- libglib2.0-0 \
- libnspr4 \
- libnss3 \
- libdbus-1-3 \
- libatk1.0-0 \
- libatk-bridge2.0-0 \
- libcups2 \
- libxkbcommon0 \
- libatspi2.0-0 \
- libxcomposite1 \
- libxdamage1 \
- libxfixes3 \
- libxrandr2 \
- libgbm1 \
- libcairo2 \
- libpango-1.0-0 \
- libasound2 \
- && apt-get clean \
- && rm -rf /var/lib/apt/lists/*
-
- # Install uv
  COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
-
- # Set up working directory
- WORKDIR /app
-
- # Copy requirements and install as root
- COPY requirements.txt .
- RUN uv pip install --system -r requirements.txt
- RUN uv pip install --system fastapi uvicorn
-
- # Install patchright browser (Chromium)
- RUN patchright install chromium
-
- # Create a non-root user
- RUN useradd -m -u 1000 user
-
- # Configure Tor
- RUN echo "SocksPort 9050" >> /etc/tor/torrc && \
- echo "ControlPort 9051" >> /etc/tor/torrc && \
- echo "CookieAuthentication 1" >> /etc/tor/torrc && \
- echo "DataDirectory /var/lib/tor" >> /etc/tor/torrc
-
- # Set permissions for Tor, app directory, and nginx
- RUN mkdir -p /var/lib/tor && \
- chown -R user:user /var/lib/tor && \
- chmod 700 /var/lib/tor && \
- chown -R user:user /app && \
- mkdir -p /var/log/nginx /var/lib/nginx /tmp && \
- chown -R user:user /var/log/nginx /var/lib/nginx /tmp
-
- # Pre-create streamlit config dir in home
- RUN mkdir -p /home/user/.streamlit && chown -R user:user /home/user
-
- # Copy the rest of the application
- COPY --chown=user:user . .
-
- # Install Scrapling in editable mode and its browser dependencies
- RUN uv pip install --system -e ./Scrapling[fetchers]
- RUN playwright install chromium
-
- # Set permissions for the start script
- RUN chmod +x start.sh
-
- # Switch to non-root user
  USER user
- ENV PATH="/home/user/.local/bin:$PATH"
-
- # Expose port
  EXPOSE 7860
-
- # Set the entrypoint
- ENTRYPOINT ["./start.sh"]
+ FROM python:3.12-slim-trixie
+
+ LABEL io.modelcontextprotocol.server.name="io.github.D4Vinci/Scrapling"
+
  COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+ # Set environment variables
+ ENV DEBIAN_FRONTEND=noninteractive \
+ PYTHONUNBUFFERED=1 \
+ PYTHONDONTWRITEBYTECODE=1
+
+ WORKDIR /app
+
+ # Copy dependency file first for better layer caching
+ COPY pyproject.toml ./
+
+ # Install dependencies only
+ RUN --mount=type=cache,target=/root/.cache/uv \
+ uv sync --no-install-project --all-extras --compile-bytecode
+
+ # Copy source code
+ COPY . .
+
+ # Install browsers and project in one optimized layer
+ RUN --mount=type=cache,target=/root/.cache/uv \
+ --mount=type=cache,target=/var/cache/apt \
+ --mount=type=cache,target=/var/lib/apt \
+ apt-get update && \
+ uv run playwright install-deps chromium && \
+ uv run playwright install chromium && \
+ uv sync --all-extras --compile-bytecode && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+ # Create a non-root user
+ RUN useradd -m -u 1000 user && \
+ chown -R user:user /app
+
+ # Switch to the non-root user
  USER user
+
+ # Expose port for MCP server HTTP transport
  EXPOSE 7860
+
+ # Set entrypoint to run scrapling
+ ENTRYPOINT ["uv", "run", "scrapling"]
+
+ # Default command (can be overridden)
+ CMD ["mcp", "--http", "--port", "7860", "--host", "0.0.0.0"]
LICENSE CHANGED
@@ -1,21 +1,28 @@
- MIT License
-
- Copyright (c) 2024 Owen Singh
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
+ BSD 3-Clause License
+
+ Copyright (c) 2024, Karim shoair
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MANIFEST.in ADDED
@@ -0,0 +1,12 @@
+ include LICENSE
+ include *.db
+ include *.js
+ include scrapling/*.db
+ include scrapling/*.db*
+ include scrapling/*.db-*
+ include scrapling/py.typed
+ include scrapling/.scrapling_dependencies_installed
+ include .scrapling_dependencies_installed
+
+ recursive-exclude * __pycache__
+ recursive-exclude * *.py[co]
README.md CHANGED
@@ -1,406 +1,437 @@
1
  ---
2
  title: Scraper Hub
3
- emoji: 🌐
4
- colorFrom: blue
5
- colorTo: red
6
  sdk: docker
7
  app_port: 7860
8
  ---
9
-
10
- # 🌐 CyberScraper 2077
 
 
 
 
 
 
 
 
 
 
11
 
12
  <p align="center">
13
- <img src="https://i.postimg.cc/j5b7QSzg/scraper.png" alt="CyberScraper 2077 Logo">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  </p>
15
 
16
  <p align="center">
17
- <img src="https://i.postimg.cc/9MKqtn2g/68747470733a2f2f692e706f7374696d672e63632f74346d64347a74762f6379626572736372617065722d323037372e6a70.jpg">
 
 
 
 
 
 
 
 
 
 
18
  </p>
19
 
20
- [![Python](https://img.shields.io/badge/Python-blue)](https://www.python.org/downloads/)
21
- [![Streamlit](https://img.shields.io/badge/Streamlit-FF4B4B)](https://streamlit.io/)
22
- [![License](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
23
- [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](http://makeapullrequest.com)
24
-
25
- > Rip data from the net, leaving no trace. Welcome to the future of web scraping.
26
-
27
- ## 🔍 About
28
-
29
- CyberScraper 2077 is not just another web scraping tool – it's a glimpse into the future of data extraction. Born from the neon-lit streets of a cyberpunk world, this AI-powered scraper uses OpenAI, Gemini and LocalLLM Models to slice through the web's defenses, extracting the data you need with unparalleled precision and style.
30
-
31
- Whether you're a corpo data analyst, a street-smart netrunner, or just someone looking to pull information from the digital realm, CyberScraper 2077 has got you covered.
32
-
33
- <p align="center">
34
- <img src="https://i.postimg.cc/3NHb15wq/20240821-074556.gif">
35
- </p>
36
-
37
- ## ✨ Features
38
-
39
- - **AI-Powered Extraction**: Utilizes cutting-edge AI models to understand and parse web content intelligently.
40
- - **Sleek Streamlit Interface**: User-friendly GUI that even a chrome-armed street samurai could navigate.
41
- - **Multi-Format Support**: Export your data in JSON, CSV, HTML, SQL or Excel – whatever fits your cyberdeck.
42
- - **Tor Network Support**: Safely scrape .onion sites through the Tor network with automatic routing and security features.
43
- - **Stealth Mode**: Implemented stealth mode parameters that help avoid detection as a bot.
44
- - **Ollama Support**: Use a huge library of open source LLMs.
45
- - **Async Operations**: Lightning-fast scraping that would make a Trauma Team jealous.
46
- - **Smart Parsing**: Structures scraped content as if it was extracted straight from the engram of a master netrunner.
47
- - **Caching**: Implemented content-based and query-based caching using LRU cache and a custom dictionary to reduce redundant API calls.
48
- - **Upload to Google Sheets**: Now you can easily upload your extracted CSV data to Google Sheets with one click.
49
- - **Bypass Captcha**: Bypass captcha by using the -captcha at the end of the URL. (Currently only works natively, doesn't work on Docker)
50
- - **Current Browser**: The current browser feature uses your local browser instance which will help you bypass 99% of bot detections. (Only use when necessary)
51
- - **Navigate through the Pages (BETA)**: Navigate through the webpage and scrape data from different pages.
52
-
53
- ## 🪟 For Windows Users
54
 
55
- Please follow the Docker Container Guide given below, as I won't be able to maintain another version for Windows systems.
56
 
57
- ## 🛠 Installation
58
 
59
- **Note: CyberScraper 2077 requires Python 3.10 or higher.**
60
-
61
- 1. Clone this repository:
62
- ```bash
63
- git clone https://github.com/itsOwen/CyberScraper-2077.git
64
- cd CyberScraper-2077
65
- ```
66
-
67
- 2. Create and activate a virtual environment:
68
- ```bash
69
- virtualenv venv
70
- source venv/bin/activate # Optional
71
- ```
72
-
73
- 3. Install the required packages:
74
- ```bash
75
- pip install -r requirements.txt
76
- ```
77
-
78
- 4. Install the playwright:
79
- ```bash
80
- playwright install
81
- ```
82
-
83
- 5. Set OpenAI & Gemini Key in your environment:
84
-
85
- Linux/Mac:
86
- ```bash
87
- export OPENAI_API_KEY="your-api-key-here"
88
- export GOOGLE_API_KEY="your-api-key-here"
89
- ```
90
-
91
- ### Using Ollama
92
-
93
- Note: I only recommend using OpenAI and Gemini API as these models are really good at following instructions. If you are using open-source LLMs, make sure you have a good system as the speed of the data generation/presentation depends on how well your system can run the LLM. You may also have to fine-tune the prompt and add some additional filters yourself.
94
-
95
- ```bash
96
- 1. Setup Ollama using `pip install ollama`
97
- 2. Download Ollama from the official website: https://ollama.com/download
98
- 3. Now type: ollama pull llama3.1 or whatever LLM you want to use.
99
- 4. Now follow the rest of the steps below.
100
  ```
 
 
 
101
 
102
- ## 🐳 Docker Installation
103
-
104
- If you prefer to use Docker, follow these steps to set up and run CyberScraper 2077:
105
-
106
- 1. Ensure you have Docker installed on your system.
107
-
108
- 2. Clone this repository:
109
- ```bash
110
- git clone https://github.com/itsOwen/CyberScraper-2077.git
111
- cd CyberScraper-2077
112
- ```
113
 
114
- 3. Build the Docker image:
115
- ```bash
116
- docker build -t cyberscraper-2077 .
117
- ```
118
 
119
- 4. Run the container:
120
- ```bash
121
- docker run -p 8501:8501 -e OPENAI_API_KEY="your-actual-api-key" -e GOOGLE_API_KEY="your-actual-api-key" cyberscraper-2077
122
- ```
123
 
124
- ### Using Ollama with Docker
125
 
126
- If you want to use Ollama with the Docker setup:
 
127
 
128
- 1. Install Ollama on your host machine following the instructions at https://ollama.com/download
129
 
130
- 2. Run Ollama on your host machine:
131
- ```bash
132
- ollama pull llama3.1
133
- ```
 
 
 
 
 
134
 
135
- 3. Find your host machine's IP address:
136
- - On Linux/Mac: `ifconfig` or `ip addr show`
137
- - On Windows: `ipconfig`
138
 
139
- 4. Run the Docker container with the host network and set the Ollama URL:
140
- ```bash
141
- docker run -e OLLAMA_BASE_URL=http://host.docker.internal:11434 -p 8501:8501 cyberscraper-2077
142
- ```
143
 
144
- Now visit the url: http://localhost:8501/
145
 
146
- On Linux you might need to use this below:
147
- ```bash
148
- docker run -e OLLAMA_BASE_URL=http://<your-host-ip>:11434 -p 8501:8501 cyberscraper-2077
149
- ```
150
- Replace `<your-host-ip>` with your actual host machine IP address.
151
 
152
- 5. In the Streamlit interface, select the Ollama model you want to use (e.g., "ollama:llama3.1").
153
 
154
- Note: Ensure that your firewall allows connections to port 11434 for Ollama.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
- ## 🚀 Usage
 
 
157
 
158
- 1. Fire up the Streamlit app:
159
- ```bash
160
- streamlit run main.py
161
- ```
 
 
 
162
 
163
- 2. Open your browser and navigate to `http://localhost:8501`.
 
 
164
 
165
- 3. Enter the URL of the site you want to scrape or ask a question about the data you need.
 
 
 
 
 
 
166
 
167
- 4. Ask the chatbot to extract the data in any format. Select whatever data you want to export or even everything from the webpage.
 
 
168
 
169
- 5. Watch as CyberScraper 2077 tears through the net, extracting your data faster than you can say "flatline"!
 
 
 
170
 
171
- Example usage with page ranges:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  ```
173
- https://example.com/products 1-5
174
- https://example.com/search?q=cyberpunk&page={page} 1-10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  ```
176
-
177
- ## 🌐 Multi-Page Scraping (BETA)
178
-
179
- > **Note**: The multi-page scraping feature is currently in beta. While functional, you may encounter occasional issues or unexpected behavior. We appreciate your feedback and patience as we continue to improve this feature.
180
-
181
- CyberScraper 2077 now supports multi-page scraping, allowing you to extract data from multiple pages of a website in one go. This feature is perfect for scraping paginated content, search results, or any site with data spread across multiple pages.
182
-
183
- ### How to Use Multi-Page Scraping
184
-
185
- I suggest you enter the URL structure every time if you want to scrape multiple pages so it can detect the URL structure easily. It detects nearly all URL types.
186
-
187
- 1. **Basic Usage**:
188
- To scrape multiple pages, use the following format when entering the URL:
189
- ```
190
- https://example.com/page 1-5
191
- https://example.com/p/ 1-6
192
- https://example.com/xample/something-something-1279?p=1 1-3
193
- ```
194
- This will scrape pages 1 through 5 of the website.
195
-
196
- 2. **Custom Page Ranges**:
197
- You can specify custom page ranges:
198
- ```
199
- https://example.com/p/ 1-5,7,9-12
200
- https://example.com/xample/something-something-1279?p=1 1,7,8,9
201
- ```
202
- This will scrape pages 1 to 5, page 7, and pages 9 to 12.
203
-
204
- 3. **URL Patterns**:
205
- For websites with different URL structures, you can specify a pattern:
206
- ```
207
- https://example.com/search?q=cyberpunk&page={page} 1-5
208
- ```
209
- Replace `{page}` with where the page number should be in the URL.
210
-
211
- 4. **Automatic Pattern Detection**:
212
- If you don't specify a pattern, CyberScraper 2077 will attempt to detect the URL pattern automatically. However, for best results, specifying the pattern is recommended.
213
-
214
- ### Tips for Effective Multi-Page Scraping
215
-
216
- - Start with a small range of pages to test before scraping a large number.
217
- - Be mindful of the website's load and your scraping speed to avoid overloading servers.
218
- - Use the `simulate_human` option for more natural scraping behavior on sites with anti-bot measures.
219
- - Regularly check the website's `robots.txt` file and terms of service to ensure compliance.
220
-
221
- ### Example
222
-
223
- ```bash
224
- URL Example : "https://news.ycombinator.com/?p=1 1-3 or 1,2,3,4"
225
  ```
 
226
 
227
- If you want to scrape a specific page, just enter the query "please scrape page number 1 or 2". If you want to scrape all pages, simply give a query like "scrape all pages in csv" or whatever format you want.
228
-
229
- ## 🧅 Tor Network Scraping
230
-
231
- > **Note**: The Tor network scraping feature allows you to access and scrape .onion sites. This feature requires additional setup and should be used responsibly and legally.
232
-
233
- CyberScraper 2077 now supports scraping .onion sites through the Tor network, allowing you to access and extract data from the dark web safely and anonymously. This feature is perfect for researchers, security analysts, and investigators who need to gather information from Tor hidden services.
234
-
235
- ### Prerequisites
236
-
237
- 1. Install Tor on your system:
238
- ```bash
239
- # Ubuntu/Debian
240
- sudo apt install tor
241
-
242
- # macOS (using Homebrew)
243
- brew install tor
244
-
245
- # Start the Tor service
246
- sudo service tor start # on Linux
247
- brew services start tor # on macOS
248
- ```
249
-
250
- 2. Install additional Python packages:
251
- ```bash
252
- pip install PySocks requests[socks]
253
- ```
254
-
255
- ### Using Tor Scraping
256
-
257
- 1. **Basic Usage**:
258
- Simply enter an .onion URL, and CyberScraper will automatically detect and route it through the Tor network:
259
- ```
260
- http://example123abc.onion
261
- ```
262
-
263
- 2. **Safety Features**:
264
- - Automatic .onion URL detection
265
- - Built-in connection verification
266
- - Tor Browser-like request headers
267
- - Automatic circuit isolation
268
 
269
- ### Configuration Options
 
 
270
 
271
- You can customize the Tor scraping behavior by adjusting the following settings:
272
  ```python
273
- tor_config = TorConfig(
274
- socks_port=9050, # Default Tor SOCKS port
275
- circuit_timeout=10, # Timeout for circuit creation
276
- auto_renew_circuit=True, # Automatically renew Tor circuit
277
- verify_connection=True # Verify Tor connection before scraping
278
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  ```
280
 
281
- ### Security Considerations
282
 
283
- - Always ensure you're complying with local laws and regulations
284
- - Use a VPN in addition to Tor for extra security
285
- - Be patient as Tor connections can be slower than regular web scraping
286
- - Avoid sending personal or identifying information through Tor
287
- - Some .onion sites may be offline or unreachable
288
 
289
- ### Docker Support
290
 
291
- For Docker users, add these additional flags to enable Tor support:
292
  ```bash
293
- docker run -p 8501:8501 \
294
- --network="host" \
295
- -e OPENAI_API_KEY="your-api-key" \
296
- cyberscraper-2077
297
  ```
298
-
299
- ### Example Usage
300
-
301
- <p align="center">
302
- <img src="https://i.postimg.cc/3JvhgtMP/cyberscraper-onion.png" alt="CyberScraper 2077 Onion Scrape">
303
- </p>
304
-
305
- ## 🔐 Setup Google Sheets Authentication
306
-
307
- 1. Go to the Google Cloud Console (https://console.cloud.google.com/).
308
- 2. Select your project.
309
- 3. Navigate to "APIs & Services" > "Credentials".
310
- 4. Find your existing OAuth 2.0 Client ID and delete it.
311
- 5. Click "Create Credentials" > "OAuth client ID".
312
- 6. Choose "Web application" as the application type.
313
- 7. Name your client (e.g., "CyberScraper 2077 Web Client").
314
- 8. Under "Authorized JavaScript origins", add:
315
- - http://localhost:8501
316
- - http://localhost:8502
317
- - http://127.0.0.1:8501
318
- - http://127.0.0.1:8502
319
- 9. Under "Authorized redirect URIs", add:
320
- - http://localhost:8501/
321
- - http://127.0.0.1:8501/
322
- - http://localhost:8502/
323
- - http://127.0.0.1:8502/
324
- 10. Click "Create" to generate the new client ID.
325
- 11. Download the new client configuration JSON file and rename it to `client_secret.json`.
326
-
327
- ## ⚙️ Adjusting PlaywrightScraper Settings (optional)
328
-
329
- Customize the `PlaywrightScraper` settings to fit your scraping needs. If some websites are giving you issues, you might want to check the behavior of the website:
330
-
331
  ```bash
332
- use_stealth: bool = True,
333
- simulate_human: bool = False,
334
- use_custom_headers: bool = True,
335
- hide_webdriver: bool = True,
336
- bypass_cloudflare: bool = True
337
  ```
338
 
339
- Adjust these settings based on your target website and environment for optimal results.
 
340
 
341
- You can also bypass the captcha using the ```-captcha``` parameter at the end of the URL. The browser window will pop up, complete the captcha, and go back to your terminal window. Press enter and the bot will complete its task.
342
 
343
- ## 🤝 Contributing
344
 
345
- We welcome all cyberpunks, netrunners, and code samurais to contribute to CyberScraper 2077!
346
 
347
- 1. Fork the repository
348
- 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
349
- 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
350
- 4. Push to the branch (`git push origin feature/amazing-feature`)
351
- 5. Open a Pull Request
 
 
 
 
 
352
 
353
- ## 🔧 Troubleshooting
354
 
355
- Ran into a glitch in the matrix? Let me know by adding the issue to this repo so that we can fix it together.
356
 
357
- ## FAQ
358
 
359
- **Q: Is CyberScraper 2077 legal to use?**
360
- A: CyberScraper 2077 is designed for ethical web scraping. Always ensure you have the right to scrape a website and respect their robots.txt file.
 
 
361
 
362
- **Q: Can I use this for commercial purposes?**
363
- A: Yes, under the terms of the MIT License.
364
 
365
- ## 📄 License
366
 
367
- This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. Use it, mod it, sell it – just don't blame us if you end up flatlined.
368
 
369
- ## 📞 Contact
370
 
371
- Got questions? Need support? Want to hire me for a gig?
372
-
373
- - Email: owensingh72@proton.me
374
- - Website: [owen.sh](https://owen.sh)
375
-
376
- ## 🚨 Disclaimer
377
-
378
- Listen up, choombas! Before you jack into this code, you better understand the risks:
379
 
380
- 1. This software is provided "as is", without warranty of any kind, express or implied.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
- 2. The authors are not liable for any damages or losses resulting from the use of this software.
383
 
384
- 3. This tool is intended for educational and research purposes only. Any illegal use is strictly prohibited.
385
 
386
- 4. We do not guarantee the accuracy, completeness, or reliability of any data obtained through this tool.
387
 
388
- 5. By using this software, you acknowledge that you are doing so at your own risk.
 
389
 
390
- 6. You are responsible for complying with all applicable laws and regulations in your use of this software.
391
 
392
- 7. We reserve the right to modify or discontinue the software at any time without notice.
393
 
394
- Remember, samurai: In the dark future of the NET, knowledge is power, but it's also a double-edged sword. Use this tool wisely, and may your connection always be strong and your firewalls impenetrable. Stay frosty out there in the digital frontier.
395
 
396
- ![Alt](https://repobeats.axiom.co/api/embed/80758496e19179f355d6d71c180db7aca66d396b.svg "Repobeats analytics image")
 
397
 
398
  ---
399
-
400
- <p align="center">
401
- <strong>CyberScraper 2077 – Because in 2077, what makes someone a criminal? Getting caught.</strong>
402
- </p>
403
-
404
- <p align="center">
405
- Built with love and chrome by the streets of Night City | © 2077 Owen Singh
406
- </p>
 
1
  ---
2
  title: Scraper Hub
3
+ emoji: 🕷️
4
+ colorFrom: purple
5
+ colorTo: blue
6
  sdk: docker
7
  app_port: 7860
8
  ---
9
+ <!-- mcp-name: io.github.D4Vinci/Scrapling -->
10
+
11
+ <h1 align="center">
12
+ <a href="https://scrapling.readthedocs.io">
13
+ <picture>
14
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
15
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
16
+ </picture>
17
+ </a>
18
+ <br>
19
+ <small>Effortless Web Scraping for the Modern Web</small>
20
+ </h1>
21
 
22
  <p align="center">
23
+ <a href="https://trendshift.io/repositories/14244" target="_blank"><img src="https://trendshift.io/api/badge/repositories/14244" alt="D4Vinci%2FScrapling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
24
+ <br/>
25
+ <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_AR.md">العربيه</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_ES.md">Español</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_DE.md">Deutsch</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_CN.md">简体中文</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_JP.md">日本語</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_RU.md">Русский</a>
26
+ <br/>
27
+ <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
28
+ <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
29
+ <a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
30
+ <img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
31
+ <a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
32
+ <img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
33
+ <br/>
34
+ <a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
35
+ <img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
36
+ </a>
37
+ <a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
38
+ <img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
39
+ </a>
40
+ <br/>
41
+ <a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
42
+ <img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
43
  </p>
44
 
45
  <p align="center">
46
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Selection methods</strong></a>
47
+ &middot;
48
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Fetchers</strong></a>
49
+ &middot;
50
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>Spiders</strong></a>
51
+ &middot;
52
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>Proxy Rotation</strong></a>
53
+ &middot;
54
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
55
+ &middot;
56
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCP</strong></a>
57
  </p>
58
 
59
+ Scrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
+ Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation, all in a few lines of Python. One library, zero compromises.
62
 
63
+ Blazing-fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users alike; there's something for everyone.
64
 
65
+ ```python
66
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
67
+ StealthyFetcher.adaptive = True
68
+ p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Fetch website under the radar!
69
+ products = p.css('.product', auto_save=True) # Scrape data that survives website design changes!
70
+ products = p.css('.product', adaptive=True) # Later, if the website structure changes, pass `adaptive=True` to find them!
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  ```
72
+ Or scale up to full crawls
73
+ ```python
74
+ from scrapling.spiders import Spider, Response
75
 
76
+ class MySpider(Spider):
77
+ name = "demo"
78
+ start_urls = ["https://example.com/"]
 
 
 
 
 
 
 
 
79
 
80
+ async def parse(self, response: Response):
81
+ for item in response.css('.product'):
82
+ yield {"title": item.css('h2::text').get()}
 
83
 
84
+ MySpider().start()
85
+ ```
 
 
86
 
87
+ # Platinum Sponsors
88
 
89
+ <i><sub>Do you want to be the first company to show up here? Click [here](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>
90
+ # Sponsors
91
 
92
+ <!-- sponsors -->
93
 
94
+ <a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
95
+ <a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
96
+ <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
97
+ <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
98
+ <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
99
+ <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
100
+ <a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
101
+ <a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
102
+ <a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
103
 
 
 
 
104
 
105
+ <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
106
+ <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
107
+ <a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
 
108
 
109
+ <!-- /sponsors -->
110
 
111
+ <i><sub>Do you want to show your ad here? Click [here](https://github.com/sponsors/D4Vinci) and choose the tier that suits you!</sub></i>
 
 
 
 
112
 
113
+ ---
114
 
115
+ ## Key Features
116
+
117
+ ### Spiders — A Full Crawling Framework
118
+ - 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects.
119
+ - ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays.
120
+ - 🔄 **Multi-Session Support**: Unified interface for HTTP requests and stealthy headless browsers in a single spider — route requests to different sessions by ID.
121
+ - 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off.
122
+ - 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats — ideal for UI, pipelines, and long-running crawls (see the sketch right after this list).
123
+ - 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic.
124
+ - 📦 **Built-in Export**: Export results through hooks and your own pipeline or the built-in JSON/JSONL with `result.items.to_json()` / `result.items.to_jsonl()` respectively.
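+ A minimal sketch of the streaming mode mentioned in the list above. It reuses only the `Spider`, `parse`, and `stream()` APIs shown in this README, against the same demo site used in the examples below; adapt it to your own spider:
+ ```python
+ import asyncio
+
+ from scrapling.spiders import Spider, Response
+
+
+ class StreamingSpider(Spider):
+     name = "streaming-demo"
+     start_urls = ["https://quotes.toscrape.com/"]
+
+     async def parse(self, response: Response):
+         for quote in response.css('.quote'):
+             yield {"text": quote.css('.text::text').get()}
+
+
+ async def main():
+     # Items are yielded as soon as they are scraped, instead of waiting for the whole crawl to finish
+     async for item in StreamingSpider().stream():
+         print(item)
+
+ asyncio.run(main())
+ ```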
125
+
126
+ ### Advanced Websites Fetching with Session Support
127
+ - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. It can impersonate browsers' TLS fingerprints and headers, and use HTTP/3.
128
+ - **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium and Google's Chrome.
129
+ - **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. It can easily bypass all types of Cloudflare's Turnstile/Interstitial challenges through automation.
130
+ - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
131
+ - **Proxy Rotation**: Built-in `ProxyRotator` with cyclic or custom rotation strategies across all session types, plus per-request proxy overrides.
132
+ - **Domain Blocking**: Block requests to specific domains (and their subdomains) in browser-based fetchers.
133
+ - **Async Support**: Complete async support across all fetchers and dedicated async session classes.
134
+
135
+ ### Adaptive Scraping & AI Integration
136
+ - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
137
+ - 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
138
+ - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
139
+ - 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
140
+
141
+ ### High-Performance & Battle-Tested Architecture
142
+ - 🚀 **Lightning Fast**: Optimized performance outperforming most Python scraping libraries.
143
+ - 🔋 **Memory Efficient**: Optimized data structures and lazy loading for a minimal memory footprint.
144
+ - ⚡ **Fast JSON Serialization**: 10x faster than the standard library.
145
+ - 🏗️ **Battle tested**: Not only does Scrapling have 92% test coverage and full type hints coverage, but it has been used daily by hundreds of Web Scrapers over the past year.
146
+
147
+ ### Developer/Web Scraper Friendly Experience
148
+ - 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up the development of Web Scraping scripts, like converting curl commands to Scrapling requests and viewing request results in your browser.
149
+ - 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code!
150
+ - 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
151
+ - 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.
152
+ - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
153
+ - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
154
+ - 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. The entire codebase is automatically checked with **PyRight** and **MyPy** on each change.
155
+ - 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
156
+
157
+ ## Getting Started
158
+
159
+ Let's take a quick look at what Scrapling can do without diving too deep.
160
+
161
+ ### Basic Usage
162
+ HTTP requests with session support
163
+ ```python
164
+ from scrapling.fetchers import Fetcher, FetcherSession
165
 
166
+ with FetcherSession(impersonate='chrome') as session: # Use latest version of Chrome's TLS fingerprint
167
+ page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
168
+ quotes = page.css('.quote .text::text').getall()
169
 
170
+ # Or use one-off requests
171
+ page = Fetcher.get('https://quotes.toscrape.com/')
172
+ quotes = page.css('.quote .text::text').getall()
173
+ ```
174
+ Advanced stealth mode
175
+ ```python
176
+ from scrapling.fetchers import StealthyFetcher, StealthySession
177
 
178
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # Keep the browser open until you finish
179
+ page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
180
+ data = page.css('#padded_content a').getall()
181
 
182
+ # Or use one-off request style, it opens the browser for this request, then closes it after finishing
183
+ page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
184
+ data = page.css('#padded_content a').getall()
185
+ ```
186
+ Full browser automation
187
+ ```python
188
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
189
 
190
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Keep the browser open until you finish
191
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
192
+ data = page.xpath('//span[@class="text"]/text()').getall() # XPath selector if you prefer it
193
 
194
+ # Or use one-off request style, it opens the browser for this request, then closes it after finishing
195
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
196
+ data = page.css('.quote .text::text').getall()
197
+ ```
198
 
199
+ ### Spiders
200
+ Build full crawlers with concurrent requests, multiple session types, and pause/resume:
201
+ ```python
202
+ from scrapling.spiders import Spider, Request, Response
203
+
204
+ class QuotesSpider(Spider):
205
+ name = "quotes"
206
+ start_urls = ["https://quotes.toscrape.com/"]
207
+ concurrent_requests = 10
208
+
209
+ async def parse(self, response: Response):
210
+ for quote in response.css('.quote'):
211
+ yield {
212
+ "text": quote.css('.text::text').get(),
213
+ "author": quote.css('.author::text').get(),
214
+ }
215
+
216
+ next_page = response.css('.next a')
217
+ if next_page:
218
+ yield response.follow(next_page[0].attrib['href'])
219
+
220
+ result = QuotesSpider().start()
221
+ print(f"Scraped {len(result.items)} quotes")
222
+ result.items.to_json("quotes.json")
223
  ```
224
+ Use multiple session types in a single spider:
225
+ ```python
226
+ from scrapling.spiders import Spider, Request, Response
227
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
228
+
229
+ class MultiSessionSpider(Spider):
230
+ name = "multi"
231
+ start_urls = ["https://example.com/"]
232
+
233
+ def configure_sessions(self, manager):
234
+ manager.add("fast", FetcherSession(impersonate="chrome"))
235
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
236
+
237
+ async def parse(self, response: Response):
238
+ for link in response.css('a::attr(href)').getall():
239
+ # Route protected pages through the stealth session
240
+ if "protected" in link:
241
+ yield Request(link, sid="stealth")
242
+ else:
243
+ yield Request(link, sid="fast", callback=self.parse) # explicit callback
244
  ```
245
+ Pause and resume long crawls with checkpoints by running the spider like this:
246
+ ```python
247
+ QuotesSpider(crawldir="./crawl_data").start()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  ```
249
+ Press Ctrl+C to pause gracefully — progress is saved automatically. Later, when you start the spider again, pass the same `crawldir`, and it will resume from where it stopped.
250
 
251
+ ### Advanced Parsing & Navigation
252
+ ```python
253
+ from scrapling.fetchers import Fetcher
254
+
255
+ # Rich element selection and navigation
256
+ page = Fetcher.get('https://quotes.toscrape.com/')
257
+
258
+ # Get quotes with multiple selection methods
259
+ quotes = page.css('.quote') # CSS selector
260
+ quotes = page.xpath('//div[@class="quote"]') # XPath
261
+ quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup-style
262
+ # Same as
263
+ quotes = page.find_all('div', class_='quote')
264
+ quotes = page.find_all(['div'], class_='quote')
265
+ quotes = page.find_all(class_='quote') # and so on...
266
+ # Find element by text content
267
+ quotes = page.find_by_text('quote', tag='div')
268
+
269
+ # Advanced navigation
270
+ quote_text = page.css('.quote')[0].css('.text::text').get()
271
+ quote_text = page.css('.quote').css('.text::text').getall() # Chained selectors
272
+ first_quote = page.css('.quote')[0]
273
+ author = first_quote.next_sibling.css('.author::text')
274
+ parent_container = first_quote.parent
275
+
276
+ # Element relationships and similarity
277
+ similar_elements = first_quote.find_similar()
278
+ below_elements = first_quote.below_elements()
279
+ ```
280
+ If you don't want to fetch websites, you can use the parser directly, as shown below:
281
+ ```python
282
+ from scrapling.parser import Selector
 
 
 
 
 
 
 
 
 
283
 
284
+ page = Selector("<html>...</html>")
285
+ ```
286
+ And it works precisely the same way!
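+ For example, here is a minimal, self-contained sketch (the HTML string is made up purely for illustration):
+ ```python
+ from scrapling.parser import Selector
+
+ html = '<div class="quote"><span class="text">Hello, world</span></div>'
+ page = Selector(html)
+ print(page.css('.quote .text::text').get())  # Hello, world
+ ```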
287
 
288
+ ### Async Session Management Examples
289
  ```python
290
+ import asyncio
291
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
292
+
293
+ async with FetcherSession(http3=True) as session: # `FetcherSession` is context-aware and can work in both sync/async patterns
294
+ page1 = session.get('https://quotes.toscrape.com/')
295
+ page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
296
+
297
+ # Async session usage
298
+ async with AsyncStealthySession(max_pages=2) as session:
299
+ tasks = []
300
+ urls = ['https://example.com/page1', 'https://example.com/page2']
301
+
302
+ for url in urls:
303
+ task = session.fetch(url)
304
+ tasks.append(task)
305
+
306
+ print(session.get_pool_stats()) # Optional - The status of the browser tabs pool (busy/free/error)
307
+ results = await asyncio.gather(*tasks)
308
+ print(session.get_pool_stats())
309
  ```
310
 
311
+ ## CLI & Interactive Shell
312
 
313
+ Scrapling includes a powerful command-line interface:
 
 
 
 
314
 
315
+ [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
316
 
317
+ Launch the interactive Web Scraping shell
318
  ```bash
319
+ scrapling shell
 
 
 
320
  ```
321
+ Extract pages to a file directly without programming (it extracts the content inside the `body` tag by default). If the output file ends with `.txt`, the text content of the target will be extracted. If it ends in `.md`, it will be a Markdown representation of the HTML content; if it ends in `.html`, it will be the HTML content itself.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  ```bash
323
+ scrapling extract get 'https://example.com' content.md
324
+ scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # All elements matching the CSS selector '#fromSkipToProducts'
325
+ scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
326
+ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
 
327
  ```
328
 
329
+ > [!NOTE]
330
+ > There are many additional features, but we want to keep this page concise, including the MCP server and the interactive Web Scraping Shell. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
331
 
332
+ ## Performance Benchmarks
333
 
334
+ Scrapling isn't just powerful—it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries.
335
 
336
+ ### Text Extraction Speed Test (5000 nested elements)
337
 
338
+ | # | Library | Time (ms) | vs Scrapling |
339
+ |---|:-----------------:|:---------:|:------------:|
340
+ | 1 | Scrapling | 2.02 | 1.0x |
341
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
342
+ | 3 | Raw Lxml | 2.54 | 1.257x |
343
+ | 4 | PyQuery | 24.17 | ~12x |
344
+ | 5 | Selectolax | 82.63 | ~41x |
345
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
346
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
347
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
348
 
 
349
 
350
+ ### Element Similarity & Text Search Performance
351
 
352
+ Scrapling's adaptive element finding capabilities significantly outperform alternatives:
353
 
354
+ | Library | Time (ms) | vs Scrapling |
355
+ |-------------|:---------:|:------------:|
356
+ | Scrapling | 2.39 | 1.0x |
357
+ | AutoScraper | 12.45 | 5.209x |
358
 
 
 
359
 
360
+ > All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
361
 
362
+ ## Installation
363
 
364
+ Scrapling requires Python 3.10 or higher:
365
 
366
+ ```bash
367
+ pip install scrapling
368
+ ```
 
 
 
 
 
369
 
370
+ This installation only includes the parser engine and its dependencies, without any fetcher or command-line dependencies.
371
+
372
+ ### Optional Dependencies
373
+
374
+ 1. If you are going to use any of the extra features below, the fetchers, or their classes, you will need to install fetchers' dependencies and their browser dependencies as follows:
375
+ ```bash
376
+ pip install "scrapling[fetchers]"
377
+
378
+ scrapling install # normal install
379
+ scrapling install --force # force reinstall
380
+ ```
381
+
382
+ This downloads all browsers, along with their system dependencies and fingerprint manipulation dependencies.
383
+
384
+ Or you can run the installation from Python code instead of the command line, like this:
385
+ ```python
386
+ from scrapling.cli import install
387
+
388
+ install([], standalone_mode=False) # normal install
389
+ install(["--force"], standalone_mode=False) # force reinstall
390
+ ```
391
+
392
+ 2. Extra features:
393
+ - Install the MCP server feature:
394
+ ```bash
395
+ pip install "scrapling[ai]"
396
+ ```
397
+ - Install shell features (Web Scraping shell and the `extract` command):
398
+ ```bash
399
+ pip install "scrapling[shell]"
400
+ ```
401
+ - Install everything:
402
+ ```bash
403
+ pip install "scrapling[all]"
404
+ ```
405
+ Remember that you still need to install the browser dependencies with `scrapling install` after installing any of these extras (if you haven't already).
406
+
407
+ ### Docker
408
+ You can also pull a Docker image with all extras and browsers included from DockerHub with the following command:
409
+ ```bash
410
+ docker pull pyd4vinci/scrapling
411
+ ```
412
+ Or download it from the GitHub registry:
413
+ ```bash
414
+ docker pull ghcr.io/d4vinci/scrapling:latest
415
+ ```
416
+ This image is automatically built from the repository's main branch and pushed using GitHub Actions.
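+ For example, you could start the interactive shell inside the container. This is a hypothetical invocation that assumes the `scrapling` CLI is on the image's `PATH`; adjust the command if the image's entrypoint differs:
+ ```bash
+ # Assumes the scrapling CLI is available inside the image
+ docker run -it --rm ghcr.io/d4vinci/scrapling:latest scrapling shell
+ ```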
417
 
418
+ ## Contributing
419
 
420
+ We welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started.
421
 
422
+ ## Disclaimer
423
 
424
+ > [!CAUTION]
425
+ > This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect the terms of service of websites and robots.txt files.
426
 
427
+ ## License
428
 
429
+ This work is licensed under the BSD-3-Clause License.
430
 
431
+ ## Acknowledgments
432
 
433
+ This project includes code adapted from:
434
+ - Parsel (BSD License): used for the [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule
435
 
436
  ---
437
+ <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
 
 
 
 
 
 
 
ROADMAP.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## TODOs
2
+ - [x] Add more tests and increase the code coverage.
3
+ - [x] Structure the tests folder in a better way.
4
+ - [x] Add more documentation.
5
+ - [x] Add the browsing ability.
6
+ - [x] Create detailed documentation for the 'readthedocs' website, preferably add GitHub action for deploying it.
7
+ - [ ] Create a Scrapy plugin/decorator to make it replace parsel in the response argument when needed.
8
+ - [x] Need to add more functionality to `AttributesHandler` and more navigation functions to `Selector` object (ex: functions similar to map, filter, and reduce functions but here pass it to the element and the function is executed on children, siblings, next elements, etc...)
9
+ - [x] Add `.filter` method to `Selectors` object and other similar methods.
10
+ - [ ] Add functionality to automatically detect pagination URLs
11
+ - [ ] Add the ability to auto-detect schemas in pages and manipulate them.
12
+ - [ ] Add `analyzer` ability that tries to learn about the page through meta-elements and return what it learned
13
+ - [ ] Add the ability to generate a regex from a group of elements (Like for all href attributes)
14
+ -
benchmarks.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import time
3
+ import timeit
4
+ from statistics import mean
5
+
6
+ import requests
7
+ from autoscraper import AutoScraper
8
+ from bs4 import BeautifulSoup
9
+ from lxml import etree, html
10
+ from mechanicalsoup import StatefulBrowser
11
+ from parsel import Selector
12
+ from pyquery import PyQuery as pq
13
+ from selectolax.parser import HTMLParser
14
+
15
+ from scrapling import Selector as ScraplingSelector
16
+
17
+ large_html = (
18
+ "<html><body>" + '<div class="item">' * 5000 + "</div>" * 5000 + "</body></html>"
19
+ )
20
+
21
+
22
+ def benchmark(func):
23
+ @functools.wraps(func)
24
+ def wrapper(*args, **kwargs):
25
+ benchmark_name = func.__name__.replace("test_", "").replace("_", " ")
26
+ print(f"-> {benchmark_name}", end=" ", flush=True)
27
+ # Warm-up phase
28
+ timeit.repeat(
29
+ lambda: func(*args, **kwargs), number=2, repeat=2, globals=globals()
30
+ )
31
+ # Measure time (1 run, repeat 100 times, take average)
32
+ times = timeit.repeat(
33
+ lambda: func(*args, **kwargs),
34
+ number=1,
35
+ repeat=100,
36
+ globals=globals(),
37
+ timer=time.process_time,
38
+ )
39
+ avg_time = round(mean(times) * 1000, 2)  # Average runtime, converted to milliseconds
40
+ print(f"average execution time: {avg_time} ms")
41
+ return avg_time
42
+
43
+ return wrapper
44
+
45
+
46
+ @benchmark
47
+ def test_lxml():
48
+ return [
49
+ e.text
50
+ for e in etree.fromstring(
51
+ large_html,
52
+ # Scrapling and Parsel use the same parser inside, so this is just to make it fair
53
+ parser=html.HTMLParser(recover=True, huge_tree=True),
54
+ ).cssselect(".item")
55
+ ]
56
+
57
+
58
+ @benchmark
59
+ def test_bs4_lxml():
60
+ return [e.text for e in BeautifulSoup(large_html, "lxml").select(".item")]
61
+
62
+
63
+ @benchmark
64
+ def test_bs4_html5lib():
65
+ return [e.text for e in BeautifulSoup(large_html, "html5lib").select(".item")]
66
+
67
+
68
+ @benchmark
69
+ def test_pyquery():
70
+ return [e.text() for e in pq(large_html)(".item").items()]
71
+
72
+
73
+ @benchmark
74
+ def test_scrapling():
75
+ # No need to do `.extract()` like parsel to extract text
76
+ # Also, this is faster than `[t.text for t in Selector(large_html, adaptive=False).css('.item')]`
77
+ # for obvious reasons, of course.
78
+ return ScraplingSelector(large_html, adaptive=False).css(".item::text").getall()
79
+
80
+
81
+ @benchmark
82
+ def test_parsel():
83
+ return Selector(text=large_html).css(".item::text").extract()
84
+
85
+
86
+ @benchmark
87
+ def test_mechanicalsoup():
88
+ browser = StatefulBrowser()
89
+ browser.open_fake_page(large_html)
90
+ return [e.text for e in browser.page.select(".item")]
91
+
92
+
93
+ @benchmark
94
+ def test_selectolax():
95
+ return [node.text() for node in HTMLParser(large_html).css(".item")]
96
+
97
+
98
+ def display(results):
99
+ # Sort and display results
100
+ sorted_results = sorted(results.items(), key=lambda x: x[1]) # Sort by time
101
+ scrapling_time = results["Scrapling"]
102
+ print("\nRanked Results (fastest to slowest):")
103
+ print(f" i. {'Library tested':<18} | {'avg. time (ms)':<15} | vs Scrapling")
104
+ print("-" * 50)
105
+ for i, (test_name, test_time) in enumerate(sorted_results, 1):
106
+ compare = round(test_time / scrapling_time, 3)
107
+ print(f" {i}. {test_name:<18} | {str(test_time):<15} | {compare}")
108
+
109
+
110
+ @benchmark
111
+ def test_scrapling_text(request_html):
112
+ return ScraplingSelector(request_html, adaptive=False).find_by_text("Tipping the Velvet", first_match=True, clean_match=False).find_similar(ignore_attributes=["title"])
113
+
114
+
115
+ @benchmark
116
+ def test_autoscraper(request_html):
117
+ # autoscraper by default returns elements text
118
+ return AutoScraper().build(html=request_html, wanted_list=["Tipping the Velvet"])
119
+
120
+
121
+ if __name__ == "__main__":
122
+ print(
123
+ " Benchmark: Speed of parsing and retrieving the text content of 5000 nested elements \n"
124
+ )
125
+ results1 = {
126
+ "Raw Lxml": test_lxml(),
127
+ "Parsel/Scrapy": test_parsel(),
128
+ "Scrapling": test_scrapling(),
129
+ "Selectolax": test_selectolax(),
130
+ "PyQuery": test_pyquery(),
131
+ "BS4 with Lxml": test_bs4_lxml(),
132
+ "MechanicalSoup": test_mechanicalsoup(),
133
+ "BS4 with html5lib": test_bs4_html5lib(),
134
+ }
135
+
136
+ display(results1)
137
+ print("\n" + "=" * 25)
138
+ req = requests.get("https://books.toscrape.com/index.html")
139
+ print(
140
+ " Benchmark: Speed of searching for an element by text content, and retrieving the text of similar elements\n"
141
+ )
142
+ results2 = {
143
+ "Scrapling": test_scrapling_text(req.text),
144
+ "AutoScraper": test_autoscraper(req.text),
145
+ }
146
+ display(results2)
cleanup.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+
5
+ # Clean up after installing for local development
6
+ def clean():
7
+ # Get the current directory
8
+ base_dir = Path.cwd()
9
+
10
+ # Directories and patterns to clean
11
+ cleanup_patterns = [
12
+ "build",
13
+ "dist",
14
+ "*.egg-info",
15
+ "__pycache__",
16
+ ".eggs",
17
+ ".pytest_cache",
18
+ ]
19
+
20
+ # Clean directories
21
+ for pattern in cleanup_patterns:
22
+ for path in base_dir.glob(pattern):
23
+ try:
24
+ if path.is_dir():
25
+ shutil.rmtree(path)
26
+ else:
27
+ path.unlink()
28
+ print(f"Removed: {path}")
29
+ except Exception as e:
30
+ print(f"Could not remove {path}: {e}")
31
+
32
+ # Remove compiled Python files
33
+ for path in base_dir.rglob("*.py[co]"):
34
+ try:
35
+ path.unlink()
36
+ print(f"Removed compiled file: {path}")
37
+ except Exception as e:
38
+ print(f"Could not remove {path}: {e}")
39
+
40
+
41
+ if __name__ == "__main__":
42
+ clean()
docs/README_AR.md ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- mcp-name: io.github.D4Vinci/Scrapling -->
2
+
3
+ <h1 align="center">
4
+ <a href="https://scrapling.readthedocs.io">
5
+ <picture>
6
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
7
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
8
+ </picture>
9
+ </a>
10
+ <br>
11
+ <small>Effortless Web Scraping for the Modern Web</small>
12
+ </h1>
13
+
14
+ <p align="center">
15
+ <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
16
+ <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
17
+ <a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
18
+ <img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
19
+ <a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
20
+ <img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
21
+ <br/>
22
+ <a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
23
+ <img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
24
+ </a>
25
+ <a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
26
+ <img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
27
+ </a>
28
+ <br/>
29
+ <a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
30
+ <img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
31
+ </p>
32
+
33
+ <p align="center">
34
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>طرق الاختيار</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>اختيار Fetcher</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>العناكب</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>تدوير البروكسي</strong></a>
41
+ &middot;
42
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>واجهة سطر الأوامر</strong></a>
43
+ &middot;
44
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>وضع MCP</strong></a>
45
+ </p>
46
+
47
+ Scrapling هو إطار عمل تكيفي لـ Web Scraping يتعامل مع كل شيء من طلب واحد إلى زحف كامل النطاق.
48
+
49
+ محلله يتعلم من تغييرات المواقع ويعيد تحديد موقع عناصرك تلقائياً عند تحديث الصفحات. جوالبه تتجاوز أنظمة مكافحة الروبوتات مثل Cloudflare Turnstile مباشرةً. وإطار عمل Spider الخاص به يتيح لك التوسع إلى عمليات زحف متزامنة ومتعددة الجلسات مع إيقاف/استئناف وتدوير تلقائي لـ Proxy - كل ذلك في بضعة أسطر من Python. مكتبة واحدة، بدون تنازلات.
50
+
51
+ زحف سريع للغاية مع إحصائيات فورية و Streaming. مبني بواسطة مستخرجي الويب لمستخرجي الويب والمستخدمين العاديين، هناك شيء للجميع.
52
+
53
+ ```python
54
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
55
+ StealthyFetcher.adaptive = True
56
+ p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # احصل على الموقع بشكل خفي!
57
+ products = p.css('.product', auto_save=True) # استخرج بيانات تنجو من تغييرات تصميم الموقع!
58
+ products = p.css('.product', adaptive=True) # لاحقاً، إذا تغيرت بنية الموقع، مرر `adaptive=True` للعثور عليها!
59
+ ```
60
+ أو توسع إلى عمليات زحف كاملة
61
+ ```python
62
+ from scrapling.spiders import Spider, Response
63
+
64
+ class MySpider(Spider):
65
+ name = "demo"
66
+ start_urls = ["https://example.com/"]
67
+
68
+ async def parse(self, response: Response):
69
+ for item in response.css('.product'):
70
+ yield {"title": item.css('h2::text').get()}
71
+
72
+ MySpider().start()
73
+ ```
74
+
75
+
76
+ # الرعاة البلاتينيون
77
+
78
+ <i><sub>هل تريد أن تكون أول شركة تظهر هنا؟ انقر [هنا](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>
79
+ # الرعاة
80
+
81
+ <!-- sponsors -->
82
+
83
+ <a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
84
+ <a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
85
+ <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
86
+ <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
87
+ <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
88
+ <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
89
+ <a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
90
+ <a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
91
+ <a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
92
+
93
+
94
+ <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
95
+ <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
96
+ <a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
97
+
98
+ <!-- /sponsors -->
99
+
100
+ <i><sub>هل تريد عرض إعلانك هنا؟ انقر [هنا](https://github.com/sponsors/D4Vinci) واختر المستوى الذي يناسبك!</sub></i>
101
+
102
+ ---
103
+
104
+ ## الميزات الرئيسية
105
+
106
+ ### Spiders — إطار عمل زحف كامل
107
+ - 🕷️ **واجهة Spider شبيهة بـ Scrapy**: عرّف Spiders مع `start_urls`، و async `parse` callbacks، وكائنات `Request`/`Response`.
108
+ - ⚡ **زحف متزامن**: حدود تزامن قابلة للتكوين، وتحكم بالسرعة حسب النطاق، وتأخيرات التنزيل.
109
+ - 🔄 **دعم الجلسات المتعددة**: واجهة موحدة لطلبات HTTP، ومتصفحات خفية بدون واجهة في Spider واحد — وجّه الطلبات إلى جلسات مختلفة بالمعرّف.
110
+ - 💾 **إيقاف واستئناف**: استمرارية الزحف القائمة على Checkpoint. اضغط Ctrl+C للإيقاف بسلاسة؛ أعد التشغيل للاستئناف من حيث توقفت.
111
+ - 📡 **وضع Streaming**: بث العناصر المستخرجة فور وصولها عبر `async for item in spider.stream()` مع إحصائيات فورية — مثالي لواجهات المستخدم وخطوط الأنابيب وعمليات الزحف الطويلة.
112
+ - 🛡️ **كشف الطلبات المحظورة**: كشف تلقائي وإعادة محاولة للطلبات المحظورة مع منطق قابل للتخصيص.
113
+ - 📦 **تصدير مدمج**: صدّر النتائج عبر الخطافات وخط الأنابيب الخاص بك أو JSON/JSONL المدمج مع `result.items.to_json()` / `result.items.to_jsonl()` على التوالي.
114
+
115
+ ### جلب متقدم للمواقع مع دعم الجلسات
116
+ - **طلبات HTTP**: طلبات HTTP سريعة وخفية مع فئة `Fetcher`. يمكنها تقليد بصمة TLS للمتصفح والرؤوس واستخدام HTTP/3.
117
+ - **التحميل الديناميكي**: جلب المواقع الديناميكية مع أتمتة كاملة للمتصفح من خلال فئة `DynamicFetcher` التي تدعم Chromium من Playwright و Google Chrome.
118
+ - **تجاوز مكافحة الروبوتات**: قدرات تخفي متقدمة مع `StealthyFetcher` وانتحال fingerprint. يمكنه تجاوز جميع أنواع Turnstile/Interstitial من Cloudflare بسهولة بالأتمتة.
119
+ - **إدارة الجلسات**: دعم الجلسات المستمرة مع فئات `FetcherSession` و`StealthySession` و`DynamicSession` لإدارة ملفات تعريف الارتباط والحالة عبر الطلبات.
120
+ - **تدوير Proxy**: `ProxyRotator` مدمج مع استراتيجيات التدوير الدوري أو المخصصة عبر جميع أنواع الجلسات، بالإضافة إلى تجاوزات Proxy لكل طلب.
121
+ - **حظر النطاقات**: حظر الطلبات إلى نطاقات محددة (ونطاقاتها الفرعية) في الجوالب المعتمدة على المتصفح.
122
+ - **دعم Async**: دعم async كامل عبر جميع الجوالب وفئات الجلسات async المخصصة.
123
+
124
+ ### الاستخراج التكيفي والتكامل مع الذكاء الاصطناعي
125
+ - 🔄 **تتبع العناصر الذكي**: إعادة تحديد موقع العناصر بعد تغييرات الموقع باستخدام خوارزميات التشابه الذكية.
126
+ - 🎯 **الاختيار المرن الذكي**: محددات CSS، محددات XPath، البحث القائم على الفلاتر، البحث النصي، البحث بالتعبيرات العادية والمزيد.
127
+ - 🔍 **البحث عن عناصر مشابهة**: تحديد العناصر المشابهة للعناصر الموجودة تلقائياً.
128
+ - 🤖 **خادم MCP للاستخدام مع الذكاء الاصطناعي**: خادم MCP مدمج لـ Web Scraping بمساعدة الذكاء الاصطناعي واستخراج البيانات. يتميز خادم MCP بقدرات قوية مخصصة تستفيد من Scrapling لاستخراج المحتوى المستهدف قبل تمريره إلى الذكاء الاصطناعي (Claude/Cursor/إلخ)، وبالتالي تسريع العمليات وتقليل التكاليف عن طريق تقليل استخدام الرموز. ([فيديو توضيحي](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
129
+
130
+ ### بنية عالية الأداء ومختبرة ميدانياً
131
+ - 🚀 **سريع كالبرق**: أداء محسّن يتفوق على معظم مكتبات Web Scraping في Python.
132
+ - 🔋 **فعال في استخدام الذاكرة**: هياكل بيانات محسّنة وتحميل كسول لأقل استخدام للذاكرة.
133
+ - ⚡ **تسلسل JSON سريع**: أسرع 10 مرات من المكتبة القياسية.
134
+ - 🏗️ **مُختبر ميدانياً**: لا يمتلك Scrapling فقط تغطية اختبار بنسبة 92٪ وتغطية كاملة لتلميحات الأنواع، بل تم استخدامه يومياً من قبل مئات مستخرجي الويب خلال العام الماضي.
135
+
136
+ ### تجربة صديقة للمطورين/مستخرجي الويب
137
+ - 🎯 **Shell تفاعلي لـ Web Scraping**: Shell IPython مدمج اختياري مع تكامل Scrapling، واختصارات، وأدوات جديدة لتسريع تطوير سكريبتات Web Scraping، مثل تحويل طلبات curl إلى طلبات Scrapling وعرض نتائج الطلبات في متصفحك.
138
+ - 🚀 **استخدمه مباشرة من الطرفية**: اختيارياً، يمكنك استخدام Scrapling لاستخراج عنوان URL دون كتابة سطر واحد من الكود!
139
+ - 🛠️ **واجهة تنقل غنية**: اجتياز DOM متقدم مع طرق التنقل بين العناصر الوالدية والشقيقة والفرعية.
140
+ - 🧬 **معالجة نصوص محسّنة**: تعبيرات عادية مدمجة وطرق تنظيف وعمليات نصية محسّنة.
141
+ - 📝 **إنشاء محددات تلقائي**: إنشاء محددات CSS/XPath قوية لأي عنصر.
142
+ - 🔌 **واجهة مألوفة**: مشابه لـ Scrapy/BeautifulSoup مع نفس العناصر الزائفة المستخدمة في Scrapy/Parsel.
143
+ - 📘 **تغطية كاملة للأنواع**: تلميحات نوع كاملة لدعم IDE ممتاز وإكمال الكود. يتم فحص قاعدة الكود بالكامل تلقائياً بواسطة **PyRight** و**MyPy** مع كل تغيير.
144
+ - 🔋 **صورة Docker جاهزة**: مع كل إصدار، يتم بناء ودفع صورة Docker تحتوي على جميع المتصفحات تلقائياً.
145
+
146
+ ## البدء
147
+
148
+ لنلقِ نظرة سريعة على ما يمكن لـ Scrapling فعله دون التعمق.
149
+
150
+ ### الاستخدام الأساسي
151
+ طلبات HTTP مع دعم الجلسات
152
+ ```python
153
+ from scrapling.fetchers import Fetcher, FetcherSession
154
+
155
+ with FetcherSession(impersonate='chrome') as session: # استخدم أحدث إصدار من بصمة TLS لـ Chrome
156
+ page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
157
+ quotes = page.css('.quote .text::text').getall()
158
+
159
+ # أو استخدم طلبات لمرة واحدة
160
+ page = Fetcher.get('https://quotes.toscrape.com/')
161
+ quotes = page.css('.quote .text::text').getall()
162
+ ```
163
+ وضع التخفي المتقدم
164
+ ```python
165
+ from scrapling.fetchers import StealthyFetcher, StealthySession
166
+
167
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # أبقِ المتصفح مفتوحاً حتى تنتهي
168
+ page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
169
+ data = page.css('#padded_content a').getall()
170
+
171
+ # أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء
172
+ page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
173
+ data = page.css('#padded_content a').getall()
174
+ ```
175
+ أتمتة المتصفح الكاملة
176
+ ```python
177
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
178
+
179
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # أبقِ المتصفح مفتوحاً حتى تنتهي
180
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
181
+ data = page.xpath('//span[@class="text"]/text()').getall() # محدد XPath إذا كنت تفضله
182
+
183
+ # أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء
184
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
185
+ data = page.css('.quote .text::text').getall()
186
+ ```
187
+
188
+ ### Spiders
189
+ ابنِ زواحف كاملة مع طلبات متزامنة وأنواع جلسات متعددة وإيقاف/استئناف:
190
+ ```python
191
+ from scrapling.spiders import Spider, Request, Response
192
+
193
+ class QuotesSpider(Spider):
194
+ name = "quotes"
195
+ start_urls = ["https://quotes.toscrape.com/"]
196
+ concurrent_requests = 10
197
+
198
+ async def parse(self, response: Response):
199
+ for quote in response.css('.quote'):
200
+ yield {
201
+ "text": quote.css('.text::text').get(),
202
+ "author": quote.css('.author::text').get(),
203
+ }
204
+
205
+ next_page = response.css('.next a')
206
+ if next_page:
207
+ yield response.follow(next_page[0].attrib['href'])
208
+
209
+ result = QuotesSpider().start()
210
+ print(f"Scraped {len(result.items)} quotes")
211
+ result.items.to_json("quotes.json")
212
+ ```
213
+ استخدم أنواع جلسات متعددة في Spider واحد:
214
+ ```python
215
+ from scrapling.spiders import Spider, Request, Response
216
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
217
+
218
+ class MultiSessionSpider(Spider):
219
+ name = "multi"
220
+ start_urls = ["https://example.com/"]
221
+
222
+ def configure_sessions(self, manager):
223
+ manager.add("fast", FetcherSession(impersonate="chrome"))
224
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
225
+
226
+ async def parse(self, response: Response):
227
+ for link in response.css('a::attr(href)').getall():
228
+ # وجّه الصفحات المحمية عبر جلسة التخفي
229
+ if "protected" in link:
230
+ yield Request(link, sid="stealth")
231
+ else:
232
+ yield Request(link, sid="fast", callback=self.parse) # callback صريح
233
+ ```
234
+ أوقف واستأنف عمليات الزحف الطويلة مع Checkpoints بتشغيل Spider هكذا:
235
+ ```python
236
+ QuotesSpider(crawldir="./crawl_data").start()
237
+ ```
238
+ اضغط Ctrl+C للإيقاف بسلاسة — يتم حفظ التقدم تلقائياً. لاحقاً، عند تشغيل Spider مرة أخرى، مرر نفس `crawldir`، وسيستأنف من حيث توقف.
239
+
240
+ ### التحليل المتقدم والتنقل
241
+ ```python
242
+ from scrapling.fetchers import Fetcher
243
+
244
+ # اختيار عناصر غني وتنقل
245
+ page = Fetcher.get('https://quotes.toscrape.com/')
246
+
247
+ # احصل على الاقتباسات بطرق اختيار متعددة
248
+ quotes = page.css('.quote') # محدد CSS
249
+ quotes = page.xpath('//div[@class="quote"]') # XPath
250
+ quotes = page.find_all('div', {'class': 'quote'}) # بأسلوب BeautifulSoup
251
+ # نفس الشيء مثل
252
+ quotes = page.find_all('div', class_='quote')
253
+ quotes = page.find_all(['div'], class_='quote')
254
+ quotes = page.find_all(class_='quote') # وهكذا...
255
+ # البحث عن عنصر بمحتوى النص
256
+ quotes = page.find_by_text('quote', tag='div')
257
+
258
+ # التنقل المتقدم
259
+ quote_text = page.css('.quote')[0].css('.text::text').get()
260
+ quote_text = page.css('.quote').css('.text::text').getall() # محددات متسلسلة
261
+ first_quote = page.css('.quote')[0]
262
+ author = first_quote.next_sibling.css('.author::text')
263
+ parent_container = first_quote.parent
264
+
265
+ # علاقات العناصر والتشابه
266
+ similar_elements = first_quote.find_similar()
267
+ below_elements = first_quote.below_elements()
268
+ ```
269
+ يمكنك استخدام المحلل مباشرة إذا كنت لا تريد جلب المواقع كما يلي:
270
+ ```python
271
+ from scrapling.parser import Selector
272
+
273
+ page = Selector("<html>...</html>")
274
+ ```
275
+ وهو يعمل بنفس الطريقة تماماً!
276
+
277
+ ### أمثلة إدارة الجلسات بشكل Async
278
+ ```python
279
+ import asyncio
280
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
281
+
282
+ async with FetcherSession(http3=True) as session: # `FetcherSession` واعٍ بالسياق ويعمل في كلا النمطين المتزامن/async
283
+ page1 = session.get('https://quotes.toscrape.com/')
284
+ page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
285
+
286
+ # استخدام جلسة async
287
+ async with AsyncStealthySession(max_pages=2) as session:
288
+ tasks = []
289
+ urls = ['https://example.com/page1', 'https://example.com/page2']
290
+
291
+ for url in urls:
292
+ task = session.fetch(url)
293
+ tasks.append(task)
294
+
295
+ print(session.get_pool_stats()) # اختياري - حالة مجموعة علامات تبويب المتصفح (مشغول/حر/خطأ)
296
+ results = await asyncio.gather(*tasks)
297
+ print(session.get_pool_stats())
298
+ ```
299
+
300
+ ## واجهة سطر الأوامر والـ Shell التفاعلي
301
+
302
+ يتضمن Scrapling واجهة سطر أوامر قوية:
303
+
304
+ [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
305
+
306
+ تشغيل Shell الـ Web Scraping التفاعلي
307
+ ```bash
308
+ scrapling shell
309
+ ```
310
+ استخرج الصفحات إلى ملف مباشرة دون برمجة (يستخرج المحتوى داخل وسم `body` افتراضياً). إذا انتهى ملف الإخراج بـ `.txt`، فسيتم استخراج محتوى النص للهدف. إذا انتهى بـ `.md`، فسيكون تمثيل Markdown لمحتوى HTML؛ إذا انتهى بـ `.html`، فسيكون محتوى HTML نفسه.
311
+ ```bash
312
+ scrapling extract get 'https://example.com' content.md
313
+ scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # جميع العناصر المطابقة لمحدد CSS '#fromSkipToProducts'
314
+ scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
315
+ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
316
+ ```
317
+
318
+ > [!NOTE]
319
+ > هناك العديد من الميزات الإضافية، لكننا نريد إبقاء هذه الصفحة موجزة، بما في ذلك خادم MCP والـ Shell التفاعلي لـ Web Scraping. تحقق من الوثائق الكاملة [هنا](https://scrapling.readthedocs.io/en/latest/)
320
+
321
+ ## معايير الأداء
322
+
323
+ Scrapling ليس قوياً فحسب — بل هو أيضاً سريع بشكل مذهل. تقارن المعايير التالية محلل Scrapling مع أحدث إصدارات المكتبات الشائعة الأخرى.
324
+
325
+ ### اختبار سرعة استخراج النص (5000 عنصر متداخل)
326
+
327
+ | # | المكتبة | الوقت (ms) | vs Scrapling |
328
+ |---|:-----------------:|:----------:|:------------:|
329
+ | 1 | Scrapling | 2.02 | 1.0x |
330
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
331
+ | 3 | Raw Lxml | 2.54 | 1.257x |
332
+ | 4 | PyQuery | 24.17 | ~12x |
333
+ | 5 | Selectolax | 82.63 | ~41x |
334
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
335
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
336
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
337
+
338
+
339
+ ### أداء تشابه العناصر والبحث النصي
340
+
341
+ قدرات العثور على العناصر التكيفية لـ Scrapling تتفوق بشكل كبير على البدائل:
342
+
343
+ | المكتبة | الوقت (ms) | vs Scrapling |
344
+ |-------------|:----------:|:------------:|
345
+ | Scrapling | 2.39 | 1.0x |
346
+ | AutoScraper | 12.45 | 5.209x |
347
+
348
+
349
+ > تمثل جميع المعايير متوسطات أكثر من 100 تشغيل. انظر [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) للمنهجية.
350
+
351
+ ## التثبيت
352
+
353
+ يتطلب Scrapling إصدار Python 3.10 أو أعلى:
354
+
355
+ ```bash
356
+ pip install scrapling
357
+ ```
358
+
359
+ يتضمن هذا التثبيت فقط محرك المحلل وتبعياته، بدون أي جوالب أو تبعيات سطر الأوامر.
360
+
361
+ ### التبعيات الاختيارية
362
+
363
+ 1. إذا كنت ستستخدم أياً من الميزات الإضافية أدناه، أو الجوالب، أو فئاتها، فستحتاج إلى تثبيت تبعيات الجوالب وتبعيات المتصفح الخاصة بها على النحو التالي:
364
+ ```bash
365
+ pip install "scrapling[fetchers]"
366
+
367
+ scrapling install # normal install
368
+ scrapling install --force # force reinstall
369
+ ```
370
+
371
+ يقوم هذا بتنزيل جميع المتصفحات، إلى جانب تبعيات النظام وتبعيات معالجة fingerprint الخاصة بها.
372
+
373
+ أو يمكنك تثبيتها من الكود بدلاً من تشغيل أمر كالتالي:
374
+ ```python
375
+ from scrapling.cli import install
376
+
377
+ install([], standalone_mode=False) # normal install
378
+ install(["--force"], standalone_mode=False) # force reinstall
379
+ ```
380
+
381
+ 2. ميزات إضافية:
382
+ - تثبيت ميزة خادم MCP:
383
+ ```bash
384
+ pip install "scrapling[ai]"
385
+ ```
386
+ - تثبيت ميزات Shell (Shell الـ Web Scraping وأمر `extract`):
387
+ ```bash
388
+ pip install "scrapling[shell]"
389
+ ```
390
+ - تثبيت كل شيء:
391
+ ```bash
392
+ pip install "scrapling[all]"
393
+ ```
394
+ تذكر أنك تحتاج إلى تثبيت تبعيات المتصفح مع `scrapling install` بعد أي من هذه الإضافات (إذا لم تكن قد فعلت ذلك بالفعل)
395
+
396
+ ### Docker
397
+ يمكنك أيضاً تثبيت صورة Docker مع جميع الإضافات والمتصفحات باستخدام الأمر التالي من DockerHub:
398
+ ```bash
399
+ docker pull pyd4vinci/scrapling
400
+ ```
401
+ أو تنزيلها من سجل GitHub:
402
+ ```bash
403
+ docker pull ghcr.io/d4vinci/scrapling:latest
404
+ ```
405
+ يتم بناء هذه الصورة ودفعها تلقائياً باستخدام GitHub Actions والفرع الرئيسي للمستودع.
406
+
407
+ ## المساهمة
408
+
409
+ نرحب بالمساهمات! يرجى قراءة [إرشادات المساهمة](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) قبل البدء.
410
+
411
+ ## إخلاء المسؤولية
412
+
413
+ > [!CAUTION]
414
+ > يتم توفير هذه المكتبة للأغراض التعليمية والبحثية فقط. باستخدام هذه المكتبة، فإنك توافق على الامتثال لقوانين استخراج البيانات والخصوصية المحلية والدولية. المؤلفون والمساهمون غير مسؤولين عن أي إساءة استخدام لهذا البرنامج. احترم دائماً شروط خدمة المواقع وملفات robots.txt.
415
+
416
+ ## الترخيص
417
+
418
+ هذا العمل مرخص بموجب ترخيص BSD-3-Clause.
419
+
420
+ ## الشكر والتقدير
421
+
422
+ يتضمن هذا المشروع كوداً معدلاً من:
423
+ - Parsel (ترخيص BSD) — يُستخدم للوحدة الفرعية [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
424
+
425
+ ---
426
+ <div align="center"><small>مصمم ومصنوع بـ ❤️ بواسطة كريم شعير.</small></div><br>
docs/README_CN.md ADDED
@@ -0,0 +1,426 @@
1
+ <!-- mcp-name: io.github.D4Vinci/Scrapling -->
2
+
3
+ <h1 align="center">
4
+ <a href="https://scrapling.readthedocs.io">
5
+ <picture>
6
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
7
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
8
+ </picture>
9
+ </a>
10
+ <br>
11
+ <small>Effortless Web Scraping for the Modern Web</small>
12
+ </h1>
13
+
14
+ <p align="center">
15
+ <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
16
+ <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
17
+ <a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
18
+ <img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
19
+ <a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
20
+ <img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
21
+ <br/>
22
+ <a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
23
+ <img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
24
+ </a>
25
+ <a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
26
+ <img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
27
+ </a>
28
+ <br/>
29
+ <a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
30
+ <img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
31
+ </p>
32
+
33
+ <p align="center">
34
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>选择方法</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>选择Fetcher</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>爬虫</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>代理轮换</strong></a>
41
+ &middot;
42
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
43
+ &middot;
44
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCP模式</strong></a>
45
+ </p>
46
+
47
+ Scrapling是一个自适应Web Scraping框架,能处理从单个请求到大规模爬取的一切需求。
48
+
49
+ 它的解析器能够从网站变化中学习,并在页面更新时自动重新定位您的元素。它的Fetcher能够开箱即用地绕过Cloudflare Turnstile等反机器人系统。它的Spider框架让您可以扩展到并发、多Session爬取,支持暂停/恢复和自动Proxy轮换——只需几行Python代码。一个库,零妥协。
50
+
51
+ 极速爬取,实时统计和Streaming。由Web Scraper为Web Scraper和普通用户而构建,每个人都能找到适合自己的功能。
52
+
53
+ ```python
54
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
55
+ StealthyFetcher.adaptive = True
56
+ p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # 隐秘地获取网站!
57
+ products = p.css('.product', auto_save=True) # 抓取在网站设计变更后仍能存活的数据!
58
+ products = p.css('.product', adaptive=True) # 之后,如果网站结构改变,传递 `adaptive=True` 来找到它们!
59
+ ```
60
+ 或扩展为完整爬取
61
+ ```python
62
+ from scrapling.spiders import Spider, Response
63
+
64
+ class MySpider(Spider):
65
+ name = "demo"
66
+ start_urls = ["https://example.com/"]
67
+
68
+ async def parse(self, response: Response):
69
+ for item in response.css('.product'):
70
+ yield {"title": item.css('h2::text').get()}
71
+
72
+ MySpider().start()
73
+ ```
74
+
75
+
76
+ # 铂金赞助商
77
+
78
+ <i><sub>想成为第一个出现在这里的公司吗?点击[这里](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>
79
+ # 赞助商
80
+
81
+ <!-- sponsors -->
82
+
83
+ <a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
84
+ <a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
85
+ <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
86
+ <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
87
+ <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
88
+ <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
89
+ <a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
90
+ <a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
91
+ <a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
92
+
93
+
94
+ <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
95
+ <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
96
+ <a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
97
+
98
+ <!-- /sponsors -->
99
+
100
+ <i><sub>想在这里展示您的广告吗?点击[这里](https://github.com/sponsors/D4Vinci)并选择适合您的级别!</sub></i>
101
+
102
+ ---
103
+
104
+ ## 主要特性
105
+
106
+ ### Spider — 完整的爬取框架
107
+ - 🕷️ **类Scrapy的Spider API**:使用`start_urls`、async `parse` callback和`Request`/`Response`对象定义Spider。
108
+ - ⚡ **并发爬取**:可配置的并发限制、按域名节流和下载延迟。
109
+ - 🔄 **多Session支持**:统一接口,支持HTTP请求和隐秘无头浏览器在同一个Spider中使用——通过ID将请求路由到不同的Session。
110
+ - 💾 **暂停与恢复**:基于Checkpoint的爬取持久化。按Ctrl+C优雅关闭;重启后从上次停止的地方继续。
111
+ - 📡 **Streaming模式**:通过`async for item in spider.stream()`以实时统计Streaming抓取的数据——非常适合UI、管道和长时间运行的爬取。
112
+ - 🛡️ **被阻止请求检测**:自动检测并重试被阻止的请求,支持自定义逻辑。
113
+ - 📦 **内置导出**:通过钩子和您自己的管道导出结果,或使用内置的JSON/JSONL,分别通过`result.items.to_json()`/`result.items.to_jsonl()`。
114
+
115
+ ### 支持Session的高级网站获取
116
+ - **HTTP请求**:使用`Fetcher`类进行快速和隐秘的HTTP请求。可以模拟浏览器的TLS fingerprint、标头并使用HTTP/3。
117
+ - **动态加载**:通过`DynamicFetcher`类使用完整的浏览器自动化获取动态网站,支持Playwright的Chromium和Google Chrome。
118
+ - **反机器人绕过**:使用`StealthyFetcher`的高级隐秘功能和fingerprint伪装。可以轻松自动绕过所有类型的Cloudflare Turnstile/Interstitial。
119
+ - **Session管理**:使用`FetcherSession`、`StealthySession`和`DynamicSession`类实现持久化Session支持,用于跨请求的cookie和状态管理。
120
+ - **Proxy轮换**:内置`ProxyRotator`,支持轮询或自定义策略,适用于所有Session类型,并支持按请求覆盖Proxy。
121
+ - **域名屏蔽**:在基于浏览器的Fetcher中屏蔽对特定域名(及其子域名)的请求。
122
+ - **Async支持**:所有Fetcher和专用async Session类的完整async支持。
123
+
124
+ ### 自适应抓取和AI集成
125
+ - 🔄 **智能元素跟踪**:使用智能相似性算法在网站更改后重新定位元素。
126
+ - 🎯 **智能灵活选择**:CSS选择器、XPath选择器、基于过滤器的搜索、文本搜索、正则表达式搜索等。
127
+ - 🔍 **查找相似元素**:自动定位与已找到元素相似的元素。
128
+ - 🤖 **与AI一起使用的MCP服务器**:内置MCP服务器用于AI辅助Web Scraping和数据提取。MCP服务器具有强大的自定义功能,利用Scrapling在将内容传递给AI(Claude/Cursor等)之前提取目标内容,从而加快操作并通过最小化token使用来降低成本。([演示视频](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
129
+
130
+ ### 高性能和经过实战测试的架构
131
+ - 🚀 **闪电般快速**:优化性能超越大多数Python抓取库。
132
+ - 🔋 **内存高效**:优化的数据结构和延迟加载,最小内存占用。
133
+ - ⚡ **快速JSON序列化**:比标准库快10倍。
134
+ - 🏗️ **经过实战测试**:Scrapling不仅拥有92%的测试覆盖率和完整的类型提示覆盖率,而且在过去一年中每天被数百名Web Scraper使用。
135
+
136
+ ### 对开发者/Web Scraper友好的体验
137
+ - 🎯 **交互式Web Scraping Shell**:可选的内置IPython Shell,具有Scrapling集成、快捷方式和新工具,可加快Web Scraping脚本开发,例如将curl请求转换为Scrapling请求并在浏览器中查看请求结果。
138
+ - 🚀 **直接从终端使用**:可选地,您可以使用Scrapling抓取URL而无需编写任何代码!
139
+ - 🛠️ **丰富的导航API**:使用父级、兄弟级和子级导航方法进行高级DOM遍历。
140
+ - 🧬 **增强的文本处理**:内置正则表达式、清理方法和优化的字符串操作。
141
+ - 📝 **自动选择器生成**:为任何元素生成强大的CSS/XPath选择器。
142
+ - 🔌 **熟悉的API**:类似于Scrapy/BeautifulSoup,使用与Scrapy/Parsel相同的伪元素。
143
+ - 📘 **完整的类型覆盖**:完整的类型提示,出色的IDE支持和代码补全。整个代码库在每次更改时都会自动使用**PyRight**和**MyPy**扫描。
144
+ - 🔋 **现成的Docker镜像**:每次发布时,包含所有浏览器的Docker镜像会自动构建和推送。
145
+
146
+ ## 入门
147
+
148
+ 让我们快速展示Scrapling的功能,无需深入了解。
149
+
150
+ ### 基本用法
151
+ 支持Session的HTTP请求
152
+ ```python
153
+ from scrapling.fetchers import Fetcher, FetcherSession
154
+
155
+ with FetcherSession(impersonate='chrome') as session: # 使用Chrome的最新版本TLS fingerprint
156
+ page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
157
+ quotes = page.css('.quote .text::text').getall()
158
+
159
+ # 或使用一次性请求
160
+ page = Fetcher.get('https://quotes.toscrape.com/')
161
+ quotes = page.css('.quote .text::text').getall()
162
+ ```
163
+ 高级隐秘模式
164
+ ```python
165
+ from scrapling.fetchers import StealthyFetcher, StealthySession
166
+
167
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # 保持浏览器打开直到完成
168
+ page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
169
+ data = page.css('#padded_content a').getall()
170
+
171
+ # 或使用一次性请求样式,为此请求打开浏览器,完成后关闭
172
+ page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
173
+ data = page.css('#padded_content a').getall()
174
+ ```
175
+ 完整的浏览器自动化
176
+ ```python
177
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
178
+
179
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # 保持浏览器打开直到完成
180
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
181
+ data = page.xpath('//span[@class="text"]/text()').getall() # 如果您偏好XPath选择器
182
+
183
+ # 或使用一次性请求样式,为此请求打开浏览器,完成后关闭
184
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
185
+ data = page.css('.quote .text::text').getall()
186
+ ```
187
+
188
+ ### Spider
189
+ 构建具有并发请求、多种Session类型和暂停/恢复功能的完整爬虫:
190
+ ```python
191
+ from scrapling.spiders import Spider, Request, Response
192
+
193
+ class QuotesSpider(Spider):
194
+ name = "quotes"
195
+ start_urls = ["https://quotes.toscrape.com/"]
196
+ concurrent_requests = 10
197
+
198
+ async def parse(self, response: Response):
199
+ for quote in response.css('.quote'):
200
+ yield {
201
+ "text": quote.css('.text::text').get(),
202
+ "author": quote.css('.author::text').get(),
203
+ }
204
+
205
+ next_page = response.css('.next a')
206
+ if next_page:
207
+ yield response.follow(next_page[0].attrib['href'])
208
+
209
+ result = QuotesSpider().start()
210
+ print(f"抓取了 {len(result.items)} 条引用")
211
+ result.items.to_json("quotes.json")
212
+ ```
213
+ 在单个Spider中使用多种Session类型:
214
+ ```python
215
+ from scrapling.spiders import Spider, Request, Response
216
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
217
+
218
+ class MultiSessionSpider(Spider):
219
+ name = "multi"
220
+ start_urls = ["https://example.com/"]
221
+
222
+ def configure_sessions(self, manager):
223
+ manager.add("fast", FetcherSession(impersonate="chrome"))
224
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
225
+
226
+ async def parse(self, response: Response):
227
+ for link in response.css('a::attr(href)').getall():
228
+ # 将受保护的页面路由到隐秘Session
229
+ if "protected" in link:
230
+ yield Request(link, sid="stealth")
231
+ else:
232
+ yield Request(link, sid="fast", callback=self.parse) # 显式callback
233
+ ```
234
+ 通过如下方式运行Spider来暂停和恢复长时间爬取,使用Checkpoint:
235
+ ```python
236
+ QuotesSpider(crawldir="./crawl_data").start()
237
+ ```
238
+ 按Ctrl+C优雅暂停——进度会自动保存。之后,当您再次启动Spider时,传递相同的`crawldir`,它将从上次停止的地方继续。
239
+
240
+ ### 高级解析与导航
241
+ ```python
242
+ from scrapling.fetchers import Fetcher
243
+
244
+ # 丰富的元素选择和导航
245
+ page = Fetcher.get('https://quotes.toscrape.com/')
246
+
247
+ # 使用多种选择方法获取引用
248
+ quotes = page.css('.quote') # CSS选择器
249
+ quotes = page.xpath('//div[@class="quote"]') # XPath
250
+ quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup风格
251
+ # 等同于
252
+ quotes = page.find_all('div', class_='quote')
253
+ quotes = page.find_all(['div'], class_='quote')
254
+ quotes = page.find_all(class_='quote') # 等等...
255
+ # 按文本内容查找元素
256
+ quotes = page.find_by_text('quote', tag='div')
257
+
258
+ # 高级导航
259
+ quote_text = page.css('.quote')[0].css('.text::text').get()
260
+ quote_text = page.css('.quote').css('.text::text').getall() # 链式选择器
261
+ first_quote = page.css('.quote')[0]
262
+ author = first_quote.next_sibling.css('.author::text')
263
+ parent_container = first_quote.parent
264
+
265
+ # 元素关系和相似性
266
+ similar_elements = first_quote.find_similar()
267
+ below_elements = first_quote.below_elements()
268
+ ```
269
+ 如果您不想获取网站,可以直接使用解析器,如下所示:
270
+ ```python
271
+ from scrapling.parser import Selector
272
+
273
+ page = Selector("<html>...</html>")
274
+ ```
275
+ 用法完全相同!
276
+
277
+ ### Async Session管理示例
278
+ ```python
279
+ import asyncio
280
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
281
+
282
+ async with FetcherSession(http3=True) as session: # `FetcherSession`是上下文感知的,可以在sync/async模式下工作
283
+ page1 = session.get('https://quotes.toscrape.com/')
284
+ page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
285
+
286
+ # Async Session用法
287
+ async with AsyncStealthySession(max_pages=2) as session:
288
+ tasks = []
289
+ urls = ['https://example.com/page1', 'https://example.com/page2']
290
+
291
+ for url in urls:
292
+ task = session.fetch(url)
293
+ tasks.append(task)
294
+
295
+ print(session.get_pool_stats()) # 可选 - 浏览器标签池的状态(忙/空闲/错误)
296
+ results = await asyncio.gather(*tasks)
297
+ print(session.get_pool_stats())
298
+ ```
299
+
300
+ ## CLI和交互式Shell
301
+
302
+ Scrapling包含强大的命令行界面:
303
+
304
+ [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
305
+
306
+ 启动交互式Web Scraping Shell
307
+ ```bash
308
+ scrapling shell
309
+ ```
310
+ 直接将页面提取到文件而无需编程(默认提取`body`标签内的内容)。如果输出文件以`.txt`结尾,则将提取目标的文本内容。如果以`.md`结尾,它将是HTML内容的Markdown表示;如果以`.html`结尾,它将是HTML内容本身。
311
+ ```bash
312
+ scrapling extract get 'https://example.com' content.md
313
+ scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # 所有匹配CSS选择器'#fromSkipToProducts'的元素
314
+ scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
315
+ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
316
+ ```
317
+
318
+ > [!NOTE]
319
+ > 还有许多其他功能,但我们希望保持此页面简洁,包括MCP服务器和交互式Web Scraping Shell。查看完整文档[这里](https://scrapling.readthedocs.io/en/latest/)
320
+
321
+ ## 性能基准
322
+
323
+ Scrapling不仅功能强大——它还速度极快。以下基准测试将Scrapling的解析器与其他流行库的最新版本进行了比较。
324
+
325
+ ### 文本提取速度测试(5000个嵌套元素)
326
+
327
+ | # | 库 | 时间(ms) | vs Scrapling |
328
+ |---|:-----------------:|:---------:|:------------:|
329
+ | 1 | Scrapling | 2.02 | 1.0x |
330
+ | 2 | Parsel/Scrapy | 2.04 | 1.01 |
331
+ | 3 | Raw Lxml | 2.54 | 1.257 |
332
+ | 4 | PyQuery | 24.17 | ~12x |
333
+ | 5 | Selectolax | 82.63 | ~41x |
334
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
335
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
336
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
337
+
338
+
339
+ ### 元素相似性和文本搜索性能
340
+
341
+ Scrapling的自适应元素查找功能明显优于替代方案:
342
+
343
+ | 库 | 时间(ms) | vs Scrapling |
344
+ |-------------|:---------:|:------------:|
345
+ | Scrapling | 2.39 | 1.0x |
346
+ | AutoScraper | 12.45 | 5.209x |
347
+
348
+
349
+ > 所有基准测试代表100+次运行的平均值。请参阅[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)了解方法。
350
+
351
+ ## 安装
352
+
353
+ Scrapling需要Python 3.10或更高版本:
354
+
355
+ ```bash
356
+ pip install scrapling
357
+ ```
358
+
359
+ 此安装仅包括解析器引擎及其依赖项,没有任何Fetcher或命令行依赖项。
360
+
361
+ ### 可选依赖项
362
+
363
+ 1. 如果您要使用以下任何额外功能、Fetcher或它们的类,您将需要安装Fetcher的依赖项和它们的浏览器依赖项,如下所示:
364
+ ```bash
365
+ pip install "scrapling[fetchers]"
366
+
367
+ scrapling install # normal install
368
+ scrapling install --force # force reinstall
369
+ ```
370
+
371
+ 这会下载所有浏览器,以及它们的系统依赖项和fingerprint操作依赖项。
372
+
373
+ 或者你可以从代码中安装,而不是运行命令:
374
+ ```python
375
+ from scrapling.cli import install
376
+
377
+ install([], standalone_mode=False) # normal install
378
+ install(["--force"], standalone_mode=False) # force reinstall
379
+ ```
380
+
381
+ 2. 额外功能:
382
+ - 安装MCP服务器功能:
383
+ ```bash
384
+ pip install "scrapling[ai]"
385
+ ```
386
+ - 安装Shell功能(Web Scraping Shell和`extract`命令):
387
+ ```bash
388
+ pip install "scrapling[shell]"
389
+ ```
390
+ - 安装所有内容:
391
+ ```bash
392
+ pip install "scrapling[all]"
393
+ ```
394
+ 请记住,在安装任何这些额外功能后(如果您还没有安装),您需要使用`scrapling install`安装浏览器依赖项
395
+
396
+ ### Docker
397
+ 您还可以使用以下命令从DockerHub安装包含所有额外功能和浏览器的Docker镜像:
398
+ ```bash
399
+ docker pull pyd4vinci/scrapling
400
+ ```
401
+ 或从GitHub注册表下载:
402
+ ```bash
403
+ docker pull ghcr.io/d4vinci/scrapling:latest
404
+ ```
405
+ 此镜像使用GitHub Actions和仓库主分支自动构建和推送。
406
+
407
+ ## 贡献
408
+
409
+ 我们欢迎贡献!在开始之前,请阅读我们的[贡献指南](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md)。
410
+
411
+ ## 免责声明
412
+
413
+ > [!CAUTION]
414
+ > 此库仅用于教育和研究目的。使用此库即表示您同意遵守本地和国际数据抓取和隐私法律。作者和贡献者对本软件的任何滥用不承担责任。始终尊重网站的服务条款和robots.txt文件。
415
+
416
+ ## 许可证
417
+
418
+ 本作品根据BSD-3-Clause许可证授权。
419
+
420
+ ## 致谢
421
+
422
+ 此项目包含改编自以下内容的代码:
423
+ - Parsel(BSD许可证)——用于[translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)子模块
424
+
425
+ ---
426
+ <div align="center"><small>由Karim Shoair用❤️设计和制作。</small></div><br>
docs/README_DE.md ADDED
@@ -0,0 +1,426 @@
1
+ <!-- mcp-name: io.github.D4Vinci/Scrapling -->
2
+
3
+ <h1 align="center">
4
+ <a href="https://scrapling.readthedocs.io">
5
+ <picture>
6
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
7
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
8
+ </picture>
9
+ </a>
10
+ <br>
11
+ <small>Effortless Web Scraping for the Modern Web</small>
12
+ </h1>
13
+
14
+ <p align="center">
15
+ <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
16
+ <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
17
+ <a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
18
+ <img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
19
+ <a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
20
+ <img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
21
+ <br/>
22
+ <a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
23
+ <img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
24
+ </a>
25
+ <a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
26
+ <img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
27
+ </a>
28
+ <br/>
29
+ <a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
30
+ <img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
31
+ </p>
32
+
33
+ <p align="center">
34
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Auswahlmethoden</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Einen Fetcher wählen</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>Spiders</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>Proxy-Rotation</strong></a>
41
+ &middot;
42
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
43
+ &middot;
44
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCP-Modus</strong></a>
45
+ </p>
46
+
47
+ Scrapling ist ein adaptives Web-Scraping-Framework, das alles abdeckt -- von einer einzelnen Anfrage bis hin zu einem umfassenden Crawl.
48
+
49
+ Sein Parser lernt aus Website-Änderungen und lokalisiert Ihre Elemente automatisch neu, wenn sich Seiten aktualisieren. Seine Fetcher umgehen Anti-Bot-Systeme wie Cloudflare Turnstile direkt ab Werk. Und sein Spider-Framework ermöglicht es Ihnen, auf parallele Multi-Session-Crawls mit Pause & Resume und automatischer Proxy-Rotation hochzuskalieren -- alles in wenigen Zeilen Python. Eine Bibliothek, keine Kompromisse.
50
+
51
+ Blitzschnelle Crawls mit Echtzeit-Statistiken und Streaming. Von Web Scrapern für Web Scraper und normale Benutzer entwickelt, ist für jeden etwas dabei.
52
+
53
+ ```python
54
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
55
+ StealthyFetcher.adaptive = True
56
+ p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Website unbemerkt abrufen!
57
+ products = p.css('.product', auto_save=True) # Daten scrapen, die Website-Designänderungen überleben!
58
+ products = p.css('.product', adaptive=True) # Später, wenn sich die Website-Struktur ändert, `adaptive=True` übergeben, um sie zu finden!
59
+ ```
60
+ Oder auf vollständige Crawls hochskalieren
61
+ ```python
62
+ from scrapling.spiders import Spider, Response
63
+
64
+ class MySpider(Spider):
65
+ name = "demo"
66
+ start_urls = ["https://example.com/"]
67
+
68
+ async def parse(self, response: Response):
69
+ for item in response.css('.product'):
70
+ yield {"title": item.css('h2::text').get()}
71
+
72
+ MySpider().start()
73
+ ```
74
+
75
+
76
+ # Platin-Sponsoren
77
+
78
+ <i><sub>Möchten Sie das erste Unternehmen sein, das hier erscheint? Klicken Sie [hier](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>
79
+ # Sponsoren
80
+
81
+ <!-- sponsors -->
82
+
83
+ <a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
84
+ <a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
85
+ <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
86
+ <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
87
+ <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
88
+ <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
89
+ <a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
90
+ <a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
91
+ <a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
92
+
93
+
94
+ <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
95
+ <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
96
+ <a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
97
+
98
+ <!-- /sponsors -->
99
+
100
+ <i><sub>Möchten Sie Ihre Anzeige hier zeigen? Klicken Sie [hier](https://github.com/sponsors/D4Vinci) und wählen Sie die Stufe, die zu Ihnen passt!</sub></i>
101
+
102
+ ---
103
+
104
+ ## Hauptmerkmale
105
+
106
+ ### Spiders -- Ein vollständiges Crawling-Framework
107
+ - 🕷️ **Scrapy-ähnliche Spider-API**: Definieren Sie Spiders mit `start_urls`, async `parse` Callbacks und `Request`/`Response`-Objekten.
108
+ - ⚡ **Paralleles Crawling**: Konfigurierbare Parallelitätslimits, domainbezogenes Throttling und Download-Verzögerungen.
109
+ - 🔄 **Multi-Session-Unterstützung**: Einheitliche Schnittstelle für HTTP-Anfragen und heimliche Headless-Browser in einem einzigen Spider -- leiten Sie Anfragen per ID an verschiedene Sessions weiter.
110
+ - 💾 **Pause & Resume**: Checkpoint-basierte Crawl-Persistenz. Drücken Sie Strg+C für ein kontrolliertes Herunterfahren; starten Sie neu, um dort fortzufahren, wo Sie aufgehört haben.
111
+ - 📡 **Streaming-Modus**: Gescrapte Elemente in Echtzeit streamen über `async for item in spider.stream()` mit Echtzeit-Statistiken -- ideal für UI, Pipelines und lang laufende Crawls (siehe die Skizze nach dieser Liste).
112
+ - 🛡️ **Erkennung blockierter Anfragen**: Automatische Erkennung und Wiederholung blockierter Anfragen mit anpassbarer Logik.
113
+ - 📦 **Integrierter Export**: Ergebnisse über Hooks und Ihre eigene Pipeline oder den integrierten JSON/JSONL-Export mit `result.items.to_json()` / `result.items.to_jsonl()` exportieren.
114
+
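+ Eine minimale Skizze des Streaming-Modus: Aus der Liste oben stammt nur der Aufruf `async for item in spider.stream()`; das Starten über `asyncio.run` und die Ausgabe per `print` sind reine Annahmen zur Veranschaulichung:
+ ```python
+ import asyncio
+ from scrapling.spiders import Spider, Response
+
+ class StreamingQuotesSpider(Spider):
+     name = "quotes_stream"
+     start_urls = ["https://quotes.toscrape.com/"]
+
+     async def parse(self, response: Response):
+         for quote in response.css('.quote'):
+             yield {"text": quote.css('.text::text').get()}
+
+ async def main():
+     spider = StreamingQuotesSpider()
+     # Elemente verarbeiten, sobald sie gescrapt werden, statt auf das Ende des Crawls zu warten
+     async for item in spider.stream():
+         print(item)
+
+ asyncio.run(main())
+ ```
+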
115
+ ### Erweitertes Website-Abrufen mit Session-Unterstützung
116
+ - **HTTP-Anfragen**: Schnelle und heimliche HTTP-Anfragen mit der `Fetcher`-Klasse. Kann Browser-TLS-Fingerprints und Header imitieren und HTTP/3 verwenden.
117
+ - **Dynamisches Laden**: Dynamische Websites mit vollständiger Browser-Automatisierung über die `DynamicFetcher`-Klasse abrufen, die Playwrights Chromium und Google Chrome unterstützt.
118
+ - **Anti-Bot-Umgehung**: Erweiterte Stealth-Fähigkeiten mit `StealthyFetcher` und Fingerprint-Spoofing. Kann alle Arten von Cloudflares Turnstile/Interstitial einfach mit Automatisierung umgehen.
119
+ - **Session-Verwaltung**: Persistente Session-Unterstützung mit den Klassen `FetcherSession`, `StealthySession` und `DynamicSession` für Cookie- und Zustandsverwaltung über Anfragen hinweg.
120
+ - **Proxy-Rotation**: Integrierter `ProxyRotator` mit zyklischen oder benutzerdefinierten Rotationsstrategien über alle Session-Typen hinweg, plus Proxy-Überschreibungen pro Anfrage.
121
+ - **Domain-Blockierung**: Anfragen an bestimmte Domains (und deren Subdomains) in browserbasierten Fetchern blockieren.
122
+ - **Async-Unterstützung**: Vollständige async-Unterstützung über alle Fetcher und dedizierte async Session-Klassen hinweg.
123
+
124
+ ### Adaptives Scraping & KI-Integration
125
+ - 🔄 **Intelligente Element-Verfolgung**: Elemente nach Website-Änderungen mit intelligenten Ähnlichkeitsalgorithmen neu lokalisieren.
126
+ - 🎯 **Intelligente flexible Auswahl**: CSS-Selektoren, XPath-Selektoren, filterbasierte Suche, Textsuche, Regex-Suche und mehr.
127
+ - 🔍 **Ähnliche Elemente finden**: Elemente, die gefundenen Elementen ähnlich sind, automatisch lokalisieren.
128
+ - 🤖 **MCP-Server für die Verwendung mit KI**: Integrierter MCP-Server für KI-unterstütztes Web Scraping und Datenextraktion. Der MCP-Server verfügt über leistungsstarke, benutzerdefinierte Funktionen, die Scrapling nutzen, um gezielten Inhalt zu extrahieren, bevor er an die KI (Claude/Cursor/etc.) übergeben wird, wodurch Vorgänge beschleunigt und Kosten durch Minimierung der Token-Nutzung gesenkt werden. ([Demo-Video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
129
+
130
+ ### Hochleistungs- und praxiserprobte Architektur
131
+ - 🚀 **Blitzschnell**: Optimierte Leistung, die die meisten Python-Scraping-Bibliotheken übertrifft.
132
+ - 🔋 **Speichereffizient**: Optimierte Datenstrukturen und Lazy Loading für einen minimalen Speicher-Footprint.
133
+ - ⚡ **Schnelle JSON-Serialisierung**: 10x schneller als die Standardbibliothek.
134
+ - 🏗️ **Praxiserprobt**: Scrapling hat nicht nur eine Testabdeckung von 92% und eine vollständige Type-Hints-Abdeckung, sondern wird seit dem letzten Jahr täglich von Hunderten von Web Scrapern verwendet.
135
+
136
+ ### Entwickler-/Web-Scraper-freundliche Erfahrung
137
+ - 🎯 **Interaktive Web-Scraping-Shell**: Optionale integrierte IPython-Shell mit Scrapling-Integration, Shortcuts und neuen Tools zur Beschleunigung der Web-Scraping-Skriptentwicklung, wie das Konvertieren von Curl-Anfragen in Scrapling-Anfragen und das Anzeigen von Anfrageergebnissen in Ihrem Browser.
138
+ - 🚀 **Direkt vom Terminal aus verwenden**: Optional können Sie Scrapling verwenden, um eine URL zu scrapen, ohne eine einzige Codezeile zu schreiben!
139
+ - 🛠️ **Umfangreiche Navigations-API**: Erweiterte DOM-Traversierung mit Eltern-, Geschwister- und Kind-Navigationsmethoden.
140
+ - 🧬 **Verbesserte Textverarbeitung**: Integrierte Regex, Bereinigungsmethoden und optimierte String-Operationen.
141
+ - 📝 **Automatische Selektorgenerierung**: Robuste CSS/XPath-Selektoren für jedes Element generieren.
142
+ - 🔌 **Vertraute API**: Ähnlich wie Scrapy/BeautifulSoup mit denselben Pseudo-Elementen, die in Scrapy/Parsel verwendet werden.
143
+ - 📘 **Vollständige Typabdeckung**: Vollständige Type Hints für hervorragende IDE-Unterstützung und Code-Vervollständigung. Die gesamte Codebasis wird bei jeder Änderung automatisch mit **PyRight** und **MyPy** gescannt.
144
+ - 🔋 **Fertiges Docker-Image**: Mit jeder Veröffentlichung wird automatisch ein Docker-Image erstellt und gepusht, das alle Browser enthält.
145
+
146
+ ## Erste Schritte
147
+
148
+ Hier ein kurzer Überblick über das, was Scrapling kann, ohne zu sehr ins Detail zu gehen.
149
+
150
+ ### Grundlegende Verwendung
151
+ HTTP-Anfragen mit Session-Unterstützung
152
+ ```python
153
+ from scrapling.fetchers import Fetcher, FetcherSession
154
+
155
+ with FetcherSession(impersonate='chrome') as session: # Neueste Version von Chromes TLS-Fingerprint verwenden
156
+ page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
157
+ quotes = page.css('.quote .text::text').getall()
158
+
159
+ # Oder einmalige Anfragen verwenden
160
+ page = Fetcher.get('https://quotes.toscrape.com/')
161
+ quotes = page.css('.quote .text::text').getall()
162
+ ```
163
+ Erweiterter Stealth-Modus
164
+ ```python
165
+ from scrapling.fetchers import StealthyFetcher, StealthySession
166
+
167
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # Browser offen halten, bis Sie fertig sind
168
+ page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
169
+ data = page.css('#padded_content a').getall()
170
+
171
+ # Oder einmaligen Anfragenstil verwenden: öffnet den Browser für diese Anfrage und schließt ihn nach Abschluss
172
+ page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
173
+ data = page.css('#padded_content a').getall()
174
+ ```
175
+ Vollständige Browser-Automatisierung
176
+ ```python
177
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
178
+
179
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Browser offen halten, bis Sie fertig sind
180
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
181
+ data = page.xpath('//span[@class="text"]/text()').getall() # XPath-Selektor, falls bevorzugt
182
+
183
+ # Oder einmaligen Anfragenstil verwenden: öffnet den Browser für diese Anfrage und schließt ihn nach Abschluss
184
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
185
+ data = page.css('.quote .text::text').getall()
186
+ ```
187
+
188
+ ### Spiders
189
+ Vollständige Crawler mit parallelen Anfragen, mehreren Session-Typen und Pause & Resume erstellen:
190
+ ```python
191
+ from scrapling.spiders import Spider, Request, Response
192
+
193
+ class QuotesSpider(Spider):
194
+ name = "quotes"
195
+ start_urls = ["https://quotes.toscrape.com/"]
196
+ concurrent_requests = 10
197
+
198
+ async def parse(self, response: Response):
199
+ for quote in response.css('.quote'):
200
+ yield {
201
+ "text": quote.css('.text::text').get(),
202
+ "author": quote.css('.author::text').get(),
203
+ }
204
+
205
+ next_page = response.css('.next a')
206
+ if next_page:
207
+ yield response.follow(next_page[0].attrib['href'])
208
+
209
+ result = QuotesSpider().start()
210
+ print(f"{len(result.items)} Zitate gescrapt")
211
+ result.items.to_json("quotes.json")
212
+ ```
213
+ Mehrere Session-Typen in einem einzigen Spider verwenden:
214
+ ```python
215
+ from scrapling.spiders import Spider, Request, Response
216
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
217
+
218
+ class MultiSessionSpider(Spider):
219
+ name = "multi"
220
+ start_urls = ["https://example.com/"]
221
+
222
+ def configure_sessions(self, manager):
223
+ manager.add("fast", FetcherSession(impersonate="chrome"))
224
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
225
+
226
+ async def parse(self, response: Response):
227
+ for link in response.css('a::attr(href)').getall():
228
+ # Geschützte Seiten über die Stealth-Session leiten
229
+ if "protected" in link:
230
+ yield Request(link, sid="stealth")
231
+ else:
232
+ yield Request(link, sid="fast", callback=self.parse) # Expliziter Callback
233
+ ```
234
+ Lange Crawls mit Checkpoints pausieren und fortsetzen, indem Sie den Spider so starten:
235
+ ```python
236
+ QuotesSpider(crawldir="./crawl_data").start()
237
+ ```
238
+ Drücken Sie Strg+C, um kontrolliert zu pausieren -- der Fortschritt wird automatisch gespeichert. Wenn Sie den Spider später erneut starten, übergeben Sie dasselbe `crawldir`, und er setzt dort fort, wo er aufgehört hat.
239
+
240
+ ### Erweitertes Parsing & Navigation
241
+ ```python
242
+ from scrapling.fetchers import Fetcher
243
+
244
+ # Umfangreiche Elementauswahl und Navigation
245
+ page = Fetcher.get('https://quotes.toscrape.com/')
246
+
247
+ # Zitate mit verschiedenen Auswahlmethoden abrufen
248
+ quotes = page.css('.quote') # CSS-Selektor
249
+ quotes = page.xpath('//div[@class="quote"]') # XPath
250
+ quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup-Stil
251
+ # Gleich wie
252
+ quotes = page.find_all('div', class_='quote')
253
+ quotes = page.find_all(['div'], class_='quote')
254
+ quotes = page.find_all(class_='quote') # und so weiter...
255
+ # Element nach Textinhalt finden
256
+ quotes = page.find_by_text('quote', tag='div')
257
+
258
+ # Erweiterte Navigation
259
+ quote_text = page.css('.quote')[0].css('.text::text').get()
260
+ quote_text = page.css('.quote').css('.text::text').getall() # Verkettete Selektoren
261
+ first_quote = page.css('.quote')[0]
262
+ author = first_quote.next_sibling.css('.author::text')
263
+ parent_container = first_quote.parent
264
+
265
+ # Elementbeziehungen und Ähnlichkeit
266
+ similar_elements = first_quote.find_similar()
267
+ below_elements = first_quote.below_elements()
268
+ ```
269
+ Sie können den Parser direkt verwenden, wenn Sie keine Websites abrufen möchten, wie unten gezeigt:
270
+ ```python
271
+ from scrapling.parser import Selector
272
+
273
+ page = Selector("<html>...</html>")
274
+ ```
275
+ Und es funktioniert genau auf die gleiche Weise!
276
+
277
+ ### Beispiele für async Session-Verwaltung
278
+ ```python
279
+ import asyncio
280
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
281
+
282
+ async with FetcherSession(http3=True) as session: # `FetcherSession` ist kontextbewusst und kann sowohl in sync- als auch in async-Mustern arbeiten
283
+ page1 = session.get('https://quotes.toscrape.com/')
284
+ page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
285
+
286
+ # Async-Session-Verwendung
287
+ async with AsyncStealthySession(max_pages=2) as session:
288
+ tasks = []
289
+ urls = ['https://example.com/page1', 'https://example.com/page2']
290
+
291
+ for url in urls:
292
+ task = session.fetch(url)
293
+ tasks.append(task)
294
+
295
+ print(session.get_pool_stats()) # Optional - Der Status des Browser-Tab-Pools (beschäftigt/frei/Fehler)
296
+ results = await asyncio.gather(*tasks)
297
+ print(session.get_pool_stats())
298
+ ```
299
+
300
+ ## CLI & Interaktive Shell
301
+
302
+ Scrapling enthält eine leistungsstarke Befehlszeilenschnittstelle:
303
+
304
+ [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
305
+
306
+ Interaktive Web-Scraping-Shell starten
307
+ ```bash
308
+ scrapling shell
309
+ ```
310
+ Seiten direkt ohne Programmierung in eine Datei extrahieren (extrahiert standardmäßig den Inhalt im `body`-Tag). Wenn die Ausgabedatei mit `.txt` endet, wird der Textinhalt des Ziels extrahiert. Wenn sie mit `.md` endet, ist es eine Markdown-Darstellung des HTML-Inhalts; wenn sie mit `.html` endet, ist es der HTML-Inhalt selbst.
311
+ ```bash
312
+ scrapling extract get 'https://example.com' content.md
313
+ scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Alle Elemente, die dem CSS-Selektor '#fromSkipToProducts' entsprechen
314
+ scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
315
+ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
316
+ ```
317
+
318
+ > [!NOTE]
319
+ > Es gibt viele zusätzliche Funktionen, aber wir möchten diese Seite prägnant halten, einschließlich des MCP-Servers und der interaktiven Web-Scraping-Shell. Schauen Sie sich die vollständige Dokumentation [hier](https://scrapling.readthedocs.io/en/latest/) an
320
+
321
+ ## Leistungsbenchmarks
322
+
323
+ Scrapling ist nicht nur leistungsstark -- es ist auch blitzschnell. Die folgenden Benchmarks vergleichen Scraplings Parser mit den neuesten Versionen anderer beliebter Bibliotheken.
324
+
325
+ ### Textextraktions-Geschwindigkeitstest (5000 verschachtelte Elemente)
326
+
327
+ | # | Bibliothek | Zeit (ms) | vs Scrapling |
328
+ |---|:-----------------:|:---------:|:------------:|
329
+ | 1 | Scrapling | 2.02 | 1.0x |
330
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
331
+ | 3 | Raw Lxml | 2.54 | 1.257x |
332
+ | 4 | PyQuery | 24.17 | ~12x |
333
+ | 5 | Selectolax | 82.63 | ~41x |
334
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
335
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
336
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
337
+
338
+
339
+ ### Element-Ähnlichkeit & Textsuche-Leistung
340
+
341
+ Scraplings adaptive Element-Finding-Fähigkeiten übertreffen Alternativen deutlich:
342
+
343
+ | Bibliothek | Zeit (ms) | vs Scrapling |
344
+ |-------------|:---------:|:------------:|
345
+ | Scrapling | 2.39 | 1.0x |
346
+ | AutoScraper | 12.45 | 5.209x |
347
+
348
+
349
+ > Alle Benchmarks stellen Durchschnittswerte von über 100 Durchläufen dar. Siehe [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) für die Methodik.
350
+
351
+ ## Installation
352
+
353
+ Scrapling erfordert Python 3.10 oder höher:
354
+
355
+ ```bash
356
+ pip install scrapling
357
+ ```
358
+
359
+ Diese Installation enthält nur die Parser-Engine und ihre Abhängigkeiten, ohne Fetcher oder Kommandozeilenabhängigkeiten.
360
+
361
+ ### Optionale Abhängigkeiten
362
+
363
+ 1. Wenn Sie eine der folgenden zusätzlichen Funktionen, die Fetcher oder ihre Klassen verwenden möchten, müssen Sie die Abhängigkeiten der Fetcher und ihre Browser-Abhängigkeiten wie folgt installieren:
364
+ ```bash
365
+ pip install "scrapling[fetchers]"
366
+
367
+ scrapling install # normal install
368
+ scrapling install --force # force reinstall
369
+ ```
370
+
371
+ Dies lädt alle Browser zusammen mit ihren Systemabhängigkeiten und Fingerprint-Manipulationsabhängigkeiten herunter.
372
+
373
+ Oder Sie können sie aus dem Code heraus installieren, anstatt einen Befehl auszuführen:
374
+ ```python
375
+ from scrapling.cli import install
376
+
377
+ install([], standalone_mode=False) # normal install
378
+ install(["--force"], standalone_mode=False) # force reinstall
379
+ ```
380
+
381
+ 2. Zusätzliche Funktionen:
382
+ - MCP-Server-Funktion installieren:
383
+ ```bash
384
+ pip install "scrapling[ai]"
385
+ ```
386
+ - Shell-Funktionen installieren (Web-Scraping-Shell und der `extract`-Befehl):
387
+ ```bash
388
+ pip install "scrapling[shell]"
389
+ ```
390
+ - Alles installieren:
391
+ ```bash
392
+ pip install "scrapling[all]"
393
+ ```
394
+ Denken Sie daran, dass Sie nach der Installation eines dieser Extras (falls noch nicht geschehen) die Browser-Abhängigkeiten mit `scrapling install` installieren müssen.
395
+
396
+ ### Docker
397
+ Sie können auch ein Docker-Image mit allen Extras und Browsern mit dem folgenden Befehl von DockerHub installieren:
398
+ ```bash
399
+ docker pull pyd4vinci/scrapling
400
+ ```
401
+ Oder laden Sie es aus der GitHub-Registry herunter:
402
+ ```bash
403
+ docker pull ghcr.io/d4vinci/scrapling:latest
404
+ ```
405
+ Dieses Image wird automatisch mit GitHub Actions und dem Hauptzweig des Repositorys erstellt und gepusht.
406
+
407
+ ## Beitragen
408
+
409
+ Wir freuen uns über Beiträge! Bitte lesen Sie unsere [Beitragsrichtlinien](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md), bevor Sie beginnen.
410
+
411
+ ## Haftungsausschluss
412
+
413
+ > [!CAUTION]
414
+ > Diese Bibliothek wird nur zu Bildungs- und Forschungszwecken bereitgestellt. Durch die Nutzung dieser Bibliothek erklären Sie sich damit einverstanden, lokale und internationale Gesetze zum Daten-Scraping und Datenschutz einzuhalten. Die Autoren und Mitwirkenden sind nicht verantwortlich für Missbrauch dieser Software. Respektieren Sie immer die Nutzungsbedingungen von Websites und robots.txt-Dateien.
415
+
416
+ ## Lizenz
417
+
418
+ Diese Arbeit ist unter der BSD-3-Clause-Lizenz lizenziert.
419
+
420
+ ## Danksagungen
421
+
422
+ Dieses Projekt enthält angepassten Code von:
423
+ - Parsel (BSD-Lizenz) -- Verwendet für das [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)-Submodul
424
+
425
+ ---
426
+ <div align="center"><small>Entworfen und hergestellt mit ❤️ von Karim Shoair.</small></div><br>
docs/README_ES.md ADDED
@@ -0,0 +1,426 @@
1
+ <!-- mcp-name: io.github.D4Vinci/Scrapling -->
2
+
3
+ <h1 align="center">
4
+ <a href="https://scrapling.readthedocs.io">
5
+ <picture>
6
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
7
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
8
+ </picture>
9
+ </a>
10
+ <br>
11
+ <small>Effortless Web Scraping for the Modern Web</small>
12
+ </h1>
13
+
14
+ <p align="center">
15
+ <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
16
+ <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
17
+ <a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
18
+ <img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
19
+ <a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
20
+ <img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
21
+ <br/>
22
+ <a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
23
+ <img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
24
+ </a>
25
+ <a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
26
+ <img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
27
+ </a>
28
+ <br/>
29
+ <a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
30
+ <img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
31
+ </p>
32
+
33
+ <p align="center">
34
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Métodos de selección</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Elegir un fetcher</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>Spiders</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>Rotación de proxy</strong></a>
41
+ &middot;
42
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
43
+ &middot;
44
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>Modo MCP</strong></a>
45
+ </p>
46
+
47
+ Scrapling es un framework de Web Scraping adaptativo que se encarga de todo, desde una sola solicitud hasta un rastreo a gran escala.
48
+
49
+ Su parser aprende de los cambios de los sitios web y relocaliza automáticamente tus elementos cuando las páginas se actualizan. Sus fetchers evaden sistemas anti-bot como Cloudflare Turnstile de forma nativa. Y su framework Spider te permite escalar a rastreos concurrentes con múltiples sesiones, con Pause & Resume y rotación automática de Proxy, todo en unas pocas líneas de Python. Una biblioteca, cero compromisos.
50
+
51
+ Rastreos ultrarrápidos con estadísticas en tiempo real y Streaming. Construido por Web Scrapers para Web Scrapers y usuarios regulares, hay algo para todos.
52
+
53
+ ```python
54
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
55
+ StealthyFetcher.adaptive = True
56
+ p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # ¡Obtén el sitio web bajo el radar!
57
+ products = p.css('.product', auto_save=True) # ¡Extrae datos que sobreviven a cambios de diseño del sitio web!
58
+ products = p.css('.product', adaptive=True) # Más tarde, si la estructura del sitio web cambia, ¡pasa `adaptive=True` para encontrarlos!
59
+ ```
60
+ O escala a rastreos completos
61
+ ```python
62
+ from scrapling.spiders import Spider, Response
63
+
64
+ class MySpider(Spider):
65
+ name = "demo"
66
+ start_urls = ["https://example.com/"]
67
+
68
+ async def parse(self, response: Response):
69
+ for item in response.css('.product'):
70
+ yield {"title": item.css('h2::text').get()}
71
+
72
+ MySpider().start()
73
+ ```
74
+
75
+
76
+ # Patrocinadores Platino
77
+
78
+ <i><sub>¿Quieres ser la primera empresa en aparecer aquí? Haz clic [aquí](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>
79
+ # Patrocinadores
80
+
81
+ <!-- sponsors -->
82
+
83
+ <a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
84
+ <a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
85
+ <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
86
+ <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
87
+ <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
88
+ <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
89
+ <a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
90
+ <a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
91
+ <a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
92
+
93
+
94
+ <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
95
+ <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
96
+ <a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
97
+
98
+ <!-- /sponsors -->
99
+
100
+ <i><sub>¿Quieres mostrar tu anuncio aquí? ¡Haz clic [aquí](https://github.com/sponsors/D4Vinci) y elige el nivel que te convenga!</sub></i>
101
+
102
+ ---
103
+
104
+ ## Características Principales
105
+
106
+ ### Spiders — Un Framework Completo de Rastreo
107
+ - 🕷️ **API de Spider al estilo Scrapy**: Define spiders con `start_urls`, callbacks async `parse`, y objetos `Request`/`Response`.
108
+ - ⚡ **Rastreo Concurrente**: Límites de concurrencia configurables, limitación por dominio y retrasos de descarga.
109
+ - 🔄 **Soporte Multi-Session**: Interfaz unificada para solicitudes HTTP y navegadores headless sigilosos en un solo Spider — enruta solicitudes a diferentes sesiones por ID.
110
+ - 💾 **Pause & Resume**: Persistencia de rastreo basada en Checkpoint. Presiona Ctrl+C para un cierre ordenado; reinicia para continuar desde donde lo dejaste.
111
+ - 📡 **Modo Streaming**: Transmite elementos extraídos a medida que llegan con `async for item in spider.stream()` con estadísticas en tiempo real — ideal para UI, pipelines y rastreos de larga duración.
112
+ - 🛡️ **Detección de Solicitudes Bloqueadas**: Detección automática y reintento de solicitudes bloqueadas con lógica personalizable.
113
+ - 📦 **Exportación Integrada**: Exporta resultados a través de hooks y tu propio pipeline o el JSON/JSONL integrado con `result.items.to_json()` / `result.items.to_jsonl()` respectivamente.
114
+
115
+ ### Obtención Avanzada de Sitios Web con Soporte de Session
116
+ - **Solicitudes HTTP**: Solicitudes HTTP rápidas y sigilosas con la clase `Fetcher`. Puede imitar el fingerprint TLS de los navegadores, encabezados y usar HTTP/3.
117
+ - **Carga Dinámica**: Obtén sitios web dinámicos con automatización completa del navegador a través de la clase `DynamicFetcher` compatible con Chromium de Playwright y Google Chrome.
118
+ - **Evasión Anti-bot**: Capacidades de sigilo avanzadas con `StealthyFetcher` y falsificación de fingerprint. Puede evadir fácilmente todos los tipos de Turnstile/Interstitial de Cloudflare con automatización.
119
+ - **Gestión de Session**: Soporte de sesión persistente con las clases `FetcherSession`, `StealthySession` y `DynamicSession` para la gestión de cookies y estado entre solicitudes.
120
+ - **Rotación de Proxy**: `ProxyRotator` integrado con estrategias de rotación cíclica o personalizadas en todos los tipos de sesión, además de sobrescrituras de Proxy por solicitud.
121
+ - **Bloqueo de Dominios**: Bloquea solicitudes a dominios específicos (y sus subdominios) en fetchers basados en navegador.
122
+ - **Soporte Async**: Soporte async completo en todos los fetchers y clases de sesión async dedicadas.
123
+
124
+ ### Scraping Adaptativo e Integración con IA
125
+ - 🔄 **Seguimiento Inteligente de Elementos**: Relocaliza elementos después de cambios en el sitio web usando algoritmos inteligentes de similitud.
126
+ - 🎯 **Selección Flexible Inteligente**: Selectores CSS, selectores XPath, búsqueda basada en filtros, búsqueda de texto, búsqueda regex y más.
127
+ - 🔍 **Encontrar Elementos Similares**: Localiza automáticamente elementos similares a los elementos encontrados.
128
+ - 🤖 **Servidor MCP para usar con IA**: Servidor MCP integrado para Web Scraping asistido por IA y extracción de datos. El servidor MCP presenta capacidades potentes y personalizadas que aprovechan Scrapling para extraer contenido específico antes de pasarlo a la IA (Claude/Cursor/etc), acelerando así las operaciones y reduciendo costos al minimizar el uso de tokens. ([video demo](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
129
+
130
+ ### Arquitectura de Alto Rendimiento y Probada en Batalla
131
+ - 🚀 **Ultrarrápido**: Rendimiento optimizado que supera a la mayoría de las bibliotecas de Web Scraping de Python.
132
+ - 🔋 **Eficiente en Memoria**: Estructuras de datos optimizadas y carga diferida para una huella de memoria mínima.
133
+ - ⚡ **Serialización JSON Rápida**: 10 veces más rápido que la biblioteca estándar.
134
+ - 🏗️ **Probado en batalla**: Scrapling no solo tiene una cobertura de pruebas del 92% y cobertura completa de type hints, sino que ha sido utilizado diariamente por cientos de Web Scrapers durante el último año.
135
+
136
+ ### Experiencia Amigable para Desarrolladores/Web Scrapers
137
+ - 🎯 **Shell Interactivo de Web Scraping**: Shell IPython integrado opcional con integración de Scrapling, atajos y nuevas herramientas para acelerar el desarrollo de scripts de Web Scraping, como convertir solicitudes curl a solicitudes Scrapling y ver resultados de solicitudes en tu navegador.
138
+ - 🚀 **Úsalo directamente desde la Terminal**: Opcionalmente, ¡puedes usar Scrapling para hacer scraping de una URL sin escribir ni una sola línea de código!
139
+ - 🛠️ **API de Navegación Rica**: Recorrido avanzado del DOM con métodos de navegación de padres, hermanos e hijos.
140
+ - 🧬 **Procesamiento de Texto Mejorado**: Métodos integrados de regex, limpieza y operaciones de cadena optimizadas.
141
+ - 📝 **Generación Automática de Selectores**: Genera selectores CSS/XPath robustos para cualquier elemento.
142
+ - 🔌 **API Familiar**: Similar a Scrapy/BeautifulSoup con los mismos pseudo-elementos usados en Scrapy/Parsel.
143
+ - 📘 **Cobertura Completa de Tipos**: Type hints completos para excelente soporte de IDE y autocompletado de código. Todo el código fuente se escanea automáticamente con **PyRight** y **MyPy** en cada cambio.
144
+ - 🔋 **Imagen Docker Lista**: Con cada lanzamiento, se construye y publica automáticamente una imagen Docker que contiene todos los navegadores.
145
+
146
+ ## Primeros Pasos
147
+
148
+ Aquí tienes un vistazo rápido de lo que Scrapling puede hacer sin entrar en profundidad.
149
+
150
+ ### Uso Básico
151
+ Solicitudes HTTP con soporte de sesión
152
+ ```python
153
+ from scrapling.fetchers import Fetcher, FetcherSession
154
+
155
+ with FetcherSession(impersonate='chrome') as session: # Usa la última versión del fingerprint TLS de Chrome
156
+ page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
157
+ quotes = page.css('.quote .text::text').getall()
158
+
159
+ # O usa solicitudes de una sola vez
160
+ page = Fetcher.get('https://quotes.toscrape.com/')
161
+ quotes = page.css('.quote .text::text').getall()
162
+ ```
163
+ Modo sigiloso avanzado
164
+ ```python
165
+ from scrapling.fetchers import StealthyFetcher, StealthySession
166
+
167
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # Mantén el navegador abierto hasta que termines
168
+ page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
169
+ data = page.css('#padded_content a').getall()
170
+
171
+ # O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar
172
+ page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
173
+ data = page.css('#padded_content a').getall()
174
+ ```
175
+ Automatización completa del navegador
176
+ ```python
177
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
178
+
179
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Mantén el navegador abierto hasta que termines
180
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
181
+ data = page.xpath('//span[@class="text"]/text()').getall() # Selector XPath si lo prefieres
182
+
183
+ # O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar
184
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
185
+ data = page.css('.quote .text::text').getall()
186
+ ```
187
+
188
+ ### Spiders
189
+ Construye rastreadores completos con solicitudes concurrentes, múltiples tipos de sesión y Pause & Resume:
190
+ ```python
191
+ from scrapling.spiders import Spider, Request, Response
192
+
193
+ class QuotesSpider(Spider):
194
+ name = "quotes"
195
+ start_urls = ["https://quotes.toscrape.com/"]
196
+ concurrent_requests = 10
197
+
198
+ async def parse(self, response: Response):
199
+ for quote in response.css('.quote'):
200
+ yield {
201
+ "text": quote.css('.text::text').get(),
202
+ "author": quote.css('.author::text').get(),
203
+ }
204
+
205
+ next_page = response.css('.next a')
206
+ if next_page:
207
+ yield response.follow(next_page[0].attrib['href'])
208
+
209
+ result = QuotesSpider().start()
210
+ print(f"Se extrajeron {len(result.items)} citas")
211
+ result.items.to_json("quotes.json")
212
+ ```
213
+ Usa múltiples tipos de sesión en un solo Spider:
214
+ ```python
215
+ from scrapling.spiders import Spider, Request, Response
216
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
217
+
218
+ class MultiSessionSpider(Spider):
219
+ name = "multi"
220
+ start_urls = ["https://example.com/"]
221
+
222
+ def configure_sessions(self, manager):
223
+ manager.add("fast", FetcherSession(impersonate="chrome"))
224
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
225
+
226
+ async def parse(self, response: Response):
227
+ for link in response.css('a::attr(href)').getall():
228
+ # Enruta las páginas protegidas a través de la sesión sigilosa
229
+ if "protected" in link:
230
+ yield Request(link, sid="stealth")
231
+ else:
232
+ yield Request(link, sid="fast", callback=self.parse) # callback explícito
233
+ ```
234
+ Pausa y reanuda rastreos largos con checkpoints ejecutando el Spider así:
235
+ ```python
236
+ QuotesSpider(crawldir="./crawl_data").start()
237
+ ```
238
+ Presiona Ctrl+C para pausar de forma ordenada — el progreso se guarda automáticamente. Después, cuando inicies el Spider de nuevo, pasa el mismo `crawldir`, y continuará desde donde se detuvo.
239
+
240
+ ### Análisis Avanzado y Navegación
241
+ ```python
242
+ from scrapling.fetchers import Fetcher
243
+
244
+ # Selección rica de elementos y navegación
245
+ page = Fetcher.get('https://quotes.toscrape.com/')
246
+
247
+ # Obtén citas con múltiples métodos de selección
248
+ quotes = page.css('.quote') # Selector CSS
249
+ quotes = page.xpath('//div[@class="quote"]') # XPath
250
+ quotes = page.find_all('div', {'class': 'quote'}) # Estilo BeautifulSoup
251
+ # Igual que
252
+ quotes = page.find_all('div', class_='quote')
253
+ quotes = page.find_all(['div'], class_='quote')
254
+ quotes = page.find_all(class_='quote') # y así sucesivamente...
255
+ # Encuentra elementos por contenido de texto
256
+ quotes = page.find_by_text('quote', tag='div')
257
+
258
+ # Navegación avanzada
259
+ quote_text = page.css('.quote')[0].css('.text::text').get()
260
+ quote_text = page.css('.quote').css('.text::text').getall() # Selectores encadenados
261
+ first_quote = page.css('.quote')[0]
262
+ author = first_quote.next_sibling.css('.author::text')
263
+ parent_container = first_quote.parent
264
+
265
+ # Relaciones y similitud de elementos
266
+ similar_elements = first_quote.find_similar()
267
+ below_elements = first_quote.below_elements()
268
+ ```
269
+ Puedes usar el parser directamente si no necesitas obtener sitios web, como se muestra a continuación:
270
+ ```python
271
+ from scrapling.parser import Selector
272
+
273
+ page = Selector("<html>...</html>")
274
+ ```
275
+ ¡Y funciona exactamente de la misma manera!
276
+
277
+ ### Ejemplos de Gestión de Session Async
278
+ ```python
279
+ import asyncio
280
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
281
+
282
+ async with FetcherSession(http3=True) as session: # `FetcherSession` es consciente del contexto y puede funcionar tanto en patrones sync/async
283
+ page1 = session.get('https://quotes.toscrape.com/')
284
+ page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
285
+
286
+ # Uso de sesión async
287
+ async with AsyncStealthySession(max_pages=2) as session:
288
+ tasks = []
289
+ urls = ['https://example.com/page1', 'https://example.com/page2']
290
+
291
+ for url in urls:
292
+ task = session.fetch(url)
293
+ tasks.append(task)
294
+
295
+ print(session.get_pool_stats()) # Opcional - El estado del pool de pestañas del navegador (ocupado/libre/error)
296
+ results = await asyncio.gather(*tasks)
297
+ print(session.get_pool_stats())
298
+ ```
299
+
300
+ ## CLI y Shell Interactivo
301
+
302
+ Scrapling incluye una poderosa interfaz de línea de comandos:
303
+
304
+ [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
305
+
306
+ Lanzar el Shell interactivo de Web Scraping
307
+ ```bash
308
+ scrapling shell
309
+ ```
310
+ Extraer páginas a un archivo directamente sin programar (Extrae el contenido dentro de la etiqueta `body` por defecto). Si el archivo de salida termina con `.txt`, entonces se extraerá el contenido de texto del objetivo. Si termina con `.md`, será una representación Markdown del contenido HTML; si termina con `.html`, será el contenido HTML en sí mismo.
311
+ ```bash
312
+ scrapling extract get 'https://example.com' content.md
313
+ scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Todos los elementos que coinciden con el selector CSS '#fromSkipToProducts'
314
+ scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
315
+ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
316
+ ```
317
+
318
+ > [!NOTE]
319
+ > Hay muchas características adicionales, incluyendo el servidor MCP y el Shell Interactivo de Web Scraping, pero queremos mantener esta página concisa. Consulta la documentación completa [aquí](https://scrapling.readthedocs.io/en/latest/)
320
+
321
+ ## Benchmarks de Rendimiento
322
+
323
+ Scrapling no solo es potente, también es ultrarrápido. Los siguientes benchmarks comparan el parser de Scrapling con las últimas versiones de otras bibliotecas populares.
324
+
325
+ ### Prueba de Velocidad de Extracción de Texto (5000 elementos anidados)
326
+
327
+ | # | Biblioteca | Tiempo (ms) | vs Scrapling |
328
+ |---|:-----------------:|:-----------:|:------------:|
329
+ | 1 | Scrapling | 2.02 | 1.0x |
330
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
331
+ | 3 | Raw Lxml | 2.54 | 1.257x |
332
+ | 4 | PyQuery | 24.17 | ~12x |
333
+ | 5 | Selectolax | 82.63 | ~41x |
334
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
335
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
336
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
337
+
338
+
339
+ ### Rendimiento de Similitud de Elementos y Búsqueda de Texto
340
+
341
+ Las capacidades de búsqueda adaptativa de elementos de Scrapling superan significativamente a las alternativas:
342
+
343
+ | Biblioteca | Tiempo (ms) | vs Scrapling |
344
+ |-------------|:-----------:|:------------:|
345
+ | Scrapling | 2.39 | 1.0x |
346
+ | AutoScraper | 12.45 | 5.209x |
347
+
348
+
349
+ > Todos los benchmarks representan promedios de más de 100 ejecuciones. Ver [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) para la metodología.
350
+
351
+ ## Instalación
352
+
353
+ Scrapling requiere Python 3.10 o superior:
354
+
355
+ ```bash
356
+ pip install scrapling
357
+ ```
358
+
359
+ Esta instalación solo incluye el motor de análisis y sus dependencias, sin ningún fetcher ni dependencias de línea de comandos.
360
+
361
+ ### Dependencias Opcionales
362
+
363
+ 1. Si vas a usar alguna de las características adicionales a continuación, los fetchers, o sus clases, necesitarás instalar las dependencias de los fetchers y sus dependencias del navegador de la siguiente manera:
364
+ ```bash
365
+ pip install "scrapling[fetchers]"
366
+
367
+ scrapling install # normal install
368
+ scrapling install --force # force reinstall
369
+ ```
370
+
371
+ Esto descarga todos los navegadores, junto con sus dependencias del sistema y dependencias de manipulación de fingerprint.
372
+
373
+ O puedes instalarlos desde el código en lugar de ejecutar un comando:
374
+ ```python
375
+ from scrapling.cli import install
376
+
377
+ install([], standalone_mode=False) # normal install
378
+ install(["--force"], standalone_mode=False) # force reinstall
379
+ ```
380
+
381
+ 2. Características adicionales:
382
+ - Instalar la característica del servidor MCP:
383
+ ```bash
384
+ pip install "scrapling[ai]"
385
+ ```
386
+ - Instalar características del Shell (Shell de Web Scraping y el comando `extract`):
387
+ ```bash
388
+ pip install "scrapling[shell]"
389
+ ```
390
+ - Instalar todo:
391
+ ```bash
392
+ pip install "scrapling[all]"
393
+ ```
394
+ Recuerda que necesitas instalar las dependencias del navegador con `scrapling install` después de instalar cualquiera de estos extras (si no lo hiciste ya).
395
+
396
+ ### Docker
397
+ También puedes instalar una imagen Docker con todos los extras y navegadores con el siguiente comando desde DockerHub:
398
+ ```bash
399
+ docker pull pyd4vinci/scrapling
400
+ ```
401
+ O descárgala desde el registro de GitHub:
402
+ ```bash
403
+ docker pull ghcr.io/d4vinci/scrapling:latest
404
+ ```
405
+ Esta imagen se construye y publica automáticamente usando GitHub Actions y la rama principal del repositorio.
406
+
407
+ ## Contribuir
408
+
409
+ ¡Damos la bienvenida a las contribuciones! Por favor lee nuestras [pautas de contribución](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) antes de comenzar.
410
+
411
+ ## Descargo de Responsabilidad
412
+
413
+ > [!CAUTION]
414
+ > Esta biblioteca se proporciona solo con fines educativos y de investigación. Al usar esta biblioteca, aceptas cumplir con las leyes locales e internacionales de scraping de datos y privacidad. Los autores y contribuyentes no son responsables de ningún mal uso de este software. Respeta siempre los términos de servicio de los sitios web y los archivos robots.txt.
415
+
416
+ ## Licencia
417
+
418
+ Este trabajo está licenciado bajo la Licencia BSD-3-Clause.
419
+
420
+ ## Agradecimientos
421
+
422
+ Este proyecto incluye código adaptado de:
423
+ - Parsel (Licencia BSD)—Usado para el submódulo [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)
424
+
425
+ ---
426
+ <div align="center"><small>Diseñado y elaborado con ❤️ por Karim Shoair.</small></div><br>
docs/README_JP.md ADDED
@@ -0,0 +1,426 @@
1
+ <!-- mcp-name: io.github.D4Vinci/Scrapling -->
2
+
3
+ <h1 align="center">
4
+ <a href="https://scrapling.readthedocs.io">
5
+ <picture>
6
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
7
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
8
+ </picture>
9
+ </a>
10
+ <br>
11
+ <small>Effortless Web Scraping for the Modern Web</small>
12
+ </h1>
13
+
14
+ <p align="center">
15
+ <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
16
+ <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
17
+ <a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
18
+ <img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
19
+ <a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
20
+ <img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
21
+ <br/>
22
+ <a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
23
+ <img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
24
+ </a>
25
+ <a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
26
+ <img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
27
+ </a>
28
+ <br/>
29
+ <a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
30
+ <img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
31
+ </p>
32
+
33
+ <p align="center">
34
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>選択メソッド</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Fetcherの選び方</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>スパイダー</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>プロキシローテーション</strong></a>
41
+ &middot;
42
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
43
+ &middot;
44
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>MCPモード</strong></a>
45
+ </p>
46
+
47
+ Scraplingは、単一のリクエストから本格的なクロールまですべてを処理する適応型Web Scrapingフレームワークです。
48
+
49
+ そのパーサーはウェブサイトの変更から学習し、ページが更新されたときに要素を自動的に再配置します。Fetcherはすぐに使えるCloudflare Turnstileなどのアンチボットシステムを回避します。そしてSpiderフレームワークにより、Pause & Resumeや自動Proxy回転機能を備えた並行マルチSessionクロールへとスケールアップできます — すべてわずか数行のPythonで。1つのライブラリ、妥協なし。
50
+
51
+ リアルタイム統計とStreamingによる超高速クロール。Web Scraperによって、Web Scraperと一般ユーザーのために構築され、誰にでも何かがあります。
52
+
53
+ ```python
54
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
55
+ StealthyFetcher.adaptive = True
56
+ p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # レーダーの下でウェブサイトを取得!
57
+ products = p.css('.product', auto_save=True) # ウェブサイトのデザイン変更に耐えるデータをスクレイプ!
58
+ products = p.css('.product', adaptive=True) # 後でウェブサイトの構造が変わったら、`adaptive=True`を渡して見つける!
59
+ ```
60
+ または本格的なクロールへスケールアップ
61
+ ```python
62
+ from scrapling.spiders import Spider, Response
63
+
64
+ class MySpider(Spider):
65
+ name = "demo"
66
+ start_urls = ["https://example.com/"]
67
+
68
+ async def parse(self, response: Response):
69
+ for item in response.css('.product'):
70
+ yield {"title": item.css('h2::text').get()}
71
+
72
+ MySpider().start()
73
+ ```
74
+
75
+
76
+ # プラチナスポンサー
77
+
78
+ <i><sub>ここに最初に表示される企業になりませんか?[こちら](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)をクリック</sub></i>
79
+ # スポンサー
80
+
81
+ <!-- sponsors -->
82
+
83
+ <a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
84
+ <a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
85
+ <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
86
+ <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
87
+ <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
88
+ <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
89
+ <a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
90
+ <a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
91
+ <a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
92
+
93
+
94
+ <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
95
+ <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
96
+ <a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
97
+
98
+ <!-- /sponsors -->
99
+
100
+ <i><sub>ここに広告を表示したいですか?[こちら](https://github.com/sponsors/D4Vinci)をクリックして、あなたに合ったティアを選択してください!</sub></i>
101
+
102
+ ---
103
+
104
+ ## 主な機能
105
+
106
+ ### Spider — 本格的なクロールフレームワーク
107
+ - 🕷️ **Scrapy風のSpider API**:`start_urls`、async `parse` callback、`Request`/`Response`オブジェクトでSpiderを定義。
108
+ - ⚡ **並行クロール**:設定可能な並行数制限、ドメインごとのスロットリング、ダウンロード遅延。
109
+ - 🔄 **マルチSessionサポート**:HTTPリクエストとステルスヘッドレスブラウザの統一インターフェース — IDによって異なるSessionにリクエストをルーティング。
110
+ - 💾 **Pause & Resume**:Checkpointベースのクロール永続化。Ctrl+Cで正常にシャットダウン;再起動すると中断したところから再開。
111
+ - 📡 **Streamingモード**:`async for item in spider.stream()`でリアルタイム統計とともにスクレイプされたアイテムをStreamingで受信 — UI、パイプライン、長時間実行クロールに最適。
112
+ - 🛡️ **ブロックされたリクエストの検出**:カスタマイズ可能なロジックによるブロックされたリクエストの自動検出とリトライ。
113
+ - 📦 **組み込みエクスポート**:フックや独自のパイプライン、または組み込みのJSON/JSONLで結果をエクスポート。それぞれ`result.items.to_json()` / `result.items.to_jsonl()`を使用。
114
+
115
+ ### Sessionサポート付き高度なウェブサイト取得
116
+ - **HTTPリクエスト**:`Fetcher`クラスで高速かつステルスなHTTPリクエスト。ブラウザのTLS fingerprint、ヘッダーを模倣し、HTTP/3を使用可能。
117
+ - **動的読み込み**:PlaywrightのChromiumとGoogle Chromeをサポートする`DynamicFetcher`クラスによる完全なブラウザ自動化で動的ウェブサイトを取得。
118
+ - **アンチボット回避**:`StealthyFetcher`とfingerprint偽装による高度なステルス機能。自動化でCloudflareのTurnstile/Interstitialのすべてのタイプを簡単に回避。
119
+ - **Session管理**:リクエスト間でCookieと状態を管理するための`FetcherSession`、`StealthySession`、`DynamicSession`クラスによる永続的なSessionサポート。
120
+ - **Proxy回転**:すべてのSessionタイプに対応したラウンドロビンまたはカスタム戦略の組み込み`ProxyRotator`、さらにリクエストごとのProxyオーバーライド。
121
+ - **ドメインブロック**:ブラウザベースのFetcherで特定のドメイン(およびそのサブドメイン)へのリクエストをブロック。
122
+ - **asyncサポート**:すべてのFetcherおよび専用asyncSessionクラス全体での完全なasyncサポート。
123
+
124
+ ### 適応型スクレイピングとAI統合
125
+ - 🔄 **スマート要素追跡**:インテリジェントな類似性アルゴリズムを使用してウェブサイトの変更後に要素を再配置。
126
+ - 🎯 **スマート柔軟選択**:CSSセレクタ、XPathセレクタ、フィルタベース検索、テキスト検索、正規表現検索など。
127
+ - 🔍 **類似要素の検出**:見つかった要素に類似した要素を自動的に特定。
128
+ - 🤖 **AIと使用するMCPサーバー**:AI支援Web Scrapingとデータ抽出のための組み込みMCPサーバー。MCPサーバーは、AI(Claude/Cursorなど)に渡す前にScraplingを活用してターゲットコンテンツを抽出する強力でカスタムな機能を備えており、操作を高速化し、トークン使用量を最小限に抑えることでコストを削減します。([デモ動画](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
129
+
130
+ ### 高性能で実戦テスト済みのアーキテクチャ
131
+ - 🚀 **超高速**:ほとんどのPythonスクレイピングライブラリを上回る最適化されたパフォーマンス。
132
+ - 🔋 **メモリ効率**:最小のメモリフットプリントのための最適化されたデータ構造と遅延読み込み。
133
+ - ⚡ **高速JSONシリアル化**:標準ライブラリの10倍の速度。
134
+ - 🏗️ **実戦テスト済み**:Scraplingは92%のテストカバレッジと完全な型ヒントカバレッジを備えているだけでなく、過去1年間に数百人のWeb Scraperによって毎日使用されてきました。
135
+
136
+ ### 開発者/Web Scraperにやさしい体験
137
+ - 🎯 **インタラクティブWeb Scraping Shell**:Scrapling統合、ショートカット、curlリクエストをScraplingリクエストに変換したり、ブラウザでリクエスト結果を表示したりするなどの新しいツールを備えたオプションの組み込みIPython Shellで、Web Scrapingスクリプトの開発を加速。
138
+ - 🚀 **ターミナルから直接使用**:オプションで、コードを一行も書かずにScraplingを使用してURLをスクレイプできます!
139
+ - 🛠️ **豊富なナビゲーションAPI**:親、兄弟、子のナビゲーションメソッドによる高度なDOMトラバーサル。
140
+ - 🧬 **強化されたテキスト処理**:組み込みの正規表現、クリーニングメソッド、最適化された文字列操作。
141
+ - 📝 **自動セレクタ生成**:任意の要素に対して堅牢なCSS/XPathセレクタを生成。
142
+ - 🔌 **馴染みのあるAPI**:Scrapy/Parselで使用されている同じ疑似要素を持つScrapy/BeautifulSoupに似た設計。
143
+ - 📘 **完全な型カバレッジ**:優れたIDEサポートとコード補完のための完全な型ヒント。コードベース全体が変更のたびに**PyRight**と**MyPy**で自動的にスキャンされます。
144
+ - 🔋 **すぐに使えるDockerイメージ**:各リリースで、すべてのブラウザを含むDockerイメージが自動的にビルドおよびプッシュされます。
145
+
146
+ ## はじめに
147
+
148
+ 深く掘り下げずに、Scraplingにできることの簡単な概要をお見せしましょう。
149
+
150
+ ### 基本的な使い方
151
+ Sessionサポート付きHTTPリクエスト
152
+ ```python
153
+ from scrapling.fetchers import Fetcher, FetcherSession
154
+
155
+ with FetcherSession(impersonate='chrome') as session: # ChromeのTLS fingerprintの最新バージョンを使用
156
+ page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
157
+ quotes = page.css('.quote .text::text').getall()
158
+
159
+ # または一回限りのリクエストを使用
160
+ page = Fetcher.get('https://quotes.toscrape.com/')
161
+ quotes = page.css('.quote .text::text').getall()
162
+ ```
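+ 非同期コードから同じ一回限りのリクエストを行う場合は、冒頭の例でインポートした `AsyncFetcher` が使えます(最小限のスケッチ。`AsyncFetcher.get` が `Fetcher.get` の await 可能な版であり、同じレスポンスオブジェクトを返すことを前提としています):
+ ```python
+ import asyncio
+ from scrapling.fetchers import AsyncFetcher
+ 
+ async def main():
+     # 前提: AsyncFetcher.get は await 可能で、Fetcher.get と同じように使える
+     page = await AsyncFetcher.get('https://quotes.toscrape.com/')
+     print(page.css('.quote .text::text').getall())
+ 
+ asyncio.run(main())
+ ```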
163
+ 高度なステルスモード
164
+ ```python
165
+ from scrapling.fetchers import StealthyFetcher, StealthySession
166
+
167
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # 完了するまでブラウザを開いたままにする
168
+ page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
169
+ data = page.css('#padded_content a').getall()
170
+
171
+ # または一回限りのリクエストスタイル、このリクエストのためにブラウザを開き、完了後に閉じる
172
+ page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
173
+ data = page.css('#padded_content a').getall()
174
+ ```
175
+ 完全なブラウザ自動化
176
+ ```python
177
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
178
+
179
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # 完了するまでブラウザを開いたままにする
180
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
181
+ data = page.xpath('//span[@class="text"]/text()').getall() # お好みであればXPathセレクタを使用
182
+
183
+ # または一回限りのリクエストスタイル、このリクエストのためにブラウザを開き、完了後に閉じる
184
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
185
+ data = page.css('.quote .text::text').getall()
186
+ ```
187
+
188
+ ### Spider
189
+ 並行リクエスト、複数のSessionタイプ、Pause & Resumeを備えた本格的なクローラーを構築:
190
+ ```python
191
+ from scrapling.spiders import Spider, Request, Response
192
+
193
+ class QuotesSpider(Spider):
194
+ name = "quotes"
195
+ start_urls = ["https://quotes.toscrape.com/"]
196
+ concurrent_requests = 10
197
+
198
+ async def parse(self, response: Response):
199
+ for quote in response.css('.quote'):
200
+ yield {
201
+ "text": quote.css('.text::text').get(),
202
+ "author": quote.css('.author::text').get(),
203
+ }
204
+
205
+ next_page = response.css('.next a')
206
+ if next_page:
207
+ yield response.follow(next_page[0].attrib['href'])
208
+
209
+ result = QuotesSpider().start()
210
+ print(f"{len(result.items)}件の引用をスクレイプしました")
211
+ result.items.to_json("quotes.json")
212
+ ```
213
+ 単一のSpiderで複数のSessionタイプを使用:
214
+ ```python
215
+ from scrapling.spiders import Spider, Request, Response
216
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
217
+
218
+ class MultiSessionSpider(Spider):
219
+ name = "multi"
220
+ start_urls = ["https://example.com/"]
221
+
222
+ def configure_sessions(self, manager):
223
+ manager.add("fast", FetcherSession(impersonate="chrome"))
224
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
225
+
226
+ async def parse(self, response: Response):
227
+ for link in response.css('a::attr(href)').getall():
228
+ # 保護されたページはステルスSessionを通してルーティング
229
+ if "protected" in link:
230
+ yield Request(link, sid="stealth")
231
+ else:
232
+ yield Request(link, sid="fast", callback=self.parse) # 明示的なcallback
233
+ ```
234
+ Checkpointを使用して長時間のクロールをPause & Resume:
235
+ ```python
236
+ QuotesSpider(crawldir="./crawl_data").start()
237
+ ```
238
+ Ctrl+Cを押すと正常に一時停止し、進捗は自動的に保存されます。後でSpiderを再度起動する際に同じ`crawldir`を渡すと、中断したところから再開します。
239
+
240
+ ### 高度なパースとナビゲーション
241
+ ```python
242
+ from scrapling.fetchers import Fetcher
243
+
244
+ # 豊富な要素選択とナビゲーション
245
+ page = Fetcher.get('https://quotes.toscrape.com/')
246
+
247
+ # 複数の選択メソッドで引用を取得
248
+ quotes = page.css('.quote') # CSSセレクタ
249
+ quotes = page.xpath('//div[@class="quote"]') # XPath
250
+ quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoupスタイル
251
+ # 以下と同じ
252
+ quotes = page.find_all('div', class_='quote')
253
+ quotes = page.find_all(['div'], class_='quote')
254
+ quotes = page.find_all(class_='quote') # など...
255
+ # テキスト内容で要素を検索
256
+ quotes = page.find_by_text('quote', tag='div')
257
+
258
+ # 高度なナビゲーション
259
+ quote_text = page.css('.quote')[0].css('.text::text').get()
260
+ quote_text = page.css('.quote').css('.text::text').getall() # チェーンセレクタ
261
+ first_quote = page.css('.quote')[0]
262
+ author = first_quote.next_sibling.css('.author::text')
263
+ parent_container = first_quote.parent
264
+
265
+ # 要素の関連性と類似性
266
+ similar_elements = first_quote.find_similar()
267
+ below_elements = first_quote.below_elements()
268
+ ```
269
+ ウェブサイトを取得せずにパーサーをすぐに使用することもできます:
270
+ ```python
271
+ from scrapling.parser import Selector
272
+
273
+ page = Selector("<html>...</html>")
274
+ ```
275
+ まったく同じ方法で動作します!
276
+
277
+ ### 非同期Session管理の例
278
+ ```python
279
+ import asyncio
280
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
281
+
282
+ async with FetcherSession(http3=True) as session: # `FetcherSession`はコンテキストアウェアで、同期/非同期両方のパターンで動作可能
283
+ page1 = session.get('https://quotes.toscrape.com/')
284
+ page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
285
+
286
+ # 非同期Sessionの使用
287
+ async with AsyncStealthySession(max_pages=2) as session:
288
+ tasks = []
289
+ urls = ['https://example.com/page1', 'https://example.com/page2']
290
+
291
+ for url in urls:
292
+ task = session.fetch(url)
293
+ tasks.append(task)
294
+
295
+ print(session.get_pool_stats()) # オプション - ブラウザタブプールのステータス(ビジー/フリー/エラー)
296
+ results = await asyncio.gather(*tasks)
297
+ print(session.get_pool_stats())
298
+ ```
299
+
300
+ ## CLIとインタラクティブShell
301
+
302
+ Scraplingには強力なコマンドラインインターフェースが含まれています:
303
+
304
+ [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
305
+
306
+ インタラクティブWeb Scraping Shellを起動
307
+ ```bash
308
+ scrapling shell
309
+ ```
310
+ プログラミングせずに直接ページをファイルに抽出(デフォルトで`body`タグ内のコンテンツを抽出)。出力ファイルが`.txt`で終わる場合、ターゲットのテキストコンテンツが抽出されます。`.md`で終わる場合、HTMLコンテンツのMarkdown表現になります。`.html`で終わる場合、HTMLコンテンツそのものになります。
311
+ ```bash
312
+ scrapling extract get 'https://example.com' content.md
313
+ scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # CSSセレクタ'#fromSkipToProducts'に一致するすべての要素
314
+ scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
315
+ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
316
+ ```
317
+
318
+ > [!NOTE]
319
+ > MCPサーバーやインタラクティブWeb Scraping Shellなど、他にも多くの追加機能がありますが、このページは簡潔に保ちたいと思います。完全なドキュメントは[こちら](https://scrapling.readthedocs.io/en/latest/)をご覧ください
320
+
321
+ ## パフォーマンスベンチマーク
322
+
323
+ Scraplingは強力であるだけでなく、超高速です。以下のベンチマークは、Scraplingのパーサーを他の人気ライブラリの最新バージョンと比較しています。
324
+
325
+ ### テキスト抽出速度テスト(5000個のネストされた要素)
326
+
327
+ | # | ライブラリ | 時間(ms) | vs Scrapling |
328
+ |---|:-----------------:|:---------:|:------------:|
329
+ | 1 | Scrapling | 2.02 | 1.0x |
330
+ | 2 | Parsel/Scrapy | 2.04 | 1.01x |
331
+ | 3 | Raw Lxml | 2.54 | 1.257x |
332
+ | 4 | PyQuery | 24.17 | ~12x |
333
+ | 5 | Selectolax | 82.63 | ~41x |
334
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
335
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
336
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
337
+
338
+
339
+ ### 要素類似性とテキスト検索のパフォーマンス
340
+
341
+ Scraplingの適応型要素検索機能は代替手段を大幅に上回ります:
342
+
343
+ | ライブラリ | 時間(ms) | vs Scrapling |
344
+ |-------------|:---------:|:------------:|
345
+ | Scrapling | 2.39 | 1.0x |
346
+ | AutoScraper | 12.45 | 5.209x |
347
+
348
+
349
+ > すべてのベンチマークは100回以上の実行の平均を表します。方法論については[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)を参照してください。
350
+
351
+ ## インストール
352
+
353
+ ScraplingにはPython 3.10以上が必要です:
354
+
355
+ ```bash
356
+ pip install scrapling
357
+ ```
358
+
359
+ このインストールにはパーサーエンジンとその依存関係のみが含まれており、Fetcherやコマンドライン依存関係は含まれていません。
360
+
361
+ ### オプションの依存関係
362
+
363
+ 1. 以下の追加機能、Fetcher、またはそれらのクラスのいずれかを使用する場合は、Fetcherの依存関係とブラウザの依存関係を次のようにインストールする必要があります:
364
+ ```bash
365
+ pip install "scrapling[fetchers]"
366
+
367
+ scrapling install # normal install
368
+ scrapling install --force # force reinstall
369
+ ```
370
+
371
+ これにより、すべてのブラウザ、およびそれらのシステム依存関係とfingerprint操作依存関係がダウンロードされます。
372
+
373
+ または、コマンドを実行する代わりにコードからインストールすることもできます:
374
+ ```python
375
+ from scrapling.cli import install
376
+
377
+ install([], standalone_mode=False) # normal install
378
+ install(["--force"], standalone_mode=False) # force reinstall
379
+ ```
380
+
381
+ 2. 追加機能:
382
+ - MCPサーバー機能をインストール:
383
+ ```bash
384
+ pip install "scrapling[ai]"
385
+ ```
386
+ - Shell機能(Web Scraping Shellと`extract`コマンド)をインストール:
387
+ ```bash
388
+ pip install "scrapling[shell]"
389
+ ```
390
+ - すべてをインストール:
391
+ ```bash
392
+ pip install "scrapling[all]"
393
+ ```
394
+ これらの追加機能のいずれかをインストールした後(まだインストールしていない場合)、`scrapling install`でブラウザの依存関係をインストールする必要があることを忘れないでください
395
+
396
+ ### Docker
397
+ DockerHubから次のコマンドですべての追加機能とブラウザを含むDockerイメージをインストールすることもできます:
398
+ ```bash
399
+ docker pull pyd4vinci/scrapling
400
+ ```
401
+ またはGitHubレジストリからダウンロード:
402
+ ```bash
403
+ docker pull ghcr.io/d4vinci/scrapling:latest
404
+ ```
405
+ このイメージは、GitHub Actionsとリポジトリのメインブランチを使用して自動的にビルドおよびプッシュされます。
406
+
407
+ ## 貢献
408
+
409
+ 貢献を歓迎します!始める前に[貢献ガイドライン](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md)をお読みください。
410
+
411
+ ## 免責事項
412
+
413
+ > [!CAUTION]
414
+ > このライブラリは教育および研究目的のみで提供されています。このライブラリを使用することにより、地域および国際的なデータスクレイピングおよびプライバシー法に準拠することに同意したものとみなされます。著者および貢献者は、このソフトウェアの誤用について責任を負いません。常にウェブサイトの利用規約とrobots.txtファイルを尊重してください。
415
+
416
+ ## ライセンス
417
+
418
+ この作品はBSD-3-Clauseライセンスの下でライセンスされています。
419
+
420
+ ## 謝辞
421
+
422
+ このプロジェクトには次から適応されたコードが含まれています:
423
+ - Parsel(BSDライセンス)— [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)サブモジュールに使用
424
+
425
+ ---
426
+ <div align="center"><small>Karim Shoairによって❤️でデザインおよび作成されました。</small></div><br>
docs/README_RU.md ADDED
@@ -0,0 +1,426 @@
1
+ <!-- mcp-name: io.github.D4Vinci/Scrapling -->
2
+
3
+ <h1 align="center">
4
+ <a href="https://scrapling.readthedocs.io">
5
+ <picture>
6
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true">
7
+ <img alt="Scrapling Poster" src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true">
8
+ </picture>
9
+ </a>
10
+ <br>
11
+ <small>Effortless Web Scraping for the Modern Web</small>
12
+ </h1>
13
+
14
+ <p align="center">
15
+ <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
16
+ <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
17
+ <a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
18
+ <img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
19
+ <a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
20
+ <img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
21
+ <br/>
22
+ <a href="https://discord.gg/EMgGbDceNQ" alt="Discord" target="_blank">
23
+ <img alt="Discord" src="https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ">
24
+ </a>
25
+ <a href="https://x.com/Scrapling_dev" alt="X (formerly Twitter)">
26
+ <img alt="X (formerly Twitter) Follow" src="https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev">
27
+ </a>
28
+ <br/>
29
+ <a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
30
+ <img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
31
+ </p>
32
+
33
+ <p align="center">
34
+ <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/"><strong>Методы выбора</strong></a>
35
+ &middot;
36
+ <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/"><strong>Выбор Fetcher</strong></a>
37
+ &middot;
38
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>Пауки</strong></a>
39
+ &middot;
40
+ <a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>Ротация прокси</strong></a>
41
+ &middot;
42
+ <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/"><strong>CLI</strong></a>
43
+ &middot;
44
+ <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/"><strong>Режим MCP</strong></a>
45
+ </p>
46
+
47
+ Scrapling — это адаптивный фреймворк для Web Scraping, который берёт на себя всё: от одного запроса до полномасштабного обхода сайтов.
48
+
49
+ Его парсер учится на изменениях сайтов и автоматически перемещает ваши элементы при обновлении страниц. Его Fetcher'ы обходят анти-бот системы вроде Cloudflare Turnstile прямо из коробки. А его Spider-фреймворк позволяет масштабироваться до параллельных, многосессионных обходов с Pause & Resume и автоматической ротацией Proxy — и всё это в нескольких строках Python. Одна библиотека, без компромиссов.
50
+
51
+ Молниеносно быстрые обходы с отслеживанием статистики в реальном времени и Streaming. Создано веб-скраперами для веб-скраперов и обычных пользователей — здесь есть что-то для каждого.
52
+
53
+ ```python
54
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
55
+ StealthyFetcher.adaptive = True
56
+ p = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Загрузите сайт незаметно!
57
+ products = p.css('.product', auto_save=True) # Скрапьте данные, которые переживут изменения дизайна сайта!
58
+ products = p.css('.product', adaptive=True) # Позже, если структура сайта изменится, передайте `adaptive=True`, чтобы найти их!
59
+ ```
60
+ Или масштабируйте до полного обхода
61
+ ```python
62
+ from scrapling.spiders import Spider, Response
63
+
64
+ class MySpider(Spider):
65
+ name = "demo"
66
+ start_urls = ["https://example.com/"]
67
+
68
+ async def parse(self, response: Response):
69
+ for item in response.css('.product'):
70
+ yield {"title": item.css('h2::text').get()}
71
+
72
+ MySpider().start()
73
+ ```
74
+
75
+
76
+ # Platinum Sponsors
77
+
78
+ <i><sub>Want to be the first company to appear here? Click [here](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>
79
+ # Sponsors
80
+
81
+ <!-- sponsors -->
82
+
83
+ <a href="https://www.scrapeless.com/en?utm_source=official&utm_term=scrapling" target="_blank" title="Effortless Web Scraping Toolkit for Business and Developers"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg"></a>
84
+ <a href="https://www.thordata.com/?ls=github&lk=github" target="_blank" title="Unblockable proxies and scraping infrastructure, delivering real-time, reliable web data to power AI models and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
85
+ <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
86
+ <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
87
+ <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
88
+ <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
89
+ <a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
90
+ <a href="https://proxyempire.io/" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
91
+ <a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png"></a>
92
+
93
+
94
+ <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
95
+ <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
96
+ <a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
97
+
98
+ <!-- /sponsors -->
99
+
100
+ <i><sub>Want to show your ad here? Click [here](https://github.com/sponsors/D4Vinci) and pick the tier that suits you!</sub></i>
101
+
102
+ ---
103
+
104
+ ## Key Features
105
+
106
+ ### Spiders: A Full Website-Crawling Framework
107
+ - 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects.
108
+ - ⚡ **Concurrent crawling**: Configurable concurrency limits, per-domain rate limiting, and download delays.
109
+ - 🔄 **Multi-session support**: A single interface for HTTP requests and stealthy headless browsers within one spider; route requests to different sessions by ID.
110
+ - 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C to stop gracefully; restart to resume from where you left off.
111
+ - 📡 **Streaming mode**: Stream scraped items as they arrive through `async for item in spider.stream()` with real-time stats; perfect for UIs, pipelines, and long-running crawls (see the sketch after this list).
112
+ - 🛡️ **Blocked-request detection**: Automatically detect and retry blocked requests with configurable logic.
113
+ - 📦 **Built-in export**: Export results through hooks and your own pipeline, or use the built-in JSON/JSONL export with `result.items.to_json()` / `result.items.to_jsonl()` respectively.
114
+
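+ As a quick illustration of the streaming mode mentioned above, here is a minimal sketch. It assumes `stream()` is an async generator that yields items while the crawl is still running; check the Spiders documentation for the exact signature and the live-stats API.
+
+ ```python
+ import asyncio
+
+ from scrapling.spiders import Spider, Response
+
+ class StreamingSpider(Spider):
+     name = "streaming-demo"
+     start_urls = ["https://quotes.toscrape.com/"]
+
+     async def parse(self, response: Response):
+         for quote in response.css('.quote'):
+             yield {"text": quote.css('.text::text').get()}
+
+ async def main():
+     # Assumption: stream() yields scraped items as the crawl progresses
+     async for item in StreamingSpider().stream():
+         print(item)
+
+ asyncio.run(main())
+ ```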
115
+ ### Advanced Website Fetching with Session Support
116
+ - **HTTP requests**: Fast and stealthy HTTP requests with the `Fetcher` class. It can impersonate browsers' TLS fingerprints and headers, and use HTTP/3.
117
+ - **Dynamic loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class, which supports Playwright's Chromium and Google Chrome.
118
+ - **Anti-bot bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. It can easily bypass all types of Cloudflare Turnstile/Interstitial with automation.
119
+ - **Session management**: Persistent session support with the `FetcherSession`, `StealthySession`, and `DynamicSession` classes for managing cookies and state across requests.
120
+ - **Proxy rotation**: A built-in `ProxyRotator` with round-robin or custom strategies for all session types, plus per-request proxy overrides.
121
+ - **Domain blocking**: Block requests to specific domains (and their subdomains) in the browser-based fetchers.
122
+ - **Async support**: Full async support across all fetchers, plus dedicated async session classes.
123
+
124
+ ### Adaptive Scraping and AI Integration
125
+ - 🔄 **Smart element tracking**: Relocate elements after website changes using intelligent similarity algorithms.
126
+ - 🎯 **Smart, flexible selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
127
+ - 🔍 **Find similar elements**: Automatically find elements similar to the ones you've already found.
128
+ - 🤖 **MCP server for AI use**: A built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server has powerful custom capabilities that use Scrapling to extract the targeted content before passing it to the AI (Claude/Cursor/etc.), speeding up operations and cutting costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
129
+
130
+ ### High-Performance, Battle-Tested Architecture
131
+ - 🚀 **Lightning fast**: Optimized performance that outperforms most Python scraping libraries.
132
+ - 🔋 **Memory efficient**: Optimized data structures and lazy loading for a minimal memory footprint.
133
+ - ⚡ **Fast JSON serialization**: 10x faster than the standard library.
134
+ - 🏗️ **Battle tested**: Scrapling not only has 92% test coverage and full type-hint coverage, but it has also been used daily by hundreds of web scrapers over the past year.
135
+
136
+ ### Developer/Web-Scraper-Friendly Experience
137
+ - 🎯 **Interactive Web Scraping shell**: An optional built-in IPython shell with Scrapling integration, shortcuts, and new tools that speed up writing Web Scraping scripts, such as converting curl requests to Scrapling requests and viewing request results in your browser.
138
+ - 🚀 **Use it straight from the terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code!
139
+ - 🛠️ **Rich navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.
140
+ - 🧬 **Enhanced text processing**: Built-in regex, cleaning methods, and optimized string operations.
141
+ - 📝 **Automatic selector generation**: Generate robust CSS/XPath selectors for any element.
142
+ - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup, with the same pseudo-elements used in Scrapy/Parsel.
143
+ - 📘 **Complete type coverage**: Full type hints for excellent IDE support and code completion. The whole codebase is automatically checked with **PyRight** and **MyPy** on every change.
144
+ - 🔋 **Ready-made Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
145
+
146
+ ## Getting Started
147
+
148
+ Let's quickly show what Scrapling can do, without a deep dive.
149
+
150
+ ### Basic Usage
151
+ Session-enabled HTTP requests
152
+ ```python
153
+ from scrapling.fetchers import Fetcher, FetcherSession
154
+
155
+ with FetcherSession(impersonate='chrome') as session: # Use the latest version of Chrome's TLS fingerprint
156
+ page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)
157
+ quotes = page.css('.quote .text::text').getall()
158
+
159
+ # Or use one-off requests
160
+ page = Fetcher.get('https://quotes.toscrape.com/')
161
+ quotes = page.css('.quote .text::text').getall()
162
+ ```
163
+ Advanced stealth mode
164
+ ```python
165
+ from scrapling.fetchers import StealthyFetcher, StealthySession
166
+
167
+ with StealthySession(headless=True, solve_cloudflare=True) as session: # Keep the browser open until you're done
168
+ page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
169
+ data = page.css('#padded_content a').getall()
170
+
171
+ # Or use the one-off request style; it opens the browser for this request, then closes it when finished
172
+ page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
173
+ data = page.css('#padded_content a').getall()
174
+ ```
175
+ Full browser automation
176
+ ```python
177
+ from scrapling.fetchers import DynamicFetcher, DynamicSession
178
+
179
+ with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Keep the browser open until you're done
180
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
181
+ data = page.xpath('//span[@class="text"]/text()').getall() # XPath selector, if you prefer it
182
+
183
+ # Or use the one-off request style; it opens the browser for this request, then closes it when finished
184
+ page = DynamicFetcher.fetch('https://quotes.toscrape.com/')
185
+ data = page.css('.quote .text::text').getall()
186
+ ```
187
+
188
+ ### Spiders
189
+ Build full crawlers with concurrent requests, multiple session types, and pause & resume:
190
+ ```python
191
+ from scrapling.spiders import Spider, Request, Response
192
+
193
+ class QuotesSpider(Spider):
194
+ name = "quotes"
195
+ start_urls = ["https://quotes.toscrape.com/"]
196
+ concurrent_requests = 10
197
+
198
+ async def parse(self, response: Response):
199
+ for quote in response.css('.quote'):
200
+ yield {
201
+ "text": quote.css('.text::text').get(),
202
+ "author": quote.css('.author::text').get(),
203
+ }
204
+
205
+ next_page = response.css('.next a')
206
+ if next_page:
207
+ yield response.follow(next_page[0].attrib['href'])
208
+
209
+ result = QuotesSpider().start()
210
+ print(f"Scraped {len(result.items)} quotes")
211
+ result.items.to_json("quotes.json")
212
+ ```
213
+ Use multiple session types in one spider:
214
+ ```python
215
+ from scrapling.spiders import Spider, Request, Response
216
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession
217
+
218
+ class MultiSessionSpider(Spider):
219
+ name = "multi"
220
+ start_urls = ["https://example.com/"]
221
+
222
+ def configure_sessions(self, manager):
223
+ manager.add("fast", FetcherSession(impersonate="chrome"))
224
+ manager.add("stealth", AsyncStealthySession(headless=True), lazy=True)
225
+
226
+ async def parse(self, response: Response):
227
+ for link in response.css('a::attr(href)').getall():
228
+ # Route protected pages through the stealth session
229
+ if "protected" in link:
230
+ yield Request(link, sid="stealth")
231
+ else:
232
+ yield Request(link, sid="fast", callback=self.parse) # explicit callback
233
+ ```
234
+ Pause and resume long crawls with checkpoints by launching the spider like this:
235
+ ```python
236
+ QuotesSpider(crawldir="./crawl_data").start()
237
+ ```
238
+ Press Ctrl+C to stop gracefully; progress is saved automatically. Later, when you run the spider again, pass the same `crawldir` and it will pick up where it left off.
239
+
240
+ ### Advanced Parsing & Navigation
241
+ ```python
242
+ from scrapling.fetchers import Fetcher
243
+
244
+ # Rich element selection and navigation
245
+ page = Fetcher.get('https://quotes.toscrape.com/')
246
+
247
+ # Get the quotes with different selection methods
248
+ quotes = page.css('.quote') # CSS selector
249
+ quotes = page.xpath('//div[@class="quote"]') # XPath
250
+ quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup-style
251
+ # Which is the same as
252
+ quotes = page.find_all('div', class_='quote')
253
+ quotes = page.find_all(['div'], class_='quote')
254
+ quotes = page.find_all(class_='quote') # and so on...
255
+ # Find elements by text content
256
+ quotes = page.find_by_text('quote', tag='div')
257
+
258
+ # Advanced navigation
259
+ quote_text = page.css('.quote')[0].css('.text::text').get()
260
+ quote_text = page.css('.quote').css('.text::text').getall() # Chained selectors
261
+ first_quote = page.css('.quote')[0]
262
+ author = first_quote.next_sibling.css('.author::text')
263
+ parent_container = first_quote.parent
264
+
265
+ # Element relationships and similarity
266
+ similar_elements = first_quote.find_similar()
267
+ below_elements = first_quote.below_elements()
268
+ ```
269
+ You can use the parser directly if you don't want to fetch websites, as shown below:
270
+ ```python
271
+ from scrapling.parser import Selector
272
+
273
+ page = Selector("<html>...</html>")
274
+ ```
275
+ And it works exactly the same way!
276
+
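+ For example, as a small sketch based on the snippet above, the same selection methods work on a standalone `Selector`:
+
+ ```python
+ from scrapling.parser import Selector
+
+ html = "<html><body><div class='quote'><span class='text'>Hello</span></div></body></html>"
+ page = Selector(html)
+ print(page.css('.quote .text::text').get())  # -> "Hello"
+ ```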
277
+ ### Async Session Examples
278
+ ```python
279
+ import asyncio
280
+ from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession
281
+
282
+ async with FetcherSession(http3=True) as session: # `FetcherSession` is context-aware and can run in both sync and async modes
283
+ page1 = session.get('https://quotes.toscrape.com/')
284
+ page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')
285
+
286
+ # Using an async session
287
+ async with AsyncStealthySession(max_pages=2) as session:
288
+ tasks = []
289
+ urls = ['https://example.com/page1', 'https://example.com/page2']
290
+
291
+ for url in urls:
292
+ task = session.fetch(url)
293
+ tasks.append(task)
294
+
295
+ print(session.get_pool_stats()) # Optional: status of the browser tab pool (busy/free/error)
296
+ results = await asyncio.gather(*tasks)
297
+ print(session.get_pool_stats())
298
+ ```
299
+
300
+ ## CLI & Interactive Shell
301
+
302
+ Scrapling ships with a powerful command-line interface:
303
+
304
+ [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
305
+
306
+ Launch the interactive Web Scraping shell
307
+ ```bash
308
+ scrapling shell
309
+ ```
310
+ Extract pages to a file directly without programming (by default, it extracts the content inside the `body` tag). If the output file ends with `.txt`, the target's text content is extracted. If it ends with `.md`, you get a Markdown representation of the HTML content; if it ends with `.html`, you get the HTML content itself.
311
+ ```bash
312
+ scrapling extract get 'https://example.com' content.md
313
+ scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # All elements matching the CSS selector '#fromSkipToProducts'
314
+ scrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless
315
+ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare
316
+ ```
317
+
318
+ > [!NOTE]
319
+ > There are many more features, including the MCP server and the interactive Web Scraping shell, but we want to keep this page short. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
320
+
321
+ ## Benchmarks
322
+
323
+ Scrapling isn't just powerful; it's also blazing fast. The following benchmarks compare Scrapling's parser against the latest versions of other popular libraries.
324
+
325
+ ### Text Extraction Speed Test (5000 nested elements)
326
+
327
+ | # | Library | Time (ms) | vs Scrapling |
328
+ |---|:-----------------:|:----------:|:------------:|
329
+ | 1 | Scrapling | 2.02 | 1.0x |
330
+ | 2 | Parsel/Scrapy | 2.04 | 1.01 |
331
+ | 3 | Raw Lxml | 2.54 | 1.257 |
332
+ | 4 | PyQuery | 24.17 | ~12x |
333
+ | 5 | Selectolax | 82.63 | ~41x |
334
+ | 6 | MechanicalSoup | 1549.71 | ~767.1x |
335
+ | 7 | BS4 with Lxml | 1584.31 | ~784.3x |
336
+ | 8 | BS4 with html5lib | 3391.91 | ~1679.1x |
337
+
338
+
339
+ ### Element Similarity & Text Search Performance
340
+
341
+ Scrapling's adaptive element-finding capabilities significantly outperform the alternatives:
342
+
343
+ | Library | Time (ms) | vs Scrapling |
344
+ |-------------|:----------:|:------------:|
345
+ | Scrapling | 2.39 | 1.0x |
346
+ | AutoScraper | 12.45 | 5.209x |
347
+
348
+
349
+ > All benchmarks are averages over 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for the methodology.
350
+
351
+ ## Installation
352
+
353
+ Scrapling requires Python 3.10 or higher:
354
+
355
+ ```bash
356
+ pip install scrapling
357
+ ```
358
+
359
+ This installation includes only the parser engine and its dependencies, without any fetchers or command-line dependencies.
360
+
361
+ ### Optional Dependencies
362
+
363
+ 1. If you are going to use any of the extra features below, the fetchers, or their classes, you need to install the fetchers' and browsers' dependencies as follows:
364
+ ```bash
365
+ pip install "scrapling[fetchers]"
366
+
367
+ scrapling install # normal install
368
+ scrapling install --force # force reinstall
369
+ ```
370
+
371
+ This downloads all browsers, along with their system dependencies and the fingerprint-manipulation dependencies.
372
+
373
+ Or you can install them from code instead of running the command:
374
+ ```python
375
+ from scrapling.cli import install
376
+
377
+ install([], standalone_mode=False) # normal install
378
+ install(["--force"], standalone_mode=False) # force reinstall
379
+ ```
380
+
381
+ 2. Extra features:
382
+ - Install the MCP server feature:
383
+ ```bash
384
+ pip install "scrapling[ai]"
385
+ ```
386
+ - Install the shell features (the Web Scraping shell and the `extract` command):
387
+ ```bash
388
+ pip install "scrapling[shell]"
389
+ ```
390
+ - Install everything:
391
+ ```bash
392
+ pip install "scrapling[all]"
393
+ ```
394
+ Remember that you need to install the browser dependencies with `scrapling install` after installing any of these extras (if you haven't already done so).
395
+
396
+ ### Docker
397
+ You can also install the Docker image with all the extras and browsers using the following command from DockerHub:
398
+ ```bash
399
+ docker pull pyd4vinci/scrapling
400
+ ```
401
+ Or download it from the GitHub registry:
402
+ ```bash
403
+ docker pull ghcr.io/d4vinci/scrapling:latest
404
+ ```
405
+ This image is built and pushed automatically through GitHub Actions from the repository's main branch.
406
+
407
+ ## Contributing
408
+
409
+ We welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started.
410
+
411
+ ## Disclaimer
412
+
413
+ > [!CAUTION]
414
+ > This library is provided for educational and research purposes only. By using it, you agree to comply with local and international laws regarding data scraping and privacy. The authors and contributors are not responsible for any misuse of this software. Always respect websites' terms of service and robots.txt files.
415
+
416
+ ## License
417
+
418
+ This work is licensed under the BSD-3-Clause License.
419
+
420
+ ## Acknowledgments
421
+
422
+ This project includes code adapted from:
423
+ - Parsel (BSD License): Used for the [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule
424
+
425
+ ---
426
+ <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
docs/ai/mcp-server.md ADDED
@@ -0,0 +1,294 @@
1
+ # Scrapling MCP Server Guide
2
+
3
+ <iframe width="560" height="315" src="https://www.youtube.com/embed/qyFk3ZNwOxE?si=3FHzgcYCb66iJ6e3" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
4
+
5
+ The **Scrapling MCP Server** is a new feature that brings Scrapling's powerful Web Scraping capabilities directly to your favorite AI chatbot or AI agent. This integration allows you to scrape websites, extract data, and bypass anti-bot protections conversationally through Claude's AI interface or any interface that supports MCP.
6
+
7
+ ## Features
8
+
9
+ The Scrapling MCP Server provides six powerful tools for web scraping:
10
+
11
+ ### 🚀 Basic HTTP Scraping
12
+ - **`get`**: Fast HTTP requests with browser fingerprint impersonation, generating real browser headers matching the TLS version, HTTP/3, and more!
13
+ - **`bulk_get`**: An async version of the above tool that allows scraping of multiple URLs at the same time!
14
+
15
+ ### 🌐 Dynamic Content Scraping
16
+ - **`fetch`**: Rapidly fetch dynamic content with Chromium/Chrome browser with complete control over the request/browser, and more!
17
+ - **`bulk_fetch`**: An async version of the above tool that allows scraping of multiple URLs in different browser tabs at the same time!
18
+
19
+ ### 🔒 Stealth Scraping
20
+ - **`stealthy_fetch`**: Uses our Stealthy browser to bypass Cloudflare Turnstile/Interstitial and other anti-bot systems with complete control over the request/browser!
21
+ - **`bulk_stealthy_fetch`**: An async version of the above tool that allows stealth scraping of multiple URLs in different browser tabs at the same time!
22
+
23
+ ### Key Capabilities
24
+ - **Smart Content Extraction**: Convert web pages/elements to Markdown, HTML, or extract a clean version of the text content
25
+ - **CSS Selector Support**: Use the Scrapling engine to target specific elements with precision before handing the content to the AI
26
+ - **Anti-Bot Bypass**: Handle Cloudflare Turnstile, Interstitial, and other protections
27
+ - **Proxy Support**: Use proxies for anonymity and geo-targeting
28
+ - **Browser Impersonation**: Mimic real browsers with TLS fingerprinting, real browser headers matching that version, and more
29
+ - **Parallel Processing**: Scrape multiple URLs concurrently for efficiency
30
+
31
+ #### But why use Scrapling MCP Server instead of other available tools?
32
+
33
+ Aside from its stealth capabilities and ability to bypass Cloudflare Turnstile/Interstitial, Scrapling's server is the only one that lets you select specific elements to pass to the AI, saving a lot of time and tokens!
34
+
35
+ The way other servers work is that they extract the content, then pass it all to the AI to extract the fields you want. This causes the AI to consume far more tokens than needed (from irrelevant content). Scrapling solves this problem by allowing you to pass a CSS selector to narrow down the content you want before passing it to the AI, which makes the whole process much faster and more efficient.
36
+
37
+ If you don't know how to write/use CSS selectors, don't worry. You can tell the AI in the prompt to write selectors to match possible fields for you and watch it try different combinations until it finds the right one, as we will show in the examples section.
38
+
39
+ ## Installation
40
+
41
+ Install Scrapling with MCP Support, then double-check that the browser dependencies are installed.
42
+
43
+ ```bash
44
+ # Install Scrapling with MCP server dependencies
45
+ pip install "scrapling[ai]"
46
+
47
+ # Install browser dependencies
48
+ scrapling install
49
+ ```
50
+
51
+ Or use the Docker image directly from the Docker registry:
52
+ ```bash
53
+ docker pull pyd4vinci/scrapling
54
+ ```
55
+ Or download it from the GitHub registry:
56
+ ```bash
57
+ docker pull ghcr.io/d4vinci/scrapling:latest
58
+ ```
59
+
60
+ ## Setting up the MCP Server
61
+
62
+ Here we will explain how to add Scrapling MCP Server to [Claude Desktop](https://claude.ai/download) and [Claude Code](https://www.anthropic.com/claude-code), but the same logic applies to any other chatbot that supports MCP:
63
+
64
+ ### Claude Desktop
65
+
66
+ 1. Open Claude Desktop
67
+ 2. Click the hamburger menu (☰) at the top left → Settings → Developer → Edit Config
68
+ 3. Add the Scrapling MCP server configuration:
69
+ ```json
70
+ "ScraplingServer": {
71
+ "command": "scrapling",
72
+ "args": [
73
+ "mcp"
74
+ ]
75
+ }
76
+ ```
77
+ If that's the first MCP server you're adding, set the content of the file to this:
78
+ ```json
79
+ {
80
+ "mcpServers": {
81
+ "ScraplingServer": {
82
+ "command": "scrapling",
83
+ "args": [
84
+ "mcp"
85
+ ]
86
+ }
87
+ }
88
+ }
89
+ ```
90
+ As described in the [official article](https://modelcontextprotocol.io/quickstart/user), this action either creates a new configuration file if none exists or opens your existing configuration. The file is located at:
91
+
92
+ 1. **MacOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
93
+ 2. **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
94
+
95
+ To ensure it's working, use the full path to the `scrapling` executable. Open the terminal and execute the following command:
96
+
97
+ 1. **MacOS**: `which scrapling`
98
+ 2. **Windows**: `where scrapling`
99
+
100
+ For me, on my Mac, it returned `/Users/<MyUsername>/.venv/bin/scrapling`, so the config I used in the end is:
101
+ ```json
102
+ {
103
+ "mcpServers": {
104
+ "ScraplingServer": {
105
+ "command": "/Users/<MyUsername>/.venv/bin/scrapling",
106
+ "args": [
107
+ "mcp"
108
+ ]
109
+ }
110
+ }
111
+ }
112
+ ```
113
+ #### Docker
114
+ If you are using the Docker image, then it would be something like
115
+ ```json
116
+ {
117
+ "mcpServers": {
118
+ "ScraplingServer": {
119
+ "command": "docker",
120
+ "args": [
121
+ "run", "-i", "--rm", "scrapling", "mcp"
122
+ ]
123
+ }
124
+ }
125
+ }
126
+ ```
127
+
128
+ The same logic applies to [Cursor](https://cursor.com/docs/context/mcp), [WindSurf](https://windsurf.com/university/tutorials/configuring-first-mcp-server), and others.
129
+
130
+ ### Claude Code
131
+ Here it's much simpler to do. If you have [Claude Code](https://www.anthropic.com/claude-code) installed, open the terminal and execute the following command:
132
+
133
+ ```bash
134
+ claude mcp add ScraplingServer "/Users/<MyUsername>/.venv/bin/scrapling" mcp
135
+ ```
136
+ Same as above, to get Scrapling's executable path, open the terminal and execute the following command:
137
+
138
+ 1. **MacOS**: `which scrapling`
139
+ 2. **Windows**: `where scrapling`
140
+
141
+ Here's the main article from Anthropic on [how to add MCP servers to Claude code](https://docs.anthropic.com/en/docs/claude-code/mcp#option-1%3A-add-a-local-stdio-server) for further details.
142
+
143
+
144
+ Then, after you've added the server, you need to completely quit and restart the app you used above. In Claude Desktop, you should see an MCP server indicator (🔧) in the bottom-right corner of the chat input or see `ScraplingServer` in the `Search and tools` dropdown in the chat input box.
145
+
146
+ ### Streamable HTTP
147
+ As of version 0.3.6, the MCP server can use the 'Streamable HTTP' transport mode instead of the traditional 'stdio' transport.
148
+
149
+ So instead of using the following command (the 'stdio' one):
150
+ ```bash
151
+ scrapling mcp
152
+ ```
153
+ Use the following to enable 'Streamable HTTP' transport mode:
154
+ ```bash
155
+ scrapling mcp --http
156
+ ```
157
+ By default, the server listens on host '0.0.0.0' and port 8000; both can be configured as shown below:
158
+ ```bash
159
+ scrapling mcp --http --host '127.0.0.1' --port 8000
160
+ ```
161
+
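+ If you prefer starting the server from Python instead of the CLI, the API reference in this documentation shows a `ScraplingMCPServer` class whose `serve()` method accepts the same options; a minimal sketch:
+
+ ```python
+ from scrapling.core.ai import ScraplingMCPServer
+
+ server = ScraplingMCPServer()
+ # Streamable HTTP transport on a custom host/port (mirrors the CLI flags above)
+ server.serve(http=True, host="127.0.0.1", port=8000)
+ ```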
162
+ ## Examples
163
+
164
+ Now we will show you some examples of prompts we used while testing the MCP server, though you are probably more creative and better at prompt engineering than we are :)
165
+
166
+ We will gradually go from simple prompts to more complex ones. We will use Claude Desktop for the examples, but the same logic applies to the rest, of course.
167
+
168
+ 1. **Basic Web Scraping**
169
+
170
+ Extract the main content from a webpage as Markdown:
171
+
172
+ ```
173
+ Scrape the main content from https://example.com and convert it to markdown format.
174
+ ```
175
+
176
+ Claude will use the `get` tool to fetch the page and return clean, readable content. If it fails, it will continue retrying every second for 3 attempts, unless you instruct it otherwise. If it fails to retrieve content for any reason, such as protection or if it's a dynamic website, it will automatically try the other tools. If Claude didn't do that automatically for some reason, you can add that to the prompt.
177
+
178
+ A more optimized version of the same prompt would be:
179
+ ```
180
+ Use regular requests to scrape the main content from https://example.com and convert it to markdown format.
181
+ ```
182
+ This tells Claude which tool to use here, so it doesn't have to guess. Sometimes it will start using normal requests on its own, and at other times, it will assume browsers are better suited for this website without any apparent reason. As a rule of thumb, you should always tell Claude which tool to use to save time and money and get consistent results.
183
+
184
+ 2. **Targeted Data Extraction**
185
+
186
+ Extract specific elements using CSS selectors:
187
+
188
+ ```
189
+ Get all product titles from https://shop.example.com using the CSS selector '.product-title'. If the request fails, retry up to 5 times every 10 seconds.
190
+ ```
191
+
192
+ The server will extract only the elements matching your selector and return them as a structured list. Notice I told it to set the tool to try up to 5 times in case the website has connection issues, but the default setting should be fine for most cases.
193
+
194
+ 3. **E-commerce Data Collection**
195
+
196
+ Another example of a bit more complex prompt:
197
+ ```
198
+ Extract product information from these e-commerce URLs using bulk browser fetches:
199
+ - https://shop1.com/product-a
200
+ - https://shop2.com/product-b
201
+ - https://shop3.com/product-c
202
+
203
+ Get the product names, prices, and descriptions from each page.
204
+ ```
205
+
206
+ Claude will use `bulk_fetch` to concurrently scrape all URLs, then analyze the extracted data.
207
+
208
+ 4. **More advanced workflow**
209
+
210
+ Let's say I want to get all the action games available on PlayStation's store first page right now. I can use the following prompt to do that:
211
+ ```
212
+ Extract the URLs of all games in this page, then do a bulk request to them and return a list of all action games: https://store.playstation.com/en-us/pages/browse
213
+ ```
214
+ Note that I instructed it to use a bulk request for all the URLs collected. If I hadn't mentioned it, sometimes it works as intended, and other times it makes a separate request to each URL, which takes significantly longer. This prompt takes approximately one minute to complete.
215
+
216
+ However, because I wasn't specific enough, it actually used the `stealthy_fetch` here and the `bulk_stealthy_fetch` in the second step, which unnecessarily consumed a large number of tokens. A better prompt would be:
217
+ ```
218
+ Use normal requests to extract the URLs of all games in this page, then do a bulk request to them and return a list of all action games: https://store.playstation.com/en-us/pages/browse
219
+ ```
220
+ And if you know how to write CSS selectors, you can instruct Claude to apply the selectors to the elements you want, and it will complete the task almost immediately.
221
+ ```
222
+ Use normal requests to extract the URLs of all games on the page below, then perform a bulk request to them and return a list of all action games.
223
+ The selector for games in the first page is `[href*="/concept/"]` and the selector for the genre in the second request is `[data-qa="gameInfo#releaseInformation#genre-value"]`.
224
+
225
+ URL: https://store.playstation.com/en-us/pages/browse
226
+ ```
227
+
228
+ 5. **Get data from a website with Cloudflare protection**
229
+
230
+ If you think the website you are targeting has Cloudflare protection, tell Claude instead of letting it discover it on its own.
231
+ ```
232
+ What's the price of this product? Be cautious, as it utilizes Cloudflare's Turnstile protection. Make the browser visible while you work.
233
+
234
+ https://ao.com/product/oo101uk-ninja-woodfire-outdoor-pizza-oven-brown-99357-685.aspx
235
+ ```
236
+
237
+ 6. **Long workflow**
238
+
239
+ You can, for example, use a prompt like this:
240
+ ```
241
+ Extract all product URLs for the following category, then return the prices and details for the first 3 products.
242
+
243
+ https://www.arnotts.ie/furniture/bedroom/bed-frames/
244
+ ```
245
+ But a better prompt would be:
246
+ ```
247
+ Go to the following category URL and extract all product URLs using the CSS selector "a". Then, fetch the first 3 product pages in parallel and extract each product’s price and details.
248
+
249
+ Keep the output in markdown format to reduce irrelevant content.
250
+
251
+ Category URL:
252
+ https://www.arnotts.ie/furniture/bedroom/bed-frames/
253
+ ```
254
+
255
+ And so on, you get the idea. Your creativity is the key here.
256
+
257
+ ## Best Practices
258
+
259
+ Here is some technical advice for you.
260
+
261
+ ### 1. Choose the Right Tool
262
+ - **`get`**: Fast, simple websites
263
+ - **`fetch`**: Sites with JavaScript/dynamic content
264
+ - **`stealthy_fetch`**: Protected sites, Cloudflare, anti-bot systems
265
+
266
+ ### 2. Optimize Performance
267
+ - Use bulk tools for multiple URLs
268
+ - Disable unnecessary resources
269
+ - Set appropriate timeouts
270
+ - Use CSS selectors for targeted extraction
271
+
272
+ ### 3. Handle Dynamic Content
273
+ - Use `network_idle` for SPAs
274
+ - Set `wait_selector` for specific elements
275
+ - Increase timeout for slow-loading sites
276
+
277
+ ### 4. Data Quality
278
+ - Use `main_content_only=true` to avoid navigation/ads
279
+ - Choose an appropriate `extraction_type` for your use case
280
+
281
+ ## Legal and Ethical Considerations
282
+
283
+ ⚠️ **Important Guidelines:**
284
+
285
+ - **Check robots.txt**: Visit `https://website.com/robots.txt` to see scraping rules
286
+ - **Respect rate limits**: Don't overwhelm servers with requests
287
+ - **Terms of Service**: Read and comply with website terms
288
+ - **Copyright**: Respect intellectual property rights
289
+ - **Privacy**: Be mindful of personal data protection laws
290
+ - **Commercial use**: Ensure you have permission for business purposes
291
+
292
+ ---
293
+
294
+ *Built with ❤️ by the Scrapling team. Happy scraping!*
docs/api-reference/custom-types.md ADDED
@@ -0,0 +1,26 @@
1
+ ---
2
+ search:
3
+ exclude: true
4
+ ---
5
+
6
+ # Custom Types API Reference
7
+
8
+ Here's the reference information for all the custom type classes Scrapling implements, with all their parameters, attributes, and methods.
9
+
10
+ You can import all of them directly like below:
11
+
12
+ ```python
13
+ from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
14
+ ```
15
+
16
+ ## ::: scrapling.core.custom_types.TextHandler
17
+ handler: python
18
+ :docstring:
19
+
20
+ ## ::: scrapling.core.custom_types.TextHandlers
21
+ handler: python
22
+ :docstring:
23
+
24
+ ## ::: scrapling.core.custom_types.AttributesHandler
25
+ handler: python
26
+ :docstring:
docs/api-reference/fetchers.md ADDED
@@ -0,0 +1,63 @@
1
+ ---
2
+ search:
3
+ exclude: true
4
+ ---
5
+
6
+ # Fetchers Classes
7
+
8
+ Here's the reference information for all fetcher-type classes' parameters, attributes, and methods.
9
+
10
+ You can import all of them directly like below:
11
+
12
+ ```python
13
+ from scrapling.fetchers import (
14
+ Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher,
15
+ FetcherSession, AsyncStealthySession, StealthySession, DynamicSession, AsyncDynamicSession
16
+ )
17
+ ```
18
+
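+ A quick usage sketch, mirroring the examples from the main README:
+
+ ```python
+ from scrapling.fetchers import Fetcher, StealthyFetcher
+
+ # One-off HTTP request
+ page = Fetcher.get('https://quotes.toscrape.com/')
+ quotes = page.css('.quote .text::text').getall()
+
+ # One-off stealth browser fetch for protected pages
+ page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')
+ ```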
19
+ ## ::: scrapling.fetchers.Fetcher
20
+ handler: python
21
+ :docstring:
22
+
23
+ ## ::: scrapling.fetchers.AsyncFetcher
24
+ handler: python
25
+ :docstring:
26
+
27
+ ## ::: scrapling.fetchers.DynamicFetcher
28
+ handler: python
29
+ :docstring:
30
+
31
+ ## ::: scrapling.fetchers.StealthyFetcher
32
+ handler: python
33
+ :docstring:
34
+
35
+
36
+ ## Session Classes
37
+
38
+ ### HTTP Sessions
39
+
40
+ ## ::: scrapling.fetchers.FetcherSession
41
+ handler: python
42
+ :docstring:
43
+
44
+ ### Stealth Sessions
45
+
46
+ ## ::: scrapling.fetchers.StealthySession
47
+ handler: python
48
+ :docstring:
49
+
50
+ ## ::: scrapling.fetchers.AsyncStealthySession
51
+ handler: python
52
+ :docstring:
53
+
54
+ ### Dynamic Sessions
55
+
56
+ ## ::: scrapling.fetchers.DynamicSession
57
+ handler: python
58
+ :docstring:
59
+
60
+ ## ::: scrapling.fetchers.AsyncDynamicSession
61
+ handler: python
62
+ :docstring:
63
+
docs/api-reference/mcp-server.md ADDED
@@ -0,0 +1,39 @@
1
+ ---
2
+ search:
3
+ exclude: true
4
+ ---
5
+
6
+ # MCP Server API Reference
7
+
8
+ The **Scrapling MCP Server** provides six powerful tools for web scraping through the Model Context Protocol (MCP). This server integrates Scrapling's capabilities directly into AI chatbots and agents, allowing conversational web scraping with advanced anti-bot bypass features.
9
+
10
+ You can start the MCP server by running:
11
+
12
+ ```bash
13
+ scrapling mcp
14
+ ```
15
+
16
+ Or import the server class directly:
17
+
18
+ ```python
19
+ from scrapling.core.ai import ScraplingMCPServer
20
+
21
+ server = ScraplingMCPServer()
22
+ server.serve(http=False, host="0.0.0.0", port=8000)
23
+ ```
24
+
25
+ ## Response Model
26
+
27
+ The standardized response structure that's returned by all MCP server tools:
28
+
29
+ ## ::: scrapling.core.ai.ResponseModel
30
+ handler: python
31
+ :docstring:
32
+
33
+ ## MCP Server Class
34
+
35
+ The main MCP server class that provides all web scraping tools:
36
+
37
+ ## ::: scrapling.core.ai.ScraplingMCPServer
38
+ handler: python
39
+ :docstring:
docs/api-reference/proxy-rotation.md ADDED
@@ -0,0 +1,18 @@
1
+ ---
2
+ search:
3
+ exclude: true
4
+ ---
5
+
6
+ # Proxy Rotation
7
+
8
+ The `ProxyRotator` class provides thread-safe proxy rotation for any fetcher or session.
9
+
10
+ You can import it directly like below:
11
+
12
+ ```python
13
+ from scrapling.fetchers import ProxyRotator
14
+ ```
15
+
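+ A rough illustration of where the rotator fits; the constructor arguments and how it attaches to a fetcher or session are assumptions here, so check the generated reference below for the exact API:
+
+ ```python
+ from scrapling.fetchers import ProxyRotator
+
+ # Assumption: the rotator is built from a list of proxy URLs and cycles through
+ # them (round-robin by default) for the fetcher/session it is attached to.
+ rotator = ProxyRotator([
+     "http://user:pass@proxy1.example.com:8080",
+     "http://user:pass@proxy2.example.com:8080",
+ ])
+ ```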
16
+ ## ::: scrapling.engines.toolbelt.proxy_rotation.ProxyRotator
17
+ handler: python
18
+ :docstring:
docs/api-reference/response.md ADDED
@@ -0,0 +1,18 @@
1
+ ---
2
+ search:
3
+ exclude: true
4
+ ---
5
+
6
+ # Response Class
7
+
8
+ The `Response` class wraps HTTP responses returned by all fetchers, providing access to status, headers, body, cookies, and a `Selector` for parsing.
9
+
10
+ You can import the `Response` class like below:
11
+
12
+ ```python
13
+ from scrapling.engines.toolbelt.custom import Response
14
+ ```
15
+
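+ A short sketch of how a `Response` is typically obtained and inspected; the attribute names follow the description above, so confirm them against the generated reference below:
+
+ ```python
+ from scrapling.fetchers import Fetcher
+
+ page = Fetcher.get('https://quotes.toscrape.com/')
+ print(page.status)    # HTTP status code
+ print(page.cookies)   # cookies returned with the response
+ # The response also acts as a Selector, so parsing works directly on it
+ quotes = page.css('.quote .text::text').getall()
+ ```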
16
+ ## ::: scrapling.engines.toolbelt.custom.Response
17
+ handler: python
18
+ :docstring:
docs/api-reference/selector.md ADDED
@@ -0,0 +1,25 @@
1
+ ---
2
+ search:
3
+ exclude: true
4
+ ---
5
+
6
+ # Selector Class
7
+
8
+ The `Selector` class is the core parsing engine in Scrapling that provides HTML parsing and element selection capabilities.
9
+
10
+ Here's the reference information for the `Selector` class, with all its parameters, attributes, and methods.
11
+
12
+ You can import the `Selector` class directly from `scrapling`:
13
+
14
+ ```python
15
+ from scrapling.parser import Selector
16
+ ```
17
+
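+ A minimal usage sketch, based on the examples in the main README:
+
+ ```python
+ from scrapling.parser import Selector
+
+ page = Selector("<html><body><span class='text'>Hello</span></body></html>")
+ print(page.css('.text::text').get())
+ ```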
18
+ ## ::: scrapling.parser.Selector
19
+ handler: python
20
+ :docstring:
21
+
22
+ ## ::: scrapling.parser.Selectors
23
+ handler: python
24
+ :docstring:
25
+
docs/api-reference/spiders.md ADDED
@@ -0,0 +1,42 @@
1
+ ---
2
+ search:
3
+ exclude: true
4
+ ---
5
+
6
+ # Spider Classes
7
+
8
+ Here's the reference information for the spider framework classes' parameters, attributes, and methods.
9
+
10
+ You can import them directly like below:
11
+
12
+ ```python
13
+ from scrapling.spiders import Spider, Request, CrawlResult, SessionManager, Response
14
+ ```
15
+
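+ A minimal usage sketch, based on the spider examples in the main README:
+
+ ```python
+ from scrapling.spiders import Spider, Response
+
+ class DemoSpider(Spider):
+     name = "demo"
+     start_urls = ["https://example.com/"]
+
+     async def parse(self, response: Response):
+         for item in response.css('.product'):
+             yield {"title": item.css('h2::text').get()}
+
+ result = DemoSpider().start()
+ result.items.to_json("items.json")
+ ```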
16
+ ## ::: scrapling.spiders.Spider
17
+ handler: python
18
+ :docstring:
19
+
20
+ ## ::: scrapling.spiders.Request
21
+ handler: python
22
+ :docstring:
23
+
24
+ ## Result Classes
25
+
26
+ ## ::: scrapling.spiders.result.CrawlResult
27
+ handler: python
28
+ :docstring:
29
+
30
+ ## ::: scrapling.spiders.result.CrawlStats
31
+ handler: python
32
+ :docstring:
33
+
34
+ ## ::: scrapling.spiders.result.ItemList
35
+ handler: python
36
+ :docstring:
37
+
38
+ ## Session Management
39
+
40
+ ## ::: scrapling.spiders.session.SessionManager
41
+ handler: python
42
+ :docstring:
docs/assets/cover_dark.png ADDED

Git LFS Details

  • SHA256: 8eec59d31fa1c41f1a35ee8e08a412e975eeabf1347b1bb6ca609cd454edf044
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
docs/assets/cover_dark.svg ADDED
docs/assets/cover_light.png ADDED
docs/assets/cover_light.svg ADDED
docs/assets/favicon.ico ADDED

Git LFS Details

  • SHA256: 9d2643963074a37762e2f2896b3146c7601a262838cecbcac30b69baa497d4f8
  • Pointer size: 131 Bytes
  • Size of remote file: 267 kB
docs/assets/logo.png ADDED
docs/assets/main_cover.png ADDED

Git LFS Details

  • SHA256: a80343a3e9f04e64c08c568ff2e452cccd2b24157d24b7263fc5d677d14ccc40
  • Pointer size: 131 Bytes
  • Size of remote file: 455 kB
docs/assets/scrapling_shell_curl.png ADDED

Git LFS Details

  • SHA256: 39c5c7aa963d31dc4f8584f34058600487c1941160dcfdcb8d11f1c699935c13
  • Pointer size: 131 Bytes
  • Size of remote file: 351 kB
docs/assets/spider_architecture.png ADDED

Git LFS Details

  • SHA256: 49bca39a1cb9a532074bc6530ec2b6b1ea625e7a9f042659d2bfffcb7dcee84a
  • Pointer size: 131 Bytes
  • Size of remote file: 130 kB