Merge pull request #3 from D4Vinci/dev
Browse files- .github/workflows/tests.yml +30 -4
- .gitignore +2 -0
- CONTRIBUTING.md +7 -2
- MANIFEST.in +2 -0
- README.md +397 -73
- ROADMAP.md +12 -11
- pytest.ini +1 -1
- scrapling/__init__.py +4 -3
- scrapling/core/__init__.py +0 -0
- scrapling/core/_types.py +25 -0
- scrapling/{custom_types.py → core/custom_types.py} +48 -3
- scrapling/{mixins.py → core/mixins.py} +22 -7
- scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
- scrapling/{translator.py → core/translator.py} +2 -12
- scrapling/{utils.py → core/utils.py} +2 -61
- scrapling/engines/__init__.py +7 -0
- scrapling/engines/camo.py +121 -0
- scrapling/engines/constants.py +108 -0
- scrapling/engines/pw.py +232 -0
- scrapling/engines/static.py +112 -0
- scrapling/engines/toolbelt/__init__.py +18 -0
- scrapling/engines/toolbelt/bypasses/navigator_plugins.js +40 -0
- scrapling/engines/toolbelt/bypasses/notification_permission.js +5 -0
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +5 -0
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -0
- scrapling/engines/toolbelt/bypasses/screen_props.js +27 -0
- scrapling/engines/toolbelt/bypasses/webdriver_fully.js +27 -0
- scrapling/engines/toolbelt/bypasses/window_chrome.js +213 -0
- scrapling/engines/toolbelt/custom.py +168 -0
- scrapling/engines/toolbelt/fingerprints.py +81 -0
- scrapling/engines/toolbelt/navigation.py +74 -0
- scrapling/fetchers.py +190 -0
- scrapling/parser.py +216 -51
- setup.cfg +2 -2
- setup.py +11 -5
- tests/fetchers/__init__.py +1 -0
- tests/fetchers/test_camoufox.py +62 -0
- tests/fetchers/test_httpx.py +67 -0
- tests/fetchers/test_playwright.py +74 -0
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +56 -0
- tests/{test_all_functions.py → parser/test_general.py} +11 -61
- tests/requirements.txt +7 -2
- tox.ini +5 -2
.github/workflows/tests.yml
CHANGED
|
@@ -7,15 +7,12 @@ concurrency:
|
|
| 7 |
|
| 8 |
jobs:
|
| 9 |
tests:
|
|
|
|
| 10 |
runs-on: ${{ matrix.os }}
|
| 11 |
strategy:
|
| 12 |
fail-fast: false
|
| 13 |
matrix:
|
| 14 |
include:
|
| 15 |
-
- python-version: "3.7"
|
| 16 |
-
os: ubuntu-latest
|
| 17 |
-
env:
|
| 18 |
-
TOXENV: py
|
| 19 |
- python-version: "3.8"
|
| 20 |
os: ubuntu-latest
|
| 21 |
env:
|
|
@@ -36,13 +33,42 @@ jobs:
|
|
| 36 |
os: ubuntu-latest
|
| 37 |
env:
|
| 38 |
TOXENV: py
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
steps:
|
| 41 |
- uses: actions/checkout@v4
|
|
|
|
| 42 |
- name: Set up Python ${{ matrix.python-version }}
|
| 43 |
uses: actions/setup-python@v5
|
| 44 |
with:
|
| 45 |
python-version: ${{ matrix.python-version }}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
- name: Run tests
|
| 48 |
env: ${{ matrix.env }}
|
|
|
|
| 7 |
|
| 8 |
jobs:
|
| 9 |
tests:
|
| 10 |
+
timeout-minutes: 60
|
| 11 |
runs-on: ${{ matrix.os }}
|
| 12 |
strategy:
|
| 13 |
fail-fast: false
|
| 14 |
matrix:
|
| 15 |
include:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
- python-version: "3.8"
|
| 17 |
os: ubuntu-latest
|
| 18 |
env:
|
|
|
|
| 33 |
os: ubuntu-latest
|
| 34 |
env:
|
| 35 |
TOXENV: py
|
| 36 |
+
- python-version: "3.13"
|
| 37 |
+
os: ubuntu-latest
|
| 38 |
+
env:
|
| 39 |
+
TOXENV: py
|
| 40 |
|
| 41 |
steps:
|
| 42 |
- uses: actions/checkout@v4
|
| 43 |
+
|
| 44 |
- name: Set up Python ${{ matrix.python-version }}
|
| 45 |
uses: actions/setup-python@v5
|
| 46 |
with:
|
| 47 |
python-version: ${{ matrix.python-version }}
|
| 48 |
+
cache: 'pip'
|
| 49 |
+
cache-dependency-path: |
|
| 50 |
+
setup.py
|
| 51 |
+
requirements*.txt
|
| 52 |
+
tox.ini
|
| 53 |
+
|
| 54 |
+
- name: Install Camoufox Dependencies
|
| 55 |
+
run: |
|
| 56 |
+
python3 -m pip install --upgrade pip
|
| 57 |
+
python3 -m pip install playwright camoufox
|
| 58 |
+
python3 -m playwright install chromium
|
| 59 |
+
python3 -m playwright install-deps chromium firefox
|
| 60 |
+
python3 -m camoufox fetch --browserforge
|
| 61 |
+
|
| 62 |
+
# Cache tox environments
|
| 63 |
+
- name: Cache tox environments
|
| 64 |
+
uses: actions/cache@v3
|
| 65 |
+
with:
|
| 66 |
+
path: .tox
|
| 67 |
+
# Include python version and os in cache key
|
| 68 |
+
key: tox-v1-${{ runner.os }}-py${{ matrix.python-version }}-${{ hashFiles('tox.ini', 'setup.py', 'requirements*.txt') }}
|
| 69 |
+
restore-keys: |
|
| 70 |
+
tox-v1-${{ runner.os }}-py${{ matrix.python-version }}-
|
| 71 |
+
tox-v1-${{ runner.os }}-
|
| 72 |
|
| 73 |
- name: Run tests
|
| 74 |
env: ${{ matrix.env }}
|
.gitignore
CHANGED
|
@@ -13,6 +13,8 @@ __pycache__/
|
|
| 13 |
.bootstrap
|
| 14 |
.appveyor.token
|
| 15 |
*.bak
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# installation package
|
| 18 |
*.egg-info/
|
|
|
|
| 13 |
.bootstrap
|
| 14 |
.appveyor.token
|
| 15 |
*.bak
|
| 16 |
+
*.db
|
| 17 |
+
*.db-*
|
| 18 |
|
| 19 |
# installation package
|
| 20 |
*.egg-info/
|
CONTRIBUTING.md
CHANGED
|
@@ -15,7 +15,7 @@ configfile: pytest.ini
|
|
| 15 |
plugins: cov-5.0.0, anyio-4.6.0
|
| 16 |
collected 16 items
|
| 17 |
|
| 18 |
-
tests/
|
| 19 |
|
| 20 |
=============================== 16 passed in 0.22s ================================
|
| 21 |
```
|
|
@@ -27,4 +27,9 @@ Also, consider setting `debug` to `True` while initializing the Adaptor object s
|
|
| 27 |
- Fork Scrapling [git repository](https://github.com/D4Vinci/Scrapling).
|
| 28 |
- Make your changes.
|
| 29 |
- Ensure tests work.
|
| 30 |
-
- Create a Pull Request against the [**dev**](https://github.com/D4Vinci/Scrapling/tree/dev) branch of Scrapling.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
plugins: cov-5.0.0, anyio-4.6.0
|
| 16 |
collected 16 items
|
| 17 |
|
| 18 |
+
tests/test_parser_functions.py ................ [100%]
|
| 19 |
|
| 20 |
=============================== 16 passed in 0.22s ================================
|
| 21 |
```
|
|
|
|
| 27 |
- Fork Scrapling [git repository](https://github.com/D4Vinci/Scrapling).
|
| 28 |
- Make your changes.
|
| 29 |
- Ensure tests work.
|
| 30 |
+
- Create a Pull Request against the [**dev**](https://github.com/D4Vinci/Scrapling/tree/dev) branch of Scrapling.
|
| 31 |
+
|
| 32 |
+
### Installing the latest changes from the dev branch
|
| 33 |
+
```commandline
|
| 34 |
+
pip3 install git+https://github.com/D4Vinci/Scrapling.git@dev
|
| 35 |
+
```
|
MANIFEST.in
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
include LICENSE
|
| 2 |
include *.db
|
|
|
|
| 3 |
include scrapling/*.db
|
|
|
|
| 4 |
include scrapling/py.typed
|
| 5 |
|
| 6 |
recursive-exclude * __pycache__
|
|
|
|
| 1 |
include LICENSE
|
| 2 |
include *.db
|
| 3 |
+
include *.js
|
| 4 |
include scrapling/*.db
|
| 5 |
+
include scrapling/*.db*
|
| 6 |
include scrapling/py.typed
|
| 7 |
|
| 8 |
recursive-exclude * __pycache__
|
README.md
CHANGED
|
@@ -1,30 +1,77 @@
|
|
| 1 |
-
# 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
|
| 2 |
[](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
|
| 3 |
|
| 4 |
-
Dealing with failing web scrapers due to website changes? Meet Scrapling.
|
| 5 |
|
| 6 |
-
Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives.
|
| 7 |
|
| 8 |
```python
|
| 9 |
-
from scrapling import
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
products = page.css('.product',
|
|
|
|
|
|
|
| 16 |
```
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
## Key Features
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
### Adaptive Scraping
|
| 21 |
- 🔄 **Smart Element Tracking**: Locate previously identified elements after website structure changes, using an intelligent similarity system and integrated storage.
|
| 22 |
-
- 🎯 **Flexible Querying**: Use CSS selectors, XPath, text search, or regex - chain them however you want!
|
| 23 |
- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you want on the page (Ex: other products like the product you found on the page).
|
| 24 |
-
- 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using
|
| 25 |
|
| 26 |
### Performance
|
| 27 |
-
- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries (outperforming BeautifulSoup by up to
|
| 28 |
- 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
|
| 29 |
- ⚡ **Fast JSON serialization**: 10x faster JSON serialization than the standard json library with more options.
|
| 30 |
|
|
@@ -32,23 +79,18 @@ products = page.css('.product', auto_match=True) # Still finds them!
|
|
| 32 |
- 🛠️ **Powerful Navigation API**: Traverse the DOM tree easily in all directions and get the info you want (parent, ancestors, sibling, children, next/previous element, and more).
|
| 33 |
- 🧬 **Rich Text Processing**: All strings have built-in methods for regex matching, cleaning, and more. All elements' attributes are read-only dictionaries that are faster than standard dictionaries with added methods.
|
| 34 |
- 📝 **Automatic Selector Generation**: Create robust CSS/XPath selectors for any element.
|
| 35 |
-
- 🔌 **
|
| 36 |
-
- 📘 **Type hints**: Complete type coverage for better IDE support and fewer bugs.
|
| 37 |
|
| 38 |
## Getting Started
|
| 39 |
|
| 40 |
-
Let's walk through a basic example that demonstrates a small group of Scrapling's core features:
|
| 41 |
-
|
| 42 |
```python
|
| 43 |
-
import
|
| 44 |
-
from scrapling import Adaptor
|
| 45 |
|
| 46 |
-
|
| 47 |
-
url = 'https://quotes.toscrape.com/'
|
| 48 |
-
response = requests.get(url)
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
page =
|
| 52 |
# Get all strings in the full page
|
| 53 |
page.get_all_text(ignore_tags=('script', 'style'))
|
| 54 |
|
|
@@ -56,10 +98,17 @@ page.get_all_text(ignore_tags=('script', 'style'))
|
|
| 56 |
quotes = page.css('.quote .text::text') # CSS selector
|
| 57 |
quotes = page.xpath('//span[@class="text"]/text()') # XPath
|
| 58 |
quotes = page.css('.quote').css('.text::text') # Chained selectors
|
| 59 |
-
quotes = [element.text for element in page.css('.quote
|
| 60 |
|
| 61 |
# Get the first quote element
|
| 62 |
-
quote = page.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
# Working with elements
|
| 65 |
quote.html_content # Inner HTML
|
|
@@ -67,19 +116,9 @@ quote.prettify() # Prettified version of Inner HTML
|
|
| 67 |
quote.attrib # Element attributes
|
| 68 |
quote.path # DOM path to element (List)
|
| 69 |
```
|
| 70 |
-
To keep it simple, all methods can be chained on top of each other
|
| 71 |
-
|
| 72 |
-
### Installation
|
| 73 |
-
Scrapling is a breeze to get started with - We only require at least Python 3.7 to work and the rest of the requirements are installed automatically with the package.
|
| 74 |
-
```bash
|
| 75 |
-
# Using pip
|
| 76 |
-
pip install scrapling
|
| 77 |
-
|
| 78 |
-
# Or the latest from GitHub
|
| 79 |
-
pip install git+https://github.com/D4Vinci/Scrapling.git@master
|
| 80 |
-
```
|
| 81 |
|
| 82 |
-
## Performance
|
| 83 |
|
| 84 |
Scrapling isn't just powerful - it's also blazing fast. Scrapling implements many best practices, design patterns, and numerous optimizations to save fractions of seconds. All of that while focusing exclusively on parsing HTML documents.
|
| 85 |
Here are benchmarks comparing Scrapling to popular Python libraries in two tests.
|
|
@@ -106,11 +145,150 @@ As you see, Scrapling is on par with Scrapy and slightly faster than Lxml which
|
|
| 106 |
| Scrapling | 2.51 | 1.0x |
|
| 107 |
| AutoScraper | 11.41 | 4.546x |
|
| 108 |
|
| 109 |
-
Scrapling can find elements with more methods and it returns full element `Adaptor` objects not only the text like AutoScraper. So, to make this test fair, both libraries will extract an element with text, find similar elements, and then extract the text content for all of them. As you see, Scrapling is still 4.5 times faster at same task.
|
| 110 |
|
| 111 |
> All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.
|
| 112 |
|
| 113 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
### Smart Navigation
|
| 115 |
```python
|
| 116 |
>>> quote.tag
|
|
@@ -130,24 +308,23 @@ Scrapling can find elements with more methods and it returns full element `Adapt
|
|
| 130 |
>>> quote.siblings
|
| 131 |
[<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
| 132 |
<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
| 133 |
-
<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
| 134 |
...]
|
| 135 |
|
| 136 |
>>> quote.next # gets the next element, the same logic applies to `quote.previous`
|
| 137 |
<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>
|
| 138 |
|
| 139 |
-
>>> quote.children.
|
| 140 |
-
|
| 141 |
|
| 142 |
>>> quote.has_class('quote')
|
| 143 |
True
|
| 144 |
|
| 145 |
# Generate new selectors for any element
|
| 146 |
-
>>> quote.
|
| 147 |
'body > div > div:nth-of-type(2) > div > div'
|
| 148 |
|
| 149 |
-
# Test these selectors on your favorite browser or reuse them again in the library
|
| 150 |
-
>>> quote.
|
| 151 |
'//body/div/div[2]/div/div'
|
| 152 |
```
|
| 153 |
If your case needs more than the element's parent, you can iterate over the whole ancestors' tree of any element like below
|
|
@@ -164,11 +341,9 @@ You can search for a specific ancestor of an element that satisfies a function,
|
|
| 164 |
### Content-based Selection & Finding Similar Elements
|
| 165 |
You can select elements by their text content in multiple ways, here's a full example on another website:
|
| 166 |
```python
|
| 167 |
-
>>>
|
| 168 |
-
|
| 169 |
-
>>> page = Adaptor(response.text, url=response.url)
|
| 170 |
|
| 171 |
-
>>> page.find_by_text('Tipping the Velvet') # Find the first element
|
| 172 |
<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
|
| 173 |
|
| 174 |
>>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
|
|
@@ -208,8 +383,8 @@ To increase the complexity a little bit, let's say we want to get all books' dat
|
|
| 208 |
```python
|
| 209 |
>>> for product in page.find_by_text('Tipping the Velvet').parent.parent.find_similar():
|
| 210 |
print({
|
| 211 |
-
"name": product.
|
| 212 |
-
"price": product.
|
| 213 |
"stock": product.css('.availability::text')[-1].clean()
|
| 214 |
})
|
| 215 |
{'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}
|
|
@@ -220,8 +395,6 @@ To increase the complexity a little bit, let's say we want to get all books' dat
|
|
| 220 |
The [documentation](https://github.com/D4Vinci/Scrapling/tree/main/docs/Examples) will provide more advanced examples.
|
| 221 |
|
| 222 |
### Handling Structural Changes
|
| 223 |
-
> Because [the internet archive](https://web.archive.org/) is down at the time of writing this, I can't use real websites as examples even though I tested that before (I mean browsing an old version of a website and then counting the current version of the website as structural changes)
|
| 224 |
-
|
| 225 |
Let's say you are scraping a page with a structure like this:
|
| 226 |
```html
|
| 227 |
<div class="container">
|
|
@@ -237,7 +410,7 @@ Let's say you are scraping a page with a structure like this:
|
|
| 237 |
</section>
|
| 238 |
</div>
|
| 239 |
```
|
| 240 |
-
|
| 241 |
```python
|
| 242 |
page.css('#p1')
|
| 243 |
```
|
|
@@ -262,34 +435,147 @@ When website owners implement structural changes like
|
|
| 262 |
</div>
|
| 263 |
</div>
|
| 264 |
```
|
| 265 |
-
The selector will no longer function and your code needs maintenance. That's where Scrapling auto-matching feature comes into play.
|
| 266 |
|
| 267 |
```python
|
|
|
|
| 268 |
# Before the change
|
| 269 |
-
page = Adaptor(page_source, url='example.com'
|
| 270 |
element = page.css('#p1' auto_save=True)
|
| 271 |
if not element: # One day website changes?
|
| 272 |
-
element = page.css('#p1', auto_match=True) #
|
| 273 |
# the rest of the code...
|
| 274 |
```
|
| 275 |
-
> How does the auto-matching work? Check the [FAQs](#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
|
| 277 |
**Notes:**
|
| 278 |
-
1.
|
|
|
|
| 279 |
```text
|
| 280 |
Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.
|
| 281 |
```
|
| 282 |
This behavior is purely for performance reasons so the database gets created/connected only when you are planning to use the auto-matching features. Same case with the `auto_match` argument.
|
| 283 |
|
| 284 |
-
|
| 285 |
```python
|
| 286 |
page.css('body').css('#p1', auto_match=True)
|
| 287 |
```
|
| 288 |
because you can't auto-match a whole list, you have to be specific and do something like
|
| 289 |
```python
|
| 290 |
-
page.
|
| 291 |
```
|
| 292 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
### Is That All?
|
| 294 |
Here's what else you can do with Scrapling:
|
| 295 |
|
|
@@ -300,12 +586,12 @@ Here's what else you can do with Scrapling:
|
|
| 300 |
```
|
| 301 |
- Saving and retrieving elements manually to auto-match them outside the `css` and the `xpath` methods but you have to set the identifier by yourself.
|
| 302 |
|
| 303 |
-
- To save element to the database:
|
| 304 |
```python
|
| 305 |
>>> element = page.find_by_text('Tipping the Velvet', first_match=True)
|
| 306 |
>>> page.save(element, 'my_special_element')
|
| 307 |
```
|
| 308 |
-
- Now later when you want to retrieve it and relocate it
|
| 309 |
```python
|
| 310 |
>>> element_dict = page.retrieve('my_special_element')
|
| 311 |
>>> page.relocate(element_dict, adaptor_type=True)
|
|
@@ -319,13 +605,38 @@ Here's what else you can do with Scrapling:
|
|
| 319 |
[<Element a at 0x105a2a7b0>]
|
| 320 |
```
|
| 321 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
- Doing operations on element content is the same as scrapy
|
| 323 |
```python
|
| 324 |
-
quote.re(r'
|
| 325 |
-
quote.re_first(r'
|
| 326 |
quote.json() # If the content text is jsonable, then convert it to json using `orjson` which is 10x faster than the standard json library and provides more options
|
| 327 |
```
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
|
| 330 |
|
| 331 |
- Doing operations on the text content itself includes
|
|
@@ -339,11 +650,11 @@ Here's what else you can do with Scrapling:
|
|
| 339 |
```
|
| 340 |
- Sort all characters in the string as if it were a list and return the new string
|
| 341 |
```python
|
| 342 |
-
quote.sort()
|
| 343 |
```
|
| 344 |
> To be clear, `TextHandler` is a sub-class of Python's `str` so all normal operations/methods that work with Python strings will work with it.
|
| 345 |
|
| 346 |
-
- Any element's attributes are not exactly a dictionary but a sub-class of [mapping](https://docs.python.org/3/glossary.html#term-mapping) called `AttributesHandler` that's read-only so it's faster and string values returned are actually `TextHandler` objects so all operations above can be done on them, standard dictionary operations that
|
| 347 |
- Unlike standard dictionaries, here you can search by values too and can do partial searches. It might be handy in some cases (returns a generator of matches)
|
| 348 |
```python
|
| 349 |
>>> for item in element.attrib.search_values('catalogue', partial=True):
|
|
@@ -370,8 +681,9 @@ There are a lot of deep details skipped here to make this as short as possible s
|
|
| 370 |
|
| 371 |
Note that implementing your storage system can be complex as there are some strict rules such as inheriting from the same abstract class, following the singleton design pattern used in other classes, and more. So make sure to read the docs first.
|
| 372 |
|
|
|
|
| 373 |
|
| 374 |
-
## FAQs
|
| 375 |
This section addresses common questions about Scrapling, please read this section before opening an issue.
|
| 376 |
|
| 377 |
### How does auto-matching work?
|
|
@@ -384,7 +696,7 @@ This section addresses common questions about Scrapling, please read this sectio
|
|
| 384 |
Together both are used to retrieve the element's unique properties from the database later.
|
| 385 |
4. Now later when you enable the `auto_match` parameter for both the Adaptor instance and the method call. The element properties are retrieved and Scrapling loops over all elements in the page and compares each one's unique properties to the unique properties we already have for this element and a score is calculated for each one.
|
| 386 |
5. The comparison between elements is not exact but more about finding how similar these values are, so everything is taken into consideration even the values' order like the order in which the element class names were written before and the order in which the same element class names are written now.
|
| 387 |
-
6. The score for each element is stored in the table and in the end, the element(s) with the highest combined similarity scores are returned.
|
| 388 |
|
| 389 |
### How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?
|
| 390 |
Not a big problem as it depends on your usage. The word `default` will be used in place of the URL field while saving the element's unique properties. So this will only be an issue if you used the same identifier later for a different website that you didn't pass the URL parameter while initializing it as well. The save process will overwrite the previous data and auto-matching uses the latest saved properties only.
|
|
@@ -413,7 +725,7 @@ Pretty much yeah, almost all features you get from BeautifulSoup can be found or
|
|
| 413 |
Of course, you can find elements by text/regex, find similar elements in a more reliable way than AutoScraper, and finally save/retrieve elements manually to use later as the model feature in AutoScraper. I have pulled all top articles about AutoScraper from Google and tested Scrapling against examples in them. In all examples, Scrapling got the same results as AutoScraper in much less time.
|
| 414 |
|
| 415 |
### Is Scrapling thread-safe?
|
| 416 |
-
Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its
|
| 417 |
|
| 418 |
## Sponsors
|
| 419 |
[](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
|
|
@@ -423,6 +735,10 @@ Everybody is invited and welcome to contribute to Scrapling. There is a lot to d
|
|
| 423 |
|
| 424 |
Please read the [contributing file](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before doing anything.
|
| 425 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 426 |
## License
|
| 427 |
This work is licensed under BSD-3
|
| 428 |
|
|
@@ -430,8 +746,16 @@ This work is licensed under BSD-3
|
|
| 430 |
This project includes code adapted from:
|
| 431 |
- Parsel (BSD License) - Used for [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/translator.py) submodule
|
| 432 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
## Known Issues
|
| 434 |
- In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
|
| 435 |
- Currently, Scrapling is not compatible with async/await.
|
| 436 |
|
| 437 |
-
|
|
|
|
|
|
| 1 |
+
# 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
|
| 2 |
[](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
|
| 3 |
|
| 4 |
+
Dealing with failing web scrapers due to anti-bot protections or website changes? Meet Scrapling.
|
| 5 |
|
| 6 |
+
Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
|
| 7 |
|
| 8 |
```python
|
| 9 |
+
>> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
| 10 |
+
# Fetch websites' source under the radar!
|
| 11 |
+
>> fetcher = StealthyFetcher().fetch('https://example.com', headless=True, disable_resources=True)
|
| 12 |
+
>> print(fetcher.status)
|
| 13 |
+
200
|
| 14 |
+
>> page = fetcher.adaptor
|
| 15 |
+
>> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
|
| 16 |
+
>> # Later, if the website structure changes, pass `auto_match=True`
|
| 17 |
+
>> products = page.css('.product', auto_match=True) # and Scrapling still finds them!
|
| 18 |
```
|
| 19 |
|
| 20 |
+
## Table of content
|
| 21 |
+
* [Key Features](#key-features)
|
| 22 |
+
* [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
|
| 23 |
+
* [Adaptive Scraping](#adaptive-scraping)
|
| 24 |
+
* [Performance](#performance)
|
| 25 |
+
* [Developing Experience](#developing-experience)
|
| 26 |
+
* [Getting Started](#getting-started)
|
| 27 |
+
* [Parsing Performance](#parsing-performance)
|
| 28 |
+
* [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
|
| 29 |
+
* [Extraction By Text Speed Test](#extraction-by-text-speed-test)
|
| 30 |
+
* [Installation](#installation)
|
| 31 |
+
* [Fetching Websites Features](#fetching-websites-features)
|
| 32 |
+
* [Fetcher](#fetcher)
|
| 33 |
+
* [StealthyFetcher](#stealthyfetcher)
|
| 34 |
+
* [PlayWrightFetcher](#playwrightfetcher)
|
| 35 |
+
* [Advanced Parsing Features](#advanced-parsing-features)
|
| 36 |
+
* [Smart Navigation](#smart-navigation)
|
| 37 |
+
* [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
|
| 38 |
+
* [Handling Structural Changes](#handling-structural-changes)
|
| 39 |
+
* [Real World Scenario](#real-world-scenario)
|
| 40 |
+
* [Find elements by filters](#find-elements-by-filters)
|
| 41 |
+
* [Is That All?](#is-that-all)
|
| 42 |
+
* [More Advanced Usage](#more-advanced-usage)
|
| 43 |
+
* [⚡ Enlightening Questions and FAQs](#-enlightening-questions-and-faqs)
|
| 44 |
+
* [How does auto-matching work?](#how-does-auto-matching-work)
|
| 45 |
+
* [How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?](#how-does-the-auto-matching-work-if-i-didnt-pass-a-url-while-initializing-the-adaptor-object)
|
| 46 |
+
* [If all things about an element can change or get removed, what are the unique properties to be saved?](#if-all-things-about-an-element-can-change-or-get-removed-what-are-the-unique-properties-to-be-saved)
|
| 47 |
+
* [I have enabled the `auto_save`/`auto_match` parameter while selecting and it got completely ignored with a warning message](#i-have-enabled-the-auto_saveauto_match-parameter-while-selecting-and-it-got-completely-ignored-with-a-warning-message)
|
| 48 |
+
* [I have done everything as the docs but the auto-matching didn't return anything, what's wrong?](#i-have-done-everything-as-the-docs-but-the-auto-matching-didnt-return-anything-whats-wrong)
|
| 49 |
+
* [Can Scrapling replace code built on top of BeautifulSoup4?](#can-scrapling-replace-code-built-on-top-of-beautifulsoup4)
|
| 50 |
+
* [Can Scrapling replace code built on top of AutoScraper?](#can-scrapling-replace-code-built-on-top-of-autoscraper)
|
| 51 |
+
* [Is Scrapling thread-safe?](#is-scrapling-thread-safe)
|
| 52 |
+
* [Sponsors](#sponsors)
|
| 53 |
+
* [Contributing](#contributing)
|
| 54 |
+
* [Disclaimer for Scrapling Project](#disclaimer-for-scrapling-project)
|
| 55 |
+
* [License](#license)
|
| 56 |
+
* [Acknowledgments](#acknowledgments)
|
| 57 |
+
* [Thanks and References](#thanks-and-references)
|
| 58 |
+
* [Known Issues](#known-issues)
|
| 59 |
+
|
| 60 |
## Key Features
|
| 61 |
|
| 62 |
+
### Fetch websites as you prefer
|
| 63 |
+
- **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
|
| 64 |
+
- **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
|
| 65 |
+
- **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
|
| 66 |
+
|
| 67 |
### Adaptive Scraping
|
| 68 |
- 🔄 **Smart Element Tracking**: Locate previously identified elements after website structure changes, using an intelligent similarity system and integrated storage.
|
| 69 |
+
- 🎯 **Flexible Querying**: Use CSS selectors, XPath, Elements filters, text search, or regex - chain them however you want!
|
| 70 |
- 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you want on the page (Ex: other products like the product you found on the page).
|
| 71 |
+
- 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using Scrapling powerful features.
|
| 72 |
|
| 73 |
### Performance
|
| 74 |
+
- 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries (outperforming BeautifulSoup in parsing by up to 620x in our tests).
|
| 75 |
- 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
|
| 76 |
- ⚡ **Fast JSON serialization**: 10x faster JSON serialization than the standard json library with more options.
|
| 77 |
|
|
|
|
| 79 |
- 🛠️ **Powerful Navigation API**: Traverse the DOM tree easily in all directions and get the info you want (parent, ancestors, sibling, children, next/previous element, and more).
|
| 80 |
- 🧬 **Rich Text Processing**: All strings have built-in methods for regex matching, cleaning, and more. All elements' attributes are read-only dictionaries that are faster than standard dictionaries with added methods.
|
| 81 |
- 📝 **Automatic Selector Generation**: Create robust CSS/XPath selectors for any element.
|
| 82 |
+
- 🔌 **API Similar to Scrapy/BeautifulSoup**: Familiar methods and similar pseudo-elements for Scrapy and BeautifulSoup users.
|
| 83 |
+
- 📘 **Type hints and test coverage**: Complete type coverage and almost full test coverage for better IDE support and fewer bugs, respectively.
|
| 84 |
|
| 85 |
## Getting Started
|
| 86 |
|
|
|
|
|
|
|
| 87 |
```python
|
| 88 |
+
from scrapling import Fetcher
|
|
|
|
| 89 |
|
| 90 |
+
fetcher = Fetcher(auto_match=False)
|
|
|
|
|
|
|
| 91 |
|
| 92 |
+
# Fetch a web page and create an Adaptor instance
|
| 93 |
+
page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True).adaptor
|
| 94 |
# Get all strings in the full page
|
| 95 |
page.get_all_text(ignore_tags=('script', 'style'))
|
| 96 |
|
|
|
|
| 98 |
quotes = page.css('.quote .text::text') # CSS selector
|
| 99 |
quotes = page.xpath('//span[@class="text"]/text()') # XPath
|
| 100 |
quotes = page.css('.quote').css('.text::text') # Chained selectors
|
| 101 |
+
quotes = [element.text for element in page.css('.quote .text')] # Slower than bulk query above
|
| 102 |
|
| 103 |
# Get the first quote element
|
| 104 |
+
quote = page.css_first('.quote') # / page.css('.quote').first / page.css('.quote')[0]
|
| 105 |
+
|
| 106 |
+
# Tired of selectors? Use find_all/find
|
| 107 |
+
quotes = page.find_all('div', {'class': 'quote'})
|
| 108 |
+
# Same as
|
| 109 |
+
quotes = page.find_all('div', class_='quote')
|
| 110 |
+
quotes = page.find_all(['div'], class_='quote')
|
| 111 |
+
quotes = page.find_all(class_='quote') # and so on...
|
| 112 |
|
| 113 |
# Working with elements
|
| 114 |
quote.html_content # Inner HTML
|
|
|
|
| 116 |
quote.attrib # Element attributes
|
| 117 |
quote.path # DOM path to element (List)
|
| 118 |
```
|
| 119 |
+
To keep it simple, all methods can be chained on top of each other!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
+
## Parsing Performance
|
| 122 |
|
| 123 |
Scrapling isn't just powerful - it's also blazing fast. Scrapling implements many best practices, design patterns, and numerous optimizations to save fractions of seconds. All of that while focusing exclusively on parsing HTML documents.
|
| 124 |
Here are benchmarks comparing Scrapling to popular Python libraries in two tests.
|
|
|
|
| 145 |
| Scrapling | 2.51 | 1.0x |
|
| 146 |
| AutoScraper | 11.41 | 4.546x |
|
| 147 |
|
| 148 |
+
Scrapling can find elements with more methods and it returns full element `Adaptor` objects not only the text like AutoScraper. So, to make this test fair, both libraries will extract an element with text, find similar elements, and then extract the text content for all of them. As you see, Scrapling is still 4.5 times faster at the same task.
|
| 149 |
|
| 150 |
> All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.
|
| 151 |
|
| 152 |
+
## Installation
|
| 153 |
+
Scrapling is a breeze to get started with - Starting from version 0.2, we require at least Python 3.8 to work.
|
| 154 |
+
```bash
|
| 155 |
+
pip3 install scrapling
|
| 156 |
+
```
|
| 157 |
+
- For using the `StealthyFetcher`, go to the command line and download the browser with
|
| 158 |
+
<details><summary>Windows OS</summary>
|
| 159 |
+
|
| 160 |
+
```bash
|
| 161 |
+
camoufox fetch --browserforge
|
| 162 |
+
```
|
| 163 |
+
</details>
|
| 164 |
+
<details><summary>MacOS</summary>
|
| 165 |
+
|
| 166 |
+
```bash
|
| 167 |
+
python3 -m camoufox fetch --browserforge
|
| 168 |
+
```
|
| 169 |
+
</details>
|
| 170 |
+
<details><summary>Linux</summary>
|
| 171 |
+
|
| 172 |
+
```bash
|
| 173 |
+
python -m camoufox fetch --browserforge
|
| 174 |
+
```
|
| 175 |
+
On a fresh installation of Linux, you may also need the following Firefox dependencies:
|
| 176 |
+
- Debian-based distros
|
| 177 |
+
```bash
|
| 178 |
+
sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
|
| 179 |
+
```
|
| 180 |
+
- Arch-based distros
|
| 181 |
+
```bash
|
| 182 |
+
sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
|
| 183 |
+
```
|
| 184 |
+
</details>
|
| 185 |
+
|
| 186 |
+
<small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
|
| 187 |
+
|
| 188 |
+
- If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
|
| 189 |
+
```commandline
|
| 190 |
+
playwright install chromium
|
| 191 |
+
```
|
| 192 |
+
- If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
|
| 193 |
+
```commandline
|
| 194 |
+
python -m browserforge update
|
| 195 |
+
```
|
| 196 |
+
|
| 197 |
+
## Fetching Websites Features
|
| 198 |
+
All fetcher-type classes are imported in the same way
|
| 199 |
+
```python
|
| 200 |
+
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
| 201 |
+
```
|
| 202 |
+
And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
|
| 203 |
+
> [!NOTE]
|
| 204 |
+
> The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
|
| 205 |
+
### Fetcher
|
| 206 |
+
This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.
|
| 207 |
+
|
| 208 |
+
For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browsers' headers, then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
|
| 209 |
+
```python
|
| 210 |
+
>> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
|
| 211 |
+
>> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
|
| 212 |
+
>> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
|
| 213 |
+
>> page = Fetcher().delete('https://httpbin.org/delete')
|
| 214 |
+
```
|
| 215 |
+
### StealthyFetcher
|
| 216 |
+
This class is built on top of [Camoufox](https://github.com/daijro/camoufox) which by default bypasses most of the anti-bot protections. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
|
| 217 |
+
```python
|
| 218 |
+
>> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
|
| 219 |
+
>> page.status == 200
|
| 220 |
+
True
|
| 221 |
+
```
|
| 222 |
+
<details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
|
| 223 |
+
|
| 224 |
+
| Argument | Description | Optional |
|
| 225 |
+
|:-------------------:|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:|
|
| 226 |
+
| url | Target url | ❌ |
|
| 227 |
+
| headless | Pass `True` to run the browser in headless/hidden (**default**), `virtual` to run it in virtual screen mode, or `False` for headful/visible mode. The `virtual` mode requires having `xvfb` installed. | ✔️ |
|
| 228 |
+
| block_images | Prevent the loading of images through Firefox preferences. _This can help save your proxy usage but be careful with this option as it makes some websites never finish loading._ | ✔️ |
|
| 229 |
+
| disable_resources | Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.<br/>Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. _This can help save your proxy usage but be careful with this option as it makes some websites never finish loading._ | ✔️ |
|
| 230 |
+
| google_search | Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name. | ✔️ |
|
| 231 |
+
| extra_headers | A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._ | ✔️ |
|
| 232 |
+
| block_webrtc | Blocks WebRTC entirely. | ✔️ |
|
| 233 |
+
| page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
|
| 234 |
+
| addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
|
| 235 |
+
| humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
|
| 236 |
+
| allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
|
| 237 |
+
| network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
|
| 238 |
+
| timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
|
| 239 |
+
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
|
| 240 |
+
| wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
| 241 |
+
|
| 242 |
+
</details>
|
| 243 |
+
|
| 244 |
+
This list isn't final so expect a lot more additions and flexibility to be added in the next versions!
|
| 245 |
+
|
| 246 |
+
### PlayWrightFetcher
|
| 247 |
+
This class is built on top of [Playwright](https://playwright.dev/python/) which currently provides 4 main run options but they can be mixed as you want.
|
| 248 |
+
```python
|
| 249 |
+
>> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
|
| 250 |
+
>> page.adaptor.css_first("#search a::attr(href)")
|
| 251 |
+
'https://github.com/D4Vinci/Scrapling'
|
| 252 |
+
```
|
| 253 |
+
Using this Fetcher class, you can make requests with:
|
| 254 |
+
1) Vanilla Playwright without any modifications other than the ones you chose.
|
| 255 |
+
2) Stealthy Playwright with the stealth mode I wrote for it. It's still a WIP but it bypasses many online tests like [Sannysoft's](https://bot.sannysoft.com/).</br> Some of the things this fetcher's stealth mode does include:
|
| 256 |
+
* Patching the CDP runtime fingerprint.
|
| 257 |
+
* Mimics some of the real browsers' properties by injecting several JS files and using custom options.
|
| 258 |
+
* Using custom flags on launch to hide Playwright even more and make it faster.
|
| 259 |
+
* Generates real browser headers of the same browser type and user OS, then appends them to the request's headers.
|
| 260 |
+
3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
|
| 261 |
+
4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
|
| 262 |
+
|
| 263 |
+
Add that to a lot of controlling/hiding options as you will see in the arguments list below.
|
| 264 |
+
|
| 265 |
+
<details><summary><strong>Expand this for the complete list of arguments</strong></summary>
|
| 266 |
+
|
| 267 |
+
| Argument | Description | Optional |
|
| 268 |
+
|:-------------------:|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:|
|
| 269 |
+
| url | Target url | ❌ |
|
| 270 |
+
| headless | Pass `True` to run the browser in headless/hidden (**default**), or `False` for headful/visible mode. | ✔️ |
|
| 271 |
+
| disable_resources | Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.<br/>Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. _This can help save your proxy usage but be careful with this option as it makes some websites never finish loading._ | ✔️ |
|
| 272 |
+
| useragent | Pass a useragent string to be used. **Otherwise the fetcher will generate a real Useragent of the same browser and use it.** | ✔️ |
|
| 273 |
+
| network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
|
| 274 |
+
| timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
|
| 275 |
+
| page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
|
| 276 |
+
| wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
|
| 277 |
+
| wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
|
| 278 |
+
| google_search | Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name. | ✔️ |
|
| 279 |
+
| extra_headers | A dictionary of extra headers to add to the request. The referer set by the `google_search` argument takes priority over the referer set here if used together. | ✔️ |
|
| 280 |
+
| hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
|
| 281 |
+
| disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
|
| 282 |
+
| stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
|
| 283 |
+
| cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
|
| 284 |
+
| nstbrowser_mode | Enables NSTBrowser mode, **it has to be used with the `cdp_url` argument or it will be completely ignored.** | ✔️ |
|
| 285 |
+
| nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
|
| 286 |
+
|
| 287 |
+
</details>
|
| 288 |
+
|
| 289 |
+
This list isn't final so expect a lot more additions and flexibility to be added in the next versions!
|
| 290 |
+
|
| 291 |
+
## Advanced Parsing Features
|
| 292 |
### Smart Navigation
|
| 293 |
```python
|
| 294 |
>>> quote.tag
|
|
|
|
| 308 |
>>> quote.siblings
|
| 309 |
[<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
| 310 |
<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
|
|
|
| 311 |
...]
|
| 312 |
|
| 313 |
>>> quote.next # gets the next element, the same logic applies to `quote.previous`
|
| 314 |
<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>
|
| 315 |
|
| 316 |
+
>>> quote.children.css_first(".author::text")
|
| 317 |
+
'Albert Einstein'
|
| 318 |
|
| 319 |
>>> quote.has_class('quote')
|
| 320 |
True
|
| 321 |
|
| 322 |
# Generate new selectors for any element
|
| 323 |
+
>>> quote.generate_css_selector
|
| 324 |
'body > div > div:nth-of-type(2) > div > div'
|
| 325 |
|
| 326 |
+
# Test these selectors on your favorite browser or reuse them again in the library's methods!
|
| 327 |
+
>>> quote.generate_xpath_selector
|
| 328 |
'//body/div/div[2]/div/div'
|
| 329 |
```
|
| 330 |
If your case needs more than the element's parent, you can iterate over the whole ancestors' tree of any element like below
|
|
|
|
| 341 |
### Content-based Selection & Finding Similar Elements
|
| 342 |
You can select elements by their text content in multiple ways, here's a full example on another website:
|
| 343 |
```python
|
| 344 |
+
>>> page = Fetcher().get('https://books.toscrape.com/index.html').adaptor
|
|
|
|
|
|
|
| 345 |
|
| 346 |
+
>>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
|
| 347 |
<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
|
| 348 |
|
| 349 |
>>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
|
|
|
|
| 383 |
```python
|
| 384 |
>>> for product in page.find_by_text('Tipping the Velvet').parent.parent.find_similar():
|
| 385 |
print({
|
| 386 |
+
"name": product.css_first('h3 a::text'),
|
| 387 |
+
"price": product.css_first('.price_color').re_first(r'[\d\.]+'),
|
| 388 |
"stock": product.css('.availability::text')[-1].clean()
|
| 389 |
})
|
| 390 |
{'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}
|
|
|
|
| 395 |
The [documentation](https://github.com/D4Vinci/Scrapling/tree/main/docs/Examples) will provide more advanced examples.
|
| 396 |
|
| 397 |
### Handling Structural Changes
|
|
|
|
|
|
|
| 398 |
Let's say you are scraping a page with a structure like this:
|
| 399 |
```html
|
| 400 |
<div class="container">
|
|
|
|
| 410 |
</section>
|
| 411 |
</div>
|
| 412 |
```
|
| 413 |
+
And you want to scrape the first product, the one with the `p1` ID. You will probably write a selector like this
|
| 414 |
```python
|
| 415 |
page.css('#p1')
|
| 416 |
```
|
|
|
|
| 435 |
</div>
|
| 436 |
</div>
|
| 437 |
```
|
| 438 |
+
The selector will no longer function and your code needs maintenance. That's where Scrapling's auto-matching feature comes into play.
|
| 439 |
|
| 440 |
```python
|
| 441 |
+
from scrapling import Adaptor
|
| 442 |
# Before the change
|
| 443 |
+
page = Adaptor(page_source, url='example.com')
|
| 444 |
element = page.css('#p1', auto_save=True)
|
| 445 |
if not element: # One day website changes?
|
| 446 |
+
element = page.css('#p1', auto_match=True) # Scrapling still finds it!
|
| 447 |
# the rest of the code...
|
| 448 |
```
|
| 449 |
+
> How does the auto-matching work? Check the [FAQs](#-enlightening-questions-and-faqs) section for that and other possible issues while auto-matching.
|
| 450 |
+
|
| 451 |
+
#### Real-World Scenario
|
| 452 |
+
Let's use a real website as an example and use one of the fetchers to fetch its source. To do this we need to find a website that will change its design/structure soon, take a copy of its source then wait for the website to make the change. Of course, that's nearly impossible to know unless I know the website's owner but that will make it a staged test haha.
|
| 453 |
+
|
| 454 |
+
To solve this issue, I will use [The Web Archive](https://archive.org/)'s [Wayback Machine](https://web.archive.org/). Here is a copy of [StackOverFlow's website in 2010](https://web.archive.org/web/20100102003420/http://stackoverflow.com/), pretty old huh?</br>Let's test if the automatch feature can extract the same button in the old design from 2010 and the current design using the same selector :)
|
| 455 |
+
|
| 456 |
+
If I want to extract the Questions button from the old design I can use a selector like this `#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a` This selector is too specific because it was generated by Google Chrome.
|
| 457 |
+
Now let's test the same selector in both versions
|
| 458 |
+
```python
|
| 459 |
+
>> from scrapling import Fetcher
|
| 460 |
+
>> selector = '#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a'
|
| 461 |
+
>> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
|
| 462 |
+
>> new_url = "https://stackoverflow.com/"
|
| 463 |
+
>>
|
| 464 |
+
>> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30).adaptor
|
| 465 |
+
>> element1 = page.css_first(selector, auto_save=True)
|
| 466 |
+
>>
|
| 467 |
+
>> # Same selector but used in the updated website
|
| 468 |
+
>> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url).adaptor
|
| 469 |
+
>> element2 = page.css_first(selector, auto_match=True)
|
| 470 |
+
>>
|
| 471 |
+
>> if element1.text == element2.text:
|
| 472 |
+
... print('Scrapling found the same element in the old design and the new design!')
|
| 473 |
+
'Scrapling found the same element in the old design and the new design!'
|
| 474 |
+
```
|
| 475 |
+
Note that I used a new argument called `automatch_domain`. This is because, to Scrapling, these are two different URLs rather than the same website, so it isolates their auto-match data. To tell Scrapling they are the same website, we pass the domain we want to use for saving auto-match data for both, so Scrapling doesn't isolate them.
|
| 476 |
+
|
| 477 |
+
In a real-world scenario, the code will be the same except it will use the same URL for both requests so you won't need to use the `automatch_domain` argument. This is the closest example I can give to real-world cases so I hope it didn't confuse you :)
|
| 478 |
|
| 479 |
**Notes:**
|
| 480 |
+
1. For the two examples above I used one time the `Adaptor` class and the second time the `Fetcher` class just to show you that you can create the `Adaptor` object by yourself if you have the source or fetch the source using any `Fetcher` class then it will create the `Adaptor` object for you on the `.adaptor` property.
|
| 481 |
+
2. Passing the `auto_save` argument with the `auto_match` argument set to `False` while initializing the Adaptor/Fetcher object will only result in ignoring the `auto_save` argument value and the following warning message
|
| 482 |
```text
|
| 483 |
Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.
|
| 484 |
```
|
| 485 |
This behavior is purely for performance reasons so the database gets created/connected only when you are planning to use the auto-matching features. Same case with the `auto_match` argument.
|
| 486 |
|
| 487 |
+
3. The `auto_match` parameter works only for `Adaptor` instances not `Adaptors` so if you do something like this you will get an error
|
| 488 |
```python
|
| 489 |
page.css('body').css('#p1', auto_match=True)
|
| 490 |
```
|
| 491 |
because you can't auto-match a whole list, you have to be specific and do something like
|
| 492 |
```python
|
| 493 |
+
page.css_first('body').css('#p1', auto_match=True)
|
| 494 |
```
|
| 495 |
|
| 496 |
+
### Find elements by filters
|
| 497 |
+
Inspired by BeautifulSoup's `find_all` function, you can find elements by using the `find_all`/`find` methods. Both methods can take multiple types of filters and return all elements in the page that match all of these filters.
|
| 498 |
+
|
| 499 |
+
* To be more specific:
|
| 500 |
+
* Any string passed is considered a tag name
|
| 501 |
+
* Any iterable passed like List/Tuple/Set is considered an iterable of tag names.
|
| 502 |
+
* Any dictionary is considered a mapping of HTML element(s) attribute names and attribute values.
|
| 503 |
+
* Any regex patterns passed are used as filters
|
| 504 |
+
* Any functions passed are used as filters
|
| 505 |
+
* Any keyword argument passed is considered as an HTML element attribute with its value.
|
| 506 |
+
|
| 507 |
+
So the way it works is after collecting all passed arguments and keywords, each filter passes its results to the following filter in a waterfall-like filtering system.
|
| 508 |
+
<br/>It filters all elements in the current page/element in the following order:
|
| 509 |
+
|
| 510 |
+
1. All elements with the passed tag name(s).
|
| 511 |
+
2. All elements that match all passed attribute(s).
|
| 512 |
+
3. All elements that match all passed regex patterns.
|
| 513 |
+
4. All elements that fulfill all passed function(s).
|
| 514 |
+
|
| 515 |
+
Note: The filtering process always starts from the first filter it finds in the filtering order above so if no tag name(s) are passed but attributes are passed, the process starts from that layer and so on. **But the order in which you pass the arguments doesn't matter.**
|
| 516 |
+
|
| 517 |
+
Examples to clear any confusion :)
|
| 518 |
+
|
| 519 |
+
```python
|
| 520 |
+
>> from scrapling import Fetcher
|
| 521 |
+
>> page = Fetcher().get('https://quotes.toscrape.com/').adaptor
|
| 522 |
+
# Find all elements with tag name `div`.
|
| 523 |
+
>> page.find_all('div')
|
| 524 |
+
[<data='<div class="container"> <div class="row...' parent='<body> <div class="container"> <div clas...'>,
|
| 525 |
+
<data='<div class="row header-box"> <div class=...' parent='<div class="container"> <div class="row...'>,
|
| 526 |
+
...]
|
| 527 |
+
|
| 528 |
+
# Find all div elements with a class that equals `quote`.
|
| 529 |
+
>> page.find_all('div', class_='quote')
|
| 530 |
+
[<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
| 531 |
+
<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
| 532 |
+
...]
|
| 533 |
+
|
| 534 |
+
# Same as above.
|
| 535 |
+
>> page.find_all('div', {'class': 'quote'})
|
| 536 |
+
[<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
| 537 |
+
<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
| 538 |
+
...]
|
| 539 |
+
|
| 540 |
+
# Find all elements with a class that equals `quote`.
|
| 541 |
+
>> page.find_all({'class': 'quote'})
|
| 542 |
+
[<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
| 543 |
+
<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
| 544 |
+
...]
|
| 545 |
+
|
| 546 |
+
# Find all div elements with a class that equals `quote`, and contains the element `.text` which contains the word 'world' in its content.
|
| 547 |
+
>> page.find_all('div', {'class': 'quote'}, lambda e: "world" in e.css_first('.text::text'))
|
| 548 |
+
[<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>]
|
| 549 |
+
|
| 550 |
+
# Find all elements that have at least one child element.
|
| 551 |
+
>> page.find_all(lambda element: len(element.children) > 0)
|
| 552 |
+
[<data='<html lang="en"><head><meta charset="UTF...'>,
|
| 553 |
+
<data='<head><meta charset="UTF-8"><title>Quote...' parent='<html lang="en"><head><meta charset="UTF...'>,
|
| 554 |
+
<data='<body> <div class="container"> <div clas...' parent='<html lang="en"><head><meta charset="UTF...'>,
|
| 555 |
+
...]
|
| 556 |
+
|
| 557 |
+
# Find all elements that contain the word 'world' in its content.
|
| 558 |
+
>> page.find_all(lambda element: "world" in element.text)
|
| 559 |
+
[<data='<span class="text" itemprop="text">“The...' parent='<div class="quote" itemscope itemtype="h...'>,
|
| 560 |
+
<data='<a class="tag" href="/tag/world/page/1/"...' parent='<div class="tags"> Tags: <meta class="ke...'>]
|
| 561 |
+
|
| 562 |
+
# Find all span elements that match the given regex
|
| 563 |
+
>> page.find_all('span', re.compile(r'world'))
|
| 564 |
+
[<data='<span class="text" itemprop="text">“The...' parent='<div class="quote" itemscope itemtype="h...'>]
|
| 565 |
+
|
| 566 |
+
# Find all div and span elements with class 'quote' (No span elements like that so only div returned)
|
| 567 |
+
>> page.find_all(['div', 'span'], {'class': 'quote'})
|
| 568 |
+
[<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
| 569 |
+
<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
|
| 570 |
+
...]
|
| 571 |
+
|
| 572 |
+
# Mix things up
|
| 573 |
+
>> page.find_all({'itemtype':"http://schema.org/CreativeWork"}, 'div').css('.author::text')
|
| 574 |
+
['Albert Einstein',
|
| 575 |
+
'J.K. Rowling',
|
| 576 |
+
...]
|
| 577 |
+
```
|
| 578 |
+
|
| 579 |
### Is That All?
|
| 580 |
Here's what else you can do with Scrapling:
|
| 581 |
|
|
|
|
| 586 |
```
|
| 587 |
- Saving and retrieving elements manually to auto-match them outside the `css` and the `xpath` methods but you have to set the identifier by yourself.
|
| 588 |
|
| 589 |
+
- To save an element to the database:
|
| 590 |
```python
|
| 591 |
>>> element = page.find_by_text('Tipping the Velvet', first_match=True)
|
| 592 |
>>> page.save(element, 'my_special_element')
|
| 593 |
```
|
| 594 |
+
- Now later when you want to retrieve it and relocate it inside the page with auto-matching, it would be like this
|
| 595 |
```python
|
| 596 |
>>> element_dict = page.retrieve('my_special_element')
|
| 597 |
>>> page.relocate(element_dict, adaptor_type=True)
|
|
|
|
| 605 |
[<Element a at 0x105a2a7b0>]
|
| 606 |
```
|
| 607 |
|
| 608 |
+
- Filtering results based on a function
|
| 609 |
+
```python
|
| 610 |
+
# Find all products over $50
|
| 611 |
+
expensive_products = page.css('.product_pod').filter(
|
| 612 |
+
lambda p: float(p.css('.price_color').re_first(r'[\d\.]+')) > 50
|
| 613 |
+
)
|
| 614 |
+
```
|
| 615 |
+
|
| 616 |
+
- Searching results for the first one that matches a function
|
| 617 |
+
```python
|
| 618 |
+
# Find the first product whose price is '54.23'
|
| 619 |
+
page.css('.product_pod').search(
|
| 620 |
+
lambda p: float(p.css('.price_color').re_first(r'[\d\.]+')) == 54.23
|
| 621 |
+
)
|
| 622 |
+
```
|
| 623 |
+
|
| 624 |
- Doing operations on element content is the same as scrapy
|
| 625 |
```python
|
| 626 |
+
quote.re(r'regex_pattern') # Get all strings (TextHandlers) that match the regex pattern
|
| 627 |
+
quote.re_first(r'regex_pattern') # Get the first string (TextHandler) only
|
| 628 |
quote.json() # If the content text is jsonable, then convert it to json using `orjson` which is 10x faster than the standard json library and provides more options
|
| 629 |
```
|
| 630 |
+
except that you can do more with them like
|
| 631 |
+
```python
|
| 632 |
+
quote.re(
|
| 633 |
+
r'regex_pattern',
|
| 634 |
+
replace_entities=True, # Character entity references are replaced by their corresponding character
|
| 635 |
+
clean_match=True, # This will ignore all whitespaces and consecutive spaces while matching
|
| 636 |
+
case_sensitive= False, # Set the regex to ignore letters case while compiling it
|
| 637 |
+
)
|
| 638 |
+
```
|
| 639 |
+
All of these methods come from the `TextHandler` class that holds the text content, so the same operations can be performed directly on the result of the `.text` property or an equivalent selector function.
|
| 640 |
|
| 641 |
|
| 642 |
- Doing operations on the text content itself includes
|
|
|
|
| 650 |
```
|
| 651 |
- Sort all characters in the string as if it were a list and return the new string
|
| 652 |
```python
|
| 653 |
+
quote.sort(reverse=False)
|
| 654 |
```
|
| 655 |
> To be clear, `TextHandler` is a sub-class of Python's `str` so all normal operations/methods that work with Python strings will work with it.
|
| 656 |
|
| 657 |
+
- Any element's attributes are not exactly a dictionary but a sub-class of [mapping](https://docs.python.org/3/glossary.html#term-mapping) called `AttributesHandler` that's read-only so it's faster and string values returned are actually `TextHandler` objects so all operations above can be done on them, standard dictionary operations that don't modify the data, and more :)
|
| 658 |
- Unlike standard dictionaries, here you can search by values too and can do partial searches. It might be handy in some cases (returns a generator of matches)
|
| 659 |
```python
|
| 660 |
>>> for item in element.attrib.search_values('catalogue', partial=True):
|
|
|
|
| 681 |
|
| 682 |
Note that implementing your storage system can be complex as there are some strict rules such as inheriting from the same abstract class, following the singleton design pattern used in other classes, and more. So make sure to read the docs first.
|
| 683 |
|
| 684 |
+
To give detailed documentation of the library, it will need a website. I'm trying to rush creating the website, researching new ideas, and adding more features/tests/benchmarks but time is tight with too many spinning plates between work, personal life, and working on Scrapling. But you can help by using the [sponsor button](https://github.com/sponsors/D4Vinci) above :)
|
| 685 |
|
| 686 |
+
## ⚡ Enlightening Questions and FAQs
|
| 687 |
This section addresses common questions about Scrapling, please read this section before opening an issue.
|
| 688 |
|
| 689 |
### How does auto-matching work?
|
|
|
|
| 696 |
Together both are used to retrieve the element's unique properties from the database later.
|
| 697 |
4. Now later, when you enable the `auto_match` parameter for both the Adaptor instance and the method call, the element properties are retrieved and Scrapling loops over all elements in the page and compares each one's unique properties to the unique properties we already have for this element and a score is calculated for each one.
|
| 698 |
5. The comparison between elements is not exact but more about finding how similar these values are, so everything is taken into consideration even the values' order like the order in which the element class names were written before and the order in which the same element class names are written now.
|
| 699 |
+
6. The score for each element is stored in the table, and in the end, the element(s) with the highest combined similarity scores are returned.
|
| 700 |
|
| 701 |
### How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?
|
| 702 |
Not a big problem as it depends on your usage. The word `default` will be used in place of the URL field while saving the element's unique properties. So this will only be an issue if you used the same identifier later for a different website that you didn't pass the URL parameter while initializing it as well. The save process will overwrite the previous data and auto-matching uses the latest saved properties only.
|
|
|
|
| 725 |
Of course, you can find elements by text/regex, find similar elements in a more reliable way than AutoScraper, and finally save/retrieve elements manually to use later as the model feature in AutoScraper. I have pulled all top articles about AutoScraper from Google and tested Scrapling against examples in them. In all examples, Scrapling got the same results as AutoScraper in much less time.
|
| 726 |
|
| 727 |
### Is Scrapling thread-safe?
|
| 728 |
+
Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.
|
| 729 |
|
| 730 |
## Sponsors
|
| 731 |
[](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
|
|
|
|
| 735 |
|
| 736 |
Please read the [contributing file](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before doing anything.
|
| 737 |
|
| 738 |
+
## Disclaimer for Scrapling Project
|
| 739 |
+
> [!CAUTION]
|
| 740 |
+
> This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international laws regarding data scraping and privacy. The authors and contributors are not responsible for any misuse of this software. This library should not be used to violate the rights of others, for unethical purposes, or to use data in an unauthorized or illegal manner. Do not use it on any website unless you have permission from the website owner or within their allowed rules like the `robots.txt` file, for example.
|
| 741 |
+
|
| 742 |
## License
|
| 743 |
This work is licensed under BSD-3
|
| 744 |
|
|
|
|
| 746 |
This project includes code adapted from:
|
| 747 |
- Parsel (BSD License) - Used for [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/translator.py) submodule
|
| 748 |
|
| 749 |
+
## Thanks and References
|
| 750 |
+
- [Daijro](https://github.com/daijro)'s brilliant work on both [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
|
| 751 |
+
- [Vinyzu](https://github.com/Vinyzu)'s work on Playwright's mock on [Botright](https://github.com/Vinyzu/Botright)
|
| 752 |
+
- [brotector](https://github.com/kaliiiiiiiiii/brotector)
|
| 753 |
+
- [fakebrowser](https://github.com/kkoooqq/fakebrowser)
|
| 754 |
+
- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches)
|
| 755 |
+
|
| 756 |
## Known Issues
|
| 757 |
- In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
|
| 758 |
- Currently, Scrapling is not compatible with async/await.
|
| 759 |
|
| 760 |
+
---
|
| 761 |
+
<div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
|
ROADMAP.md
CHANGED
|
@@ -1,13 +1,14 @@
|
|
| 1 |
## TODOs
|
| 2 |
-
- Add more tests and increase the code coverage.
|
| 3 |
-
- Structure the tests folder in a better way.
|
| 4 |
-
- Add more documentation.
|
| 5 |
-
- Add the browsing ability.
|
| 6 |
-
- Create detailed documentation for 'readthedocs' website, preferably add Github action for deploying it.
|
| 7 |
-
- Create a Scrapy plugin/decorator to make it replace parsel in the response argument when needed.
|
| 8 |
-
- Need to add more functionality to `AttributesHandler` and more navigation functions to `Adaptor` object (ex: functions similar to map, filter, and reduce functions but here pass it to the element and the function is executed on children, siblings, next elements, etc...)
|
| 9 |
-
- Add `.filter` method to `Adaptors` object and other similar methods.
|
| 10 |
-
- Add functionality to automatically detect pagination URLs
|
| 11 |
-
- Add the ability to auto-detect schemas in pages and manipulate them
|
| 12 |
-
- Add ability
|
|
|
|
| 13 |
-
|
|
|
|
| 1 |
## TODOs
|
| 2 |
+
- [x] Add more tests and increase the code coverage.
|
| 3 |
+
- [x] Structure the tests folder in a better way.
|
| 4 |
+
- [ ] Add more documentation.
|
| 5 |
+
- [x] Add the browsing ability.
|
| 6 |
+
- [ ] Create detailed documentation for 'readthedocs' website, preferably add Github action for deploying it.
|
| 7 |
+
- [ ] Create a Scrapy plugin/decorator to make it replace parsel in the response argument when needed.
|
| 8 |
+
- [ ] Need to add more functionality to `AttributesHandler` and more navigation functions to `Adaptor` object (ex: functions similar to map, filter, and reduce functions but here pass it to the element and the function is executed on children, siblings, next elements, etc...)
|
| 9 |
+
- [x] Add `.filter` method to `Adaptors` object and other similar methods.
|
| 10 |
+
- [ ] Add functionality to automatically detect pagination URLs
|
| 11 |
+
- [ ] Add the ability to auto-detect schemas in pages and manipulate them.
|
| 12 |
+
- [ ] Add `analyzer` ability that tries to learn about the page through meta elements and return what it learned
|
| 13 |
+
- [ ] Add ability to generate a regex from a group of elements (Like for all href attributes)
|
| 14 |
-
|
pytest.ini
CHANGED
|
@@ -1,2 +1,2 @@
|
|
| 1 |
[pytest]
|
| 2 |
-
addopts = -p no:warnings --doctest-modules --ignore=setup.py
|
|
|
|
| 1 |
[pytest]
|
| 2 |
+
addopts = -p no:warnings --doctest-modules --ignore=setup.py --verbose
|
scrapling/__init__.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
| 1 |
# Declare top-level shortcuts
|
|
|
|
| 2 |
from scrapling.parser import Adaptor, Adaptors
|
| 3 |
-
from scrapling.custom_types import TextHandler, AttributesHandler
|
| 4 |
|
| 5 |
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
| 6 |
-
__version__ = "0.
|
| 7 |
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
| 8 |
|
| 9 |
|
| 10 |
-
__all__ = ['Adaptor', '
|
|
|
|
| 1 |
# Declare top-level shortcuts
|
| 2 |
+
from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher, CustomFetcher
|
| 3 |
from scrapling.parser import Adaptor, Adaptors
|
| 4 |
+
from scrapling.core.custom_types import TextHandler, AttributesHandler
|
| 5 |
|
| 6 |
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
| 7 |
+
__version__ = "0.2"
|
| 8 |
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
| 9 |
|
| 10 |
|
| 11 |
+
__all__ = ['Adaptor', 'Fetcher', 'StealthyFetcher', 'PlayWrightFetcher']
|
scrapling/core/__init__.py
ADDED
|
File without changes
|
scrapling/core/_types.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Type definitions for type checking purposes.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import (
|
| 6 |
+
Dict, Optional, Union, Callable, Any, List, Tuple, Pattern, Generator, Iterable, Type, TYPE_CHECKING, Literal
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from typing import Protocol
|
| 11 |
+
except ImportError:
|
| 12 |
+
# Added in Python 3.8
|
| 13 |
+
Protocol = object
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
from typing import SupportsIndex
|
| 17 |
+
except ImportError:
|
| 18 |
+
# 'SupportsIndex' got added in Python 3.8
|
| 19 |
+
SupportsIndex = None
|
| 20 |
+
|
| 21 |
+
if TYPE_CHECKING:
|
| 22 |
+
# typing.Self requires Python 3.11
|
| 23 |
+
from typing_extensions import Self
|
| 24 |
+
else:
|
| 25 |
+
Self = object
|
scrapling/{custom_types.py → core/custom_types.py}
RENAMED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import re
|
| 2 |
from types import MappingProxyType
|
| 3 |
from collections.abc import Mapping
|
| 4 |
-
from typing import Dict, List, Union, Pattern
|
| 5 |
|
| 6 |
-
from scrapling.utils import _is_iterable, flatten
|
|
|
|
| 7 |
|
| 8 |
from orjson import loads, dumps
|
| 9 |
from w3lib.html import replace_entities as _replace_entities
|
|
@@ -69,7 +69,7 @@ class TextHandler(str):
|
|
| 69 |
return [TextHandler(_replace_entities(s)) for s in results]
|
| 70 |
|
| 71 |
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
| 72 |
-
clean_match: bool = False, case_sensitive: bool = False
|
| 73 |
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
| 74 |
|
| 75 |
:param regex: Can be either a compiled regular expression or a string.
|
|
@@ -83,6 +83,51 @@ class TextHandler(str):
|
|
| 83 |
return result[0] if result else default
|
| 84 |
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
class AttributesHandler(Mapping):
|
| 87 |
"""A read-only mapping to use instead of the standard dictionary for the speed boost but
|
| 88 |
at the same time I use it to add more functionalities.
|
|
|
|
| 1 |
import re
|
| 2 |
from types import MappingProxyType
|
| 3 |
from collections.abc import Mapping
|
|
|
|
| 4 |
|
| 5 |
+
from scrapling.core.utils import _is_iterable, flatten
|
| 6 |
+
from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex
|
| 7 |
|
| 8 |
from orjson import loads, dumps
|
| 9 |
from w3lib.html import replace_entities as _replace_entities
|
|
|
|
| 69 |
return [TextHandler(_replace_entities(s)) for s in results]
|
| 70 |
|
| 71 |
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
| 72 |
+
clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
|
| 73 |
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
| 74 |
|
| 75 |
:param regex: Can be either a compiled regular expression or a string.
|
|
|
|
| 83 |
return result[0] if result else default
|
| 84 |
|
| 85 |
|
| 86 |
+
class TextHandlers(List[TextHandler]):
|
| 87 |
+
"""
|
| 88 |
+
The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
|
| 89 |
+
"""
|
| 90 |
+
__slots__ = ()
|
| 91 |
+
|
| 92 |
+
def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[TextHandler, "TextHandlers[TextHandler]"]:
|
| 93 |
+
lst = super().__getitem__(pos)
|
| 94 |
+
if isinstance(pos, slice):
|
| 95 |
+
return self.__class__(lst)
|
| 96 |
+
else:
|
| 97 |
+
return lst
|
| 98 |
+
|
| 99 |
+
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
|
| 100 |
+
case_sensitive: bool = False) -> 'List[str]':
|
| 101 |
+
"""Call the ``.re()`` method for each element in this list and return
|
| 102 |
+
their results flattened as TextHandlers.
|
| 103 |
+
|
| 104 |
+
:param regex: Can be either a compiled regular expression or a string.
|
| 105 |
+
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 106 |
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 107 |
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
| 108 |
+
"""
|
| 109 |
+
results = [
|
| 110 |
+
n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
|
| 111 |
+
]
|
| 112 |
+
return flatten(results)
|
| 113 |
+
|
| 114 |
+
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
| 115 |
+
clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
|
| 116 |
+
"""Call the ``.re_first()`` method for each element in this list and return
|
| 117 |
+
the first result or the default value otherwise.
|
| 118 |
+
|
| 119 |
+
:param regex: Can be either a compiled regular expression or a string.
|
| 120 |
+
:param default: The default value to be returned if there is no match
|
| 121 |
+
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 122 |
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 123 |
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
| 124 |
+
"""
|
| 125 |
+
for n in self:
|
| 126 |
+
for result in n.re(regex, replace_entities, clean_match, case_sensitive):
|
| 127 |
+
return result
|
| 128 |
+
return default
|
| 129 |
+
|
| 130 |
+
|
| 131 |
class AttributesHandler(Mapping):
|
| 132 |
"""A read-only mapping to use instead of the standard dictionary for the speed boost but
|
| 133 |
at the same time I use it to add more functionalities.
|
scrapling/{mixins.py → core/mixins.py}
RENAMED
|
@@ -4,7 +4,7 @@ class SelectorsGeneration:
|
|
| 4 |
Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
|
| 5 |
Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
|
| 6 |
|
| 7 |
-
def __general_selection(self, selection: str = 'css') -> str:
|
| 8 |
"""Generate a selector for the current element.
|
| 9 |
:return: A string of the generated selector.
|
| 10 |
"""
|
|
@@ -20,10 +20,11 @@ class SelectorsGeneration:
|
|
| 20 |
else f"[@id='{target.attrib['id']}']"
|
| 21 |
)
|
| 22 |
selectorPath.append(part)
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
| 27 |
else:
|
| 28 |
part = f'{target.tag}'
|
| 29 |
# We won't use classes anymore because some websites share exact classes between elements
|
|
@@ -60,15 +61,29 @@ class SelectorsGeneration:
|
|
| 60 |
)
|
| 61 |
|
| 62 |
@property
|
| 63 |
-
def
|
| 64 |
"""Generate a CSS selector for the current element
|
| 65 |
:return: A string of the generated selector.
|
| 66 |
"""
|
| 67 |
return self.__general_selection()
|
| 68 |
|
| 69 |
@property
|
| 70 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
"""Generate a XPath selector for the current element
|
| 72 |
:return: A string of the generated selector.
|
| 73 |
"""
|
| 74 |
return self.__general_selection('xpath')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
|
| 5 |
Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
|
| 6 |
|
| 7 |
+
def __general_selection(self, selection: str = 'css', full_path=False) -> str:
|
| 8 |
"""Generate a selector for the current element.
|
| 9 |
:return: A string of the generated selector.
|
| 10 |
"""
|
|
|
|
| 20 |
else f"[@id='{target.attrib['id']}']"
|
| 21 |
)
|
| 22 |
selectorPath.append(part)
|
| 23 |
+
if not full_path:
|
| 24 |
+
return (
|
| 25 |
+
" > ".join(reversed(selectorPath)) if css
|
| 26 |
+
else '//*' + "/".join(reversed(selectorPath))
|
| 27 |
+
)
|
| 28 |
else:
|
| 29 |
part = f'{target.tag}'
|
| 30 |
# We won't use classes anymore because some websites share exact classes between elements
|
|
|
|
| 61 |
)
|
| 62 |
|
| 63 |
@property
|
| 64 |
+
def generate_css_selector(self) -> str:
|
| 65 |
"""Generate a CSS selector for the current element
|
| 66 |
:return: A string of the generated selector.
|
| 67 |
"""
|
| 68 |
return self.__general_selection()
|
| 69 |
|
| 70 |
@property
|
| 71 |
+
def generate_full_css_selector(self) -> str:
|
| 72 |
+
"""Generate a complete CSS selector for the current element
|
| 73 |
+
:return: A string of the generated selector.
|
| 74 |
+
"""
|
| 75 |
+
return self.__general_selection(full_path=True)
|
| 76 |
+
|
| 77 |
+
@property
|
| 78 |
+
def generate_xpath_selector(self) -> str:
|
| 79 |
"""Generate a XPath selector for the current element
|
| 80 |
:return: A string of the generated selector.
|
| 81 |
"""
|
| 82 |
return self.__general_selection('xpath')
|
| 83 |
+
|
| 84 |
+
@property
|
| 85 |
+
def generate_full_xpath_selector(self) -> str:
|
| 86 |
+
"""Generate a complete XPath selector for the current element
|
| 87 |
+
:return: A string of the generated selector.
|
| 88 |
+
"""
|
| 89 |
+
return self.__general_selection('xpath', full_path=True)
|
scrapling/{storage_adaptors.py → core/storage_adaptors.py}
RENAMED
|
@@ -4,9 +4,9 @@ import logging
|
|
| 4 |
import threading
|
| 5 |
from hashlib import sha256
|
| 6 |
from abc import ABC, abstractmethod
|
| 7 |
-
from typing import Dict, Optional, Union
|
| 8 |
|
| 9 |
-
from scrapling.
|
|
|
|
| 10 |
|
| 11 |
from lxml import html
|
| 12 |
from tldextract import extract as tld
|
|
|
|
| 4 |
import threading
|
| 5 |
from hashlib import sha256
|
| 6 |
from abc import ABC, abstractmethod
|
|
|
|
| 7 |
|
| 8 |
+
from scrapling.core._types import Dict, Optional, Union
|
| 9 |
+
from scrapling.core.utils import _StorageTools, cache
|
| 10 |
|
| 11 |
from lxml import html
|
| 12 |
from tldextract import extract as tld
|
scrapling/{translator.py → core/translator.py}
RENAMED
|
@@ -9,24 +9,14 @@ which will be important in future releases but most importantly...
|
|
| 9 |
import re
|
| 10 |
|
| 11 |
from w3lib.html import HTML5_WHITESPACE
|
| 12 |
-
from
|
| 13 |
-
|
| 14 |
-
from typing import Protocol
|
| 15 |
-
except ImportError:
|
| 16 |
-
# Added in Python 3.8
|
| 17 |
-
Protocol = object
|
| 18 |
-
|
| 19 |
-
from scrapling.utils import cache
|
| 20 |
|
| 21 |
from cssselect.xpath import ExpressionError
|
| 22 |
from cssselect.xpath import XPathExpr as OriginalXPathExpr
|
| 23 |
from cssselect import HTMLTranslator as OriginalHTMLTranslator
|
| 24 |
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
|
| 25 |
|
| 26 |
-
if TYPE_CHECKING:
|
| 27 |
-
# typing.Self requires Python 3.11
|
| 28 |
-
from typing_extensions import Self
|
| 29 |
-
|
| 30 |
|
| 31 |
regex = f"[{HTML5_WHITESPACE}]+"
|
| 32 |
replace_html5_whitespaces = re.compile(regex).sub
|
|
|
|
| 9 |
import re
|
| 10 |
|
| 11 |
from w3lib.html import HTML5_WHITESPACE
|
| 12 |
+
from scrapling.core.utils import cache
|
| 13 |
+
from scrapling.core._types import Any, Optional, Protocol, Self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
from cssselect.xpath import ExpressionError
|
| 16 |
from cssselect.xpath import XPathExpr as OriginalXPathExpr
|
| 17 |
from cssselect import HTMLTranslator as OriginalHTMLTranslator
|
| 18 |
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
regex = f"[{HTML5_WHITESPACE}]+"
|
| 22 |
replace_html5_whitespaces = re.compile(regex).sub
|
scrapling/{utils.py → core/utils.py}
RENAMED
|
@@ -1,14 +1,13 @@
|
|
| 1 |
import re
|
| 2 |
-
import os
|
| 3 |
import logging
|
| 4 |
from itertools import chain
|
| 5 |
-
from logging import handlers
|
| 6 |
# Using cache on top of a class is a brilliant way to achieve the Singleton design pattern without much code
|
| 7 |
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 8 |
|
| 9 |
-
from
|
| 10 |
|
| 11 |
from lxml import html
|
|
|
|
| 12 |
html_forbidden = {html.HtmlComment, }
|
| 13 |
logging.basicConfig(
|
| 14 |
level=logging.ERROR,
|
|
@@ -45,64 +44,6 @@ def _is_iterable(s: Any):
|
|
| 45 |
return isinstance(s, (list, tuple,))
|
| 46 |
|
| 47 |
|
| 48 |
-
@cache(None, typed=True)
|
| 49 |
-
class _Logger(object):
|
| 50 |
-
# I will leave this class here for now in case I decide I want to come back to use it :)
|
| 51 |
-
__slots__ = ('console_logger', 'logger_file_path',)
|
| 52 |
-
levels = {
|
| 53 |
-
'debug': logging.DEBUG,
|
| 54 |
-
'info': logging.INFO,
|
| 55 |
-
'warning': logging.WARNING,
|
| 56 |
-
'error': logging.ERROR,
|
| 57 |
-
'critical': logging.CRITICAL
|
| 58 |
-
}
|
| 59 |
-
|
| 60 |
-
def __init__(self, filename: str = 'debug.log', level: str = 'debug', when: str = 'midnight', backcount: int = 1):
|
| 61 |
-
os.makedirs(os.path.join(os.path.dirname(__file__), 'logs'), exist_ok=True)
|
| 62 |
-
format_str = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
|
| 63 |
-
|
| 64 |
-
# on-screen output
|
| 65 |
-
lvl = self.levels[level.lower()]
|
| 66 |
-
self.console_logger = logging.getLogger('Scrapling')
|
| 67 |
-
self.console_logger.setLevel(lvl)
|
| 68 |
-
console_handler = logging.StreamHandler()
|
| 69 |
-
console_handler.setLevel(lvl)
|
| 70 |
-
console_handler.setFormatter(format_str)
|
| 71 |
-
self.console_logger.addHandler(console_handler)
|
| 72 |
-
|
| 73 |
-
if lvl == logging.DEBUG:
|
| 74 |
-
filename = os.path.join(os.path.dirname(__file__), 'logs', filename)
|
| 75 |
-
self.logger_file_path = filename
|
| 76 |
-
# Automatically generates the logging file at specified intervals
|
| 77 |
-
file_handler = handlers.TimedRotatingFileHandler(
|
| 78 |
-
# If more than (backcount+1) existed, oldest logs will be deleted
|
| 79 |
-
filename=filename, when=when, backupCount=backcount, encoding='utf-8'
|
| 80 |
-
)
|
| 81 |
-
file_handler.setLevel(lvl)
|
| 82 |
-
file_handler.setFormatter(format_str)
|
| 83 |
-
# This for the logger when it appends the date to the new log
|
| 84 |
-
file_handler.namer = lambda name: name.replace(".log", "") + ".log"
|
| 85 |
-
self.console_logger.addHandler(file_handler)
|
| 86 |
-
self.debug(f'Debug log path: {self.logger_file_path}')
|
| 87 |
-
else:
|
| 88 |
-
self.logger_file_path = None
|
| 89 |
-
|
| 90 |
-
def debug(self, message: str) -> None:
|
| 91 |
-
self.console_logger.debug(message)
|
| 92 |
-
|
| 93 |
-
def info(self, message: str) -> None:
|
| 94 |
-
self.console_logger.info(message)
|
| 95 |
-
|
| 96 |
-
def warning(self, message: str) -> None:
|
| 97 |
-
self.console_logger.warning(message)
|
| 98 |
-
|
| 99 |
-
def error(self, message: str) -> None:
|
| 100 |
-
self.console_logger.error(message)
|
| 101 |
-
|
| 102 |
-
def critical(self, message: str) -> None:
|
| 103 |
-
self.console_logger.critical(message)
|
| 104 |
-
|
| 105 |
-
|
| 106 |
class _StorageTools:
|
| 107 |
@staticmethod
|
| 108 |
def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
|
|
|
|
| 1 |
import re
|
|
|
|
| 2 |
import logging
|
| 3 |
from itertools import chain
|
|
|
|
| 4 |
# Using cache on top of a class is a brilliant way to achieve the Singleton design pattern without much code
|
| 5 |
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
| 6 |
|
| 7 |
+
from scrapling.core._types import Dict, Iterable, Any
|
| 8 |
|
| 9 |
from lxml import html
|
| 10 |
+
|
| 11 |
html_forbidden = {html.HtmlComment, }
|
| 12 |
logging.basicConfig(
|
| 13 |
level=logging.ERROR,
|
|
|
|
| 44 |
return isinstance(s, (list, tuple,))
|
| 45 |
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
class _StorageTools:
|
| 48 |
@staticmethod
|
| 49 |
def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
|
scrapling/engines/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .camo import CamoufoxEngine
|
| 2 |
+
from .static import StaticEngine
|
| 3 |
+
from .pw import PlaywrightEngine
|
| 4 |
+
from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
|
| 5 |
+
from .toolbelt import check_if_engine_usable
|
| 6 |
+
|
| 7 |
+
__all__ = ['CamoufoxEngine', 'PlaywrightEngine']
|
scrapling/engines/camo.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
|
| 3 |
+
|
| 4 |
+
from scrapling.engines.toolbelt import (
|
| 5 |
+
Response,
|
| 6 |
+
do_nothing,
|
| 7 |
+
get_os_name,
|
| 8 |
+
intercept_route,
|
| 9 |
+
check_type_validity,
|
| 10 |
+
generate_convincing_referer,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
from camoufox.sync_api import Camoufox
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CamoufoxEngine:
|
| 17 |
+
def __init__(
|
| 18 |
+
self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
|
| 19 |
+
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
|
| 20 |
+
timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
|
| 21 |
+
wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, adaptor_arguments: Dict = None
|
| 22 |
+
):
|
| 23 |
+
"""An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
|
| 24 |
+
|
| 25 |
+
:param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
|
| 26 |
+
:param block_images: Prevent the loading of images through Firefox preferences.
|
| 27 |
+
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 28 |
+
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
|
| 29 |
+
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
| 30 |
+
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
| 31 |
+
:param block_webrtc: Blocks WebRTC entirely.
|
| 32 |
+
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
| 33 |
+
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
| 34 |
+
:param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
|
| 35 |
+
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
| 36 |
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
|
| 37 |
+
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
| 38 |
+
:param wait_selector: Wait for a specific css selector to be in a specific state.
|
| 39 |
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
|
| 40 |
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
|
| 41 |
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
| 42 |
+
:param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
|
| 43 |
+
"""
|
| 44 |
+
self.headless = headless
|
| 45 |
+
self.block_images = bool(block_images)
|
| 46 |
+
self.disable_resources = bool(disable_resources)
|
| 47 |
+
self.block_webrtc = bool(block_webrtc)
|
| 48 |
+
self.allow_webgl = bool(allow_webgl)
|
| 49 |
+
self.network_idle = bool(network_idle)
|
| 50 |
+
self.google_search = bool(google_search)
|
| 51 |
+
self.extra_headers = extra_headers or {}
|
| 52 |
+
self.addons = addons or []
|
| 53 |
+
self.humanize = humanize
|
| 54 |
+
self.timeout = check_type_validity(timeout, [int, float], 30000)
|
| 55 |
+
if callable(page_action):
|
| 56 |
+
self.page_action = page_action
|
| 57 |
+
else:
|
| 58 |
+
self.page_action = do_nothing
|
| 59 |
+
logging.error('[Ignored] Argument "page_action" must be callable')
|
| 60 |
+
|
| 61 |
+
self.wait_selector = wait_selector
|
| 62 |
+
self.wait_selector_state = wait_selector_state
|
| 63 |
+
self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
|
| 64 |
+
|
| 65 |
+
def fetch(self, url: str) -> Response:
|
| 66 |
+
"""Opens up the browser and do your request based on your chosen options.
|
| 67 |
+
|
| 68 |
+
:param url: Target url.
|
| 69 |
+
:return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
|
| 70 |
+
"""
|
| 71 |
+
with Camoufox(
|
| 72 |
+
headless=self.headless,
|
| 73 |
+
block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
|
| 74 |
+
os=get_os_name(),
|
| 75 |
+
block_webrtc=self.block_webrtc,
|
| 76 |
+
allow_webgl=self.allow_webgl,
|
| 77 |
+
addons=self.addons,
|
| 78 |
+
humanize=self.humanize,
|
| 79 |
+
i_know_what_im_doing=True, # To turn warnings off with user configurations
|
| 80 |
+
) as browser:
|
| 81 |
+
page = browser.new_page()
|
| 82 |
+
page.set_default_navigation_timeout(self.timeout)
|
| 83 |
+
page.set_default_timeout(self.timeout)
|
| 84 |
+
if self.disable_resources:
|
| 85 |
+
page.route("**/*", intercept_route)
|
| 86 |
+
|
| 87 |
+
if self.extra_headers:
|
| 88 |
+
page.set_extra_http_headers(self.extra_headers)
|
| 89 |
+
|
| 90 |
+
res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
|
| 91 |
+
page.wait_for_load_state(state="domcontentloaded")
|
| 92 |
+
if self.network_idle:
|
| 93 |
+
page.wait_for_load_state('networkidle')
|
| 94 |
+
|
| 95 |
+
page = self.page_action(page)
|
| 96 |
+
|
| 97 |
+
if self.wait_selector and type(self.wait_selector) is str:
|
| 98 |
+
waiter = page.locator(self.wait_selector)
|
| 99 |
+
waiter.wait_for(state=self.wait_selector_state)
|
| 100 |
+
|
| 101 |
+
content_type = res.headers.get('content-type', '')
|
| 102 |
+
# Parse charset from content-type
|
| 103 |
+
encoding = 'utf-8' # default encoding
|
| 104 |
+
if 'charset=' in content_type.lower():
|
| 105 |
+
encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
|
| 106 |
+
|
| 107 |
+
response = Response(
|
| 108 |
+
url=res.url,
|
| 109 |
+
text=page.content(),
|
| 110 |
+
content=res.body(),
|
| 111 |
+
status=res.status,
|
| 112 |
+
reason=res.status_text,
|
| 113 |
+
encoding=encoding,
|
| 114 |
+
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
| 115 |
+
headers=res.all_headers(),
|
| 116 |
+
request_headers=res.request.all_headers(),
|
| 117 |
+
adaptor_arguments=self.adaptor_arguments
|
| 118 |
+
)
|
| 119 |
+
page.close()
|
| 120 |
+
|
| 121 |
+
return response
|
scrapling/engines/constants.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Disable loading these resources for speed
# These are Playwright/Camoufox resource *types*; requests of these types get
# aborted by the routing interceptor when `disable_resources` is enabled.
DEFAULT_DISABLED_RESOURCES = [
    'font',
    'image',
    'media',
    'beacon',
    'object',
    'imageset',
    'texttrack',
    'websocket',
    'csp_report',
    'stylesheet',
]

# Command-line switches passed to Chromium when the Playwright engine runs in stealth mode.
DEFAULT_STEALTH_FLAGS = [
    # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
    # Generally this will make the browser faster and less detectable
    '--no-pings',
    '--incognito',
    '--test-type',
    '--lang=en-US',
    '--mute-audio',
    '--no-first-run',
    '--disable-sync',
    '--hide-scrollbars',
    '--disable-logging',
    '--start-maximized', # For headless check bypass
    '--enable-async-dns',
    '--disable-breakpad',
    '--disable-infobars',
    '--accept-lang=en-US',
    '--use-mock-keychain',
    '--disable-translate',
    '--disable-extensions',
    '--disable-voice-input',
    '--window-position=0,0',
    '--disable-wake-on-wifi',
    '--ignore-gpu-blocklist',
    '--enable-tcp-fast-open',
    '--enable-web-bluetooth',
    '--disable-hang-monitor',
    '--password-store=basic',
    '--disable-cloud-import',
    '--disable-default-apps',
    '--disable-print-preview',
    '--disable-dev-shm-usage',
    '--disable-popup-blocking',
    '--metrics-recording-only',
    '--disable-crash-reporter',
    '--disable-partial-raster',
    '--disable-gesture-typing',
    '--disable-checker-imaging',
    '--disable-prompt-on-repost',
    '--force-color-profile=srgb',
    '--font-render-hinting=none',
    '--no-default-browser-check',
    '--aggressive-cache-discard',
    '--disable-component-update',
    '--disable-cookie-encryption',
    '--disable-domain-reliability',
    '--disable-threaded-animation',
    '--disable-threaded-scrolling',
    # '--disable-reading-from-canvas', # For Firefox
    '--enable-simple-cache-backend',
    '--disable-background-networking',
    '--disable-session-crashed-bubble',
    '--enable-surface-synchronization',
    '--disable-image-animation-resync',
    '--disable-renderer-backgrounding',
    '--disable-ipc-flooding-protection',
    '--prerender-from-omnibox=disabled',
    '--safebrowsing-disable-auto-update',
    '--disable-offer-upload-credit-cards',
    '--disable-features=site-per-process',
    '--disable-background-timer-throttling',
    '--disable-new-content-rendering-timeout',
    '--run-all-compositor-stages-before-draw',
    '--disable-client-side-phishing-detection',
    '--disable-backgrounding-occluded-windows',
    '--disable-layer-tree-host-memory-pressure',
    '--autoplay-policy=no-user-gesture-required',
    '--disable-offer-store-unmasked-wallet-cards',
    '--disable-blink-features=AutomationControlled',
    '--webrtc-ip-handling-policy=disable_non_proxied_udp',
    '--disable-component-extensions-with-background-pages',
    '--force-webrtc-ip-handling-policy=disable_non_proxied_udp',
    '--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance',
    '--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4',
    '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees',
]

# Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
# Baseline query sent to NSTBrowser when the user didn't supply `nstbrowser_config`;
# it gets JSON-serialized into the CDP connection URL.
NSTBROWSER_DEFAULT_QUERY = {
    "once": True,
    "headless": True,
    "autoClose": True,
    "fingerprint": {
        "flags": {
            "timezone": "BasedOnIp",
            "screen": "Custom"
        },
        "platform": 'linux',  # support: windows, mac, linux
        "kernel": 'chromium',  # only support: chromium
        "kernelMilestone": '128',
        "hardwareConcurrency": 8,
        "deviceMemory": 8,
    },
}
|
scrapling/engines/pw.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
from scrapling.core._types import Union, Callable, Optional, List, Dict
|
| 4 |
+
|
| 5 |
+
from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
|
| 6 |
+
from scrapling.engines.toolbelt import (
|
| 7 |
+
Response,
|
| 8 |
+
do_nothing,
|
| 9 |
+
js_bypass_path,
|
| 10 |
+
intercept_route,
|
| 11 |
+
generate_headers,
|
| 12 |
+
check_type_validity,
|
| 13 |
+
construct_cdp_url,
|
| 14 |
+
generate_convincing_referer,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class PlaywrightEngine:
    def __init__(
            self, headless: Union[bool, str] = True,
            disable_resources: bool = False,
            useragent: Optional[str] = None,
            network_idle: Optional[bool] = False,
            timeout: Optional[float] = 30000,
            page_action: Callable = do_nothing,
            wait_selector: Optional[str] = None,
            wait_selector_state: Optional[str] = 'attached',
            stealth: bool = False,
            hide_canvas: bool = True,
            disable_webgl: bool = False,
            cdp_url: Optional[str] = None,
            nstbrowser_mode: bool = False,
            nstbrowser_config: Optional[Dict] = None,
            google_search: Optional[bool] = True,
            extra_headers: Optional[Dict[str, str]] = None,
            adaptor_arguments: Optional[Dict] = None
    ):
        """An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific css selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        """
        self.headless = headless
        self.disable_resources = disable_resources
        self.network_idle = bool(network_idle)
        self.stealth = bool(stealth)
        self.hide_canvas = bool(hide_canvas)
        self.disable_webgl = bool(disable_webgl)
        self.google_search = bool(google_search)
        self.extra_headers = extra_headers or {}
        self.cdp_url = cdp_url
        self.useragent = useragent
        # Falls back to 30000 ms if the user passed a non-numeric timeout
        self.timeout = check_type_validity(timeout, [int, float], 30000)
        if callable(page_action):
            self.page_action = page_action
        else:
            self.page_action = do_nothing
            logging.error('[Ignored] Argument "page_action" must be callable')

        self.wait_selector = wait_selector
        self.wait_selector_state = wait_selector_state
        self.nstbrowser_mode = bool(nstbrowser_mode)
        self.nstbrowser_config = nstbrowser_config
        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}

    def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
        """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is

        :param flags: Chrome flags to be added to NSTBrowser query
        :return: CDP URL
        """
        cdp_url = self.cdp_url
        if self.nstbrowser_mode:
            # BUGFIX: this used to be `type(self.nstbrowser_config) is Dict`, comparing against
            # the `typing.Dict` alias -- always False, so user-supplied configs were silently ignored.
            if self.nstbrowser_config and isinstance(self.nstbrowser_config, dict):
                config = self.nstbrowser_config
            else:
                query = NSTBROWSER_DEFAULT_QUERY.copy()
                if flags:
                    query.update({
                        "args": dict(zip(flags, [''] * len(flags))),  # browser args should be a dictionary
                    })

                config = {
                    'config': json.dumps(query),
                    # 'token': ''
                }
            cdp_url = construct_cdp_url(cdp_url, config)
        else:
            # To validate it
            cdp_url = construct_cdp_url(cdp_url)

        return cdp_url

    def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: Target url.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        # The rebrowser fork patches the CDP runtime leaks, so it's imported in stealth mode only
        if not self.stealth:
            from playwright.sync_api import sync_playwright
        else:
            from rebrowser_playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            # Handle the UserAgent early
            if self.useragent:
                extra_headers = {}
                useragent = self.useragent
            else:
                extra_headers = generate_headers(browser_mode=True)
                useragent = extra_headers.get('User-Agent')

            # Prepare the flags before diving
            # BUGFIX: copy the constant -- the old `flags = DEFAULT_STEALTH_FLAGS` aliased the
            # module-level list, so the `+=` below mutated it and duplicated flags on every call.
            flags = list(DEFAULT_STEALTH_FLAGS)
            if self.hide_canvas:
                flags += ['--fingerprinting-canvas-image-data-noise']
            if self.disable_webgl:
                flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']

            # Creating the browser
            if self.cdp_url:
                cdp_url = self._cdp_url_logic(flags if self.stealth else None)
                browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
            else:
                if self.stealth:
                    browser = p.chromium.launch(headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True)
                else:
                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'])

            # Creating the context
            if self.stealth:
                context = browser.new_context(
                    locale='en-US',
                    is_mobile=False,
                    has_touch=False,
                    color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
                    user_agent=useragent,
                    device_scale_factor=2,
                    # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
                    service_workers="allow",
                    ignore_https_errors=True,
                    extra_http_headers=extra_headers,
                    screen={"width": 1920, "height": 1080},
                    viewport={"width": 1920, "height": 1080},
                    permissions=["geolocation", 'notifications'],
                )
            else:
                context = browser.new_context(
                    color_scheme='dark',
                    user_agent=useragent,
                    device_scale_factor=2,
                    extra_http_headers=extra_headers
                )

            # Finally we are in business
            page = context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)

            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

            if self.disable_resources:
                page.route("**/*", intercept_route)

            if self.stealth:
                # Basic bypasses nothing fancy as I'm still working on it
                # But with adding these bypasses to the above config, it bypasses many online tests like
                # https://bot.sannysoft.com/
                # https://kaliiiiiiiiii.github.io/brotector/
                # https://pixelscan.net/
                # https://iphey.com/
                # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
                # https://arh.antoinevastel.com/bots/areyouheadless/
                # https://prescience-data.github.io/execution-monitor.html
                page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
                page.add_init_script(path=js_bypass_path('window_chrome.js'))
                page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
                page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
                page.add_init_script(path=js_bypass_path('notification_permission.js'))
                page.add_init_script(path=js_bypass_path('screen_props.js'))
                page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))

            # Optionally disguise the visit as a click-through from a Google search of the target's domain
            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
            page.wait_for_load_state(state="domcontentloaded")
            if self.network_idle:
                page.wait_for_load_state('networkidle')

            # User-supplied automation hook; it must return the (possibly replaced) page object
            page = self.page_action(page)

            if self.wait_selector and isinstance(self.wait_selector, str):
                waiter = page.locator(self.wait_selector)
                waiter.wait_for(state=self.wait_selector_state)

            content_type = res.headers.get('content-type', '')
            # Parse charset from content-type
            encoding = 'utf-8'  # default encoding
            if 'charset=' in content_type.lower():
                encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()

            response = Response(
                url=res.url,
                text=page.content(),
                content=res.body(),
                status=res.status,
                reason=res.status_text,
                encoding=encoding,
                cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                headers=res.all_headers(),
                request_headers=res.request.all_headers(),
                adaptor_arguments=self.adaptor_arguments
            )
            page.close()
            return response
|
scrapling/engines/static.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
from scrapling.core._types import Union, Optional, Dict
|
| 4 |
+
from .toolbelt import Response, generate_convincing_referer, generate_headers
|
| 5 |
+
|
| 6 |
+
import httpx
|
| 7 |
+
from httpx._models import Response as httpxResponse
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class StaticEngine:
    def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
        """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.

        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        """
        self.timeout = timeout
        self.follow_redirects = bool(follow_redirects)
        self._extra_headers = generate_headers(browser_mode=False)
        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}

    @staticmethod
    def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
        """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
        finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.

        :param headers: Current headers in the request if the user passed any
        :param url: The Target URL.
        :param stealth: Whether stealth mode is enabled or not.
        :return: A dictionary of the new headers.
        """
        headers = headers or {}

        # Validate headers
        if not headers.get('user-agent') and not headers.get('User-Agent'):
            headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
            logging.info(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")

        if stealth:
            extra_headers = generate_headers(browser_mode=False)
            headers.update(extra_headers)
            headers.update({'referer': generate_convincing_referer(url)})

        return headers

    def _prepare_response(self, response: httpxResponse) -> Response:
        """Takes httpx response and generates `Response` object from it.

        :param response: httpx response object
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return Response(
            url=str(response.url),
            text=response.text,
            content=response.content,
            status=response.status_code,
            reason=response.reason_phrase,
            encoding=response.encoding or 'utf-8',
            cookies=dict(response.cookies),
            headers=dict(response.headers),
            request_headers=dict(response.request.headers),
            adaptor_arguments=self.adaptor_arguments
        )

    def _make_request(self, method: str, url: str, stealthy_headers: Optional[bool], **kwargs: Dict) -> Response:
        """Shared implementation behind `get`/`post`/`delete`/`put`.

        :param method: Name of the httpx top-level request function to call ('get', 'post', ...).
        :param url: Target url.
        :param stealthy_headers: Whether to generate real browser headers and a convincing referer.
        :param kwargs: Extra keyword arguments forwarded to the httpx function.
        :return: A Response object, see `_prepare_response`.
        """
        # BUGFIX: `headers` must be *popped* from kwargs -- it used to be read with `.get()`
        # and then forwarded again through **kwargs, so any caller that passed `headers`
        # hit "got multiple values for keyword argument 'headers'".
        headers = self._headers_job(kwargs.pop('headers', None), url, stealthy_headers)
        response = getattr(httpx, method)(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
        return self._prepare_response(response)

    def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP GET request for you but with some added flavors.
        :param url: Target url.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._make_request('get', url, stealthy_headers, **kwargs)

    def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP POST request for you but with some added flavors.
        :param url: Target url.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._make_request('post', url, stealthy_headers, **kwargs)

    def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP DELETE request for you but with some added flavors.
        :param url: Target url.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._make_request('delete', url, stealthy_headers, **kwargs)

    def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP PUT request for you but with some added flavors.
        :param url: Target url.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._make_request('put', url, stealthy_headers, **kwargs)
|
scrapling/engines/toolbelt/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .fingerprints import (
|
| 2 |
+
get_os_name,
|
| 3 |
+
generate_headers,
|
| 4 |
+
generate_convincing_referer,
|
| 5 |
+
)
|
| 6 |
+
from .custom import (
|
| 7 |
+
Response,
|
| 8 |
+
do_nothing,
|
| 9 |
+
BaseFetcher,
|
| 10 |
+
get_variable_name,
|
| 11 |
+
check_type_validity,
|
| 12 |
+
check_if_engine_usable,
|
| 13 |
+
)
|
| 14 |
+
from .navigation import (
|
| 15 |
+
js_bypass_path,
|
| 16 |
+
intercept_route,
|
| 17 |
+
construct_cdp_url,
|
| 18 |
+
)
|
scrapling/engines/toolbelt/bypasses/navigator_plugins.js
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Headless/automated browsers often expose an empty navigator.plugins list,
// which detection scripts flag. If it's empty, fake the five PDF plugin
// entries that every real Chromium-family browser ships.
if (navigator.plugins.length == 0) {
    Object.defineProperty(navigator, 'plugins', {
        get: () => {
            const pluginNames = [
                'PDF Viewer',
                'Chrome PDF Viewer',
                'Chromium PDF Viewer',
                'Microsoft Edge PDF Viewer',
                'WebKit built-in PDF',
            ];
            // Each entry inherits from Plugin.prototype with the same
            // non-enumerable description/filename, differing only by name.
            const fakePlugins = pluginNames.map((pluginName) => Object.create(Plugin.prototype, {
                description: { value: 'Portable Document Format', enumerable: false },
                filename: { value: 'internal-pdf-viewer', enumerable: false },
                name: { value: pluginName, enumerable: false },
            }));

            // Assemble a PluginArray-like object: `length` plus indexed slots.
            const descriptors = { length: { value: fakePlugins.length } };
            fakePlugins.forEach((plugin, index) => {
                descriptors[index] = { value: plugin };
            });
            return Object.create(PluginArray.prototype, descriptors);
        },
    });
}
|
scrapling/engines/toolbelt/bypasses/notification_permission.js
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Bypasses `notificationIsDenied` test in creepsjs's 'Like Headless' sections.
// Headless browsers report the Notification permission as 'denied' on secure
// pages, while a fresh headful profile reports 'default' (user never asked).
if (document.location.protocol.startsWith('https')) {
    Object.defineProperty(Notification, 'permission', {
        get: () => 'default',
    });
}
|
scrapling/engines/toolbelt/bypasses/pdf_viewer.js
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// PDF viewer enabled
// Bypasses `pdfIsDisabled` test in creepsjs's 'Like Headless' sections.
// Headful desktop Chrome reports `navigator.pdfViewerEnabled === true`; this
// getter pins the same answer regardless of how the browser was launched.
Object.defineProperty(navigator, 'pdfViewerEnabled', {
    get: () => true,
});
|
scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Remove playwright fingerprint => https://github.com/microsoft/playwright/commit/c9e673c6dca746384338ab6bb0cf63c7e7caa9b2#diff-087773eea292da9db5a3f27de8f1a2940cdb895383ad750c3cd8e01772a35b40R915
// NOTE(review): `__pwInitScripts` is presumably the global marker Playwright's
// own injected init scripts leave behind (see linked commit) — detectors can
// probe for it, so we delete it here.
delete __pwInitScripts;
|
scrapling/engines/toolbelt/bypasses/screen_props.js
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Overwrite window dimension/position/display properties with values captured
// from a real (headful) browser session so headless geometry checks pass.
const screenPatch = {
    // Dimensions
    innerHeight: 0,
    innerWidth: 0,
    outerHeight: 754,
    outerWidth: 1313,

    // Position
    screenX: 19,
    pageXOffset: 0,
    pageYOffset: 0,

    // Display
    devicePixelRatio: 2
};

try {
    Object.keys(screenPatch).forEach((prop) => {
        const value = screenPatch[prop];
        // The 0 values are introduced by collecting in the hidden iframe.
        // They are document sizes anyway so no need to test them or inject them.
        if (value > 0) {
            window[prop] = value;
        }
    });
} catch (err) {
    console.warn(err);
}
|
scrapling/engines/toolbelt/bypasses/webdriver_fully.js
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Create a function that looks like a native getter for `navigator.webdriver`.
// FIX: the original wrote `function get webdriver() { ... }`, which is a
// SyntaxError (a function name cannot contain a space), so the whole init
// script failed to parse and this bypass never ran. The `get webdriver` name
// is applied below through `Object.defineProperties` instead.
const nativeGetter = function webdriver() {
    return false;
};

// Copy over native-function-looking properties so introspection
// (`fn.name`, `fn.length`, `fn.toString()`) matches a built-in getter.
Object.defineProperties(nativeGetter, {
    name: { value: 'get webdriver', configurable: true },
    length: { value: 0, configurable: true },
    toString: {
        value: function() {
            return `function get webdriver() { [native code] }`;
        },
        configurable: true
    }
});

// Make it look native
Object.setPrototypeOf(nativeGetter, Function.prototype);

// Apply the modified descriptor: replace the real accessor on
// `Navigator.prototype` so every page sees `navigator.webdriver === false`.
Object.defineProperty(Navigator.prototype, 'webdriver', {
    get: nativeGetter,
    set: undefined,
    enumerable: true,
    configurable: true
});
|
scrapling/engines/toolbelt/bypasses/window_chrome.js
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// To escape `HEADCHR_CHROME_OBJ` test in headless mode => https://github.com/antoinevastel/fp-collect/blob/master/src/fpCollect.js#L322
// Faking window.chrome fully.
// FIX: this script uses top-level `return` guards, and `return` is only legal
// inside a function body — evaluated as a raw (unwrapped) script the original
// threw `SyntaxError: Illegal return statement` and none of the bypass logic
// ever ran. Wrapping everything in an IIFE makes the guards legal regardless
// of how the engine injects the script; the `window.chrome.*` assignments are
// still global.
(() => {
    if (!window.chrome) {
        // First, save all existing properties
        const originalKeys = Object.getOwnPropertyNames(window);
        const tempObj = {};

        // Recreate all properties in original order
        for (const key of originalKeys) {
            const descriptor = Object.getOwnPropertyDescriptor(window, key);
            // delete window[key];
            Object.defineProperty(tempObj, key, descriptor);
        }

        // Use the exact property descriptor found in headful Chrome
        // fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`
        const mockChrome = {
            loadTimes: {},
            csi: {},
            app: {
                isInstalled: false
            },
            // Add other Chrome-specific properties
        };

        Object.defineProperty(tempObj, 'chrome', {
            writable: true,
            enumerable: true,
            configurable: false,
            value: mockChrome
        });
        for (const key of Object.getOwnPropertyNames(tempObj)) {
            try {
                Object.defineProperty(window, key,
                    Object.getOwnPropertyDescriptor(tempObj, key));
            } catch (e) {}
        }
        // todo: solve this
        // Using line below bypasses the hasHighChromeIndex test in creepjs ==> https://github.com/abrahamjuliot/creepjs/blob/master/src/headless/index.ts#L121
        // Chrome object have to be in the end of the window properties
        // Object.assign(window, tempObj);
        // But makes window.chrome unreadable on 'https://bot.sannysoft.com/'
    }

    // That means we're running headful and don't need to mock anything
    if ('app' in window.chrome) {
        return; // Nothing to do here
    }
    const makeError = {
        ErrorInInvocation: fn => {
            const err = new TypeError(`Error in invocation of app.${fn}()`);
            // FIX: `utils.stripErrorWithAnchor` comes from puppeteer-extra's
            // stealth helpers and is not defined in this standalone script, so
            // calling it raised a ReferenceError instead of the intended
            // TypeError. Use it when available, otherwise fall back to the
            // plain error.
            if (typeof utils !== 'undefined' && utils.stripErrorWithAnchor) {
                return utils.stripErrorWithAnchor(
                    err,
                    `at ${fn} (eval at <anonymous>`,
                );
            }
            return err;
        },
    };
    // check with: `JSON.stringify(window.chrome['app'])`
    const STATIC_DATA = JSON.parse(
        `
{
  "isInstalled": false,
  "InstallState": {
    "DISABLED": "disabled",
    "INSTALLED": "installed",
    "NOT_INSTALLED": "not_installed"
  },
  "RunningState": {
    "CANNOT_RUN": "cannot_run",
    "READY_TO_RUN": "ready_to_run",
    "RUNNING": "running"
  }
}
`.trim(),
    );
    window.chrome.app = {
        ...STATIC_DATA,

        get isInstalled() {
            return false;
        },

        // FIX: the inner function names below were all copy-pasted as
        // `getDetails`, so `chrome.app.getIsInstalled.name` etc. did not match
        // real Chrome; each now carries its own name.
        getDetails: function getDetails() {
            if (arguments.length) {
                throw makeError.ErrorInInvocation(`getDetails`);
            }
            return null;
        },
        getIsInstalled: function getIsInstalled() {
            if (arguments.length) {
                throw makeError.ErrorInInvocation(`getIsInstalled`);
            }
            return false;
        },
        runningState: function runningState() {
            if (arguments.length) {
                throw makeError.ErrorInInvocation(`runningState`);
            }
            return 'cannot_run';
        },
    };
    // Check that the Navigation Timing API v1 is available, we need that
    if (!window.performance || !window.performance.timing) {
        return;
    }
    const {timing} = window.performance;
    window.chrome.csi = function () {
        return {
            onloadT: timing.domContentLoadedEventEnd,
            startE: timing.navigationStart,
            pageT: Date.now() - timing.navigationStart,
            tran: 15, // Transition type or something
        };
    };
    if (!window.PerformancePaintTiming){
        return;
    }
    const {performance} = window;
    // Some stuff is not available on about:blank as it requires a navigation to occur,
    // let's harden the code to not fail then:
    const ntEntryFallback = {
        nextHopProtocol: 'h2',
        type: 'other',
    };

    // The API exposes some funky info regarding the connection
    const protocolInfo = {
        get connectionInfo() {
            const ntEntry =
                performance.getEntriesByType('navigation')[0] || ntEntryFallback;
            return ntEntry.nextHopProtocol;
        },
        get npnNegotiatedProtocol() {
            // NPN is deprecated in favor of ALPN, but this implementation returns the
            // HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
            const ntEntry =
                performance.getEntriesByType('navigation')[0] || ntEntryFallback;
            return ['h2', 'hq'].includes(ntEntry.nextHopProtocol)
                ? ntEntry.nextHopProtocol
                : 'unknown';
        },
        get navigationType() {
            const ntEntry =
                performance.getEntriesByType('navigation')[0] || ntEntryFallback;
            return ntEntry.type;
        },
        get wasAlternateProtocolAvailable() {
            // The Alternate-Protocol header is deprecated in favor of Alt-Svc
            // (https://www.mnot.net/blog/2016/03/09/alt-svc), so technically this
            // should always return false.
            return false;
        },
        get wasFetchedViaSpdy() {
            // SPDY is deprecated in favor of HTTP/2, but this implementation returns
            // true for HTTP/2 or HTTP2+QUIC/39 as well.
            const ntEntry =
                performance.getEntriesByType('navigation')[0] || ntEntryFallback;
            return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
        },
        get wasNpnNegotiated() {
            // NPN is deprecated in favor of ALPN, but this implementation returns true
            // for HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
            const ntEntry =
                performance.getEntriesByType('navigation')[0] || ntEntryFallback;
            return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
        },
    };

    // Truncate number to specific number of decimals, most of the `loadTimes` stuff has 3
    function toFixed(num, fixed) {
        var re = new RegExp('^-?\\d+(?:.\\d{0,' + (fixed || -1) + '})?');
        return num.toString().match(re)[0];
    }

    const timingInfo = {
        get firstPaintAfterLoadTime() {
            // This was never actually implemented and always returns 0.
            return 0;
        },
        get requestTime() {
            return timing.navigationStart / 1000;
        },
        get startLoadTime() {
            return timing.navigationStart / 1000;
        },
        get commitLoadTime() {
            return timing.responseStart / 1000;
        },
        get finishDocumentLoadTime() {
            return timing.domContentLoadedEventEnd / 1000;
        },
        get finishLoadTime() {
            return timing.loadEventEnd / 1000;
        },
        get firstPaintTime() {
            const fpEntry = performance.getEntriesByType('paint')[0] || {
                startTime: timing.loadEventEnd / 1000, // Fallback if no navigation occured (`about:blank`)
            };
            return toFixed(
                (fpEntry.startTime + performance.timeOrigin) / 1000,
                3,
            );
        },
    };

    window.chrome.loadTimes = function () {
        return {
            ...protocolInfo,
            ...timingInfo,
        };
    };
})();
|
scrapling/engines/toolbelt/custom.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Functions related to custom types or type checking
|
| 3 |
+
"""
|
| 4 |
+
import inspect
|
| 5 |
+
import logging
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
|
| 8 |
+
from scrapling.core.utils import setup_basic_logging
|
| 9 |
+
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
| 10 |
+
from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass(frozen=True)
class Response:
    """This class is returned by all engines as a way to unify response type between different libraries."""
    url: str
    text: str
    content: bytes
    status: int
    reason: str
    encoding: str = 'utf-8'  # default encoding
    cookies: Dict = field(default_factory=dict)
    headers: Dict = field(default_factory=dict)
    request_headers: Dict = field(default_factory=dict)
    # Keyword arguments forwarded to the `Adaptor` constructor (see `adaptor`)
    adaptor_arguments: Dict = field(default_factory=dict)

    @property
    def adaptor(self) -> Union["Adaptor", None]:
        """Generate Adaptor instance from this response if possible, otherwise return None.

        FIX: the original used `self.adaptor_arguments.pop('automatch_domain')`,
        which mutated the (shared) dict — the first access to `.adaptor` silently
        dropped the domain, so any later access behaved differently. We now read
        without mutating and forward a filtered copy instead.
        """
        automatch_domain = self.adaptor_arguments.get('automatch_domain')
        adaptor_arguments = {key: value for key, value in self.adaptor_arguments.items() if key != 'automatch_domain'}
        if self.text:
            # For playwright that will be the response after all JS executed
            return Adaptor(text=self.text, url=automatch_domain or self.url, encoding=self.encoding, **adaptor_arguments)
        elif self.content:
            # For playwright, that's after all JS is loaded but not all of them executed, because playwright doesn't offer something like page.content()
            # To get response Bytes after the load states
            # Reference: https://playwright.dev/python/docs/api/class-page
            return Adaptor(body=self.content, url=automatch_domain or self.url, encoding=self.encoding, **adaptor_arguments)
        return None

    def __repr__(self):
        return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class BaseFetcher:
    def __init__(
            self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
            automatch_domain: Optional[str] = None,
    ):
        """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
        are detected and passed automatically from the Fetcher based on the response for accessibility.

        :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
            libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
        :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
        :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
            priority over all auto-match related arguments/functions in the class.
        :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
        :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
            If empty, default values will be used.
        :param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
            Otherwise, the domain of the request is used by default.
        :param debug: Enable debug mode
        """
        # Adaptor class parameters
        # I won't validate Adaptor's class parameters here again, I will leave it to be validated later
        self.adaptor_arguments = dict(
            huge_tree=huge_tree,
            keep_comments=keep_comments,
            auto_match=auto_match,
            storage=storage,
            storage_args=storage_args,
            debug=debug,
        )
        # If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
        setup_basic_logging(level='debug' if debug else 'info')
        if automatch_domain:
            # `isinstance` instead of an exact `type()` check so that `str`
            # subclasses are accepted too.
            if not isinstance(automatch_domain, str):
                logging.warning('[Ignored] The argument "automatch_domain" must be of string type')
            else:
                self.adaptor_arguments.update({'automatch_domain': automatch_domain})
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
    """Check whether the passed engine can be used by a Fetcher-type class.

    :param engine: The engine class itself
    :return: The engine class again if all checks out, otherwise raises error
    :raise TypeError: If engine class don't have fetch method, If engine class have fetch attribute not method, or If engine class have fetch function but it doesn't take arguments
    """
    # Guard-clause style: fail fast on each requirement instead of nesting;
    # the dead commented-out variants of these messages were removed.
    if not hasattr(engine, 'fetch'):
        raise TypeError("Invalid engine class! Engine class must have the method 'fetch'")

    fetch_function = getattr(engine, "fetch")
    if not callable(fetch_function):
        raise TypeError("Invalid engine class! Engine class must have a callable method 'fetch'")

    # The first parameter is expected to receive the url
    if not inspect.signature(fetch_function).parameters:
        raise TypeError("Engine class must have a callable method 'fetch' with the first argument used for the url.")

    return engine
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def get_variable_name(var: Any) -> Optional[str]:
|
| 112 |
+
"""Get the name of a variable using global and local scopes.
|
| 113 |
+
:param var: The variable to find the name for
|
| 114 |
+
:return: The name of the variable if found, None otherwise
|
| 115 |
+
"""
|
| 116 |
+
for scope in [globals(), locals()]:
|
| 117 |
+
for name, value in scope.items():
|
| 118 |
+
if value is var:
|
| 119 |
+
return name
|
| 120 |
+
return None
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def check_type_validity(variable: Any, valid_types: Union[List[Type], None], default_value: Any = None, critical: bool = False, param_name: Optional[str] = None) -> Any:
    """Validate that `variable` is one of `valid_types`, returning a fallback otherwise.

    :param variable: The variable to check
    :param valid_types: List of valid types for the variable
    :param default_value: Value to return if type check fails
    :param critical: If True, raises TypeError instead of logging error
    :param param_name: Optional parameter name for error messages
    :return: The original variable if valid, default_value if invalid
    :raise TypeError: If critical=True and type check fails
    """
    # Prefer the explicit name; otherwise try to discover it automatically
    var_name = param_name or get_variable_name(variable) or "Unknown"
    allowed = list(valid_types) if valid_types else []

    def _reject(message):
        # Either escalate or log-and-substitute, depending on `critical`
        if critical:
            raise TypeError(message)
        logging.error(f'[Ignored] {message}')
        return default_value

    # None gets its own treatment: allowed only when NoneType was whitelisted
    if variable is None:
        if type(None) in allowed:
            return variable
        return _reject(f'Argument "{var_name}" cannot be None')

    # No constraints given and the variable carries a value -> accept as-is
    if not allowed:
        return variable

    if any(isinstance(variable, candidate) for candidate in allowed):
        return variable

    type_names = [candidate.__name__ for candidate in allowed]
    return _reject(f'Argument "{var_name}" must be of type {" or ".join(type_names)}')
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
# Pew Pew
def do_nothing(page):
    """Identity placeholder used as the default `page_action` callback in browser engines.

    :param page: The browser page object handed over by the engine.
    :return: The same page object, untouched.
    """
    # Just works as a filler for `page_action` argument in browser engines
    return page
|
scrapling/engines/toolbelt/fingerprints.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Functions related to generating headers and fingerprints generally
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import platform
|
| 6 |
+
|
| 7 |
+
from scrapling.core.utils import cache
|
| 8 |
+
from scrapling.core._types import Union, Dict
|
| 9 |
+
|
| 10 |
+
from tldextract import extract
|
| 11 |
+
from browserforge.headers import HeaderGenerator, Browser
|
| 12 |
+
from browserforge.fingerprints import FingerprintGenerator, Fingerprint
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@cache(None, typed=True)
def generate_convincing_referer(url: str) -> str:
    """Build a Google-search referer for the bare domain of `url` (no subdomain/suffix),
    so the request looks like the user found the website through Google.

    >>> generate_convincing_referer('https://www.somewebsite.com/blah')
    'https://www.google.com/search?q=somewebsite'

    :param url: The URL you are about to fetch.
    :return: Google's search URL of the domain name
    """
    domain_name = extract(url).domain
    return 'https://www.google.com/search?q=' + domain_name
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@cache(None, typed=True)
def get_os_name() -> Union[str, None]:
    """Map the current platform's name to the format browserforge expects.

    :return: Current OS name ('linux'/'macos'/'windows'/'ios') or `None` otherwise
    """
    platform_to_browserforge = {
        'Linux': 'linux',
        'Darwin': 'macos',
        'Windows': 'windows',
        # For the future? because why not
        'iOS': 'ios',
    }
    return platform_to_browserforge.get(platform.system())
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def generate_suitable_fingerprint() -> Fingerprint:
    """Generate a browserforge fingerprint matching the current OS, a desktop
    device, and Chrome version 128 or newer.

    This function was originally created to test Browserforge's injector.
    :return: `Fingerprint` object
    """
    generator = FingerprintGenerator(
        browser=[Browser(name='chrome', min_version=128)],
        os=get_os_name(),  # None is ignored
        device='desktop',
    )
    return generator.generate()
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def generate_headers(browser_mode: bool = False) -> Dict:
    """Generate real browser-like headers using browserforge's generator.

    :param browser_mode: If enabled, the headers created are used for playwright so it have to match everything
    :return: A dictionary of the generated headers
    """
    if browser_mode:
        # In this mode we don't care about anything other than matching the OS and the browser type with the browser we are using
        # So we don't raise any inconsistency red flags while websites fingerprinting us
        return HeaderGenerator(
            browser=[Browser(name='chrome', min_version=128)],
            os=get_os_name(),  # None is ignored
            device='desktop'
        ).generate()

    # Here it's used for normal requests that aren't done through browsers so we can take it lightly
    candidate_browsers = [
        Browser(name=browser_name, min_version=120)
        for browser_name in ('chrome', 'firefox', 'edge')
    ]
    return HeaderGenerator(browser=candidate_browsers, device='desktop').generate()
|
scrapling/engines/toolbelt/navigation.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Functions related to files and URLs
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import logging
|
| 7 |
+
from urllib.parse import urlparse, urlencode
|
| 8 |
+
|
| 9 |
+
from scrapling.core.utils import cache
|
| 10 |
+
from scrapling.core._types import Union, Dict, Optional
|
| 11 |
+
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
| 12 |
+
|
| 13 |
+
from playwright.sync_api import Route
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def intercept_route(route: Route) -> Union[Route, None]:
    """Route handler that drops requests whose type falls in `DEFAULT_DISABLED_RESOURCES`.

    :param route: PlayWright `Route` object of the current page
    :return: PlayWright `Route` object
    """
    request = route.request
    if request.resource_type not in DEFAULT_DISABLED_RESOURCES:
        return route.continue_()
    logging.debug(f'Blocking background resource "{request.url}" of type "{request.resource_type}"')
    return route.abort()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
    """Takes a CDP URL, reconstruct it to check it's valid, then adds encoded parameters if exists

    :param cdp_url: The target URL.
    :param query_params: A dictionary of the parameters to add.
    :return: The new CDP URL.
    :raise ValueError: If the URL isn't a valid ws://wss:// endpoint.
    """
    try:
        # Validate the base URL structure
        parsed = urlparse(cdp_url)

        # Check scheme — CDP endpoints are WebSocket endpoints
        if parsed.scheme not in ('ws', 'wss'):
            raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")

        # Validate hostname and port
        if not parsed.netloc:
            raise ValueError("Invalid hostname for the CDP URL")

        # Ensure path starts with /
        path = parsed.path
        if not path.startswith('/'):
            path = '/' + path

        # Reconstruct the base URL with validated parts
        validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"

        # Add query parameters
        if query_params:
            query_string = urlencode(query_params)
            return f"{validated_base}?{query_string}"

        return validated_base

    except Exception as e:
        # FIX: chain the original exception (`from e`) so the real cause and its
        # traceback aren't lost when re-wrapping into a uniform ValueError.
        raise ValueError(f"Invalid CDP URL: {str(e)}") from e
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@cache(None, typed=True)
def js_bypass_path(filename: str) -> str:
    """Resolve a JS file shipped in the sibling `bypasses` folder to its full path.

    :param filename: The base filename of the JS file.
    :return: The full path of the JS file.
    """
    bypasses_directory = os.path.join(os.path.dirname(__file__), 'bypasses')
    return os.path.join(bypasses_directory, filename)
|
scrapling/fetchers.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scrapling.core._types import Dict, Optional, Union, Callable, List, Literal
|
| 2 |
+
|
| 3 |
+
from scrapling.engines.toolbelt import Response, BaseFetcher, do_nothing
|
| 4 |
+
from scrapling.engines import CamoufoxEngine, PlaywrightEngine, StaticEngine, check_if_engine_usable
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Fetcher(BaseFetcher):
    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on httpx.

    Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
    """
    def _make_request(self, method: str, url: str, follow_redirects: bool, timeout: Optional[Union[int, float]], stealthy_headers: Optional[bool], **kwargs: Dict) -> Response:
        """Build a `StaticEngine` and dispatch to the engine method named `method`.

        Private helper that centralizes the engine construction previously duplicated
        verbatim across `get`, `post`, `put`, and `delete`.

        :param method: Name of the `StaticEngine` method to call ('get', 'post', 'put', or 'delete').
        """
        engine = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments)
        return getattr(engine, method)(url, stealthy_headers, **kwargs)

    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP GET request for you but with some added flavors.

        :param url: Target url.
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._make_request('get', url, follow_redirects, timeout, stealthy_headers, **kwargs)

    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP POST request for you but with some added flavors.

        :param url: Target url.
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._make_request('post', url, follow_redirects, timeout, stealthy_headers, **kwargs)

    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP PUT request for you but with some added flavors.

        :param url: Target url
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._make_request('put', url, follow_redirects, timeout, stealthy_headers, **kwargs)

    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP DELETE request for you but with some added flavors.

        :param url: Target url
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._make_request('delete', url, follow_redirects, timeout, stealthy_headers, **kwargs)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class StealthyFetcher(BaseFetcher):
    """A fully stealthy `Fetcher` type built on top of Camoufox, a modified version of Firefox.

    It behaves like a real browser and passes almost all online bot tests/protections.
    Extra flavors: the faked OS fingerprint is matched to the user's OS, and every request
    carries a referer as if it came from a Google search of the target URL's domain.
    """
    def fetch(
            self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
            timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None
    ) -> Response:
        """Open a Camoufox browser, navigate to `url` with the chosen options, and return the response.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
        :param block_images: Prevent the loading of images through Firefox preferences.
            Saves proxy usage, but be careful: some websites never finish loading with it.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost (~25% faster in my tests on some websites).
            Dropped request types: `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            Saves proxy usage, but be careful: some websites never finish loading with it.
        :param block_webrtc: Blocks WebRTC entirely.
        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
        :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific css selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        # Gather every knob into one mapping first, then hand the whole thing to the engine.
        engine_options = dict(
            timeout=timeout,
            headless=headless,
            page_action=page_action,
            block_images=block_images,
            block_webrtc=block_webrtc,
            addons=addons,
            humanize=humanize,
            allow_webgl=allow_webgl,
            disable_resources=disable_resources,
            network_idle=network_idle,
            wait_selector=wait_selector,
            wait_selector_state=wait_selector_state,
            google_search=google_search,
            extra_headers=extra_headers,
            adaptor_arguments=self.adaptor_arguments,
        )
        return CamoufoxEngine(**engine_options).fetch(url)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class PlayWrightFetcher(BaseFetcher):
    """A `Fetcher` class type offering many request options, all built on PlayWright.

    Using this Fetcher class, you can do requests with:
        - Vanilla Playwright without any modifications other than the ones you chose.
        - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
            Some of the things stealth mode does include:
                1) Patches the CDP runtime fingerprint.
                2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
                3) Using custom flags on launch to hide Playwright even more and make it faster.
                4) Generates real browser's headers of the same type and same user OS then append it to the request.
        - Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
        - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
    > Note that these are the main options with PlayWright but they can be mixed together.
    """
    def fetch(
            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
            useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
            page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
            hide_canvas: bool = True, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
            stealth: bool = False,
            cdp_url: Optional[str] = None,
            nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
    ) -> Response:
        """Open a Playwright-driven browser, navigate to `url` with the chosen options, and return the response.

        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost (~25% faster in my tests on some websites).
            Dropped request types: `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            Saves proxy usage, but be careful: some websites never finish loading with it.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific css selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param nstbrowser_mode: Enables NSTBrowser mode, it has to be used with the `cdp_url` argument or it will get completely ignored.
        :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        # Gather every knob into one mapping first, then hand the whole thing to the engine.
        engine_options = dict(
            timeout=timeout,
            stealth=stealth,
            cdp_url=cdp_url,
            headless=headless,
            useragent=useragent,
            page_action=page_action,
            hide_canvas=hide_canvas,
            network_idle=network_idle,
            google_search=google_search,
            extra_headers=extra_headers,
            wait_selector=wait_selector,
            disable_webgl=disable_webgl,
            nstbrowser_mode=nstbrowser_mode,
            nstbrowser_config=nstbrowser_config,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
            adaptor_arguments=self.adaptor_arguments,
        )
        return PlaywrightEngine(**engine_options).fetch(url)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
class CustomFetcher(BaseFetcher):
    """Escape-hatch `Fetcher` type that runs a user-supplied engine class instead of one of the built-in engines."""
    def fetch(self, url: str, browser_engine, **kwargs) -> Response:
        """Fetch `url` with a custom engine.

        :param url: Target url.
        :param browser_engine: The engine class to use — passed through `check_if_engine_usable` before instantiation.
            NOTE(review): presumably it must expose the same constructor/`fetch` interface as the built-in engines — confirm against `check_if_engine_usable`.
        :param kwargs: Forwarded to the engine's constructor along with this fetcher's `adaptor_arguments`.
        :return: Whatever the engine's `fetch` returns — a `Response` object per the annotation.
        """
        # Validate the engine first, then instantiate it with our parser settings plus caller kwargs.
        engine = check_if_engine_usable(browser_engine)(adaptor_arguments=self.adaptor_arguments, **kwargs)
        return engine.fetch(url)
|
scrapling/parser.py
CHANGED
|
@@ -1,18 +1,14 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
| 2 |
from difflib import SequenceMatcher
|
| 3 |
-
from typing import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator
|
| 4 |
-
try:
|
| 5 |
-
from typing import SupportsIndex
|
| 6 |
-
except ImportError:
|
| 7 |
-
# 'SupportsIndex' got added in Python 3.8
|
| 8 |
-
SupportsIndex = None
|
| 9 |
-
|
| 10 |
-
from scrapling.translator import HTMLTranslator
|
| 11 |
-
from scrapling.mixins import SelectorsGeneration
|
| 12 |
-
from scrapling.custom_types import TextHandler, AttributesHandler
|
| 13 |
-
from scrapling.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
| 14 |
-
from scrapling.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
from lxml import etree, html
|
| 17 |
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
| 18 |
|
|
@@ -32,7 +28,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 32 |
huge_tree: bool = True,
|
| 33 |
root: Optional[html.HtmlElement] = None,
|
| 34 |
keep_comments: Optional[bool] = False,
|
| 35 |
-
auto_match: Optional[bool] =
|
| 36 |
storage: Any = SQLiteStorageSystem,
|
| 37 |
storage_args: Optional[Dict] = None,
|
| 38 |
debug: Optional[bool] = True,
|
|
@@ -125,7 +121,7 @@ class Adaptor(SelectorsGeneration):
|
|
| 125 |
def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
|
| 126 |
"""Return True if given element is a result of a string expression
|
| 127 |
Examples:
|
| 128 |
-
|
| 129 |
CSS3 -> '::text', '::attr(attrib)'...
|
| 130 |
"""
|
| 131 |
# Faster than checking `element.is_attribute or element.is_text or element.is_tail`
|
|
@@ -163,6 +159,8 @@ class Adaptor(SelectorsGeneration):
|
|
| 163 |
results = [self.__get_correct_result(n) for n in result]
|
| 164 |
if all(isinstance(res, self.__class__) for res in results):
|
| 165 |
return Adaptors(results)
|
|
|
|
|
|
|
| 166 |
return results
|
| 167 |
|
| 168 |
return self.__get_correct_result(result)
|
|
@@ -399,6 +397,56 @@ class Adaptor(SelectorsGeneration):
|
|
| 399 |
return self.__convert_results(score_table[highest_probability])
|
| 400 |
return []
|
| 401 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
def css(self, selector: str, identifier: str = '',
|
| 403 |
auto_match: bool = False, auto_save: bool = False, percentage: int = 0
|
| 404 |
) -> Union['Adaptors[Adaptor]', List]:
|
|
@@ -495,6 +543,113 @@ class Adaptor(SelectorsGeneration):
|
|
| 495 |
except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
|
| 496 |
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
| 497 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
|
| 499 |
"""Used internally to calculate a score that shows how candidate element similar to the original one
|
| 500 |
|
|
@@ -606,25 +761,33 @@ class Adaptor(SelectorsGeneration):
|
|
| 606 |
# Operations on text functions
|
| 607 |
def json(self) -> Dict:
|
| 608 |
"""Return json response if the response is jsonable otherwise throws error"""
|
| 609 |
-
|
|
|
|
|
|
|
|
|
|
| 610 |
|
| 611 |
-
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True
|
|
|
|
| 612 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 613 |
|
| 614 |
:param regex: Can be either a compiled regular expression or a string.
|
| 615 |
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
|
|
|
|
|
|
| 616 |
"""
|
| 617 |
-
return self.text.re(regex, replace_entities)
|
| 618 |
|
| 619 |
-
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True
|
|
|
|
| 620 |
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
| 621 |
|
| 622 |
:param regex: Can be either a compiled regular expression or a string.
|
| 623 |
:param default: The default value to be returned if there is no match
|
| 624 |
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 625 |
-
|
|
|
|
| 626 |
"""
|
| 627 |
-
return self.text.re_first(regex, default, replace_entities)
|
| 628 |
|
| 629 |
def find_similar(
|
| 630 |
self,
|
|
@@ -757,10 +920,10 @@ class Adaptor(SelectorsGeneration):
|
|
| 757 |
return self.__convert_results(results)
|
| 758 |
|
| 759 |
def find_by_regex(
|
| 760 |
-
self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
|
| 761 |
) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
|
| 762 |
"""Find elements that its text content matches the input regex pattern.
|
| 763 |
-
:param query: Regex query to match
|
| 764 |
:param first_match: Return first element that matches conditions, enabled by default
|
| 765 |
:param case_sensitive: if enabled, letters case will be taken into consideration in the regex
|
| 766 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
|
@@ -855,54 +1018,56 @@ class Adaptors(List[Adaptor]):
|
|
| 855 |
]
|
| 856 |
return self.__class__(flatten(results))
|
| 857 |
|
| 858 |
-
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True
|
|
|
|
| 859 |
"""Call the ``.re()`` method for each element in this list and return
|
| 860 |
their results flattened as List of TextHandler.
|
| 861 |
|
| 862 |
:param regex: Can be either a compiled regular expression or a string.
|
| 863 |
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
|
|
|
|
|
|
| 864 |
"""
|
| 865 |
results = [
|
| 866 |
-
n.text.re(regex, replace_entities) for n in self
|
| 867 |
]
|
| 868 |
return flatten(results)
|
| 869 |
|
| 870 |
-
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True
|
|
|
|
| 871 |
"""Call the ``.re_first()`` method for each element in this list and return
|
| 872 |
-
|
| 873 |
|
| 874 |
:param regex: Can be either a compiled regular expression or a string.
|
| 875 |
:param default: The default value to be returned if there is no match
|
| 876 |
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 877 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 878 |
"""
|
| 879 |
results = [
|
| 880 |
-
|
| 881 |
]
|
| 882 |
-
return
|
| 883 |
-
|
| 884 |
-
# def __getattr__(self, name):
|
| 885 |
-
# if name in dir(self.__class__):
|
| 886 |
-
# return super().__getattribute__(name)
|
| 887 |
-
#
|
| 888 |
-
# # Execute the method itself on each Adaptor
|
| 889 |
-
# results = []
|
| 890 |
-
# for item in self:
|
| 891 |
-
# results.append(getattr(item, name))
|
| 892 |
-
#
|
| 893 |
-
# if all(callable(r) for r in results):
|
| 894 |
-
# def call_all(*args, **kwargs):
|
| 895 |
-
# final_results = [r(*args, **kwargs) for r in results]
|
| 896 |
-
# if all([isinstance(r, (Adaptor, Adaptors,)) for r in results]):
|
| 897 |
-
# return self.__class__(final_results)
|
| 898 |
-
# return final_results
|
| 899 |
-
#
|
| 900 |
-
# return call_all
|
| 901 |
-
# else:
|
| 902 |
-
# # Flatten the result if it's a single-item list containing a list
|
| 903 |
-
# if len(self) == 1 and isinstance(results[0], list):
|
| 904 |
-
# return self.__class__(results[0])
|
| 905 |
-
# return self.__class__(results)
|
| 906 |
|
| 907 |
def get(self, default=None):
|
| 908 |
"""Returns the first item of the current list
|
|
|
|
| 1 |
import os
|
| 2 |
+
import re
|
| 3 |
+
import inspect
|
| 4 |
from difflib import SequenceMatcher
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
from scrapling.core.translator import HTMLTranslator
|
| 7 |
+
from scrapling.core.mixins import SelectorsGeneration
|
| 8 |
+
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
|
| 9 |
+
from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
| 10 |
+
from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
|
| 11 |
+
from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
|
| 12 |
from lxml import etree, html
|
| 13 |
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
| 14 |
|
|
|
|
| 28 |
huge_tree: bool = True,
|
| 29 |
root: Optional[html.HtmlElement] = None,
|
| 30 |
keep_comments: Optional[bool] = False,
|
| 31 |
+
auto_match: Optional[bool] = True,
|
| 32 |
storage: Any = SQLiteStorageSystem,
|
| 33 |
storage_args: Optional[Dict] = None,
|
| 34 |
debug: Optional[bool] = True,
|
|
|
|
| 121 |
def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
|
| 122 |
"""Return True if given element is a result of a string expression
|
| 123 |
Examples:
|
| 124 |
+
XPath -> '/text()', '/@attribute' etc...
|
| 125 |
CSS3 -> '::text', '::attr(attrib)'...
|
| 126 |
"""
|
| 127 |
# Faster than checking `element.is_attribute or element.is_text or element.is_tail`
|
|
|
|
| 159 |
results = [self.__get_correct_result(n) for n in result]
|
| 160 |
if all(isinstance(res, self.__class__) for res in results):
|
| 161 |
return Adaptors(results)
|
| 162 |
+
elif all(isinstance(res, TextHandler) for res in results):
|
| 163 |
+
return TextHandlers(results)
|
| 164 |
return results
|
| 165 |
|
| 166 |
return self.__get_correct_result(result)
|
|
|
|
| 397 |
return self.__convert_results(score_table[highest_probability])
|
| 398 |
return []
|
| 399 |
|
| 400 |
+
def css_first(self, selector: str, identifier: str = '',
              auto_match: bool = False, auto_save: bool = False, percentage: int = 0
              ) -> Union['Adaptor', 'TextHandler', None]:
    """Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`

    **Important:
    It's recommended to use the identifier argument if you plan to use different selector later
    and want to relocate the same element(s)**

    :param selector: The CSS3 selector to be used.
    :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
    :param identifier: A string that will be used to save/retrieve element's data in auto-matching
     otherwise the selector will be used.
    :param auto_save: Automatically save new elements for `auto_match` later
    :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
     Be aware that the percentage calculation depends solely on the page structure so don't play with this
     number unless you must know what you are doing!

    :return: The first match as an `Adaptor`/`TextHandler` object, or `None` if nothing matched.
    """
    # Delegate the whole search to `.css()` and keep only the first hit (if any).
    results = self.css(selector, identifier, auto_match, auto_save, percentage)
    return results[0] if results else None
|
| 423 |
+
|
| 424 |
+
def xpath_first(self, selector: str, identifier: str = '',
                auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
                ) -> Union['Adaptor', 'TextHandler', None]:
    """Search current tree with XPath selectors and return the first result if possible, otherwise return `None`

    **Important:
    It's recommended to use the identifier argument if you plan to use different selector later
    and want to relocate the same element(s)**

    Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**

    :param selector: The XPath selector to be used.
    :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
    :param identifier: A string that will be used to save/retrieve element's data in auto-matching
     otherwise the selector will be used.
    :param auto_save: Automatically save new elements for `auto_match` later
    :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
     Be aware that the percentage calculation depends solely on the page structure so don't play with this
     number unless you must know what you are doing!

    :return: The first match as an `Adaptor`/`TextHandler` object, or `None` if nothing matched.
    """
    # Delegate the whole search to `.xpath()` and keep only the first hit (if any).
    results = self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs)
    return results[0] if results else None
|
| 449 |
+
|
| 450 |
def css(self, selector: str, identifier: str = '',
|
| 451 |
auto_match: bool = False, auto_save: bool = False, percentage: int = 0
|
| 452 |
) -> Union['Adaptors[Adaptor]', List]:
|
|
|
|
| 543 |
except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
|
| 544 |
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
| 545 |
|
| 546 |
+
def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptors[Adaptor]', List]:
|
| 547 |
+
"""Find elements by filters of your creations for ease..
|
| 548 |
+
|
| 549 |
+
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 550 |
+
:param kwargs: The attributes you want to filter elements based on it.
|
| 551 |
+
:return: The `Adaptors` object of the elements or empty list
|
| 552 |
+
"""
|
| 553 |
+
# Attributes that are Python reserved words and can't be used directly
|
| 554 |
+
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
| 555 |
+
# https://www.w3schools.com/python/python_ref_keywords.asp
|
| 556 |
+
whitelisted = {
|
| 557 |
+
'class_': 'class',
|
| 558 |
+
'for_': 'for',
|
| 559 |
+
}
|
| 560 |
+
|
| 561 |
+
if not args and not kwargs:
|
| 562 |
+
raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')
|
| 563 |
+
|
| 564 |
+
attributes = dict()
|
| 565 |
+
tags, patterns = set(), set()
|
| 566 |
+
results, functions, selectors = [], [], []
|
| 567 |
+
|
| 568 |
+
def _search_tree(element: Adaptor, filter_function: Callable) -> None:
|
| 569 |
+
"""Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
|
| 570 |
+
if filter_function(element):
|
| 571 |
+
results.append(element)
|
| 572 |
+
|
| 573 |
+
for branch in element.children:
|
| 574 |
+
_search_tree(branch, filter_function)
|
| 575 |
+
|
| 576 |
+
# Brace yourself for a wonderful journey!
|
| 577 |
+
for arg in args:
|
| 578 |
+
if type(arg) is str:
|
| 579 |
+
tags.add(arg)
|
| 580 |
+
|
| 581 |
+
elif type(arg) in [list, tuple, set]:
|
| 582 |
+
if not all(map(lambda x: type(x) is str, arg)):
|
| 583 |
+
raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
|
| 584 |
+
tags.update(set(arg))
|
| 585 |
+
|
| 586 |
+
elif type(arg) is dict:
|
| 587 |
+
if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
|
| 588 |
+
raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
|
| 589 |
+
attributes.update(arg)
|
| 590 |
+
|
| 591 |
+
elif type(arg) is re.Pattern:
|
| 592 |
+
patterns.add(arg)
|
| 593 |
+
|
| 594 |
+
elif callable(arg):
|
| 595 |
+
if len(inspect.signature(arg).parameters) > 0:
|
| 596 |
+
functions.append(arg)
|
| 597 |
+
else:
|
| 598 |
+
raise TypeError("Callable filter function must have at least one argument to take `Adaptor` objects.")
|
| 599 |
+
|
| 600 |
+
else:
|
| 601 |
+
raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
|
| 602 |
+
|
| 603 |
+
if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
|
| 604 |
+
raise TypeError('Only string values are accepted for arguments')
|
| 605 |
+
|
| 606 |
+
for attribute_name, value in kwargs.items():
|
| 607 |
+
# Only replace names for kwargs, replacing them in dictionaries doesn't make sense
|
| 608 |
+
attribute_name = whitelisted.get(attribute_name, attribute_name)
|
| 609 |
+
attributes[attribute_name] = value
|
| 610 |
+
|
| 611 |
+
# It's easier and faster to build a selector than traversing the tree
|
| 612 |
+
tags = tags or ['']
|
| 613 |
+
for tag in tags:
|
| 614 |
+
selector = tag
|
| 615 |
+
for key, value in attributes.items():
|
| 616 |
+
value = value.replace('"', r'\"') # Escape double quotes in user input
|
| 617 |
+
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
| 618 |
+
selector += '[{}="{}"]'.format(key, value)
|
| 619 |
+
if selector:
|
| 620 |
+
selectors.append(selector)
|
| 621 |
+
|
| 622 |
+
if selectors:
|
| 623 |
+
results = self.css(', '.join(selectors))
|
| 624 |
+
if results:
|
| 625 |
+
# From the results, get the ones that fulfill passed regex patterns
|
| 626 |
+
for pattern in patterns:
|
| 627 |
+
results = results.filter(lambda e: e.text.re(pattern, check_match=True))
|
| 628 |
+
|
| 629 |
+
# From the results, get the ones that fulfill passed functions
|
| 630 |
+
for function in functions:
|
| 631 |
+
results = results.filter(function)
|
| 632 |
+
else:
|
| 633 |
+
for pattern in patterns:
|
| 634 |
+
results.extend(self.find_by_regex(pattern, first_match=False))
|
| 635 |
+
|
| 636 |
+
for result in (results or [self]):
|
| 637 |
+
for function in functions:
|
| 638 |
+
_search_tree(result, function)
|
| 639 |
+
|
| 640 |
+
return self.__convert_results(results)
|
| 641 |
+
|
| 642 |
+
def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
|
| 643 |
+
"""Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
|
| 644 |
+
|
| 645 |
+
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
| 646 |
+
:param kwargs: The attributes you want to filter elements based on it.
|
| 647 |
+
:return: The `Adaptor` object of the element or `None` if the result didn't match
|
| 648 |
+
"""
|
| 649 |
+
for element in self.find_all(*args, **kwargs):
|
| 650 |
+
return element
|
| 651 |
+
return None
|
| 652 |
+
|
| 653 |
def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
|
| 654 |
"""Used internally to calculate a score that shows how candidate element similar to the original one
|
| 655 |
|
|
|
|
| 761 |
# Operations on text functions
|
| 762 |
def json(self) -> Dict:
|
| 763 |
"""Return json response if the response is jsonable otherwise throws error"""
|
| 764 |
+
if self.text:
|
| 765 |
+
return self.text.json()
|
| 766 |
+
else:
|
| 767 |
+
return self.get_all_text(strip=True).json()
|
| 768 |
|
| 769 |
+
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
|
| 770 |
+
clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
|
| 771 |
"""Apply the given regex to the current text and return a list of strings with the matches.
|
| 772 |
|
| 773 |
:param regex: Can be either a compiled regular expression or a string.
|
| 774 |
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 775 |
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 776 |
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
| 777 |
"""
|
| 778 |
+
return self.text.re(regex, replace_entities, clean_match, case_sensitive)
|
| 779 |
|
| 780 |
+
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
| 781 |
+
clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
|
| 782 |
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
| 783 |
|
| 784 |
:param regex: Can be either a compiled regular expression or a string.
|
| 785 |
:param default: The default value to be returned if there is no match
|
| 786 |
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 787 |
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 788 |
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
| 789 |
"""
|
| 790 |
+
return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
|
| 791 |
|
| 792 |
def find_similar(
|
| 793 |
self,
|
|
|
|
| 920 |
return self.__convert_results(results)
|
| 921 |
|
| 922 |
def find_by_regex(
|
| 923 |
+
self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
|
| 924 |
) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
|
| 925 |
"""Find elements that its text content matches the input regex pattern.
|
| 926 |
+
:param query: Regex query/pattern to match
|
| 927 |
:param first_match: Return first element that matches conditions, enabled by default
|
| 928 |
:param case_sensitive: if enabled, letters case will be taken into consideration in the regex
|
| 929 |
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
|
|
|
| 1018 |
]
|
| 1019 |
return self.__class__(flatten(results))
|
| 1020 |
|
| 1021 |
+
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
|
| 1022 |
+
clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
|
| 1023 |
"""Call the ``.re()`` method for each element in this list and return
|
| 1024 |
their results flattened as List of TextHandler.
|
| 1025 |
|
| 1026 |
:param regex: Can be either a compiled regular expression or a string.
|
| 1027 |
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 1028 |
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 1029 |
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
| 1030 |
"""
|
| 1031 |
results = [
|
| 1032 |
+
n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
|
| 1033 |
]
|
| 1034 |
return flatten(results)
|
| 1035 |
|
| 1036 |
+
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
| 1037 |
+
clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
|
| 1038 |
"""Call the ``.re_first()`` method for each element in this list and return
|
| 1039 |
+
the first result or the default value otherwise.
|
| 1040 |
|
| 1041 |
:param regex: Can be either a compiled regular expression or a string.
|
| 1042 |
:param default: The default value to be returned if there is no match
|
| 1043 |
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
| 1044 |
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
| 1045 |
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
| 1046 |
+
"""
|
| 1047 |
+
for n in self:
|
| 1048 |
+
for result in n.re(regex, replace_entities, clean_match, case_sensitive):
|
| 1049 |
+
return result
|
| 1050 |
+
return default
|
| 1051 |
+
|
| 1052 |
+
def search(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
|
| 1053 |
+
"""Loop over all current elements and return the first element that matches the passed function
|
| 1054 |
+
:param func: A function that takes each element as an argument and returns True/False
|
| 1055 |
+
:return: The first element that match the function or ``None`` otherwise.
|
| 1056 |
+
"""
|
| 1057 |
+
for element in self:
|
| 1058 |
+
if func(element):
|
| 1059 |
+
return element
|
| 1060 |
+
return None
|
| 1061 |
|
| 1062 |
+
def filter(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptors', List]:
|
| 1063 |
+
"""Filter current elements based on the passed function
|
| 1064 |
+
:param func: A function that takes each element as an argument and returns True/False
|
| 1065 |
+
:return: The new `Adaptors` object or empty list otherwise.
|
| 1066 |
"""
|
| 1067 |
results = [
|
| 1068 |
+
element for element in self if func(element)
|
| 1069 |
]
|
| 1070 |
+
return self.__class__(results) if results else results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1071 |
|
| 1072 |
def get(self, default=None):
|
| 1073 |
"""Returns the first item of the current list
|
setup.cfg
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
[metadata]
|
| 2 |
name = scrapling
|
| 3 |
-
version = 0.
|
| 4 |
author = Karim Shoair
|
| 5 |
author_email = karim.shoair@pm.me
|
| 6 |
description = Scrapling is a powerful, flexible, adaptive, and high-performance web scraping library for Python.
|
| 7 |
license = BSD
|
| 8 |
-
|
|
|
|
| 1 |
[metadata]
|
| 2 |
name = scrapling
|
| 3 |
+
version = 0.2
|
| 4 |
author = Karim Shoair
|
| 5 |
author_email = karim.shoair@pm.me
|
| 6 |
description = Scrapling is a powerful, flexible, adaptive, and high-performance web scraping library for Python.
|
| 7 |
license = BSD
|
| 8 |
+
home_page = https://github.com/D4Vinci/Scrapling
|
setup.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from setuptools import setup
|
| 2 |
|
| 3 |
with open("README.md", "r", encoding="utf-8") as fh:
|
| 4 |
long_description = fh.read()
|
|
@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
|
| 6 |
|
| 7 |
setup(
|
| 8 |
name="scrapling",
|
| 9 |
-
version="0.
|
| 10 |
description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
| 11 |
simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
|
| 12 |
impressive speed improvements over many popular scraping tools.""",
|
|
@@ -15,7 +15,7 @@ setup(
|
|
| 15 |
author="Karim Shoair",
|
| 16 |
author_email="karim.shoair@pm.me",
|
| 17 |
license="BSD",
|
| 18 |
-
packages=
|
| 19 |
zip_safe=False,
|
| 20 |
package_dir={
|
| 21 |
"scrapling": "scrapling",
|
|
@@ -32,16 +32,17 @@ setup(
|
|
| 32 |
"Natural Language :: English",
|
| 33 |
"Topic :: Internet :: WWW/HTTP",
|
| 34 |
"Topic :: Text Processing :: Markup",
|
|
|
|
| 35 |
"Topic :: Text Processing :: Markup :: HTML",
|
| 36 |
"Topic :: Software Development :: Libraries :: Python Modules",
|
| 37 |
"Programming Language :: Python :: 3",
|
| 38 |
"Programming Language :: Python :: 3 :: Only",
|
| 39 |
-
"Programming Language :: Python :: 3.7",
|
| 40 |
"Programming Language :: Python :: 3.8",
|
| 41 |
"Programming Language :: Python :: 3.9",
|
| 42 |
"Programming Language :: Python :: 3.10",
|
| 43 |
"Programming Language :: Python :: 3.11",
|
| 44 |
"Programming Language :: Python :: 3.12",
|
|
|
|
| 45 |
"Programming Language :: Python :: Implementation :: CPython",
|
| 46 |
"Typing :: Typed",
|
| 47 |
],
|
|
@@ -53,8 +54,13 @@ setup(
|
|
| 53 |
"w3lib",
|
| 54 |
"orjson>=3",
|
| 55 |
"tldextract",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
],
|
| 57 |
-
python_requires=">=3.
|
| 58 |
url="https://github.com/D4Vinci/Scrapling",
|
| 59 |
project_urls={
|
| 60 |
"Documentation": "https://github.com/D4Vinci/Scrapling/tree/main/docs", # For now
|
|
|
|
| 1 |
+
from setuptools import setup, find_packages
|
| 2 |
|
| 3 |
with open("README.md", "r", encoding="utf-8") as fh:
|
| 4 |
long_description = fh.read()
|
|
|
|
| 6 |
|
| 7 |
setup(
|
| 8 |
name="scrapling",
|
| 9 |
+
version="0.2",
|
| 10 |
description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
| 11 |
simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
|
| 12 |
impressive speed improvements over many popular scraping tools.""",
|
|
|
|
| 15 |
author="Karim Shoair",
|
| 16 |
author_email="karim.shoair@pm.me",
|
| 17 |
license="BSD",
|
| 18 |
+
packages=find_packages(),
|
| 19 |
zip_safe=False,
|
| 20 |
package_dir={
|
| 21 |
"scrapling": "scrapling",
|
|
|
|
| 32 |
"Natural Language :: English",
|
| 33 |
"Topic :: Internet :: WWW/HTTP",
|
| 34 |
"Topic :: Text Processing :: Markup",
|
| 35 |
+
"Topic :: Internet :: WWW/HTTP :: Browsers",
|
| 36 |
"Topic :: Text Processing :: Markup :: HTML",
|
| 37 |
"Topic :: Software Development :: Libraries :: Python Modules",
|
| 38 |
"Programming Language :: Python :: 3",
|
| 39 |
"Programming Language :: Python :: 3 :: Only",
|
|
|
|
| 40 |
"Programming Language :: Python :: 3.8",
|
| 41 |
"Programming Language :: Python :: 3.9",
|
| 42 |
"Programming Language :: Python :: 3.10",
|
| 43 |
"Programming Language :: Python :: 3.11",
|
| 44 |
"Programming Language :: Python :: 3.12",
|
| 45 |
+
"Programming Language :: Python :: 3.13",
|
| 46 |
"Programming Language :: Python :: Implementation :: CPython",
|
| 47 |
"Typing :: Typed",
|
| 48 |
],
|
|
|
|
| 54 |
"w3lib",
|
| 55 |
"orjson>=3",
|
| 56 |
"tldextract",
|
| 57 |
+
'httpx[brotli,zstd]',
|
| 58 |
+
'playwright',
|
| 59 |
+
'rebrowser-playwright',
|
| 60 |
+
'camoufox>=0.3.7',
|
| 61 |
+
'browserforge',
|
| 62 |
],
|
| 63 |
+
python_requires=">=3.8",
|
| 64 |
url="https://github.com/D4Vinci/Scrapling",
|
| 65 |
project_urls={
|
| 66 |
"Documentation": "https://github.com/D4Vinci/Scrapling/tree/main/docs", # For now
|
tests/fetchers/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Because I'm too lazy to mock requests :)
|
tests/fetchers/test_camoufox.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
import pytest_httpbin
|
| 3 |
+
|
| 4 |
+
from scrapling import StealthyFetcher
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@pytest_httpbin.use_class_based_httpbin
|
| 8 |
+
# @pytest_httpbin.use_class_based_httpbin_secure
|
| 9 |
+
class TestStealthyFetcher(unittest.TestCase):
|
| 10 |
+
def setUp(self):
|
| 11 |
+
self.fetcher = StealthyFetcher(auto_match=False)
|
| 12 |
+
url = self.httpbin.url
|
| 13 |
+
self.status_200 = f'{url}/status/200'
|
| 14 |
+
self.status_404 = f'{url}/status/404'
|
| 15 |
+
self.status_501 = f'{url}/status/501'
|
| 16 |
+
self.basic_url = f'{url}/get'
|
| 17 |
+
self.html_url = f'{url}/html'
|
| 18 |
+
self.delayed_url = f'{url}/delay/10' # 10 Seconds delay response
|
| 19 |
+
self.cookies_url = f"{url}/cookies/set/test/value"
|
| 20 |
+
|
| 21 |
+
def test_basic_fetch(self):
|
| 22 |
+
"""Test doing basic fetch request with multiple statuses"""
|
| 23 |
+
self.assertEqual(self.fetcher.fetch(self.status_200).status, 200)
|
| 24 |
+
self.assertEqual(self.fetcher.fetch(self.status_404).status, 404)
|
| 25 |
+
self.assertEqual(self.fetcher.fetch(self.status_501).status, 501)
|
| 26 |
+
|
| 27 |
+
def test_networkidle(self):
|
| 28 |
+
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
| 29 |
+
self.assertEqual(self.fetcher.fetch(self.basic_url, network_idle=True).status, 200)
|
| 30 |
+
|
| 31 |
+
def test_blocking_resources(self):
|
| 32 |
+
"""Test if blocking resources make page does not finish loading or not"""
|
| 33 |
+
self.assertEqual(self.fetcher.fetch(self.basic_url, block_images=True).status, 200)
|
| 34 |
+
self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)
|
| 35 |
+
|
| 36 |
+
def test_waiting_selector(self):
|
| 37 |
+
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 38 |
+
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
|
| 39 |
+
|
| 40 |
+
def test_cookies_loading(self):
|
| 41 |
+
"""Test if cookies are set after the request"""
|
| 42 |
+
self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})
|
| 43 |
+
|
| 44 |
+
def test_automation(self):
|
| 45 |
+
"""Test if automation break the code or not"""
|
| 46 |
+
def scroll_page(page):
|
| 47 |
+
page.mouse.wheel(10, 0)
|
| 48 |
+
page.mouse.move(100, 400)
|
| 49 |
+
page.mouse.up()
|
| 50 |
+
return page
|
| 51 |
+
|
| 52 |
+
self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)
|
| 53 |
+
|
| 54 |
+
def test_properties(self):
|
| 55 |
+
"""Test if different arguments breaks the code or not"""
|
| 56 |
+
self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
|
| 57 |
+
self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
|
| 58 |
+
self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
|
| 59 |
+
|
| 60 |
+
def test_infinite_timeout(self):
|
| 61 |
+
"""Test if infinite timeout breaks the code or not"""
|
| 62 |
+
self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
|
tests/fetchers/test_httpx.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
import pytest_httpbin
|
| 3 |
+
|
| 4 |
+
from scrapling import Fetcher
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@pytest_httpbin.use_class_based_httpbin
|
| 8 |
+
class TestFetcher(unittest.TestCase):
|
| 9 |
+
def setUp(self):
|
| 10 |
+
self.fetcher = Fetcher(auto_match=False)
|
| 11 |
+
url = self.httpbin.url
|
| 12 |
+
self.status_200 = f'{url}/status/200'
|
| 13 |
+
self.status_404 = f'{url}/status/404'
|
| 14 |
+
self.status_501 = f'{url}/status/501'
|
| 15 |
+
self.basic_url = f'{url}/get'
|
| 16 |
+
self.post_url = f'{url}/post'
|
| 17 |
+
self.put_url = f'{url}/put'
|
| 18 |
+
self.delete_url = f'{url}/delete'
|
| 19 |
+
self.html_url = f'{url}/html'
|
| 20 |
+
|
| 21 |
+
def test_basic_get(self):
|
| 22 |
+
"""Test doing basic get request with multiple statuses"""
|
| 23 |
+
self.assertEqual(self.fetcher.get(self.status_200).status, 200)
|
| 24 |
+
self.assertEqual(self.fetcher.get(self.status_404).status, 404)
|
| 25 |
+
self.assertEqual(self.fetcher.get(self.status_501).status, 501)
|
| 26 |
+
|
| 27 |
+
def test_get_properties(self):
|
| 28 |
+
"""Test if different arguments with GET request breaks the code or not"""
|
| 29 |
+
self.assertEqual(self.fetcher.get(self.status_200, stealthy_headers=True).status, 200)
|
| 30 |
+
self.assertEqual(self.fetcher.get(self.status_200, follow_redirects=True).status, 200)
|
| 31 |
+
self.assertEqual(self.fetcher.get(self.status_200, timeout=None).status, 200)
|
| 32 |
+
self.assertEqual(
|
| 33 |
+
self.fetcher.get(self.status_200, stealthy_headers=True, follow_redirects=True, timeout=None).status,
|
| 34 |
+
200
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
def test_post_properties(self):
|
| 38 |
+
"""Test if different arguments with POST request breaks the code or not"""
|
| 39 |
+
self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}).status, 200)
|
| 40 |
+
self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True).status, 200)
|
| 41 |
+
self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, follow_redirects=True).status, 200)
|
| 42 |
+
self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, timeout=None).status, 200)
|
| 43 |
+
self.assertEqual(
|
| 44 |
+
self.fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True, follow_redirects=True, timeout=None).status,
|
| 45 |
+
200
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
def test_put_properties(self):
|
| 49 |
+
"""Test if different arguments with PUT request breaks the code or not"""
|
| 50 |
+
self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}).status, 200)
|
| 51 |
+
self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True).status, 200)
|
| 52 |
+
self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, follow_redirects=True).status, 200)
|
| 53 |
+
self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, timeout=None).status, 200)
|
| 54 |
+
self.assertEqual(
|
| 55 |
+
self.fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True, follow_redirects=True, timeout=None).status,
|
| 56 |
+
200
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
def test_delete_properties(self):
|
| 60 |
+
"""Test if different arguments with DELETE request breaks the code or not"""
|
| 61 |
+
self.assertEqual(self.fetcher.delete(self.delete_url, stealthy_headers=True).status, 200)
|
| 62 |
+
self.assertEqual(self.fetcher.delete(self.delete_url, follow_redirects=True).status, 200)
|
| 63 |
+
self.assertEqual(self.fetcher.delete(self.delete_url, timeout=None).status, 200)
|
| 64 |
+
self.assertEqual(
|
| 65 |
+
self.fetcher.delete(self.delete_url, stealthy_headers=True, follow_redirects=True, timeout=None).status,
|
| 66 |
+
200
|
| 67 |
+
)
|
tests/fetchers/test_playwright.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
import pytest_httpbin
|
| 3 |
+
|
| 4 |
+
from scrapling import PlayWrightFetcher
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@pytest_httpbin.use_class_based_httpbin
|
| 8 |
+
# @pytest_httpbin.use_class_based_httpbin_secure
|
| 9 |
+
class TestPlayWrightFetcher(unittest.TestCase):
|
| 10 |
+
def setUp(self):
|
| 11 |
+
self.fetcher = PlayWrightFetcher(auto_match=False)
|
| 12 |
+
url = self.httpbin.url
|
| 13 |
+
self.status_200 = f'{url}/status/200'
|
| 14 |
+
self.status_404 = f'{url}/status/404'
|
| 15 |
+
self.status_501 = f'{url}/status/501'
|
| 16 |
+
self.basic_url = f'{url}/get'
|
| 17 |
+
self.html_url = f'{url}/html'
|
| 18 |
+
self.delayed_url = f'{url}/delay/10' # 10 Seconds delay response
|
| 19 |
+
self.cookies_url = f"{url}/cookies/set/test/value"
|
| 20 |
+
|
| 21 |
+
def test_basic_fetch(self):
|
| 22 |
+
"""Test doing basic fetch request with multiple statuses"""
|
| 23 |
+
self.assertEqual(self.fetcher.fetch(self.status_200).status, 200)
|
| 24 |
+
self.assertEqual(self.fetcher.fetch(self.status_404).status, 404)
|
| 25 |
+
self.assertEqual(self.fetcher.fetch(self.status_501).status, 501)
|
| 26 |
+
|
| 27 |
+
def test_networkidle(self):
|
| 28 |
+
"""Test if waiting for `networkidle` make page does not finish loading or not"""
|
| 29 |
+
self.assertEqual(self.fetcher.fetch(self.basic_url, network_idle=True).status, 200)
|
| 30 |
+
|
| 31 |
+
def test_blocking_resources(self):
|
| 32 |
+
"""Test if blocking resources make page does not finish loading or not"""
|
| 33 |
+
self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)
|
| 34 |
+
|
| 35 |
+
def test_waiting_selector(self):
|
| 36 |
+
"""Test if waiting for a selector make page does not finish loading or not"""
|
| 37 |
+
self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
|
| 38 |
+
|
| 39 |
+
def test_cookies_loading(self):
|
| 40 |
+
"""Test if cookies are set after the request"""
|
| 41 |
+
self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})
|
| 42 |
+
|
| 43 |
+
def test_automation(self):
|
| 44 |
+
"""Test if automation break the code or not"""
|
| 45 |
+
def scroll_page(page):
|
| 46 |
+
page.mouse.wheel(10, 0)
|
| 47 |
+
page.mouse.move(100, 400)
|
| 48 |
+
page.mouse.up()
|
| 49 |
+
return page
|
| 50 |
+
|
| 51 |
+
self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)
|
| 52 |
+
|
| 53 |
+
def test_properties(self):
|
| 54 |
+
"""Test if different arguments breaks the code or not"""
|
| 55 |
+
self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=True, hide_canvas=False).status, 200)
|
| 56 |
+
self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
|
| 57 |
+
self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
|
| 58 |
+
self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
|
| 59 |
+
|
| 60 |
+
def test_cdp_url(self):
|
| 61 |
+
"""Test if it's going to try to connect to cdp url or not"""
|
| 62 |
+
with self.assertRaises(ValueError):
|
| 63 |
+
_ = self.fetcher.fetch(self.html_url, cdp_url='blahblah')
|
| 64 |
+
|
| 65 |
+
with self.assertRaises(ValueError):
|
| 66 |
+
_ = self.fetcher.fetch(self.html_url, cdp_url='blahblah', nstbrowser_mode=True)
|
| 67 |
+
|
| 68 |
+
with self.assertRaises(Exception):
|
| 69 |
+
# There's no type for this error in PlayWright, it's just `Error`
|
| 70 |
+
_ = self.fetcher.fetch(self.html_url, cdp_url='ws://blahblah')
|
| 71 |
+
|
| 72 |
+
def test_infinite_timeout(self):
|
| 73 |
+
"""Test if infinite timeout breaks the code or not"""
|
| 74 |
+
self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
|
tests/parser/__init__.py
ADDED
|
File without changes
|
tests/parser/test_automatch.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from scrapling import Adaptor
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class TestParserAutoMatch(unittest.TestCase):
|
| 7 |
+
|
| 8 |
+
def test_element_relocation(self):
|
| 9 |
+
"""Test relocating element after structure change"""
|
| 10 |
+
original_html = '''
|
| 11 |
+
<div class="container">
|
| 12 |
+
<section class="products">
|
| 13 |
+
<article class="product" id="p1">
|
| 14 |
+
<h3>Product 1</h3>
|
| 15 |
+
<p class="description">Description 1</p>
|
| 16 |
+
</article>
|
| 17 |
+
<article class="product" id="p2">
|
| 18 |
+
<h3>Product 2</h3>
|
| 19 |
+
<p class="description">Description 2</p>
|
| 20 |
+
</article>
|
| 21 |
+
</section>
|
| 22 |
+
</div>
|
| 23 |
+
'''
|
| 24 |
+
changed_html = '''
|
| 25 |
+
<div class="new-container">
|
| 26 |
+
<div class="product-wrapper">
|
| 27 |
+
<section class="products">
|
| 28 |
+
<article class="product new-class" data-id="p1">
|
| 29 |
+
<div class="product-info">
|
| 30 |
+
<h3>Product 1</h3>
|
| 31 |
+
<p class="new-description">Description 1</p>
|
| 32 |
+
</div>
|
| 33 |
+
</article>
|
| 34 |
+
<article class="product new-class" data-id="p2">
|
| 35 |
+
<div class="product-info">
|
| 36 |
+
<h3>Product 2</h3>
|
| 37 |
+
<p class="new-description">Description 2</p>
|
| 38 |
+
</div>
|
| 39 |
+
</article>
|
| 40 |
+
</section>
|
| 41 |
+
</div>
|
| 42 |
+
</div>
|
| 43 |
+
'''
|
| 44 |
+
|
| 45 |
+
old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
|
| 46 |
+
new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)
|
| 47 |
+
|
| 48 |
+
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 49 |
+
# Also at the same time testing auto-match vs combined selectors
|
| 50 |
+
_ = old_page.css('#p1, #p2', auto_save=True)[0]
|
| 51 |
+
relocated = new_page.css('#p1', auto_match=True)
|
| 52 |
+
|
| 53 |
+
self.assertIsNotNone(relocated)
|
| 54 |
+
self.assertEqual(relocated[0].attrib['data-id'], 'p1')
|
| 55 |
+
self.assertTrue(relocated[0].has_class('new-class'))
|
| 56 |
+
self.assertEqual(relocated[0].css('.new-description')[0].text, 'Description 1')
|
tests/{test_all_functions.py → parser/test_general.py}
RENAMED
|
@@ -112,11 +112,11 @@ class TestParser(unittest.TestCase):
|
|
| 112 |
|
| 113 |
def test_find_similar_elements(self):
|
| 114 |
"""Test Finding similar elements of an element"""
|
| 115 |
-
first_product = self.page.
|
| 116 |
similar_products = first_product.find_similar()
|
| 117 |
self.assertEqual(len(similar_products), 2)
|
| 118 |
|
| 119 |
-
first_review = self.page.
|
| 120 |
similar_high_rated_reviews = [
|
| 121 |
review
|
| 122 |
for review in first_review.find_similar()
|
|
@@ -127,16 +127,16 @@ class TestParser(unittest.TestCase):
|
|
| 127 |
def test_expected_errors(self):
|
| 128 |
"""Test errors that should raised if it does"""
|
| 129 |
with self.assertRaises(ValueError):
|
| 130 |
-
_ = Adaptor()
|
| 131 |
|
| 132 |
with self.assertRaises(TypeError):
|
| 133 |
-
_ = Adaptor(root="ayo")
|
| 134 |
|
| 135 |
with self.assertRaises(TypeError):
|
| 136 |
-
_ = Adaptor(text=1)
|
| 137 |
|
| 138 |
with self.assertRaises(TypeError):
|
| 139 |
-
_ = Adaptor(body=1)
|
| 140 |
|
| 141 |
with self.assertRaises(ValueError):
|
| 142 |
_ = Adaptor(self.html, storage=object, auto_match=True)
|
|
@@ -169,8 +169,8 @@ class TestParser(unittest.TestCase):
|
|
| 169 |
def test_selectors_generation(self):
|
| 170 |
"""Try to create selectors for all elements in the page"""
|
| 171 |
def _traverse(element: Adaptor):
|
| 172 |
-
self.assertTrue(type(element.
|
| 173 |
-
self.assertTrue(type(element.
|
| 174 |
for branch in element.children:
|
| 175 |
_traverse(branch)
|
| 176 |
|
|
@@ -197,7 +197,7 @@ class TestParser(unittest.TestCase):
|
|
| 197 |
parent_siblings = parent.siblings
|
| 198 |
self.assertEqual(len(parent_siblings), 1)
|
| 199 |
|
| 200 |
-
child = table.
|
| 201 |
next_element = child.next
|
| 202 |
self.assertEqual(next_element.attrib['data-id'], '2')
|
| 203 |
|
|
@@ -261,60 +261,10 @@ class TestParser(unittest.TestCase):
|
|
| 261 |
key_value = list(products[0].attrib.search_values('1', partial=True))
|
| 262 |
self.assertEqual(list(key_value[0].keys()), ['data-id'])
|
| 263 |
|
| 264 |
-
attr_json = self.page.
|
| 265 |
self.assertEqual(attr_json, {'jsonable': 'data'})
|
| 266 |
self.assertEqual(type(self.page.css('#products')[0].attrib.json_string), bytes)
|
| 267 |
|
| 268 |
-
def test_element_relocation(self):
|
| 269 |
-
"""Test relocating element after structure change"""
|
| 270 |
-
original_html = '''
|
| 271 |
-
<div class="container">
|
| 272 |
-
<section class="products">
|
| 273 |
-
<article class="product" id="p1">
|
| 274 |
-
<h3>Product 1</h3>
|
| 275 |
-
<p class="description">Description 1</p>
|
| 276 |
-
</article>
|
| 277 |
-
<article class="product" id="p2">
|
| 278 |
-
<h3>Product 2</h3>
|
| 279 |
-
<p class="description">Description 2</p>
|
| 280 |
-
</article>
|
| 281 |
-
</section>
|
| 282 |
-
</div>
|
| 283 |
-
'''
|
| 284 |
-
changed_html = '''
|
| 285 |
-
<div class="new-container">
|
| 286 |
-
<div class="product-wrapper">
|
| 287 |
-
<section class="products">
|
| 288 |
-
<article class="product new-class" data-id="p1">
|
| 289 |
-
<div class="product-info">
|
| 290 |
-
<h3>Product 1</h3>
|
| 291 |
-
<p class="new-description">Description 1</p>
|
| 292 |
-
</div>
|
| 293 |
-
</article>
|
| 294 |
-
<article class="product new-class" data-id="p2">
|
| 295 |
-
<div class="product-info">
|
| 296 |
-
<h3>Product 2</h3>
|
| 297 |
-
<p class="new-description">Description 2</p>
|
| 298 |
-
</div>
|
| 299 |
-
</article>
|
| 300 |
-
</section>
|
| 301 |
-
</div>
|
| 302 |
-
</div>
|
| 303 |
-
'''
|
| 304 |
-
|
| 305 |
-
old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
|
| 306 |
-
new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)
|
| 307 |
-
|
| 308 |
-
# 'p1' was used as ID and now it's not and all the path elements have changes
|
| 309 |
-
# Also at the same time testing auto-match vs combined selectors
|
| 310 |
-
_ = old_page.css('#p1, #p2', auto_save=True)[0]
|
| 311 |
-
relocated = new_page.css('#p1', auto_match=True)
|
| 312 |
-
|
| 313 |
-
self.assertIsNotNone(relocated)
|
| 314 |
-
self.assertEqual(relocated[0].attrib['data-id'], 'p1')
|
| 315 |
-
self.assertTrue(relocated[0].has_class('new-class'))
|
| 316 |
-
self.assertEqual(relocated[0].css('.new-description')[0].text, 'Description 1')
|
| 317 |
-
|
| 318 |
def test_performance(self):
|
| 319 |
"""Test parsing and selecting speed"""
|
| 320 |
import time
|
|
@@ -331,6 +281,6 @@ class TestParser(unittest.TestCase):
|
|
| 331 |
self.assertLess(end_time - start_time, 0.1)
|
| 332 |
|
| 333 |
|
| 334 |
-
# Use `coverage run -m unittest --verbose tests/
|
| 335 |
# if __name__ == '__main__':
|
| 336 |
# unittest.main(verbosity=2)
|
|
|
|
| 112 |
|
| 113 |
def test_find_similar_elements(self):
|
| 114 |
"""Test Finding similar elements of an element"""
|
| 115 |
+
first_product = self.page.css_first('.product')
|
| 116 |
similar_products = first_product.find_similar()
|
| 117 |
self.assertEqual(len(similar_products), 2)
|
| 118 |
|
| 119 |
+
first_review = self.page.find('div', class_='review')
|
| 120 |
similar_high_rated_reviews = [
|
| 121 |
review
|
| 122 |
for review in first_review.find_similar()
|
|
|
|
| 127 |
def test_expected_errors(self):
|
| 128 |
"""Test errors that should raised if it does"""
|
| 129 |
with self.assertRaises(ValueError):
|
| 130 |
+
_ = Adaptor(auto_match=False)
|
| 131 |
|
| 132 |
with self.assertRaises(TypeError):
|
| 133 |
+
_ = Adaptor(root="ayo", auto_match=False)
|
| 134 |
|
| 135 |
with self.assertRaises(TypeError):
|
| 136 |
+
_ = Adaptor(text=1, auto_match=False)
|
| 137 |
|
| 138 |
with self.assertRaises(TypeError):
|
| 139 |
+
_ = Adaptor(body=1, auto_match=False)
|
| 140 |
|
| 141 |
with self.assertRaises(ValueError):
|
| 142 |
_ = Adaptor(self.html, storage=object, auto_match=True)
|
|
|
|
| 169 |
def test_selectors_generation(self):
|
| 170 |
"""Try to create selectors for all elements in the page"""
|
| 171 |
def _traverse(element: Adaptor):
|
| 172 |
+
self.assertTrue(type(element.generate_css_selector) is str)
|
| 173 |
+
self.assertTrue(type(element.generate_xpath_selector) is str)
|
| 174 |
for branch in element.children:
|
| 175 |
_traverse(branch)
|
| 176 |
|
|
|
|
| 197 |
parent_siblings = parent.siblings
|
| 198 |
self.assertEqual(len(parent_siblings), 1)
|
| 199 |
|
| 200 |
+
child = table.find({'data-id': "1"})
|
| 201 |
next_element = child.next
|
| 202 |
self.assertEqual(next_element.attrib['data-id'], '2')
|
| 203 |
|
|
|
|
| 261 |
key_value = list(products[0].attrib.search_values('1', partial=True))
|
| 262 |
self.assertEqual(list(key_value[0].keys()), ['data-id'])
|
| 263 |
|
| 264 |
+
attr_json = self.page.css_first('#products').attrib['schema'].json()
|
| 265 |
self.assertEqual(attr_json, {'jsonable': 'data'})
|
| 266 |
self.assertEqual(type(self.page.css('#products')[0].attrib.json_string), bytes)
|
| 267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
def test_performance(self):
|
| 269 |
"""Test parsing and selecting speed"""
|
| 270 |
import time
|
|
|
|
| 281 |
self.assertLess(end_time - start_time, 0.1)
|
| 282 |
|
| 283 |
|
| 284 |
+
# Use `coverage run -m unittest --verbose tests/test_parser_functions.py` instead for the coverage report
|
| 285 |
# if __name__ == '__main__':
|
| 286 |
# unittest.main(verbosity=2)
|
tests/requirements.txt
CHANGED
|
@@ -1,2 +1,7 @@
|
|
| 1 |
-
pytest
|
| 2 |
-
pytest-cov
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytest>=2.8.0,<9
|
| 2 |
+
pytest-cov
|
| 3 |
+
playwright
|
| 4 |
+
camoufox
|
| 5 |
+
werkzeug<3.0.0
|
| 6 |
+
pytest-httpbin==2.1.0
|
| 7 |
+
httpbin~=0.10.0
|
tox.ini
CHANGED
|
@@ -4,14 +4,17 @@
|
|
| 4 |
# and then run "tox" from this directory.
|
| 5 |
|
| 6 |
[tox]
|
| 7 |
-
envlist = pre-commit,
|
| 8 |
|
| 9 |
[testenv]
|
| 10 |
usedevelop = True
|
| 11 |
changedir = tests
|
| 12 |
deps =
|
| 13 |
-r{toxinidir}/tests/requirements.txt
|
| 14 |
-
commands =
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
[testenv:pre-commit]
|
| 17 |
basepython = python3
|
|
|
|
| 4 |
# and then run "tox" from this directory.
|
| 5 |
|
| 6 |
[tox]
|
| 7 |
+
envlist = pre-commit,py{38,39,310,311,312,313}
|
| 8 |
|
| 9 |
[testenv]
|
| 10 |
usedevelop = True
|
| 11 |
changedir = tests
|
| 12 |
deps =
|
| 13 |
-r{toxinidir}/tests/requirements.txt
|
| 14 |
+
commands =
|
| 15 |
+
playwright install-deps chromium firefox
|
| 16 |
+
camoufox fetch --browserforge
|
| 17 |
+
pytest --cov=scrapling --cov-report=xml
|
| 18 |
|
| 19 |
[testenv:pre-commit]
|
| 20 |
basepython = python3
|