Karim shoair commited on
Commit
42b99f6
·
2 Parent(s): a879519baf9852

Merge pull request #3 from D4Vinci/dev

Browse files
Files changed (44) hide show
  1. .github/workflows/tests.yml +30 -4
  2. .gitignore +2 -0
  3. CONTRIBUTING.md +7 -2
  4. MANIFEST.in +2 -0
  5. README.md +397 -73
  6. ROADMAP.md +12 -11
  7. pytest.ini +1 -1
  8. scrapling/__init__.py +4 -3
  9. scrapling/core/__init__.py +0 -0
  10. scrapling/core/_types.py +25 -0
  11. scrapling/{custom_types.py → core/custom_types.py} +48 -3
  12. scrapling/{mixins.py → core/mixins.py} +22 -7
  13. scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
  14. scrapling/{translator.py → core/translator.py} +2 -12
  15. scrapling/{utils.py → core/utils.py} +2 -61
  16. scrapling/engines/__init__.py +7 -0
  17. scrapling/engines/camo.py +121 -0
  18. scrapling/engines/constants.py +108 -0
  19. scrapling/engines/pw.py +232 -0
  20. scrapling/engines/static.py +112 -0
  21. scrapling/engines/toolbelt/__init__.py +18 -0
  22. scrapling/engines/toolbelt/bypasses/navigator_plugins.js +40 -0
  23. scrapling/engines/toolbelt/bypasses/notification_permission.js +5 -0
  24. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +5 -0
  25. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -0
  26. scrapling/engines/toolbelt/bypasses/screen_props.js +27 -0
  27. scrapling/engines/toolbelt/bypasses/webdriver_fully.js +27 -0
  28. scrapling/engines/toolbelt/bypasses/window_chrome.js +213 -0
  29. scrapling/engines/toolbelt/custom.py +168 -0
  30. scrapling/engines/toolbelt/fingerprints.py +81 -0
  31. scrapling/engines/toolbelt/navigation.py +74 -0
  32. scrapling/fetchers.py +190 -0
  33. scrapling/parser.py +216 -51
  34. setup.cfg +2 -2
  35. setup.py +11 -5
  36. tests/fetchers/__init__.py +1 -0
  37. tests/fetchers/test_camoufox.py +62 -0
  38. tests/fetchers/test_httpx.py +67 -0
  39. tests/fetchers/test_playwright.py +74 -0
  40. tests/parser/__init__.py +0 -0
  41. tests/parser/test_automatch.py +56 -0
  42. tests/{test_all_functions.py → parser/test_general.py} +11 -61
  43. tests/requirements.txt +7 -2
  44. tox.ini +5 -2
.github/workflows/tests.yml CHANGED
@@ -7,15 +7,12 @@ concurrency:
7
 
8
  jobs:
9
  tests:
 
10
  runs-on: ${{ matrix.os }}
11
  strategy:
12
  fail-fast: false
13
  matrix:
14
  include:
15
- - python-version: "3.7"
16
- os: ubuntu-latest
17
- env:
18
- TOXENV: py
19
  - python-version: "3.8"
20
  os: ubuntu-latest
21
  env:
@@ -36,13 +33,42 @@ jobs:
36
  os: ubuntu-latest
37
  env:
38
  TOXENV: py
 
 
 
 
39
 
40
  steps:
41
  - uses: actions/checkout@v4
 
42
  - name: Set up Python ${{ matrix.python-version }}
43
  uses: actions/setup-python@v5
44
  with:
45
  python-version: ${{ matrix.python-version }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  - name: Run tests
48
  env: ${{ matrix.env }}
 
7
 
8
  jobs:
9
  tests:
10
+ timeout-minutes: 60
11
  runs-on: ${{ matrix.os }}
12
  strategy:
13
  fail-fast: false
14
  matrix:
15
  include:
 
 
 
 
16
  - python-version: "3.8"
17
  os: ubuntu-latest
18
  env:
 
33
  os: ubuntu-latest
34
  env:
35
  TOXENV: py
36
+ - python-version: "3.13"
37
+ os: ubuntu-latest
38
+ env:
39
+ TOXENV: py
40
 
41
  steps:
42
  - uses: actions/checkout@v4
43
+
44
  - name: Set up Python ${{ matrix.python-version }}
45
  uses: actions/setup-python@v5
46
  with:
47
  python-version: ${{ matrix.python-version }}
48
+ cache: 'pip'
49
+ cache-dependency-path: |
50
+ setup.py
51
+ requirements*.txt
52
+ tox.ini
53
+
54
+ - name: Install Camoufox Dependencies
55
+ run: |
56
+ python3 -m pip install --upgrade pip
57
+ python3 -m pip install playwright camoufox
58
+ python3 -m playwright install chromium
59
+ python3 -m playwright install-deps chromium firefox
60
+ python3 -m camoufox fetch --browserforge
61
+
62
+ # Cache tox environments
63
+ - name: Cache tox environments
64
+ uses: actions/cache@v3
65
+ with:
66
+ path: .tox
67
+ # Include python version and os in cache key
68
+ key: tox-v1-${{ runner.os }}-py${{ matrix.python-version }}-${{ hashFiles('tox.ini', 'setup.py', 'requirements*.txt') }}
69
+ restore-keys: |
70
+ tox-v1-${{ runner.os }}-py${{ matrix.python-version }}-
71
+ tox-v1-${{ runner.os }}-
72
 
73
  - name: Run tests
74
  env: ${{ matrix.env }}
.gitignore CHANGED
@@ -13,6 +13,8 @@ __pycache__/
13
  .bootstrap
14
  .appveyor.token
15
  *.bak
 
 
16
 
17
  # installation package
18
  *.egg-info/
 
13
  .bootstrap
14
  .appveyor.token
15
  *.bak
16
+ *.db
17
+ *.db-*
18
 
19
  # installation package
20
  *.egg-info/
CONTRIBUTING.md CHANGED
@@ -15,7 +15,7 @@ configfile: pytest.ini
15
  plugins: cov-5.0.0, anyio-4.6.0
16
  collected 16 items
17
 
18
- tests/test_all_functions.py ................ [100%]
19
 
20
  =============================== 16 passed in 0.22s ================================
21
  ```
@@ -27,4 +27,9 @@ Also, consider setting `debug` to `True` while initializing the Adaptor object s
27
  - Fork Scrapling [git repository](https://github.com/D4Vinci/Scrapling).
28
  - Make your changes.
29
  - Ensure tests work.
30
- - Create a Pull Request against the [**dev**](https://github.com/D4Vinci/Scrapling/tree/dev) branch of Scrapling.
 
 
 
 
 
 
15
  plugins: cov-5.0.0, anyio-4.6.0
16
  collected 16 items
17
 
18
+ tests/test_parser_functions.py ................ [100%]
19
 
20
  =============================== 16 passed in 0.22s ================================
21
  ```
 
27
  - Fork Scrapling [git repository](https://github.com/D4Vinci/Scrapling).
28
  - Make your changes.
29
  - Ensure tests work.
30
+ - Create a Pull Request against the [**dev**](https://github.com/D4Vinci/Scrapling/tree/dev) branch of Scrapling.
31
+
32
+ ### Installing the latest changes from the dev branch
33
+ ```commandline
34
+ pip3 install git+https://github.com/D4Vinci/Scrapling.git@dev
35
+ ```
MANIFEST.in CHANGED
@@ -1,6 +1,8 @@
1
  include LICENSE
2
  include *.db
 
3
  include scrapling/*.db
 
4
  include scrapling/py.typed
5
 
6
  recursive-exclude * __pycache__
 
1
  include LICENSE
2
  include *.db
3
+ include *.js
4
  include scrapling/*.db
5
+ include scrapling/*.db*
6
  include scrapling/py.typed
7
 
8
  recursive-exclude * __pycache__
README.md CHANGED
@@ -1,30 +1,77 @@
1
- # 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
2
  [![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![PyPI Downloads](https://static.pepy.tech/badge/scrapling)](https://pepy.tech/project/scrapling)
3
 
4
- Dealing with failing web scrapers due to website changes? Meet Scrapling.
5
 
6
- Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. Whether you're a beginner or an expert, Scrapling provides powerful features while maintaining simplicity.
7
 
8
  ```python
9
- from scrapling import Adaptor
10
-
11
- # Scrape data that survives website changes
12
- page = Adaptor(html, auto_match=True)
13
- products = page.css('.product', auto_save=True)
14
- # Later, even if selectors change:
15
- products = page.css('.product', auto_match=True) # Still finds them!
 
 
16
  ```
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ## Key Features
19
 
 
 
 
 
 
20
  ### Adaptive Scraping
21
  - 🔄 **Smart Element Tracking**: Locate previously identified elements after website structure changes, using an intelligent similarity system and integrated storage.
22
- - 🎯 **Flexible Querying**: Use CSS selectors, XPath, text search, or regex - chain them however you want!
23
  - 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you want on the page (Ex: other products like the product you found on the page).
24
- - 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using its powerful features.
25
 
26
  ### Performance
27
- - 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries (outperforming BeautifulSoup by up to 237x in our tests).
28
  - 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
29
  - ⚡ **Fast JSON serialization**: 10x faster JSON serialization than the standard json library with more options.
30
 
@@ -32,23 +79,18 @@ products = page.css('.product', auto_match=True) # Still finds them!
32
  - 🛠️ **Powerful Navigation API**: Traverse the DOM tree easily in all directions and get the info you want (parent, ancestors, sibling, children, next/previous element, and more).
33
  - 🧬 **Rich Text Processing**: All strings have built-in methods for regex matching, cleaning, and more. All elements' attributes are read-only dictionaries that are faster than standard dictionaries with added methods.
34
  - 📝 **Automatic Selector Generation**: Create robust CSS/XPath selectors for any element.
35
- - 🔌 **Scrapy-Compatible API**: Familiar methods and similar pseudo-elements for Scrapy users.
36
- - 📘 **Type hints**: Complete type coverage for better IDE support and fewer bugs.
37
 
38
  ## Getting Started
39
 
40
- Let's walk through a basic example that demonstrates a small group of Scrapling's core features:
41
-
42
  ```python
43
- import requests
44
- from scrapling import Adaptor
45
 
46
- # Fetch a web page
47
- url = 'https://quotes.toscrape.com/'
48
- response = requests.get(url)
49
 
50
- # Create an Adaptor instance
51
- page = Adaptor(response.text, url=url)
52
  # Get all strings in the full page
53
  page.get_all_text(ignore_tags=('script', 'style'))
54
 
@@ -56,10 +98,17 @@ page.get_all_text(ignore_tags=('script', 'style'))
56
  quotes = page.css('.quote .text::text') # CSS selector
57
  quotes = page.xpath('//span[@class="text"]/text()') # XPath
58
  quotes = page.css('.quote').css('.text::text') # Chained selectors
59
- quotes = [element.text for element in page.css('.quote').css('.text')] # Slower than bulk query above
60
 
61
  # Get the first quote element
62
- quote = page.css('.quote').first # or [0] or .get()
 
 
 
 
 
 
 
63
 
64
  # Working with elements
65
  quote.html_content # Inner HTML
@@ -67,19 +116,9 @@ quote.prettify() # Prettified version of Inner HTML
67
  quote.attrib # Element attributes
68
  quote.path # DOM path to element (List)
69
  ```
70
- To keep it simple, all methods can be chained on top of each other as long as you are chaining methods that return an element (It's called an `Adaptor` object) or a List of Adaptors (It's called `Adaptors` object)
71
-
72
- ### Installation
73
- Scrapling is a breeze to get started with - We only require at least Python 3.7 to work and the rest of the requirements are installed automatically with the package.
74
- ```bash
75
- # Using pip
76
- pip install scrapling
77
-
78
- # Or the latest from GitHub
79
- pip install git+https://github.com/D4Vinci/Scrapling.git@master
80
- ```
81
 
82
- ## Performance
83
 
84
  Scrapling isn't just powerful - it's also blazing fast. Scrapling implements many best practices, design patterns, and numerous optimizations to save fractions of seconds. All of that while focusing exclusively on parsing HTML documents.
85
  Here are benchmarks comparing Scrapling to popular Python libraries in two tests.
@@ -106,11 +145,150 @@ As you see, Scrapling is on par with Scrapy and slightly faster than Lxml which
106
  | Scrapling | 2.51 | 1.0x |
107
  | AutoScraper | 11.41 | 4.546x |
108
 
109
- Scrapling can find elements with more methods and it returns full element `Adaptor` objects not only the text like AutoScraper. So, to make this test fair, both libraries will extract an element with text, find similar elements, and then extract the text content for all of them. As you see, Scrapling is still 4.5 times faster at the same task.
110
 
111
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.
112
 
113
- ## Advanced Features
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  ### Smart Navigation
115
  ```python
116
  >>> quote.tag
@@ -130,24 +308,23 @@ Scrapling can find elements with more methods and it returns full element `Adapt
130
  >>> quote.siblings
131
  [<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
132
  <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
133
- <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
134
  ...]
135
 
136
  >>> quote.next # gets the next element, the same logic applies to `quote.previous`
137
  <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>
138
 
139
- >>> quote.children.css(".author::text")
140
- ['Albert Einstein']
141
 
142
  >>> quote.has_class('quote')
143
  True
144
 
145
  # Generate new selectors for any element
146
- >>> quote.css_selector
147
  'body > div > div:nth-of-type(2) > div > div'
148
 
149
- # Test these selectors on your favorite browser or reuse them again in the library in other methods!
150
- >>> quote.xpath_selector
151
  '//body/div/div[2]/div/div'
152
  ```
153
  If your case needs more than the element's parent, you can iterate over the whole ancestors' tree of any element like below
@@ -164,11 +341,9 @@ You can search for a specific ancestor of an element that satisfies a function,
164
  ### Content-based Selection & Finding Similar Elements
165
  You can select elements by their text content in multiple ways, here's a full example on another website:
166
  ```python
167
- >>> response = requests.get('https://books.toscrape.com/index.html')
168
-
169
- >>> page = Adaptor(response.text, url=response.url)
170
 
171
- >>> page.find_by_text('Tipping the Velvet') # Find the first element that its text fully matches this text
172
  <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
173
 
174
  >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
@@ -208,8 +383,8 @@ To increase the complexity a little bit, let's say we want to get all books' dat
208
  ```python
209
  >>> for product in page.find_by_text('Tipping the Velvet').parent.parent.find_similar():
210
  print({
211
- "name": product.css('h3 a::text')[0],
212
- "price": product.css('.price_color')[0].re_first(r'[\d\.]+'),
213
  "stock": product.css('.availability::text')[-1].clean()
214
  })
215
  {'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}
@@ -220,8 +395,6 @@ To increase the complexity a little bit, let's say we want to get all books' dat
220
  The [documentation](https://github.com/D4Vinci/Scrapling/tree/main/docs/Examples) will provide more advanced examples.
221
 
222
  ### Handling Structural Changes
223
- > Because [the internet archive](https://web.archive.org/) is down at the time of writing this, I can't use real websites as examples even though I tested that before (I mean browsing an old version of a website and then counting the current version of the website as structural changes)
224
-
225
  Let's say you are scraping a page with a structure like this:
226
  ```html
227
  <div class="container">
@@ -237,7 +410,7 @@ Let's say you are scraping a page with a structure like this:
237
  </section>
238
  </div>
239
  ```
240
- and you want to scrape the first product, the one with the `p1` ID. You will probably write a selector like this
241
  ```python
242
  page.css('#p1')
243
  ```
@@ -262,34 +435,147 @@ When website owners implement structural changes like
262
  </div>
263
  </div>
264
  ```
265
- The selector will no longer function and your code needs maintenance. That's where Scrapling auto-matching feature comes into play.
266
 
267
  ```python
 
268
  # Before the change
269
- page = Adaptor(page_source, url='example.com', auto_match=True)
270
element = page.css('#p1', auto_save=True)
271
  if not element: # One day website changes?
272
- element = page.css('#p1', auto_match=True) # Still finds it!
273
  # the rest of the code...
274
  ```
275
- > How does the auto-matching work? Check the [FAQs](#FAQs) section for that and other possible issues while auto-matching.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
  **Notes:**
278
- 1. Passing the `auto_save` argument without setting `auto_match` to `True` while initializing the Adaptor object will only result in ignoring the `auto_save` argument value and the following warning message
 
279
  ```text
280
  Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.
281
  ```
282
  This behavior is purely for performance reasons so the database gets created/connected only when you are planning to use the auto-matching features. Same case with the `auto_match` argument.
283
 
284
- 2. The `auto_match` parameter works only for `Adaptor` instances not `Adaptors` so if you do something like this you will get an error
285
  ```python
286
  page.css('body').css('#p1', auto_match=True)
287
  ```
288
  because you can't auto-match a whole list, you have to be specific and do something like
289
  ```python
290
- page.css('body')[0].css('#p1', auto_match=True)
291
  ```
292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  ### Is That All?
294
  Here's what else you can do with Scrapling:
295
 
@@ -300,12 +586,12 @@ Here's what else you can do with Scrapling:
300
  ```
301
  - Saving and retrieving elements manually to auto-match them outside the `css` and the `xpath` methods but you have to set the identifier by yourself.
302
 
303
- - To save element to the database:
304
  ```python
305
  >>> element = page.find_by_text('Tipping the Velvet', first_match=True)
306
  >>> page.save(element, 'my_special_element')
307
  ```
308
- - Now later when you want to retrieve it and relocate it in the page with auto-matching, it would be like this
309
  ```python
310
  >>> element_dict = page.retrieve('my_special_element')
311
  >>> page.relocate(element_dict, adaptor_type=True)
@@ -319,13 +605,38 @@ Here's what else you can do with Scrapling:
319
  [<Element a at 0x105a2a7b0>]
320
  ```
321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  - Doing operations on element content is the same as scrapy
323
  ```python
324
- quote.re(r'somethings') # Get all strings (TextHandlers) that match the regex pattern
325
- quote.re_first(r'something') # Get the first string (TextHandler) only
326
  quote.json() # If the content text is jsonable, then convert it to json using `orjson` which is 10x faster than the standard json library and provides more options
327
  ```
328
- Note that all of these methods are actually methods of the `TextHandler` object that contains the text content, so the same operations can be done directly if you call the `.text` property or an equivalent selector function.
 
 
 
 
 
 
 
 
 
329
 
330
 
331
  - Doing operations on the text content itself includes
@@ -339,11 +650,11 @@ Here's what else you can do with Scrapling:
339
  ```
340
  - Sort all characters in the string as if it were a list and return the new string
341
  ```python
342
- quote.sort()
343
  ```
344
  > To be clear, `TextHandler` is a sub-class of Python's `str` so all normal operations/methods that work with Python strings will work with it.
345
 
346
- - Any element's attributes are not exactly a dictionary but a sub-class of [mapping](https://docs.python.org/3/glossary.html#term-mapping) called `AttributesHandler` that's read-only so it's faster and string values returned are actually `TextHandler` objects so all operations above can be done on them, standard dictionary operations that doesn't modify the data, and more :)
347
  - Unlike standard dictionaries, here you can search by values too and can do partial searches. It might be handy in some cases (returns a generator of matches)
348
  ```python
349
  >>> for item in element.attrib.search_values('catalogue', partial=True):
@@ -370,8 +681,9 @@ There are a lot of deep details skipped here to make this as short as possible s
370
 
371
  Note that implementing your storage system can be complex as there are some strict rules such as inheriting from the same abstract class, following the singleton design pattern used in other classes, and more. So make sure to read the docs first.
372
 
 
373
 
374
- ## FAQs
375
  This section addresses common questions about Scrapling, please read this section before opening an issue.
376
 
377
  ### How does auto-matching work?
@@ -384,7 +696,7 @@ This section addresses common questions about Scrapling, please read this sectio
384
  Together both are used to retrieve the element's unique properties from the database later.
385
  4. Now later when you enable the `auto_match` parameter for both the Adaptor instance and the method call. The element properties are retrieved and Scrapling loops over all elements in the page and compares each one's unique properties to the unique properties we already have for this element and a score is calculated for each one.
386
  5. The comparison between elements is not exact but more about finding how similar these values are, so everything is taken into consideration even the values' order like the order in which the element class names were written before and the order in which the same element class names are written now.
387
- 6. The score for each element is stored in the table and in the end, the element(s) with the highest combined similarity scores are returned.
388
 
389
  ### How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?
390
  Not a big problem as it depends on your usage. The word `default` will be used in place of the URL field while saving the element's unique properties. So this will only be an issue if you used the same identifier later for a different website that you didn't pass the URL parameter while initializing it as well. The save process will overwrite the previous data and auto-matching uses the latest saved properties only.
@@ -413,7 +725,7 @@ Pretty much yeah, almost all features you get from BeautifulSoup can be found or
413
  Of course, you can find elements by text/regex, find similar elements in a more reliable way than AutoScraper, and finally save/retrieve elements manually to use later as the model feature in AutoScraper. I have pulled all top articles about AutoScraper from Google and tested Scrapling against examples in them. In all examples, Scrapling got the same results as AutoScraper in much less time.
414
 
415
  ### Is Scrapling thread-safe?
416
- Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
417
 
418
  ## Sponsors
419
  [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
@@ -423,6 +735,10 @@ Everybody is invited and welcome to contribute to Scrapling. There is a lot to d
423
 
424
  Please read the [contributing file](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before doing anything.
425
 
 
 
 
 
426
  ## License
427
  This work is licensed under BSD-3
428
 
@@ -430,8 +746,16 @@ This work is licensed under BSD-3
430
  This project includes code adapted from:
431
  - Parsel (BSD License) - Used for [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/translator.py) submodule
432
 
 
 
 
 
 
 
 
433
  ## Known Issues
434
  - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
435
  - Currently, Scrapling is not compatible with async/await.
436
 
437
- <div align="center"><small>Made with ❤️ by Karim Shoair</small></div><br>
 
 
1
+ # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
2
  [![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![PyPI Downloads](https://static.pepy.tech/badge/scrapling)](https://pepy.tech/project/scrapling)
3
 
4
+ Dealing with failing web scrapers due to anti-bot protections or website changes? Meet Scrapling.
5
 
6
+ Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
7
 
8
  ```python
9
+ >> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
10
+ # Fetch websites' source under the radar!
11
+ >> fetcher = StealthyFetcher().fetch('https://example.com', headless=True, disable_resources=True)
12
+ >> print(fetcher.status)
13
+ 200
14
+ >> page = fetcher.adaptor
15
+ >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
16
+ >> # Later, if the website structure changes, pass `auto_match=True`
17
+ >> products = page.css('.product', auto_match=True) # and Scrapling still finds them!
18
  ```
19
 
20
+ ## Table of content
21
+ * [Key Features](#key-features)
22
+ * [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
23
+ * [Adaptive Scraping](#adaptive-scraping)
24
+ * [Performance](#performance)
25
+ * [Developing Experience](#developing-experience)
26
+ * [Getting Started](#getting-started)
27
+ * [Parsing Performance](#parsing-performance)
28
+ * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
29
+ * [Extraction By Text Speed Test](#extraction-by-text-speed-test)
30
+ * [Installation](#installation)
31
+ * [Fetching Websites Features](#fetching-websites-features)
32
+ * [Fetcher](#fetcher)
33
+ * [StealthyFetcher](#stealthyfetcher)
34
+ * [PlayWrightFetcher](#playwrightfetcher)
35
+ * [Advanced Parsing Features](#advanced-parsing-features)
36
+ * [Smart Navigation](#smart-navigation)
37
+ * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
38
+ * [Handling Structural Changes](#handling-structural-changes)
39
+ * [Real World Scenario](#real-world-scenario)
40
+ * [Find elements by filters](#find-elements-by-filters)
41
+ * [Is That All?](#is-that-all)
42
+ * [More Advanced Usage](#more-advanced-usage)
43
+ * [⚡ Enlightening Questions and FAQs](#-enlightening-questions-and-faqs)
44
+ * [How does auto-matching work?](#how-does-auto-matching-work)
45
+ * [How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?](#how-does-the-auto-matching-work-if-i-didnt-pass-a-url-while-initializing-the-adaptor-object)
46
+ * [If all things about an element can change or get removed, what are the unique properties to be saved?](#if-all-things-about-an-element-can-change-or-get-removed-what-are-the-unique-properties-to-be-saved)
47
+ * [I have enabled the `auto_save`/`auto_match` parameter while selecting and it got completely ignored with a warning message](#i-have-enabled-the-auto_saveauto_match-parameter-while-selecting-and-it-got-completely-ignored-with-a-warning-message)
48
+ * [I have done everything as the docs but the auto-matching didn't return anything, what's wrong?](#i-have-done-everything-as-the-docs-but-the-auto-matching-didnt-return-anything-whats-wrong)
49
+ * [Can Scrapling replace code built on top of BeautifulSoup4?](#can-scrapling-replace-code-built-on-top-of-beautifulsoup4)
50
+ * [Can Scrapling replace code built on top of AutoScraper?](#can-scrapling-replace-code-built-on-top-of-autoscraper)
51
+ * [Is Scrapling thread-safe?](#is-scrapling-thread-safe)
52
+ * [Sponsors](#sponsors)
53
+ * [Contributing](#contributing)
54
+ * [Disclaimer for Scrapling Project](#disclaimer-for-scrapling-project)
55
+ * [License](#license)
56
+ * [Acknowledgments](#acknowledgments)
57
+ * [Thanks and References](#thanks-and-references)
58
+ * [Known Issues](#known-issues)
59
+
60
  ## Key Features
61
 
62
+ ### Fetch websites as you prefer
63
+ - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
64
+ - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
65
+ - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
66
+
67
  ### Adaptive Scraping
68
  - 🔄 **Smart Element Tracking**: Locate previously identified elements after website structure changes, using an intelligent similarity system and integrated storage.
69
+ - 🎯 **Flexible Querying**: Use CSS selectors, XPath, Elements filters, text search, or regex - chain them however you want!
70
  - 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you want on the page (Ex: other products like the product you found on the page).
71
+ - 🧠 **Smart Content Scraping**: Extract data from multiple websites without specific selectors using Scrapling's powerful features.
72
 
73
  ### Performance
74
+ - 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries (outperforming BeautifulSoup in parsing by up to 620x in our tests).
75
  - 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
76
  - ⚡ **Fast JSON serialization**: 10x faster JSON serialization than the standard json library with more options.
77
 
 
79
  - 🛠️ **Powerful Navigation API**: Traverse the DOM tree easily in all directions and get the info you want (parent, ancestors, sibling, children, next/previous element, and more).
80
  - 🧬 **Rich Text Processing**: All strings have built-in methods for regex matching, cleaning, and more. All elements' attributes are read-only dictionaries that are faster than standard dictionaries with added methods.
81
  - 📝 **Automatic Selector Generation**: Create robust CSS/XPath selectors for any element.
82
+ - 🔌 **API Similar to Scrapy/BeautifulSoup**: Familiar methods and similar pseudo-elements for Scrapy and BeautifulSoup users.
83
+ - 📘 **Type hints and test coverage**: Complete type coverage and almost full test coverage for better IDE support and fewer bugs, respectively.
84
 
85
  ## Getting Started
86
 
 
 
87
  ```python
88
+ from scrapling import Fetcher
 
89
 
90
+ fetcher = Fetcher(auto_match=False)
 
 
91
 
92
+ # Fetch a web page and create an Adaptor instance
93
+ page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True).adaptor
94
  # Get all strings in the full page
95
  page.get_all_text(ignore_tags=('script', 'style'))
96
 
 
98
  quotes = page.css('.quote .text::text') # CSS selector
99
  quotes = page.xpath('//span[@class="text"]/text()') # XPath
100
  quotes = page.css('.quote').css('.text::text') # Chained selectors
101
+ quotes = [element.text for element in page.css('.quote .text')] # Slower than bulk query above
102
 
103
  # Get the first quote element
104
+ quote = page.css_first('.quote') # / page.css('.quote').first / page.css('.quote')[0]
105
+
106
+ # Tired of selectors? Use find_all/find
107
+ quotes = page.find_all('div', {'class': 'quote'})
108
+ # Same as
109
+ quotes = page.find_all('div', class_='quote')
110
+ quotes = page.find_all(['div'], class_='quote')
111
+ quotes = page.find_all(class_='quote') # and so on...
112
 
113
  # Working with elements
114
  quote.html_content # Inner HTML
 
116
  quote.attrib # Element attributes
117
  quote.path # DOM path to element (List)
118
  ```
119
+ To keep it simple, all methods can be chained on top of each other!
 
 
 
 
 
 
 
 
 
 
120
 
121
+ ## Parsing Performance
122
 
123
  Scrapling isn't just powerful - it's also blazing fast. Scrapling implements many best practices, design patterns, and numerous optimizations to save fractions of seconds. All of that while focusing exclusively on parsing HTML documents.
124
  Here are benchmarks comparing Scrapling to popular Python libraries in two tests.
 
145
  | Scrapling | 2.51 | 1.0x |
146
  | AutoScraper | 11.41 | 4.546x |
147
 
148
+ Scrapling can find elements with more methods and it returns full element `Adaptor` objects not only the text like AutoScraper. So, to make this test fair, both libraries will extract an element with text, find similar elements, and then extract the text content for all of them. As you see, Scrapling is still 4.5 times faster at the same task.
149
 
150
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.
151
 
152
+ ## Installation
153
+ Scrapling is a breeze to get started with - Starting from version 0.2, we require at least Python 3.8 to work.
154
+ ```bash
155
+ pip3 install scrapling
156
+ ```
157
+ - For using the `StealthyFetcher`, go to the command line and download the browser with
158
+ <details><summary>Windows OS</summary>
159
+
160
+ ```bash
161
+ camoufox fetch --browserforge
162
+ ```
163
+ </details>
164
+ <details><summary>MacOS</summary>
165
+
166
+ ```bash
167
+ python3 -m camoufox fetch --browserforge
168
+ ```
169
+ </details>
170
+ <details><summary>Linux</summary>
171
+
172
+ ```bash
173
+ python -m camoufox fetch --browserforge
174
+ ```
175
+ On a fresh installation of Linux, you may also need the following Firefox dependencies:
176
+ - Debian-based distros
177
+ ```bash
178
+ sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
179
+ ```
180
+ - Arch-based distros
181
+ ```bash
182
+ sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
183
+ ```
184
+ </details>
185
+
186
+ <small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
187
+
188
+ - If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
189
+ ```commandline
190
+ playwright install chromium
191
+ ```
192
+ - If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
193
+ ```commandline
194
+ python -m browserforge update
195
+ ```
196
+
197
+ ## Fetching Websites Features
198
+ All fetcher-type classes are imported in the same way
199
+ ```python
200
+ from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
201
+ ```
202
+ And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
203
+ > [!NOTE]
204
+ > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
205
+ ### Fetcher
206
+ This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.
207
+
208
+ For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
209
+ ```python
210
+ >> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
211
+ >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
212
+ >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
213
+ >> page = Fetcher().delete('https://httpbin.org/delete')
214
+ ```
215
+ ### StealthyFetcher
216
+ This class is built on top of [Camoufox](https://github.com/daijro/camoufox) which by default bypasses most of the anti-bot protections. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
217
+ ```python
218
+ >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
219
+ >> page.status == 200
220
+ True
221
+ ```
222
+ <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
223
+
224
+ | Argument | Description | Optional |
225
+ |:-------------------:|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:|
226
+ | url | Target url | ❌ |
227
+ | headless | Pass `True` to run the browser in headless/hidden (**default**), `virtual` to run it in virtual screen mode, or `False` for headful/visible mode. The `virtual` mode requires having `xvfb` installed. | ✔️ |
228
+ | block_images | Prevent the loading of images through Firefox preferences. _This can help save your proxy usage but be careful with this option as it makes some websites never finish loading._ | ✔️ |
229
+ | disable_resources | Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.<br/>Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. _This can help save your proxy usage but be careful with this option as it makes some websites never finish loading._ | ✔️ |
230
+ | google_search | Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name. | ✔️ |
231
+ | extra_headers | A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._ | ✔️ |
232
+ | block_webrtc | Blocks WebRTC entirely. | ✔️ |
233
+ | page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
234
+ | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
235
+ | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
236
+ | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
237
+ | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
238
+ | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
239
+ | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
240
+ | wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
241
+
242
+ </details>
243
+
244
+ This list isn't final so expect a lot more additions and flexibility to be added in the next versions!
245
+
246
+ ### PlayWrightFetcher
247
+ This class is built on top of [Playwright](https://playwright.dev/python/) which currently provides 4 main run options but they can be mixed as you want.
248
+ ```python
249
+ >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
250
+ >> page.adaptor.css_first("#search a::attr(href)")
251
+ 'https://github.com/D4Vinci/Scrapling'
252
+ ```
253
+ Using this Fetcher class, you can make requests with:
254
+ 1) Vanilla Playwright without any modifications other than the ones you chose.
255
+ 2) Stealthy Playwright with the stealth mode I wrote for it. It's still a WIP but it bypasses many online tests like [Sannysoft's](https://bot.sannysoft.com/).</br> Some of the things this fetcher's stealth mode does include:
256
+ * Patching the CDP runtime fingerprint.
257
+ * Mimics some of the real browsers' properties by injecting several JS files and using custom options.
258
+ * Using custom flags on launch to hide Playwright even more and make it faster.
259
+     * Generates real browser's headers of the same browser type and user OS, then appends them to the request's headers.
260
+ 3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
261
+ 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
262
+
263
+ Add that to a lot of controlling/hiding options as you will see in the arguments list below.
264
+
265
+ <details><summary><strong>Expand this for the complete list of arguments</strong></summary>
266
+
267
+ | Argument | Description | Optional |
268
+ |:-------------------:|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:|
269
+ | url | Target url | ❌ |
270
+ | headless | Pass `True` to run the browser in headless/hidden (**default**), or `False` for headful/visible mode. | ✔️ |
271
+ | disable_resources | Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.<br/>Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. _This can help save your proxy usage but be careful with this option as it makes some websites never finish loading._ | ✔️ |
272
+ | useragent | Pass a useragent string to be used. **Otherwise the fetcher will generate a real Useragent of the same browser and use it.** | ✔️ |
273
+ | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
274
+ | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
275
+ | page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
276
+ | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
277
+ | wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
278
+ | google_search | Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name. | ✔️ |
279
+ | extra_headers | A dictionary of extra headers to add to the request. The referer set by the `google_search` argument takes priority over the referer set here if used together. | ✔️ |
280
+ | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
281
+ | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
282
+ | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
283
+ | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
284
+ | nstbrowser_mode | Enables NSTBrowser mode, **it has to be used with the `cdp_url` argument or it will get completely ignored.** | ✔️ |
285
+ | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
286
+
287
+ </details>
288
+
289
+ This list isn't final so expect a lot more additions and flexibility to be added in the next versions!
290
+
291
+ ## Advanced Parsing Features
292
  ### Smart Navigation
293
  ```python
294
  >>> quote.tag
 
308
  >>> quote.siblings
309
  [<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
310
  <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
 
311
  ...]
312
 
313
  >>> quote.next # gets the next element, the same logic applies to `quote.previous`
314
  <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>
315
 
316
+ >>> quote.children.css_first(".author::text")
317
+ 'Albert Einstein'
318
 
319
  >>> quote.has_class('quote')
320
  True
321
 
322
  # Generate new selectors for any element
323
+ >>> quote.generate_css_selector
324
  'body > div > div:nth-of-type(2) > div > div'
325
 
326
+ # Test these selectors on your favorite browser or reuse them again in the library's methods!
327
+ >>> quote.generate_xpath_selector
328
  '//body/div/div[2]/div/div'
329
  ```
330
  If your case needs more than the element's parent, you can iterate over the whole ancestors' tree of any element like below
 
341
  ### Content-based Selection & Finding Similar Elements
342
  You can select elements by their text content in multiple ways, here's a full example on another website:
343
  ```python
344
+ >>> page = Fetcher().get('https://books.toscrape.com/index.html').adaptor
 
 
345
 
346
+ >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
347
  <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
348
 
349
  >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
 
383
  ```python
384
  >>> for product in page.find_by_text('Tipping the Velvet').parent.parent.find_similar():
385
  print({
386
+ "name": product.css_first('h3 a::text'),
387
+ "price": product.css_first('.price_color').re_first(r'[\d\.]+'),
388
  "stock": product.css('.availability::text')[-1].clean()
389
  })
390
  {'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}
 
395
  The [documentation](https://github.com/D4Vinci/Scrapling/tree/main/docs/Examples) will provide more advanced examples.
396
 
397
  ### Handling Structural Changes
 
 
398
  Let's say you are scraping a page with a structure like this:
399
  ```html
400
  <div class="container">
 
410
  </section>
411
  </div>
412
  ```
413
+ And you want to scrape the first product, the one with the `p1` ID. You will probably write a selector like this
414
  ```python
415
  page.css('#p1')
416
  ```
 
435
  </div>
436
  </div>
437
  ```
438
+ The selector will no longer function and your code needs maintenance. That's where Scrapling's auto-matching feature comes into play.
439
 
440
  ```python
441
+ from scrapling import Adaptor
442
  # Before the change
443
+ page = Adaptor(page_source, url='example.com')
444
  element = page.css('#p1', auto_save=True)
445
  if not element: # One day website changes?
446
+ element = page.css('#p1', auto_match=True) # Scrapling still finds it!
447
  # the rest of the code...
448
  ```
449
+ > How does the auto-matching work? Check the [FAQs](#-enlightening-questions-and-faqs) section for that and other possible issues while auto-matching.
450
+
451
+ #### Real-World Scenario
452
+ Let's use a real website as an example and use one of the fetchers to fetch its source. To do this we need to find a website that will change its design/structure soon, take a copy of its source then wait for the website to make the change. Of course, that's nearly impossible to know unless I know the website's owner but that will make it a staged test haha.
453
+
454
+ To solve this issue, I will use [The Web Archive](https://archive.org/)'s [Wayback Machine](https://web.archive.org/). Here is a copy of [StackOverFlow's website in 2010](https://web.archive.org/web/20100102003420/http://stackoverflow.com/), pretty old huh?</br>Let's test if the automatch feature can extract the same button in the old design from 2010 and the current design using the same selector :)
455
+
456
+ If I want to extract the Questions button from the old design I can use a selector like this `#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a` This selector is too specific because it was generated by Google Chrome.
457
+ Now let's test the same selector in both versions
458
+ ```python
459
+ >> from scrapling import Fetcher
460
+ >> selector = '#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a'
461
+ >> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
462
+ >> new_url = "https://stackoverflow.com/"
463
+ >>
464
+ >> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30).adaptor
465
+ >> element1 = page.css_first(selector, auto_save=True)
466
+ >>
467
+ >> # Same selector but used in the updated website
468
+ >> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url).adaptor
469
+ >> element2 = page.css_first(selector, auto_match=True)
470
+ >>
471
+ >> if element1.text == element2.text:
472
+ ... print('Scrapling found the same element in the old design and the new design!')
473
+ 'Scrapling found the same element in the old design and the new design!'
474
+ ```
475
+ Note that I used a new argument called `automatch_domain`. This is because, to Scrapling, these are two different URLs and hence two different websites, so it isolates their data. To tell Scrapling they are the same website, we then pass the domain we want to use for saving auto-match data for them both so Scrapling doesn't isolate them.
476
+
477
+ In a real-world scenario, the code will be the same except it will use the same URL for both requests so you won't need to use the `automatch_domain` argument. This is the closest example I can give to real-world cases so I hope it didn't confuse you :)
478
 
479
  **Notes:**
480
+ 1. For the two examples above I used one time the `Adaptor` class and the second time the `Fetcher` class just to show you that you can create the `Adaptor` object by yourself if you have the source or fetch the source using any `Fetcher` class then it will create the `Adaptor` object for you on the `.adaptor` property.
481
+ 2. Passing the `auto_save` argument with the `auto_match` argument set to `False` while initializing the Adaptor/Fetcher object will only result in ignoring the `auto_save` argument value and the following warning message
482
  ```text
483
  Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.
484
  ```
485
  This behavior is purely for performance reasons so the database gets created/connected only when you are planning to use the auto-matching features. Same case with the `auto_match` argument.
486
 
487
+ 3. The `auto_match` parameter works only for `Adaptor` instances not `Adaptors` so if you do something like this you will get an error
488
  ```python
489
  page.css('body').css('#p1', auto_match=True)
490
  ```
491
  because you can't auto-match a whole list, you have to be specific and do something like
492
  ```python
493
+ page.css_first('body').css('#p1', auto_match=True)
494
  ```
495
 
496
+ ### Find elements by filters
497
+ Inspired by BeautifulSoup's `find_all` function you can find elements by using `find_all`/`find` methods. Both methods can take multiple types of filters and return all elements in the page that match all of these filters.
498
+
499
+ * To be more specific:
500
+ * Any string passed is considered a tag name
501
+ * Any iterable passed like List/Tuple/Set is considered an iterable of tag names.
502
+ * Any dictionary is considered a mapping of HTML element(s) attribute names and attribute values.
503
+ * Any regex patterns passed are used as filters
504
+ * Any functions passed are used as filters
505
+ * Any keyword argument passed is considered as an HTML element attribute with its value.
506
+
507
+ So the way it works is after collecting all passed arguments and keywords, each filter passes its results to the following filter in a waterfall-like filtering system.
508
+ <br/>It filters all elements in the current page/element in the following order:
509
+
510
+ 1. All elements with the passed tag name(s).
511
+ 2. All elements that match all passed attribute(s).
512
+ 3. All elements that match all passed regex patterns.
513
+ 4. All elements that fulfill all passed function(s).
514
+
515
+ Note: The filtering process always starts from the first filter it finds in the filtering order above so if no tag name(s) are passed but attributes are passed, the process starts from that layer and so on. **But the order in which you pass the arguments doesn't matter.**
516
+
517
+ Examples to clear any confusion :)
518
+
519
+ ```python
520
+ >> from scrapling import Fetcher
521
+ >> page = Fetcher().get('https://quotes.toscrape.com/').adaptor
522
+ # Find all elements with tag name `div`.
523
+ >> page.find_all('div')
524
+ [<data='<div class="container"> <div class="row...' parent='<body> <div class="container"> <div clas...'>,
525
+ <data='<div class="row header-box"> <div class=...' parent='<div class="container"> <div class="row...'>,
526
+ ...]
527
+
528
+ # Find all div elements with a class that equals `quote`.
529
+ >> page.find_all('div', class_='quote')
530
+ [<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
531
+ <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
532
+ ...]
533
+
534
+ # Same as above.
535
+ >> page.find_all('div', {'class': 'quote'})
536
+ [<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
537
+ <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
538
+ ...]
539
+
540
+ # Find all elements with a class that equals `quote`.
541
+ >> page.find_all({'class': 'quote'})
542
+ [<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
543
+ <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
544
+ ...]
545
+
546
+ # Find all div elements with a class that equals `quote`, and contains the element `.text` which contains the word 'world' in its content.
547
+ >> page.find_all('div', {'class': 'quote'}, lambda e: "world" in e.css_first('.text::text'))
548
+ [<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>]
549
+
550
+ # Find all elements that have children.
551
+ >> page.find_all(lambda element: len(element.children) > 0)
552
+ [<data='<html lang="en"><head><meta charset="UTF...'>,
553
+ <data='<head><meta charset="UTF-8"><title>Quote...' parent='<html lang="en"><head><meta charset="UTF...'>,
554
+ <data='<body> <div class="container"> <div clas...' parent='<html lang="en"><head><meta charset="UTF...'>,
555
+ ...]
556
+
557
+ # Find all elements that contain the word 'world' in their content.
558
+ >> page.find_all(lambda element: "world" in element.text)
559
+ [<data='<span class="text" itemprop="text">“The...' parent='<div class="quote" itemscope itemtype="h...'>,
560
+ <data='<a class="tag" href="/tag/world/page/1/"...' parent='<div class="tags"> Tags: <meta class="ke...'>]
561
+
562
+ # Find all span elements that match the given regex
563
+ >> page.find_all('span', re.compile(r'world'))
564
+ [<data='<span class="text" itemprop="text">“The...' parent='<div class="quote" itemscope itemtype="h...'>]
565
+
566
+ # Find all div and span elements with class 'quote' (No span elements like that so only div returned)
567
+ >> page.find_all(['div', 'span'], {'class': 'quote'})
568
+ [<data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
569
+ <data='<div class="quote" itemscope itemtype="h...' parent='<div class="col-md-8"> <div class="quote...'>,
570
+ ...]
571
+
572
+ # Mix things up
573
+ >> page.find_all({'itemtype':"http://schema.org/CreativeWork"}, 'div').css('.author::text')
574
+ ['Albert Einstein',
575
+ 'J.K. Rowling',
576
+ ...]
577
+ ```
578
+
579
  ### Is That All?
580
  Here's what else you can do with Scrapling:
581
 
 
586
  ```
587
  - Saving and retrieving elements manually to auto-match them outside the `css` and the `xpath` methods but you have to set the identifier by yourself.
588
 
589
+ - To save an element to the database:
590
  ```python
591
  >>> element = page.find_by_text('Tipping the Velvet', first_match=True)
592
  >>> page.save(element, 'my_special_element')
593
  ```
594
+ - Now later when you want to retrieve it and relocate it inside the page with auto-matching, it would be like this
595
  ```python
596
  >>> element_dict = page.retrieve('my_special_element')
597
  >>> page.relocate(element_dict, adaptor_type=True)
 
605
  [<Element a at 0x105a2a7b0>]
606
  ```
607
 
608
+ - Filtering results based on a function
609
+ ```python
610
+ # Find all products over $50
611
+ expensive_products = page.css('.product_pod').filter(
612
+ lambda p: float(p.css('.price_color').re_first(r'[\d\.]+')) > 50
613
+ )
614
+ ```
615
+
616
+ - Searching results for the first one that matches a function
617
+ ```python
618
+ # Find the first product with a price of '54.23'
619
+ page.css('.product_pod').search(
620
+ lambda p: float(p.css('.price_color').re_first(r'[\d\.]+')) == 54.23
621
+ )
622
+ ```
623
+
624
  - Doing operations on element content is the same as scrapy
625
  ```python
626
+ quote.re(r'regex_pattern') # Get all strings (TextHandlers) that match the regex pattern
627
+ quote.re_first(r'regex_pattern') # Get the first string (TextHandler) only
628
  quote.json() # If the content text is jsonable, then convert it to json using `orjson` which is 10x faster than the standard json library and provides more options
629
  ```
630
+ except that you can do more with them like
631
+ ```python
632
+ quote.re(
633
+ r'regex_pattern',
634
+ replace_entities=True, # Character entity references are replaced by their corresponding character
635
+ clean_match=True, # This will ignore all whitespaces and consecutive spaces while matching
636
+ case_sensitive= False, # Set the regex to ignore letters case while compiling it
637
+ )
638
+ ```
639
+ All of these methods are provided by the `TextHandler` class that holds the text content, so the same operations can be done directly if you call the `.text` property or an equivalent selector function.
640
 
641
 
642
  - Doing operations on the text content itself includes
 
650
  ```
651
  - Sort all characters in the string as if it were a list and return the new string
652
  ```python
653
+ quote.sort(reverse=False)
654
  ```
655
  > To be clear, `TextHandler` is a sub-class of Python's `str` so all normal operations/methods that work with Python strings will work with it.
656
 
657
+ - Any element's attributes are not exactly a dictionary but a sub-class of [mapping](https://docs.python.org/3/glossary.html#term-mapping) called `AttributesHandler` that's read-only so it's faster and string values returned are actually `TextHandler` objects so all operations above can be done on them, standard dictionary operations that don't modify the data, and more :)
658
  - Unlike standard dictionaries, here you can search by values too and can do partial searches. It might be handy in some cases (returns a generator of matches)
659
  ```python
660
  >>> for item in element.attrib.search_values('catalogue', partial=True):
 
681
 
682
  Note that implementing your storage system can be complex as there are some strict rules such as inheriting from the same abstract class, following the singleton design pattern used in other classes, and more. So make sure to read the docs first.
683
 
684
+ To give detailed documentation of the library, it will need a website. I'm trying to rush creating the website, researching new ideas, and adding more features/tests/benchmarks but time is tight with too many spinning plates between work, personal life, and working on Scrapling. But you can help by using the [sponsor button](https://github.com/sponsors/D4Vinci) above :)
685
 
686
+ ## ⚡ Enlightening Questions and FAQs
687
  This section addresses common questions about Scrapling, please read this section before opening an issue.
688
 
689
  ### How does auto-matching work?
 
696
  Together both are used to retrieve the element's unique properties from the database later.
697
  4. Now later when you enable the `auto_match` parameter for both the Adaptor instance and the method call. The element properties are retrieved and Scrapling loops over all elements in the page and compares each one's unique properties to the unique properties we already have for this element and a score is calculated for each one.
698
  5. The comparison between elements is not exact but more about finding how similar these values are, so everything is taken into consideration even the values' order like the order in which the element class names were written before and the order in which the same element class names are written now.
699
+ 6. The score for each element is stored in the table, and in the end, the element(s) with the highest combined similarity scores are returned.
700
 
701
  ### How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?
702
  Not a big problem as it depends on your usage. The word `default` will be used in place of the URL field while saving the element's unique properties. So this will only be an issue if you used the same identifier later for a different website that you didn't pass the URL parameter while initializing it as well. The save process will overwrite the previous data and auto-matching uses the latest saved properties only.
 
725
  Of course, you can find elements by text/regex, find similar elements in a more reliable way than AutoScraper, and finally save/retrieve elements manually to use later as the model feature in AutoScraper. I have pulled all top articles about AutoScraper from Google and tested Scrapling against examples in them. In all examples, Scrapling got the same results as AutoScraper in much less time.
726
 
727
  ### Is Scrapling thread-safe?
728
+ Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
729
 
730
  ## Sponsors
731
  [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
 
735
 
736
  Please read the [contributing file](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before doing anything.
737
 
738
+ ## Disclaimer for Scrapling Project
739
+ > [!CAUTION]
740
+ > This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international laws regarding data scraping and privacy. The authors and contributors are not responsible for any misuse of this software. This library should not be used to violate the rights of others, for unethical purposes, or to use data in an unauthorized or illegal manner. Do not use it on any website unless you have permission from the website owner or within their allowed rules like the `robots.txt` file, for example.
741
+
742
  ## License
743
  This work is licensed under BSD-3
744
 
 
746
  This project includes code adapted from:
747
  - Parsel (BSD License) - Used for [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/translator.py) submodule
748
 
749
+ ## Thanks and References
750
+ - [Daijro](https://github.com/daijro)'s brilliant work on both [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
751
+ - [Vinyzu](https://github.com/Vinyzu)'s work on Playwright's mock on [Botright](https://github.com/Vinyzu/Botright)
752
+ - [brotector](https://github.com/kaliiiiiiiiii/brotector)
753
+ - [fakebrowser](https://github.com/kkoooqq/fakebrowser)
754
+ - [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches)
755
+
756
  ## Known Issues
757
  - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
758
  - Currently, Scrapling is not compatible with async/await.
759
 
760
+ ---
761
+ <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
ROADMAP.md CHANGED
@@ -1,13 +1,14 @@
1
  ## TODOs
2
- - Add more tests and increase the code coverage.
3
- - Structure the tests folder in a better way.
4
- - Add more documentation.
5
- - Add the browsing ability.
6
- - Create detailed documentation for 'readthedocs' website, preferably add Github action for deploying it.
7
- - Create a Scrapy plugin/decorator to make it replace parsel in the response argument when needed.
8
- - Need to add more functionality to `AttributesHandler` and more navigation functions to `Adaptor` object (ex: functions similar to map, filter, and reduce functions but here pass it to the element and the function is executed on children, siblings, next elements, etc...)
9
- - Add `.filter` method to `Adaptors` object and other similar methods.
10
- - Add functionality to automatically detect pagination URLs
11
- - Add the ability to auto-detect schemas in pages and manipulate them
12
- - Add ability to generate a regex from a group of elements (Like for all href attributes)
 
13
  -
 
1
  ## TODOs
2
+ - [x] Add more tests and increase the code coverage.
3
+ - [x] Structure the tests folder in a better way.
4
+ - [ ] Add more documentation.
5
+ - [x] Add the browsing ability.
6
+ - [ ] Create detailed documentation for 'readthedocs' website, preferably add Github action for deploying it.
7
+ - [ ] Create a Scrapy plugin/decorator to make it replace parsel in the response argument when needed.
8
+ - [ ] Need to add more functionality to `AttributesHandler` and more navigation functions to `Adaptor` object (ex: functions similar to map, filter, and reduce functions but here pass it to the element and the function is executed on children, siblings, next elements, etc...)
9
+ - [x] Add `.filter` method to `Adaptors` object and other similar methods.
10
+ - [ ] Add functionality to automatically detect pagination URLs
11
+ - [ ] Add the ability to auto-detect schemas in pages and manipulate them.
12
+ - [ ] Add `analyzer` ability that tries to learn about the page through meta elements and return what it learned
13
+ - [ ] Add ability to generate a regex from a group of elements (Like for all href attributes)
14
  -
pytest.ini CHANGED
@@ -1,2 +1,2 @@
1
  [pytest]
2
- addopts = -p no:warnings --doctest-modules --ignore=setup.py
 
1
  [pytest]
2
+ addopts = -p no:warnings --doctest-modules --ignore=setup.py --verbose
scrapling/__init__.py CHANGED
@@ -1,10 +1,11 @@
1
  # Declare top-level shortcuts
 
2
  from scrapling.parser import Adaptor, Adaptors
3
- from scrapling.custom_types import TextHandler, AttributesHandler
4
 
5
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
6
- __version__ = "0.1.2"
7
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
8
 
9
 
10
- __all__ = ['Adaptor', 'Adaptors', 'TextHandler', 'AttributesHandler']
 
1
  # Declare top-level shortcuts
2
+ from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher, CustomFetcher
3
  from scrapling.parser import Adaptor, Adaptors
4
+ from scrapling.core.custom_types import TextHandler, AttributesHandler
5
 
6
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
7
+ __version__ = "0.2"
8
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
9
 
10
 
11
+ __all__ = ['Adaptor', 'Fetcher', 'StealthyFetcher', 'PlayWrightFetcher']
scrapling/core/__init__.py ADDED
File without changes
scrapling/core/_types.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Type definitions for type checking purposes.
3
+ """
4
+
5
+ from typing import (
6
+ Dict, Optional, Union, Callable, Any, List, Tuple, Pattern, Generator, Iterable, Type, TYPE_CHECKING, Literal
7
+ )
8
+
9
+ try:
10
+ from typing import Protocol
11
+ except ImportError:
12
+ # Added in Python 3.8
13
+ Protocol = object
14
+
15
+ try:
16
+ from typing import SupportsIndex
17
+ except ImportError:
18
+ # 'SupportsIndex' got added in Python 3.8
19
+ SupportsIndex = None
20
+
21
+ if TYPE_CHECKING:
22
+ # typing.Self requires Python 3.11
23
+ from typing_extensions import Self
24
+ else:
25
+ Self = object
scrapling/{custom_types.py → core/custom_types.py} RENAMED
@@ -1,9 +1,9 @@
1
  import re
2
  from types import MappingProxyType
3
  from collections.abc import Mapping
4
- from typing import Dict, List, Union, Pattern
5
 
6
- from scrapling.utils import _is_iterable, flatten
 
7
 
8
  from orjson import loads, dumps
9
  from w3lib.html import replace_entities as _replace_entities
@@ -69,7 +69,7 @@ class TextHandler(str):
69
  return [TextHandler(_replace_entities(s)) for s in results]
70
 
71
  def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
72
- clean_match: bool = False, case_sensitive: bool = False,):
73
  """Apply the given regex to text and return the first match if found, otherwise return the default value.
74
 
75
  :param regex: Can be either a compiled regular expression or a string.
@@ -83,6 +83,51 @@ class TextHandler(str):
83
  return result[0] if result else default
84
 
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  class AttributesHandler(Mapping):
87
  """A read-only mapping to use instead of the standard dictionary for the speed boost but
88
  at the same time I use it to add more functionalities.
 
1
  import re
2
  from types import MappingProxyType
3
  from collections.abc import Mapping
 
4
 
5
+ from scrapling.core.utils import _is_iterable, flatten
6
+ from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex
7
 
8
  from orjson import loads, dumps
9
  from w3lib.html import replace_entities as _replace_entities
 
69
  return [TextHandler(_replace_entities(s)) for s in results]
70
 
71
  def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
72
+ clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
73
  """Apply the given regex to text and return the first match if found, otherwise return the default value.
74
 
75
  :param regex: Can be either a compiled regular expression or a string.
 
83
  return result[0] if result else default
84
 
85
 
86
+ class TextHandlers(List[TextHandler]):
87
+ """
88
+ The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
89
+ """
90
+ __slots__ = ()
91
+
92
+ def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[TextHandler, "TextHandlers[TextHandler]"]:
93
+ lst = super().__getitem__(pos)
94
+ if isinstance(pos, slice):
95
+ return self.__class__(lst)
96
+ else:
97
+ return lst
98
+
99
+ def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
100
+ case_sensitive: bool = False) -> 'List[str]':
101
+ """Call the ``.re()`` method for each element in this list and return
102
+ their results flattened as TextHandlers.
103
+
104
+ :param regex: Can be either a compiled regular expression or a string.
105
+ :param replace_entities: if enabled character entity references are replaced by their corresponding character
106
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
107
+ :param case_sensitive: if disabled, the function will set the regex to ignore the letters' case while compiling it
108
+ """
109
+ results = [
110
+ n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
111
+ ]
112
+ return flatten(results)
113
+
114
+ def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
115
+ clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
116
+ """Call the ``.re_first()`` method for each element in this list and return
117
+ the first result or the default value otherwise.
118
+
119
+ :param regex: Can be either a compiled regular expression or a string.
120
+ :param default: The default value to be returned if there is no match
121
+ :param replace_entities: if enabled character entity references are replaced by their corresponding character
122
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
123
+ :param case_sensitive: if disabled, the function will set the regex to ignore the letters' case while compiling it
124
+ """
125
+ for n in self:
126
+ for result in n.re(regex, replace_entities, clean_match, case_sensitive):
127
+ return result
128
+ return default
129
+
130
+
131
  class AttributesHandler(Mapping):
132
  """A read-only mapping to use instead of the standard dictionary for the speed boost but
133
  at the same time I use it to add more functionalities.
scrapling/{mixins.py → core/mixins.py} RENAMED
@@ -4,7 +4,7 @@ class SelectorsGeneration:
4
  Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
5
  Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
6
 
7
- def __general_selection(self, selection: str = 'css') -> str:
8
  """Generate a selector for the current element.
9
  :return: A string of the generated selector.
10
  """
@@ -20,10 +20,11 @@ class SelectorsGeneration:
20
  else f"[@id='{target.attrib['id']}']"
21
  )
22
  selectorPath.append(part)
23
- return (
24
- " > ".join(reversed(selectorPath)) if css
25
- else '//*' + "/".join(reversed(selectorPath))
26
- )
 
27
  else:
28
  part = f'{target.tag}'
29
  # We won't use classes anymore because some websites share the exact same classes between elements
@@ -60,15 +61,29 @@ class SelectorsGeneration:
60
  )
61
 
62
  @property
63
- def css_selector(self) -> str:
64
  """Generate a CSS selector for the current element
65
  :return: A string of the generated selector.
66
  """
67
  return self.__general_selection()
68
 
69
  @property
70
- def xpath_selector(self) -> str:
 
 
 
 
 
 
 
71
  """Generate an XPath selector for the current element
72
  :return: A string of the generated selector.
73
  """
74
  return self.__general_selection('xpath')
 
 
 
 
 
 
 
 
4
  Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
5
  Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
6
 
7
+ def __general_selection(self, selection: str = 'css', full_path=False) -> str:
8
  """Generate a selector for the current element.
9
  :return: A string of the generated selector.
10
  """
 
20
  else f"[@id='{target.attrib['id']}']"
21
  )
22
  selectorPath.append(part)
23
+ if not full_path:
24
+ return (
25
+ " > ".join(reversed(selectorPath)) if css
26
+ else '//*' + "/".join(reversed(selectorPath))
27
+ )
28
  else:
29
  part = f'{target.tag}'
30
  # We won't use classes anymore because some websites share the exact same classes between elements
 
61
  )
62
 
63
  @property
64
+ def generate_css_selector(self) -> str:
65
  """Generate a CSS selector for the current element
66
  :return: A string of the generated selector.
67
  """
68
  return self.__general_selection()
69
 
70
  @property
71
+ def generate_full_css_selector(self) -> str:
72
+ """Generate a complete CSS selector for the current element
73
+ :return: A string of the generated selector.
74
+ """
75
+ return self.__general_selection(full_path=True)
76
+
77
+ @property
78
+ def generate_xpath_selector(self) -> str:
79
  """Generate an XPath selector for the current element
80
  :return: A string of the generated selector.
81
  """
82
  return self.__general_selection('xpath')
83
+
84
+ @property
85
+ def generate_full_xpath_selector(self) -> str:
86
+ """Generate a complete XPath selector for the current element
87
+ :return: A string of the generated selector.
88
+ """
89
+ return self.__general_selection('xpath', full_path=True)
scrapling/{storage_adaptors.py → core/storage_adaptors.py} RENAMED
@@ -4,9 +4,9 @@ import logging
4
  import threading
5
  from hashlib import sha256
6
  from abc import ABC, abstractmethod
7
- from typing import Dict, Optional, Union
8
 
9
- from scrapling.utils import _StorageTools, cache
 
10
 
11
  from lxml import html
12
  from tldextract import extract as tld
 
4
  import threading
5
  from hashlib import sha256
6
  from abc import ABC, abstractmethod
 
7
 
8
+ from scrapling.core._types import Dict, Optional, Union
9
+ from scrapling.core.utils import _StorageTools, cache
10
 
11
  from lxml import html
12
  from tldextract import extract as tld
scrapling/{translator.py → core/translator.py} RENAMED
@@ -9,24 +9,14 @@ which will be important in future releases but most importantly...
9
  import re
10
 
11
  from w3lib.html import HTML5_WHITESPACE
12
- from typing import TYPE_CHECKING, Any, Optional
13
- try:
14
- from typing import Protocol
15
- except ImportError:
16
- # Added in Python 3.8
17
- Protocol = object
18
-
19
- from scrapling.utils import cache
20
 
21
  from cssselect.xpath import ExpressionError
22
  from cssselect.xpath import XPathExpr as OriginalXPathExpr
23
  from cssselect import HTMLTranslator as OriginalHTMLTranslator
24
  from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
25
 
26
- if TYPE_CHECKING:
27
- # typing.Self requires Python 3.11
28
- from typing_extensions import Self
29
-
30
 
31
  regex = f"[{HTML5_WHITESPACE}]+"
32
  replace_html5_whitespaces = re.compile(regex).sub
 
9
  import re
10
 
11
  from w3lib.html import HTML5_WHITESPACE
12
+ from scrapling.core.utils import cache
13
+ from scrapling.core._types import Any, Optional, Protocol, Self
 
 
 
 
 
 
14
 
15
  from cssselect.xpath import ExpressionError
16
  from cssselect.xpath import XPathExpr as OriginalXPathExpr
17
  from cssselect import HTMLTranslator as OriginalHTMLTranslator
18
  from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
19
 
 
 
 
 
20
 
21
  regex = f"[{HTML5_WHITESPACE}]+"
22
  replace_html5_whitespaces = re.compile(regex).sub
scrapling/{utils.py → core/utils.py} RENAMED
@@ -1,14 +1,13 @@
1
  import re
2
- import os
3
  import logging
4
  from itertools import chain
5
- from logging import handlers
6
  # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
7
  from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
8
 
9
- from typing import Dict, Iterable, Any
10
 
11
  from lxml import html
 
12
  html_forbidden = {html.HtmlComment, }
13
  logging.basicConfig(
14
  level=logging.ERROR,
@@ -45,64 +44,6 @@ def _is_iterable(s: Any):
45
  return isinstance(s, (list, tuple,))
46
 
47
 
48
- @cache(None, typed=True)
49
- class _Logger(object):
50
- # I will leave this class here for now in case I decide I want to come back to use it :)
51
- __slots__ = ('console_logger', 'logger_file_path',)
52
- levels = {
53
- 'debug': logging.DEBUG,
54
- 'info': logging.INFO,
55
- 'warning': logging.WARNING,
56
- 'error': logging.ERROR,
57
- 'critical': logging.CRITICAL
58
- }
59
-
60
- def __init__(self, filename: str = 'debug.log', level: str = 'debug', when: str = 'midnight', backcount: int = 1):
61
- os.makedirs(os.path.join(os.path.dirname(__file__), 'logs'), exist_ok=True)
62
- format_str = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
63
-
64
- # on-screen output
65
- lvl = self.levels[level.lower()]
66
- self.console_logger = logging.getLogger('Scrapling')
67
- self.console_logger.setLevel(lvl)
68
- console_handler = logging.StreamHandler()
69
- console_handler.setLevel(lvl)
70
- console_handler.setFormatter(format_str)
71
- self.console_logger.addHandler(console_handler)
72
-
73
- if lvl == logging.DEBUG:
74
- filename = os.path.join(os.path.dirname(__file__), 'logs', filename)
75
- self.logger_file_path = filename
76
- # Automatically generates the logging file at specified intervals
77
- file_handler = handlers.TimedRotatingFileHandler(
78
- # If more than (backcount+1) existed, oldest logs will be deleted
79
- filename=filename, when=when, backupCount=backcount, encoding='utf-8'
80
- )
81
- file_handler.setLevel(lvl)
82
- file_handler.setFormatter(format_str)
83
- # This for the logger when it appends the date to the new log
84
- file_handler.namer = lambda name: name.replace(".log", "") + ".log"
85
- self.console_logger.addHandler(file_handler)
86
- self.debug(f'Debug log path: {self.logger_file_path}')
87
- else:
88
- self.logger_file_path = None
89
-
90
- def debug(self, message: str) -> None:
91
- self.console_logger.debug(message)
92
-
93
- def info(self, message: str) -> None:
94
- self.console_logger.info(message)
95
-
96
- def warning(self, message: str) -> None:
97
- self.console_logger.warning(message)
98
-
99
- def error(self, message: str) -> None:
100
- self.console_logger.error(message)
101
-
102
- def critical(self, message: str) -> None:
103
- self.console_logger.critical(message)
104
-
105
-
106
  class _StorageTools:
107
  @staticmethod
108
  def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
 
1
  import re
 
2
  import logging
3
  from itertools import chain
 
4
  # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
5
  from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
6
 
7
+ from scrapling.core._types import Dict, Iterable, Any
8
 
9
  from lxml import html
10
+
11
  html_forbidden = {html.HtmlComment, }
12
  logging.basicConfig(
13
  level=logging.ERROR,
 
44
  return isinstance(s, (list, tuple,))
45
 
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  class _StorageTools:
48
  @staticmethod
49
  def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
scrapling/engines/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from .camo import CamoufoxEngine
2
+ from .static import StaticEngine
3
+ from .pw import PlaywrightEngine
4
+ from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
5
+ from .toolbelt import check_if_engine_usable
6
+
7
+ __all__ = ['CamoufoxEngine', 'PlaywrightEngine']
scrapling/engines/camo.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
3
+
4
+ from scrapling.engines.toolbelt import (
5
+ Response,
6
+ do_nothing,
7
+ get_os_name,
8
+ intercept_route,
9
+ check_type_validity,
10
+ generate_convincing_referer,
11
+ )
12
+
13
+ from camoufox.sync_api import Camoufox
14
+
15
+
16
+ class CamoufoxEngine:
17
+ def __init__(
18
+ self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
19
+ block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
20
+ timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
21
+ wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, adaptor_arguments: Dict = None
22
+ ):
23
+ """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
24
+
25
+ :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
26
+ :param block_images: Prevent the loading of images through Firefox preferences.
27
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
28
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
29
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
30
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
31
+ :param block_webrtc: Blocks WebRTC entirely.
32
+ :param addons: List of Firefox addons to use. Must be paths to extracted addons.
33
+ :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
34
+ :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
35
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
36
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
37
+ :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
38
+ :param wait_selector: Wait for a specific css selector to be in a specific state.
39
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
40
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
41
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
42
+ :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
43
+ """
44
+ self.headless = headless
45
+ self.block_images = bool(block_images)
46
+ self.disable_resources = bool(disable_resources)
47
+ self.block_webrtc = bool(block_webrtc)
48
+ self.allow_webgl = bool(allow_webgl)
49
+ self.network_idle = bool(network_idle)
50
+ self.google_search = bool(google_search)
51
+ self.extra_headers = extra_headers or {}
52
+ self.addons = addons or []
53
+ self.humanize = humanize
54
+ self.timeout = check_type_validity(timeout, [int, float], 30000)
55
+ if callable(page_action):
56
+ self.page_action = page_action
57
+ else:
58
+ self.page_action = do_nothing
59
+ logging.error('[Ignored] Argument "page_action" must be callable')
60
+
61
+ self.wait_selector = wait_selector
62
+ self.wait_selector_state = wait_selector_state
63
+ self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
64
+
65
+ def fetch(self, url: str) -> Response:
66
+ """Opens up the browser and do your request based on your chosen options.
67
+
68
+ :param url: Target url.
69
+ :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
70
+ """
71
+ with Camoufox(
72
+ headless=self.headless,
73
+ block_images=self.block_images, # Careful! It makes some websites never finish loading at all, like StackOverflow, even in headful mode
74
+ os=get_os_name(),
75
+ block_webrtc=self.block_webrtc,
76
+ allow_webgl=self.allow_webgl,
77
+ addons=self.addons,
78
+ humanize=self.humanize,
79
+ i_know_what_im_doing=True, # To turn warnings off with user configurations
80
+ ) as browser:
81
+ page = browser.new_page()
82
+ page.set_default_navigation_timeout(self.timeout)
83
+ page.set_default_timeout(self.timeout)
84
+ if self.disable_resources:
85
+ page.route("**/*", intercept_route)
86
+
87
+ if self.extra_headers:
88
+ page.set_extra_http_headers(self.extra_headers)
89
+
90
+ res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
91
+ page.wait_for_load_state(state="domcontentloaded")
92
+ if self.network_idle:
93
+ page.wait_for_load_state('networkidle')
94
+
95
+ page = self.page_action(page)
96
+
97
+ if self.wait_selector and type(self.wait_selector) is str:
98
+ waiter = page.locator(self.wait_selector)
99
+ waiter.wait_for(state=self.wait_selector_state)
100
+
101
+ content_type = res.headers.get('content-type', '')
102
+ # Parse charset from content-type
103
+ encoding = 'utf-8' # default encoding
104
+ if 'charset=' in content_type.lower():
105
+ encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
106
+
107
+ response = Response(
108
+ url=res.url,
109
+ text=page.content(),
110
+ content=res.body(),
111
+ status=res.status,
112
+ reason=res.status_text,
113
+ encoding=encoding,
114
+ cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
115
+ headers=res.all_headers(),
116
+ request_headers=res.request.all_headers(),
117
+ adaptor_arguments=self.adaptor_arguments
118
+ )
119
+ page.close()
120
+
121
+ return response
scrapling/engines/constants.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Disable loading these resources for speed
# These are Playwright/Camoufox `request.resource_type` values; requests of these
# types get aborted by the routing interceptor when `disable_resources` is enabled.
DEFAULT_DISABLED_RESOURCES = [
    'font',
    'image',
    'media',
    'beacon',
    'object',
    'imageset',
    'texttrack',
    'websocket',
    'csp_report',
    'stylesheet',
]

# Chromium command-line switches used when stealth mode is enabled.
# NOTE: this list is module-level shared state — consumers should copy it
# (e.g. `list(DEFAULT_STEALTH_FLAGS)`) before appending per-request flags.
DEFAULT_STEALTH_FLAGS = [
    # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
    # Generally this will make the browser faster and less detectable
    '--no-pings',
    '--incognito',
    '--test-type',
    '--lang=en-US',
    '--mute-audio',
    '--no-first-run',
    '--disable-sync',
    '--hide-scrollbars',
    '--disable-logging',
    '--start-maximized',  # For headless check bypass
    '--enable-async-dns',
    '--disable-breakpad',
    '--disable-infobars',
    '--accept-lang=en-US',
    '--use-mock-keychain',
    '--disable-translate',
    '--disable-extensions',
    '--disable-voice-input',
    '--window-position=0,0',
    '--disable-wake-on-wifi',
    '--ignore-gpu-blocklist',
    '--enable-tcp-fast-open',
    '--enable-web-bluetooth',
    '--disable-hang-monitor',
    '--password-store=basic',
    '--disable-cloud-import',
    '--disable-default-apps',
    '--disable-print-preview',
    '--disable-dev-shm-usage',
    '--disable-popup-blocking',
    '--metrics-recording-only',
    '--disable-crash-reporter',
    '--disable-partial-raster',
    '--disable-gesture-typing',
    '--disable-checker-imaging',
    '--disable-prompt-on-repost',
    '--force-color-profile=srgb',
    '--font-render-hinting=none',
    '--no-default-browser-check',
    '--aggressive-cache-discard',
    '--disable-component-update',
    '--disable-cookie-encryption',
    '--disable-domain-reliability',
    '--disable-threaded-animation',
    '--disable-threaded-scrolling',
    # '--disable-reading-from-canvas',  # For Firefox
    '--enable-simple-cache-backend',
    '--disable-background-networking',
    '--disable-session-crashed-bubble',
    '--enable-surface-synchronization',
    '--disable-image-animation-resync',
    '--disable-renderer-backgrounding',
    '--disable-ipc-flooding-protection',
    '--prerender-from-omnibox=disabled',
    '--safebrowsing-disable-auto-update',
    '--disable-offer-upload-credit-cards',
    '--disable-features=site-per-process',
    '--disable-background-timer-throttling',
    '--disable-new-content-rendering-timeout',
    '--run-all-compositor-stages-before-draw',
    '--disable-client-side-phishing-detection',
    '--disable-backgrounding-occluded-windows',
    '--disable-layer-tree-host-memory-pressure',
    '--autoplay-policy=no-user-gesture-required',
    '--disable-offer-store-unmasked-wallet-cards',
    '--disable-blink-features=AutomationControlled',
    '--webrtc-ip-handling-policy=disable_non_proxied_udp',
    '--disable-component-extensions-with-background-pages',
    '--force-webrtc-ip-handling-policy=disable_non_proxied_udp',
    '--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance',
    # Pretend touch/hover capabilities of a regular desktop with a mouse
    '--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4',
    '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees',
]

# Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
# This dict is JSON-encoded and sent as the `config` query parameter when building
# the NSTBrowser CDP URL (see `PlaywrightEngine._cdp_url_logic`).
NSTBROWSER_DEFAULT_QUERY = {
    "once": True,
    "headless": True,
    "autoClose": True,
    "fingerprint": {
        "flags": {
            "timezone": "BasedOnIp",
            "screen": "Custom"
        },
        "platform": 'linux',  # support: windows, mac, linux
        "kernel": 'chromium',  # only support: chromium
        "kernelMilestone": '128',
        "hardwareConcurrency": 8,
        "deviceMemory": 8,
    },
}
scrapling/engines/pw.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from scrapling.core._types import Union, Callable, Optional, List, Dict
4
+
5
+ from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
6
+ from scrapling.engines.toolbelt import (
7
+ Response,
8
+ do_nothing,
9
+ js_bypass_path,
10
+ intercept_route,
11
+ generate_headers,
12
+ check_type_validity,
13
+ construct_cdp_url,
14
+ generate_convincing_referer,
15
+ )
16
+
17
+
18
class PlaywrightEngine:
    def __init__(
            self, headless: Union[bool, str] = True,
            disable_resources: bool = False,
            useragent: Optional[str] = None,
            network_idle: Optional[bool] = False,
            timeout: Optional[float] = 30000,
            page_action: Callable = do_nothing,
            wait_selector: Optional[str] = None,
            wait_selector_state: Optional[str] = 'attached',
            stealth: bool = False,
            hide_canvas: bool = True,
            disable_webgl: bool = False,
            cdp_url: Optional[str] = None,
            nstbrowser_mode: bool = False,
            nstbrowser_config: Optional[Dict] = None,
            google_search: Optional[bool] = True,
            extra_headers: Optional[Dict[str, str]] = None,
            adaptor_arguments: Dict = None
    ):
        """An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific css selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        """
        self.headless = headless
        self.disable_resources = disable_resources
        self.network_idle = bool(network_idle)
        self.stealth = bool(stealth)
        self.hide_canvas = bool(hide_canvas)
        self.disable_webgl = bool(disable_webgl)
        self.google_search = bool(google_search)
        self.extra_headers = extra_headers or {}
        self.cdp_url = cdp_url
        self.useragent = useragent
        self.timeout = check_type_validity(timeout, [int, float], 30000)
        if callable(page_action):
            self.page_action = page_action
        else:
            self.page_action = do_nothing
            logging.error('[Ignored] Argument "page_action" must be callable')

        self.wait_selector = wait_selector
        self.wait_selector_state = wait_selector_state
        self.nstbrowser_mode = bool(nstbrowser_mode)
        self.nstbrowser_config = nstbrowser_config
        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}

    def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
        """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is

        :param flags: Chrome flags to be added to NSTBrowser query
        :return: CDP URL
        """
        cdp_url = self.cdp_url
        if self.nstbrowser_mode:
            # Bug fix: this used to be `type(self.nstbrowser_config) is Dict`, which
            # compares against `typing.Dict` and is always False, so a user-supplied
            # config was silently ignored. `isinstance(..., dict)` is the correct check.
            if self.nstbrowser_config and isinstance(self.nstbrowser_config, dict):
                config = self.nstbrowser_config
            else:
                query = NSTBROWSER_DEFAULT_QUERY.copy()
                if flags:
                    query.update({
                        "args": dict(zip(flags, [''] * len(flags))),  # browser args should be a dictionary
                    })

                config = {
                    'config': json.dumps(query),
                    # 'token': ''
                }
            cdp_url = construct_cdp_url(cdp_url, config)
        else:
            # To validate it
            cdp_url = construct_cdp_url(cdp_url)

        return cdp_url

    def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: Target url.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        # `rebrowser_playwright` is a patched Playwright that hides the CDP runtime
        # fingerprint, hence it's only imported in stealth mode.
        if not self.stealth:
            from playwright.sync_api import sync_playwright
        else:
            from rebrowser_playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            # Handle the UserAgent early
            if self.useragent:
                extra_headers = {}
                useragent = self.useragent
            else:
                extra_headers = generate_headers(browser_mode=True)
                useragent = extra_headers.get('User-Agent')

            # Prepare the flags before diving.
            # Bug fix: copy the list first — `flags += [...]` on the module-level
            # constant would mutate `DEFAULT_STEALTH_FLAGS` in place, leaking the
            # canvas/webgl flags into every later fetch in the same process.
            flags = list(DEFAULT_STEALTH_FLAGS)
            if self.hide_canvas:
                flags += ['--fingerprinting-canvas-image-data-noise']
            if self.disable_webgl:
                flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']

            # Creating the browser
            if self.cdp_url:
                cdp_url = self._cdp_url_logic(flags if self.stealth else None)
                browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
            else:
                if self.stealth:
                    browser = p.chromium.launch(headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True)
                else:
                    browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'])

            # Creating the context
            if self.stealth:
                context = browser.new_context(
                    locale='en-US',
                    is_mobile=False,
                    has_touch=False,
                    color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
                    user_agent=useragent,
                    device_scale_factor=2,
                    # I'm thinking about disabling it to rest from all Service Workers headache but let's keep it as it is for now
                    service_workers="allow",
                    ignore_https_errors=True,
                    extra_http_headers=extra_headers,
                    screen={"width": 1920, "height": 1080},
                    viewport={"width": 1920, "height": 1080},
                    permissions=["geolocation", 'notifications'],
                )
            else:
                context = browser.new_context(
                    color_scheme='dark',
                    user_agent=useragent,
                    device_scale_factor=2,
                    extra_http_headers=extra_headers
                )

            # Finally we are in business
            page = context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)

            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

            if self.disable_resources:
                page.route("**/*", intercept_route)

            if self.stealth:
                # Basic bypasses nothing fancy as I'm still working on it
                # But with adding these bypasses to the above config, it bypasses many online tests like
                # https://bot.sannysoft.com/
                # https://kaliiiiiiiiii.github.io/brotector/
                # https://pixelscan.net/
                # https://iphey.com/
                # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
                # https://arh.antoinevastel.com/bots/areyouheadless/
                # https://prescience-data.github.io/execution-monitor.html
                page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
                page.add_init_script(path=js_bypass_path('window_chrome.js'))
                page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
                page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
                page.add_init_script(path=js_bypass_path('notification_permission.js'))
                page.add_init_script(path=js_bypass_path('screen_props.js'))
                page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))

            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
            page.wait_for_load_state(state="domcontentloaded")
            if self.network_idle:
                page.wait_for_load_state('networkidle')

            page = self.page_action(page)

            if self.wait_selector and isinstance(self.wait_selector, str):
                waiter = page.locator(self.wait_selector)
                waiter.wait_for(state=self.wait_selector_state)

            # Parse charset from the Content-Type header, falling back to UTF-8
            content_type = res.headers.get('content-type', '')
            encoding = 'utf-8'  # default encoding
            if 'charset=' in content_type.lower():
                encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()

            response = Response(
                url=res.url,
                text=page.content(),
                content=res.body(),
                status=res.status,
                reason=res.status_text,
                encoding=encoding,
                cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                headers=res.all_headers(),
                request_headers=res.request.all_headers(),
                adaptor_arguments=self.adaptor_arguments
            )
            page.close()
            return response
scrapling/engines/static.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from scrapling.core._types import Union, Optional, Dict
4
+ from .toolbelt import Response, generate_convincing_referer, generate_headers
5
+
6
+ import httpx
7
+ from httpx._models import Response as httpxResponse
8
+
9
+
10
class StaticEngine:
    def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
        """An engine that utilizes httpx library, check the `Fetcher` class for more documentation.

        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
        """
        self.timeout = timeout
        self.follow_redirects = bool(follow_redirects)
        self._extra_headers = generate_headers(browser_mode=False)
        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}

    @staticmethod
    def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
        """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
        finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.

        :param headers: Current headers in the request if the user passed any
        :param url: The Target URL.
        :param stealth: Whether stealth mode is enabled or not.
        :return: A dictionary of the new headers.
        """
        headers = headers or {}

        # Validate headers
        if not headers.get('user-agent') and not headers.get('User-Agent'):
            headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
            logging.info(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")

        if stealth:
            extra_headers = generate_headers(browser_mode=False)
            headers.update(extra_headers)
            headers.update({'referer': generate_convincing_referer(url)})

        return headers

    def _prepare_response(self, response: httpxResponse) -> Response:
        """Takes httpx response and generates `Response` object from it.

        :param response: httpx response object
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return Response(
            url=str(response.url),
            text=response.text,
            content=response.content,
            status=response.status_code,
            reason=response.reason_phrase,
            encoding=response.encoding or 'utf-8',
            cookies=dict(response.cookies),
            headers=dict(response.headers),
            request_headers=dict(response.request.headers),
            adaptor_arguments=self.adaptor_arguments
        )

    def _make_request(self, method: str, url: str, stealthy_headers: Optional[bool], **kwargs: Dict) -> Response:
        """Shared implementation behind `get`/`post`/`delete`/`put`.

        :param method: Name of the httpx top-level helper to call ('get', 'post', ...).
        :param url: Target url.
        :param stealthy_headers: Whether to generate real browser headers and a Google-search referer.
        :param kwargs: Extra keyword arguments forwarded to the httpx function.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        # Bug fix: `pop` instead of `get` — otherwise a caller-supplied `headers`
        # kwarg stays in `kwargs` and gets forwarded to httpx a second time,
        # raising `TypeError: got multiple values for keyword argument 'headers'`.
        headers = self._headers_job(kwargs.pop('headers', None), url, stealthy_headers)
        request = getattr(httpx, method)(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
        return self._prepare_response(request)

    def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP GET request for you but with some added flavors.
        :param url: Target url.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._make_request('get', url, stealthy_headers, **kwargs)

    def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP POST request for you but with some added flavors.
        :param url: Target url.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._make_request('post', url, stealthy_headers, **kwargs)

    def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP DELETE request for you but with some added flavors.
        :param url: Target url.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._make_request('delete', url, stealthy_headers, **kwargs)

    def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP PUT request for you but with some added flavors.
        :param url: Target url.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request had came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._make_request('put', url, stealthy_headers, **kwargs)
scrapling/engines/toolbelt/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .fingerprints import (
2
+ get_os_name,
3
+ generate_headers,
4
+ generate_convincing_referer,
5
+ )
6
+ from .custom import (
7
+ Response,
8
+ do_nothing,
9
+ BaseFetcher,
10
+ get_variable_name,
11
+ check_type_validity,
12
+ check_if_engine_usable,
13
+ )
14
+ from .navigation import (
15
+ js_bypass_path,
16
+ intercept_route,
17
+ construct_cdp_url,
18
+ )
scrapling/engines/toolbelt/bypasses/navigator_plugins.js ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Populate `navigator.plugins` with the five standard PDF-viewer plugins when
// the browser reports none — an empty plugin list is a classic headless giveaway.
if (navigator.plugins.length == 0) {
    Object.defineProperty(navigator, 'plugins', {
        get: () => {
            // All five stock plugins share the same description/filename and
            // differ only by name, so build them from one template.
            const pluginNames = [
                'PDF Viewer',
                'Chrome PDF Viewer',
                'Chromium PDF Viewer',
                'Microsoft Edge PDF Viewer',
                'WebKit built-in PDF',
            ];
            const arrayDescriptors = {
                length: { value: pluginNames.length },
            };
            pluginNames.forEach((pluginName, index) => {
                arrayDescriptors[index] = {
                    value: Object.create(Plugin.prototype, {
                        description: { value: 'Portable Document Format', enumerable: false },
                        filename: { value: 'internal-pdf-viewer', enumerable: false },
                        name: { value: pluginName, enumerable: false },
                    }),
                };
            });
            // Return a real PluginArray-prototyped object so `instanceof` checks pass.
            return Object.create(PluginArray.prototype, arrayDescriptors);
        },
    });
}
scrapling/engines/toolbelt/bypasses/notification_permission.js ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
// Bypasses `notificationIsDenied` test in creepsjs's 'Like Headless' sections:
// headless browsers tend to report 'denied' on secure origins, while a real
// browser that was never asked reports 'default'.
if (document.location.protocol.startsWith('https')) {
    Object.defineProperty(Notification, 'permission', {get: () => 'default'})
}
scrapling/engines/toolbelt/bypasses/pdf_viewer.js ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
// Report the built-in PDF viewer as enabled, like a normal desktop browser.
// Bypasses `pdfIsDisabled` test in creepsjs's 'Like Headless' sections.
Object.defineProperty(navigator, 'pdfViewerEnabled', { get: () => true });
scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js ADDED
@@ -0,0 +1,2 @@
 
 
 
1
// Remove the global Playwright leaves behind for its injected init scripts so
// pages can't detect automation by probing for it.
// Remove playwright fingerprint => https://github.com/microsoft/playwright/commit/c9e673c6dca746384338ab6bb0cf63c7e7caa9b2#diff-087773eea292da9db5a3f27de8f1a2940cdb895383ad750c3cd8e01772a35b40R915
delete __pwInitScripts;
scrapling/engines/toolbelt/bypasses/screen_props.js ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Overwrite window dimension/position properties with values captured from a
// real headful browser so fingerprinting scripts see plausible numbers.
try {
    const fakedWindowProps = {
        // Dimensions
        innerHeight: 0,
        innerWidth: 0,
        outerHeight: 754,
        outerWidth: 1313,

        // Position
        screenX: 19,
        pageXOffset: 0,
        pageYOffset: 0,

        // Display
        devicePixelRatio: 2
    };
    Object.entries(fakedWindowProps).forEach(([prop, value]) => {
        // The 0 values are introduced by collecting in the hidden iframe.
        // They are document sizes anyway so no need to test them or inject them.
        if (value > 0) {
            window[prop] = value;
        }
    });
} catch (e) {
    console.warn(e);
}
scrapling/engines/toolbelt/bypasses/webdriver_fully.js ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Replace `navigator.webdriver` with a getter that always returns false and is
// indistinguishable from a native accessor (name, length and toString all match
// what Chrome reports for the real getter).

// Create a function that looks like a native getter.
// Bug fix: the original `const nativeGetter = function get webdriver() {...}`
// is a SyntaxError — a function expression takes a single binding identifier,
// not `get webdriver` — so the whole init script failed to parse and the bypass
// never applied. Use an anonymous function and fake the native-looking
// name/toString through the property descriptors below instead.
const nativeGetter = function () {
    return false;
};

// Copy over native function properties
Object.defineProperties(nativeGetter, {
    name: { value: 'get webdriver', configurable: true },
    length: { value: 0, configurable: true },
    toString: {
        value: function() {
            return `function get webdriver() { [native code] }`;
        },
        configurable: true
    }
});

// Make it look native
Object.setPrototypeOf(nativeGetter, Function.prototype);

// Apply the modified descriptor on the prototype (not the instance), matching
// where the real accessor lives.
Object.defineProperty(Navigator.prototype, 'webdriver', {
    get: nativeGetter,
    set: undefined,
    enumerable: true,
    configurable: true
});
scrapling/engines/toolbelt/bypasses/window_chrome.js ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// To escape `HEADCHR_CHROME_OBJ` test in headless mode => https://github.com/antoinevastel/fp-collect/blob/master/src/fpCollect.js#L322
// Faking window.chrome fully
//
// NOTE(review): this script uses top-level `return` statements (below), which
// only parse if the script is wrapped in a function body when injected —
// confirm how `page.add_init_script` evaluates it.
// NOTE(review): `utils.stripErrorWithAnchor` references a `utils` object that is
// not defined anywhere in this file (it appears ported from
// puppeteer-extra-plugin-stealth); calling `makeError.ErrorInInvocation` as
// written would throw a ReferenceError — verify.

if (!window.chrome) {
    // First, save all existing properties
    const originalKeys = Object.getOwnPropertyNames(window);
    const tempObj = {};

    // Recreate all properties in original order
    for (const key of originalKeys) {
        const descriptor = Object.getOwnPropertyDescriptor(window, key);
        const value = window[key];
        // delete window[key];
        Object.defineProperty(tempObj, key, descriptor);
    }

    // Use the exact property descriptor found in headful Chrome
    // fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`
    const mockChrome = {
        loadTimes: {},
        csi: {},
        app: {
            isInstalled: false
        },
        // Add other Chrome-specific properties
    };

    Object.defineProperty(tempObj, 'chrome', {
        writable: true,
        enumerable: true,
        configurable: false,
        value: mockChrome
    });
    // Copy everything back onto `window`; non-configurable originals will throw,
    // hence the per-key try/catch.
    for (const key of Object.getOwnPropertyNames(tempObj)) {
        try {
            Object.defineProperty(window, key,
                Object.getOwnPropertyDescriptor(tempObj, key));
        } catch (e) {}
    };
    // todo: solve this
    // Using line below bypasses the hasHighChromeIndex test in creepjs ==> https://github.com/abrahamjuliot/creepjs/blob/master/src/headless/index.ts#L121
    // Chrome object have to be in the end of the window properties
    // Object.assign(window, tempObj);
    // But makes window.chrome unreadable on 'https://bot.sannysoft.com/'
}

// That means we're running headful and don't need to mock anything
if ('app' in window.chrome) {
    return; // Nothing to do here
}
// Helper for faking the TypeError Chrome raises when app.* methods get arguments.
const makeError = {
    ErrorInInvocation: fn => {
        const err = new TypeError(`Error in invocation of app.${fn}()`);
        return utils.stripErrorWithAnchor(
            err,
            `at ${fn} (eval at <anonymous>`,
        );
    },
};
// check with: `JSON.stringify(window.chrome['app'])`
const STATIC_DATA = JSON.parse(
    `
{
  "isInstalled": false,
  "InstallState": {
    "DISABLED": "disabled",
    "INSTALLED": "installed",
    "NOT_INSTALLED": "not_installed"
  },
  "RunningState": {
    "CANNOT_RUN": "cannot_run",
    "READY_TO_RUN": "ready_to_run",
    "RUNNING": "running"
  }
}
        `.trim(),
);
window.chrome.app = {
    ...STATIC_DATA,

    get isInstalled() {
        return false;
    },

    getDetails: function getDetails() {
        if (arguments.length) {
            throw makeError.ErrorInInvocation(`getDetails`);
        }
        return null;
    },
    getIsInstalled: function getDetails() {
        if (arguments.length) {
            throw makeError.ErrorInInvocation(`getIsInstalled`);
        }
        return false;
    },
    runningState: function getDetails() {
        if (arguments.length) {
            throw makeError.ErrorInInvocation(`runningState`);
        }
        return 'cannot_run';
    },
};
// Check that the Navigation Timing API v1 is available, we need that
if (!window.performance || !window.performance.timing) {
    return;
}
const {timing} = window.performance;
// chrome.csi(): legacy Chrome timing API, reconstructed from Navigation Timing v1.
window.chrome.csi = function () {
    return {
        onloadT: timing.domContentLoadedEventEnd,
        startE: timing.navigationStart,
        pageT: Date.now() - timing.navigationStart,
        tran: 15, // Transition type or something
    };
};
if (!window.PerformancePaintTiming){
    return;
}
const {performance} = window;
// Some stuff is not available on about:blank as it requires a navigation to occur,
// let's harden the code to not fail then:
const ntEntryFallback = {
    nextHopProtocol: 'h2',
    type: 'other',
};

// The API exposes some funky info regarding the connection
const protocolInfo = {
    get connectionInfo() {
        const ntEntry =
            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
        return ntEntry.nextHopProtocol;
    },
    get npnNegotiatedProtocol() {
        // NPN is deprecated in favor of ALPN, but this implementation returns the
        // HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
        const ntEntry =
            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
        return ['h2', 'hq'].includes(ntEntry.nextHopProtocol)
            ? ntEntry.nextHopProtocol
            : 'unknown';
    },
    get navigationType() {
        const ntEntry =
            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
        return ntEntry.type;
    },
    get wasAlternateProtocolAvailable() {
        // The Alternate-Protocol header is deprecated in favor of Alt-Svc
        // (https://www.mnot.net/blog/2016/03/09/alt-svc), so technically this
        // should always return false.
        return false;
    },
    get wasFetchedViaSpdy() {
        // SPDY is deprecated in favor of HTTP/2, but this implementation returns
        // true for HTTP/2 or HTTP2+QUIC/39 as well.
        const ntEntry =
            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
        return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
    },
    get wasNpnNegotiated() {
        // NPN is deprecated in favor of ALPN, but this implementation returns true
        // for HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
        const ntEntry =
            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
        return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
    },
};

// Truncate number to specific number of decimals, most of the `loadTimes` stuff has 3
function toFixed(num, fixed) {
    var re = new RegExp('^-?\\d+(?:.\\d{0,' + (fixed || -1) + '})?');
    return num.toString().match(re)[0];
}

const timingInfo = {
    get firstPaintAfterLoadTime() {
        // This was never actually implemented and always returns 0.
        return 0;
    },
    get requestTime() {
        return timing.navigationStart / 1000;
    },
    get startLoadTime() {
        return timing.navigationStart / 1000;
    },
    get commitLoadTime() {
        return timing.responseStart / 1000;
    },
    get finishDocumentLoadTime() {
        return timing.domContentLoadedEventEnd / 1000;
    },
    get finishLoadTime() {
        return timing.loadEventEnd / 1000;
    },
    get firstPaintTime() {
        const fpEntry = performance.getEntriesByType('paint')[0] || {
            startTime: timing.loadEventEnd / 1000, // Fallback if no navigation occured (`about:blank`)
        };
        return toFixed(
            (fpEntry.startTime + performance.timeOrigin) / 1000,
            3,
        );
    },
};

// chrome.loadTimes(): deprecated Chrome API, rebuilt from Navigation/Paint Timing.
window.chrome.loadTimes = function () {
    return {
        ...protocolInfo,
        ...timingInfo,
    };
};
scrapling/engines/toolbelt/custom.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Functions related to custom types or type checking
3
+ """
4
+ import inspect
5
+ import logging
6
+ from dataclasses import dataclass, field
7
+
8
+ from scrapling.core.utils import setup_basic_logging
9
+ from scrapling.parser import Adaptor, SQLiteStorageSystem
10
+ from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
11
+
12
+
13
@dataclass(frozen=True)
class Response:
    """This class is returned by all engines as a way to unify response type between different libraries."""
    url: str
    text: str
    content: bytes
    status: int
    reason: str
    encoding: str = 'utf-8'  # default encoding
    cookies: Dict = field(default_factory=dict)
    headers: Dict = field(default_factory=dict)
    request_headers: Dict = field(default_factory=dict)
    adaptor_arguments: Dict = field(default_factory=dict)

    @property
    def adaptor(self) -> Optional['Adaptor']:
        """Generate Adaptor instance from this response if possible, otherwise return None"""
        # Read `automatch_domain` without mutating `adaptor_arguments`: the old
        # `.pop()` removed the key from a dict shared with the fetcher, so the
        # setting silently disappeared after the first response that used it.
        automatch_domain = self.adaptor_arguments.get('automatch_domain')
        adaptor_arguments = {k: v for k, v in self.adaptor_arguments.items() if k != 'automatch_domain'}
        if self.text:
            # For playwright that will be the response after all JS executed
            return Adaptor(text=self.text, url=automatch_domain or self.url, encoding=self.encoding, **adaptor_arguments)
        elif self.content:
            # For playwright, that's after all JS is loaded but not all of them executed, because playwright doesn't offer something like page.content()
            # To get response Bytes after the load states
            # Reference: https://playwright.dev/python/docs/api/class-page
            return Adaptor(body=self.content, url=automatch_domain or self.url, encoding=self.encoding, **adaptor_arguments)
        return None

    def __repr__(self):
        return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
43
+
44
+
45
class BaseFetcher:
    def __init__(
        self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
        storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
        automatch_domain: Optional[str] = None,
    ):
        """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
        are detected and passed automatically from the Fetcher based on the response for accessibility.

        :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
            libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
        :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
        :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
            priority over all auto-match related arguments/functions in the class.
        :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
        :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
            If empty, default values will be used.
        :param automatch_domain: For cases where you want to automatch selectors across different websites as if they were on the same website, use this argument to unify them.
            Otherwise, the domain of the request is used by default.
        :param debug: Enable debug mode
        """
        # Adaptor class parameters
        # I won't validate Adaptor's class parameters here again, I will leave it to be validated later
        self.adaptor_arguments = dict(
            huge_tree=huge_tree,
            keep_comments=keep_comments,
            auto_match=auto_match,
            storage=storage,
            storage_args=storage_args,
            debug=debug,
        )
        # If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
        setup_basic_logging(level='debug' if debug else 'info')
        if automatch_domain:
            # `isinstance` instead of `type(...) is str` so `str` subclasses are accepted too
            if isinstance(automatch_domain, str):
                self.adaptor_arguments.update({'automatch_domain': automatch_domain})
            else:
                logging.warning('[Ignored] The argument "automatch_domain" must be of string type')
83
+
84
+
85
def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
    """Check whether the passed engine can be used by a Fetcher-type class.

    :param engine: The engine class itself
    :return: The engine class again if all checks out, otherwise raises error
    :raise TypeError: If engine class doesn't have a fetch method, if its fetch attribute is not
        callable, or if its fetch method takes no arguments (it needs at least one, for the url).
    """
    # Sentinel so an engine whose `fetch` is literally None still reports "not callable"
    # rather than "missing", matching the original hasattr-based behavior.
    _missing = object()
    fetch_function = getattr(engine, 'fetch', _missing)
    if fetch_function is _missing:
        raise TypeError("Invalid engine class! Engine class must have the method 'fetch'")
    if not callable(fetch_function):
        raise TypeError("Invalid engine class! Engine class must have a callable method 'fetch'")
    if not inspect.signature(fetch_function).parameters:
        raise TypeError("Engine class must have a callable method 'fetch' with the first argument used for the url.")
    return engine
109
+
110
+
111
+ def get_variable_name(var: Any) -> Optional[str]:
112
+ """Get the name of a variable using global and local scopes.
113
+ :param var: The variable to find the name for
114
+ :return: The name of the variable if found, None otherwise
115
+ """
116
+ for scope in [globals(), locals()]:
117
+ for name, value in scope.items():
118
+ if value is var:
119
+ return name
120
+ return None
121
+
122
+
123
def check_type_validity(variable: Any, valid_types: Union[List[Type], None], default_value: Any = None, critical: bool = False, param_name: Optional[str] = None) -> Any:
    """Validate that a variable satisfies the given type constraints.

    :param variable: The variable to check
    :param valid_types: List of valid types for the variable
    :param default_value: Value to return if type check fails
    :param critical: If True, raises TypeError instead of logging error
    :param param_name: Optional parameter name for error messages
    :return: The original variable if valid, default_value if invalid
    :raise TypeError: If critical=True and type check fails
    """
    # Prefer the explicit name; fall back to frame inspection, then a placeholder
    name = param_name or get_variable_name(variable) or "Unknown"
    allowed = list(valid_types) if valid_types else []

    def _reject(message: str) -> Any:
        # Shared failure path: raise in critical mode, otherwise log and substitute
        if critical:
            raise TypeError(message)
        logging.error(f'[Ignored] {message}')
        return default_value

    if variable is None:
        if type(None) in allowed:
            return variable
        return _reject(f'Argument "{name}" cannot be None')

    # Anything goes when no constraint list was supplied
    if not allowed:
        return variable

    if any(isinstance(variable, candidate) for candidate in allowed):
        return variable

    readable_types = " or ".join(candidate.__name__ for candidate in allowed)
    return _reject(f'Argument "{name}" must be of type {readable_types}')
163
+
164
+
165
# Pew Pew
def do_nothing(page):
    """Identity placeholder used as the default `page_action` in browser engines."""
    return page
scrapling/engines/toolbelt/fingerprints.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Functions related to generating headers and fingerprints generally
3
+ """
4
+
5
+ import platform
6
+
7
+ from scrapling.core.utils import cache
8
+ from scrapling.core._types import Union, Dict
9
+
10
+ from tldextract import extract
11
+ from browserforge.headers import HeaderGenerator, Browser
12
+ from browserforge.fingerprints import FingerprintGenerator, Fingerprint
13
+
14
+
15
@cache(None, typed=True)
def generate_convincing_referer(url: str) -> str:
    """Build a Google search URL for the bare domain of `url` (no subdomain/suffix),
    making the request look like it came from a Google search for that website.

    >>> generate_convincing_referer('https://www.somewebsite.com/blah')
    'https://www.google.com/search?q=somewebsite'

    :param url: The URL you are about to fetch.
    :return: Google's search URL of the domain name
    """
    return f'https://www.google.com/search?q={extract(url).domain}'
27
+
28
+
29
@cache(None, typed=True)
def get_os_name() -> Union[str, None]:
    """Map the current platform to the OS name format browserforge expects.

    :return: Current OS name or `None` otherwise
    """
    platform_to_browserforge = {
        'Linux': 'linux',
        'Darwin': 'macos',
        'Windows': 'windows',
        # For the future? because why not
        'iOS': 'ios',
    }
    return platform_to_browserforge.get(platform.system())
44
+
45
+
46
def generate_suitable_fingerprint() -> Fingerprint:
    """Generate a browserforge fingerprint for the current OS, a desktop device, and Chrome version 128+.

    This function was originally created to test Browserforge's injector.
    :return: `Fingerprint` object
    """
    generator = FingerprintGenerator(
        browser=[Browser(name='chrome', min_version=128)],
        os=get_os_name(),  # A `None` OS is simply ignored by the generator
        device='desktop',
    )
    return generator.generate()
57
+
58
+
59
def generate_headers(browser_mode: bool = False) -> Dict:
    """Generate real browser-like headers using browserforge's generator

    :param browser_mode: If enabled, the headers created are used for playwright so it have to match everything
    :return: A dictionary of the generated headers
    """
    if browser_mode:
        # Must match the OS and browser of the controlled browser exactly so websites
        # fingerprinting us see no inconsistency red flags.
        return HeaderGenerator(
            browser=[Browser(name='chrome', min_version=128)],
            os=get_os_name(),  # None is ignored
            device='desktop'
        ).generate()
    # Plain (non-browser) requests aren't fingerprinted as strictly, so any modern browser works
    candidate_browsers = [Browser(name=browser_name, min_version=120) for browser_name in ('chrome', 'firefox', 'edge')]
    return HeaderGenerator(browser=candidate_browsers, device='desktop').generate()
scrapling/engines/toolbelt/navigation.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Functions related to files and URLs
3
+ """
4
+
5
+ import os
6
+ import logging
7
+ from urllib.parse import urlparse, urlencode
8
+
9
+ from scrapling.core.utils import cache
10
+ from scrapling.core._types import Union, Dict, Optional
11
+ from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
12
+
13
+ from playwright.sync_api import Route
14
+
15
+
16
+ def intercept_route(route: Route) -> Union[Route, None]:
17
+ """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
18
+
19
+ :param route: PlayWright `Route` object of the current page
20
+ :return: PlayWright `Route` object
21
+ """
22
+ if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
23
+ logging.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
24
+ return route.abort()
25
+ return route.continue_()
26
+
27
+
28
def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
    """Takes a CDP URL, reconstruct it to check it's valid, then adds encoded parameters if exists

    :param cdp_url: The target URL.
    :param query_params: A dictionary of the parameters to add.
    :return: The new CDP URL.
    :raise ValueError: If the URL is malformed or doesn't use a WebSocket scheme.
    """
    try:
        # Validate the base URL structure
        parsed = urlparse(cdp_url)

        # CDP endpoints are WebSocket endpoints, so only ws/wss make sense
        if parsed.scheme not in ('ws', 'wss'):
            raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")

        # Validate hostname and port
        if not parsed.netloc:
            raise ValueError("Invalid hostname for the CDP URL")

        # Ensure path starts with /
        path = parsed.path
        if not path.startswith('/'):
            path = '/' + path

        # Reconstruct the base URL with validated parts
        validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"

        # Add query parameters
        if query_params:
            query_string = urlencode(query_params)
            return f"{validated_base}?{query_string}"

        return validated_base

    except Exception as e:
        # Re-wrap with a uniform message but keep the original exception chained;
        # the old bare `raise ValueError(...)` discarded the cause, hiding it from tracebacks.
        raise ValueError(f"Invalid CDP URL: {str(e)}") from e
64
+
65
+
66
@cache(None, typed=True)
def js_bypass_path(filename: str) -> str:
    """Resolve the absolute path of a JS file inside the `bypasses` folder next to this module.

    :param filename: The base filename of the JS file.
    :return: The full path of the JS file.
    """
    bypasses_directory = os.path.join(os.path.dirname(__file__), 'bypasses')
    return os.path.join(bypasses_directory, filename)
scrapling/fetchers.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scrapling.core._types import Dict, Optional, Union, Callable, List, Literal
2
+
3
+ from scrapling.engines.toolbelt import Response, BaseFetcher, do_nothing
4
+ from scrapling.engines import CamoufoxEngine, PlaywrightEngine, StaticEngine, check_if_engine_usable
5
+
6
+
7
class Fetcher(BaseFetcher):
    """A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on httpx.

    Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
    """
    def _do_request(self, method: str, url: str, follow_redirects: bool, timeout: Optional[Union[int, float]], stealthy_headers: Optional[bool], **kwargs: Dict) -> Response:
        """Shared flow of all four HTTP verbs: build a `StaticEngine`, then delegate to its matching method.

        :param method: Name of the `StaticEngine` method to call ('get', 'post', 'put', or 'delete').
        """
        engine = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments)
        return getattr(engine, method)(url, stealthy_headers, **kwargs)

    def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP GET request for you but with some added flavors.
        :param url: Target url.
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._do_request('get', url, follow_redirects, timeout, stealthy_headers, **kwargs)

    def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP POST request for you but with some added flavors.
        :param url: Target url.
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._do_request('post', url, follow_redirects, timeout, stealthy_headers, **kwargs)

    def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP PUT request for you but with some added flavors.
        :param url: Target url
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._do_request('put', url, follow_redirects, timeout, stealthy_headers, **kwargs)

    def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
        """Make basic HTTP DELETE request for you but with some added flavors.
        :param url: Target url
        :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
        :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
        :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
            create a referer header as if this request came from Google's search of this URL's domain.
        :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        return self._do_request('delete', url, follow_redirects, timeout, stealthy_headers, **kwargs)
63
+
64
+
65
class StealthyFetcher(BaseFetcher):
    """A `Fetcher` class type that is completely stealthy fetcher that uses a modified version of Firefox.

    It works as real browsers passing almost all online tests/protections based on Camoufox.
    Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
    """
    def fetch(
        self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
        block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
        timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
        wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None
    ) -> Response:
        """
        Opens up a browser and do your request based on your chosen options below.
        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
        :param block_images: Prevent the loading of images through Firefox preferences.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param block_webrtc: Blocks WebRTC entirely.
        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
        :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific css selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        # Collect every knob in one mapping so the engine construction stays readable
        engine_options = dict(
            timeout=timeout,
            headless=headless,
            page_action=page_action,
            block_images=block_images,
            block_webrtc=block_webrtc,
            addons=addons,
            humanize=humanize,
            allow_webgl=allow_webgl,
            disable_resources=disable_resources,
            network_idle=network_idle,
            wait_selector=wait_selector,
            wait_selector_state=wait_selector_state,
            google_search=google_search,
            extra_headers=extra_headers,
            adaptor_arguments=self.adaptor_arguments,
        )
        return CamoufoxEngine(**engine_options).fetch(url)
117
+
118
+
119
class PlayWrightFetcher(BaseFetcher):
    """A `Fetcher` class type that provide many options, all of them are based on PlayWright.

    Using this Fetcher class, you can do requests with:
        - Vanilla Playwright without any modifications other than the ones you chose.
        - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
            Some of the things stealth mode does include:
                1) Patches the CDP runtime fingerprint.
                2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
                3) Using custom flags on launch to hide Playwright even more and make it faster.
                4) Generates real browser's headers of the same type and same user OS then append it to the request.
        - Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
        - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
    > Note that these are the main options with PlayWright but it can be mixed together.
    """
    def fetch(
        # `disable_resources` annotated `Optional[bool]` (was `bool = None`) — the
        # default has always been None, so the old annotation was simply wrong.
        self, url: str, headless: Union[bool, str] = True, disable_resources: Optional[bool] = None,
        useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
        page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
        hide_canvas: bool = True, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
        stealth: bool = False,
        cdp_url: Optional[str] = None,
        nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
    ) -> Response:
        """Opens up a browser and do your request based on your chosen options below.
        :param url: Target url.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific css selector to be in a specific state.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param nstbrowser_mode: Enables NSTBrowser mode, it has to be used with `cdp_url` argument or it will get completely ignored.
        :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
        """
        engine = PlaywrightEngine(
            timeout=timeout,
            stealth=stealth,
            cdp_url=cdp_url,
            headless=headless,
            useragent=useragent,
            page_action=page_action,
            hide_canvas=hide_canvas,
            network_idle=network_idle,
            google_search=google_search,
            extra_headers=extra_headers,
            wait_selector=wait_selector,
            disable_webgl=disable_webgl,
            nstbrowser_mode=nstbrowser_mode,
            nstbrowser_config=nstbrowser_config,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
            adaptor_arguments=self.adaptor_arguments,
        )
        return engine.fetch(url)
185
+
186
+
187
class CustomFetcher(BaseFetcher):
    def fetch(self, url: str, browser_engine, **kwargs) -> Response:
        """Validate the passed engine class, instantiate it with our adaptor arguments, then fetch `url` with it."""
        engine_class = check_if_engine_usable(browser_engine)
        engine = engine_class(adaptor_arguments=self.adaptor_arguments, **kwargs)
        return engine.fetch(url)
scrapling/parser.py CHANGED
@@ -1,18 +1,14 @@
1
  import os
 
 
2
  from difflib import SequenceMatcher
3
- from typing import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator
4
- try:
5
- from typing import SupportsIndex
6
- except ImportError:
7
- # 'SupportsIndex' got added in Python 3.8
8
- SupportsIndex = None
9
-
10
- from scrapling.translator import HTMLTranslator
11
- from scrapling.mixins import SelectorsGeneration
12
- from scrapling.custom_types import TextHandler, AttributesHandler
13
- from scrapling.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
14
- from scrapling.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
15
 
 
 
 
 
 
 
16
  from lxml import etree, html
17
  from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
18
 
@@ -32,7 +28,7 @@ class Adaptor(SelectorsGeneration):
32
  huge_tree: bool = True,
33
  root: Optional[html.HtmlElement] = None,
34
  keep_comments: Optional[bool] = False,
35
- auto_match: Optional[bool] = False,
36
  storage: Any = SQLiteStorageSystem,
37
  storage_args: Optional[Dict] = None,
38
  debug: Optional[bool] = True,
@@ -125,7 +121,7 @@ class Adaptor(SelectorsGeneration):
125
  def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
126
  """Return True if given element is a result of a string expression
127
  Examples:
128
- Xpath -> '/text()', '/@attribute' etc...
129
  CSS3 -> '::text', '::attr(attrib)'...
130
  """
131
  # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
@@ -163,6 +159,8 @@ class Adaptor(SelectorsGeneration):
163
  results = [self.__get_correct_result(n) for n in result]
164
  if all(isinstance(res, self.__class__) for res in results):
165
  return Adaptors(results)
 
 
166
  return results
167
 
168
  return self.__get_correct_result(result)
@@ -399,6 +397,56 @@ class Adaptor(SelectorsGeneration):
399
  return self.__convert_results(score_table[highest_probability])
400
  return []
401
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  def css(self, selector: str, identifier: str = '',
403
  auto_match: bool = False, auto_save: bool = False, percentage: int = 0
404
  ) -> Union['Adaptors[Adaptor]', List]:
@@ -495,6 +543,113 @@ class Adaptor(SelectorsGeneration):
495
  except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
496
  raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
  def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
499
  """Used internally to calculate a score that shows how candidate element similar to the original one
500
 
@@ -606,25 +761,33 @@ class Adaptor(SelectorsGeneration):
606
  # Operations on text functions
607
  def json(self) -> Dict:
608
  """Return json response if the response is jsonable otherwise throws error"""
609
- return self.text.json()
 
 
 
610
 
611
- def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
 
612
  """Apply the given regex to the current text and return a list of strings with the matches.
613
 
614
  :param regex: Can be either a compiled regular expression or a string.
615
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
 
 
616
  """
617
- return self.text.re(regex, replace_entities)
618
 
619
- def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
 
620
  """Apply the given regex to text and return the first match if found, otherwise return the default value.
621
 
622
  :param regex: Can be either a compiled regular expression or a string.
623
  :param default: The default value to be returned if there is no match
624
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
625
-
 
626
  """
627
- return self.text.re_first(regex, default, replace_entities)
628
 
629
  def find_similar(
630
  self,
@@ -757,10 +920,10 @@ class Adaptor(SelectorsGeneration):
757
  return self.__convert_results(results)
758
 
759
  def find_by_regex(
760
- self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
761
  ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
762
  """Find elements that its text content matches the input regex pattern.
763
- :param query: Regex query to match
764
  :param first_match: Return first element that matches conditions, enabled by default
765
  :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
766
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
@@ -855,54 +1018,56 @@ class Adaptors(List[Adaptor]):
855
  ]
856
  return self.__class__(flatten(results))
857
 
858
- def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
 
859
  """Call the ``.re()`` method for each element in this list and return
860
  their results flattened as List of TextHandler.
861
 
862
  :param regex: Can be either a compiled regular expression or a string.
863
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
 
 
864
  """
865
  results = [
866
- n.text.re(regex, replace_entities) for n in self
867
  ]
868
  return flatten(results)
869
 
870
- def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
 
871
  """Call the ``.re_first()`` method for each element in this list and return
872
- their results flattened as List of TextHandler.
873
 
874
  :param regex: Can be either a compiled regular expression or a string.
875
  :param default: The default value to be returned if there is no match
876
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
877
 
 
 
 
 
878
  """
879
  results = [
880
- n.text.re_first(regex, default, replace_entities) for n in self
881
  ]
882
- return flatten(results)
883
-
884
- # def __getattr__(self, name):
885
- # if name in dir(self.__class__):
886
- # return super().__getattribute__(name)
887
- #
888
- # # Execute the method itself on each Adaptor
889
- # results = []
890
- # for item in self:
891
- # results.append(getattr(item, name))
892
- #
893
- # if all(callable(r) for r in results):
894
- # def call_all(*args, **kwargs):
895
- # final_results = [r(*args, **kwargs) for r in results]
896
- # if all([isinstance(r, (Adaptor, Adaptors,)) for r in results]):
897
- # return self.__class__(final_results)
898
- # return final_results
899
- #
900
- # return call_all
901
- # else:
902
- # # Flatten the result if it's a single-item list containing a list
903
- # if len(self) == 1 and isinstance(results[0], list):
904
- # return self.__class__(results[0])
905
- # return self.__class__(results)
906
 
907
  def get(self, default=None):
908
  """Returns the first item of the current list
 
1
  import os
2
+ import re
3
+ import inspect
4
  from difflib import SequenceMatcher
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ from scrapling.core.translator import HTMLTranslator
7
+ from scrapling.core.mixins import SelectorsGeneration
8
+ from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
9
+ from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
10
+ from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
11
+ from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
12
  from lxml import etree, html
13
  from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
14
 
 
28
  huge_tree: bool = True,
29
  root: Optional[html.HtmlElement] = None,
30
  keep_comments: Optional[bool] = False,
31
+ auto_match: Optional[bool] = True,
32
  storage: Any = SQLiteStorageSystem,
33
  storage_args: Optional[Dict] = None,
34
  debug: Optional[bool] = True,
 
121
  def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
122
  """Return True if given element is a result of a string expression
123
  Examples:
124
+ XPath -> '/text()', '/@attribute' etc...
125
  CSS3 -> '::text', '::attr(attrib)'...
126
  """
127
  # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
 
159
  results = [self.__get_correct_result(n) for n in result]
160
  if all(isinstance(res, self.__class__) for res in results):
161
  return Adaptors(results)
162
+ elif all(isinstance(res, TextHandler) for res in results):
163
+ return TextHandlers(results)
164
  return results
165
 
166
  return self.__get_correct_result(result)
 
397
  return self.__convert_results(score_table[highest_probability])
398
  return []
399
 
400
+ def css_first(self, selector: str, identifier: str = '',
401
+ auto_match: bool = False, auto_save: bool = False, percentage: int = 0
402
+ ) -> Union['Adaptor', 'TextHandler', None]:
403
+ """Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
404
+
405
+ **Important:
406
+ It's recommended to use the identifier argument if you plan to use different selector later
407
+ and want to relocate the same element(s)**
408
+
409
+ :param selector: The CSS3 selector to be used.
410
+ :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
411
+ :param identifier: A string that will be used to save/retrieve element's data in auto-matching
412
+ otherwise the selector will be used.
413
+ :param auto_save: Automatically save new elements for `auto_match` later
414
+ :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
415
+ Be aware that the percentage calculation depends solely on the page structure so don't play with this
416
 + number unless you really know what you are doing!
417
+
418
 + :return: The first matched `Adaptor` object, or `None` if nothing matched
419
+ """
420
+ for element in self.css(selector, identifier, auto_match, auto_save, percentage):
421
+ return element
422
+ return None
423
+
424
+ def xpath_first(self, selector: str, identifier: str = '',
425
+ auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
426
+ ) -> Union['Adaptor', 'TextHandler', None]:
427
+ """Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
428
+
429
+ **Important:
430
+ It's recommended to use the identifier argument if you plan to use different selector later
431
+ and want to relocate the same element(s)**
432
+
433
+ Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
434
+
435
+ :param selector: The XPath selector to be used.
436
+ :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
437
+ :param identifier: A string that will be used to save/retrieve element's data in auto-matching
438
+ otherwise the selector will be used.
439
+ :param auto_save: Automatically save new elements for `auto_match` later
440
+ :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
441
+ Be aware that the percentage calculation depends solely on the page structure so don't play with this
442
 + number unless you really know what you are doing!
443
+
444
 + :return: The first matched `Adaptor` object, or `None` if nothing matched
445
+ """
446
+ for element in self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs):
447
+ return element
448
+ return None
449
+
450
  def css(self, selector: str, identifier: str = '',
451
  auto_match: bool = False, auto_save: bool = False, percentage: int = 0
452
  ) -> Union['Adaptors[Adaptor]', List]:
 
543
  except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
544
  raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
545
 
546
+ def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptors[Adaptor]', List]:
547
 + """Find elements using filters of your own creation, for convenience.
548
+
549
+ :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
550
+ :param kwargs: The attributes you want to filter elements based on it.
551
+ :return: The `Adaptors` object of the elements or empty list
552
+ """
553
+ # Attributes that are Python reserved words and can't be used directly
554
+ # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
555
+ # https://www.w3schools.com/python/python_ref_keywords.asp
556
+ whitelisted = {
557
+ 'class_': 'class',
558
+ 'for_': 'for',
559
+ }
560
+
561
+ if not args and not kwargs:
562
+ raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')
563
+
564
+ attributes = dict()
565
+ tags, patterns = set(), set()
566
+ results, functions, selectors = [], [], []
567
+
568
+ def _search_tree(element: Adaptor, filter_function: Callable) -> None:
569
+ """Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
570
+ if filter_function(element):
571
+ results.append(element)
572
+
573
+ for branch in element.children:
574
+ _search_tree(branch, filter_function)
575
+
576
+ # Brace yourself for a wonderful journey!
577
+ for arg in args:
578
+ if type(arg) is str:
579
+ tags.add(arg)
580
+
581
+ elif type(arg) in [list, tuple, set]:
582
+ if not all(map(lambda x: type(x) is str, arg)):
583
+ raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
584
+ tags.update(set(arg))
585
+
586
+ elif type(arg) is dict:
587
+ if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
588
+ raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
589
+ attributes.update(arg)
590
+
591
+ elif type(arg) is re.Pattern:
592
+ patterns.add(arg)
593
+
594
+ elif callable(arg):
595
+ if len(inspect.signature(arg).parameters) > 0:
596
+ functions.append(arg)
597
+ else:
598
+ raise TypeError("Callable filter function must have at least one argument to take `Adaptor` objects.")
599
+
600
+ else:
601
+ raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
602
+
603
+ if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
604
+ raise TypeError('Only string values are accepted for arguments')
605
+
606
+ for attribute_name, value in kwargs.items():
607
+ # Only replace names for kwargs, replacing them in dictionaries doesn't make sense
608
+ attribute_name = whitelisted.get(attribute_name, attribute_name)
609
+ attributes[attribute_name] = value
610
+
611
+ # It's easier and faster to build a selector than traversing the tree
612
+ tags = tags or ['']
613
+ for tag in tags:
614
+ selector = tag
615
+ for key, value in attributes.items():
616
+ value = value.replace('"', r'\"') # Escape double quotes in user input
617
+ # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
618
+ selector += '[{}="{}"]'.format(key, value)
619
+ if selector:
620
+ selectors.append(selector)
621
+
622
+ if selectors:
623
+ results = self.css(', '.join(selectors))
624
+ if results:
625
+ # From the results, get the ones that fulfill passed regex patterns
626
+ for pattern in patterns:
627
+ results = results.filter(lambda e: e.text.re(pattern, check_match=True))
628
+
629
+ # From the results, get the ones that fulfill passed functions
630
+ for function in functions:
631
+ results = results.filter(function)
632
+ else:
633
+ for pattern in patterns:
634
+ results.extend(self.find_by_regex(pattern, first_match=False))
635
+
636
+ for result in (results or [self]):
637
+ for function in functions:
638
+ _search_tree(result, function)
639
+
640
+ return self.__convert_results(results)
641
+
642
+ def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
643
 + """Find elements using filters of your own creation and return the first result; otherwise return `None`.
644
+
645
+ :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
646
+ :param kwargs: The attributes you want to filter elements based on it.
647
+ :return: The `Adaptor` object of the element or `None` if the result didn't match
648
+ """
649
+ for element in self.find_all(*args, **kwargs):
650
+ return element
651
+ return None
652
+
653
  def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
654
  """Used internally to calculate a score that shows how candidate element similar to the original one
655
 
 
761
  # Operations on text functions
762
  def json(self) -> Dict:
763
  """Return json response if the response is jsonable otherwise throws error"""
764
+ if self.text:
765
+ return self.text.json()
766
+ else:
767
+ return self.get_all_text(strip=True).json()
768
 
769
+ def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
770
+ clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
771
  """Apply the given regex to the current text and return a list of strings with the matches.
772
 
773
  :param regex: Can be either a compiled regular expression or a string.
774
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
775
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
776
 + :param case_sensitive: if enabled, letters case will be taken into consideration while matching
777
  """
778
+ return self.text.re(regex, replace_entities, clean_match, case_sensitive)
779
 
780
+ def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
781
+ clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
782
  """Apply the given regex to text and return the first match if found, otherwise return the default value.
783
 
784
  :param regex: Can be either a compiled regular expression or a string.
785
  :param default: The default value to be returned if there is no match
786
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
787
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
788
 + :param case_sensitive: if enabled, letters case will be taken into consideration while matching
789
  """
790
+ return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
791
 
792
  def find_similar(
793
  self,
 
920
  return self.__convert_results(results)
921
 
922
  def find_by_regex(
923
+ self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
924
  ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
925
  """Find elements that its text content matches the input regex pattern.
926
+ :param query: Regex query/pattern to match
927
  :param first_match: Return first element that matches conditions, enabled by default
928
  :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
929
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
 
1018
  ]
1019
  return self.__class__(flatten(results))
1020
 
1021
+ def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
1022
+ clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
1023
  """Call the ``.re()`` method for each element in this list and return
1024
  their results flattened as List of TextHandler.
1025
 
1026
  :param regex: Can be either a compiled regular expression or a string.
1027
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
1028
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1029
 + :param case_sensitive: if enabled, letters case will be taken into consideration while matching
1030
  """
1031
  results = [
1032
+ n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
1033
  ]
1034
  return flatten(results)
1035
 
1036
+ def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
1037
+ clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
1038
  """Call the ``.re_first()`` method for each element in this list and return
1039
+ the first result or the default value otherwise.
1040
 
1041
  :param regex: Can be either a compiled regular expression or a string.
1042
  :param default: The default value to be returned if there is no match
1043
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
1044
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1045
 + :param case_sensitive: if enabled, letters case will be taken into consideration while matching
1046
+ """
1047
+ for n in self:
1048
+ for result in n.re(regex, replace_entities, clean_match, case_sensitive):
1049
+ return result
1050
+ return default
1051
+
1052
+ def search(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
1053
+ """Loop over all current elements and return the first element that matches the passed function
1054
+ :param func: A function that takes each element as an argument and returns True/False
1055
+ :return: The first element that match the function or ``None`` otherwise.
1056
+ """
1057
+ for element in self:
1058
+ if func(element):
1059
+ return element
1060
+ return None
1061
 
1062
+ def filter(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptors', List]:
1063
+ """Filter current elements based on the passed function
1064
+ :param func: A function that takes each element as an argument and returns True/False
1065
+ :return: The new `Adaptors` object or empty list otherwise.
1066
  """
1067
  results = [
1068
+ element for element in self if func(element)
1069
  ]
1070
+ return self.__class__(results) if results else results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1071
 
1072
  def get(self, default=None):
1073
  """Returns the first item of the current list
setup.cfg CHANGED
@@ -1,8 +1,8 @@
1
  [metadata]
2
  name = scrapling
3
- version = 0.1.2
4
  author = Karim Shoair
5
  author_email = karim.shoair@pm.me
6
  description = Scrapling is a powerful, flexible, adaptive, and high-performance web scraping library for Python.
7
  license = BSD
8
- home-page = https://github.com/D4Vinci/Scrapling
 
1
  [metadata]
2
  name = scrapling
3
+ version = 0.2
4
  author = Karim Shoair
5
  author_email = karim.shoair@pm.me
6
  description = Scrapling is a powerful, flexible, adaptive, and high-performance web scraping library for Python.
7
  license = BSD
8
+ home_page = https://github.com/D4Vinci/Scrapling
setup.py CHANGED
@@ -1,4 +1,4 @@
1
- from setuptools import setup
2
 
3
  with open("README.md", "r", encoding="utf-8") as fh:
4
  long_description = fh.read()
@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
6
 
7
  setup(
8
  name="scrapling",
9
- version="0.1.2",
10
  description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
11
  simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
12
  impressive speed improvements over many popular scraping tools.""",
@@ -15,7 +15,7 @@ setup(
15
  author="Karim Shoair",
16
  author_email="karim.shoair@pm.me",
17
  license="BSD",
18
- packages=["scrapling",],
19
  zip_safe=False,
20
  package_dir={
21
  "scrapling": "scrapling",
@@ -32,16 +32,17 @@ setup(
32
  "Natural Language :: English",
33
  "Topic :: Internet :: WWW/HTTP",
34
  "Topic :: Text Processing :: Markup",
 
35
  "Topic :: Text Processing :: Markup :: HTML",
36
  "Topic :: Software Development :: Libraries :: Python Modules",
37
  "Programming Language :: Python :: 3",
38
  "Programming Language :: Python :: 3 :: Only",
39
- "Programming Language :: Python :: 3.7",
40
  "Programming Language :: Python :: 3.8",
41
  "Programming Language :: Python :: 3.9",
42
  "Programming Language :: Python :: 3.10",
43
  "Programming Language :: Python :: 3.11",
44
  "Programming Language :: Python :: 3.12",
 
45
  "Programming Language :: Python :: Implementation :: CPython",
46
  "Typing :: Typed",
47
  ],
@@ -53,8 +54,13 @@ setup(
53
  "w3lib",
54
  "orjson>=3",
55
  "tldextract",
 
 
 
 
 
56
  ],
57
- python_requires=">=3.7",
58
  url="https://github.com/D4Vinci/Scrapling",
59
  project_urls={
60
  "Documentation": "https://github.com/D4Vinci/Scrapling/tree/main/docs", # For now
 
1
+ from setuptools import setup, find_packages
2
 
3
  with open("README.md", "r", encoding="utf-8") as fh:
4
  long_description = fh.read()
 
6
 
7
  setup(
8
  name="scrapling",
9
+ version="0.2",
10
  description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
11
  simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
12
  impressive speed improvements over many popular scraping tools.""",
 
15
  author="Karim Shoair",
16
  author_email="karim.shoair@pm.me",
17
  license="BSD",
18
+ packages=find_packages(),
19
  zip_safe=False,
20
  package_dir={
21
  "scrapling": "scrapling",
 
32
  "Natural Language :: English",
33
  "Topic :: Internet :: WWW/HTTP",
34
  "Topic :: Text Processing :: Markup",
35
+ "Topic :: Internet :: WWW/HTTP :: Browsers",
36
  "Topic :: Text Processing :: Markup :: HTML",
37
  "Topic :: Software Development :: Libraries :: Python Modules",
38
  "Programming Language :: Python :: 3",
39
  "Programming Language :: Python :: 3 :: Only",
 
40
  "Programming Language :: Python :: 3.8",
41
  "Programming Language :: Python :: 3.9",
42
  "Programming Language :: Python :: 3.10",
43
  "Programming Language :: Python :: 3.11",
44
  "Programming Language :: Python :: 3.12",
45
+ "Programming Language :: Python :: 3.13",
46
  "Programming Language :: Python :: Implementation :: CPython",
47
  "Typing :: Typed",
48
  ],
 
54
  "w3lib",
55
  "orjson>=3",
56
  "tldextract",
57
+ 'httpx[brotli,zstd]',
58
+ 'playwright',
59
+ 'rebrowser-playwright',
60
+ 'camoufox>=0.3.7',
61
+ 'browserforge',
62
  ],
63
+ python_requires=">=3.8",
64
  url="https://github.com/D4Vinci/Scrapling",
65
  project_urls={
66
  "Documentation": "https://github.com/D4Vinci/Scrapling/tree/main/docs", # For now
tests/fetchers/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Because I'm too lazy to mock requests :)
tests/fetchers/test_camoufox.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ import pytest_httpbin
3
+
4
+ from scrapling import StealthyFetcher
5
+
6
+
7
+ @pytest_httpbin.use_class_based_httpbin
8
+ # @pytest_httpbin.use_class_based_httpbin_secure
9
+ class TestStealthyFetcher(unittest.TestCase):
10
+ def setUp(self):
11
+ self.fetcher = StealthyFetcher(auto_match=False)
12
+ url = self.httpbin.url
13
+ self.status_200 = f'{url}/status/200'
14
+ self.status_404 = f'{url}/status/404'
15
+ self.status_501 = f'{url}/status/501'
16
+ self.basic_url = f'{url}/get'
17
+ self.html_url = f'{url}/html'
18
+ self.delayed_url = f'{url}/delay/10' # 10 Seconds delay response
19
+ self.cookies_url = f"{url}/cookies/set/test/value"
20
+
21
+ def test_basic_fetch(self):
22
+ """Test doing basic fetch request with multiple statuses"""
23
+ self.assertEqual(self.fetcher.fetch(self.status_200).status, 200)
24
+ self.assertEqual(self.fetcher.fetch(self.status_404).status, 404)
25
+ self.assertEqual(self.fetcher.fetch(self.status_501).status, 501)
26
+
27
+ def test_networkidle(self):
28
 + """Test whether waiting for `networkidle` prevents the page from finishing loading"""
29
+ self.assertEqual(self.fetcher.fetch(self.basic_url, network_idle=True).status, 200)
30
+
31
+ def test_blocking_resources(self):
32
 + """Test whether blocking resources prevents the page from finishing loading"""
33
+ self.assertEqual(self.fetcher.fetch(self.basic_url, block_images=True).status, 200)
34
+ self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)
35
+
36
+ def test_waiting_selector(self):
37
 + """Test whether waiting for a selector prevents the page from finishing loading"""
38
+ self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
39
+
40
+ def test_cookies_loading(self):
41
+ """Test if cookies are set after the request"""
42
+ self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})
43
+
44
+ def test_automation(self):
45
 + """Test whether page automation breaks the code or not"""
46
+ def scroll_page(page):
47
+ page.mouse.wheel(10, 0)
48
+ page.mouse.move(100, 400)
49
+ page.mouse.up()
50
+ return page
51
+
52
+ self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)
53
+
54
+ def test_properties(self):
55
+ """Test if different arguments breaks the code or not"""
56
+ self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
57
+ self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
58
+ self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
59
+
60
+ def test_infinite_timeout(self):
61
+ """Test if infinite timeout breaks the code or not"""
62
+ self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
tests/fetchers/test_httpx.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ import pytest_httpbin
3
+
4
+ from scrapling import Fetcher
5
+
6
+
7
+ @pytest_httpbin.use_class_based_httpbin
8
+ class TestFetcher(unittest.TestCase):
9
+ def setUp(self):
10
+ self.fetcher = Fetcher(auto_match=False)
11
+ url = self.httpbin.url
12
+ self.status_200 = f'{url}/status/200'
13
+ self.status_404 = f'{url}/status/404'
14
+ self.status_501 = f'{url}/status/501'
15
+ self.basic_url = f'{url}/get'
16
+ self.post_url = f'{url}/post'
17
+ self.put_url = f'{url}/put'
18
+ self.delete_url = f'{url}/delete'
19
+ self.html_url = f'{url}/html'
20
+
21
+ def test_basic_get(self):
22
+ """Test doing basic get request with multiple statuses"""
23
+ self.assertEqual(self.fetcher.get(self.status_200).status, 200)
24
+ self.assertEqual(self.fetcher.get(self.status_404).status, 404)
25
+ self.assertEqual(self.fetcher.get(self.status_501).status, 501)
26
+
27
+ def test_get_properties(self):
28
+ """Test if different arguments with GET request breaks the code or not"""
29
+ self.assertEqual(self.fetcher.get(self.status_200, stealthy_headers=True).status, 200)
30
+ self.assertEqual(self.fetcher.get(self.status_200, follow_redirects=True).status, 200)
31
+ self.assertEqual(self.fetcher.get(self.status_200, timeout=None).status, 200)
32
+ self.assertEqual(
33
+ self.fetcher.get(self.status_200, stealthy_headers=True, follow_redirects=True, timeout=None).status,
34
+ 200
35
+ )
36
+
37
+ def test_post_properties(self):
38
+ """Test if different arguments with POST request breaks the code or not"""
39
+ self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}).status, 200)
40
+ self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True).status, 200)
41
+ self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, follow_redirects=True).status, 200)
42
+ self.assertEqual(self.fetcher.post(self.post_url, data={'key': 'value'}, timeout=None).status, 200)
43
+ self.assertEqual(
44
+ self.fetcher.post(self.post_url, data={'key': 'value'}, stealthy_headers=True, follow_redirects=True, timeout=None).status,
45
+ 200
46
+ )
47
+
48
+ def test_put_properties(self):
49
+ """Test if different arguments with PUT request breaks the code or not"""
50
+ self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}).status, 200)
51
+ self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True).status, 200)
52
+ self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, follow_redirects=True).status, 200)
53
+ self.assertEqual(self.fetcher.put(self.put_url, data={'key': 'value'}, timeout=None).status, 200)
54
+ self.assertEqual(
55
+ self.fetcher.put(self.put_url, data={'key': 'value'}, stealthy_headers=True, follow_redirects=True, timeout=None).status,
56
+ 200
57
+ )
58
+
59
+ def test_delete_properties(self):
60
+ """Test if different arguments with DELETE request breaks the code or not"""
61
+ self.assertEqual(self.fetcher.delete(self.delete_url, stealthy_headers=True).status, 200)
62
+ self.assertEqual(self.fetcher.delete(self.delete_url, follow_redirects=True).status, 200)
63
+ self.assertEqual(self.fetcher.delete(self.delete_url, timeout=None).status, 200)
64
+ self.assertEqual(
65
+ self.fetcher.delete(self.delete_url, stealthy_headers=True, follow_redirects=True, timeout=None).status,
66
+ 200
67
+ )
tests/fetchers/test_playwright.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ import pytest_httpbin
3
+
4
+ from scrapling import PlayWrightFetcher
5
+
6
+
7
+ @pytest_httpbin.use_class_based_httpbin
8
+ # @pytest_httpbin.use_class_based_httpbin_secure
9
+ class TestPlayWrightFetcher(unittest.TestCase):
10
+ def setUp(self):
11
+ self.fetcher = PlayWrightFetcher(auto_match=False)
12
+ url = self.httpbin.url
13
+ self.status_200 = f'{url}/status/200'
14
+ self.status_404 = f'{url}/status/404'
15
+ self.status_501 = f'{url}/status/501'
16
+ self.basic_url = f'{url}/get'
17
+ self.html_url = f'{url}/html'
18
+ self.delayed_url = f'{url}/delay/10' # 10 Seconds delay response
19
+ self.cookies_url = f"{url}/cookies/set/test/value"
20
+
21
+ def test_basic_fetch(self):
22
+ """Test doing basic fetch request with multiple statuses"""
23
+ self.assertEqual(self.fetcher.fetch(self.status_200).status, 200)
24
+ self.assertEqual(self.fetcher.fetch(self.status_404).status, 404)
25
+ self.assertEqual(self.fetcher.fetch(self.status_501).status, 501)
26
+
27
+ def test_networkidle(self):
28
+ """Test if waiting for `networkidle` make page does not finish loading or not"""
29
+ self.assertEqual(self.fetcher.fetch(self.basic_url, network_idle=True).status, 200)
30
+
31
+ def test_blocking_resources(self):
32
+ """Test if blocking resources make page does not finish loading or not"""
33
+ self.assertEqual(self.fetcher.fetch(self.basic_url, disable_resources=True).status, 200)
34
+
35
+ def test_waiting_selector(self):
36
+ """Test if waiting for a selector make page does not finish loading or not"""
37
+ self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
38
+
39
+ def test_cookies_loading(self):
40
+ """Test if cookies are set after the request"""
41
+ self.assertEqual(self.fetcher.fetch(self.cookies_url).cookies, {'test': 'value'})
42
+
43
+ def test_automation(self):
44
+ """Test if automation break the code or not"""
45
+ def scroll_page(page):
46
+ page.mouse.wheel(10, 0)
47
+ page.mouse.move(100, 400)
48
+ page.mouse.up()
49
+ return page
50
+
51
+ self.assertEqual(self.fetcher.fetch(self.html_url, page_action=scroll_page).status, 200)
52
+
53
+ def test_properties(self):
54
+ """Test if different arguments breaks the code or not"""
55
+ self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=True, hide_canvas=False).status, 200)
56
+ self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
57
+ self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
58
+ self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
59
+
60
+ def test_cdp_url(self):
61
+ """Test if it's going to try to connect to cdp url or not"""
62
+ with self.assertRaises(ValueError):
63
+ _ = self.fetcher.fetch(self.html_url, cdp_url='blahblah')
64
+
65
+ with self.assertRaises(ValueError):
66
+ _ = self.fetcher.fetch(self.html_url, cdp_url='blahblah', nstbrowser_mode=True)
67
+
68
+ with self.assertRaises(Exception):
69
+ # There's no type for this error in PlayWright, it's just `Error`
70
+ _ = self.fetcher.fetch(self.html_url, cdp_url='ws://blahblah')
71
+
72
+ def test_infinite_timeout(self):
73
+ """Test if infinite timeout breaks the code or not"""
74
+ self.assertEqual(self.fetcher.fetch(self.delayed_url, timeout=None).status, 200)
tests/parser/__init__.py ADDED
File without changes
tests/parser/test_automatch.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+
3
+ from scrapling import Adaptor
4
+
5
+
6
+ class TestParserAutoMatch(unittest.TestCase):
7
+
8
+ def test_element_relocation(self):
9
+ """Test relocating element after structure change"""
10
+ original_html = '''
11
+ <div class="container">
12
+ <section class="products">
13
+ <article class="product" id="p1">
14
+ <h3>Product 1</h3>
15
+ <p class="description">Description 1</p>
16
+ </article>
17
+ <article class="product" id="p2">
18
+ <h3>Product 2</h3>
19
+ <p class="description">Description 2</p>
20
+ </article>
21
+ </section>
22
+ </div>
23
+ '''
24
+ changed_html = '''
25
+ <div class="new-container">
26
+ <div class="product-wrapper">
27
+ <section class="products">
28
+ <article class="product new-class" data-id="p1">
29
+ <div class="product-info">
30
+ <h3>Product 1</h3>
31
+ <p class="new-description">Description 1</p>
32
+ </div>
33
+ </article>
34
+ <article class="product new-class" data-id="p2">
35
+ <div class="product-info">
36
+ <h3>Product 2</h3>
37
+ <p class="new-description">Description 2</p>
38
+ </div>
39
+ </article>
40
+ </section>
41
+ </div>
42
+ </div>
43
+ '''
44
+
45
+ old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
46
+ new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)
47
+
48
+ # 'p1' was used as an ID and now it's not, and all the path elements have changed
49
+ # Also at the same time testing auto-match vs combined selectors
50
+ _ = old_page.css('#p1, #p2', auto_save=True)[0]
51
+ relocated = new_page.css('#p1', auto_match=True)
52
+
53
+ self.assertIsNotNone(relocated)
54
+ self.assertEqual(relocated[0].attrib['data-id'], 'p1')
55
+ self.assertTrue(relocated[0].has_class('new-class'))
56
+ self.assertEqual(relocated[0].css('.new-description')[0].text, 'Description 1')
tests/{test_all_functions.py → parser/test_general.py} RENAMED
@@ -112,11 +112,11 @@ class TestParser(unittest.TestCase):
112
 
113
  def test_find_similar_elements(self):
114
  """Test Finding similar elements of an element"""
115
- first_product = self.page.css('.product')[0]
116
  similar_products = first_product.find_similar()
117
  self.assertEqual(len(similar_products), 2)
118
 
119
- first_review = self.page.css('.review')[0]
120
  similar_high_rated_reviews = [
121
  review
122
  for review in first_review.find_similar()
@@ -127,16 +127,16 @@ class TestParser(unittest.TestCase):
127
  def test_expected_errors(self):
128
  """Test errors that should raised if it does"""
129
  with self.assertRaises(ValueError):
130
- _ = Adaptor()
131
 
132
  with self.assertRaises(TypeError):
133
- _ = Adaptor(root="ayo")
134
 
135
  with self.assertRaises(TypeError):
136
- _ = Adaptor(text=1)
137
 
138
  with self.assertRaises(TypeError):
139
- _ = Adaptor(body=1)
140
 
141
  with self.assertRaises(ValueError):
142
  _ = Adaptor(self.html, storage=object, auto_match=True)
@@ -169,8 +169,8 @@ class TestParser(unittest.TestCase):
169
  def test_selectors_generation(self):
170
  """Try to create selectors for all elements in the page"""
171
  def _traverse(element: Adaptor):
172
- self.assertTrue(type(element.css_selector) is str)
173
- self.assertTrue(type(element.xpath_selector) is str)
174
  for branch in element.children:
175
  _traverse(branch)
176
 
@@ -197,7 +197,7 @@ class TestParser(unittest.TestCase):
197
  parent_siblings = parent.siblings
198
  self.assertEqual(len(parent_siblings), 1)
199
 
200
- child = table.css('[data-id="1"]')[0]
201
  next_element = child.next
202
  self.assertEqual(next_element.attrib['data-id'], '2')
203
 
@@ -261,60 +261,10 @@ class TestParser(unittest.TestCase):
261
  key_value = list(products[0].attrib.search_values('1', partial=True))
262
  self.assertEqual(list(key_value[0].keys()), ['data-id'])
263
 
264
- attr_json = self.page.css('#products')[0].attrib['schema'].json()
265
  self.assertEqual(attr_json, {'jsonable': 'data'})
266
  self.assertEqual(type(self.page.css('#products')[0].attrib.json_string), bytes)
267
 
268
- def test_element_relocation(self):
269
- """Test relocating element after structure change"""
270
- original_html = '''
271
- <div class="container">
272
- <section class="products">
273
- <article class="product" id="p1">
274
- <h3>Product 1</h3>
275
- <p class="description">Description 1</p>
276
- </article>
277
- <article class="product" id="p2">
278
- <h3>Product 2</h3>
279
- <p class="description">Description 2</p>
280
- </article>
281
- </section>
282
- </div>
283
- '''
284
- changed_html = '''
285
- <div class="new-container">
286
- <div class="product-wrapper">
287
- <section class="products">
288
- <article class="product new-class" data-id="p1">
289
- <div class="product-info">
290
- <h3>Product 1</h3>
291
- <p class="new-description">Description 1</p>
292
- </div>
293
- </article>
294
- <article class="product new-class" data-id="p2">
295
- <div class="product-info">
296
- <h3>Product 2</h3>
297
- <p class="new-description">Description 2</p>
298
- </div>
299
- </article>
300
- </section>
301
- </div>
302
- </div>
303
- '''
304
-
305
- old_page = Adaptor(original_html, url='example.com', auto_match=True, debug=True)
306
- new_page = Adaptor(changed_html, url='example.com', auto_match=True, debug=True)
307
-
308
- # 'p1' was used as ID and now it's not and all the path elements have changes
309
- # Also at the same time testing auto-match vs combined selectors
310
- _ = old_page.css('#p1, #p2', auto_save=True)[0]
311
- relocated = new_page.css('#p1', auto_match=True)
312
-
313
- self.assertIsNotNone(relocated)
314
- self.assertEqual(relocated[0].attrib['data-id'], 'p1')
315
- self.assertTrue(relocated[0].has_class('new-class'))
316
- self.assertEqual(relocated[0].css('.new-description')[0].text, 'Description 1')
317
-
318
  def test_performance(self):
319
  """Test parsing and selecting speed"""
320
  import time
@@ -331,6 +281,6 @@ class TestParser(unittest.TestCase):
331
  self.assertLess(end_time - start_time, 0.1)
332
 
333
 
334
- # Use `coverage run -m unittest --verbose tests/test_all_functions.py` instead for the coverage report
335
  # if __name__ == '__main__':
336
  # unittest.main(verbosity=2)
 
112
 
113
  def test_find_similar_elements(self):
114
  """Test Finding similar elements of an element"""
115
+ first_product = self.page.css_first('.product')
116
  similar_products = first_product.find_similar()
117
  self.assertEqual(len(similar_products), 2)
118
 
119
+ first_review = self.page.find('div', class_='review')
120
  similar_high_rated_reviews = [
121
  review
122
  for review in first_review.find_similar()
 
127
  def test_expected_errors(self):
128
  """Test errors that should raised if it does"""
129
  with self.assertRaises(ValueError):
130
+ _ = Adaptor(auto_match=False)
131
 
132
  with self.assertRaises(TypeError):
133
+ _ = Adaptor(root="ayo", auto_match=False)
134
 
135
  with self.assertRaises(TypeError):
136
+ _ = Adaptor(text=1, auto_match=False)
137
 
138
  with self.assertRaises(TypeError):
139
+ _ = Adaptor(body=1, auto_match=False)
140
 
141
  with self.assertRaises(ValueError):
142
  _ = Adaptor(self.html, storage=object, auto_match=True)
 
169
  def test_selectors_generation(self):
170
  """Try to create selectors for all elements in the page"""
171
  def _traverse(element: Adaptor):
172
+ self.assertTrue(type(element.generate_css_selector) is str)
173
+ self.assertTrue(type(element.generate_xpath_selector) is str)
174
  for branch in element.children:
175
  _traverse(branch)
176
 
 
197
  parent_siblings = parent.siblings
198
  self.assertEqual(len(parent_siblings), 1)
199
 
200
+ child = table.find({'data-id': "1"})
201
  next_element = child.next
202
  self.assertEqual(next_element.attrib['data-id'], '2')
203
 
 
261
  key_value = list(products[0].attrib.search_values('1', partial=True))
262
  self.assertEqual(list(key_value[0].keys()), ['data-id'])
263
 
264
+ attr_json = self.page.css_first('#products').attrib['schema'].json()
265
  self.assertEqual(attr_json, {'jsonable': 'data'})
266
  self.assertEqual(type(self.page.css('#products')[0].attrib.json_string), bytes)
267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  def test_performance(self):
269
  """Test parsing and selecting speed"""
270
  import time
 
281
  self.assertLess(end_time - start_time, 0.1)
282
 
283
 
284
+ # Use `coverage run -m unittest --verbose tests/parser/test_general.py` instead for the coverage report
285
  # if __name__ == '__main__':
286
  # unittest.main(verbosity=2)
tests/requirements.txt CHANGED
@@ -1,2 +1,7 @@
1
- pytest
2
- pytest-cov
 
 
 
 
 
 
1
+ pytest>=2.8.0,<9
2
+ pytest-cov
3
+ playwright
4
+ camoufox
5
+ werkzeug<3.0.0
6
+ pytest-httpbin==2.1.0
7
+ httpbin~=0.10.0
tox.ini CHANGED
@@ -4,14 +4,17 @@
4
  # and then run "tox" from this directory.
5
 
6
  [tox]
7
- envlist = pre-commit,py37,py38,py39,py310,py311,py312
8
 
9
  [testenv]
10
  usedevelop = True
11
  changedir = tests
12
  deps =
13
  -r{toxinidir}/tests/requirements.txt
14
- commands = pytest --cov=scrapling --cov-report=xml
 
 
 
15
 
16
  [testenv:pre-commit]
17
  basepython = python3
 
4
  # and then run "tox" from this directory.
5
 
6
  [tox]
7
+ envlist = pre-commit,py{38,39,310,311,312,313}
8
 
9
  [testenv]
10
  usedevelop = True
11
  changedir = tests
12
  deps =
13
  -r{toxinidir}/tests/requirements.txt
14
+ commands =
15
+ playwright install-deps chromium firefox
16
+ camoufox fetch --browserforge
17
+ pytest --cov=scrapling --cov-report=xml
18
 
19
  [testenv:pre-commit]
20
  basepython = python3