diff --git a/.bandit.yml b/.bandit.yml index 5acd3724e4f849eda061dff10c4d1caed3ad11b2..bd06507f49dbd7031ea5b87ce861359dbc533cec 100644 --- a/.bandit.yml +++ b/.bandit.yml @@ -6,4 +6,6 @@ skips: - B404 # Using subprocess library - B602 # subprocess call with shell=True identified - B110 # Try, Except, Pass detected. -- B104 # Possible binding to all interfaces. \ No newline at end of file +- B104 # Possible binding to all interfaces. +- B301 # Pickle and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue. +- B108 # Probable insecure usage of temp file/directory. \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/04-docs_issue.yml b/.github/ISSUE_TEMPLATE/04-docs_issue.yml new file mode 100644 index 0000000000000000000000000000000000000000..344537451e7deab333c1af33eba0af0f05f6a937 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/04-docs_issue.yml @@ -0,0 +1,40 @@ +name: Documentation issue +description: Report incorrect, unclear, or missing documentation. +labels: [documentation] +body: + - type: checkboxes + attributes: + label: Have you searched if there an existing issue for this? + description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/documentation). + options: + - label: I have searched the existing issues + required: true + + - type: input + attributes: + label: "Page URL" + description: "Link to the documentation page with the issue." + placeholder: "https://scrapling.readthedocs.io/en/latest/..." + validations: + required: true + + - type: dropdown + attributes: + label: "Type of issue" + options: + - Incorrect information + - Unclear or confusing + - Missing information + - Typo or formatting + - Broken link + - Other + default: 0 + validations: + required: true + + - type: textarea + attributes: + label: "Description" + description: "Describe what's wrong and what you expected to find." 
+ validations: + required: true diff --git a/.github/workflows/code-quality.yml b/.github/workflows/code-quality.yml index 8d15d0cc7d62cfb3b383ecc130261241e4d6bf31..ada0ec273c3ba3199208768325f1d419e73608c4 100644 --- a/.github/workflows/code-quality.yml +++ b/.github/workflows/code-quality.yml @@ -50,7 +50,9 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install bandit[toml] ruff vermin + pip install bandit[toml] ruff vermin mypy pyright + pip install -e ".[all]" + pip install lxml-stubs - name: Run Bandit (Security Linter) id: bandit @@ -85,6 +87,22 @@ jobs: vermin -t=3.10- --violations --eval-annotations --no-tips scrapling/ echo "::endgroup::" + - name: Run Mypy (Static Type Checker) + id: mypy + continue-on-error: true + run: | + echo "::group::Mypy - Static Type Checker" + mypy scrapling/ + echo "::endgroup::" + + - name: Run Pyright (Static Type Checker) + id: pyright + continue-on-error: true + run: | + echo "::group::Pyright - Static Type Checker" + pyright scrapling/ + echo "::endgroup::" + - name: Check results and create summary if: always() run: | @@ -126,6 +144,22 @@ jobs: all_passed=false fi + # Check Mypy + if [ "${{ steps.mypy.outcome }}" == "success" ]; then + echo "✅ **Mypy (Type Checker)**: Passed" >> $GITHUB_STEP_SUMMARY + else + echo "❌ **Mypy (Type Checker)**: Failed" >> $GITHUB_STEP_SUMMARY + all_passed=false + fi + + # Check Pyright + if [ "${{ steps.pyright.outcome }}" == "success" ]; then + echo "✅ **Pyright (Type Checker)**: Passed" >> $GITHUB_STEP_SUMMARY + else + echo "❌ **Pyright (Type Checker)**: Failed" >> $GITHUB_STEP_SUMMARY + all_passed=false + fi + echo "" >> $GITHUB_STEP_SUMMARY if [ "$all_passed" == "true" ]; then diff --git a/.gitignore b/.gitignore index 7890c42c60c3348acc02b8cced836a97b40f5589..f27bc086f90d735ad0be83499ee9b303039fa7de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +site/* + +# AI related files +.claude/* +CLAUDE.md + # cached files __pycache__/ *.py[cod] diff 
--git a/.readthedocs.yaml b/.readthedocs.yaml index d52ca2e8cbefb68f155bf822bd2b55e861e15fa4..8d5a8d52a7881b09df9293c049d402d5e2c993ec 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,25 +1,21 @@ -# Read the Docs configuration file -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details +# See https://docs.readthedocs.com/platform/stable/intro/zensical.html for details +# Example: https://github.com/readthedocs/test-builds/tree/zensical -# Required version: 2 -# Set the OS, Python version, and other tools you might need build: os: ubuntu-24.04 apt_packages: - pngquant tools: python: "3.13" - -# Build documentation with Mkdocs -mkdocs: - configuration: mkdocs.yml - -# Optionally, but recommended, -# declare the Python requirements required to build your documentation -# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html -python: + jobs: install: - - requirements: docs/requirements.txt - + - pip install -r docs/requirements.txt + - pip install ".[all]" + build: + html: + - zensical build + post_build: + - mkdir -p $READTHEDOCS_OUTPUT/html/ + - cp --recursive site/* $READTHEDOCS_OUTPUT/html/ diff --git a/docs/README.md b/README.md similarity index 62% rename from docs/README.md rename to README.md index b4d19508bb8dbc2a768ce2e63e5dc3384d6c8e20..3c5e5cad538b2629c0f17e403324be95502b9cf2 100644 --- a/docs/README.md +++ b/README.md @@ -1,13 +1,17 @@ -Automated translations: [العربيه](https://github.com/D4Vinci/Scrapling/blob/main/docs/README_AR.md) | [Español](https://github.com/D4Vinci/Scrapling/blob/main/docs/README_ES.md) | [Deutsch](https://github.com/D4Vinci/Scrapling/blob/main/docs/README_DE.md) | [简体中文](https://github.com/D4Vinci/Scrapling/blob/main/docs/README_CN.md) | [日本語](https://github.com/D4Vinci/Scrapling/blob/main/docs/README_JP.md) | [Русский](https://github.com/D4Vinci/Scrapling/blob/main/docs/README_RU.md) - +

+ + + + Scrapling Poster + + +
+ Effortless Web Scraping for the Modern Web +

-

-
- main poster -
- Easy, effortless Web Scraping as it should be! -

+ العربية | Español | Deutsch | 简体中文 | 日本語 | Русский
Tests @@ -27,44 +31,45 @@ Automated translations: [العربيه](https://github.com/D4Vinci/Scrapling/bl

- - Selection methods - - · - - Choosing a fetcher - - · - - CLI - - · - - MCP mode - - · - - Migrating from Beautifulsoup - + Selection methods + · + Choosing a fetcher + · + CLI + · + MCP mode + · + Migrating from Beautifulsoup

-**Stop fighting anti-bot systems. Stop rewriting selectors after every website update.** +Scrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl. -Scrapling isn't just another Web Scraping library. It's the first **adaptive** scraping library that learns from website changes and evolves with them. While other libraries break when websites update their structure, Scrapling automatically relocates your elements and keeps your scrapers running. +Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation — all in a few lines of Python. One library, zero compromises. -Built for the modern Web, Scrapling features **its own rapid parsing engine** and fetchers to handle all Web Scraping challenges you face or will face. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone. +Blazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone. ```python ->> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher ->> StealthyFetcher.adaptive = True -# Fetch websites' source under the radar! ->> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) ->> print(page.status) -200 ->> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes! ->> # Later, if the website structure changes, pass `adaptive=True` ->> products = page.css('.product', adaptive=True) # and Scrapling still finds them! 
+from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher +StealthyFetcher.adaptive = True +page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Fetch website under the radar! +products = page.css('.product', auto_save=True) # Scrape data that survives website design changes! +products = page.css('.product', adaptive=True) # Later, if the website structure changes, pass `adaptive=True` to find them! ``` +Or scale up to full crawls +```python +from scrapling.spiders import Spider, Response + +class MySpider(Spider): + name = "demo" + start_urls = ["https://example.com/"] + + async def parse(self, response: Response): + for item in response.css('.product'): + yield {"title": item.css('h2::text').get()} + +MySpider().start() +``` + # Sponsors @@ -90,16 +95,27 @@ Built for the modern Web, Scrapling features **its own rapid parsing engine** an ## Key Features +### Spiders — A Full Crawling Framework +- 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects. +- ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays. +- 🔄 **Multi-Session Support**: Unified interface for HTTP requests, and stealthy headless browsers in a single spider — route requests to different sessions by ID. +- 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off. +- 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats — ideal for UI, pipelines, and long-running crawls. +- 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic. +- 📦 **Built-in Export**: Export results through hooks and your own pipeline or the built-in JSON/JSONL with `result.items.to_json()` / `result.items.to_jsonl()` respectively. 
+ ### Advanced Websites Fetching with Session Support -- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP3. +- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP/3. - **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium and Google's Chrome. - **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can easily bypass all types of Cloudflare's Turnstile/Interstitial with automation. - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests. +- **Proxy Rotation**: Built-in `ProxyRotator` with cyclic or custom rotation strategies across all session types, plus per-request proxy overrides. +- **Domain Blocking**: Block requests to specific domains (and their subdomains) in browser-based fetchers. - **Async Support**: Complete async support across all fetchers and dedicated async session classes. ### Adaptive Scraping & AI Integration - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms. -- 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more. +- 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more. - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements. - 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. 
The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) @@ -111,51 +127,107 @@ Built for the modern Web, Scrapling features **its own rapid parsing engine** an ### Developer/Web Scraper Friendly Experience - 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping scripts development, like converting curl requests to Scrapling requests and viewing requests results in your browser. -- 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single code! +- 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code! - 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods. - 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations. - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element. - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel. -- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. +- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. The entire codebase is automatically scanned with **PyRight** and **MyPy** with each change. - 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed. ## Getting Started +Let's give you a quick glimpse of what Scrapling can do without deep diving. 
+ ### Basic Usage +HTTP requests with session support ```python -from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher -from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession +from scrapling.fetchers import Fetcher, FetcherSession -# HTTP requests with session support with FetcherSession(impersonate='chrome') as session: # Use latest version of Chrome's TLS fingerprint page = session.get('https://quotes.toscrape.com/', stealthy_headers=True) - quotes = page.css('.quote .text::text') + quotes = page.css('.quote .text::text').getall() # Or use one-off requests page = Fetcher.get('https://quotes.toscrape.com/') -quotes = page.css('.quote .text::text') +quotes = page.css('.quote .text::text').getall() +``` +Advanced stealth mode +```python +from scrapling.fetchers import StealthyFetcher, StealthySession -# Advanced stealth mode (Keep the browser open until you finish) -with StealthySession(headless=True, solve_cloudflare=True) as session: +with StealthySession(headless=True, solve_cloudflare=True) as session: # Keep the browser open until you finish page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False) - data = page.css('#padded_content a') + data = page.css('#padded_content a').getall() # Or use one-off request style, it opens the browser for this request, then closes it after finishing page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare') -data = page.css('#padded_content a') - -# Full browser automation (Keep the browser open until you finish) -with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: +data = page.css('#padded_content a').getall() +``` +Full browser automation +```python +from scrapling.fetchers import DynamicFetcher, DynamicSession + +with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Keep the browser open until you finish page = session.fetch('https://quotes.toscrape.com/', load_dom=False) - 
data = page.xpath('//span[@class="text"]/text()') # XPath selector if you prefer it + data = page.xpath('//span[@class="text"]/text()').getall() # XPath selector if you prefer it # Or use one-off request style, it opens the browser for this request, then closes it after finishing page = DynamicFetcher.fetch('https://quotes.toscrape.com/') -data = page.css('.quote .text::text') +data = page.css('.quote .text::text').getall() ``` -> [!NOTE] -> There's a wonderful guide to get you started quickly with Scrapling [here](https://substack.thewebscraping.club/p/scrapling-hands-on-guide) written by The Web Scraping Club. In case you find it easier to get you started than the [documentation website](https://scrapling.readthedocs.io/en/latest/). +### Spiders +Build full crawlers with concurrent requests, multiple session types, and pause/resume: +```python +from scrapling.spiders import Spider, Request, Response + +class QuotesSpider(Spider): + name = "quotes" + start_urls = ["https://quotes.toscrape.com/"] + concurrent_requests = 10 + + async def parse(self, response: Response): + for quote in response.css('.quote'): + yield { + "text": quote.css('.text::text').get(), + "author": quote.css('.author::text').get(), + } + + next_page = response.css('.next a') + if next_page: + yield response.follow(next_page[0].attrib['href']) + +result = QuotesSpider().start() +print(f"Scraped {len(result.items)} quotes") +result.items.to_json("quotes.json") +``` +Use multiple session types in a single spider: +```python +from scrapling.spiders import Spider, Request, Response +from scrapling.fetchers import FetcherSession, AsyncStealthySession + +class MultiSessionSpider(Spider): + name = "multi" + start_urls = ["https://example.com/"] + + def configure_sessions(self, manager): + manager.add("fast", FetcherSession(impersonate="chrome")) + manager.add("stealth", AsyncStealthySession(headless=True), lazy=True) + + async def parse(self, response: Response): + for link in 
response.css('a::attr(href)').getall(): + # Route protected pages through the stealth session + if "protected" in link: + yield Request(link, sid="stealth") + else: + yield Request(link, sid="fast", callback=self.parse) # explicit callback +``` +Pause and resume long crawls with checkpoints by running the spider like this: +```python +QuotesSpider(crawldir="./crawl_data").start() +``` +Press Ctrl+C to pause gracefully — progress is saved automatically. Later, when you start the spider again, pass the same `crawldir`, and it will resume from where it stopped. ### Advanced Parsing & Navigation ```python @@ -176,10 +248,9 @@ quotes = page.find_all(class_='quote') # and so on... quotes = page.find_by_text('quote', tag='div') # Advanced navigation -first_quote = page.css_first('.quote') -quote_text = first_quote.css('.text::text') -quote_text = page.css('.quote').css_first('.text::text') # Chained selectors -quote_text = page.css_first('.quote .text').text # Using `css_first` is faster than `css` if you want the first element +quote_text = page.css('.quote')[0].css('.text::text').get() +quote_text = page.css('.quote').css('.text::text').getall() # Chained selectors +first_quote = page.css('.quote')[0] author = first_quote.next_sibling.css('.author::text') parent_container = first_quote.parent @@ -220,7 +291,7 @@ async with AsyncStealthySession(max_pages=2) as session: ## CLI & Interactive Shell -Scrapling v0.3 includes a powerful command-line interface: +Scrapling includes a powerful command-line interface: [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339) @@ -237,34 +308,34 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas. ``` > [!NOTE] -> There are many additional features, but we want to keep this page concise, such as the MCP server and the interactive Web Scraping Shell. 
Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/) +> There are many additional features, but we want to keep this page concise, including the MCP server and the interactive Web Scraping Shell. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/) ## Performance Benchmarks -Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 have delivered exceptional performance improvements across all operations. The following benchmarks compare Scrapling's parser with other popular libraries. +Scrapling isn't just powerful—it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries. ### Text Extraction Speed Test (5000 nested elements) | # | Library | Time (ms) | vs Scrapling | |---|:-----------------:|:---------:|:------------:| -| 1 | Scrapling | 1.99 | 1.0x | -| 2 | Parsel/Scrapy | 2.01 | 1.01x | -| 3 | Raw Lxml | 2.5 | 1.256x | -| 4 | PyQuery | 22.93 | ~11.5x | -| 5 | Selectolax | 80.57 | ~40.5x | -| 6 | BS4 with Lxml | 1541.37 | ~774.6x | -| 7 | MechanicalSoup | 1547.35 | ~777.6x | -| 8 | BS4 with html5lib | 3410.58 | ~1713.9x | +| 1 | Scrapling | 2.02 | 1.0x | +| 2 | Parsel/Scrapy | 2.04 | 1.01 | +| 3 | Raw Lxml | 2.54 | 1.257 | +| 4 | PyQuery | 24.17 | ~12x | +| 5 | Selectolax | 82.63 | ~41x | +| 6 | MechanicalSoup | 1549.71 | ~767.1x | +| 7 | BS4 with Lxml | 1584.31 | ~784.3x | +| 8 | BS4 with html5lib | 3391.91 | ~1679.1x | ### Element Similarity & Text Search Performance Scrapling's adaptive element finding capabilities significantly outperform alternatives: -| Library | Time (ms) | vs Scrapling | +| Library | Time (ms) | vs Scrapling | |-------------|:---------:|:------------:| -| Scrapling | 2.46 | 1.0x | -| AutoScraper | 13.3 | 5.407x | +| Scrapling | 2.39 | 1.0x | +| AutoScraper | 12.45 | 5.209x | > All benchmarks represent averages of 100+ runs. 
See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology. @@ -277,7 +348,7 @@ Scrapling requires Python 3.10 or higher: pip install scrapling ``` -Starting with v0.3.2, this installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies. +This installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies. ### Optional Dependencies @@ -334,12 +405,5 @@ This work is licensed under the BSD-3-Clause License. This project includes code adapted from: - Parsel (BSD License)—Used for [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule -## Thanks and References - -- [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox) -- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright) -- [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques -- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research - ---
Designed & crafted with ❤️ by Karim Shoair.

\ No newline at end of file diff --git a/benchmarks.py b/benchmarks.py index 438466efe9d192d0fed4a02155df505e288c9666..fcb089c767eb1e9edcf90c51de8fc137a305cae3 100644 --- a/benchmarks.py +++ b/benchmarks.py @@ -75,7 +75,7 @@ def test_scrapling(): # No need to do `.extract()` like parsel to extract text # Also, this is faster than `[t.text for t in Selector(large_html, adaptive=False).css('.item')]` # for obvious reasons, of course. - return ScraplingSelector(large_html, adaptive=False).css(".item::text") + return ScraplingSelector(large_html, adaptive=False).css(".item::text").getall() @benchmark diff --git a/docs/README_AR.md b/docs/README_AR.md index 8a21afb1d6a6aca7157bfaaa1ec19fad0cd7290a..4ac86da42b2cd0ce884c749d542cefc9d57f3806 100644 --- a/docs/README_AR.md +++ b/docs/README_AR.md @@ -1,9 +1,14 @@ -

-
- main poster -
- استخراج بيانات الويب بسهولة ويسر كما يجب أن يكون! -

+

+ + + + Scrapling Poster + + +
+ Effortless Web Scraping for the Modern Web +

+

Tests @@ -24,46 +29,47 @@

- - طرق الاختيار - - · - - اختيار الجالب - - · - - واجهة سطر الأوامر - - · - - وضع MCP - - · - - الانتقال من Beautifulsoup - + طرق الاختيار + · + اختيار Fetcher + · + واجهة سطر الأوامر + · + وضع MCP + · + الانتقال من Beautifulsoup

-**توقف عن محاربة أنظمة مكافحة الروبوتات. توقف عن إعادة كتابة المحددات بعد كل تحديث للموقع.** +Scrapling هو إطار عمل تكيفي لـ Web Scraping يتعامل مع كل شيء من طلب واحد إلى زحف كامل النطاق. -Scrapling ليست مجرد مكتبة أخرى لاستخراج بيانات الويب. إنها أول مكتبة استخراج **تكيفية** تتعلم من تغييرات المواقع وتتطور معها. بينما تتعطل المكتبات الأخرى عندما تحدث المواقع بنيتها، يعيد Scrapling تحديد موقع عناصرك تلقائياً ويحافظ على عمل أدوات الاستخراج الخاصة بك. +محلله يتعلم من تغييرات المواقع ويعيد تحديد موقع عناصرك تلقائياً عند تحديث الصفحات. جوالبه تتجاوز أنظمة مكافحة الروبوتات مثل Cloudflare Turnstile مباشرةً. وإطار عمل Spider الخاص به يتيح لك التوسع إلى عمليات زحف متزامنة ومتعددة الجلسات مع إيقاف/استئناف وتدوير تلقائي لـ Proxy - كل ذلك في بضعة أسطر من Python. مكتبة واحدة، بدون تنازلات. -مبني للويب الحديث، يتميز Scrapling **بمحرك تحليل سريع خاص به** وجوالب للتعامل مع جميع تحديات استخراج بيانات الويب التي تواجهها أو ستواجهها. مبني بواسطة مستخرجي الويب لمستخرجي الويب والمستخدمين العاديين، هناك شيء للجميع. +زحف سريع للغاية مع إحصائيات فورية و Streaming. مبني بواسطة مستخرجي الويب لمستخرجي الويب والمستخدمين العاديين، هناك شيء للجميع. ```python ->> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher ->> StealthyFetcher.adaptive = True -# احصل على كود المصدر للمواقع بشكل خفي! ->> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) ->> print(page.status) -200 ->> products = page.css('.product', auto_save=True) # استخرج البيانات التي تنجو من تغييرات تصميم الموقع! ->> # لاحقاً، إذا تغيرت بنية الموقع، مرر `adaptive=True` ->> products = page.css('.product', adaptive=True) # و Scrapling لا يزال يجدها! +from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher +StealthyFetcher.adaptive = True +page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # احصل على الموقع بشكل خفي! 
+products = page.css('.product', auto_save=True) # استخرج بيانات تنجو من تغييرات تصميم الموقع! +products = page.css('.product', adaptive=True) # لاحقاً، إذا تغيرت بنية الموقع، مرر `adaptive=True` للعثور عليها! ``` +أو توسع إلى عمليات زحف كاملة +```python +from scrapling.spiders import Spider, Response -# الرعاة +class MySpider(Spider): + name = "demo" + start_urls = ["https://example.com/"] + + async def parse(self, response: Response): + for item in response.css('.product'): + yield {"title": item.css('h2::text').get()} + +MySpider().start() +``` + + +# الرعاة @@ -87,138 +93,211 @@ Scrapling ليست مجرد مكتبة أخرى لاستخراج بيانات ا ## الميزات الرئيسية +### Spiders — إطار عمل زحف كامل +- 🕷️ **واجهة Spider شبيهة بـ Scrapy**: عرّف Spiders مع `start_urls`، و async `parse` callbacks، وكائنات `Request`/`Response`. +- ⚡ **زحف متزامن**: حدود تزامن قابلة للتكوين، وتحكم بالسرعة حسب النطاق، وتأخيرات التنزيل. +- 🔄 **دعم الجلسات المتعددة**: واجهة موحدة لطلبات HTTP، ومتصفحات خفية بدون واجهة في Spider واحد — وجّه الطلبات إلى جلسات مختلفة بالمعرّف. +- 💾 **إيقاف واستئناف**: استمرارية الزحف القائمة على Checkpoint. اضغط Ctrl+C للإيقاف بسلاسة؛ أعد التشغيل للاستئناف من حيث توقفت. +- 📡 **وضع Streaming**: بث العناصر المستخرجة فور وصولها عبر `async for item in spider.stream()` مع إحصائيات فورية — مثالي لواجهات المستخدم وخطوط الأنابيب وعمليات الزحف الطويلة. +- 🛡️ **كشف الطلبات المحظورة**: كشف تلقائي وإعادة محاولة للطلبات المحظورة مع منطق قابل للتخصيص. +- 📦 **تصدير مدمج**: صدّر النتائج عبر الخطافات وخط الأنابيب الخاص بك أو JSON/JSONL المدمج مع `result.items.to_json()` / `result.items.to_jsonl()` على التوالي. + ### جلب متقدم للمواقع مع دعم الجلسات -- **طلبات HTTP**: طلبات HTTP سريعة وخفية مع فئة `Fetcher`. يمكنها تقليد بصمة TLS للمتصفح والرؤوس واستخدام HTTP3. +- **طلبات HTTP**: طلبات HTTP سريعة وخفية مع فئة `Fetcher`. يمكنها تقليد بصمة TLS للمتصفح والرؤوس واستخدام HTTP/3. 
- **التحميل الديناميكي**: جلب المواقع الديناميكية مع أتمتة كاملة للمتصفح من خلال فئة `DynamicFetcher` التي تدعم Chromium من Playwright و Google Chrome. -- **تجاوز مكافحة الروبوتات**: قدرات تخفي متقدمة مع `StealthyFetcher` وانتحال البصمات. يمكنه تجاوز جميع أنواع Turnstile/Interstitial من Cloudflare بسهولة بالأتمتة. +- **تجاوز مكافحة الروبوتات**: قدرات تخفي متقدمة مع `StealthyFetcher` وانتحال fingerprint. يمكنه تجاوز جميع أنواع Turnstile/Interstitial من Cloudflare بسهولة بالأتمتة. - **إدارة الجلسات**: دعم الجلسات المستمرة مع فئات `FetcherSession` و`StealthySession` و`DynamicSession` لإدارة ملفات تعريف الارتباط والحالة عبر الطلبات. +- **تدوير Proxy**: `ProxyRotator` مدمج مع استراتيجيات التدوير الدوري أو المخصصة عبر جميع أنواع الجلسات، بالإضافة إلى تجاوزات Proxy لكل طلب. +- **حظر النطاقات**: حظر الطلبات إلى نطاقات محددة (ونطاقاتها الفرعية) في الجوالب المعتمدة على المتصفح. - **دعم Async**: دعم async كامل عبر جميع الجوالب وفئات الجلسات async المخصصة. ### الاستخراج التكيفي والتكامل مع الذكاء الاصطناعي - 🔄 **تتبع العناصر الذكي**: إعادة تحديد موقع العناصر بعد تغييرات الموقع باستخدام خوارزميات التشابه الذكية. - 🎯 **الاختيار المرن الذكي**: محددات CSS، محددات XPath، البحث القائم على الفلاتر، البحث النصي، البحث بالتعبيرات العادية والمزيد. - 🔍 **البحث عن عناصر مشابهة**: تحديد العناصر المشابهة للعناصر الموجودة تلقائياً. -- 🤖 **خادم MCP للاستخدام مع الذكاء الاصطناعي**: خادم MCP مدمج لاستخراج بيانات الويب بمساعدة الذكاء الاصطناعي واستخراج البيانات. يتميز خادم MCP بقدرات قوية مخصصة تستفيد من Scrapling لاستخراج المحتوى المستهدف قبل تمريره إلى الذكاء الاصطناعي (Claude/Cursor/إلخ)، وبالتالي تسريع العمليات وتقليل التكاليف عن طريق تقليل استخدام الرموز. ([فيديو توضيحي](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) +- 🤖 **خادم MCP للاستخدام مع الذكاء الاصطناعي**: خادم MCP مدمج لـ Web Scraping بمساعدة الذكاء الاصطناعي واستخراج البيانات. 
يتميز خادم MCP بقدرات قوية مخصصة تستفيد من Scrapling لاستخراج المحتوى المستهدف قبل تمريره إلى الذكاء الاصطناعي (Claude/Cursor/إلخ)، وبالتالي تسريع العمليات وتقليل التكاليف عن طريق تقليل استخدام الرموز. ([فيديو توضيحي](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) -### بنية عالية الأداء ومختبرة في المعارك -- 🚀 **سريع كالبرق**: أداء محسّن يتفوق على معظم مكتبات استخراج Python. +### بنية عالية الأداء ومختبرة ميدانياً +- 🚀 **سريع كالبرق**: أداء محسّن يتفوق على معظم مكتبات Web Scraping في Python. - 🔋 **فعال في استخدام الذاكرة**: هياكل بيانات محسّنة وتحميل كسول لأقل استخدام للذاكرة. - ⚡ **تسلسل JSON سريع**: أسرع 10 مرات من المكتبة القياسية. -- 🏗️ **مُختبر في المعارك**: لا يمتلك Scrapling فقط تغطية اختبار بنسبة 92٪ وتغطية كاملة لتلميحات الأنواع، ولكن تم استخدامه يومياً من قبل مئات مستخرجي الويب خلال العام الماضي. +- 🏗️ **مُختبر ميدانياً**: لا يمتلك Scrapling فقط تغطية اختبار بنسبة 92٪ وتغطية كاملة لتلميحات الأنواع، بل تم استخدامه يومياً من قبل مئات مستخرجي الويب خلال العام الماضي. ### تجربة صديقة للمطورين/مستخرجي الويب -- 🎯 **غلاف استخراج ويب تفاعلي**: غلاف IPython مدمج اختياري مع تكامل Scrapling، واختصارات، وأدوات جديدة لتسريع تطوير سكريبتات استخراج الويب، مثل تحويل طلبات curl إلى طلبات Scrapling وعرض نتائج الطلبات في متصفحك. +- 🎯 **Shell تفاعلي لـ Web Scraping**: Shell IPython مدمج اختياري مع تكامل Scrapling، واختصارات، وأدوات جديدة لتسريع تطوير سكريبتات Web Scraping، مثل تحويل طلبات curl إلى طلبات Scrapling وعرض نتائج الطلبات في متصفحك. - 🚀 **استخدمه مباشرة من الطرفية**: اختيارياً، يمكنك استخدام Scrapling لاستخراج عنوان URL دون كتابة سطر واحد من الكود! -- 🛠️ **واجهة برمجة تطبيقات التنقل الغنية**: اجتياز DOM متقدم مع طرق التنقل بين الوالدين والأشقاء والأطفال. -- 🧬 **معالجة نصوص محسّنة**: تعبيرات عادية مدمجة وطرق تنظيف وعمليات سلسلة محسّنة. -- 📝 **إنشاء محدد تلقائي**: إنشاء محددات CSS/XPath قوية لأي عنصر. -- 🔌 **واجهة برمجة تطبيقات مألوفة**: مشابه لـ Scrapy/BeautifulSoup مع نفس العناصر الزائفة المستخدمة في Scrapy/Parsel. 
-- 📘 **تغطية كاملة للأنواع**: تلميحات نوع كاملة لدعم IDE ممتاز وإكمال الكود. +- 🛠️ **واجهة تنقل غنية**: اجتياز DOM متقدم مع طرق التنقل بين العناصر الوالدية والشقيقة والفرعية. +- 🧬 **معالجة نصوص محسّنة**: تعبيرات عادية مدمجة وطرق تنظيف وعمليات نصية محسّنة. +- 📝 **إنشاء محددات تلقائي**: إنشاء محددات CSS/XPath قوية لأي عنصر. +- 🔌 **واجهة مألوفة**: مشابه لـ Scrapy/BeautifulSoup مع نفس العناصر الزائفة المستخدمة في Scrapy/Parsel. +- 📘 **تغطية كاملة للأنواع**: تلميحات نوع كاملة لدعم IDE ممتاز وإكمال الكود. يتم فحص قاعدة الكود بالكامل تلقائياً بواسطة **PyRight** و**MyPy** مع كل تغيير. - 🔋 **صورة Docker جاهزة**: مع كل إصدار، يتم بناء ودفع صورة Docker تحتوي على جميع المتصفحات تلقائياً. ## البدء +لنلقِ نظرة سريعة على ما يمكن لـ Scrapling فعله دون التعمق. + ### الاستخدام الأساسي +طلبات HTTP مع دعم الجلسات ```python -from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher -from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession +from scrapling.fetchers import Fetcher, FetcherSession -# طلبات HTTP مع دعم الجلسات with FetcherSession(impersonate='chrome') as session: # استخدم أحدث إصدار من بصمة TLS لـ Chrome page = session.get('https://quotes.toscrape.com/', stealthy_headers=True) - quotes = page.css('.quote .text::text') + quotes = page.css('.quote .text::text').getall() # أو استخدم طلبات لمرة واحدة page = Fetcher.get('https://quotes.toscrape.com/') -quotes = page.css('.quote .text::text') +quotes = page.css('.quote .text::text').getall() +``` +وضع التخفي المتقدم +```python +from scrapling.fetchers import StealthyFetcher, StealthySession -# وضع التخفي المتقدم (احتفظ بالمتصفح مفتوحاً حتى تنتهي) -with StealthySession(headless=True, solve_cloudflare=True) as session: +with StealthySession(headless=True, solve_cloudflare=True) as session: # أبقِ المتصفح مفتوحاً حتى تنتهي page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False) - data = page.css('#padded_content a') + data = page.css('#padded_content a').getall() # أو 
استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare') -data = page.css('#padded_content a') - -# أتمتة المتصفح الكاملة (احتفظ بالمتصفح مفتوحاً حتى تنتهي) -with DynamicSession(headless=True) as session: - page = session.fetch('https://quotes.toscrape.com/', network_idle=True) - quotes = page.css('.quote .text::text') - -# أو استخدم نمط الطلب لمرة واحدة -page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True) -quotes = page.css('.quote .text::text') +data = page.css('#padded_content a').getall() +``` +أتمتة المتصفح الكاملة +```python +from scrapling.fetchers import DynamicFetcher, DynamicSession + +with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # أبقِ المتصفح مفتوحاً حتى تنتهي + page = session.fetch('https://quotes.toscrape.com/', load_dom=False) + data = page.xpath('//span[@class="text"]/text()').getall() # محدد XPath إذا كنت تفضله + +# أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء +page = DynamicFetcher.fetch('https://quotes.toscrape.com/') +data = page.css('.quote .text::text').getall() ``` -### اختيار العناصر +### Spiders +ابنِ زواحف كاملة مع طلبات متزامنة وأنواع جلسات متعددة وإيقاف/استئناف: +```python +from scrapling.spiders import Spider, Request, Response + +class QuotesSpider(Spider): + name = "quotes" + start_urls = ["https://quotes.toscrape.com/"] + concurrent_requests = 10 + + async def parse(self, response: Response): + for quote in response.css('.quote'): + yield { + "text": quote.css('.text::text').get(), + "author": quote.css('.author::text').get(), + } + + next_page = response.css('.next a') + if next_page: + yield response.follow(next_page[0].attrib['href']) + +result = QuotesSpider().start() +print(f"Scraped {len(result.items)} quotes") +result.items.to_json("quotes.json") +``` +استخدم أنواع جلسات متعددة في Spider واحد: +```python +from scrapling.spiders import 
Spider, Request, Response +from scrapling.fetchers import FetcherSession, AsyncStealthySession + +class MultiSessionSpider(Spider): + name = "multi" + start_urls = ["https://example.com/"] + + def configure_sessions(self, manager): + manager.add("fast", FetcherSession(impersonate="chrome")) + manager.add("stealth", AsyncStealthySession(headless=True), lazy=True) + + async def parse(self, response: Response): + for link in response.css('a::attr(href)').getall(): + # وجّه الصفحات المحمية عبر جلسة التخفي + if "protected" in link: + yield Request(link, sid="stealth") + else: + yield Request(link, sid="fast", callback=self.parse) # callback صريح +``` +أوقف واستأنف عمليات الزحف الطويلة مع Checkpoints بتشغيل Spider هكذا: ```python -# محددات CSS -page.css('a::text') # استخراج النص -page.css('a::attr(href)') # استخراج السمات -page.css('a', recursive=False) # العناصر المباشرة فقط -page.css('a', auto_save=True) # حفظ مواضع العناصر تلقائياً - -# XPath -page.xpath('//a/text()') - -# بحث مرن -page.find_by_text('Python', first_match=True) # البحث بالنص -page.find_by_regex(r'\d{4}') # البحث بنمط التعبير العادي -page.find('div', {'class': 'container'}) # البحث بالسمات - -# التنقل -element.parent # الحصول على العنصر الوالد -element.next_sibling # الحصول على الشقيق التالي -element.children # الحصول على الأطفال - -# عناصر مشابهة -similar = page.get_similar(element) # البحث عن عناصر مشابهة - -# الاستخراج التكيفي -saved_elements = page.css('.product', auto_save=True) -# لاحقاً، عندما يتغير الموقع: -page.css('.product', adaptive=True) # البحث عن العناصر باستخدام المواضع المحفوظة +QuotesSpider(crawldir="./crawl_data").start() ``` +اضغط Ctrl+C للإيقاف بسلاسة — يتم حفظ التقدم تلقائياً. لاحقاً، عند تشغيل Spider مرة أخرى، مرر نفس `crawldir`، وسيستأنف من حيث توقف. 
-### استخدام الجلسة +### التحليل المتقدم والتنقل +```python +from scrapling.fetchers import Fetcher + +# اختيار عناصر غني وتنقل +page = Fetcher.get('https://quotes.toscrape.com/') + +# احصل على الاقتباسات بطرق اختيار متعددة +quotes = page.css('.quote') # محدد CSS +quotes = page.xpath('//div[@class="quote"]') # XPath +quotes = page.find_all('div', {'class': 'quote'}) # بأسلوب BeautifulSoup +# نفس الشيء مثل +quotes = page.find_all('div', class_='quote') +quotes = page.find_all(['div'], class_='quote') +quotes = page.find_all(class_='quote') # وهكذا... +# البحث عن عنصر بمحتوى النص +quotes = page.find_by_text('quote', tag='div') + +# التنقل المتقدم +quote_text = page.css('.quote')[0].css('.text::text').get() +quote_text = page.css('.quote').css('.text::text').getall() # محددات متسلسلة +first_quote = page.css('.quote')[0] +author = first_quote.next_sibling.css('.author::text') +parent_container = first_quote.parent + +# علاقات العناصر والتشابه +similar_elements = first_quote.find_similar() +below_elements = first_quote.below_elements() +``` +يمكنك استخدام المحلل مباشرة إذا كنت لا تريد جلب المواقع كما يلي: ```python -from scrapling.fetchers import FetcherSession, AsyncFetcherSession - -# جلسة متزامنة -with FetcherSession() as session: - # يتم الاحتفاظ بملفات تعريف الارتباط تلقائياً - page1 = session.get('https://quotes.toscrape.com/login') - page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'}) - - # تبديل بصمة المتصفح إذا لزم الأمر +from scrapling.parser import Selector + +page = Selector("...") +``` +وهو يعمل بنفس الطريقة تماماً! 
+ +### أمثلة إدارة الجلسات بشكل Async +```python +import asyncio +from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession + +async with FetcherSession(http3=True) as session: # `FetcherSession` واعٍ بالسياق ويعمل في كلا النمطين المتزامن/async + page1 = session.get('https://quotes.toscrape.com/') page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135') # استخدام جلسة async async with AsyncStealthySession(max_pages=2) as session: tasks = [] urls = ['https://example.com/page1', 'https://example.com/page2'] - + for url in urls: task = session.fetch(url) tasks.append(task) - + print(session.get_pool_stats()) # اختياري - حالة مجموعة علامات تبويب المتصفح (مشغول/حر/خطأ) results = await asyncio.gather(*tasks) print(session.get_pool_stats()) ``` -## واجهة سطر الأوامر والغلاف التفاعلي +## واجهة سطر الأوامر والـ Shell التفاعلي -يتضمن Scrapling v0.3 واجهة سطر أوامر قوية: +يتضمن Scrapling واجهة سطر أوامر قوية: [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339) -تشغيل غلاف استخراج الويب التفاعلي +تشغيل Shell الـ Web Scraping التفاعلي ```bash scrapling shell ``` -استخراج الصفحات إلى ملف مباشرة دون برمجة (يستخرج المحتوى داخل وسم `body` افتراضياً). إذا انتهى ملف الإخراج بـ `.txt`، فسيتم استخراج محتوى النص للهدف. إذا انتهى بـ `.md`، فسيكون تمثيل Markdown لمحتوى HTML؛ إذا انتهى بـ `.html`، فسيكون محتوى HTML نفسه. +استخرج الصفحات إلى ملف مباشرة دون برمجة (يستخرج المحتوى داخل وسم `body` افتراضياً). إذا انتهى ملف الإخراج بـ `.txt`، فسيتم استخراج محتوى النص للهدف. إذا انتهى بـ `.md`، فسيكون تمثيل Markdown لمحتوى HTML؛ إذا انتهى بـ `.html`، فسيكون محتوى HTML نفسه. ```bash scrapling extract get 'https://example.com' content.md scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # جميع العناصر المطابقة لمحدد CSS '#fromSkipToProducts' @@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas. 
``` > [!NOTE] -> هناك العديد من الميزات الإضافية، لكننا نريد إبقاء هذه الصفحة موجزة، مثل خادم MCP وغلاف استخراج الويب التفاعلي. تحقق من الوثائق الكاملة [هنا](https://scrapling.readthedocs.io/en/latest/) +> هناك العديد من الميزات الإضافية، لكننا نريد إبقاء هذه الصفحة موجزة، بما في ذلك خادم MCP والـ Shell التفاعلي لـ Web Scraping. تحقق من الوثائق الكاملة [هنا](https://scrapling.readthedocs.io/en/latest/) ## معايير الأداء -Scrapling ليس قوياً فقط - إنه أيضاً سريع بشكل مذهل، والتحديثات منذ الإصدار 0.3 قدمت تحسينات أداء استثنائية عبر جميع العمليات. تقارن المعايير التالية محلل Scrapling مع المكتبات الشائعة الأخرى. +Scrapling ليس قوياً فحسب — بل هو أيضاً سريع بشكل مذهل. تقارن المعايير التالية محلل Scrapling مع أحدث إصدارات المكتبات الشائعة الأخرى. ### اختبار سرعة استخراج النص (5000 عنصر متداخل) -| # | المكتبة | الوقت (ms) | vs Scrapling | +| # | المكتبة | الوقت (ms) | vs Scrapling | |---|:-----------------:|:----------:|:------------:| -| 1 | Scrapling | 1.99 | 1.0x | -| 2 | Parsel/Scrapy | 2.01 | 1.01x | -| 3 | Raw Lxml | 2.5 | 1.256x | -| 4 | PyQuery | 22.93 | ~11.5x | -| 5 | Selectolax | 80.57 | ~40.5x | -| 6 | BS4 with Lxml | 1541.37 | ~774.6x | -| 7 | MechanicalSoup | 1547.35 | ~777.6x | -| 8 | BS4 with html5lib | 3410.58 | ~1713.9x | +| 1 | Scrapling | 2.02 | 1.0x | +| 2 | Parsel/Scrapy | 2.04 | 1.01 | +| 3 | Raw Lxml | 2.54 | 1.257 | +| 4 | PyQuery | 24.17 | ~12x | +| 5 | Selectolax | 82.63 | ~41x | +| 6 | MechanicalSoup | 1549.71 | ~767.1x | +| 7 | BS4 with Lxml | 1584.31 | ~784.3x | +| 8 | BS4 with html5lib | 3391.91 | ~1679.1x | ### أداء تشابه العناصر والبحث النصي @@ -253,39 +332,39 @@ Scrapling ليس قوياً فقط - إنه أيضاً سريع بشكل مذه | المكتبة | الوقت (ms) | vs Scrapling | |-------------|:----------:|:------------:| -| Scrapling | 2.46 | 1.0x | -| AutoScraper | 13.3 | 5.407x | +| Scrapling | 2.39 | 1.0x | +| AutoScraper | 12.45 | 5.209x | > تمثل جميع المعايير متوسطات أكثر من 100 تشغيل. 
انظر [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) للمنهجية. ## التثبيت -يتطلب Scrapling Python 3.10 أو أعلى: +يتطلب Scrapling إصدار Python 3.10 أو أعلى: ```bash pip install scrapling ``` -بدءاً من v0.3.2، يتضمن هذا التثبيت فقط محرك المحلل وتبعياته، بدون أي جوالب أو تبعيات سطر أوامر. +يتضمن هذا التثبيت فقط محرك المحلل وتبعياته، بدون أي جوالب أو تبعيات سطر الأوامر. ### التبعيات الاختيارية 1. إذا كنت ستستخدم أياً من الميزات الإضافية أدناه، أو الجوالب، أو فئاتها، فستحتاج إلى تثبيت تبعيات الجوالب وتبعيات المتصفح الخاصة بها على النحو التالي: ```bash pip install "scrapling[fetchers]" - + scrapling install ``` - يقوم هذا بتنزيل جميع المتصفحات، إلى جانب تبعيات النظام وتبعيات معالجة البصمات الخاصة بها. + يقوم هذا بتنزيل جميع المتصفحات، إلى جانب تبعيات النظام وتبعيات معالجة fingerprint الخاصة بها. 2. ميزات إضافية: - تثبيت ميزة خادم MCP: ```bash pip install "scrapling[ai]" ``` - - تثبيت ميزات الغلاف (غلاف استخراج الويب وأمر `extract`): + - تثبيت ميزات Shell (Shell الـ Web Scraping وأمر `extract`): ```bash pip install "scrapling[shell]" ``` @@ -322,14 +401,7 @@ docker pull ghcr.io/d4vinci/scrapling:latest ## الشكر والتقدير يتضمن هذا المشروع كوداً معدلاً من: -- Parsel (ترخيص BSD) - يستخدم للوحدة الفرعية [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) - -## الشكر والمراجع - -- العمل الرائع لـ [Daijro](https://github.com/daijro) على [BrowserForge](https://github.com/daijro/browserforge) و[Camoufox](https://github.com/daijro/camoufox) -- العمل الرائع لـ [Vinyzu](https://github.com/Vinyzu) على [Botright](https://github.com/Vinyzu/Botright) و[PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright) -- [brotector](https://github.com/kaliiiiiiiiii/brotector) لتقنيات تجاوز اكتشاف المتصفح -- [fakebrowser](https://github.com/kkoooqq/fakebrowser) و[BotBrowser](https://github.com/botswin/BotBrowser) لأبحاث البصمات +- Parsel (ترخيص BSD) — يُستخدم للوحدة الفرعية 
[translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) --- -
مصمم ومصنوع بـ ❤️ بواسطة كريم شعير.

\ No newline at end of file +
مصمم ومصنوع بـ ❤️ بواسطة كريم شعير.

diff --git a/docs/README_CN.md b/docs/README_CN.md index ac8321804b6c8c8bf3f3ba96419560d46515a074..b33c422663cea0f3621c4b3caca1795f7d171c0d 100644 --- a/docs/README_CN.md +++ b/docs/README_CN.md @@ -1,9 +1,14 @@ -

-
- main poster -
- 简单、轻松的网页抓取,本该如此! -

+

+ + + + Scrapling Poster + + +
+ Effortless Web Scraping for the Modern Web +

+

Tests @@ -24,46 +29,47 @@

- - 选择方法 - - · - - 选择获取器 - - · - - 命令行界面 - - · - - MCP模式 - - · - - 从Beautifulsoup迁移 - + 选择方法 + · + 选择Fetcher + · + CLI + · + MCP模式 + · + 从Beautifulsoup迁移

-**停止与反机器人系统斗争。停止在每次网站更新后重写选择器。** +Scrapling是一个自适应Web Scraping框架,能处理从单个请求到大规模爬取的一切需求。 -Scrapling不仅仅是另一个网页抓取库。它是第一个**自适应**抓取库,能够从网站变化中学习并与之共同进化。当其他库在网站更新结构时失效,Scrapling会自动重新定位您的元素并保持抓取器运行。 +它的解析器能够从网站变化中学习,并在页面更新时自动重新定位您的元素。它的Fetcher能够开箱即用地绕过Cloudflare Turnstile等反机器人系统。它的Spider框架让您可以扩展到并发、多Session爬取,支持暂停/恢复和自动Proxy轮换——只需几行Python代码。一个库,零妥协。 -为现代网络而构建,Scrapling具有**自己的快速解析引擎**和获取器来处理您面临或将要面临的所有网页抓取挑战。由网页抓取者为网页抓取者和普通用户构建,适合每个人。 +极速爬取,实时统计和Streaming。由Web Scraper为Web Scraper和普通用户而构建,每个人都能找到适合自己的功能。 ```python ->> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher ->> StealthyFetcher.adaptive = True -# 隐秘地获取网站源代码! ->> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) ->> print(page.status) -200 ->> products = page.css('.product', auto_save=True) # 抓取在网站设计变更后仍能存活的数据! ->> # 之后,如果网站结构改变,传递 `adaptive=True` ->> products = page.css('.product', adaptive=True) # Scrapling仍然能找到它们! +from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher +StealthyFetcher.adaptive = True +page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # 隐秘地获取网站! +products = page.css('.product', auto_save=True) # 抓取在网站设计变更后仍能存活的数据! +products = page.css('.product', adaptive=True) # 之后,如果网站结构改变,传递 `adaptive=True` 来找到它们! 
``` +或扩展为完整爬取 +```python +from scrapling.spiders import Spider, Response -# 赞助商 +class MySpider(Spider): + name = "demo" + start_urls = ["https://example.com/"] + + async def parse(self, response: Response): + for item in response.css('.product'): + yield {"title": item.css('h2::text').get()} + +MySpider().start() +``` + + +# 赞助商 @@ -87,122 +93,195 @@ Scrapling不仅仅是另一个网页抓取库。它是第一个**自适应**抓 ## 主要特性 -### 支持会话的高级网站获取 -- **HTTP请求**:使用`Fetcher`类进行快速和隐秘的HTTP请求。可以模拟浏览器的TLS指纹、标头并使用HTTP3。 +### Spider — 完整的爬取框架 +- 🕷️ **类Scrapy的Spider API**:使用`start_urls`、async `parse` callback和`Request`/`Response`对象定义Spider。 +- ⚡ **并发爬取**:可配置的并发限制、按域名节流和下载延迟。 +- 🔄 **多Session支持**:统一接口,支持HTTP请求和隐秘无头浏览器在同一个Spider中使用——通过ID将请求路由到不同的Session。 +- 💾 **暂停与恢复**:基于Checkpoint的爬取持久化。按Ctrl+C优雅关闭;重启后从上次停止的地方继续。 +- 📡 **Streaming模式**:通过`async for item in spider.stream()`以实时统计Streaming抓取的数据——非常适合UI、管道和长时间运行的爬取。 +- 🛡️ **被阻止请求检测**:自动检测并重试被阻止的请求,支持自定义逻辑。 +- 📦 **内置导出**:通过钩子和您自己的管道导出结果,或使用内置的JSON/JSONL,分别通过`result.items.to_json()`/`result.items.to_jsonl()`。 + +### 支持Session的高级网站获取 +- **HTTP请求**:使用`Fetcher`类进行快速和隐秘的HTTP请求。可以模拟浏览器的TLS fingerprint、标头并使用HTTP/3。 - **动态加载**:通过`DynamicFetcher`类使用完整的浏览器自动化获取动态网站,支持Playwright的Chromium和Google Chrome。 -- **反机器人绕过**:使用`StealthyFetcher`的高级隐秘功能和指纹伪装。可以轻松自动绕过所有类型的Cloudflare的Turnstile/Interstitial。 -- **会话管理**:使用`FetcherSession`、`StealthySession`和`DynamicSession`类持久化会话支持,用于跨请求的cookie和状态管理。 -- **异步支持**:所有获取器和专用异步会话类的完整异步支持。 +- **反机器人绕过**:使用`StealthyFetcher`的高级隐秘功能和fingerprint伪装。可以轻松自动绕过所有类型的Cloudflare Turnstile/Interstitial。 +- **Session管理**:使用`FetcherSession`、`StealthySession`和`DynamicSession`类实现持久化Session支持,用于跨请求的cookie和状态管理。 +- **Proxy轮换**:内置`ProxyRotator`,支持轮询或自定义策略,适用于所有Session类型,并支持按请求覆盖Proxy。 +- **域名屏蔽**:在基于浏览器的Fetcher中屏蔽对特定域名(及其子域名)的请求。 +- **Async支持**:所有Fetcher和专用async Session类的完整async支持。 ### 自适应抓取和AI集成 - 🔄 **智能元素跟踪**:使用智能相似性算法在网站更改后重新定位元素。 - 🎯 **智能灵活选择**:CSS选择器、XPath选择器、基于过滤器的搜索、文本搜索、正则表达式搜索等。 -- 🔍 **查找相似元素**:自动定位与找到的元素相似的元素。 -- 🤖 
**与AI一起使用的MCP服务器**:内置MCP服务器用于AI辅助网页抓取和数据提取。MCP服务器具有强大的自定义功能,利用Scrapling在将内容传递给AI(Claude/Cursor等)之前提取目标内容,从而加快操作并通过最小化令牌使用来降低成本。([演示视频](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) +- 🔍 **查找相似元素**:自动定位与已找到元素相似的元素。 +- 🤖 **与AI一起使用的MCP服务器**:内置MCP服务器用于AI辅助Web Scraping和数据提取。MCP服务器具有强大的自定义功能,利用Scrapling在将内容传递给AI(Claude/Cursor等)之前提取目标内容,从而加快操作并通过最小化token使用来降低成本。([演示视频](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) ### 高性能和经过实战测试的架构 - 🚀 **闪电般快速**:优化性能超越大多数Python抓取库。 - 🔋 **内存高效**:优化的数据结构和延迟加载,最小内存占用。 - ⚡ **快速JSON序列化**:比标准库快10倍。 -- 🏗️ **经过实战测试**:Scrapling不仅拥有92%的测试覆盖率和完整的类型提示覆盖率,而且在过去一年中每天被数百名网页抓取者使用。 +- 🏗️ **经过实战测试**:Scrapling不仅拥有92%的测试覆盖率和完整的类型提示覆盖率,而且在过去一年中每天被数百名Web Scraper使用。 -### 对开发者/网页抓取者友好的体验 -- 🎯 **交互式网页抓取Shell**:可选的内置IPython shell,具有Scrapling集成、快捷方式和新工具,可加快网页抓取脚本开发,例如将curl请求转换为Scrapling请求并在浏览器中查看请求结果。 +### 对开发者/Web Scraper友好的体验 +- 🎯 **交互式Web Scraping Shell**:可选的内置IPython Shell,具有Scrapling集成、快捷方式和新工具,可加快Web Scraping脚本开发,例如将curl请求转换为Scrapling请求并在浏览器中查看请求结果。 - 🚀 **直接从终端使用**:可选地,您可以使用Scrapling抓取URL而无需编写任何代码! 
- 🛠️ **丰富的导航API**:使用父级、兄弟级和子级导航方法进行高级DOM遍历。 - 🧬 **增强的文本处理**:内置正则表达式、清理方法和优化的字符串操作。 - 📝 **自动选择器生成**:为任何元素生成强大的CSS/XPath选择器。 - 🔌 **熟悉的API**:类似于Scrapy/BeautifulSoup,使用与Scrapy/Parsel相同的伪元素。 -- 📘 **完整的类型覆盖**:完整的类型提示,出色的IDE支持和代码补全。 +- 📘 **完整的类型覆盖**:完整的类型提示,出色的IDE支持和代码补全。整个代码库在每次更改时都会自动使用**PyRight**和**MyPy**扫描。 - 🔋 **现成的Docker镜像**:每次发布时,包含所有浏览器的Docker镜像会自动构建和推送。 ## 入门 +让我们快速展示Scrapling的功能,无需深入了解。 + ### 基本用法 +支持Session的HTTP请求 ```python -from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher -from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession +from scrapling.fetchers import Fetcher, FetcherSession -# 支持会话的HTTP请求 -with FetcherSession(impersonate='chrome') as session: # 使用Chrome的最新版本TLS指纹 +with FetcherSession(impersonate='chrome') as session: # 使用Chrome的最新版本TLS fingerprint page = session.get('https://quotes.toscrape.com/', stealthy_headers=True) - quotes = page.css('.quote .text::text') + quotes = page.css('.quote .text::text').getall() # 或使用一次性请求 page = Fetcher.get('https://quotes.toscrape.com/') -quotes = page.css('.quote .text::text') +quotes = page.css('.quote .text::text').getall() +``` +高级隐秘模式 +```python +from scrapling.fetchers import StealthyFetcher, StealthySession -# 高级隐秘模式(保持浏览器打开直到完成) -with StealthySession(headless=True, solve_cloudflare=True) as session: +with StealthySession(headless=True, solve_cloudflare=True) as session: # 保持浏览器打开直到完成 page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False) - data = page.css('#padded_content a') + data = page.css('#padded_content a').getall() # 或使用一次性请求样式,为此请求打开浏览器,完成后关闭 page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare') -data = page.css('#padded_content a') - -# 完整的浏览器自动化(保持浏览器打开直到完成) -with DynamicSession(headless=True) as session: - page = session.fetch('https://quotes.toscrape.com/', network_idle=True) - quotes = page.css('.quote .text::text') - -# 或使用一次性请求样式 -page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True) 
-quotes = page.css('.quote .text::text') +data = page.css('#padded_content a').getall() ``` +完整的浏览器自动化 +```python +from scrapling.fetchers import DynamicFetcher, DynamicSession + +with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # 保持浏览器打开直到完成 + page = session.fetch('https://quotes.toscrape.com/', load_dom=False) + data = page.xpath('//span[@class="text"]/text()').getall() # 如果您偏好XPath选择器 -### 元素选择 +# 或使用一次性请求样式,为此请求打开浏览器,完成后关闭 +page = DynamicFetcher.fetch('https://quotes.toscrape.com/') +data = page.css('.quote .text::text').getall() +``` + +### Spider +构建具有并发请求、多种Session类型和暂停/恢复功能的完整爬虫: +```python +from scrapling.spiders import Spider, Request, Response + +class QuotesSpider(Spider): + name = "quotes" + start_urls = ["https://quotes.toscrape.com/"] + concurrent_requests = 10 + + async def parse(self, response: Response): + for quote in response.css('.quote'): + yield { + "text": quote.css('.text::text').get(), + "author": quote.css('.author::text').get(), + } + + next_page = response.css('.next a') + if next_page: + yield response.follow(next_page[0].attrib['href']) + +result = QuotesSpider().start() +print(f"抓取了 {len(result.items)} 条引用") +result.items.to_json("quotes.json") +``` +在单个Spider中使用多种Session类型: +```python +from scrapling.spiders import Spider, Request, Response +from scrapling.fetchers import FetcherSession, AsyncStealthySession + +class MultiSessionSpider(Spider): + name = "multi" + start_urls = ["https://example.com/"] + + def configure_sessions(self, manager): + manager.add("fast", FetcherSession(impersonate="chrome")) + manager.add("stealth", AsyncStealthySession(headless=True), lazy=True) + + async def parse(self, response: Response): + for link in response.css('a::attr(href)').getall(): + # 将受保护的页面路由到隐秘Session + if "protected" in link: + yield Request(link, sid="stealth") + else: + yield Request(link, sid="fast", callback=self.parse) # 显式callback +``` +通过如下方式运行Spider来暂停和恢复长时间爬取,使用Checkpoint: +```python 
+QuotesSpider(crawldir="./crawl_data").start() +``` +按Ctrl+C优雅暂停——进度会自动保存。之后,当您再次启动Spider时,传递相同的`crawldir`,它将从上次停止的地方继续。 + +### 高级解析与导航 +```python +from scrapling.fetchers import Fetcher + +# 丰富的元素选择和导航 +page = Fetcher.get('https://quotes.toscrape.com/') + +# 使用多种选择方法获取引用 +quotes = page.css('.quote') # CSS选择器 +quotes = page.xpath('//div[@class="quote"]') # XPath +quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup风格 +# 等同于 +quotes = page.find_all('div', class_='quote') +quotes = page.find_all(['div'], class_='quote') +quotes = page.find_all(class_='quote') # 等等... +# 按文本内容查找元素 +quotes = page.find_by_text('quote', tag='div') + +# 高级导航 +quote_text = page.css('.quote')[0].css('.text::text').get() +quote_text = page.css('.quote').css('.text::text').getall() # 链式选择器 +first_quote = page.css('.quote')[0] +author = first_quote.next_sibling.css('.author::text') +parent_container = first_quote.parent + +# 元素关系和相似性 +similar_elements = first_quote.find_similar() +below_elements = first_quote.below_elements() +``` +如果您不想获取网站,可以直接使用解析器,如下所示: ```python -# CSS选择器 -page.css('a::text') # 提取文本 -page.css('a::attr(href)') # 提取属性 -page.css('a', recursive=False) # 仅直接元素 -page.css('a', auto_save=True) # 自动保存元素位置 - -# XPath -page.xpath('//a/text()') - -# 灵活搜索 -page.find_by_text('Python', first_match=True) # 按文本查找 -page.find_by_regex(r'\d{4}') # 按正则表达式模式查找 -page.find('div', {'class': 'container'}) # 按属性查找 - -# 导航 -element.parent # 获取父元素 -element.next_sibling # 获取下一个兄弟元素 -element.children # 获取子元素 - -# 相似元素 -similar = page.get_similar(element) # 查找相似元素 - -# 自适应抓取 -saved_elements = page.css('.product', auto_save=True) -# 之后,当网站更改时: -page.css('.product', adaptive=True) # 使用保存的位置查找元素 +from scrapling.parser import Selector + +page = Selector("...") ``` +用法完全相同! 
-### 会话使用 +### Async Session管理示例 ```python -from scrapling.fetchers import FetcherSession, AsyncFetcherSession - -# 同步会话 -with FetcherSession() as session: - # Cookie自动保持 - page1 = session.get('https://quotes.toscrape.com/login') - page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'}) - - # 如需要,切换浏览器指纹 +import asyncio +from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession + +async with FetcherSession(http3=True) as session: # `FetcherSession`是上下文感知的,可以在sync/async模式下工作 + page1 = session.get('https://quotes.toscrape.com/') page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135') -# 异步会话使用 +# Async Session用法 async with AsyncStealthySession(max_pages=2) as session: tasks = [] urls = ['https://example.com/page1', 'https://example.com/page2'] - + for url in urls: task = session.fetch(url) tasks.append(task) - + print(session.get_pool_stats()) # 可选 - 浏览器标签池的状态(忙/空闲/错误) results = await asyncio.gather(*tasks) print(session.get_pool_stats()) @@ -210,11 +289,11 @@ async with AsyncStealthySession(max_pages=2) as session: ## CLI和交互式Shell -Scrapling v0.3包含强大的命令行界面: +Scrapling包含强大的命令行界面: [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339) -启动交互式网页抓取shell +启动交互式Web Scraping Shell ```bash scrapling shell ``` @@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas. 
``` > [!NOTE] -> 还有许多其他功能,但我们希望保持此页面简洁,例如MCP服务器和交互式网页抓取Shell。查看完整文档[这里](https://scrapling.readthedocs.io/en/latest/) +> 还有许多其他功能,但我们希望保持此页面简洁,包括MCP服务器和交互式Web Scraping Shell。查看完整文档[这里](https://scrapling.readthedocs.io/en/latest/) ## 性能基准 -Scrapling不仅功能强大——它还速度极快,自0.3版本以来的更新在所有操作中都提供了卓越的性能改进。以下基准测试将Scrapling的解析器与其他流行库进行了比较。 +Scrapling不仅功能强大——它还速度极快。以下基准测试将Scrapling的解析器与其他流行库的最新版本进行了比较。 ### 文本提取速度测试(5000个嵌套元素) -| # | 库 | 时间(ms) | vs Scrapling | -|---|:-----------------:|:-------:|:------------:| -| 1 | Scrapling | 1.99 | 1.0x | -| 2 | Parsel/Scrapy | 2.01 | 1.01x | -| 3 | Raw Lxml | 2.5 | 1.256x | -| 4 | PyQuery | 22.93 | ~11.5x | -| 5 | Selectolax | 80.57 | ~40.5x | -| 6 | BS4 with Lxml | 1541.37 | ~774.6x | -| 7 | MechanicalSoup | 1547.35 | ~777.6x | -| 8 | BS4 with html5lib | 3410.58 | ~1713.9x | +| # | 库 | 时间(ms) | vs Scrapling | +|---|:-----------------:|:---------:|:------------:| +| 1 | Scrapling | 2.02 | 1.0x | +| 2 | Parsel/Scrapy | 2.04 | 1.01 | +| 3 | Raw Lxml | 2.54 | 1.257 | +| 4 | PyQuery | 24.17 | ~12x | +| 5 | Selectolax | 82.63 | ~41x | +| 6 | MechanicalSoup | 1549.71 | ~767.1x | +| 7 | BS4 with Lxml | 1584.31 | ~784.3x | +| 8 | BS4 with html5lib | 3391.91 | ~1679.1x | ### 元素相似性和文本搜索性能 @@ -252,9 +331,9 @@ Scrapling不仅功能强大——它还速度极快,自0.3版本以来的更 Scrapling的自适应元素查找功能明显优于替代方案: | 库 | 时间(ms) | vs Scrapling | -|-------------|:------:|:------------:| -| Scrapling | 2.46 | 1.0x | -| AutoScraper | 13.3 | 5.407x | +|-------------|:---------:|:------------:| +| Scrapling | 2.39 | 1.0x | +| AutoScraper | 12.45 | 5.209x | > 所有基准测试代表100+次运行的平均值。请参阅[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)了解方法。 @@ -267,25 +346,25 @@ Scrapling需要Python 3.10或更高版本: pip install scrapling ``` -从v0.3.2开始,此安装仅包括解析器引擎及其依赖项,没有任何获取器或命令行依赖项。 +此安装仅包括解析器引擎及其依赖项,没有任何Fetcher或命令行依赖项。 ### 可选依赖项 -1. 如果您要使用以下任何额外功能、获取器或它们的类,您将需要安装获取器的依赖项和它们的浏览器依赖项,如下所示: +1. 
如果您要使用以下任何额外功能、Fetcher或它们的类,您将需要安装Fetcher的依赖项和它们的浏览器依赖项,如下所示: ```bash pip install "scrapling[fetchers]" - + scrapling install ``` - 这会下载所有浏览器,以及它们的系统依赖项和指纹操作依赖项。 + 这会下载所有浏览器,以及它们的系统依赖项和fingerprint操作依赖项。 2. 额外功能: - 安装MCP服务器功能: ```bash pip install "scrapling[ai]" ``` - - 安装shell功能(网页抓取shell和`extract`命令): + - 安装Shell功能(Web Scraping Shell和`extract`命令): ```bash pip install "scrapling[shell]" ``` @@ -324,12 +403,5 @@ docker pull ghcr.io/d4vinci/scrapling:latest 此项目包含改编自以下内容的代码: - Parsel(BSD许可证)——用于[translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)子模块 -## 感谢和参考 - -- [Daijro](https://github.com/daijro)在[BrowserForge](https://github.com/daijro/browserforge)和[Camoufox](https://github.com/daijro/camoufox)上的出色工作 -- [Vinyzu](https://github.com/Vinyzu)在[Botright](https://github.com/Vinyzu/Botright)和[PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)上的出色工作 -- [brotector](https://github.com/kaliiiiiiiiii/brotector)提供的浏览器检测绕过技术 -- [fakebrowser](https://github.com/kkoooqq/fakebrowser)和[BotBrowser](https://github.com/botswin/BotBrowser)提供的指纹识别研究 - --- -
由Karim Shoair用❤️设计和制作。

\ No newline at end of file +
由Karim Shoair用❤️设计和制作。

diff --git a/docs/README_DE.md b/docs/README_DE.md index 3013d51cea87abe98ee07e141284b399f6abcb58..71244839b3bc9595fc863043f6be8e756f85335e 100644 --- a/docs/README_DE.md +++ b/docs/README_DE.md @@ -1,9 +1,14 @@ -

-
- main poster -
- Einfaches, müheloses Web Scraping, wie es sein sollte! -

+

+ + + + Scrapling Poster + + +
+ Effortless Web Scraping for the Modern Web +

+

Tests @@ -24,46 +29,47 @@

- - Auswahlmethoden - - · - - Fetcher wählen - - · - - CLI - - · - - MCP-Modus - - · - - Migration von Beautifulsoup - + Auswahlmethoden + · + Einen Fetcher wählen + · + CLI + · + MCP-Modus + · + Migration von Beautifulsoup

-**Hören Sie auf, gegen Anti-Bot-Systeme zu kämpfen. Hören Sie auf, Selektoren nach jedem Website-Update neu zu schreiben.** +Scrapling ist ein adaptives Web-Scraping-Framework, das alles abdeckt -- von einer einzelnen Anfrage bis hin zu einem umfassenden Crawl. -Scrapling ist nicht nur eine weitere Web-Scraping-Bibliothek. Es ist die erste **adaptive** Scraping-Bibliothek, die von Website-Änderungen lernt und sich mit ihnen weiterentwickelt. Während andere Bibliotheken brechen, wenn Websites ihre Struktur aktualisieren, lokalisiert Scrapling Ihre Elemente automatisch neu und hält Ihre Scraper am Laufen. +Sein Parser lernt aus Website-Änderungen und lokalisiert Ihre Elemente automatisch neu, wenn sich Seiten aktualisieren. Seine Fetcher umgehen Anti-Bot-Systeme wie Cloudflare Turnstile direkt ab Werk. Und sein Spider-Framework ermöglicht es Ihnen, auf parallele Multi-Session-Crawls mit Pause & Resume und automatischer Proxy-Rotation hochzuskalieren -- alles in wenigen Zeilen Python. Eine Bibliothek, keine Kompromisse. -Für das moderne Web entwickelt, bietet Scrapling **seine eigene schnelle Parsing-Engine** und Fetcher, um alle Web-Scraping-Herausforderungen zu bewältigen, denen Sie begegnen oder begegnen werden. Von Web Scrapern für Web Scraper und normale Benutzer entwickelt, ist für jeden etwas dabei. +Blitzschnelle Crawls mit Echtzeit-Statistiken und Streaming. Von Web Scrapern für Web Scraper und normale Benutzer entwickelt, ist für jeden etwas dabei. ```python ->> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher ->> StealthyFetcher.adaptive = True -# Holen Sie sich Website-Quellcode unter dem Radar! ->> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) ->> print(page.status) -200 ->> products = page.css('.product', auto_save=True) # Scrapen Sie Daten, die Website-Designänderungen überleben! 
->> # Später, wenn sich die Website-Struktur ändert, übergeben Sie `adaptive=True` ->> products = page.css('.product', adaptive=True) # und Scrapling findet sie trotzdem! +from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher +StealthyFetcher.adaptive = True +page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Website unbemerkt abrufen! +products = page.css('.product', auto_save=True) # Daten scrapen, die Website-Designänderungen überleben! +products = page.css('.product', adaptive=True) # Später, wenn sich die Website-Struktur ändert, `adaptive=True` übergeben, um sie zu finden! +``` +Oder auf vollständige Crawls hochskalieren +```python +from scrapling.spiders import Spider, Response + +class MySpider(Spider): + name = "demo" + start_urls = ["https://example.com/"] + + async def parse(self, response: Response): + for item in response.css('.product'): + yield {"title": item.css('h2::text').get()} + +MySpider().start() ``` -# Sponsoren + +# Sponsoren @@ -87,12 +93,23 @@ Für das moderne Web entwickelt, bietet Scrapling **seine eigene schnelle Parsin ## Hauptmerkmale -### Erweiterte Website-Abruf mit Sitzungsunterstützung -- **HTTP-Anfragen**: Schnelle und heimliche HTTP-Anfragen mit der `Fetcher`-Klasse. Kann Browser-TLS-Fingerabdrücke, Header imitieren und HTTP3 verwenden. -- **Dynamisches Laden**: Abrufen dynamischer Websites mit vollständiger Browser-Automatisierung über die `DynamicFetcher`-Klasse, die Playwrights Chromium und Google Chrome unterstützt. -- **Anti-Bot-Umgehung**: Erweiterte Stealth-Fähigkeiten mit `StealthyFetcher` und Fingerabdruck-Spoofing. Kann alle Arten von Cloudflares Turnstile/Interstitial einfach mit Automatisierung umgehen. -- **Sitzungsverwaltung**: Persistente Sitzungsunterstützung mit den Klassen `FetcherSession`, `StealthySession` und `DynamicSession` für Cookie- und Zustandsverwaltung über Anfragen hinweg. 
-- **Async-Unterstützung**: Vollständige Async-Unterstützung über alle Fetcher und dedizierte Async-Sitzungsklassen hinweg. +### Spiders -- Ein vollständiges Crawling-Framework +- 🕷️ **Scrapy-ähnliche Spider-API**: Definieren Sie Spiders mit `start_urls`, async `parse` Callbacks und `Request`/`Response`-Objekten. +- ⚡ **Paralleles Crawling**: Konfigurierbare Parallelitätslimits, domainbezogenes Throttling und Download-Verzögerungen. +- 🔄 **Multi-Session-Unterstützung**: Einheitliche Schnittstelle für HTTP-Anfragen und heimliche Headless-Browser in einem einzigen Spider -- leiten Sie Anfragen per ID an verschiedene Sessions weiter. +- 💾 **Pause & Resume**: Checkpoint-basierte Crawl-Persistenz. Drücken Sie Strg+C für ein kontrolliertes Herunterfahren; starten Sie neu, um dort fortzufahren, wo Sie aufgehört haben. +- 📡 **Streaming-Modus**: Gescrapte Elemente in Echtzeit streamen über `async for item in spider.stream()` mit Echtzeit-Statistiken -- ideal für UI, Pipelines und lang laufende Crawls. +- 🛡️ **Erkennung blockierter Anfragen**: Automatische Erkennung und Wiederholung blockierter Anfragen mit anpassbarer Logik. +- 📦 **Integrierter Export**: Ergebnisse über Hooks und Ihre eigene Pipeline oder den integrierten JSON/JSONL-Export mit `result.items.to_json()` / `result.items.to_jsonl()` exportieren. + +### Erweitertes Website-Abrufen mit Session-Unterstützung +- **HTTP-Anfragen**: Schnelle und heimliche HTTP-Anfragen mit der `Fetcher`-Klasse. Kann Browser-TLS-Fingerprints und Header imitieren und HTTP/3 verwenden. +- **Dynamisches Laden**: Dynamische Websites mit vollständiger Browser-Automatisierung über die `DynamicFetcher`-Klasse abrufen, die Playwrights Chromium und Google Chrome unterstützt. +- **Anti-Bot-Umgehung**: Erweiterte Stealth-Fähigkeiten mit `StealthyFetcher` und Fingerprint-Spoofing. Kann alle Arten von Cloudflares Turnstile/Interstitial einfach mit Automatisierung umgehen. 
+- **Session-Verwaltung**: Persistente Session-Unterstützung mit den Klassen `FetcherSession`, `StealthySession` und `DynamicSession` für Cookie- und Zustandsverwaltung über Anfragen hinweg. +- **Proxy-Rotation**: Integrierter `ProxyRotator` mit zyklischen oder benutzerdefinierten Rotationsstrategien über alle Session-Typen hinweg, plus Proxy-Überschreibungen pro Anfrage. +- **Domain-Blockierung**: Anfragen an bestimmte Domains (und deren Subdomains) in browserbasierten Fetchern blockieren. +- **Async-Unterstützung**: Vollständige async-Unterstützung über alle Fetcher und dedizierte async Session-Klassen hinweg. ### Adaptives Scraping & KI-Integration - 🔄 **Intelligente Element-Verfolgung**: Elemente nach Website-Änderungen mit intelligenten Ähnlichkeitsalgorithmen neu lokalisieren. @@ -106,103 +123,165 @@ Für das moderne Web entwickelt, bietet Scrapling **seine eigene schnelle Parsin - ⚡ **Schnelle JSON-Serialisierung**: 10x schneller als die Standardbibliothek. - 🏗️ **Praxiserprobt**: Scrapling hat nicht nur eine Testabdeckung von 92% und eine vollständige Type-Hints-Abdeckung, sondern wird seit dem letzten Jahr täglich von Hunderten von Web Scrapern verwendet. -### Entwickler/Web-Scraper-freundliche Erfahrung +### Entwickler-/Web-Scraper-freundliche Erfahrung - 🎯 **Interaktive Web-Scraping-Shell**: Optionale integrierte IPython-Shell mit Scrapling-Integration, Shortcuts und neuen Tools zur Beschleunigung der Web-Scraping-Skriptentwicklung, wie das Konvertieren von Curl-Anfragen in Scrapling-Anfragen und das Anzeigen von Anfrageergebnissen in Ihrem Browser. - 🚀 **Direkt vom Terminal aus verwenden**: Optional können Sie Scrapling verwenden, um eine URL zu scrapen, ohne eine einzige Codezeile zu schreiben! - 🛠️ **Umfangreiche Navigations-API**: Erweiterte DOM-Traversierung mit Eltern-, Geschwister- und Kind-Navigationsmethoden. - 🧬 **Verbesserte Textverarbeitung**: Integrierte Regex, Bereinigungsmethoden und optimierte String-Operationen. 
- 📝 **Automatische Selektorgenerierung**: Robuste CSS/XPath-Selektoren für jedes Element generieren. - 🔌 **Vertraute API**: Ähnlich wie Scrapy/BeautifulSoup mit denselben Pseudo-Elementen, die in Scrapy/Parsel verwendet werden. -- 📘 **Vollständige Typabdeckung**: Vollständige Type Hints für hervorragende IDE-Unterstützung und Code-Vervollständigung. +- 📘 **Vollständige Typabdeckung**: Vollständige Type Hints für hervorragende IDE-Unterstützung und Code-Vervollständigung. Die gesamte Codebasis wird bei jeder Änderung automatisch mit **PyRight** und **MyPy** gescannt. - 🔋 **Fertiges Docker-Image**: Mit jeder Veröffentlichung wird automatisch ein Docker-Image erstellt und gepusht, das alle Browser enthält. ## Erste Schritte +Hier ein kurzer Überblick über das, was Scrapling kann, ohne zu sehr ins Detail zu gehen. + ### Grundlegende Verwendung +HTTP-Anfragen mit Session-Unterstützung ```python -from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher -from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession +from scrapling.fetchers import Fetcher, FetcherSession -# HTTP-Anfragen mit Sitzungsunterstützung -with FetcherSession(impersonate='chrome') as session: # Verwenden Sie die neueste Version von Chromes TLS-Fingerabdruck +with FetcherSession(impersonate='chrome') as session: # Neueste Version von Chromes TLS-Fingerprint verwenden page = session.get('https://quotes.toscrape.com/', stealthy_headers=True) - quotes = page.css('.quote .text::text') + quotes = page.css('.quote .text::text').getall() -# Oder verwenden Sie einmalige Anfragen +# Oder einmalige Anfragen verwenden page = Fetcher.get('https://quotes.toscrape.com/') -quotes = page.css('.quote .text::text') +quotes = page.css('.quote .text::text').getall() +``` +Erweiterter Stealth-Modus +```python +from scrapling.fetchers import StealthyFetcher, StealthySession -# Erweiterter Stealth-Modus (Browser offen halten, bis Sie fertig sind) -with StealthySession(headless=True, 
solve_cloudflare=True) as session: +with StealthySession(headless=True, solve_cloudflare=True) as session: # Browser offen halten, bis Sie fertig sind page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False) - data = page.css('#padded_content a') + data = page.css('#padded_content a').getall() -# Oder verwenden Sie den einmaligen Anfragenstil, öffnet den Browser für diese Anfrage und schließt ihn dann nach Abschluss +# Oder einmaligen Anfragenstil verwenden: öffnet den Browser für diese Anfrage und schließt ihn nach Abschluss page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare') -data = page.css('#padded_content a') - -# Vollständige Browser-Automatisierung (Browser offen halten, bis Sie fertig sind) -with DynamicSession(headless=True) as session: - page = session.fetch('https://quotes.toscrape.com/', network_idle=True) - quotes = page.css('.quote .text::text') - -# Oder verwenden Sie den einmaligen Anfragenstil -page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True) -quotes = page.css('.quote .text::text') +data = page.css('#padded_content a').getall() +``` +Vollständige Browser-Automatisierung +```python +from scrapling.fetchers import DynamicFetcher, DynamicSession + +with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Browser offen halten, bis Sie fertig sind + page = session.fetch('https://quotes.toscrape.com/', load_dom=False) + data = page.xpath('//span[@class="text"]/text()').getall() # XPath-Selektor, falls bevorzugt + +# Oder einmaligen Anfragenstil verwenden: öffnet den Browser für diese Anfrage und schließt ihn nach Abschluss +page = DynamicFetcher.fetch('https://quotes.toscrape.com/') +data = page.css('.quote .text::text').getall() +``` + +### Spiders +Vollständige Crawler mit parallelen Anfragen, mehreren Session-Typen und Pause & Resume erstellen: +```python +from scrapling.spiders import Spider, Request, Response + +class QuotesSpider(Spider): + 
name = "quotes" + start_urls = ["https://quotes.toscrape.com/"] + concurrent_requests = 10 + + async def parse(self, response: Response): + for quote in response.css('.quote'): + yield { + "text": quote.css('.text::text').get(), + "author": quote.css('.author::text').get(), + } + + next_page = response.css('.next a') + if next_page: + yield response.follow(next_page[0].attrib['href']) + +result = QuotesSpider().start() +print(f"{len(result.items)} Zitate gescrapt") +result.items.to_json("quotes.json") +``` +Mehrere Session-Typen in einem einzigen Spider verwenden: +```python +from scrapling.spiders import Spider, Request, Response +from scrapling.fetchers import FetcherSession, AsyncStealthySession + +class MultiSessionSpider(Spider): + name = "multi" + start_urls = ["https://example.com/"] + + def configure_sessions(self, manager): + manager.add("fast", FetcherSession(impersonate="chrome")) + manager.add("stealth", AsyncStealthySession(headless=True), lazy=True) + + async def parse(self, response: Response): + for link in response.css('a::attr(href)').getall(): + # Geschützte Seiten über die Stealth-Session leiten + if "protected" in link: + yield Request(link, sid="stealth") + else: + yield Request(link, sid="fast", callback=self.parse) # Expliziter Callback +``` +Lange Crawls mit Checkpoints pausieren und fortsetzen, indem Sie den Spider so starten: +```python +QuotesSpider(crawldir="./crawl_data").start() ``` +Drücken Sie Strg+C, um kontrolliert zu pausieren -- der Fortschritt wird automatisch gespeichert. Wenn Sie den Spider später erneut starten, übergeben Sie dasselbe `crawldir`, und er setzt dort fort, wo er aufgehört hat. 
+ +### Erweitertes Parsing & Navigation +```python +from scrapling.fetchers import Fetcher -### Elementauswahl +# Umfangreiche Elementauswahl und Navigation +page = Fetcher.get('https://quotes.toscrape.com/') + +# Zitate mit verschiedenen Auswahlmethoden abrufen +quotes = page.css('.quote') # CSS-Selektor +quotes = page.xpath('//div[@class="quote"]') # XPath +quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoup-Stil +# Gleich wie +quotes = page.find_all('div', class_='quote') +quotes = page.find_all(['div'], class_='quote') +quotes = page.find_all(class_='quote') # und so weiter... +# Element nach Textinhalt finden +quotes = page.find_by_text('quote', tag='div') + +# Erweiterte Navigation +quote_text = page.css('.quote')[0].css('.text::text').get() +quote_text = page.css('.quote').css('.text::text').getall() # Verkettete Selektoren +first_quote = page.css('.quote')[0] +author = first_quote.next_sibling.css('.author::text') +parent_container = first_quote.parent + +# Elementbeziehungen und Ähnlichkeit +similar_elements = first_quote.find_similar() +below_elements = first_quote.below_elements() +``` +Sie können den Parser direkt verwenden, wenn Sie keine Websites abrufen möchten, wie unten gezeigt: ```python -# CSS-Selektoren -page.css('a::text') # Text extrahieren -page.css('a::attr(href)') # Attribute extrahieren -page.css('a', recursive=False) # Nur direkte Elemente -page.css('a', auto_save=True) # Elementpositionen automatisch speichern - -# XPath -page.xpath('//a/text()') - -# Flexible Suche -page.find_by_text('Python', first_match=True) # Nach Text suchen -page.find_by_regex(r'\d{4}') # Nach Regex-Muster suchen -page.find('div', {'class': 'container'}) # Nach Attributen suchen - -# Navigation -element.parent # Elternelement abrufen -element.next_sibling # Nächstes Geschwister abrufen -element.children # Kindelemente abrufen - -# Ähnliche Elemente -similar = page.get_similar(element) # Ähnliche Elemente finden - -# Adaptives Scraping -saved_elements 
= page.css('.product', auto_save=True) -# Später, wenn sich die Website ändert: -page.css('.product', adaptive=True) # Elemente mithilfe gespeicherter Positionen finden +from scrapling.parser import Selector + +page = Selector("...") ``` +Und es funktioniert genau auf die gleiche Weise! -### Sitzungsverwendung +### Beispiele für async Session-Verwaltung ```python -from scrapling.fetchers import FetcherSession, AsyncFetcherSession - -# Synchrone Sitzung -with FetcherSession() as session: - # Cookies werden automatisch beibehalten - page1 = session.get('https://quotes.toscrape.com/login') - page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'}) - - # Bei Bedarf Browser-Fingerabdruck wechseln +import asyncio +from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession + +async with FetcherSession(http3=True) as session: # `FetcherSession` ist kontextbewusst und kann sowohl in sync- als auch in async-Mustern arbeiten + page1 = session.get('https://quotes.toscrape.com/') page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135') -# Async-Sitzungsverwendung +# Async-Session-Verwendung async with AsyncStealthySession(max_pages=2) as session: tasks = [] urls = ['https://example.com/page1', 'https://example.com/page2'] - + for url in urls: task = session.fetch(url) tasks.append(task) - + print(session.get_pool_stats()) # Optional - Der Status des Browser-Tab-Pools (beschäftigt/frei/Fehler) results = await asyncio.gather(*tasks) print(session.get_pool_stats()) @@ -210,7 +289,7 @@ async with AsyncStealthySession(max_pages=2) as session: ## CLI & Interaktive Shell -Scrapling v0.3 enthält eine leistungsstarke Befehlszeilenschnittstelle: +Scrapling enthält eine leistungsstarke Befehlszeilenschnittstelle: [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339) @@ -218,7 +297,7 @@ Interaktive Web-Scraping-Shell starten ```bash scrapling shell ``` -Seiten 
direkt ohne Programmierung in eine Datei extrahieren (Extrahiert standardmäßig den Inhalt im `body`-Tag). Wenn die Ausgabedatei mit `.txt` endet, wird der Textinhalt des Ziels extrahiert. Wenn sie mit `.md` endet, ist es eine Markdown-Darstellung des HTML-Inhalts; wenn sie mit `.html` endet, ist es der HTML-Inhalt selbst. +Seiten direkt ohne Programmierung in eine Datei extrahieren (extrahiert standardmäßig den Inhalt im `body`-Tag). Wenn die Ausgabedatei mit `.txt` endet, wird der Textinhalt des Ziels extrahiert. Wenn sie mit `.md` endet, ist es eine Markdown-Darstellung des HTML-Inhalts; wenn sie mit `.html` endet, ist es der HTML-Inhalt selbst. ```bash scrapling extract get 'https://example.com' content.md scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Alle Elemente, die dem CSS-Selektor '#fromSkipToProducts' entsprechen @@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas. ``` > [!NOTE] -> Es gibt viele zusätzliche Funktionen, aber wir möchten diese Seite prägnant halten, wie den MCP-Server und die interaktive Web-Scraping-Shell. Schauen Sie sich die vollständige Dokumentation [hier](https://scrapling.readthedocs.io/en/latest/) an +> Es gibt viele zusätzliche Funktionen, aber wir möchten diese Seite prägnant halten, einschließlich des MCP-Servers und der interaktiven Web-Scraping-Shell. Schauen Sie sich die vollständige Dokumentation [hier](https://scrapling.readthedocs.io/en/latest/) an ## Leistungsbenchmarks -Scrapling ist nicht nur leistungsstark – es ist auch blitzschnell, und die Updates seit Version 0.3 haben außergewöhnliche Leistungsverbesserungen bei allen Operationen gebracht. Die folgenden Benchmarks vergleichen den Parser von Scrapling mit anderen beliebten Bibliotheken. +Scrapling ist nicht nur leistungsstark -- es ist auch blitzschnell. 
Die folgenden Benchmarks vergleichen Scraplings Parser mit den neuesten Versionen anderer beliebter Bibliotheken.

### Textextraktions-Geschwindigkeitstest (5000 verschachtelte Elemente)

-| # | Bibliothek | Zeit (ms) | vs Scrapling |
+| # | Bibliothek | Zeit (ms) | vs Scrapling |
|---|:-----------------:|:---------:|:------------:|
-| 1 | Scrapling | 1.99 | 1.0x |
-| 2 | Parsel/Scrapy | 2.01 | 1.01x |
-| 3 | Raw Lxml | 2.5 | 1.256x |
-| 4 | PyQuery | 22.93 | ~11.5x |
-| 5 | Selectolax | 80.57 | ~40.5x |
-| 6 | BS4 with Lxml | 1541.37 | ~774.6x |
-| 7 | MechanicalSoup | 1547.35 | ~777.6x |
-| 8 | BS4 with html5lib | 3410.58 | ~1713.9x |
+| 1 | Scrapling | 2.02 | 1.0x |
+| 2 | Parsel/Scrapy | 2.04 | 1.01x |
+| 3 | Raw Lxml | 2.54 | 1.257x |
+| 4 | PyQuery | 24.17 | ~12x |
+| 5 | Selectolax | 82.63 | ~41x |
+| 6 | MechanicalSoup | 1549.71 | ~767.1x |
+| 7 | BS4 with Lxml | 1584.31 | ~784.3x |
+| 8 | BS4 with html5lib | 3391.91 | ~1679.1x |

### Element-Ähnlichkeit & Textsuche-Leistung

@@ -253,8 +332,8 @@ Scraplings adaptive Element-Finding-Fähigkeiten übertreffen Alternativen deutlich:

| Bibliothek | Zeit (ms) | vs Scrapling |
|-------------|:---------:|:------------:|
-| Scrapling | 2.46 | 1.0x |
-| AutoScraper | 13.3 | 5.407x |
+| Scrapling | 2.39 | 1.0x |
+| AutoScraper | 12.45 | 5.209x |

> Alle Benchmarks stellen Durchschnittswerte von über 100 Durchläufen dar. Siehe [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) für die Methodik.

@@ -267,18 +346,18 @@ Scrapling erfordert Python 3.10 oder höher:

pip install scrapling
```

-Ab v0.3.2 enthält diese Installation nur die Parser-Engine und ihre Abhängigkeiten, ohne Fetcher oder Kommandozeilenabhängigkeiten.
+Diese Installation enthält nur die Parser-Engine und ihre Abhängigkeiten, ohne Fetcher oder Kommandozeilenabhängigkeiten.

### Optionale Abhängigkeiten

1.
Wenn Sie eine der folgenden zusätzlichen Funktionen, die Fetcher oder ihre Klassen verwenden möchten, müssen Sie die Abhängigkeiten der Fetcher und ihre Browser-Abhängigkeiten wie folgt installieren: ```bash pip install "scrapling[fetchers]" - + scrapling install ``` - Dies lädt alle Browser zusammen mit ihren Systemabhängigkeiten und Fingerabdruck-Manipulationsabhängigkeiten herunter. + Dies lädt alle Browser zusammen mit ihren Systemabhängigkeiten und Fingerprint-Manipulationsabhängigkeiten herunter. 2. Zusätzliche Funktionen: - MCP-Server-Funktion installieren: @@ -322,14 +401,7 @@ Diese Arbeit ist unter der BSD-3-Clause-Lizenz lizenziert. ## Danksagungen Dieses Projekt enthält angepassten Code von: -- Parsel (BSD-Lizenz) – Verwendet für [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)-Submodul - -## Dank und Referenzen - -- [Daijros](https://github.com/daijro) brillante Arbeit an [BrowserForge](https://github.com/daijro/browserforge) und [Camoufox](https://github.com/daijro/camoufox) -- [Vinyzus](https://github.com/Vinyzu) brillante Arbeit an [Botright](https://github.com/Vinyzu/Botright) und [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright) -- [brotector](https://github.com/kaliiiiiiiiii/brotector) für Browser-Erkennungs-Umgehungstechniken -- [fakebrowser](https://github.com/kkoooqq/fakebrowser) und [BotBrowser](https://github.com/botswin/BotBrowser) für Fingerprinting-Forschung +- Parsel (BSD-Lizenz) -- Verwendet für das [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)-Submodul --- -
Entworfen und hergestellt mit ❤️ von Karim Shoair.

\ No newline at end of file +
Entworfen und hergestellt mit ❤️ von Karim Shoair.

diff --git a/docs/README_ES.md b/docs/README_ES.md index a91360849f076a5480f24cd45729d183f4f4a51e..8957c4133782b8d441b0ce782f004ad40e5c85c4 100644 --- a/docs/README_ES.md +++ b/docs/README_ES.md @@ -1,9 +1,14 @@ -

-
- main poster -
- ¡Web Scraping fácil y sin esfuerzo como debería ser! -

+

+ + + + Scrapling Poster + + +
+ Effortless Web Scraping for the Modern Web +

+

Tests @@ -24,46 +29,47 @@

- - Métodos de selección - - · - - Elegir un fetcher - - · - - CLI - - · - - Modo MCP - - · - - Migrar desde Beautifulsoup - + Metodos de seleccion + · + Elegir un fetcher + · + CLI + · + Modo MCP + · + Migrar desde Beautifulsoup

-**Deja de luchar contra sistemas anti-bot. Deja de reescribir selectores después de cada actualización del sitio web.** +Scrapling es un framework de Web Scraping adaptativo que se encarga de todo, desde una sola solicitud hasta un rastreo a gran escala. -Scrapling no es solo otra biblioteca de Web Scraping. Es la primera biblioteca de scraping **adaptativa** que aprende de los cambios de los sitios web y evoluciona con ellos. Mientras que otras bibliotecas se rompen cuando los sitios web actualizan su estructura, Scrapling relocaliza automáticamente tus elementos y mantiene tus scrapers funcionando. +Su parser aprende de los cambios de los sitios web y relocaliza automáticamente tus elementos cuando las páginas se actualizan. Sus fetchers evaden sistemas anti-bot como Cloudflare Turnstile de forma nativa. Y su framework Spider te permite escalar a rastreos concurrentes con múltiples sesiones, con Pause & Resume y rotación automática de Proxy, todo en unas pocas líneas de Python. Una biblioteca, cero compromisos. -Construido para la Web moderna, Scrapling presenta **su propio motor de análisis rápido** y fetchers para manejar todos los desafíos de Web Scraping que enfrentas o enfrentarás. Construido por Web Scrapers para Web Scrapers y usuarios regulares, hay algo para todos. +Rastreos ultrarrápidos con estadísticas en tiempo real y Streaming. Construido por Web Scrapers para Web Scrapers y usuarios regulares, hay algo para todos. ```python ->> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher ->> StealthyFetcher.adaptive = True -# ¡Obtén el código fuente de sitios web bajo el radar! ->> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) ->> print(page.status) -200 ->> products = page.css('.product', auto_save=True) # ¡Extrae datos que sobreviven a cambios de diseño del sitio web! 
->> # Más tarde, si la estructura del sitio web cambia, pasa `adaptive=True` ->> products = page.css('.product', adaptive=True) # ¡y Scrapling aún los encuentra! +from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher +StealthyFetcher.adaptive = True +page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # ¡Obtén el sitio web bajo el radar! +products = page.css('.product', auto_save=True) # ¡Extrae datos que sobreviven a cambios de diseño del sitio web! +products = page.css('.product', adaptive=True) # Más tarde, si la estructura del sitio web cambia, ¡pasa `adaptive=True` para encontrarlos! ``` +O escala a rastreos completos +```python +from scrapling.spiders import Spider, Response -# Patrocinadores +class MySpider(Spider): + name = "demo" + start_urls = ["https://example.com/"] + + async def parse(self, response: Response): + for item in response.css('.product'): + yield {"title": item.css('h2::text').get()} + +MySpider().start() +``` + + +# Patrocinadores @@ -87,24 +93,35 @@ Construido para la Web moderna, Scrapling presenta **su propio motor de análisi ## Características Principales -### Obtención Avanzada de Sitios Web con Soporte de Sesión -- **Solicitudes HTTP**: Solicitudes HTTP rápidas y sigilosas con la clase `Fetcher`. Puede imitar la huella TLS de los navegadores, encabezados y usar HTTP3. +### Spiders — Un Framework Completo de Rastreo +- 🕷️ **API de Spider al estilo Scrapy**: Define spiders con `start_urls`, callbacks async `parse`, y objetos `Request`/`Response`. +- ⚡ **Rastreo Concurrente**: Límites de concurrencia configurables, limitación por dominio y retrasos de descarga. +- 🔄 **Soporte Multi-Session**: Interfaz unificada para solicitudes HTTP y navegadores headless sigilosos en un solo Spider — enruta solicitudes a diferentes sesiones por ID. +- 💾 **Pause & Resume**: Persistencia de rastreo basada en Checkpoint. 
Presiona Ctrl+C para un cierre ordenado; reinicia para continuar desde donde lo dejaste. +- 📡 **Modo Streaming**: Transmite elementos extraídos a medida que llegan con `async for item in spider.stream()` con estadísticas en tiempo real — ideal para UI, pipelines y rastreos de larga duración. +- 🛡️ **Detección de Solicitudes Bloqueadas**: Detección automática y reintento de solicitudes bloqueadas con lógica personalizable. +- 📦 **Exportación Integrada**: Exporta resultados a través de hooks y tu propio pipeline o el JSON/JSONL integrado con `result.items.to_json()` / `result.items.to_jsonl()` respectivamente. + +### Obtención Avanzada de Sitios Web con Soporte de Session +- **Solicitudes HTTP**: Solicitudes HTTP rápidas y sigilosas con la clase `Fetcher`. Puede imitar el fingerprint TLS de los navegadores, encabezados y usar HTTP/3. - **Carga Dinámica**: Obtén sitios web dinámicos con automatización completa del navegador a través de la clase `DynamicFetcher` compatible con Chromium de Playwright y Google Chrome. -- **Evasión Anti-bot**: Capacidades de sigilo avanzadas con `StealthyFetcher` y falsificación de huellas digitales. Puede evadir fácilmente todos los tipos de Turnstile/Interstitial de Cloudflare con automatización. -- **Gestión de Sesión**: Soporte de sesión persistente con las clases `FetcherSession`, `StealthySession` y `DynamicSession` para la gestión de cookies y estado entre solicitudes. +- **Evasión Anti-bot**: Capacidades de sigilo avanzadas con `StealthyFetcher` y falsificación de fingerprint. Puede evadir fácilmente todos los tipos de Turnstile/Interstitial de Cloudflare con automatización. +- **Gestión de Session**: Soporte de sesión persistente con las clases `FetcherSession`, `StealthySession` y `DynamicSession` para la gestión de cookies y estado entre solicitudes. 
+- **Rotación de Proxy**: `ProxyRotator` integrado con estrategias de rotación cíclica o personalizadas en todos los tipos de sesión, además de sobrescrituras de Proxy por solicitud. +- **Bloqueo de Dominios**: Bloquea solicitudes a dominios específicos (y sus subdominios) en fetchers basados en navegador. - **Soporte Async**: Soporte async completo en todos los fetchers y clases de sesión async dedicadas. ### Scraping Adaptativo e Integración con IA - 🔄 **Seguimiento Inteligente de Elementos**: Relocaliza elementos después de cambios en el sitio web usando algoritmos inteligentes de similitud. - 🎯 **Selección Flexible Inteligente**: Selectores CSS, selectores XPath, búsqueda basada en filtros, búsqueda de texto, búsqueda regex y más. - 🔍 **Encontrar Elementos Similares**: Localiza automáticamente elementos similares a los elementos encontrados. -- 🤖 **Servidor MCP para usar con IA**: Servidor MCP integrado para Web Scraping asistido por IA y extracción de datos. El servidor MCP presenta capacidades poderosas y personalizadas que aprovechan Scrapling para extraer contenido específico antes de pasarlo a la IA (Claude/Cursor/etc), acelerando así las operaciones y reduciendo costos al minimizar el uso de tokens. ([video demo](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) +- 🤖 **Servidor MCP para usar con IA**: Servidor MCP integrado para Web Scraping asistido por IA y extracción de datos. El servidor MCP presenta capacidades potentes y personalizadas que aprovechan Scrapling para extraer contenido específico antes de pasarlo a la IA (Claude/Cursor/etc), acelerando así las operaciones y reduciendo costos al minimizar el uso de tokens. ([video demo](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) ### Arquitectura de Alto Rendimiento y Probada en Batalla -- 🚀 **Ultrarrápido**: Rendimiento optimizado que supera a la mayoría de las bibliotecas de scraping de Python. 
+- 🚀 **Ultrarrápido**: Rendimiento optimizado que supera a la mayoría de las bibliotecas de Web Scraping de Python. - 🔋 **Eficiente en Memoria**: Estructuras de datos optimizadas y carga diferida para una huella de memoria mínima. - ⚡ **Serialización JSON Rápida**: 10 veces más rápido que la biblioteca estándar. -- 🏗️ **Probado en batalla**: Scrapling no solo tiene una cobertura de prueba del 92% y cobertura completa de type hints, sino que ha sido utilizado diariamente por cientos de Web Scrapers durante el último año. +- 🏗️ **Probado en batalla**: Scrapling no solo tiene una cobertura de pruebas del 92% y cobertura completa de type hints, sino que ha sido utilizado diariamente por cientos de Web Scrapers durante el último año. ### Experiencia Amigable para Desarrolladores/Web Scrapers - 🎯 **Shell Interactivo de Web Scraping**: Shell IPython integrado opcional con integración de Scrapling, atajos y nuevas herramientas para acelerar el desarrollo de scripts de Web Scraping, como convertir solicitudes curl a solicitudes Scrapling y ver resultados de solicitudes en tu navegador. @@ -113,96 +130,158 @@ Construido para la Web moderna, Scrapling presenta **su propio motor de análisi - 🧬 **Procesamiento de Texto Mejorado**: Métodos integrados de regex, limpieza y operaciones de cadena optimizadas. - 📝 **Generación Automática de Selectores**: Genera selectores CSS/XPath robustos para cualquier elemento. - 🔌 **API Familiar**: Similar a Scrapy/BeautifulSoup con los mismos pseudo-elementos usados en Scrapy/Parsel. -- 📘 **Cobertura Completa de Tipos**: Type hints completos para excelente soporte de IDE y autocompletado de código. +- 📘 **Cobertura Completa de Tipos**: Type hints completos para excelente soporte de IDE y autocompletado de código. Todo el código fuente se escanea automáticamente con **PyRight** y **MyPy** en cada cambio. - 🔋 **Imagen Docker Lista**: Con cada lanzamiento, se construye y publica automáticamente una imagen Docker que contiene todos los navegadores. 
-## Empezando +## Primeros Pasos + +Aquí tienes un vistazo rápido de lo que Scrapling puede hacer sin entrar en profundidad. ### Uso Básico +Solicitudes HTTP con soporte de sesión ```python -from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher -from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession +from scrapling.fetchers import Fetcher, FetcherSession -# Solicitudes HTTP con soporte de sesión -with FetcherSession(impersonate='chrome') as session: # Usa la última versión de la huella TLS de Chrome +with FetcherSession(impersonate='chrome') as session: # Usa la última versión del fingerprint TLS de Chrome page = session.get('https://quotes.toscrape.com/', stealthy_headers=True) - quotes = page.css('.quote .text::text') + quotes = page.css('.quote .text::text').getall() # O usa solicitudes de una sola vez page = Fetcher.get('https://quotes.toscrape.com/') -quotes = page.css('.quote .text::text') +quotes = page.css('.quote .text::text').getall() +``` +Modo sigiloso avanzado +```python +from scrapling.fetchers import StealthyFetcher, StealthySession -# Modo sigiloso avanzado (Mantén el navegador abierto hasta que termines) -with StealthySession(headless=True, solve_cloudflare=True) as session: +with StealthySession(headless=True, solve_cloudflare=True) as session: # Mantén el navegador abierto hasta que termines page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False) - data = page.css('#padded_content a') + data = page.css('#padded_content a').getall() # O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare') -data = page.css('#padded_content a') - -# Automatización completa del navegador (Mantén el navegador abierto hasta que termines) -with DynamicSession(headless=True) as session: - page = session.fetch('https://quotes.toscrape.com/', network_idle=True) - quotes = 
page.css('.quote .text::text') - -# O usa el estilo de solicitud de una sola vez -page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True) -quotes = page.css('.quote .text::text') +data = page.css('#padded_content a').getall() ``` +Automatización completa del navegador +```python +from scrapling.fetchers import DynamicFetcher, DynamicSession + +with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Mantén el navegador abierto hasta que termines + page = session.fetch('https://quotes.toscrape.com/', load_dom=False) + data = page.xpath('//span[@class="text"]/text()').getall() # Selector XPath si lo prefieres -### Selección de Elementos +# O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar +page = DynamicFetcher.fetch('https://quotes.toscrape.com/') +data = page.css('.quote .text::text').getall() +``` + +### Spiders +Construye rastreadores completos con solicitudes concurrentes, múltiples tipos de sesión y Pause & Resume: +```python +from scrapling.spiders import Spider, Request, Response + +class QuotesSpider(Spider): + name = "quotes" + start_urls = ["https://quotes.toscrape.com/"] + concurrent_requests = 10 + + async def parse(self, response: Response): + for quote in response.css('.quote'): + yield { + "text": quote.css('.text::text').get(), + "author": quote.css('.author::text').get(), + } + + next_page = response.css('.next a') + if next_page: + yield response.follow(next_page[0].attrib['href']) + +result = QuotesSpider().start() +print(f"Se extrajeron {len(result.items)} citas") +result.items.to_json("quotes.json") +``` +Usa múltiples tipos de sesión en un solo Spider: +```python +from scrapling.spiders import Spider, Request, Response +from scrapling.fetchers import FetcherSession, AsyncStealthySession + +class MultiSessionSpider(Spider): + name = "multi" + start_urls = ["https://example.com/"] + + def configure_sessions(self, 
manager): + manager.add("fast", FetcherSession(impersonate="chrome")) + manager.add("stealth", AsyncStealthySession(headless=True), lazy=True) + + async def parse(self, response: Response): + for link in response.css('a::attr(href)').getall(): + # Enruta las páginas protegidas a través de la sesión sigilosa + if "protected" in link: + yield Request(link, sid="stealth") + else: + yield Request(link, sid="fast", callback=self.parse) # callback explícito +``` +Pausa y reanuda rastreos largos con checkpoints ejecutando el Spider así: +```python +QuotesSpider(crawldir="./crawl_data").start() +``` +Presiona Ctrl+C para pausar de forma ordenada — el progreso se guarda automáticamente. Después, cuando inicies el Spider de nuevo, pasa el mismo `crawldir`, y continuará desde donde se detuvo. + +### Análisis Avanzado y Navegación +```python +from scrapling.fetchers import Fetcher + +# Selección rica de elementos y navegación +page = Fetcher.get('https://quotes.toscrape.com/') + +# Obtén citas con múltiples métodos de selección +quotes = page.css('.quote') # Selector CSS +quotes = page.xpath('//div[@class="quote"]') # XPath +quotes = page.find_all('div', {'class': 'quote'}) # Estilo BeautifulSoup +# Igual que +quotes = page.find_all('div', class_='quote') +quotes = page.find_all(['div'], class_='quote') +quotes = page.find_all(class_='quote') # y así sucesivamente... 
+# Encuentra elementos por contenido de texto +quotes = page.find_by_text('quote', tag='div') + +# Navegación avanzada +quote_text = page.css('.quote')[0].css('.text::text').get() +quote_text = page.css('.quote').css('.text::text').getall() # Selectores encadenados +first_quote = page.css('.quote')[0] +author = first_quote.next_sibling.css('.author::text') +parent_container = first_quote.parent + +# Relaciones y similitud de elementos +similar_elements = first_quote.find_similar() +below_elements = first_quote.below_elements() +``` +Puedes usar el parser directamente si no necesitas obtener sitios web, como se muestra a continuación: ```python -# CSS selectors -page.css('a::text') # Extracta texto -page.css('a::attr(href)') # Extracta atributos -page.css('a', recursive=False) # Solo elementos directos -page.css('a', auto_save=True) # Guarda posiciones de los elementos automáticamente - -# XPath -page.xpath('//a/text()') - -# Búsqueda flexible -page.find_by_text('Python', first_match=True) # Encuentra por texto -page.find_by_regex(r'\d{4}') # Encuentra por patrón regex -page.find('div', {'class': 'container'}) # Encuentra por atributos - -# Navegación -element.parent # Obtener elemento padre -element.next_sibling # Obtener siguiente hermano -element.children # Obtener hijos - -# Elementos similares -similar = page.get_similar(element) # Encuentra elementos similares - -# Scraping adaptativo -saved_elements = page.css('.product', auto_save=True) -# Más tarde, cuando el sitio web cambia: -page.css('.product', adaptive=True) # Encuentra elementos usando posiciones guardadas +from scrapling.parser import Selector + +page = Selector("...") ``` +¡Y funciona exactamente de la misma manera! 
-### Uso de Sesión +### Ejemplos de Gestión de Session Async ```python -from scrapling.fetchers import FetcherSession, AsyncFetcherSession - -# Sesión sincrónica -with FetcherSession() as session: - # Las cookies se mantienen automáticamente - page1 = session.get('https://quotes.toscrape.com/login') - page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'}) - - # Cambiar fingerprint del navegador si es necesario +import asyncio +from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession + +async with FetcherSession(http3=True) as session: # `FetcherSession` es consciente del contexto y puede funcionar tanto en patrones sync/async + page1 = session.get('https://quotes.toscrape.com/') page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135') # Uso de sesión async async with AsyncStealthySession(max_pages=2) as session: tasks = [] urls = ['https://example.com/page1', 'https://example.com/page2'] - + for url in urls: task = session.fetch(url) tasks.append(task) - + print(session.get_pool_stats()) # Opcional - El estado del pool de pestañas del navegador (ocupado/libre/error) results = await asyncio.gather(*tasks) print(session.get_pool_stats()) @@ -210,11 +289,11 @@ async with AsyncStealthySession(max_pages=2) as session: ## CLI y Shell Interactivo -Scrapling v0.3 incluye una poderosa interfaz de línea de comandos: +Scrapling incluye una poderosa interfaz de línea de comandos: [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339) -Lanzar shell interactivo de Web Scraping +Lanzar el Shell interactivo de Web Scraping ```bash scrapling shell ``` @@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas. ``` > [!NOTE] -> Hay muchas características adicionales, pero queremos mantener esta página concisa, como el servidor MCP y el Shell Interactivo de Web Scraping. 
Consulta la documentación completa [aquí](https://scrapling.readthedocs.io/en/latest/) +> Hay muchas características adicionales, pero queremos mantener esta página concisa, incluyendo el servidor MCP y el Shell Interactivo de Web Scraping. Consulta la documentación completa [aquí](https://scrapling.readthedocs.io/en/latest/) ## Benchmarks de Rendimiento -Scrapling no solo es poderoso, también es increíblemente rápido, y las actualizaciones desde la versión 0.3 han brindado mejoras de rendimiento excepcionales en todas las operaciones. Los siguientes benchmarks comparan el analizador de Scrapling con otras bibliotecas populares. +Scrapling no solo es potente, también es ultrarrápido. Los siguientes benchmarks comparan el parser de Scrapling con las últimas versiones de otras bibliotecas populares. ### Prueba de Velocidad de Extracción de Texto (5000 elementos anidados) -| # | Biblioteca | Tiempo (ms) | vs Scrapling | +| # | Biblioteca | Tiempo (ms) | vs Scrapling | |---|:-----------------:|:-----------:|:------------:| -| 1 | Scrapling | 1.99 | 1.0x | -| 2 | Parsel/Scrapy | 2.01 | 1.01x | -| 3 | Raw Lxml | 2.5 | 1.256x | -| 4 | PyQuery | 22.93 | ~11.5x | -| 5 | Selectolax | 80.57 | ~40.5x | -| 6 | BS4 with Lxml | 1541.37 | ~774.6x | -| 7 | MechanicalSoup | 1547.35 | ~777.6x | -| 8 | BS4 with html5lib | 3410.58 | ~1713.9x | +| 1 | Scrapling | 2.02 | 1.0x | +| 2 | Parsel/Scrapy | 2.04 | 1.01 | +| 3 | Raw Lxml | 2.54 | 1.257 | +| 4 | PyQuery | 24.17 | ~12x | +| 5 | Selectolax | 82.63 | ~41x | +| 6 | MechanicalSoup | 1549.71 | ~767.1x | +| 7 | BS4 with Lxml | 1584.31 | ~784.3x | +| 8 | BS4 with html5lib | 3391.91 | ~1679.1x | ### Rendimiento de Similitud de Elementos y Búsqueda de Texto @@ -253,8 +332,8 @@ Las capacidades de búsqueda adaptativa de elementos de Scrapling superan signif | Biblioteca | Tiempo (ms) | vs Scrapling | |-------------|:-----------:|:------------:| -| Scrapling | 2.46 | 1.0x | -| AutoScraper | 13.3 | 5.407x | +| Scrapling | 2.39 | 1.0x | +| 
AutoScraper | 12.45 | 5.209x | > Todos los benchmarks representan promedios de más de 100 ejecuciones. Ver [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) para la metodología. @@ -267,29 +346,29 @@ Scrapling requiere Python 3.10 o superior: pip install scrapling ``` -A partir de v0.3.2, esta instalación solo incluye el motor de análisis y sus dependencias, sin ningún fetcher o dependencias de línea de comandos. +Esta instalación solo incluye el motor de análisis y sus dependencias, sin ningún fetcher ni dependencias de línea de comandos. ### Dependencias Opcionales 1. Si vas a usar alguna de las características adicionales a continuación, los fetchers, o sus clases, necesitarás instalar las dependencias de los fetchers y sus dependencias del navegador de la siguiente manera: ```bash pip install "scrapling[fetchers]" - + scrapling install ``` - Esto descarga todos los navegadores, junto con sus dependencias del sistema y dependencias de manipulación de huellas digitales. + Esto descarga todos los navegadores, junto con sus dependencias del sistema y dependencias de manipulación de fingerprint. 2. Características adicionales: - Instalar la característica del servidor MCP: ```bash pip install "scrapling[ai]" ``` - - Instalar características del shell (shell de Web Scraping y el comando `extract`): + - Instalar características del Shell (Shell de Web Scraping y el comando `extract`): ```bash pip install "scrapling[shell]" ``` - - Instalar todo: + - Instalar todo: ```bash pip install "scrapling[all]" ``` @@ -324,12 +403,5 @@ Este trabajo está licenciado bajo la Licencia BSD-3-Clause. 
Este proyecto incluye código adaptado de: - Parsel (Licencia BSD)—Usado para el submódulo [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) -## Agradecimientos y Referencias - -- El brillante trabajo de [Daijro](https://github.com/daijro) en [BrowserForge](https://github.com/daijro/browserforge) y [Camoufox](https://github.com/daijro/camoufox) -- El brillante trabajo de [Vinyzu](https://github.com/Vinyzu) en [Botright](https://github.com/Vinyzu/Botright) y [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright) -- [brotector](https://github.com/kaliiiiiiiiii/brotector) por técnicas de evasión de detección de navegador -- [fakebrowser](https://github.com/kkoooqq/fakebrowser) y [BotBrowser](https://github.com/botswin/BotBrowser) por investigación de huellas digitales - --- -
Diseñado y elaborado con ❤️ por Karim Shoair.

\ No newline at end of file +
Diseñado y elaborado con ❤️ por Karim Shoair.

diff --git a/docs/README_JP.md b/docs/README_JP.md index 1427607ef9f3cc39fe60a13294f00e0a88dc519c..3423f4999f747ce5a4fd6892c1f03a10d3e916eb 100644 --- a/docs/README_JP.md +++ b/docs/README_JP.md @@ -1,9 +1,14 @@ -

-
- main poster -
- 簡単で効率的なウェブスクレイピング、あるべき姿! -

+

+ + + + Scrapling Poster + + +
+ Effortless Web Scraping for the Modern Web +

+

Tests @@ -24,46 +29,47 @@

- - 選択メソッド - - · - - フェッチャーの選択 - - · - - CLI - - · - - MCPモード - - · - - Beautifulsoupからの移行 - + 選択メソッド + · + Fetcherの選び方 + · + CLI + · + MCPモード + · + Beautifulsoupからの移行

-**アンチボットシステムとの戦いをやめましょう。ウェブサイトが更新されるたびにセレクタを書き直すのをやめましょう。** +Scraplingは、単一のリクエストから本格的なクロールまですべてを処理する適応型Web Scrapingフレームワークです。 -Scraplingは単なるウェブスクレイピングライブラリではありません。ウェブサイトの変更から学習し、それとともに進化する最初の**適応型**スクレイピングライブラリです。他のライブラリがウェブサイトの構造が更新されると壊れる一方で、Scraplingは自動的に要素を再配置し、スクレイパーを稼働し続けます。 +そのパーサーはウェブサイトの変更から学習し、ページが更新されたときに要素を自動的に再配置します。Fetcherはすぐに使えるCloudflare Turnstileなどのアンチボットシステムを回避します。そしてSpiderフレームワークにより、Pause & Resumeや自動Proxy回転機能を備えた並行マルチSessionクロールへとスケールアップできます — すべてわずか数行のPythonで。1つのライブラリ、妥協なし。 -モダンウェブ向けに構築されたScraplingは、**独自の高速パースエンジン**とフェッチャーを備えており、あなたが直面する、または直面するであろうすべてのウェブスクレイピングの課題に対応します。ウェブスクレイパーによってウェブスクレイパーと一般ユーザーのために構築され、誰にでも何かがあります。 +リアルタイム統計とStreamingによる超高速クロール。Web Scraperによって、Web Scraperと一般ユーザーのために構築され、誰にでも何かがあります。 ```python ->> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher ->> StealthyFetcher.adaptive = True -# レーダーの下でウェブサイトのソースを取得! ->> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) ->> print(page.status) -200 ->> products = page.css('.product', auto_save=True) # ウェブサイトのデザイン変更に耐えるデータをスクレイプ! ->> # 後でウェブサイトの構造が変わったら、`adaptive=True`を渡す ->> products = page.css('.product', adaptive=True) # そしてScraplingはまだそれらを見つけます! +from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher +StealthyFetcher.adaptive = True +page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # レーダーの下でウェブサイトを取得! +products = page.css('.product', auto_save=True) # ウェブサイトのデザイン変更に耐えるデータをスクレイプ! +products = page.css('.product', adaptive=True) # 後でウェブサイトの構造が変わったら、`adaptive=True`を渡して見つける! 
+``` +または本格的なクロールへスケールアップ +```python +from scrapling.spiders import Spider, Response + +class MySpider(Spider): + name = "demo" + start_urls = ["https://example.com/"] + + async def parse(self, response: Response): + for item in response.css('.product'): + yield {"title": item.css('h2::text').get()} + +MySpider().start() ``` -# スポンサー + +# スポンサー @@ -87,138 +93,211 @@ Scraplingは単なるウェブスクレイピングライブラリではあり ## 主な機能 -### セッションサポート付き高度なウェブサイト取得 -- **HTTPリクエスト**:`Fetcher`クラスで高速でステルスなHTTPリクエスト。ブラウザのTLSフィンガープリント、ヘッダーを模倣し、HTTP3を使用できます。 -- **動的読み込み**:Playwright's ChromiumとGoogle Chromeをサポートする`DynamicFetcher`クラスを通じた完全なブラウザ自動化で動的ウェブサイトを取得。 -- **アンチボット回避**:`StealthyFetcher`とフィンガープリント偽装による高度なステルス機能。自動化でCloudflareのTurnstile/Interstitialのすべてのタイプを簡単に回避できます。 -- **セッション管理**:リクエスト間でCookieと状態を管理するための`FetcherSession`、`StealthySession`、`DynamicSession`クラスによる永続的なセッションサポート。 -- **非同期サポート**:すべてのフェッチャーと専用非同期セッションクラス全体での完全な非同期サポート。 +### Spider — 本格的なクロールフレームワーク +- 🕷️ **Scrapy風のSpider API**:`start_urls`、async `parse` callback、`Request`/`Response`オブジェクトでSpiderを定義。 +- ⚡ **並行クロール**:設定可能な並行数制限、ドメインごとのスロットリング、ダウンロード遅延。 +- 🔄 **マルチSessionサポート**:HTTPリクエストとステルスヘッドレスブラウザの統一インターフェース — IDによって異なるSessionにリクエストをルーティング。 +- 💾 **Pause & Resume**:Checkpointベースのクロール永続化。Ctrl+Cで正常にシャットダウン;再起動すると中断したところから再開。 +- 📡 **Streamingモード**:`async for item in spider.stream()`でリアルタイム統計とともにスクレイプされたアイテムをStreamingで受信 — UI、パイプライン、長時間実行クロールに最適。 +- 🛡️ **ブロックされたリクエストの検出**:カスタマイズ可能なロジックによるブロックされたリクエストの自動検出とリトライ。 +- 📦 **組み込みエクスポート**:フックや独自のパイプライン、または組み込みのJSON/JSONLで結果をエクスポート。それぞれ`result.items.to_json()` / `result.items.to_jsonl()`を使用。 + +### Sessionサポート付き高度なウェブサイト取得 +- **HTTPリクエスト**:`Fetcher`クラスで高速かつステルスなHTTPリクエスト。ブラウザのTLS fingerprint、ヘッダーを模倣し、HTTP/3を使用可能。 +- **動的読み込み**:PlaywrightのChromiumとGoogle Chromeをサポートする`DynamicFetcher`クラスによる完全なブラウザ自動化で動的ウェブサイトを取得。 +- **アンチボット回避**:`StealthyFetcher`とfingerprint偽装による高度なステルス機能。自動化でCloudflareのTurnstile/Interstitialのすべてのタイプを簡単に回避。 +- 
**Session管理**:リクエスト間でCookieと状態を管理するための`FetcherSession`、`StealthySession`、`DynamicSession`クラスによる永続的なSessionサポート。 +- **Proxy回転**:すべてのSessionタイプに対応したラウンドロビンまたはカスタム戦略の組み込み`ProxyRotator`、さらにリクエストごとのProxyオーバーライド。 +- **ドメインブロック**:ブラウザベースのFetcherで特定のドメイン(およびそのサブドメイン)へのリクエストをブロック。 +- **asyncサポート**:すべてのFetcherおよび専用asyncSessionクラス全体での完全なasyncサポート。 ### 適応型スクレイピングとAI統合 - 🔄 **スマート要素追跡**:インテリジェントな類似性アルゴリズムを使用してウェブサイトの変更後に要素を再配置。 - 🎯 **スマート柔軟選択**:CSSセレクタ、XPathセレクタ、フィルタベース検索、テキスト検索、正規表現検索など。 -- 🔍 **類似要素を見つける**:見つかった要素に類似した要素を自動的に特定。 -- 🤖 **AIと使用するMCPサーバー**:AI支援ウェブスクレイピングとデータ抽出のための組み込みMCPサーバー。MCPサーバーは、AI(Claude/Cursorなど)に渡す前にScraplingを活用してターゲットコンテンツを抽出する強力でカスタムな機能を備えており、操作を高速化し、トークン使用量を最小限に抑えることでコストを削減します。([デモビデオ](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) +- 🔍 **類似要素の検出**:見つかった要素に類似した要素を自動的に特定。 +- 🤖 **AIと使用するMCPサーバー**:AI支援Web Scrapingとデータ抽出のための組み込みMCPサーバー。MCPサーバーは、AI(Claude/Cursorなど)に渡す前にScraplingを活用してターゲットコンテンツを抽出する強力でカスタムな機能を備えており、操作を高速化し、トークン使用量を最小限に抑えることでコストを削減します。([デモ動画](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) ### 高性能で実戦テスト済みのアーキテクチャ -- 🚀 **高速**:ほとんどのPythonスクレイピングライブラリを上回る最適化されたパフォーマンス。 +- 🚀 **超高速**:ほとんどのPythonスクレイピングライブラリを上回る最適化されたパフォーマンス。 - 🔋 **メモリ効率**:最小のメモリフットプリントのための最適化されたデータ構造と遅延読み込み。 - ⚡ **高速JSONシリアル化**:標準ライブラリの10倍の速度。 -- 🏗️ **実戦テスト済み**:Scraplingは92%のテストカバレッジと完全な型ヒントカバレッジを備えているだけでなく、過去1年間に数百人のウェブスクレイパーによって毎日使用されてきました。 +- 🏗️ **実戦テスト済み**:Scraplingは92%のテストカバレッジと完全な型ヒントカバレッジを備えているだけでなく、過去1年間に数百人のWeb Scraperによって毎日使用されてきました。 -### 開発者/ウェブスクレイパーにやさしい体験 -- 🎯 **インタラクティブウェブスクレイピングシェル**:Scraping統合、ショートカット、curlリクエストをScraplingリクエストに変換したり、ブラウザでリクエスト結果を表示したりするなどの新しいツールを備えたオプションの組み込みIPythonシェルで、ウェブスクレイピングスクリプトの開発を加速します。 +### 開発者/Web Scraperにやさしい体験 +- 🎯 **インタラクティブWeb Scraping Shell**:Scrapling統合、ショートカット、curlリクエストをScraplingリクエストに変換したり、ブラウザでリクエスト結果を表示したりするなどの新しいツールを備えたオプションの組み込みIPython Shellで、Web Scrapingスクリプトの開発を加速。 - 🚀 **ターミナルから直接使用**:オプションで、コードを一行も書かずにScraplingを使用してURLをスクレイプできます! 
- 🛠️ **豊富なナビゲーションAPI**:親、兄弟、子のナビゲーションメソッドによる高度なDOMトラバーサル。 - 🧬 **強化されたテキスト処理**:組み込みの正規表現、クリーニングメソッド、最適化された文字列操作。 - 📝 **自動セレクタ生成**:任意の要素に対して堅牢なCSS/XPathセレクタを生成。 -- 🔌 **馴染みのあるAPI**:Scrapy/Parselで使用されている同じ疑似要素を持つScrapy/BeautifulSoupに似ています。 -- 📘 **完全な型カバレッジ**:優れたIDEサポートとコード補完のための完全な型ヒント。 +- 🔌 **馴染みのあるAPI**:Scrapy/Parselで使用されている同じ疑似要素を持つScrapy/BeautifulSoupに似た設計。 +- 📘 **完全な型カバレッジ**:優れたIDEサポートとコード補完のための完全な型ヒント。コードベース全体が変更のたびに**PyRight**と**MyPy**で自動的にスキャンされます。 - 🔋 **すぐに使えるDockerイメージ**:各リリースで、すべてのブラウザを含むDockerイメージが自動的にビルドおよびプッシュされます。 ## はじめに +深く掘り下げずに、Scraplingにできることの簡単な概要をお見せしましょう。 + ### 基本的な使い方 +Sessionサポート付きHTTPリクエスト ```python -from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher -from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession +from scrapling.fetchers import Fetcher, FetcherSession -# セッションサポート付きHTTPリクエスト -with FetcherSession(impersonate='chrome') as session: # ChromeのTLSフィンガープリントの最新バージョンを使用 +with FetcherSession(impersonate='chrome') as session: # ChromeのTLS fingerprintの最新バージョンを使用 page = session.get('https://quotes.toscrape.com/', stealthy_headers=True) - quotes = page.css('.quote .text::text') + quotes = page.css('.quote .text::text').getall() # または一回限りのリクエストを使用 page = Fetcher.get('https://quotes.toscrape.com/') -quotes = page.css('.quote .text::text') +quotes = page.css('.quote .text::text').getall() +``` +高度なステルスモード +```python +from scrapling.fetchers import StealthyFetcher, StealthySession -# 高度なステルスモード(完了するまでブラウザを開いたままにする) -with StealthySession(headless=True, solve_cloudflare=True) as session: +with StealthySession(headless=True, solve_cloudflare=True) as session: # 完了するまでブラウザを開いたままにする page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False) - data = page.css('#padded_content a') + data = page.css('#padded_content a').getall() -# または一回限りのリクエストスタイルを使用、このリクエストのためにブラウザを開き、完了後に閉じる +# または一回限りのリクエストスタイル、このリクエストのためにブラウザを開き、完了後に閉じる page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare') 
-data = page.css('#padded_content a') - -# 完全なブラウザ自動化(完了するまでブラウザを開いたままにする) -with DynamicSession(headless=True) as session: - page = session.fetch('https://quotes.toscrape.com/', network_idle=True) - quotes = page.css('.quote .text::text') - -# または一回限りのリクエストスタイルを使用 -page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True) -quotes = page.css('.quote .text::text') +data = page.css('#padded_content a').getall() +``` +完全なブラウザ自動化 +```python +from scrapling.fetchers import DynamicFetcher, DynamicSession + +with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # 完了するまでブラウザを開いたままにする + page = session.fetch('https://quotes.toscrape.com/', load_dom=False) + data = page.xpath('//span[@class="text"]/text()').getall() # お好みであればXPathセレクタを使用 + +# または一回限りのリクエストスタイル、このリクエストのためにブラウザを開き、完了後に閉じる +page = DynamicFetcher.fetch('https://quotes.toscrape.com/') +data = page.css('.quote .text::text').getall() +``` + +### Spider +並行リクエスト、複数のSessionタイプ、Pause & Resumeを備えた本格的なクローラーを構築: +```python +from scrapling.spiders import Spider, Request, Response + +class QuotesSpider(Spider): + name = "quotes" + start_urls = ["https://quotes.toscrape.com/"] + concurrent_requests = 10 + + async def parse(self, response: Response): + for quote in response.css('.quote'): + yield { + "text": quote.css('.text::text').get(), + "author": quote.css('.author::text').get(), + } + + next_page = response.css('.next a') + if next_page: + yield response.follow(next_page[0].attrib['href']) + +result = QuotesSpider().start() +print(f"{len(result.items)}件の引用をスクレイプしました") +result.items.to_json("quotes.json") +``` +単一のSpiderで複数のSessionタイプを使用: +```python +from scrapling.spiders import Spider, Request, Response +from scrapling.fetchers import FetcherSession, AsyncStealthySession + +class MultiSessionSpider(Spider): + name = "multi" + start_urls = ["https://example.com/"] + + def configure_sessions(self, manager): + manager.add("fast", 
FetcherSession(impersonate="chrome")) + manager.add("stealth", AsyncStealthySession(headless=True), lazy=True) + + async def parse(self, response: Response): + for link in response.css('a::attr(href)').getall(): + # 保護されたページはステルスSessionを通してルーティング + if "protected" in link: + yield Request(link, sid="stealth") + else: + yield Request(link, sid="fast", callback=self.parse) # 明示的なcallback +``` +Checkpointを使用して長時間のクロールをPause & Resume: +```python +QuotesSpider(crawldir="./crawl_data").start() ``` +Ctrl+Cを押すと正常に一時停止し、進捗は自動的に保存されます。後でSpiderを再度起動する際に同じ`crawldir`を渡すと、中断したところから再開します。 + +### 高度なパースとナビゲーション +```python +from scrapling.fetchers import Fetcher -### 要素の選択 +# 豊富な要素選択とナビゲーション +page = Fetcher.get('https://quotes.toscrape.com/') + +# 複数の選択メソッドで引用を取得 +quotes = page.css('.quote') # CSSセレクタ +quotes = page.xpath('//div[@class="quote"]') # XPath +quotes = page.find_all('div', {'class': 'quote'}) # BeautifulSoupスタイル +# 以下と同じ +quotes = page.find_all('div', class_='quote') +quotes = page.find_all(['div'], class_='quote') +quotes = page.find_all(class_='quote') # など... 
+# テキスト内容で要素を検索 +quotes = page.find_by_text('quote', tag='div') + +# 高度なナビゲーション +quote_text = page.css('.quote')[0].css('.text::text').get() +quote_text = page.css('.quote').css('.text::text').getall() # チェーンセレクタ +first_quote = page.css('.quote')[0] +author = first_quote.next_sibling.css('.author::text') +parent_container = first_quote.parent + +# 要素の関連性と類似性 +similar_elements = first_quote.find_similar() +below_elements = first_quote.below_elements() +``` +ウェブサイトを取得せずにパーサーをすぐに使用することもできます: ```python -# CSSセレクタ -page.css('a::text') # テキストを抽出 -page.css('a::attr(href)') # 属性を抽出 -page.css('a', recursive=False) # 直接の要素のみ -page.css('a', auto_save=True) # 要素の位置を自動保存 - -# XPath -page.xpath('//a/text()') - -# 柔軟な検索 -page.find_by_text('Python', first_match=True) # テキストで検索 -page.find_by_regex(r'\d{4}') # 正規表現パターンで検索 -page.find('div', {'class': 'container'}) # 属性で検索 - -# ナビゲーション -element.parent # 親要素を取得 -element.next_sibling # 次の兄弟を取得 -element.children # 子要素を取得 - -# 類似要素 -similar = page.get_similar(element) # 類似要素を見つける - -# 適応型スクレイピング -saved_elements = page.css('.product', auto_save=True) -# 後でウェブサイトが変更されたとき: -page.css('.product', adaptive=True) # 保存された位置を使用して要素を見つける +from scrapling.parser import Selector + +page = Selector("...") ``` +まったく同じ方法で動作します! 
-### セッションの使用 +### 非同期Session管理の例 ```python -from scrapling.fetchers import FetcherSession, AsyncFetcherSession - -# 同期セッション -with FetcherSession() as session: - # Cookieは自動的に維持されます - page1 = session.get('https://quotes.toscrape.com/login') - page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'}) - - # 必要に応じてブラウザのフィンガープリントを切り替え +import asyncio +from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession + +async with FetcherSession(http3=True) as session: # `FetcherSession`はコンテキストアウェアで、同期/非同期両方のパターンで動作可能 + page1 = session.get('https://quotes.toscrape.com/') page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135') -# 非同期セッションの使用 +# 非同期Sessionの使用 async with AsyncStealthySession(max_pages=2) as session: tasks = [] urls = ['https://example.com/page1', 'https://example.com/page2'] - + for url in urls: task = session.fetch(url) tasks.append(task) - + print(session.get_pool_stats()) # オプション - ブラウザタブプールのステータス(ビジー/フリー/エラー) results = await asyncio.gather(*tasks) print(session.get_pool_stats()) ``` -## CLIとインタラクティブシェル +## CLIとインタラクティブShell -Scrapling v0.3には強力なコマンドラインインターフェースが含まれています: +Scraplingには強力なコマンドラインインターフェースが含まれています: [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339) -インタラクティブウェブスクレイピングシェルを起動 +インタラクティブWeb Scraping Shellを起動 ```bash scrapling shell ``` -プログラミングせずに直接ページをファイルに抽出(デフォルトで`body`タグ内のコンテンツを抽出)。出力ファイルが`.txt`で終わる場合、ターゲットのテキストコンテンツが抽出されます。`.md`で終わる場合、HTMLコンテンツのMarkdown表現になります;`.html`で終わる場合、HTMLコンテンツそのものになります。 +プログラミングせずに直接ページをファイルに抽出(デフォルトで`body`タグ内のコンテンツを抽出)。出力ファイルが`.txt`で終わる場合、ターゲットのテキストコンテンツが抽出されます。`.md`で終わる場合、HTMLコンテンツのMarkdown表現になります。`.html`で終わる場合、HTMLコンテンツそのものになります。 ```bash scrapling extract get 'https://example.com' content.md scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # CSSセレクタ'#fromSkipToProducts'に一致するすべての要素 @@ -227,34 +306,34 @@ scrapling extract 
stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas. ``` > [!NOTE] -> MCPサーバーやインタラクティブウェブスクレイピングシェルなど、他にも多くの追加機能がありますが、このページは簡潔に保ちたいと思います。完全なドキュメントは[こちら](https://scrapling.readthedocs.io/en/latest/)をご覧ください +> MCPサーバーやインタラクティブWeb Scraping Shellなど、他にも多くの追加機能がありますが、このページは簡潔に保ちたいと思います。完全なドキュメントは[こちら](https://scrapling.readthedocs.io/en/latest/)をご覧ください ## パフォーマンスベンチマーク -Scraplingは強力であるだけでなく、驚くほど高速で、バージョン0.3以降のアップデートはすべての操作で優れたパフォーマンス向上を実現しています。以下のベンチマークは、Scraplingのパーサーを他の人気のあるライブラリと比較しています。 +Scraplingは強力であるだけでなく、超高速です。以下のベンチマークは、Scraplingのパーサーを他の人気ライブラリの最新バージョンと比較しています。 ### テキスト抽出速度テスト(5000個のネストされた要素) -| # | ライブラリ | 時間(ms) | vs Scrapling | -|---|:-----------------:|:-------:|:------------:| -| 1 | Scrapling | 1.99 | 1.0x | -| 2 | Parsel/Scrapy | 2.01 | 1.01x | -| 3 | Raw Lxml | 2.5 | 1.256x | -| 4 | PyQuery | 22.93 | ~11.5x | -| 5 | Selectolax | 80.57 | ~40.5x | -| 6 | BS4 with Lxml | 1541.37 | ~774.6x | -| 7 | MechanicalSoup | 1547.35 | ~777.6x | -| 8 | BS4 with html5lib | 3410.58 | ~1713.9x | +| # | ライブラリ | 時間(ms) | vs Scrapling | +|---|:-----------------:|:---------:|:------------:| +| 1 | Scrapling | 2.02 | 1.0x | +| 2 | Parsel/Scrapy | 2.04 | 1.01 | +| 3 | Raw Lxml | 2.54 | 1.257 | +| 4 | PyQuery | 24.17 | ~12x | +| 5 | Selectolax | 82.63 | ~41x | +| 6 | MechanicalSoup | 1549.71 | ~767.1x | +| 7 | BS4 with Lxml | 1584.31 | ~784.3x | +| 8 | BS4 with html5lib | 3391.91 | ~1679.1x | ### 要素類似性とテキスト検索のパフォーマンス Scraplingの適応型要素検索機能は代替手段を大幅に上回ります: -| ライブラリ | 時間(ms) | vs Scrapling | -|-------------|:------:|:------------:| -| Scrapling | 2.46 | 1.0x | -| AutoScraper | 13.3 | 5.407x | +| ライブラリ | 時間(ms) | vs Scrapling | +|-------------|:---------:|:------------:| +| Scrapling | 2.39 | 1.0x | +| AutoScraper | 12.45 | 5.209x | > すべてのベンチマークは100回以上の実行の平均を表します。方法論については[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)を参照してください。 @@ -267,25 +346,25 @@ ScraplingにはPython 3.10以上が必要です: pip install scrapling ``` 
-v0.3.2以降、このインストールにはパーサーエンジンとその依存関係のみが含まれており、フェッチャーやコマンドライン依存関係は含まれていません。 +このインストールにはパーサーエンジンとその依存関係のみが含まれており、Fetcherやコマンドライン依存関係は含まれていません。 ### オプションの依存関係 -1. 以下の追加機能、フェッチャー、またはそれらのクラスのいずれかを使用する場合は、フェッチャーの依存関係とブラウザの依存関係を次のようにインストールする必要があります: +1. 以下の追加機能、Fetcher、またはそれらのクラスのいずれかを使用する場合は、Fetcherの依存関係とブラウザの依存関係を次のようにインストールする必要があります: ```bash pip install "scrapling[fetchers]" - + scrapling install ``` - これにより、すべてのブラウザ、およびそれらのシステム依存関係とフィンガープリント操作依存関係がダウンロードされます。 + これにより、すべてのブラウザ、およびそれらのシステム依存関係とfingerprint操作依存関係がダウンロードされます。 2. 追加機能: - MCPサーバー機能をインストール: ```bash pip install "scrapling[ai]" ``` - - シェル機能(ウェブスクレイピングシェルと`extract`コマンド)をインストール: + - Shell機能(Web Scraping Shellと`extract`コマンド)をインストール: ```bash pip install "scrapling[shell]" ``` @@ -324,12 +403,5 @@ docker pull ghcr.io/d4vinci/scrapling:latest このプロジェクトには次から適応されたコードが含まれています: - Parsel(BSDライセンス)— [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)サブモジュールに使用 -## 感謝と参考文献 - -- [Daijro](https://github.com/daijro)の[BrowserForge](https://github.com/daijro/browserforge)と[Camoufox](https://github.com/daijro/camoufox)における素晴らしい仕事 -- [Vinyzu](https://github.com/Vinyzu)の[Botright](https://github.com/Vinyzu/Botright)と[PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)における素晴らしい仕事 -- ブラウザ検出回避技術を提供する[brotector](https://github.com/kaliiiiiiiiii/brotector) -- フィンガープリント研究を提供する[fakebrowser](https://github.com/kkoooqq/fakebrowser)と[BotBrowser](https://github.com/botswin/BotBrowser) - --- -
Karim Shoairによって❤️でデザインおよび作成されました。

\ No newline at end of file +
Karim Shoairによって❤️でデザインおよび作成されました。

diff --git a/docs/README_RU.md b/docs/README_RU.md index fed6f2615a4d4995f32715545257e77ecca00218..dc59ba61db2cd0e7d45d2138aa2875d5c53eefb3 100644 --- a/docs/README_RU.md +++ b/docs/README_RU.md @@ -1,9 +1,14 @@ -

-
- main poster -
- Простой, легкий веб-скрапинг, каким он и должен быть! -

+

+ + + + Scrapling Poster + + +
+ Effortless Web Scraping for the Modern Web +

+

Tests @@ -24,46 +29,47 @@

- - Методы выбора - - · - - Выбор фетчера - - · - - CLI - - · - - Режим MCP - - · - - Миграция с Beautifulsoup - + Методы выбора + · + Выбор Fetcher + · + CLI + · + Режим MCP + · + Миграция с Beautifulsoup

-**Прекратите бороться с анти-ботовыми системами. Прекратите переписывать селекторы после каждого обновления сайта.** +Scrapling — это адаптивный фреймворк для Web Scraping, который берёт на себя всё: от одного запроса до полномасштабного обхода сайтов. -Scrapling - это не просто очередная библиотека для веб-скрапинга. Это первая **адаптивная** библиотека для скрапинга, которая учится на изменениях сайтов и развивается вместе с ними. В то время как другие библиотеки ломаются, когда сайты обновляют свою структуру, Scrapling автоматически перемещает ваши элементы и поддерживает работу ваших скраперов. +Его парсер учится на изменениях сайтов и автоматически перемещает ваши элементы при обновлении страниц. Его Fetcher'ы обходят анти-бот системы вроде Cloudflare Turnstile прямо из коробки. А его Spider-фреймворк позволяет масштабироваться до параллельных, многосессионных обходов с Pause & Resume и автоматической ротацией Proxy — и всё это в нескольких строках Python. Одна библиотека, без компромиссов. -Созданный для современного веба, Scrapling имеет **собственный быстрый движок парсинга** и фетчеры для решения всех задач веб-скрапинга, с которыми вы сталкиваетесь или столкнетесь. Созданный веб-скраперами для веб-скраперов и обычных пользователей, здесь есть что-то для каждого. +Молниеносно быстрые обходы с отслеживанием статистики в реальном времени и Streaming. Создано веб-скраперами для веб-скраперов и обычных пользователей — здесь есть что-то для каждого. ```python ->> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher ->> StealthyFetcher.adaptive = True -# Получайте исходный код сайтов незаметно! ->> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) ->> print(page.status) -200 ->> products = page.css('.product', auto_save=True) # Скрапьте данные, которые переживут изменения дизайна сайта! 
->> # Позже, если структура сайта изменится, передайте `adaptive=True` ->> products = page.css('.product', adaptive=True) # и Scrapling все равно их найдет! +from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher +StealthyFetcher.adaptive = True +page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Загрузите сайт незаметно! +products = page.css('.product', auto_save=True) # Скрапьте данные, которые переживут изменения дизайна сайта! +products = page.css('.product', adaptive=True) # Позже, если структура сайта изменится, передайте `adaptive=True`, чтобы найти их! +``` +Или масштабируйте до полного обхода +```python +from scrapling.spiders import Spider, Response + +class MySpider(Spider): + name = "demo" + start_urls = ["https://example.com/"] + + async def parse(self, response: Response): + for item in response.css('.product'): + yield {"title": item.css('h2::text').get()} + +MySpider().start() ``` -# Спонсоры + +# Спонсоры @@ -87,138 +93,211 @@ Scrapling - это не просто очередная библиотека д ## Ключевые особенности -### Продвинутая загрузка сайтов с поддержкой сессий -- **HTTP-запросы**: Быстрые и скрытные HTTP-запросы с классом `Fetcher`. Может имитировать TLS-отпечаток браузера, заголовки и использовать HTTP3. +### Spider'ы — полноценный фреймворк для обхода сайтов +- 🕷️ **Scrapy-подобный Spider API**: Определяйте Spider'ов с `start_urls`, async `parse` callback'ами и объектами `Request`/`Response`. +- ⚡ **Параллельный обход**: Настраиваемые лимиты параллелизма, ограничение скорости по домену и задержки загрузки. +- 🔄 **Поддержка нескольких сессий**: Единый интерфейс для HTTP-запросов и скрытных headless-браузеров в одном Spider — маршрутизируйте запросы к разным сессиям по ID. +- 💾 **Pause & Resume**: Persistence обхода на основе Checkpoint'ов. Нажмите Ctrl+C для мягкой остановки; перезапустите, чтобы продолжить с того места, где вы остановились. 
+- 📡 **Режим Streaming**: Стримьте извлечённые элементы по мере их поступления через `async for item in spider.stream()` со статистикой в реальном времени — идеально для UI, конвейеров и длительных обходов.
+- 🛡️ **Обнаружение заблокированных запросов**: Автоматическое обнаружение и повторная отправка заблокированных запросов с настраиваемой логикой.
+- 📦 **Встроенный экспорт**: Экспортируйте результаты через хуки и собственный конвейер или встроенный JSON/JSONL с `result.items.to_json()` / `result.items.to_jsonl()` соответственно.
+
+### Продвинутая загрузка сайтов с поддержкой Session
+- **HTTP-запросы**: Быстрые и скрытные HTTP-запросы с классом `Fetcher`. Может имитировать TLS fingerprint браузера, заголовки и использовать HTTP/3.
 - **Динамическая загрузка**: Загрузка динамических сайтов с полной автоматизацией браузера через класс `DynamicFetcher`, поддерживающий Chromium от Playwright и Google Chrome.
-- **Обход анти-ботов**: Расширенные возможности скрытности с `StealthyFetcher` и подмену отпечатков. Может легко обойти все типы Turnstile/Interstitial от Cloudflare с помощью автоматизации.
+- **Обход анти-ботов**: Расширенные возможности скрытности с `StealthyFetcher` и подменой fingerprint'ов. Может легко обойти все типы Cloudflare Turnstile/Interstitial с помощью автоматизации.
 - **Управление сессиями**: Поддержка постоянных сессий с классами `FetcherSession`, `StealthySession` и `DynamicSession` для управления cookie и состоянием между запросами.
-- **Поддержка асинхронности**: Полная асинхронная поддержка во всех фетчерах и выделенных асинхронных классах сессий.
+- **Ротация Proxy**: Встроенный `ProxyRotator` с циклической или пользовательскими стратегиями для всех типов сессий, а также переопределение Proxy для каждого запроса.
+- **Блокировка доменов**: Блокируйте запросы к определённым доменам (и их поддоменам) в браузерных Fetcher'ах.
+- **Поддержка async**: Полная async-поддержка во всех Fetcher'ах и выделенных async-классах сессий.
### Адаптивный скрапинг и интеграция с ИИ - 🔄 **Умное отслеживание элементов**: Перемещайте элементы после изменений сайта с помощью интеллектуальных алгоритмов подобия. - 🎯 **Умный гибкий выбор**: CSS-селекторы, XPath-селекторы, поиск на основе фильтров, текстовый поиск, поиск по регулярным выражениям и многое другое. -- 🔍 **Поиск похожих элементов**: Автоматически находите элементы, похожие на найденные элементы. -- 🤖 **MCP-сервер для использования с ИИ**: Встроенный MCP-сервер для веб-скрапинга с помощью ИИ и извлечения данных. MCP-сервер обладает мощными, пользовательскими возможностями, которые используют Scrapling для извлечения целевого контента перед передачей его ИИ (Claude/Cursor/и т.д.), тем самым ускоряя операции и снижая затраты за счет минимизации использования токенов. ([демо-видео](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) +- 🔍 **Поиск похожих элементов**: Автоматически находите элементы, похожие на найденные. +- 🤖 **MCP-сервер для использования с ИИ**: Встроенный MCP-сервер для Web Scraping с помощью ИИ и извлечения данных. MCP-сервер обладает мощными пользовательскими возможностями, которые используют Scrapling для извлечения целевого контента перед передачей его ИИ (Claude/Cursor/и т.д.), тем самым ускоряя операции и снижая затраты за счёт минимизации использования токенов. ([демо-видео](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) ### Высокопроизводительная и проверенная в боях архитектура -- 🚀 **Молниеносно быстро**: Оптимизированная производительность превосходит большинство библиотек скрапинга Python. +- 🚀 **Молниеносная скорость**: Оптимизированная производительность, превосходящая большинство Python-библиотек для скрапинга. - 🔋 **Эффективное использование памяти**: Оптимизированные структуры данных и ленивая загрузка для минимального потребления памяти. -- ⚡ **Быстрая сериализация JSON**: В 10 раз быстрее, чем стандартная библиотека. +- ⚡ **Быстрая сериализация JSON**: В 10 раз быстрее стандартной библиотеки. 
- 🏗️ **Проверено в боях**: Scrapling имеет не только 92% покрытия тестами и полное покрытие type hints, но и ежедневно использовался сотнями веб-скраперов в течение последнего года. ### Удобный для разработчиков/веб-скраперов опыт -- 🎯 **Интерактивная оболочка веб-скрапинга**: Опциональная встроенная оболочка IPython с интеграцией Scrapling, ярлыками и новыми инструментами для ускорения разработки скриптов веб-скрапинга, такими как преобразование curl-запросов в Scrapling-запросы и просмотр результатов запросов в вашем браузере. +- 🎯 **Интерактивная Web Scraping Shell**: Опциональная встроенная IPython-оболочка с интеграцией Scrapling, ярлыками и новыми инструментами для ускорения разработки скриптов Web Scraping, такими как преобразование curl-запросов в запросы Scrapling и просмотр результатов запросов в браузере. - 🚀 **Используйте прямо из терминала**: При желании вы можете использовать Scrapling для скрапинга URL без написания ни одной строки кода! - 🛠️ **Богатый API навигации**: Расширенный обход DOM с методами навигации по родителям, братьям и детям. - 🧬 **Улучшенная обработка текста**: Встроенные регулярные выражения, методы очистки и оптимизированные операции со строками. -- 📝 **Автоматическая генерация селекторов**: Генерация надежных CSS/XPath селекторов для любого элемента. +- 📝 **Автоматическая генерация селекторов**: Генерация надёжных CSS/XPath-селекторов для любого элемента. - 🔌 **Знакомый API**: Похож на Scrapy/BeautifulSoup с теми же псевдоэлементами, используемыми в Scrapy/Parsel. -- 📘 **Полное покрытие типами**: Полные подсказки типов для отличной поддержки IDE и автодополнения кода. -- 🔋 **Готовый Docker-образ**: С каждым релизом автоматически создается и отправляется Docker-образ, содержащий все браузеры. +- 📘 **Полное покрытие типами**: Полные type hints для отличной поддержки IDE и автодополнения кода. Вся кодовая база автоматически проверяется **PyRight** и **MyPy** при каждом изменении. 
+- 🔋 **Готовый Docker-образ**: С каждым релизом автоматически создаётся и публикуется Docker-образ, содержащий все браузеры. ## Начало работы +Давайте кратко покажем, на что способен Scrapling, без глубокого погружения. + ### Базовое использование +HTTP-запросы с поддержкой Session ```python -from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher -from scrapling.fetchers import FetcherSession, StealthySession, DynamicSession +from scrapling.fetchers import Fetcher, FetcherSession -# HTTP-запросы с поддержкой сессий -with FetcherSession(impersonate='chrome') as session: # Используйте последнюю версию TLS-отпечатка Chrome +with FetcherSession(impersonate='chrome') as session: # Используйте последнюю версию TLS fingerprint Chrome page = session.get('https://quotes.toscrape.com/', stealthy_headers=True) - quotes = page.css('.quote .text::text') + quotes = page.css('.quote .text::text').getall() # Или используйте одноразовые запросы page = Fetcher.get('https://quotes.toscrape.com/') -quotes = page.css('.quote .text::text') +quotes = page.css('.quote .text::text').getall() +``` +Расширенный режим скрытности +```python +from scrapling.fetchers import StealthyFetcher, StealthySession -# Расширенный режим скрытности (Держите браузер открытым до завершения) -with StealthySession(headless=True, solve_cloudflare=True) as session: +with StealthySession(headless=True, solve_cloudflare=True) as session: # Держите браузер открытым, пока не закончите page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False) - data = page.css('#padded_content a') + data = page.css('#padded_content a').getall() -# Или используйте стиль одноразового запроса, открывает браузер для этого запроса, затем закрывает его после завершения +# Или используйте стиль одноразового запроса — открывает браузер для этого запроса, затем закрывает его после завершения page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare') -data = page.css('#padded_content a') - -# 
Полная автоматизация браузера (Держите браузер открытым до завершения) -with DynamicSession(headless=True) as session: - page = session.fetch('https://quotes.toscrape.com/', network_idle=True) - quotes = page.css('.quote .text::text') - -# Или используйте стиль одноразового запроса -page = DynamicFetcher.fetch('https://quotes.toscrape.com/', network_idle=True) -quotes = page.css('.quote .text::text') +data = page.css('#padded_content a').getall() +``` +Полная автоматизация браузера +```python +from scrapling.fetchers import DynamicFetcher, DynamicSession + +with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session: # Держите браузер открытым, пока не закончите + page = session.fetch('https://quotes.toscrape.com/', load_dom=False) + data = page.xpath('//span[@class="text"]/text()').getall() # XPath-селектор, если вы предпочитаете его + +# Или используйте стиль одноразового запроса — открывает браузер для этого запроса, затем закрывает его после завершения +page = DynamicFetcher.fetch('https://quotes.toscrape.com/') +data = page.css('.quote .text::text').getall() +``` + +### Spider'ы +Создавайте полноценные обходчики с параллельными запросами, несколькими типами сессий и Pause & Resume: +```python +from scrapling.spiders import Spider, Request, Response + +class QuotesSpider(Spider): + name = "quotes" + start_urls = ["https://quotes.toscrape.com/"] + concurrent_requests = 10 + + async def parse(self, response: Response): + for quote in response.css('.quote'): + yield { + "text": quote.css('.text::text').get(), + "author": quote.css('.author::text').get(), + } + + next_page = response.css('.next a') + if next_page: + yield response.follow(next_page[0].attrib['href']) + +result = QuotesSpider().start() +print(f"Извлечено {len(result.items)} цитат") +result.items.to_json("quotes.json") +``` +Используйте несколько типов сессий в одном Spider: +```python +from scrapling.spiders import Spider, Request, Response +from scrapling.fetchers 
import FetcherSession, AsyncStealthySession + +class MultiSessionSpider(Spider): + name = "multi" + start_urls = ["https://example.com/"] + + def configure_sessions(self, manager): + manager.add("fast", FetcherSession(impersonate="chrome")) + manager.add("stealth", AsyncStealthySession(headless=True), lazy=True) + + async def parse(self, response: Response): + for link in response.css('a::attr(href)').getall(): + # Направляйте защищённые страницы через stealth-сессию + if "protected" in link: + yield Request(link, sid="stealth") + else: + yield Request(link, sid="fast", callback=self.parse) # явный callback +``` +Приостанавливайте и возобновляйте длительные обходы с помощью Checkpoint'ов, запуская Spider следующим образом: +```python +QuotesSpider(crawldir="./crawl_data").start() ``` +Нажмите Ctrl+C для мягкой остановки — прогресс сохраняется автоматически. Позже, когда вы снова запустите Spider, передайте тот же `crawldir`, и он продолжит с того места, где остановился. + +### Продвинутый парсинг и навигация +```python +from scrapling.fetchers import Fetcher -### Выбор элементов +# Богатый выбор элементов и навигация +page = Fetcher.get('https://quotes.toscrape.com/') + +# Получение цитат различными методами выбора +quotes = page.css('.quote') # CSS-селектор +quotes = page.xpath('//div[@class="quote"]') # XPath +quotes = page.find_all('div', {'class': 'quote'}) # В стиле BeautifulSoup +# То же самое, что +quotes = page.find_all('div', class_='quote') +quotes = page.find_all(['div'], class_='quote') +quotes = page.find_all(class_='quote') # и так далее... 
+# Найти элемент по текстовому содержимому +quotes = page.find_by_text('quote', tag='div') + +# Продвинутая навигация +quote_text = page.css('.quote')[0].css('.text::text').get() +quote_text = page.css('.quote').css('.text::text').getall() # Цепочка селекторов +first_quote = page.css('.quote')[0] +author = first_quote.next_sibling.css('.author::text') +parent_container = first_quote.parent + +# Связи элементов и подобие +similar_elements = first_quote.find_similar() +below_elements = first_quote.below_elements() +``` +Вы можете использовать парсер напрямую, если не хотите загружать сайты, как показано ниже: ```python -# CSS-селекторы -page.css('a::text') # Извлечь текст -page.css('a::attr(href)') # Извлечь атрибуты -page.css('a', recursive=False) # Только прямые элементы -page.css('a', auto_save=True) # Автоматически сохранять позиции элементов - -# XPath -page.xpath('//a/text()') - -# Гибкий поиск -page.find_by_text('Python', first_match=True) # Найти по тексту -page.find_by_regex(r'\d{4}') # Найти по паттерну regex -page.find('div', {'class': 'container'}) # Найти по атрибутам - -# Навигация -element.parent # Получить родительский элемент -element.next_sibling # Получить следующего брата -element.children # Получить дочерние элементы - -# Похожие элементы -similar = page.get_similar(element) # Найти похожие элементы - -# Адаптивный скрапинг -saved_elements = page.css('.product', auto_save=True) -# Позже, когда сайт изменится: -page.css('.product', adaptive=True) # Найти элементы используя сохраненные позиции +from scrapling.parser import Selector + +page = Selector("...") ``` +И он работает точно так же! 
-### Использование сессий +### Примеры async Session ```python -from scrapling.fetchers import FetcherSession, AsyncFetcherSession - -# Синхронная сессия -with FetcherSession() as session: - # Cookie автоматически сохраняются - page1 = session.get('https://quotes.toscrape.com/login') - page2 = session.post('https://quotes.toscrape.com/login', data={'username': 'admin', 'password': 'admin'}) - - # При необходимости переключите отпечаток браузера +import asyncio +from scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession + +async with FetcherSession(http3=True) as session: # `FetcherSession` контекстно-осведомлён и может работать как в sync, так и в async-режимах + page1 = session.get('https://quotes.toscrape.com/') page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135') -# Использование асинхронной сессии +# Использование async-сессии async with AsyncStealthySession(max_pages=2) as session: tasks = [] urls = ['https://example.com/page1', 'https://example.com/page2'] - + for url in urls: task = session.fetch(url) tasks.append(task) - - print(session.get_pool_stats()) # Опционально - Статус пула вкладок браузера (занят/свободен/ошибка) + + print(session.get_pool_stats()) # Опционально — статус пула вкладок браузера (занят/свободен/ошибка) results = await asyncio.gather(*tasks) print(session.get_pool_stats()) ``` -## CLI и интерактивная оболочка +## CLI и интерактивная Shell -Scrapling v0.3 включает мощный интерфейс командной строки: +Scrapling включает мощный интерфейс командной строки: [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339) -Запустить интерактивную оболочку веб-скрапинга +Запустить интерактивную Web Scraping Shell ```bash scrapling shell ``` -Извлечь страницы в файл напрямую без программирования (Извлекает содержимое внутри тега `body` по умолчанию). Если выходной файл заканчивается на `.txt`, то будет извлечено текстовое содержимое цели. 
Если заканчивается на `.md`, это будет Markdown-представление HTML-содержимого; если заканчивается на `.html`, это будет само HTML-содержимое. +Извлечь страницы в файл напрямую без программирования (по умолчанию извлекает содержимое внутри тега `body`). Если выходной файл заканчивается на `.txt`, будет извлечено текстовое содержимое цели. Если заканчивается на `.md`, это будет Markdown-представление HTML-содержимого; если заканчивается на `.html`, это будет само HTML-содержимое. ```bash scrapling extract get 'https://example.com' content.md scrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome' # Все элементы, соответствующие CSS-селектору '#fromSkipToProducts' @@ -227,24 +306,24 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas. ``` > [!NOTE] -> Есть много дополнительных функций, но мы хотим сохранить эту страницу краткой, например, MCP-сервер и интерактивная оболочка веб-скрапинга. Ознакомьтесь с полной документацией [здесь](https://scrapling.readthedocs.io/en/latest/) +> Есть множество дополнительных возможностей, но мы хотим сохранить эту страницу краткой, включая MCP-сервер и интерактивную Web Scraping Shell. Ознакомьтесь с полной документацией [здесь](https://scrapling.readthedocs.io/en/latest/) ## Тесты производительности -Scrapling не только мощный - он также невероятно быстрый, и обновления с версии 0.3 обеспечили исключительные улучшения производительности во всех операциях. Следующие тесты производительности сравнивают парсер Scrapling с другими популярными библиотеками. +Scrapling не только мощный — он ещё и невероятно быстрый. Следующие тесты производительности сравнивают парсер Scrapling с последними версиями других популярных библиотек. 
### Тест скорости извлечения текста (5000 вложенных элементов) -| # | Библиотека | Время (мс) | vs Scrapling | +| # | Библиотека | Время (мс) | vs Scrapling | |---|:-----------------:|:----------:|:------------:| -| 1 | Scrapling | 1.99 | 1.0x | -| 2 | Parsel/Scrapy | 2.01 | 1.01x | -| 3 | Raw Lxml | 2.5 | 1.256x | -| 4 | PyQuery | 22.93 | ~11.5x | -| 5 | Selectolax | 80.57 | ~40.5x | -| 6 | BS4 with Lxml | 1541.37 | ~774.6x | -| 7 | MechanicalSoup | 1547.35 | ~777.6x | -| 8 | BS4 with html5lib | 3410.58 | ~1713.9x | +| 1 | Scrapling | 2.02 | 1.0x | +| 2 | Parsel/Scrapy | 2.04 | 1.01 | +| 3 | Raw Lxml | 2.54 | 1.257 | +| 4 | PyQuery | 24.17 | ~12x | +| 5 | Selectolax | 82.63 | ~41x | +| 6 | MechanicalSoup | 1549.71 | ~767.1x | +| 7 | BS4 with Lxml | 1584.31 | ~784.3x | +| 8 | BS4 with html5lib | 3391.91 | ~1679.1x | ### Производительность подобия элементов и текстового поиска @@ -253,8 +332,8 @@ Scrapling не только мощный - он также невероятно | Библиотека | Время (мс) | vs Scrapling | |-------------|:----------:|:------------:| -| Scrapling | 2.46 | 1.0x | -| AutoScraper | 13.3 | 5.407x | +| Scrapling | 2.39 | 1.0x | +| AutoScraper | 12.45 | 5.209x | > Все тесты производительности представляют собой средние значения более 100 запусков. См. [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) для методологии. @@ -267,33 +346,33 @@ Scrapling требует Python 3.10 или выше: pip install scrapling ``` -Начиная с v0.3.2, эта установка включает только движок парсера и его зависимости, без каких-либо фетчеров или зависимостей командной строки. +Эта установка включает только движок парсера и его зависимости, без каких-либо Fetcher'ов или зависимостей командной строки. ### Опциональные зависимости -1. Если вы собираетесь использовать какие-либо из дополнительных функций ниже, фетчеры или их классы, вам необходимо установить зависимости фетчеров и их зависимости браузера следующим образом: +1. 
Если вы собираетесь использовать какие-либо из дополнительных возможностей ниже, Fetcher'ы или их классы, вам необходимо установить зависимости Fetcher'ов и браузеров следующим образом: ```bash pip install "scrapling[fetchers]" - + scrapling install ``` - Это загрузит все браузеры вместе с их системными зависимостями и зависимостями манипуляции отпечатками. + Это загрузит все браузеры вместе с их системными зависимостями и зависимостями для манипуляции fingerprint'ами. -2. Дополнительные функции: +2. Дополнительные возможности: - Установить функцию MCP-сервера: ```bash pip install "scrapling[ai]" ``` - - Установить функции оболочки (оболочка веб-скрапинга и команда `extract`): + - Установить функции Shell (Web Scraping Shell и команда `extract`): ```bash pip install "scrapling[shell]" ``` - - Установить все: + - Установить всё: ```bash pip install "scrapling[all]" ``` - Помните, что вам нужно установить зависимости браузера с помощью `scrapling install` после любого из этих дополнений (если вы еще этого не сделали) + Помните, что вам нужно установить зависимости браузеров с помощью `scrapling install` после любого из этих дополнений (если вы ещё этого не сделали) ### Docker Вы также можете установить Docker-образ со всеми дополнениями и браузерами с помощью следующей команды из DockerHub: @@ -304,11 +383,11 @@ docker pull pyd4vinci/scrapling ```bash docker pull ghcr.io/d4vinci/scrapling:latest ``` -Этот образ автоматически создается и отправляется с использованием GitHub Actions и основной ветки репозитория. +Этот образ автоматически создаётся и публикуется с помощью GitHub Actions и основной ветки репозитория. -## Вклад +## Участие в разработке -Мы приветствуем вклад! Пожалуйста, прочитайте наши [руководства по внесению вклада](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) перед началом работы. +Мы приветствуем участие! 
Пожалуйста, прочитайте наши [руководства по участию в разработке](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) перед началом работы. ## Отказ от ответственности @@ -324,12 +403,5 @@ docker pull ghcr.io/d4vinci/scrapling:latest Этот проект включает код, адаптированный из: - Parsel (лицензия BSD) — Используется для подмодуля [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) -## Благодарности и ссылки - -- Блестящая работа [Daijro](https://github.com/daijro) над [BrowserForge](https://github.com/daijro/browserforge) и [Camoufox](https://github.com/daijro/camoufox) -- Блестящая работа [Vinyzu](https://github.com/Vinyzu) над [Botright](https://github.com/Vinyzu/Botright) и [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright) -- [brotector](https://github.com/kaliiiiiiiiii/brotector) за техники обхода обнаружения браузера -- [fakebrowser](https://github.com/kkoooqq/fakebrowser) и [BotBrowser](https://github.com/botswin/BotBrowser) за исследование отпечатков - --- -
Разработано и создано с ❤️ Карим Шоаир.

\ No newline at end of file +
Разработано и создано с ❤️ Каримом Шоаиром.

diff --git a/docs/ai/mcp-server.md b/docs/ai/mcp-server.md index 8e03cd8d521d7c221fd6dd0398208932db2aa4ec..316f4552e1b081ddc929b20d6b62d32a361c73cb 100644 --- a/docs/ai/mcp-server.md +++ b/docs/ai/mcp-server.md @@ -179,7 +179,7 @@ We will gradually go from simple prompts to more complex ones. We will use Claud ``` Use regular requests to scrape the main content from https://example.com and convert it to markdown format. ``` - This tells Claude which tool to use here, so it doesn't have to guess. Sometimes it will start using normal requests on its own, and at other times, it will assume browsers are better suited for this website without any apparent reason. As a general rule of thumb, you should always tell Claude which tool to use if you want to save time and money and get consistent results. + This tells Claude which tool to use here, so it doesn't have to guess. Sometimes it will start using normal requests on its own, and at other times, it will assume browsers are better suited for this website without any apparent reason. As a rule of thumb, you should always tell Claude which tool to use to save time and money and get consistent results. 2. **Targeted Data Extraction** @@ -189,7 +189,7 @@ We will gradually go from simple prompts to more complex ones. We will use Claud Get all product titles from https://shop.example.com using the CSS selector '.product-title'. If the request fails, retry up to 5 times every 10 seconds. ``` - The server will extract only the elements matching your selector and return them as a structured list. Notice I told it to set the tool to try only 3 times in case the website has connection issues, but the default setting should be fine for most cases. + The server will extract only the elements matching your selector and return them as a structured list. Notice I told it to set the tool to try up to 5 times in case the website has connection issues, but the default setting should be fine for most cases. 3. 
**E-commerce Data Collection** diff --git a/docs/api-reference/mcp-server.md b/docs/api-reference/mcp-server.md index 55fa91e54f87f80b6bfaded935cac7a9dadce23b..03cb10227144f48277934245c52cad2017443e7d 100644 --- a/docs/api-reference/mcp-server.md +++ b/docs/api-reference/mcp-server.md @@ -19,7 +19,7 @@ Or import the server class directly: from scrapling.core.ai import ScraplingMCPServer server = ScraplingMCPServer() -server.serve() +server.serve(http=False, host="0.0.0.0", port=8000) ``` ## Response Model diff --git a/docs/api-reference/proxy-rotation.md b/docs/api-reference/proxy-rotation.md new file mode 100644 index 0000000000000000000000000000000000000000..61a8fad70c763bb548274cb83e18c042fe241a55 --- /dev/null +++ b/docs/api-reference/proxy-rotation.md @@ -0,0 +1,18 @@ +--- +search: + exclude: true +--- + +# Proxy Rotation + +The `ProxyRotator` class provides thread-safe proxy rotation for any fetcher or session. + +You can import it directly like below: + +```python +from scrapling.fetchers import ProxyRotator +``` + +## ::: scrapling.engines.toolbelt.proxy_rotation.ProxyRotator + handler: python + :docstring: diff --git a/docs/api-reference/response.md b/docs/api-reference/response.md new file mode 100644 index 0000000000000000000000000000000000000000..26f6895453a96fd9b51a4c1405233933f9024f3b --- /dev/null +++ b/docs/api-reference/response.md @@ -0,0 +1,18 @@ +--- +search: + exclude: true +--- + +# Response Class + +The `Response` class wraps HTTP responses returned by all fetchers, providing access to status, headers, body, cookies, and a `Selector` for parsing. 
+ +You can import the `Response` class like below: + +```python +from scrapling.engines.toolbelt.custom import Response +``` + +## ::: scrapling.engines.toolbelt.custom.Response + handler: python + :docstring: diff --git a/docs/api-reference/spiders.md b/docs/api-reference/spiders.md new file mode 100644 index 0000000000000000000000000000000000000000..0a69abbf8d696306718da67a57f99c45f822194c --- /dev/null +++ b/docs/api-reference/spiders.md @@ -0,0 +1,42 @@ +--- +search: + exclude: true +--- + +# Spider Classes + +Here's the reference information for the spider framework classes' parameters, attributes, and methods. + +You can import them directly like below: + +```python +from scrapling.spiders import Spider, Request, CrawlResult, SessionManager, Response +``` + +## ::: scrapling.spiders.Spider + handler: python + :docstring: + +## ::: scrapling.spiders.Request + handler: python + :docstring: + +## Result Classes + +## ::: scrapling.spiders.result.CrawlResult + handler: python + :docstring: + +## ::: scrapling.spiders.result.CrawlStats + handler: python + :docstring: + +## ::: scrapling.spiders.result.ItemList + handler: python + :docstring: + +## Session Management + +## ::: scrapling.spiders.session.SessionManager + handler: python + :docstring: diff --git a/docs/benchmarks.md b/docs/benchmarks.md index f4040105bea8edd27903b0688eaea70fb35e83ac..40a0fdf54235ef036e70388b75080f7432a6f3aa 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -1,21 +1,20 @@ # Performance Benchmarks -Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 have delivered exceptional performance improvements across all operations. The following benchmarks compare Scrapling's parser with other popular libraries. - -## Benchmark Results +Scrapling isn't just powerful—it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries. 
### Text Extraction Speed Test (5000 nested elements) | # | Library | Time (ms) | vs Scrapling | |---|:-----------------:|:---------:|:------------:| -| 1 | Scrapling | 1.99 | 1.0x | -| 2 | Parsel/Scrapy | 2.01 | 1.01x | -| 3 | Raw Lxml | 2.5 | 1.256x | -| 4 | PyQuery | 22.93 | ~11.5x | -| 5 | Selectolax | 80.57 | ~40.5x | -| 6 | BS4 with Lxml | 1541.37 | ~774.6x | -| 7 | MechanicalSoup | 1547.35 | ~777.6x | -| 8 | BS4 with html5lib | 3410.58 | ~1713.9x | +| 1 | Scrapling | 2.02 | 1.0x | +| 2 | Parsel/Scrapy | 2.04 | 1.01 | +| 3 | Raw Lxml | 2.54 | 1.257 | +| 4 | PyQuery | 24.17 | ~12x | +| 5 | Selectolax | 82.63 | ~41x | +| 6 | MechanicalSoup | 1549.71 | ~767.1x | +| 7 | BS4 with Lxml | 1584.31 | ~784.3x | +| 8 | BS4 with html5lib | 3391.91 | ~1679.1x | + ### Element Similarity & Text Search Performance @@ -23,5 +22,7 @@ Scrapling's adaptive element finding capabilities significantly outperform alter | Library | Time (ms) | vs Scrapling | |-------------|:---------:|:------------:| -| Scrapling | 2.46 | 1.0x | -| AutoScraper | 13.3 | 5.407x | +| Scrapling | 2.39 | 1.0x | +| AutoScraper | 12.45 | 5.209x | + +> All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology. diff --git a/docs/cli/extract-commands.md b/docs/cli/extract-commands.md index e0e7176ec093e1e0729454467c9294d867e051bc..fa622d0428d95ff2ef58812df17845a0bc21aba0 100644 --- a/docs/cli/extract-commands.md +++ b/docs/cli/extract-commands.md @@ -4,12 +4,12 @@ The `scrapling extract` command lets you download and extract content from websites directly from your terminal without writing any code. Ideal for beginners, researchers, and anyone requiring rapid web data extraction. -> 💡 **Prerequisites:** -> -> 1. You’ve completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use. -> 2. 
You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object. -> 3. You’ve completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class. -> 4. You’ve completed or read at least one page from the fetchers section to use here for requests: [HTTP requests](../fetching/static.md), [Dynamic websites](../fetching/dynamic.md), or [Dynamic websites with hard protections](../fetching/stealthy.md). +!!! success "Prerequisites" + + 1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use. + 2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object. + 3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class. + 4. You've completed or read at least one page from the fetchers section to use here for requests: [HTTP requests](../fetching/static.md), [Dynamic websites](../fetching/dynamic.md), or [Dynamic websites with hard protections](../fetching/stealthy.md). ## What is the Extract Command group? @@ -280,7 +280,7 @@ We will go through each command in detail below. -s, --css-selector TEXT CSS selector to extract specific content from the page. It returns all matches. 
--wait-selector TEXT CSS selector to wait for before proceeding --locale TEXT Specify user locale. Defaults to the system default locale. - ---real-chrome/--no-real-chrome If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False) + --real-chrome/--no-real-chrome If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False) --proxy TEXT Proxy URL in format "http://username:password@host:port" -H, --extra-headers TEXT Extra headers in format "Key: Value" (can be used multiple times) --help Show this message and exit. @@ -320,8 +320,7 @@ We will go through each command in detail below. --solve-cloudflare / --no-solve-cloudflare Solve Cloudflare challenges (default: False) --allow-webgl / --block-webgl Allow WebGL (default: True) --network-idle / --no-network-idle Wait for network idle (default: False) - ---real-chrome/--no-real-chrom If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False) - --hide-canvas/--show-canvas Add noise to canvas operations (default: False) + --real-chrome/--no-real-chrome If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False) --timeout INTEGER Timeout in milliseconds (default: 30000) --wait INTEGER Additional wait time in milliseconds after page load (default: 0) -s, --css-selector TEXT CSS selector to extract specific content from the page. It returns all matches. 
diff --git a/docs/cli/interactive-shell.md b/docs/cli/interactive-shell.md index 7838371872fc30179f0cde8af551579d338caefc..b897ce7970310403eb0ab9348b5f4743f00b4437 100644 --- a/docs/cli/interactive-shell.md +++ b/docs/cli/interactive-shell.md @@ -1,17 +1,17 @@ # Scrapling Interactive Shell Guide - + **Powerful Web Scraping REPL for Developers and Data Scientists** The Scrapling Interactive Shell is an enhanced IPython-based environment designed specifically for Web Scraping tasks. It provides instant access to all Scrapling features, clever shortcuts, automatic page management, and advanced tools, such as conversion of the curl command. -> 💡 **Prerequisites:** -> -> 1. You’ve completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use. -> 2. You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object. -> 3. You’ve completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class. -> 4. You’ve completed or read at least one page from the fetchers section to use here for requests: [HTTP requests](../fetching/static.md), [Dynamic websites](../fetching/dynamic.md), or [Dynamic websites with hard protections](../fetching/stealthy.md). +!!! success "Prerequisites" + + 1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use. + 2. 
You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object. + 3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class. + 4. You've completed or read at least one page from the fetchers section to use here for requests: [HTTP requests](../fetching/static.md), [Dynamic websites](../fetching/dynamic.md), or [Dynamic websites with hard protections](../fetching/stealthy.md). ## Why use the Interactive Shell? @@ -133,7 +133,7 @@ The shell provides a few functions to help you convert curl commands from the br First, you need to copy a request as a curl command like the following: -Copying a request as a curl command from Chrome +Copying a request as a curl command from Chrome - **Convert Curl command to Request Object** @@ -174,7 +174,7 @@ The shell inherits all IPython capabilities: >>> %save filename.py 1-10 # Save commands 1-10 to file >>> # Tab completion works everywhere ->>> page.c # Shows: css, css_first, cookies, etc. +>>> page.c # Shows: css, cookies, headers, etc. >>> Fetcher. # Shows all Fetcher methods >>> # Object inspection diff --git a/docs/development/adaptive_storage_system.md b/docs/development/adaptive_storage_system.md index e8bb07fa46b637e2c7c9353a1bc2facdebe19a8d..788ad6e2bbd85bcb4da2908d09b09cca308ecc84 100644 --- a/docs/development/adaptive_storage_system.md +++ b/docs/development/adaptive_storage_system.md @@ -1,3 +1,5 @@ +# Writing your retrieval system + Scrapling uses SQLite by default, but this tutorial shows how to write your own storage system to store element properties for the `adaptive` feature. 
You might want to use Firebase, for example, and share the database between multiple spiders on different machines. It's a great idea to use an online database like that because spiders can share adaptive data with each other. @@ -54,7 +56,7 @@ class RedisStorage(StorageSystemMixin): orjson.dumps(element_dict) ) - def retrieve(self, identifier: str) -> dict: + def retrieve(self, identifier: str) -> dict | None: # Get data key = f"scrapling:{self._get_base_url()}:{identifier}" data = self.redis.get(key) diff --git a/docs/development/scrapling_custom_types.md b/docs/development/scrapling_custom_types.md index d1dd58571b889d307cb2c917d61029c7bc32afdc..2f638a98f9edc6442f3346a4d5c4887367e6b55d 100644 --- a/docs/development/scrapling_custom_types.md +++ b/docs/development/scrapling_custom_types.md @@ -1,3 +1,5 @@ +# Using Scrapling's custom types + > You can take advantage of the custom-made types for Scrapling and use them outside the library if you want. It's better than copying their code, after all :) ### All current types can be imported alone, like below diff --git a/docs/fetching/choosing.md b/docs/fetching/choosing.md index cfb3114f98b16a7b70042a5f6cfc55e0d1e3ab6c..dcba9b9be49287cb1fcd70b6ce57de3cd4bf37ae 100644 --- a/docs/fetching/choosing.md +++ b/docs/fetching/choosing.md @@ -1,3 +1,5 @@ +# Fetchers basics + ## Introduction Fetchers are classes that can do requests or fetch pages for you easily in a single-line fashion with many features and then return a [Response](#response-object) object. Starting with v0.3, all fetchers have separate classes to keep the session running, so for example, a fetcher that uses a browser will keep the browser open till you finish all your requests through it instead of opening multiple browsers. So it depends on your use case. 
@@ -38,21 +40,22 @@ Then you use it right away without initializing like this, and it will use the d If you want to configure the parser ([Selector class](../parsing/main_classes.md#selector)) that will be used on the response before returning it for you, then do this first: ```python >>> from scrapling.fetchers import Fetcher ->>> Fetcher.configure(adaptive=True, encoding="utf-8", keep_comments=False, keep_cdata=False) # and the rest +>>> Fetcher.configure(adaptive=True, keep_comments=False, keep_cdata=False) # and the rest ``` or ```python >>> from scrapling.fetchers import Fetcher >>> Fetcher.adaptive=True ->>> Fetcher.encoding="utf-8" >>> Fetcher.keep_comments=False >>> Fetcher.keep_cdata=False # and the rest ``` Then, continue your code as usual. -The available configuration arguments are: `adaptive`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the [Selector](../parsing/main_classes.md#selector) class. You can display the current configuration anytime by running `.display_config()`. +The available configuration arguments are: `adaptive`, `adaptive_domain`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the [Selector](../parsing/main_classes.md#selector) class. You can display the current configuration anytime by running `.display_config()`. + +!!! info -> Note: The `adaptive` argument is disabled by default; you must enable it to use that feature. + The `adaptive` argument is disabled by default; you must enable it to use that feature. ### Set parser config per request As you probably understand, the logic above for setting the parser config will apply globally to all requests/fetches made through that class, and it's intended for simplicity. 
@@ -71,7 +74,12 @@ The `Response` object is the same as the [Selector](../parsing/main_classes.md#s >>> page.headers # Response headers >>> page.request_headers # Request headers >>> page.history # Response history of redirections, if any ->>> page.body # Raw response body without any processing +>>> page.body # Raw response body as bytes >>> page.encoding # Response encoding +>>> page.meta # Response metadata dictionary (e.g., proxy used). Mainly helpful with the spiders system. ``` -All fetchers return the `Response` object. \ No newline at end of file +All fetchers return the `Response` object. + +!!! note + + Unlike the [Selector](../parsing/main_classes.md#selector) class, the `Response` class's body is always bytes since v0.4. \ No newline at end of file diff --git a/docs/fetching/dynamic.md b/docs/fetching/dynamic.md index e62d7b1be7157c7c5675cbc6c20f3d3143a85e92..31574a6eb7b2f4aa6e4865d2b5e270238b66e659 100644 --- a/docs/fetching/dynamic.md +++ b/docs/fetching/dynamic.md @@ -1,14 +1,14 @@ -# Introduction +# Fetching dynamic websites Here, we will discuss the `DynamicFetcher` class (formerly `PlayWrightFetcher`). This class provides flexible browser automation with multiple configuration options and little under-the-hood stealth improvements. As we will explain later, to automate the page, you need some knowledge of [Playwright's Page API](https://playwright.dev/python/docs/api/class-page). -> 💡 **Prerequisites:** -> -> 1. You’ve completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use. -> 2. You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object. -> 3. 
You’ve completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class. +!!! success "Prerequisites" + + 1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use. + 2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object. + 3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class. ## Basic Usage You have one primary way to import this Fetcher, which is the same for all fetchers. @@ -20,7 +20,9 @@ Check out how to configure the parsing options [here](choosing.md#parser-configu Now, we will review most of the arguments one by one, using examples. If you want to jump to a table of all arguments for quick reference, [click here](#full-list-of-arguments) -> Note: The async version of the `fetch` method is the `async_fetch` method, of course. +!!! abstract + + The async version of the `fetch` method is `async_fetch`, of course. This fetcher currently provides three main run options that can be combined as desired. @@ -51,10 +53,10 @@ DynamicFetcher.fetch('https://example.com', cdp_url='ws://localhost:9222') Instead of launching a browser locally (Chromium/Google Chrome), you can connect to a remote browser through the [Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/). 
-> Notes: -> -> * There was a `stealth` option here, but it was moved to the `StealthyFetcher` class, as explained on the next page, with additional features since version 0.3.13.
-> * This makes it less confusing for new users, easier to maintain, and provides other benefits, as explained on the [StealthyFetcher page](../fetching/stealthy.md). +!!! note "Notes:" + + * There was a `stealth` option here, but it was moved to the `StealthyFetcher` class, as explained on the next page, with additional features since version 0.3.13.
+ * This makes it less confusing for new users, easier to maintain, and provides other benefits, as explained on the [StealthyFetcher page](../fetching/stealthy.md). ## Full list of arguments Scrapling provides many options with this fetcher and its session classes. To make it as simple as possible, we will list the options here and give examples of how to use most of them. @@ -85,15 +87,19 @@ Scrapling provides many options with this fetcher and its session classes. To ma | extra_flags | A list of additional browser flags to pass to the browser on launch. | ✔️ | | additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ | | selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. | ✔️ | +| blocked_domains | A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too). | ✔️ | +| proxy_rotator | A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy`. | ✔️ | +| retries | Number of retry attempts for failed requests. Defaults to 3. | ✔️ | +| retry_delay | Seconds to wait between retry attempts. Defaults to 1. | ✔️ | -In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, and `selector_config`. +In session classes, all these arguments can be set globally for the session. 
Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `blocked_domains`, `proxy`, and `selector_config`. -> 🔍 Notes: -> -> 1. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading. -> 2. The `google_search` argument is enabled by default for all requests, making the request appear to come from a Google search page. So, a request for `https://example.com` will set the referer to `https://www.google.com/search?q=example`. Also, if used together, it takes priority over the referer set by the `extra_headers` argument. -> 3. Since version 0.3.13, the `stealth` option has been removed here in favor of the `StealthyFetcher` class, and the `hide_canvas` option has been moved to it. The `disable_webgl` argument has been moved to the `StealthyFetcher` class and renamed as `allow_webgl`. -> 4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions. +!!! note "Notes:" + + 1. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading. + 2. The `google_search` argument is enabled by default for all requests, making the request appear to come from a Google search page. So, a request for `https://example.com` will set the referer to `https://www.google.com/search?q=example`. 
Also, if used together, it takes priority over the referer set by the `extra_headers` argument. + 3. Since version 0.3.13, the `stealth` option has been removed here in favor of the `StealthyFetcher` class, and the `hide_canvas` option has been moved to it. The `disable_webgl` argument has been moved to the `StealthyFetcher` class and renamed as `allow_webgl`. + 4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions. ## Examples @@ -106,6 +112,13 @@ It's easier to understand with examples, so let's take a look. page = DynamicFetcher.fetch('https://example.com', disable_resources=True) # Blocks fonts, images, media, etc. ``` +### Domain Blocking + +```python +# Block requests to specific domains (and their subdomains) +page = DynamicFetcher.fetch('https://example.com', blocked_domains={"ads.example.com", "tracker.net"}) +``` + ### Network Control ```python @@ -119,16 +132,41 @@ page = DynamicFetcher.fetch('https://example.com', timeout=30000) # 30 seconds page = DynamicFetcher.fetch('https://example.com', proxy='http://username:password@host:port') ``` +### Proxy Rotation + +```python +from scrapling.fetchers import DynamicSession, ProxyRotator + +# Set up proxy rotation +rotator = ProxyRotator([ + "http://proxy1:8080", + "http://proxy2:8080", + "http://proxy3:8080", +]) + +# Use with session - rotates proxy automatically with each request +with DynamicSession(proxy_rotator=rotator, headless=True) as session: + page1 = session.fetch('https://example1.com') + page2 = session.fetch('https://example2.com') + + # Override rotator for a specific request + page3 = session.fetch('https://example3.com', proxy='http://specific-proxy:8080') +``` + +!!! 
warning + + Remember that by default, all browser-based fetchers and sessions use a persistent browser context with a pool of tabs. However, since browsers can't set a proxy per tab, when you use a `ProxyRotator`, the fetcher will automatically open a separate context for each proxy, with one tab per context. Once the tab's job is done, both the tab and its context are closed. + ### Downloading Files ```python -page = DynamicFetcher.fetch('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png') +page = DynamicFetcher.fetch('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/main_cover.png') -with open(file='poster.png', mode='wb') as f: +with open(file='main_cover.png', mode='wb') as f: f.write(page.body) ``` -The `body` attribute of the `Response` object is a `bytes` object containing the response body in case of non-HTML responses. +The `body` attribute of the `Response` object always returns `bytes`. ### Browser Automation This is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues. @@ -206,7 +244,7 @@ def scrape_dynamic_content(): content = page.css('.content') return { - 'title': content.css_first('h1::text'), + 'title': content.css('h1::text').get(), 'items': [ item.text for item in content.css('.item') ] diff --git a/docs/fetching/static.md b/docs/fetching/static.md index 5b2a2dc9142ddbcf2caa597434666cec95f0ff74..0071785d8f9229c00d2d44cc95f21cb368b3531b 100644 --- a/docs/fetching/static.md +++ b/docs/fetching/static.md @@ -1,12 +1,12 @@ -# Introduction +# HTTP requests The `Fetcher` class provides rapid and lightweight HTTP requests using the high-performance `curl_cffi` library with a lot of stealth capabilities. -> 💡 **Prerequisites:** -> -> 1. 
You’ve completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use. -> 2. You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object. -> 3. You’ve completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class. +!!! success "Prerequisites" + + 1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use. + 2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object. + 3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class. ## Basic Usage You have one primary way to import this Fetcher, which is the same for all fetchers. @@ -31,18 +31,20 @@ All methods for making requests here share some arguments, so let's discuss them - **proxy**: As the name implies, the proxy for this request is used to route all traffic (HTTP and HTTPS). The format accepted here is `http://username:password@localhost:8030`. - **proxy_auth**: HTTP basic auth for proxy, tuple of (username, password). - **proxies**: Dict of proxies to use. Format: `{"http": proxy_url, "https": proxy_url}`. 
+- **proxy_rotator**: A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy` or `proxies`. - **headers**: Headers to include in the request. Can override any header generated by the `stealthy_headers` argument - **max_redirects**: Maximum number of redirects. **Defaults to 30**, use -1 for unlimited. - **verify**: Whether to verify HTTPS certificates. **Defaults to True**. - **cert**: Tuple of (cert, key) filenames for the client certificate. - **selector_config**: A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. -> Note:
-> 1. The currently available browsers to impersonate are (`"edge"`, `"chrome"`, `"chrome_android"`, `"safari"`, `"safari_beta"`, `"safari_ios"`, `"safari_ios_beta"`, `"firefox"`, `"tor"`)
-> 2. The available browsers to impersonate, along with their corresponding versions, are automatically displayed in the argument autocompletion and updated with each `curl_cffi` update.
-> 3. If any of the arguments `impersonate` or `stealthy_headers` are enabled, the fetchers will automatically generate real browser headers that match the browser version used. +!!! note "Notes:" + + 1. The currently available browsers to impersonate are (`"edge"`, `"chrome"`, `"chrome_android"`, `"safari"`, `"safari_beta"`, `"safari_ios"`, `"safari_ios_beta"`, `"firefox"`, `"tor"`)
+ 2. The available browsers to impersonate, along with their corresponding versions, are automatically displayed in the argument autocompletion and updated with each `curl_cffi` update.
+ 3. If any of the arguments `impersonate` or `stealthy_headers` are enabled, the fetchers will automatically generate real browser headers that match the browser version used. -Other than this, for further customization, you can pass any arguments that `curl_cffi` supports for any method if that method doesn't already support it. +Other than this, for further customization, you can pass any arguments that `curl_cffi` supports for any method if that method doesn't already support them. ### HTTP Methods There are additional arguments for each method, depending on the method, such as `params` for GET requests and `data`/`json` for POST/PUT/DELETE requests. @@ -186,19 +188,50 @@ with FetcherSession( page1 = session.get('https://scrapling.requestcatcher.com/get') page2 = session.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}) page3 = session.get('https://api.github.com/events') - + # All requests share the same session and connection pool ``` +You can also use a `ProxyRotator` with `FetcherSession` for automatic proxy rotation across requests: + +```python +from scrapling.fetchers import FetcherSession, ProxyRotator + +rotator = ProxyRotator([ + 'http://proxy1:8080', + 'http://proxy2:8080', + 'http://proxy3:8080', +]) + +with FetcherSession(proxy_rotator=rotator, impersonate='chrome') as session: + # Each request automatically uses the next proxy in rotation + page1 = session.get('https://example.com/page1') + page2 = session.get('https://example.com/page2') + + # You can check which proxy was used via the response metadata + print(page1.meta['proxy']) +``` + +You can also override the session proxy (or rotator) for a specific request by passing `proxy=` directly to the request method: + +```python +with FetcherSession(proxy='http://default-proxy:8080') as session: + # Uses the session proxy + page1 = session.get('https://example.com/page1') + + # Override the proxy for this specific request + page2 = session.get('https://example.com/page2', 
proxy='http://special-proxy:9090') +``` + And here's an async example ```python async with FetcherSession(impersonate='firefox', http3=True) as session: # All standard HTTP methods available - response = async session.get('https://example.com') - response = async session.post('https://scrapling.requestcatcher.com/post', json={'data': 'value'}) - response = async session.put('https://scrapling.requestcatcher.com/put', data={'update': 'info'}) - response = async session.delete('https://scrapling.requestcatcher.com/delete') + response = await session.get('https://example.com') + response = await session.post('https://scrapling.requestcatcher.com/post', json={'data': 'value'}) + response = await session.put('https://scrapling.requestcatcher.com/put', data={'update': 'info'}) + response = await session.delete('https://scrapling.requestcatcher.com/delete') ``` or better ```python @@ -239,11 +272,11 @@ page = Fetcher.get('https://example.com') # Check the status if page.status == 200: # Extract title - title = page.css_first('title::text') + title = page.css('title::text').get() print(f"Page title: {title}") - + # Extract all links - links = page.css('a::attr(href)') + links = page.css('a::attr(href)').getall() print(f"Found {len(links)} links") ``` @@ -261,9 +294,9 @@ def scrape_products(): results = [] for product in products: results.append({ - 'title': product.css_first('.title::text'), - 'price': product.css_first('.price::text').re_first(r'\d+\.\d{2}'), - 'description': product.css_first('.description::text'), + 'title': product.css('.title::text').get(), + 'price': product.css('.price::text').re_first(r'\d+\.\d{2}'), + 'description': product.css('.description::text').get(), 'in_stock': product.has_class('in-stock') }) @@ -275,8 +308,8 @@ def scrape_products(): ```python from scrapling.fetchers import Fetcher -page = Fetcher.get('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/poster.png') -with open(file='poster.png', mode='wb') as f: +page = 
Fetcher.get('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/main_cover.png') +with open(file='main_cover.png', mode='wb') as f: f.write(page.body) ``` @@ -302,8 +335,8 @@ def scrape_all_pages(): # Process products for product in products: all_products.append({ - 'name': product.css_first('.name::text'), - 'price': product.css_first('.price::text') + 'name': product.css('.name::text').get(), + 'price': product.css('.price::text').get() }) # Next page @@ -329,7 +362,7 @@ response = Fetcher.post( # Check login success if response.status == 200: # Extract user info - user_name = response.css_first('.user-name::text') + user_name = response.css('.user-name::text').get() print(f"Logged in as: {user_name}") ``` @@ -342,7 +375,7 @@ def extract_table(): page = Fetcher.get('https://example.com/data') # Find table - table = page.css_first('table') + table = page.css('table')[0] # Extract headers headers = [ @@ -367,12 +400,13 @@ def extract_menu(): page = Fetcher.get('https://example.com') # Find navigation - nav = page.css_first('nav') + nav = page.css('nav')[0] menu = {} for item in nav.css('li'): - link = item.css_first('a') - if link: + links = item.css('a') + if links: + link = links[0] menu[link.text] = { 'url': link['href'], 'has_submenu': bool(item.css('.submenu')) diff --git a/docs/fetching/stealthy.md b/docs/fetching/stealthy.md index fc29d4260b2bc3629ccdb5d6f7a76a46ae5a7bd1..fcbc8ea3eb3d7d0b2185f79ca626b5f473aa4847 100644 --- a/docs/fetching/stealthy.md +++ b/docs/fetching/stealthy.md @@ -1,17 +1,15 @@ -# Introduction +# Fetching dynamic websites with hard protections Here, we will discuss the `StealthyFetcher` class. This class is very similar to the [DynamicFetcher](dynamic.md#introduction) class, including the browsers, the automation, and the use of [Playwright's API](https://playwright.dev/python/docs/intro). 
The main difference is that this class provides advanced anti-bot protection bypass capabilities; most of them are handled automatically under the hood, and the rest is up to you to enable. As with [DynamicFetcher](dynamic.md#introduction), you will need some knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) to automate the page, as we will explain later. -**Note:** _This fetcher was using a custom version of [Camoufox](https://github.com/daijro/camoufox) as an engine before version 0.3.13, which was replaced now with [patchright](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright) for many reasons. See [this section](#using-camoufox-as-an-engine) for information if you still need to use [Camoufox](https://github.com/daijro/camoufox). We might switch back to [Camoufox](https://github.com/daijro/camoufox) in the future if its development continues._ +!!! success "Prerequisites" -> 💡 **Prerequisites:** -> -> 1. You've completed or read the [DynamicFetcher](dynamic.md#introduction) page since this class builds upon it, and we won't repeat the same information here for that reason. -> 2. You’ve completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use. -> 3. You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object. -> 4. You’ve completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class. + 1. 
You've completed or read the [DynamicFetcher](dynamic.md#introduction) page since this class builds upon it, and we won't repeat the same information here for that reason. + 2. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use. + 3. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object. + 4. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class. ## Basic Usage You have one primary way to import this Fetcher, which is the same for all fetchers. @@ -21,7 +19,9 @@ You have one primary way to import this Fetcher, which is the same for all fetch ``` Check out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers) -> Note: The async version of the `fetch` method is the `async_fetch` method, of course. +!!! abstract + + The async version of the `fetch` method is `async_fetch`, of course. ## What does it do? @@ -69,15 +69,19 @@ Scrapling provides many options with this fetcher and its session classes. Befor | allow_webgl | Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended, as many WAFs now check if WebGL is enabled. | ✔️ | | additional_args | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings. | ✔️ | | selector_config | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class. 
| ✔️ | +| blocked_domains | A set of domain names to block requests to. Subdomains are also matched (e.g., `"example.com"` blocks `"sub.example.com"` too). | ✔️ | +| proxy_rotator | A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy`. | ✔️ | +| retries | Number of retry attempts for failed requests. Defaults to 3. | ✔️ | +| retry_delay | Seconds to wait between retry attempts. Defaults to 1. | ✔️ | + +In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `solve_cloudflare`, `blocked_domains`, `proxy`, and `selector_config`. -In session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `solve_cloudflare`, and `selector_config`. +!!! note "Notes:" -> 🔍 Notes: -> -> 1. It's basically the same arguments as [DynamicFetcher](dynamic.md#introduction) class but with these additional arguments `solve_cloudflare`, `block_webrtc`, `hide_canvas`, and `allow_webgl`. -> 2. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading. -> 3. The `google_search` argument is enabled by default for all requests, making the request appear to come from a Google search page. So, a request for `https://example.com` will set the referer to `https://www.google.com/search?q=example`. 
Also, if used together, it takes priority over the referer set by the `extra_headers` argument. -> 4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions. + 1. It's basically the same arguments as [DynamicFetcher](dynamic.md#introduction) class, but with these additional arguments: `solve_cloudflare`, `block_webrtc`, `hide_canvas`, and `allow_webgl`. + 2. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading. + 3. The `google_search` argument is enabled by default for all requests, making the request appear to come from a Google search page. So, a request for `https://example.com` will set the referer to `https://www.google.com/search?q=example`. Also, if used together, it takes priority over the referer set by the `extra_headers` argument. + 4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions. ## Examples It's easier to understand with examples, so we will now review most of the arguments individually. Since it's the same class as the [DynamicFetcher](dynamic.md#introduction), you can refer to that page for more examples, as we won't repeat all the examples from there. @@ -108,11 +112,11 @@ The `solve_cloudflare` parameter enables automatic detection and solving all typ And even solves the custom pages with embedded captcha. -> 🔍 **Important notes:** -> -> 1. 
Sometimes, with websites that use custom implementations, you will need to use `wait_selector` to make sure Scrapling waits for the real website content to be loaded after solving the captcha. Some websites can be the real definition of an edge case while we are trying to make the solver as generic as possible. -> 2. The timeout should be at least 60 seconds when using the Cloudflare solver for sufficient challenge-solving time. -> 3. This feature works seamlessly with proxies and other stealth options. +!!! note "**Important notes:**" + + 1. Sometimes, with websites that use custom implementations, you will need to use `wait_selector` to make sure Scrapling waits for the real website content to be loaded after solving the captcha. Some websites can be the real definition of an edge case while we are trying to make the solver as generic as possible. + 2. The timeout should be at least 60 seconds when using the Cloudflare solver for sufficient challenge-solving time. + 3. This feature works seamlessly with proxies and other stealth options. ### Browser Automation This is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues. 
@@ -172,14 +176,14 @@ def scrape_amazon_product(url): # Extract product details return { - 'title': page.css_first('#productTitle::text').clean(), - 'price': page.css_first('.a-price .a-offscreen::text'), - 'rating': page.css_first('[data-feature-name="averageCustomerReviews"] .a-popover-trigger .a-color-base::text'), + 'title': page.css('#productTitle::text').get().clean(), + 'price': page.css('.a-price .a-offscreen::text').get(), + 'rating': page.css('[data-feature-name="averageCustomerReviews"] .a-popover-trigger .a-color-base::text').get(), 'reviews_count': page.css('#acrCustomerReviewText::text').re_first(r'[\d,]+'), 'features': [ - li.clean() for li in page.css('#feature-bullets li span::text') + li.get().clean() for li in page.css('#feature-bullets li span::text') ], - 'availability': page.css_first('#availability').get_all_text(strip=True), + 'availability': page.css('#availability')[0].get_all_text(strip=True), 'images': [ img.attrib['src'] for img in page.css('#altImages img') ] @@ -248,7 +252,8 @@ In versions 0.3 and 0.3.1, the pool was reusing finished tabs to save more resou - **Memory efficiency**: Better resource usage compared to launching new browsers with each fetch. ## Using Camoufox as an engine -If you see that Camoufox is stable on your device, has no high memory issues, and want to continue using Camoufox as before v0.3.13. This section is for you. + +This fetcher used a custom version of [Camoufox](https://github.com/daijro/camoufox) as an engine before version 0.3.13, which was replaced by [patchright](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright) for many reasons. If you see that Camoufox is stable on your device, has no high memory issues, and you want to continue using it, then you can. 
First, you will need to install the Camoufox library, browser, and Firefox system dependencies if you didn't already: ```commandline diff --git a/docs/index.md b/docs/index.md index e61b4588956a975104d5b0561a5daca6c992e5b0..3e53e335e1c290401be82e0ce72192d5792514d4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,34 +2,46 @@ .md-typeset h1 { display: none; } +[data-md-color-scheme="default"] .only-dark { display: none; } +[data-md-color-scheme="slate"] .only-light { display: none; } +
- poster + Scrapling + Scrapling +
-
- Easy, effortless Web Scraping as it should be! -

-
+

Effortless Web Scraping for the Modern Web


-**Stop fighting anti-bot systems. Stop rewriting selectors after every website update.** +Scrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl. -Scrapling isn't just another Web Scraping library. It's the first **adaptive** scraping library that learns from website changes and evolves with them. While other libraries break when websites update their structure, Scrapling automatically relocates your elements and keeps your scrapers running. +Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation — all in a few lines of Python. One library, zero compromises. -Built for the modern Web, Scrapling features **its own rapid parsing engine** and fetchers to handle all Web Scraping challenges you face or will face. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone. +Blazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone. ```python ->> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher ->> StealthyFetcher.adaptive = True -# Fetch websites' source under the radar! ->> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) ->> print(page.status) -200 ->> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes! ->> # Later, if the website structure changes, pass `adaptive=True` ->> products = page.css('.product', adaptive=True) # and Scrapling still finds them! 
+from scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher +StealthyFetcher.adaptive = True +page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True) # Fetch website under the radar! +products = page.css('.product', auto_save=True) # Scrape data that survives website design changes! +products = page.css('.product', adaptive=True) # Later, if the website structure changes, pass `adaptive=True` to find them! +``` +Or scale up to full crawls +```python +from scrapling.spiders import Spider, Response + +class MySpider(Spider): + name = "demo" + start_urls = ["https://example.com/"] + + async def parse(self, response: Response): + for item in response.css('.product'): + yield {"title": item.css('h2::text').get()} + +MySpider().start() ``` ## Top Sponsors @@ -51,16 +63,27 @@ Built for the modern Web, Scrapling features **its own rapid parsing engine** an ## Key Features +### Spiders — A Full Crawling Framework +- 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects. +- ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays. +- 🔄 **Multi-Session Support**: Unified interface for HTTP requests, and stealthy headless browsers in a single spider — route requests to different sessions by ID. +- 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off. +- 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats — ideal for UI, pipelines, and long-running crawls. +- 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic. +- 📦 **Built-in Export**: Export results through hooks and your own pipeline or the built-in JSON/JSONL with `result.items.to_json()` / `result.items.to_jsonl()` respectively. 
+ ### Advanced Websites Fetching with Session Support - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP/3. -- **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium, and Google's Chrome. -- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can bypass all types of Cloudflare's Turnstile/Interstitial with automation easily. +- **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium and Google's Chrome. +- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can easily bypass all types of Cloudflare's Turnstile/Interstitial with automation. - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests. +- **Proxy Rotation**: Built-in `ProxyRotator` with cyclic or custom rotation strategies across all session types, plus per-request proxy overrides. +- **Domain Blocking**: Block requests to specific domains (and their subdomains) in browser-based fetchers. - **Async Support**: Complete async support across all fetchers and dedicated async session classes. ### Adaptive Scraping & AI Integration - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms. -- 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more. +- 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more. - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements. 
- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE)) @@ -72,12 +95,12 @@ Built for the modern Web, Scrapling features **its own rapid parsing engine** an ### Developer/Web Scraper Friendly Experience - 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping scripts development, like converting curl requests to Scrapling requests and viewing requests results in your browser. -- 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single code! +- 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code! - 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods. - 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations. - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element. - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel. -- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. +- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. The entire codebase is automatically scanned with **PyRight** and **MyPy** with each change. - 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed. 
@@ -86,10 +109,34 @@ Scrapling’s GitHub stars have grown steadily since its release (see chart belo + + ## Installation Scrapling requires Python 3.10 or higher: @@ -98,7 +145,7 @@ Scrapling requires Python 3.10 or higher: pip install scrapling ``` -Starting with v0.3.2, this installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies. +This installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies. ### Optional Dependencies diff --git a/docs/overview.md b/docs/overview.md index ad8dac39e69a4a12007054d11e673cd2947891e4..aa91967ed56985fb41e0e1d14a472d28270fd50e 100644 --- a/docs/overview.md +++ b/docs/overview.md @@ -1,3 +1,17 @@ +## Pick Your Path + +Not sure where to start? Pick the path that matches what you're trying to do: + +| I want to... | Start here | +|:---|:---| +| **Parse HTML** I already have | [Querying elements](parsing/selection.md) — CSS, XPath, and text-based selection | +| **Quickly scrape a page** and prototype | Pick a [fetcher](fetching/choosing.md) and test right away, or launch the [interactive shell](cli/interactive-shell.md) | +| **Build a crawler** that scales | [Spiders](spiders/getting-started.md) — concurrent, multi-session crawls with pause/resume | +| **Scrape without writing code** | [CLI extract commands](cli/extract-commands.md) or hook up the [MCP server](ai/mcp-server.md) to your favourite AI tool | +| **Migrate** from another library | [From BeautifulSoup](tutorials/migrating_from_beautifulsoup.md) or [Scrapy comparison](spiders/architecture.md#comparison-with-scrapy) | + +--- + We will start by quickly reviewing the parsing capabilities. Then we will fetch websites using custom browsers, make requests, and parse the responses. 
Here's an HTML document generated by ChatGPT that we will be using as an example throughout this page: @@ -134,7 +148,7 @@ target_element.find_similar() ``` Find the first element that matches a CSS selector ```python -page.css_first('.product-list [data-id="1"]') +page.css('.product-list [data-id="1"]')[0] ``` Find all elements that match a CSS selector @@ -144,7 +158,7 @@ page.css('.product-list article') ``` Find the first element that matches an XPath selector ```python -page.xpath_first("//*[@id='products']/div/article") +page.xpath("//*[@id='products']/div/article")[0] ``` Find all elements that match an XPath selector @@ -220,14 +234,14 @@ Using the elements we found above [

Customer Revie...' parent='
] >>> section_element.next # gets the next element, the same logic applies to `quote.previous`.

Customer Revie...' parent='
->>> section_element.children.css('h2::text') +>>> section_element.children.css('h2::text').getall() ['Products'] ->>> page.css_first('[data-id="1"]').has_class('product') +>>> page.css('[data-id="1"]')[0].has_class('product') True ``` If your case needs more than the element's parent, you can iterate over the whole ancestors' tree of any element, like the one below ```python -for ancestor in quote.iterancestors(): +for ancestor in section_element.iterancestors(): # do something with it... ``` You can search for a specific ancestor of an element that satisfies a function; all you need to do is pass a function that takes a `Selector` object as an argument and returns `True` if the condition is satisfied or `False` otherwise, like below: @@ -264,11 +278,11 @@ For Async requests, you will replace the import like below: >>> page = await AsyncFetcher.delete('https://scrapling.requestcatcher.com/delete') ``` -> Notes: -> -> 1. You have the `stealthy_headers` argument, which, when enabled, makes requests to generate real browser headers and use them, including a referer header, as if this request came from a Google search of this domain. It's enabled by default. -> 2. The `impersonate` argument lets you fake the TLS fingerprint for a specific browser version. -> 3. There's also the `http3` argument, which, when enabled, makes the fetcher use HTTP/3 for requests, which makes your requests more authentic +!!! note "Notes:" + + 1. You have the `stealthy_headers` argument, which, when enabled, makes requests to generate real browser headers and use them, including a referer header, as if this request came from a Google search of this domain. It's enabled by default. + 2. The `impersonate` argument lets you fake the TLS fingerprint for a specific browser version. + 3. 
There's also the `http3` argument, which, when enabled, makes the fetcher use HTTP/3 for requests, which makes your requests more authentic This is just the tip of the iceberg with this fetcher; check out the rest from [here](fetching/static.md) @@ -279,11 +293,11 @@ The `DynamicFetcher` class (formerly `PlayWrightFetcher`) offers many options fo ```python >>> from scrapling.fetchers import DynamicFetcher >>> page = DynamicFetcher.fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option ->>> page.css_first("#search a::attr(href)") +>>> page.css("#search a::attr(href)").get() 'https://github.com/D4Vinci/Scrapling' >>> # The async version of fetch >>> page = await DynamicFetcher.async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) ->>> page.css_first("#search a::attr(href)") +>>> page.css("#search a::attr(href)").get() 'https://github.com/D4Vinci/Scrapling' ``` It's built on top of [Playwright](https://playwright.dev/python/), and it's currently providing two main run options that can be mixed as you want: @@ -324,7 +338,7 @@ True True ``` -Again, this is just the tip of the iceberg with this fetcher. Check out the rest from [here](fetching/dynamic.md) for all details and the complete list of arguments. +Again, this is just the tip of the iceberg with this fetcher. Check out the rest from [here](fetching/stealthy.md) for all details and the complete list of arguments. --- diff --git a/docs/parsing/adaptive.md b/docs/parsing/adaptive.md index 33396e9cc1457d85dbde81d48bf762f44d95a2f1..23dcaf3c19436f7442380d44ae3a75cac742f78e 100644 --- a/docs/parsing/adaptive.md +++ b/docs/parsing/adaptive.md @@ -1,10 +1,9 @@ -## Introduction +# Adaptive scraping -> 💡 **Prerequisites:** -> -> 1. You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector) object. -> 2. 
You’ve completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) class. ->

+!!! success "Prerequisites" + + 1. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector) object. + 2. You've completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) class. Adaptive scraping (previously known as automatch) is one of Scrapling's most powerful features. It allows your scraper to survive website changes by intelligently tracking and relocating elements. @@ -84,11 +83,11 @@ Now, let's test the same selector in both versions >> Fetcher.configure(adaptive = True, adaptive_domain='stackoverflow.com') >> >> page = Fetcher.get(old_url, timeout=30) ->> element1 = page.css_first(selector, auto_save=True) +>> element1 = page.css(selector, auto_save=True)[0] >> >> # Same selector but used in the updated website >> page = Fetcher.get(new_url) ->> element2 = page.css_first(selector, adaptive=True) +>> element2 = page.css(selector, adaptive=True)[0] >> >> if element1.text == element2.text: ... print('Scrapling found the same element in the old and new designs!') @@ -100,7 +99,9 @@ The code will be the same in a real-world scenario, except it will use the same Hence, in the two examples above, I used both the `Selector` and `Fetcher` classes to show that the adaptive logic is the same. -> Note: the main reason for creating the `adaptive_domain` argument was to handle if the website changed its URL while changing the design/structure. In that case, you can use it to continue using the previously stored adaptive data for the new URL. Otherwise, scrapling will consider it a new website and discard the old data. +!!! info + + The main reason for creating the `adaptive_domain` argument was to handle if the website changed its URL while changing the design/structure. In that case, you can use it to continue using the previously stored adaptive data for the new URL. 
Otherwise, scrapling will consider it a new website and discard the old data. ## How the adaptive scraping feature works Adaptive scraping works in two phases: @@ -144,7 +145,7 @@ Examples: >>> page = Selector(html_doc, adaptive=True) # OR >>> Fetcher.adaptive = True ->>> page = Fetcher.fetch('https://example.com') +>>> page = Fetcher.get('https://example.com') ``` If you are using the [Selector](main_classes.md#selector) class, you need to pass the url of the website you are using with the argument `url` so Scrapling can separate the properties saved for each element by domain. @@ -157,7 +158,7 @@ Now that you've enabled the `adaptive` feature globally, you have two main ways ### The CSS/XPath Selection way As you have seen in the example above, first, you have to use the `auto_save` argument while selecting an element that exists on the page, like below ```python -element = page.css('#p1' auto_save=True) +element = page.css('#p1', auto_save=True) ``` And when the element doesn't exist, you can use the same selector and the `adaptive` argument, and the library will find it for you ```python @@ -165,7 +166,7 @@ element = page.css('#p1', adaptive=True) ``` Pretty simple, eh? -Well, a lot happened under the hood here. Remember the identifier we mentioned before that you need to set to retrieve the element you want? Here, with the `css`/`css_first`/`xpath`/`xpath_first` methods, the identifier is set automatically as the selector you passed here to make things easier :) +Well, a lot happened under the hood here. Remember the identifier we mentioned before that you need to set to retrieve the element you want? Here, with the `css`/`xpath` methods, the identifier is set automatically as the selector you passed here to make things easier :) Additionally, for all these methods, you can pass the `identifier` argument to set it yourself. This is useful in some instances, or you can use it to save properties with the `auto_save` argument. 
@@ -185,7 +186,7 @@ Now, later, when you want to retrieve it and relocate it inside the page with `a >>> element_dict = page.retrieve('my_special_element') >>> page.relocate(element_dict, selector_type=True) [] ->>> page.relocate(element_dict, selector_type=True).css('::text') +>>> page.relocate(element_dict, selector_type=True).css('::text').getall() ['Tipping the Velvet'] ``` Hence, the `retrieve` and `relocate` methods are used. diff --git a/docs/parsing/main_classes.md b/docs/parsing/main_classes.md index ac8af75c296b9a24c1cc5bfec44da52f120abef7..f310037f2a85aab526341bb1ab6e05d634f58fae 100644 --- a/docs/parsing/main_classes.md +++ b/docs/parsing/main_classes.md @@ -1,9 +1,8 @@ -## Introduction +# Parsing main classes -> 💡 **Prerequisites:** -> -> - You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector) object. ->

+!!! success "Prerequisites" + + - You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector) object. After exploring the various ways to select elements with Scrapling and its related features, let's take a step back and examine the [Selector](#selector) class in general, as well as other objects, to gain a better understanding of the parsing engine. @@ -166,10 +165,10 @@ print(article.prettify()) ``` -Use the `.body` property to get the raw content of the page +Use the `.body` property to get the raw content of the page. Starting from v0.4, when used on a `Response` object from fetchers, `.body` always returns `bytes`. ```python >>> page.body -'\n \n Some page\n \n \n
\n
\n

Product 1

\n

This is product 1

\n $10.99\n \n
\n\n
\n

Product 2

\n

This is product 2

\n $20.99\n \n
\n\n
\n

Product 3

\n

This is product 3

\n $15.99\n \n
\n
\n\n \n \n' +'\n \n Some page\n \n ...' ``` To get all the ancestors in the DOM tree of this element ```python @@ -234,7 +233,7 @@ This element returns the same result as the `children` property because its chil Another example of using the element with the `product-list` class will clear the difference between the `children` property and the `below_elements` property ```python ->>> products_list = page.css_first('.product-list') +>>> products_list = page.css('.product-list')[0] >>> products_list.children [, , @@ -263,7 +262,7 @@ Get the next element of the current element The same logic applies to the `previous` property ```python >>> article.previous # It's the first child, so it doesn't have a previous element ->>> second_article = page.css_first('.product[data-id="2"]') +>>> second_article = page.css('.product[data-id="2"]')[0] >>> second_article.previous ``` @@ -277,7 +276,7 @@ If your case needs more than the element's parent, you can iterate over the whol for ancestor in article.iterancestors(): # do something with it... ``` -You can search for a specific ancestor of an element that satisfies a search function; all you need to do is to pass a function that takes a [Selector](#selector) object as an argument and return `True` if the condition satisfies or `False` otherwise, like below: +You can search for a specific ancestor of an element that satisfies a search function; all you need to do is pass a function that takes a [Selector](#selector) object as an argument and return `True` if the condition satisfies or `False` otherwise, like below: ```python >>> article.find_ancestor(lambda ancestor: ancestor.has_class('product-list'))
@@ -288,33 +287,63 @@ You can search for a specific ancestor of an element that satisfies a search fun ## Selectors The class `Selectors` is the "List" version of the [Selector](#selector) class. It inherits from the Python standard `List` type, so it shares all `List` properties and methods while adding more methods to make the operations you want to execute on the [Selector](#selector) instances within more straightforward. -In the [Selector](#selector) class, all methods/properties that should return a group of elements return them as a [Selectors](#selectors) class instance. The only exceptions are when you use the CSS/XPath methods as follows: +In the [Selector](#selector) class, all methods/properties that should return a group of elements return them as a [Selectors](#selectors) class instance. -- If you selected a text node with the selector, then the return type will be [TextHandler](#texthandler)/[TextHandlers](#texthandlers).
Examples: - ```python - >>> page.css('a::text') # -> TextHandlers - >>> page.xpath('//a/text()') # -> TextHandlers - >>> page.css_first('a::text') # -> TextHandler - >>> page.xpath_first('//a/text()') # -> TextHandler - >>> page.css('a::attr(href)') # -> TextHandlers - >>> page.xpath('//a/@href') # -> TextHandlers - >>> page.css_first('a::attr(href)') # -> TextHandler - >>> page.xpath_first('//a/@href') # -> TextHandler - ``` -- If you used a combined selector that returns mixed types, the result will be a Python standard `List`.
Examples: - ```python - >>> page.css('.price_color') # -> Selectors - >>> page.css('.product_pod a::attr(href)') # -> TextHandlers - >>> page.css('.price_color, .product_pod a::attr(href)') # -> List - ``` +Starting with v0.4, all selection methods consistently return [Selector](#selector)/[Selectors](#selectors) objects, even for text nodes and attribute values. Text nodes (selected via `::text`, `/text()`, `::attr()`, `/@attr`) are wrapped in [Selector](#selector) objects. These text node selectors have `tag` set to `"#text"`, and their `text` property returns the text value. You can still access the text value directly, and all other properties return empty/default values gracefully. + +```python +>>> page.css('a::text') # -> Selectors (of text node Selectors) +>>> page.xpath('//a/text()') # -> Selectors +>>> page.css('a::text').get() # -> TextHandler (the first text value) +>>> page.css('a::text').getall() # -> TextHandlers (all text values) +>>> page.css('a::attr(href)') # -> Selectors +>>> page.xpath('//a/@href') # -> Selectors +>>> page.css('.price_color') # -> Selectors +``` + +### Data extraction methods +Starting with v0.4, [Selector](#selector) and [Selectors](#selectors) both provide `get()`, `getall()`, and their aliases `extract_first` and `extract` (following Scrapy conventions). The old `get_all()` method has been removed. + +**On a [Selector](#selector) object:** + +- `get()` returns a `TextHandler` — for text node selectors, it returns the text value; for HTML element selectors, it returns the serialized outer HTML. +- `getall()` returns a `TextHandlers` list containing the single serialized string. +- `extract_first` is an alias for `get()`, and `extract` is an alias for `getall()`. + +```python +>>> page.css('h3')[0].get() # Outer HTML of the element +'

Product 1

' -Let's see what [Selectors](#selectors) class adds to the table with that out of the way. +>>> page.css('h3::text')[0].get() # Text value of the text node +'Product 1' +``` + +**On a [Selectors](#selectors) object:** + +- `get(default=None)` returns the serialized string of the **first** element, or `default` if the list is empty. +- `getall()` serializes **all** elements and returns a `TextHandlers` list. +- `extract_first` is an alias for `get()`, and `extract` is an alias for `getall()`. + +```python +>>> page.css('.price::text').get() # First price text +'$10.99' + +>>> page.css('.price::text').getall() # All price texts +['$10.99', '$20.99', '$15.99'] + +>>> page.css('.price::text').get('') # With default value +'$10.99' +``` + +These methods work seamlessly with all selection types (CSS, XPath, `find`, etc.) and are the recommended way to extract text and attribute values in a Scrapy-compatible style. + +Now, let's see what [Selectors](#selectors) class adds to the table with that out of the way. ### Properties Apart from the standard operations on Python lists, such as iteration and slicing. You can do the following: -Execute CSS and XPath selectors directly on the [Selector](#selector) instances it has, while the arguments and the return types are the same as [Selector](#selector)'s `css` and `xpath` methods. This, of course, makes chaining methods very straightforward. +Execute CSS and XPath selectors directly on the [Selector](#selector) instances it has, while the return types are the same as [Selector](#selector)'s `css` and `xpath` methods. The arguments are similar, except the `adaptive` argument is not available here. This, of course, makes chaining methods very straightforward. ```python >>> page.css('.product_pod a') [
, ...] ``` +You can safely access the first or last element without worrying about index errors: +```python +>>> page.css('.product').first # First Selector or None + +>>> page.css('.product').last # Last Selector or None + +>>> page.css('.nonexistent').first # Returns None instead of raising IndexError +``` + If you are too lazy like me and want to know the number of [Selector](#selector) instances in a [Selectors](#selectors) instance. You can do this: ```python page.css('.product_pod').length @@ -441,14 +479,14 @@ First, we start with the `re` and `re_first` methods. These are the same methods - You also have the `.json()` method, which tries to convert the content to a JSON object quickly if possible; otherwise, it throws an error ```python - >>> page.css_first('#page-data::text') + >>> page.css('#page-data::text').get() '\n {\n "lastUpdated": "2024-09-22T10:30:00Z",\n "totalProducts": 3\n }\n ' - >>> page.css_first('#page-data::text').json() + >>> page.css('#page-data::text').get().json() {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3} ``` Hence, if you didn't specify a text node while selecting an element (like the text content or an attribute text content), the text content will be selected automatically, like this ```python - >>> page.css_first('#page-data').json() + >>> page.css('#page-data')[0].json() {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3} ``` The [Selector](#selector) class adds one thing here, too; let's say this is the page we are working with: @@ -469,12 +507,12 @@ First, we start with the `re` and `re_first` methods. These are the same methods The [Selector](#selector) class has the `get_all_text` method, which you should be aware of by now. This method returns a `TextHandler`, of course.

So, as you know here, if you did something like this ```python - >>> page.css_first('div::text').json() + >>> page.css('div::text').get().json() ``` You will get an error because the `div` tag doesn't have any direct text content that can be serialized to JSON; it doesn't have any direct text content at all.

In this case, the `get_all_text` method comes to the rescue, so you can do something like that ```python - >>> page.css_first('div').get_all_text(ignore_tags=[]).json() + >>> page.css('div')[0].get_all_text(ignore_tags=[]).json() {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3} ``` I used the `ignore_tags` argument here because the default value of it is `('script', 'style',)`, as you are aware.

@@ -493,7 +531,7 @@ First, we start with the `re` and `re_first` methods. These are the same methods {'some_key': 'some_value'} ``` You might wonder how this happened, given that the `html` tag doesn't contain direct text.
- Well, for cases like JSON responses, I made the [Selector](#selector) class keep a raw copy of the content it receives. This way, when you use the `.json()` method, it checks for that raw copy and then converts it to JSON. If the raw copy is not available like the case with the elements, it checks for the current element text content, or otherwise it uses the `get_all_text` method directly.
+ Well, for cases like JSON responses, I made the [Selector](#selector) class keep a raw copy of the content it receives. This way, when you use the `.json()` method, it checks for that raw copy and then converts it to JSON. If the raw copy is unavailable, as is the case with elements, it checks the current element's text content; otherwise, it uses the `get_all_text` method directly.
- Another handy method is `.clean()`, which will remove all white spaces and consecutive spaces for you and return a new `TextHandler` instance ```python @@ -521,7 +559,7 @@ You probably guessed it: This class is similar to [Selectors](#selectors) and [S The only difference is that the `re_first` method logic here runs `re` on each [TextHandler](#texthandler) and returns the first result, or `None`. Nothing new needs to be explained here, but new methods will be added over time. ## AttributesHandler -This is a read-only version of Python's standard dictionary, or `dict`, used solely to store the attributes of each element or [Selector](#selector) instance. +This is a read-only version of Python's standard dictionary, or `dict`, used solely to store the attributes of each element/[Selector](#selector) instance. ```python >>> print(page.find('script').attrib) {'id': 'page-data', 'type': 'application/json'} diff --git a/docs/parsing/selection.md b/docs/parsing/selection.md index 0b3877d7cc977e2b13cb052e680cbe13f21326fe..53113912a177d7a3df5d3b9ac986933a2f1a69a2 100644 --- a/docs/parsing/selection.md +++ b/docs/parsing/selection.md @@ -1,4 +1,4 @@ -## Introduction +# Querying elements Scrapling currently supports parsing HTML pages exclusively, so it doesn't support XML feeds. This decision was made because the adaptive feature won't work with XML, but that might change soon, so stay tuned :) In Scrapling, there are five main ways to find elements: @@ -27,16 +27,16 @@ Also, Scrapling implements some non-standard pseudo-elements like: In short, if you come from Scrapy/Parsel, you will find the same logic for selectors here to make it easier. No need to implement a stranger logic to the one that most of us are used to :) -To select elements with CSS selectors, you have the `css` and `css_first` methods. The latter is ~10% faster and more valuable when you are interested in the first element it finds, or if it's just one element, etc. 
It's beneficial when there's more than one, as it returns `Selectors`. +To select elements with CSS selectors, use the `css` method, which returns `Selectors`. Use `[0]` to get the first element, or `.get()` / `.getall()` to extract text values from text/attribute pseudo-selectors. ### What are XPath selectors? [XPath](https://en.wikipedia.org/wiki/XPath) is a language for selecting nodes in XML documents, which can also be used with HTML. This [cheatsheet](https://devhints.io/xpath) is a good resource for learning about [XPath](https://en.wikipedia.org/wiki/XPath). Scrapling adds XPath selectors directly through [lxml](https://lxml.de/). In short, it is the same situation as CSS Selectors; if you come from Scrapy/Parsel, you will find the same logic for selectors here. However, Scrapling doesn't implement the XPath extension function `has-class` as Scrapy/Parsel does. Instead, it provides the `has_class` method, which can be used on elements returned for the same purpose. -To select elements with XPath selectors, you have the `xpath` and `xpath_first` methods. Again, these methods follow the same logic as the CSS selectors methods above, and `xpath_first` is faster. +To select elements with XPath selectors, you have the `xpath` method. Again, this method follows the same logic as the CSS selectors method above. -> Note that each method of `css`, `css_first`, `xpath`, and `xpath_first` has additional arguments, but we didn't explain them here, as they are all about the adaptive feature. The adaptive feature will have its own page later to be described in detail. +> Note that each method of `css` and `xpath` has additional arguments, but we didn't explain them here, as they are all about the adaptive feature. The adaptive feature will have its own page later to be described in detail. ### Selectors examples Let's see some shared examples of using CSS and XPath Selectors. @@ -46,43 +46,40 @@ Select all elements with the class `product`. 
products = page.css('.product') products = page.xpath('//*[@class="product"]') ``` -Note: The XPath one won't be accurate if there's another class; **it's always better to rely on CSS for selecting by class** +!!! info "Note:" + + The XPath one won't be accurate if there's another class; **it's always better to rely on CSS for selecting by class** Select the first element with the class `product`. ```python -product = page.css_first('.product') -product = page.xpath_first('//*[@class="product"]') -``` -Which would be the same as doing (but a bit slower) -```python product = page.css('.product')[0] product = page.xpath('//*[@class="product"]')[0] ``` Get the text of the first element with the `h1` tag name ```python -title = page.css_first('h1::text') -title = page.xpath_first('//h1//text()') +title = page.css('h1::text').get() +title = page.xpath('//h1//text()').get() ``` -Which is again the same as doing +Which is the same as doing ```python -title = page.css_first('h1').text -title = page.xpath_first('//h1').text +title = page.css('h1')[0].text +title = page.xpath('//h1')[0].text ``` -Get the `href` attribute of the first element with the `a` tag name +Get the `href` attribute of the first element with the `a` tag name ```python -link = page.css_first('a::attr(href)') -link = page.xpath_first('//a/@href') +link = page.css('a::attr(href)').get() +link = page.xpath('//a/@href').get() ``` Select the text of the first element with the `h1` tag name, which contains `Phone`, and under an element with class `product`. 
```python -title = page.css_first('.product h1:contains("Phone")::text') -title = page.page.xpath_first('//*[@class="product"]//h1[contains(text(),"Phone")]/text()') +title = page.css('.product h1:contains("Phone")::text').get() +title = page.xpath('//*[@class="product"]//h1[contains(text(),"Phone")]/text()').get() ``` You can nest and chain selectors as you want, given that they return results ```python -page.css_first('.product').css_first('h1:contains("Phone")::text') -page.xpath_first('//*[@class="product"]').xpath_first('//h1[contains(text(),"Phone")]/text()') -page.xpath_first('//*[@class="product"]').css_first('h1:contains("Phone")::text') +page.css('.product')[0].css('h1:contains("Phone")::text').get() +page.xpath('//*[@class="product"]')[0].xpath('//h1[contains(text(),"Phone")]/text()').get() +page.xpath('//*[@class="product"]')[0].css('h1:contains("Phone")::text').get() ``` Another example @@ -91,7 +88,7 @@ All links that have 'image' in their 'href' attribute links = page.css('a[href*="image"]') links = page.xpath('//a[contains(@href, "image")]') for index, link in enumerate(links): - link_value = link.attrib['href'] # Cleaner than link.css('::attr(href)') + link_value = link.attrib['href'] # Cleaner than link.css('::attr(href)').get() link_text = link.text print(f'Link number {index} points to this url {link_value} with text content as "{link_text}"') ``` @@ -114,7 +111,9 @@ By default, Scrapling searches for the exact matching of the text/pattern you pa * **partial**: If enabled, `find_by_text` will return elements that contain the input text. So it's not an exact match anymore -Note: The method `find_by_regex` can accept both regular strings and a compiled regex pattern as its first argument, as you will see in the upcoming examples. +!!! abstract "Note:" + + The method `find_by_regex` can accept both regular strings and a compiled regex pattern as its first argument, as you will see in the upcoming examples. 
### Finding Similar Elements One of the most remarkable new features Scrapling puts on the table is the ability to tell Scrapling to find elements similar to the element at hand. This feature's inspiration came from the AutoScraper library, but in Scrapling, it can be used on elements found by any method. Most of its usage would likely occur after finding elements through text content, similar to how AutoScraper works, making it convenient to explain here. @@ -239,9 +238,9 @@ To increase the complexity a little bit, let's say we want to get all the books' ```python >>> for product in element.parent.parent.find_similar(): print({ - "name": product.css_first('h3 a::text'), - "price": product.css_first('.price_color').re_first(r'[\d\.]+'), - "stock": product.css('.availability::text')[-1].clean() + "name": product.css('h3 a::text').get(), + "price": product.css('.price_color')[0].re_first(r'[\d\.]+'), + "stock": product.css('.availability::text').getall()[-1].clean() }) {'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'} {'name': 'Soumission', 'price': '50.10', 'stock': 'In stock'} @@ -264,10 +263,10 @@ def extract_product_grid(page): return [ { - 'name': p.css_first('h3::text'), - 'price': p.css_first('.price::text').re_first(r'\d+\.\d{2}'), + 'name': p.css('h3::text').get(), + 'price': p.css('.price::text').re_first(r'\d+\.\d{2}'), 'stock': 'In stock' in p.text, - 'rating': p.css_first('.rating').attrib.get('data-rating') + 'rating': p.css('.rating')[0].attrib.get('data-rating') } for p in products ] @@ -276,16 +275,16 @@ Table Row Extraction ```python def extract_table_data(page): # Find the first data row - first_row = page.css_first('table tbody tr') + first_row = page.css('table tbody tr')[0] # Find similar rows rows = first_row.find_similar() return [ { - 'column1': row.css_first('td:nth-child(1)::text'), - 'column2': row.css_first('td:nth-child(2)::text'), - 'column3': row.css_first('td:nth-child(3)::text') + 'column1': 
row.css('td:nth-child(1)::text').get(), + 'column2': row.css('td:nth-child(2)::text').get(), + 'column3': row.css('td:nth-child(3)::text').get() } for row in rows ] @@ -294,7 +293,7 @@ Form Field Extraction ```python def extract_form_fields(page): # Find first form field container - first_field = page.css_first('input').find_ancestor( + first_field = page.css('input')[0].find_ancestor( lambda e: e.has_class('form-field') ) @@ -303,9 +302,9 @@ def extract_form_fields(page): return [ { - 'label': f.css_first('label::text'), - 'type': f.css_first('input').attrib.get('type'), - 'required': 'required' in f.css_first('input').attrib + 'label': f.css('label::text').get(), + 'type': f.css('input')[0].attrib.get('type'), + 'required': 'required' in f.css('input')[0].attrib } for f in fields ] @@ -324,9 +323,9 @@ def extract_reviews(page): return [ { - 'text': r.css_first('.review-text::text'), + 'text': r.css('.review-text::text').get(), 'rating': r.attrib.get('data-rating'), - 'author': r.css_first('.reviewer::text') + 'author': r.css('.reviewer::text').get() } for r in all_reviews ] @@ -354,10 +353,10 @@ It filters all elements in the current page/element in the following order: 3. All elements that match all passed regex patterns are collected, or if previous filter(s) are used, then previously collected elements are filtered. 4. All elements that fulfill all passed function(s) are collected; if a previous filter(s) is used, then previously collected elements are filtered. -Notes: +!!! note "Notes:" -1. As you probably understood, the filtering process always starts from the first filter it finds in the filtering order above. So, if no tag name(s) are passed but attributes are passed, the process starts from that step (number 2), and so on. -2. The order in which you pass the arguments doesn't matter. The only order considered is the one explained above. + 1. 
As you probably understood, the filtering process always starts from the first filter it finds in the filtering order above. So, if no tag name(s) are passed but attributes are passed, the process starts from that step (number 2), and so on. + 2. The order in which you pass the arguments doesn't matter. The only order considered is the one explained above. Check examples to clear any confusion :) @@ -396,10 +395,10 @@ Find all elements with a class that equals `quote`. ``` Find all div elements with a class that equals `quote` and contains the element `.text`, which contains the word 'world' in its content. ```python ->>> page.find_all('div', {'class': 'quote'}, lambda e: "world" in e.css_first('.text::text')) +>>> page.find_all('div', {'class': 'quote'}, lambda e: "world" in e.css('.text::text').get()) [
>> page.find_all({'itemtype':"http://schema.org/CreativeWork"}, 'div').css('.author::text').getall() ['Albert Einstein', 'J.K. Rowling', ...] @@ -473,15 +472,16 @@ Generate a full XPath selector for the `url_element` element from the start of t >>> url_element.generate_full_xpath_selector '//body/div/div[2]/div/div/span[2]/a' ``` -> Note:
-> When you tell Scrapling to create a short selector, it tries to find a unique element to use in generation as a stop point, like an element with an `id` attribute, but in our case, there wasn't any, so that's why the short and the full selector will be the same. +!!! abstract "Note:" + + When you tell Scrapling to create a short selector, it tries to find a unique element to use in generation as a stop point, like an element with an `id` attribute, but in our case, there wasn't any, so that's why the short and the full selector will be the same. ## Using selectors with regular expressions Similar to `parsel`/`scrapy`, `re` and `re_first` methods are available for extracting data using regular expressions. However, unlike the former libraries, these methods are in nearly all classes like `Selector`/`Selectors`/`TextHandler` and `TextHandlers`, which means you can use them directly on the element even if you didn't select a text node. We will have a deep look at it while explaining the [TextHandler](main_classes.md#texthandler) class, but in general, it works like the examples below: ```python ->>> page.css_first('.price_color').re_first(r'[\d\.]+') +>>> page.css('.price_color')[0].re_first(r'[\d\.]+') '51.77' >>> page.css('.price_color').re_first(r'[\d\.]+') diff --git a/docs/requirements.txt b/docs/requirements.txt index c2c5d541e12cacf13751f24c66dc0643f8a16fcb..ff7edb01990897e61718540af2305dfedb4304b3 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,8 +1,8 @@ -mkdocstrings>=1.0.0 -mkdocstrings-python>=2.0.1 +zensical>=0.0.23 +mkdocstrings>=1.0.3 +mkdocstrings-python>=2.0.2 griffe-inherited-docstrings griffe-runtime-objects griffe-sphinx -mkdocs-material[imaging]>=9.7.1 -black>=25.12.0 +black>=26.1.0 pngquant \ No newline at end of file diff --git a/docs/spiders/advanced.md b/docs/spiders/advanced.md new file mode 100644 index 0000000000000000000000000000000000000000..1a363b92ca8914d713df565ee5c0471f56f8f790 --- /dev/null +++ 
b/docs/spiders/advanced.md @@ -0,0 +1,313 @@ +# Advanced usages + +## Introduction + +!!! success "Prerequisites" + + 1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider. + +This page covers the spider system's advanced features: concurrency control, pause/resume, streaming, lifecycle hooks, statistics, and logging. + +## Concurrency Control + +The spider system uses three class attributes to control how aggressively it crawls: + +| Attribute | Default | Description | +|----------------------------------|---------|------------------------------------------------------------------| +| `concurrent_requests` | `4` | Maximum number of requests being processed at the same time | +| `concurrent_requests_per_domain` | `0` | Maximum concurrent requests per domain (0 = no per-domain limit) | +| `download_delay` | `0.0` | Seconds to wait before each request | + +```python +class PoliteSpider(Spider): + name = "polite" + start_urls = ["https://example.com"] + + # Be gentle with the server + concurrent_requests = 4 + concurrent_requests_per_domain = 2 + download_delay = 1.0 # Wait 1 second between requests + + async def parse(self, response: Response): + yield {"title": response.css("title::text").get("")} +``` + +When `concurrent_requests_per_domain` is set, each domain gets its own concurrency limiter in addition to the global limit. This is useful when crawling multiple domains simultaneously — you can allow high global concurrency while being polite to each individual domain. + +!!! tip + + The `download_delay` parameter adds a fixed wait before every request, regardless of the domain. Use it for simple rate limiting. 
+ +### Using uvloop + +The `start()` method accepts a `use_uvloop` parameter to use the faster [uvloop](https://github.com/MagicStack/uvloop)/[winloop](https://github.com/nicktimko/winloop) event loop implementation, if available: + +```python +result = MySpider().start(use_uvloop=True) +``` + +This can improve throughput for I/O-heavy crawls. You'll need to install `uvloop` (Linux/macOS) or `winloop` (Windows) separately. + +## Pause & Resume + +The spider supports graceful pause-and-resume via checkpointing. To enable it, pass a `crawldir` directory to the spider constructor: + +```python +spider = MySpider(crawldir="crawl_data/my_spider") +result = spider.start() + +if result.paused: + print("Crawl was paused. Run again to resume.") +else: + print("Crawl completed!") +``` + +### How It Works + +1. **Pausing**: Press `Ctrl+C` during a crawl. The spider waits for all in-flight requests to finish, saves a checkpoint (pending requests + a set of seen request fingerprints), and then exits. +2. **Force stopping**: Press `Ctrl+C` a second time to stop immediately without waiting for active tasks. +3. **Resuming**: Run the spider again with the same `crawldir`. It detects the checkpoint, restores the queue and seen set, and continues from where it left off — skipping `start_requests()`. +4. **Cleanup**: When a crawl completes normally (not paused), the checkpoint files are deleted automatically. + +**Checkpoints are also saved periodically during the crawl (every 5 minutes by default).** + +You can change the interval as follows: + +```python +# Save checkpoint every 2 minutes +spider = MySpider(crawldir="crawl_data/my_spider", interval=120.0) +``` + +The writing to the disk is atomic, so it's totally safe. + +!!! tip + + Pressing `Ctrl+C` during a crawl always causes the spider to close gracefully, even if the checkpoint system is not enabled. Doing it again without waiting forces the spider to close immediately. 
+ +### Knowing If You're Resuming + +The `on_start()` hook receives a `resuming` flag: + +```python +async def on_start(self, resuming: bool = False): + if resuming: + self.logger.info("Resuming from checkpoint!") + else: + self.logger.info("Starting fresh crawl") +``` + +## Streaming + +For long-running spiders or applications that need real-time access to scraped items, use the `stream()` method instead of `start()`: + +```python +import anyio + +async def main(): + spider = MySpider() + async for item in spider.stream(): + print(f"Got item: {item}") + # Access real-time stats + print(f"Items so far: {spider.stats.items_scraped}") + print(f"Requests made: {spider.stats.requests_count}") + +anyio.run(main) +``` + +Key differences from `start()`: + +- `stream()` must be called from an async context +- Items are yielded one by one as they're scraped, not collected into a list +- You can access `spider.stats` during iteration for real-time statistics + +!!! abstract + + The full list of all stats that can be accessed by `spider.stats` is explained below [here](#results--statistics) + +You can use it with the checkpoint system too, so it's easy to build UI on top of spiders. UIs that have real-time data and can be paused/resumed. + +```python +import anyio + +async def main(): + spider = MySpider(crawldir="crawl_data/my_spider") + async for item in spider.stream(): + print(f"Got item: {item}") + # Access real-time stats + print(f"Items so far: {spider.stats.items_scraped}") + print(f"Requests made: {spider.stats.requests_count}") + +anyio.run(main) +``` +You can also use `spider.pause()` to shut down the spider in the code above. If you used it without enabling the checkpoint system, it will just close the crawl. + +## Lifecycle Hooks + +The spider provides several hooks you can override to add custom behavior at different stages of the crawl: + +### on_start + +Called before crawling begins. 
Use it for setup tasks like loading data or initializing resources: + +```python +async def on_start(self, resuming: bool = False): + self.logger.info("Spider starting up") + # Load seed URLs from a database, initialize counters, etc. +``` + +### on_close + +Called after crawling finishes (whether completed or paused). Use it for cleanup: + +```python +async def on_close(self): + self.logger.info("Spider shutting down") + # Close database connections, flush buffers, etc. +``` + +### on_error + +Called when a request fails with an exception. Use it for error tracking or custom recovery logic: + +```python +async def on_error(self, request: Request, error: Exception): + self.logger.error(f"Failed: {request.url} - {error}") + # Log to error tracker, save failed URL for later, etc. +``` + +### on_scraped_item + +Called for every scraped item before it's added to the results. Return the item (modified or not) to keep it, or return `None` to drop it: + +```python +async def on_scraped_item(self, item: dict) -> dict | None: + # Drop items without a title + if not item.get("title"): + return None + + # Modify items (e.g., add timestamps) + item["scraped_at"] = "2026-01-01" + return item +``` + +!!! tip + + This hook can also be used to direct items through your own pipelines and drop them from the spider. 
+ +### start_requests + +Override `start_requests()` for custom initial request generation instead of using `start_urls`: + +```python +async def start_requests(self): + # POST request to log in first + yield Request( + "https://example.com/login", + method="POST", + data={"user": "admin", "pass": "secret"}, + callback=self.after_login, + ) + +async def after_login(self, response: Response): + # Now crawl the authenticated pages + yield response.follow("/dashboard", callback=self.parse) +``` + +## Results & Statistics + +The `CrawlResult` returned by `start()` contains both the scraped items and detailed statistics: + +```python +result = MySpider().start() + +# Items +print(f"Total items: {len(result.items)}") +result.items.to_json("output.json", indent=True) + +# Did the crawl complete? +print(f"Completed: {result.completed}") +print(f"Paused: {result.paused}") + +# Statistics +stats = result.stats +print(f"Requests: {stats.requests_count}") +print(f"Failed: {stats.failed_requests_count}") +print(f"Blocked: {stats.blocked_requests_count}") +print(f"Offsite filtered: {stats.offsite_requests_count}") +print(f"Items scraped: {stats.items_scraped}") +print(f"Items dropped: {stats.items_dropped}") +print(f"Response bytes: {stats.response_bytes}") +print(f"Duration: {stats.elapsed_seconds:.1f}s") +print(f"Speed: {stats.requests_per_second:.1f} req/s") +``` + +### Detailed Stats + +The `CrawlStats` object tracks granular information: + +```python +stats = result.stats + +# Status code distribution +print(stats.response_status_count) +# {'status_200': 150, 'status_404': 3, 'status_403': 1} + +# Bytes downloaded per domain +print(stats.domains_response_bytes) +# {'example.com': 1234567, 'api.example.com': 45678} + +# Requests per session +print(stats.sessions_requests_count) +# {'http': 120, 'stealth': 34} + +# Proxies used during the crawl +print(stats.proxies) +# ['http://proxy1:8080', 'http://proxy2:8080'] + +# Log level counts +print(stats.log_levels_counter) +# 
{'debug': 200, 'info': 50, 'warning': 3, 'error': 1, 'critical': 0} + +# Timing information +print(stats.start_time) # Unix timestamp when crawl started +print(stats.end_time) # Unix timestamp when crawl finished +print(stats.download_delay) # The download delay used (seconds) + +# Concurrency settings used +print(stats.concurrent_requests) # Global concurrency limit +print(stats.concurrent_requests_per_domain) # Per-domain concurrency limit + +# Custom stats (set by your spider code) +print(stats.custom_stats) +# {'login_attempts': 3, 'pages_with_errors': 5} + +# Export everything as a dict +print(stats.to_dict()) +``` + +## Logging + +The spider has a built-in logger accessible via `self.logger`. It's pre-configured with the spider's name and supports several customization options: + +| Attribute | Default | Description | +|-----------------------|--------------------------------------------------------------|----------------------------------------------------| +| `logging_level` | `logging.DEBUG` | Minimum log level | +| `logging_format` | `"[%(asctime)s]:({spider_name}) %(levelname)s: %(message)s"` | Log message format | +| `logging_date_format` | `"%Y-%m-%d %H:%M:%S"` | Date format in log messages | +| `log_file` | `None` | Path to a log file (in addition to console output) | + +```python +import logging + +class MySpider(Spider): + name = "my_spider" + start_urls = ["https://example.com"] + logging_level = logging.INFO + log_file = "logs/my_spider.log" + + async def parse(self, response: Response): + self.logger.info(f"Processing {response.url}") + yield {"title": response.css("title::text").get("")} +``` + +The log file directory is created automatically if it doesn't exist. Both console and file output use the same format. 
\ No newline at end of file diff --git a/docs/spiders/architecture.md b/docs/spiders/architecture.md new file mode 100644 index 0000000000000000000000000000000000000000..09b61e497ebe17914c7390b65e87029252a1eaca --- /dev/null +++ b/docs/spiders/architecture.md @@ -0,0 +1,98 @@ +# Spiders architecture + +!!! success "Prerequisites" + + 1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand the different fetcher types and when to use each one. + 2. You've completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) and [Response](../fetching/choosing.md#response-object) classes. + +Scrapling's spider system is a Scrapy-inspired async crawling framework designed for concurrent, multi-session crawls with built-in pause/resume support. It brings together Scrapling's parsing engine and fetchers into a unified crawling API while adding scheduling, concurrency control, and checkpointing. + +If you're familiar with Scrapy, you'll feel right at home. If not, don't worry — the system is designed to be straightforward. + +## Data Flow + +The diagram below shows how data flows through the spider system when a crawl is running: + +Spider architecture diagram by @TrueSkills + +Here's what happens step by step when you run a spider without many details: + +1. The **Spider** produces the first batch of `Request` objects. By default, it creates one request for each URL in `start_urls`, but you can override `start_requests()` for custom logic. +2. The **Scheduler** receives requests and places them in a priority queue, and creates fingerprints for them. Higher-priority requests are dequeued first. +3. The **Crawler Engine** asks the **Scheduler** to dequeue the next request, respecting concurrency limits (global and per-domain) and download delays. 
Once the **Crawler Engine** receives the request, it passes it to the **Session Manager**, which routes it to the correct session based on the request's `sid` (session ID). +4. The **session** fetches the page and returns a [Response](../fetching/choosing.md#response-object) object to the **Crawler Engine**. The engine records statistics and checks for blocked responses. If the response is blocked, the engine retries the request up to `max_blocked_retries` times. Of course, the blocking detection and the retry logic for blocked requests can be customized. +5. The **Crawler Engine** passes the [Response](../fetching/choosing.md#response-object) to the request's callback. The callback either yields a dictionary, which gets treated as a scraped item, or a follow-up request, which gets sent to the scheduler for queuing. +6. The cycle repeats from step 2 until the scheduler is empty and no tasks are active, or the spider is paused. +7. If `crawldir` is set while starting the spider, the **Crawler Engine** periodically saves a checkpoint (pending requests + seen URLs set) to disk. On graceful shutdown (Ctrl+C), a final checkpoint is saved. The next time the spider runs with the same `crawldir`, it resumes from where it left off — skipping `start_requests()` and restoring the scheduler state. + + +## Components + +### Spider + +The central class you interact with. You subclass `Spider`, define your `start_urls` and `parse()` method, and optionally configure sessions and override lifecycle hooks. 
+ +```python +from scrapling.spiders import Spider, Response, Request + +class MySpider(Spider): + name = "my_spider" + start_urls = ["https://example.com"] + + async def parse(self, response: Response): + for link in response.css("a::attr(href)").getall(): + yield response.follow(link, callback=self.parse_page) + + async def parse_page(self, response: Response): + yield {"title": response.css("h1::text").get("")} +``` + +### Crawler Engine + +The engine orchestrates the entire crawl. It manages the main loop, enforces concurrency limits, dispatches requests through the Session Manager, and processes results from callbacks. You don't interact with it directly — the `Spider.start()` and `Spider.stream()` methods handle it for you. + +### Scheduler + +A priority queue with built-in URL deduplication. Requests are fingerprinted based on their URL, HTTP method, body, and session ID. The scheduler supports `snapshot()` and `restore()` for the checkpoint system, allowing the crawl state to be saved and resumed. + +### Session Manager + +Manages one or more named session instances. Each session is one of: + +- [FetcherSession](../fetching/static.md) +- [AsyncDynamicSession](../fetching/dynamic.md) +- [AsyncStealthySession](../fetching/stealthy.md) + +When a request comes in, the Session Manager routes it to the correct session based on the request's `sid` field. Sessions can be started with the spider start (default) or lazily (started on the first use). + +### Checkpoint System + +An optional system that, if enabled, saves the crawler's state (pending requests + seen URL fingerprints) to a pickle file on disk. Writes are atomic (temp file + rename) to prevent corruption. Checkpoints are saved periodically at a configurable interval and on graceful shutdown. Upon successful completion (not paused), checkpoint files are automatically cleaned up. + +### Output + +Scraped items are collected in an `ItemList` (a list subclass with `to_json()` and `to_jsonl()` export methods). 
Crawl statistics are tracked in a `CrawlStats` dataclass which contains a lot of useful info. + + +## Comparison with Scrapy + +If you're coming from Scrapy, here's how Scrapling's spider system maps: + +| Concept | Scrapy | Scrapling | +|--------------------|-------------------------------|-----------------------------------------------------------------| +| Spider definition | `scrapy.Spider` subclass | `scrapling.spiders.Spider` subclass | +| Initial requests | `start_requests()` | `async start_requests()` | +| Callbacks | `def parse(self, response)` | `async def parse(self, response)` | +| Following links | `response.follow(url)` | `response.follow(url)` | +| Item output | `yield dict` or `yield Item` | `yield dict` | +| Request scheduling | Scheduler + Dupefilter | Scheduler with built-in deduplication | +| Downloading | Downloader + Middlewares | Session Manager with multi-session support | +| Item processing | Item Pipelines | `on_scraped_item()` hook | +| Blocked detection | Through custom middlewares | Built-in `is_blocked()` + `retry_blocked_request()` hooks | +| Concurrency | `CONCURRENT_REQUESTS` setting | `concurrent_requests` class attribute | +| Domain filtering | `allowed_domains` | `allowed_domains` | +| Pause/Resume | `JOBDIR` setting | `crawldir` constructor argument | +| Export | Feed exports | `result.items.to_json()` / `to_jsonl()` or custom through hooks | +| Running | `scrapy crawl spider_name` | `MySpider().start()` | +| Streaming | N/A | `async for item in spider.stream()` | +| Multi-session | N/A | Multiple sessions with different types per spider | \ No newline at end of file diff --git a/docs/spiders/getting-started.md b/docs/spiders/getting-started.md new file mode 100644 index 0000000000000000000000000000000000000000..bb547b4adae4d88a7e449b269800a3c00e2633a9 --- /dev/null +++ b/docs/spiders/getting-started.md @@ -0,0 +1,159 @@ +# Getting started + +## Introduction + +!!! success "Prerequisites" + + 1. 
You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand the different fetcher types and when to use each one. + 2. You've completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) and [Response](../fetching/choosing.md#response-object) classes. + 3. You've read the [Architecture](architecture.md) page for a high-level overview of how the spider system works. + +The spider system lets you build concurrent, multi-page crawlers in just a few lines of code. If you've used Scrapy before, the patterns will feel familiar. If not, this guide will walk you through everything you need to get started. + +## Your First Spider + +A spider is a class that defines how to crawl and extract data from websites. Here's the simplest possible spider: + +```python +from scrapling.spiders import Spider, Response + +class QuotesSpider(Spider): + name = "quotes" + start_urls = ["https://quotes.toscrape.com"] + + async def parse(self, response: Response): + for quote in response.css("div.quote"): + yield { + "text": quote.css("span.text::text").get(""), + "author": quote.css("small.author::text").get(""), + } +``` + +Every spider needs three things: + +1. **`name`** — A unique identifier for the spider. +2. **`start_urls`** — A list of URLs to start crawling from. +3. **`parse()`** — An async generator method that processes each response and yields results. + +The `parse()` method is where the magic happens. You use the same selection methods you'd use with Scrapling's [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object), and `yield` dictionaries to output scraped items. + +## Running the Spider + +To run your spider, create an instance and call `start()`: + +```python +result = QuotesSpider().start() +``` + +The `start()` method handles all the async machinery internally — no need to worry about event loops. 
While the spider is running, everything that happens is logged to the terminal, and at the end of the crawl, you get very detailed stats. + +Those stats are in the returned `CrawlResult` object, which gives you everything you need: + +```python +result = QuotesSpider().start() + +# Access scraped items +for item in result.items: + print(item["text"], "-", item["author"]) + +# Check statistics +print(f"Scraped {result.stats.items_scraped} items") +print(f"Made {result.stats.requests_count} requests") +print(f"Took {result.stats.elapsed_seconds:.1f} seconds") + +# Did the crawl finish or was it paused? +print(f"Completed: {result.completed}") +``` + +## Following Links + +Most crawls need to follow links across multiple pages. Use `response.follow()` to create follow-up requests: + +```python +from scrapling.spiders import Spider, Response + +class QuotesSpider(Spider): + name = "quotes" + start_urls = ["https://quotes.toscrape.com"] + + async def parse(self, response: Response): + # Extract items from the current page + for quote in response.css("div.quote"): + yield { + "text": quote.css("span.text::text").get(""), + "author": quote.css("small.author::text").get(""), + } + + # Follow the "next page" link + next_page = response.css("li.next a::attr(href)").get() + if next_page: + yield response.follow(next_page, callback=self.parse) +``` + +`response.follow()` handles relative URLs automatically — it joins them with the current page's URL. It also sets the current page as the `Referer` header by default. + +You can point follow-up requests at different callback methods for different page types: + +```python +async def parse(self, response: Response): + for link in response.css("a.product-link::attr(href)").getall(): + yield response.follow(link, callback=self.parse_product) + +async def parse_product(self, response: Response): + yield { + "name": response.css("h1::text").get(""), + "price": response.css(".price::text").get(""), + } +``` + +!!! 
note + + All callback methods must be async generators (using `async def` and `yield`). + +## Exporting Data + +The `ItemList` returned in `result.items` has built-in export methods: + +```python +result = QuotesSpider().start() + +# Export as JSON +result.items.to_json("quotes.json") + +# Export as JSON with pretty-printing +result.items.to_json("quotes.json", indent=True) + +# Export as JSON Lines (one JSON object per line) +result.items.to_jsonl("quotes.jsonl") +``` + +Both methods create parent directories automatically if they don't exist. + +## Filtering Domains + +Use `allowed_domains` to restrict the spider to specific domains. This prevents it from accidentally following links to external websites: + +```python +class MySpider(Spider): + name = "my_spider" + start_urls = ["https://example.com"] + allowed_domains = {"example.com"} + + async def parse(self, response: Response): + for link in response.css("a::attr(href)").getall(): + # Links to other domains are silently dropped + yield response.follow(link, callback=self.parse) +``` + +Subdomains are matched automatically — setting `allowed_domains = {"example.com"}` also allows `sub.example.com`, `blog.example.com`, etc. + +When a request is filtered out, it's counted in `stats.offsite_requests_count` so you can see how many were dropped. + +## What's Next + +Now that you have the basics, you can explore: + +- [Requests & Responses](requests-responses.md) — learn about request priority, deduplication, metadata, and more. +- [Sessions](sessions.md) — use multiple fetcher types (HTTP, browser, stealth) in a single spider. +- [Proxy management & blocking](proxy-blocking.md) — rotate proxies across requests and how to handle blocking in the spider. +- [Advanced features](advanced.md) — concurrency control, pause/resume, streaming, lifecycle hooks, and logging. 
\ No newline at end of file diff --git a/docs/spiders/proxy-blocking.md b/docs/spiders/proxy-blocking.md new file mode 100644 index 0000000000000000000000000000000000000000..4c829b9a23029f7bb1498a2d645dc11c97e8181f --- /dev/null +++ b/docs/spiders/proxy-blocking.md @@ -0,0 +1,244 @@ +# Proxy management and handling Blocks + +## Introduction + +!!! success "Prerequisites" + + 1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider. + 2. You've read the [Sessions](sessions.md) page and understand how to configure sessions. + +When scraping at scale, you'll often need to rotate through multiple proxies to avoid rate limits and blocks. Scrapling's `ProxyRotator` makes this straightforward — it works with all session types and integrates with the spider's blocked request retry system. + +If you don't know what a proxy is or how to choose a good one, [this guide can help](https://substack.thewebscraping.club/p/everything-about-proxies). + +## ProxyRotator + +The `ProxyRotator` class manages a list of proxies and rotates through them automatically. Pass it to any session type via the `proxy_rotator` parameter: + +```python +from scrapling.spiders import Spider, Response +from scrapling.fetchers import FetcherSession, ProxyRotator + +class MySpider(Spider): + name = "my_spider" + start_urls = ["https://example.com"] + + def configure_sessions(self, manager): + rotator = ProxyRotator([ + "http://proxy1:8080", + "http://proxy2:8080", + "http://user:pass@proxy3:8080", + ]) + manager.add("default", FetcherSession(proxy_rotator=rotator)) + + async def parse(self, response: Response): + # Check which proxy was used + print(f"Proxy used: {response.meta.get('proxy')}") + yield {"title": response.css("title::text").get("")} +``` + +Each request automatically gets the next proxy in the rotation. The proxy used is stored in `response.meta["proxy"]` so you can track which proxy fetched which page. 
+ + +When you use it with browser sessions, you will need some adjustments, like below: + +```python +from scrapling.fetchers import AsyncDynamicSession, AsyncStealthySession, ProxyRotator + +# String proxies work for all session types +rotator = ProxyRotator([ + "http://proxy1:8080", + "http://proxy2:8080", +]) + +# Dict proxies (Playwright format) work for browser sessions +rotator = ProxyRotator([ + {"server": "http://proxy1:8080", "username": "user", "password": "pass"}, + {"server": "http://proxy2:8080"}, +]) + +# Then inside the spider +def configure_sessions(self, manager): + rotator = ProxyRotator(["http://proxy1:8080", "http://proxy2:8080"]) + manager.add("browser", AsyncStealthySession(proxy_rotator=rotator)) +``` + +!!! info + + 1. You cannot use the `proxy_rotator` argument together with the static `proxy` or `proxies` parameters on the same session. Pick one approach when configuring the session, and override it per request later if you want, as we will show later. + 2. Remember that by default, all browser-based sessions use a persistent browser context with a pool of tabs. However, since browsers can't set a proxy per tab, when you use a `ProxyRotator`, the fetcher will automatically open a separate context for each proxy, with one tab per context. Once the tab's job is done, both the tab and its context are closed. + +## Custom Rotation Strategies + +By default, `ProxyRotator` uses cyclic rotation — it iterates through proxies sequentially, wrapping around at the end. + +You can provide a custom strategy function to change this behavior, but it has to match the below signature: + +```python +from scrapling.core._types import ProxyType + +def my_strategy(proxies: list, current_index: int) -> tuple[ProxyType, int]: + ... +``` + +It receives the list of proxies and the current index, and must return the chosen proxy and the next index. + +Below are some examples of custom rotation strategies you can use. 
+ +### Random Rotation + +```python +import random +from scrapling.fetchers import ProxyRotator + +def random_strategy(proxies, current_index): + idx = random.randint(0, len(proxies) - 1) + return proxies[idx], idx + +rotator = ProxyRotator( + ["http://proxy1:8080", "http://proxy2:8080", "http://proxy3:8080"], + strategy=random_strategy, +) +``` + +### Weighted Rotation + +```python +import random + +def weighted_strategy(proxies, current_index): + # First proxy gets 60% of traffic, others split the rest + weights = [60] + [40 // (len(proxies) - 1)] * (len(proxies) - 1) + proxy = random.choices(proxies, weights=weights, k=1)[0] + return proxy, current_index # Index doesn't matter for weighted + +rotator = ProxyRotator(proxies, strategy=weighted_strategy) +``` + + +## Per-Request Proxy Override + +You can override the rotator for individual requests by passing `proxy=` as a keyword argument: + +```python +async def parse(self, response: Response): + # This request uses the rotator's next proxy + yield response.follow("/page1", callback=self.parse_page) + + # This request uses a specific proxy, bypassing the rotator + yield response.follow( + "/special-page", + callback=self.parse_page, + proxy="http://special-proxy:8080", + ) +``` + +This is useful when certain pages require a specific proxy (e.g., a geo-located proxy for region-specific content). + +## Blocked Request Handling + +The spider has built-in blocked request detection and retry. By default, it considers the following HTTP status codes blocked: `401`, `403`, `407`, `429`, `444`, `500`, `502`, `503`, `504`. + +The retry system works like this: + +1. After a response comes back, the spider calls the `is_blocked(response)` method. +2. If blocked, it copies the request and calls the `retry_blocked_request()` method so you can modify it before retrying. +3. The retried request is re-queued with `dont_filter=True` (bypassing deduplication) and lower priority, so it's not retried right away. +4. 
This repeats up to `max_blocked_retries` times (default: 3). + +!!! tip + + 1. On retry, the previous `proxy`/`proxies` kwargs are cleared from the request automatically, so the rotator assigns a fresh proxy. + 2. The `max_blocked_retries` attribute is different than the session retries and doesn't share the counter. + +### Custom Block Detection + +Override `is_blocked()` to add your own detection logic: + +```python +class MySpider(Spider): + name = "my_spider" + start_urls = ["https://example.com"] + + async def is_blocked(self, response: Response) -> bool: + # Check status codes (default behavior) + if response.status in {403, 429, 503}: + return True + + # Check response content + body = response.body.decode("utf-8", errors="ignore") + if "access denied" in body.lower() or "rate limit" in body.lower(): + return True + + return False + + async def parse(self, response: Response): + yield {"title": response.css("title::text").get("")} +``` + +### Customizing Retries + +Override `retry_blocked_request()` to modify the request before retrying. 
The `max_blocked_retries` attribute controls how many times a blocked request is retried (default: 3): + +```python +from scrapling.spiders import Spider, SessionManager, Request, Response +from scrapling.fetchers import FetcherSession, AsyncStealthySession + + +class MySpider(Spider): + name = "my_spider" + start_urls = ["https://example.com"] + max_blocked_retries = 5 + + def configure_sessions(self, manager: SessionManager) -> None: + manager.add('requests', FetcherSession(impersonate=['chrome', 'firefox', 'safari'])) + manager.add('stealth', AsyncStealthySession(block_webrtc=True), lazy=True) + + async def retry_blocked_request(self, request: Request, response: Response) -> Request: + request.sid = "stealth" + self.logger.info(f"Retrying blocked request: {request.url}") + return request + + async def parse(self, response: Response): + yield {"title": response.css("title::text").get("")} +``` + +What happened above is that I left the blocking detection logic unchanged and had the spider mainly use requests until it got blocked, then switch to the stealthy browser. 
+ + +Putting it all together: + +```python +from scrapling.spiders import Spider, SessionManager, Request, Response +from scrapling.fetchers import FetcherSession, AsyncStealthySession, ProxyRotator + + +cheap_proxies = ProxyRotator([ "http://proxy1:8080", "http://proxy2:8080"]) + +# A format acceptable by the browser +expensive_proxies = ProxyRotator([ + {"server": "http://residential_proxy1:8080", "username": "user", "password": "pass"}, + {"server": "http://residential_proxy2:8080", "username": "user", "password": "pass"}, + {"server": "http://mobile_proxy1:8080", "username": "user", "password": "pass"}, + {"server": "http://mobile_proxy2:8080", "username": "user", "password": "pass"}, +]) + + +class MySpider(Spider): + name = "my_spider" + start_urls = ["https://example.com"] + max_blocked_retries = 5 + + def configure_sessions(self, manager: SessionManager) -> None: + manager.add('requests', FetcherSession(impersonate=['chrome', 'firefox', 'safari'], proxy_rotator=cheap_proxies)) + manager.add('stealth', AsyncStealthySession(block_webrtc=True, proxy_rotator=expensive_proxies), lazy=True) + + async def retry_blocked_request(self, request: Request, response: Response) -> Request: + request.sid = "stealth" + self.logger.info(f"Retrying blocked request: {request.url}") + return request + + async def parse(self, response: Response): + yield {"title": response.css("title::text").get("")} +``` +The above logic is: requests are made with cheap proxies, such as datacenter proxies, until they are blocked, then retried with higher-quality proxies, such as residential or mobile proxies. \ No newline at end of file diff --git a/docs/spiders/requests-responses.md b/docs/spiders/requests-responses.md new file mode 100644 index 0000000000000000000000000000000000000000..c587af8dc58fd4592f9d7e6e51de33dc840f3c23 --- /dev/null +++ b/docs/spiders/requests-responses.md @@ -0,0 +1,202 @@ +# Requests & Responses + +!!! success "Prerequisites" + + 1. 
You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider. + +This page covers the `Request` object in detail — how to construct requests, pass data between callbacks, control priority and deduplication, and use `response.follow()` for link-following. + +## The Request Object + +A `Request` represents a URL to be fetched. You create requests either directly or via `response.follow()`: + +```python +from scrapling.spiders import Request + +# Direct construction +request = Request( + "https://example.com/page", + callback=self.parse_page, + priority=5, +) + +# Via response.follow (preferred in callbacks) +request = response.follow("/page", callback=self.parse_page) +``` + +Here are all the arguments you can pass to `Request`: + +| Argument | Type | Default | Description | +|---------------|------------|------------|-------------------------------------------------------------------------------------------------------| +| `url` | `str` | *required* | The URL to fetch | +| `sid` | `str` | `""` | Session ID — routes the request to a specific session (see [Sessions](sessions.md)) | +| `callback` | `callable` | `None` | Async generator method to process the response. Defaults to `parse()` | +| `priority` | `int` | `0` | Higher values are processed first | +| `dont_filter` | `bool` | `False` | If `True`, skip deduplication (allow duplicate requests) | +| `meta` | `dict` | `{}` | Arbitrary metadata passed through to the response | +| `**kwargs` | | | Additional keyword arguments passed to the session's fetch method (e.g., `headers`, `method`, `data`) | + +Any extra keyword arguments are forwarded directly to the underlying session. For example, to make a POST request: + +```python +yield Request( + "https://example.com/api", + method="POST", + data={"key": "value"}, + callback=self.parse_result, +) +``` + +## Response.follow() + +`response.follow()` is the recommended way to create follow-up requests inside callbacks. 
It offers several advantages over constructing `Request` objects directly: + +- **Relative URLs** are resolved automatically against the current page URL +- **Referer header** is set to the current page URL by default +- **Session kwargs** from the original request are inherited (headers, proxy settings, etc.) +- **Callback, session ID, and priority** are inherited from the original request if not specified + +```python +async def parse(self, response: Response): + # Minimal — inherits callback, sid, priority from current request + yield response.follow("/next-page") + + # Override specific fields + yield response.follow( + "/product/123", + callback=self.parse_product, + priority=10, + ) + + # Pass additional metadata to + yield response.follow( + "/details", + callback=self.parse_details, + meta={"category": "electronics"}, + ) +``` + +| Argument | Type | Default | Description | +|--------------------|------------|------------|------------------------------------------------------------| +| `url` | `str` | *required* | URL to follow (absolute or relative) | +| `sid` | `str` | `""` | Session ID (inherits from original request if empty) | +| `callback` | `callable` | `None` | Callback method (inherits from original request if `None`) | +| `priority` | `int` | `None` | Priority (inherits from original request if `None`) | +| `dont_filter` | `bool` | `False` | Skip deduplication | +| `meta` | `dict` | `None` | Metadata (merged with existing response meta) | +| **`referer_flow`** | `bool` | `True` | Set current URL as Referer header | +| `**kwargs` | | | Merged with original request's session kwargs | + +### Disabling Referer Flow + +By default, `response.follow()` sets the `Referer` header to the current page URL. To disable this: + +```python +yield response.follow("/page", referer_flow=False) +``` + +## Callbacks + +Callbacks are async generator methods on your spider that process responses. 
They must `yield` one of three types:
+
+- **`dict`** — A scraped item, added to the results
+- **`Request`** — A follow-up request, added to the queue
+- **`None`** — Silently ignored
+
+```python
+class MySpider(Spider):
+    name = "my_spider"
+    start_urls = ["https://example.com"]
+
+    async def parse(self, response: Response):
+        # Yield items (dicts)
+        yield {"url": response.url, "title": response.css("title::text").get("")}
+
+        # Yield follow-up requests
+        for link in response.css("a::attr(href)").getall():
+            yield response.follow(link, callback=self.parse_page)
+
+    async def parse_page(self, response: Response):
+        yield {"content": response.css("article::text").get("")}
+```
+
+!!! tip "Note:"
+
+    All callback methods must be `async def` and use `yield` (not `return`). Even if a callback only yields items with no follow-up requests, it must still be an async generator.
+
+## Request Priority
+
+Requests with higher priority values are processed first. This is useful when some pages are more important and should be processed before others:
+
+```python
+async def parse(self, response: Response):
+    # High priority — process product pages first
+    for link in response.css("a.product::attr(href)").getall():
+        yield response.follow(link, callback=self.parse_product, priority=10)
+
+    # Low priority — pagination links processed after products
+    next_page = response.css("a.next::attr(href)").get()
+    if next_page:
+        yield response.follow(next_page, callback=self.parse, priority=0)
+```
+
+When using `response.follow()`, the priority is inherited from the original request unless you specify a new one.
+
+## Deduplication
+
+The spider automatically deduplicates requests based on a fingerprint computed from the URL, HTTP method, request body, and session ID. If two requests produce the same fingerprint, the second one is silently dropped.
+ +To allow duplicate requests (e.g., re-visiting a page after login), set `dont_filter=True`: + +```python +yield Request("https://example.com/dashboard", dont_filter=True, callback=self.parse_dashboard) + +# Or with response.follow +yield response.follow("/dashboard", dont_filter=True, callback=self.parse_dashboard) +``` + +You can fine-tune what goes into the fingerprint using class attributes on your spider: + +| Attribute | Default | Effect | +|----------------------|---------|-----------------------------------------------------------------------------------------------------------------| +| `fp_include_kwargs` | `False` | Include extra request kwargs (arguments you passed to the session fetch, like headers, etc.) in the fingerprint | +| `fp_keep_fragments` | `False` | Keep URL fragments (`#section`) when computing fingerprints | +| `fp_include_headers` | `False` | Include request headers in the fingerprint | + +For example, if you need to treat `https://example.com/page#section1` and `https://example.com/page#section2` as different URLs: + +```python +class MySpider(Spider): + name = "my_spider" + fp_keep_fragments = True + # ... +``` + +## Request Meta + +The `meta` dictionary lets you pass arbitrary data between callbacks. 
This is useful when you need context from one page to process another: + +```python +async def parse(self, response: Response): + for product in response.css("div.product"): + category = product.css("span.category::text").get("") + link = product.css("a::attr(href)").get() + if link: + yield response.follow( + link, + callback=self.parse_product, + meta={"category": category}, + ) + +async def parse_product(self, response: Response): + yield { + "name": response.css("h1::text").get(""), + "price": response.css(".price::text").get(""), + # Access meta from the request + "category": response.meta.get("category", ""), + } +``` + +When using `response.follow()`, the meta from the current response is merged with the new meta you provide (new values take precedence). + +The spider system also automatically stores some metadata. For example, the proxy used for a request is available as `response.meta["proxy"]` when proxy rotation is enabled. \ No newline at end of file diff --git a/docs/spiders/sessions.md b/docs/spiders/sessions.md new file mode 100644 index 0000000000000000000000000000000000000000..d922ee18fe41af1dbb165fce4c690ef55a1713cc --- /dev/null +++ b/docs/spiders/sessions.md @@ -0,0 +1,218 @@ +# Spiders sessions + +!!! success "Prerequisites" + + 1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider. + 2. You're familiar with [Fetchers basics](../fetching/choosing.md) and the differences between HTTP, Dynamic, and Stealthy sessions. + +A spider can use multiple fetcher sessions simultaneously — for example, a fast HTTP session for simple pages and a stealth browser session for protected pages. This page shows you how to configure and use sessions. + +## What are Sessions? + +As you should already know, a session is a pre-configured fetcher instance that stays alive for the duration of the crawl. 
Instead of creating a new connection or browser for every request, the spider reuses sessions, which is faster and more resource-efficient. + +By default, every spider creates a single [FetcherSession](../fetching/static.md). You can add more sessions or swap the default by overriding the `configure_sessions()` method, but you have to use the async version of each session only, as the table shows below: + + +| Session Type | Use Case | +|-------------------------------------------------|------------------------------------------| +| [FetcherSession](../fetching/static.md) | Fast HTTP requests, no JavaScript | +| [AsyncDynamicSession](../fetching/dynamic.md) | Browser automation, JavaScript rendering | +| [AsyncStealthySession](../fetching/stealthy.md) | Anti-bot bypass, Cloudflare, etc. | + + +## Configuring Sessions + +Override `configure_sessions()` on your spider to set up sessions. The `manager` parameter is a `SessionManager` instance — use `manager.add()` to register sessions: + +```python +from scrapling.spiders import Spider, Response +from scrapling.fetchers import FetcherSession + +class MySpider(Spider): + name = "my_spider" + start_urls = ["https://example.com"] + + def configure_sessions(self, manager): + manager.add("default", FetcherSession()) + + async def parse(self, response: Response): + yield {"title": response.css("title::text").get("")} +``` + +The `manager.add()` method takes: + +| Argument | Type | Default | Description | +|--------------|-----------|------------|----------------------------------------------| +| `session_id` | `str` | *required* | A name to reference this session in requests | +| `session` | `Session` | *required* | The session instance | +| `default` | `bool` | `False` | Make this the default session | +| `lazy` | `bool` | `False` | Start the session only when first used | + +!!! note "Notes:" + + 1. In all requests, if you don't specify which session to use, the default session is used. 
The default session is determined in one of two ways:
+        1. The first session you add to the manager becomes the default automatically.
+        2. The session that gets `default=True` while added to the manager.
+    2. The instances you pass of each session don't have to be already started by you; the spider checks on all sessions if they are not already started and starts them.
+    3. If you want a specific session to start when used only, then use the `lazy` argument while adding that session to the manager. Example: start the browser only when you need it, not with the spider start.
+
+## Multi-Session Spider
+
+Here's a practical example: use a fast HTTP session for listing pages and a stealth browser for detail pages that have bot protection:
+
+```python
+from scrapling.spiders import Spider, Response
+from scrapling.fetchers import FetcherSession, AsyncStealthySession
+
+class ProductSpider(Spider):
+    name = "products"
+    start_urls = ["https://shop.example.com/products"]
+
+    def configure_sessions(self, manager):
+        # Fast HTTP for listing pages (default)
+        manager.add("http", FetcherSession())
+
+        # Stealth browser for protected product pages
+        manager.add("stealth", AsyncStealthySession(
+            headless=True,
+            network_idle=True,
+        ))
+
+    async def parse(self, response: Response):
+        for link in response.css("a.product::attr(href)").getall():
+            # Route product pages through the stealth session
+            yield response.follow(link, sid="stealth", callback=self.parse_product)
+
+        next_page = response.css("a.next::attr(href)").get()
+        if next_page:
+            yield response.follow(next_page)
+
+    async def parse_product(self, response: Response):
+        yield {
+            "name": response.css("h1::text").get(""),
+            "price": response.css(".price::text").get(""),
+        }
+```
+
+The key is the `sid` parameter — it tells the spider which session to use for each request. When you call `response.follow()` without `sid`, the session ID from the original request is inherited.
+ +Note that the sessions don't have to be from different classes only, but can be the same session, but different instances with different configurations, for example, like below: + +```python +from scrapling.spiders import Spider, Response +from scrapling.fetchers import FetcherSession + +class ProductSpider(Spider): + name = "products" + start_urls = ["https://shop.example.com/products"] + + def configure_sessions(self, manager): + chrome_requests = FetcherSession(impersonate="chrome") + firefox_requests = FetcherSession(impersonate="firefox") + + manager.add("chrome", chrome_requests) + manager.add("firefox", firefox_requests) + + async def parse(self, response: Response): + for link in response.css("a.product::attr(href)").getall(): + yield response.follow(link, callback=self.parse_product) + + next_page = response.css("a.next::attr(href)").get() + if next_page: + yield response.follow(next_page, sid="firefox") + + async def parse_product(self, response: Response): + yield { + "name": response.css("h1::text").get(""), + "price": response.css(".price::text").get(""), + } +``` + +Or you can separate concerns and keep a session with its cookies/state for specific requests, etc... + +## Session Arguments + +Extra keyword arguments passed to a `Request` (or through `response.follow(**kwargs)`) are forwarded to the session's fetch method. This lets you customize individual requests without changing the session configuration: + +```python +async def parse(self, response: Response): + # Pass extra headers for this specific request + yield Request( + "https://api.example.com/data", + headers={"Authorization": "Bearer token123"}, + callback=self.parse_api, + ) + + # Use a different HTTP method + yield Request( + "https://example.com/submit", + method="POST", + data={"field": "value"}, + sid="firefox", + callback=self.parse_result, + ) +``` + +!!! 
warning + + Normally, when you use `FetcherSession`, `Fetcher`, or `AsyncFetcher`, you specify the HTTP method to use with the corresponding method like `.get()` and `.post()`. But while using `FetcherSession` in spiders, you can't do this. By default, the request is an _HTTP GET_ request; if you want to use another HTTP method, you have to pass it to the `method` argument, as in the above example. The reason for this is to unify the `Request` interface across all session types. + +For browser sessions (`AsyncDynamicSession`, `AsyncStealthySession`), you can pass browser-specific arguments like `wait_selector`, `page_action`, or `extra_headers`: + +```python +async def parse(self, response: Response): + # Use Cloudflare solver with the `AsyncStealthySession` we configured above + yield Request( + "https://nopecha.com/demo/cloudflare", + sid="stealth", + callback=self.parse_result, + solve_cloudflare=True, + block_webrtc=True, + hide_canvas=True, + google_search=True, + ) + + yield response.follow( + "/dynamic-page", + sid="browser", + callback=self.parse_dynamic, + wait_selector="div.loaded", + network_idle=True, + ) +``` + +!!! warning + + Session arguments (**kwargs) passed from the original request are inherited by `response.follow()`. New kwargs take precedence over inherited ones. 
+ +```python +from scrapling.spiders import Spider, Response +from scrapling.fetchers import FetcherSession + +class ProductSpider(Spider): + name = "products" + start_urls = ["https://shop.example.com/products"] + + def configure_sessions(self, manager): + manager.add("http", FetcherSession(impersonate='chrome')) + + async def parse(self, response: Response): + # I don't want the follow request to impersonate a desktop Chrome like the previous request, but a mobile one + # so I override it like this + for link in response.css("a.product::attr(href)").getall(): + yield response.follow(link, impersonate="chrome131_android", callback=self.parse_product) + + next_page = response.css("a.next::attr(href)").get() + if next_page: + yield Request(next_page) + + async def parse_product(self, response: Response): + yield { + "name": response.css("h1::text").get(""), + "price": response.css(".price::text").get(""), + } +``` +!!! info + + No need to mention that, upon spider closure, the manager automatically checks whether any sessions are still running and closes them before closing the spider. 
\ No newline at end of file diff --git a/docs/tutorials/migrating_from_beautifulsoup.md b/docs/tutorials/migrating_from_beautifulsoup.md index e5474bbdeb8b785892465a9bbaf8259091ec156f..9abda95c907307c1f6e1bb0a1375cc6fceb633aa 100644 --- a/docs/tutorials/migrating_from_beautifulsoup.md +++ b/docs/tutorials/migrating_from_beautifulsoup.md @@ -18,10 +18,10 @@ You will notice that some shortcuts in BeautifulSoup are missing in Scrapling, w | Finding a single element (Example 4) | `element = soup.find(lambda e: len(list(e.children)) > 0)` | `element = page.find(lambda e: len(e.children) > 0)` | | Finding a single element (Example 5) | `element = soup.find(["a", "b"])` | `element = page.find(["a", "b"])` | | Find element by its text content | `element = soup.find(text="some text")` | `element = page.find_by_text("some text", partial=False)` | -| Using CSS selectors to find the first matching element | `elements = soup.select_one('div.example')` | `elements = page.css_first('div.example')` | +| Using CSS selectors to find the first matching element | `elements = soup.select_one('div.example')` | `elements = page.css('div.example').first` | | Using CSS selectors to find all matching element | `elements = soup.select('div.example')` | `elements = page.css('div.example')` | | Get a prettified version of the page/element source | `prettified = soup.prettify()` | `prettified = page.prettify()` | -| Get a Non-pretty version of the page/element source | `source = str(soup)` | `source = page.body` | +| Get a Non-pretty version of the page/element source | `source = str(soup)` | `source = page.html_content` | | Get tag name of an element | `name = element.name` | `name = element.tag` | | Extracting text content of an element | `string = element.string` | `string = element.text` | | Extracting all the text in a document or beneath a tag | `text = soup.get_text(strip=True)` | `text = page.get_all_text(strip=True)` | @@ -36,14 +36,16 @@ You will notice that some shortcuts in 
BeautifulSoup are missing in Scrapling, w | Searching for elements in the siblings of an element | `target_sibling = element.find_next_siblings("a")`
`target_sibling = element.find_previous_siblings("a")` | `target_sibling = element.siblings.filter(lambda s: s.tag == 'a')` | | Searching for an element in the next elements of an element | `target_parent = element.find_next("a")` | `target_parent = element.below_elements.search(lambda p: p.tag == 'a')` | | Searching for elements in the next elements of an element | `target_parent = element.find_all_next("a")` | `target_parent = element.below_elements.filter(lambda p: p.tag == 'a')` | -| Searching for an element in the previous elements of an element | `target_parent = element.find_previous("a")` | `target_parent = element.path.search(lambda p: p.tag == 'a')` | -| Searching for elements in the previous elements of an element | `target_parent = element.find_all_previous("a")` | `target_parent = element.path.filter(lambda p: p.tag == 'a')` | +| Searching for an element in the ancestors of an element | `target_parent = element.find_previous("a")` ¹ | `target_parent = element.path.search(lambda p: p.tag == 'a')` | +| Searching for elements in the ancestors of an element | `target_parent = element.find_all_previous("a")` ¹ | `target_parent = element.path.filter(lambda p: p.tag == 'a')` | | Get previous sibling of an element | `prev_element = element.previous_sibling` | `prev_element = element.previous` | | Navigating to children | `children = list(element.children)` | `children = element.children` | | Get all descendants of an element | `children = list(element.descendants)` | `children = element.below_elements` | | Filtering a group of elements that satisfies a condition | `group = soup.find('p', 'story').css.filter('a')` | `group = page.find_all('p', 'story').filter(lambda p: p.tag == 'a')` | +¹ **Note:** BS4's `find_previous`/`find_all_previous` searches all preceding elements in document order, while Scrapling's `path` only returns ancestors (the parent chain). These are not exact equivalents, but ancestor search covers the most common use case. 
+ **One key point to remember**: BeautifulSoup offers features for modifying and manipulating the page after it has been parsed. Scrapling focuses more on scraping the page faster for you, and then you can do what you want with the extracted information. So, two different tools can be used in Web Scraping, but one of them specializes in Web Scraping :) ### Putting It All Together @@ -80,12 +82,12 @@ for link in links: As you can see, Scrapling simplifies the process by combining fetching and parsing into a single step, making your code cleaner and more efficient. -**Additional Notes:** +!!! abstract "**Additional Notes:**" -- **Different parsers**: BeautifulSoup allows you to set the parser engine to use, and one of them is `lxml`. Scrapling doesn't do that and uses the `lxml` library by default for performance reasons. -- **Element Types**: In BeautifulSoup, elements are `Tag` objects; in Scrapling, they are `Selector` objects. However, they provide similar methods and properties for navigation and data extraction. -- **Error Handling**: Both libraries return `None` when an element is not found (e.g., `soup.find()` or `page.css_first()`). To avoid errors, check for `None` before accessing properties. -- **Text Extraction**: Scrapling provides additional methods for handling text through `TextHandler`, such as `clean()`, which can help remove extra whitespace, consecutive spaces, or unwanted characters. Please check out the documentation for the complete list. + - **Different parsers**: BeautifulSoup allows you to set the parser engine to use, and one of them is `lxml`. Scrapling doesn't do that and uses the `lxml` library by default for performance reasons. + - **Element Types**: In BeautifulSoup, elements are `Tag` objects; in Scrapling, they are `Selector` objects. However, they provide similar methods and properties for navigation and data extraction. 
+ - **Error Handling**: Both libraries return `None` when an element is not found (e.g., `soup.find()` or `page.find()`). In Scrapling, `page.css()` returns an empty `Selectors` list when no elements match, and you can use `page.css('.foo').first` to safely get the first match or `None`. To avoid errors, check for `None` or empty results before accessing properties. + - **Text Extraction**: Scrapling provides additional methods for handling text through `TextHandler`, such as `clean()`, which can help remove extra whitespace, consecutive spaces, or unwanted characters. Please check out the documentation for the complete list. The documentation provides more details on Scrapling's features and the complete list of arguments that can be passed to all methods. diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index c8b97092959045ec5d8f7934ed15bb82f0ab02be..0000000000000000000000000000000000000000 --- a/mkdocs.yml +++ /dev/null @@ -1,180 +0,0 @@ -site_name: Scrapling -site_description: Scrapling - Easy, effortless Web Scraping as it should be! -site_author: Karim Shoair -repo_url: https://github.com/D4Vinci/Scrapling -site_url: https://scrapling.readthedocs.io/en/latest/ -repo_name: D4Vinci/Scrapling -copyright: Copyright © 2025 Karim Shoair -
Change cookie settings - -theme: - name: material - language: en - logo: assets/logo.png - favicon: assets/favicon.ico - palette: - scheme: slate - primary: black - accent: deep purple - font: - text: Open Sans - code: JetBrains Mono - icon: - repo: fontawesome/brands/github-alt - features: - - announce.dismiss - - navigation.top - - navigation.footer - - navigation.instant - - navigation.indexes - - navigation.sections - - navigation.tracking - - navigation.instant - - navigation.instant.prefetch - - navigation.instant.progress -# - navigation.tabs -# - navigation.expand -# - toc.integrate - - search.share - - search.suggest - - search.highlight - - content.tabs.link - - content.width.full - - content.action.view - - content.action.edit - - content.code.copy - - content.code.select - - content.code.annotate - - content.code.annotation - -nav: - - Introduction: index.md - - Overview: overview.md - - What's New in v0.3: 'https://github.com/D4Vinci/Scrapling/releases/tag/v0.3' - - Performance Benchmarks: benchmarks.md - - User Guide: - - Parsing: - - Querying elements: parsing/selection.md - - Main classes: parsing/main_classes.md - - Adaptive scraping: parsing/adaptive.md - - Fetching: - - Fetchers basics: fetching/choosing.md - - HTTP requests: fetching/static.md - - Dynamic websites: fetching/dynamic.md - - Dynamic websites with hard protections: fetching/stealthy.md - - Command Line Interface: - - Overview: cli/overview.md - - Interactive shell: cli/interactive-shell.md - - Extract commands: cli/extract-commands.md - - Integrations: - - AI MCP server: ai/mcp-server.md - - Tutorials: - - A Free Alternative to AI for Robust Web Scraping: tutorials/replacing_ai.md - - Migrating from BeautifulSoup: tutorials/migrating_from_beautifulsoup.md - - Using Scrapeless browser: tutorials/external.md -# - Migrating from AutoScraper: tutorials/migrating_from_autoscraper.md - - Development: - - API Reference: - - Selector: api-reference/selector.md - - Fetchers: 
api-reference/fetchers.md - - MCP Server: api-reference/mcp-server.md - - Custom Types: api-reference/custom-types.md - - Writing your retrieval system: development/adaptive_storage_system.md - - Using Scrapling's custom types: development/scrapling_custom_types.md - - Support and Advertisement: donate.md - - Contributing: 'https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md' - - Changelog: 'https://github.com/D4Vinci/Scrapling/releases' - -markdown_extensions: - - admonition - - abbr -# - mkautodoc - - pymdownx.emoji - - pymdownx.details - - pymdownx.superfences - - pymdownx.highlight: - anchor_linenums: true - - pymdownx.inlinehilite - - pymdownx.snippets - - pymdownx.tabbed: - alternate_style: true - - tables - - codehilite: - css_class: highlight - - toc: - permalink: true - -plugins: - - search - - privacy: - links: false - - optimize - - social: - cards_layout_options: - background_color: "#1f1f1f" - font_family: Roboto - - mkdocstrings: - handlers: - python: - paths: [scrapling] - options: - docstring_style: sphinx - show_source: true - show_root_heading: true - show_if_no_docstring: true - inherited_members: true - members_order: source - separate_signature: true - unwrap_annotated: true - filters: - - '!^_' - - "^__" - merge_init_into_class: true - docstring_section_style: spacy - signature_crossrefs: true - show_symbol_type_heading: true - show_symbol_type_toc: true - show_inheritance_diagram: true - modernize_annotations: true - extensions: - - griffe_runtime_objects - - griffe_sphinx - - griffe_inherited_docstrings: - merge: true - -extra: - homepage: https://scrapling.readthedocs.io/en/latest/ - social: - - icon: fontawesome/brands/github - link: https://github.com/D4Vinci/Scrapling - - icon: fontawesome/brands/python - link: https://pypi.org/project/scrapling/ - - icon: fontawesome/brands/x-twitter - link: https://x.com/Scrapling_dev - - icon: fontawesome/brands/discord - link: https://discord.gg/EMgGbDceNQ - analytics: - provider: google - 
property: G-CS3DKLY73Z - feedback: - title: Was this page helpful? - ratings: - - icon: material/emoticon-happy-outline - name: This page was helpful - data: 1 - note: >- - Thanks for your feedback! - - icon: material/emoticon-sad-outline - name: This page could be improved - data: 0 - note: >- - Thanks for your feedback! - consent: - title: Cookie consent - description: >- - We use cookies to recognize your repeated visits and preferences, as well - as to measure the effectiveness of our documentation and whether users - find what they're searching for. With your consent, you're helping us to - make our documentation better. - -extra_css: - - stylesheets/extra.css \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index f6fb8ab33ae66d5cf331608cc84f74e9555a41e0..00a66954ee33f67d2ff8b21c0545c61f2f8ea9b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "scrapling" # Static version instead of a dynamic version so we can get better layer caching while building docker, check the docker file to understand -version = "0.3.14" +version = "0.4" description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!" 
readme = {file = "docs/README.md", content-type = "text/markdown"} license = {file = "LICENSE"} @@ -28,6 +28,9 @@ keywords = [ "web-crawler", "browser", "crawling", + "headless", + "scraper", + "chrome", ] requires-python = ">=3.10" classifiers = [ @@ -46,6 +49,7 @@ classifiers = [ "Topic :: Text Processing :: Markup :: HTML", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Application Frameworks", "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", @@ -58,9 +62,11 @@ classifiers = [ ] dependencies = [ "lxml>=6.0.2", - "cssselect>=1.3.0", - "orjson>=3.11.5", - "tldextract>=5.3.1", + "cssselect>=1.4.0", + "orjson>=3.11.7", + "tld>=0.13.1", + "w3lib>=2.4.0", + "typing_extensions", ] [project.optional-dependencies] @@ -69,8 +75,9 @@ fetchers = [ "curl_cffi>=0.14.0", "playwright==1.56.0", "patchright==1.56.0", - "browserforge>=1.2.3", + "browserforge>=1.2.4", "msgspec>=0.20.0", + "anyio>=4.12.1" ] ai = [ "mcp>=1.24.0", @@ -92,6 +99,8 @@ Changelog = "https://github.com/D4Vinci/Scrapling/releases" Documentation = "https://scrapling.readthedocs.io/en/latest/" Repository = "https://github.com/D4Vinci/Scrapling" "Bug Tracker" = "https://github.com/D4Vinci/Scrapling/issues" +"Discord" = "https://discord.gg/EMgGbDceNQ" +"Release Notes" = "https://github.com/D4Vinci/Scrapling/releases" [project.scripts] scrapling = "scrapling.cli:main" @@ -102,4 +111,16 @@ include-package-data = true [tool.setuptools.packages.find] where = ["."] -include = ["scrapling*"] \ No newline at end of file +include = ["scrapling*"] + +[tool.mypy] +python_version = "3.10" +warn_unused_configs = true +ignore_missing_imports = true +check_untyped_defs = true + +[tool.pyright] +pythonVersion = "3.10" +typeCheckingMode = "basic" +include = ["scrapling"] +ignore = ["tests", "benchmarks.py"] \ No newline 
at end of file diff --git a/scrapling/__init__.py b/scrapling/__init__.py index 24256863554b44187882b66e99bef0cdef4e73f0..60c859c75c6c77f7ed7d05cf4a494f394a5c1dcb 100644 --- a/scrapling/__init__.py +++ b/scrapling/__init__.py @@ -1,5 +1,5 @@ __author__ = "Karim Shoair (karim.shoair@pm.me)" -__version__ = "0.3.14" +__version__ = "0.4" __copyright__ = "Copyright (c) 2024 Karim Shoair" from typing import Any, TYPE_CHECKING diff --git a/scrapling/cli.py b/scrapling/cli.py index 68470a5462d7fe1ff0b5afbd13ed5fdff02612ad..59f017efcc5bf56e21e7505ea863482b776ba355 100644 --- a/scrapling/cli.py +++ b/scrapling/cli.py @@ -128,6 +128,9 @@ def install(force): # pragma: no cover ], "Playwright dependencies", ) + from tld.utils import update_tld_names + + update_tld_names(fail_silently=True) # if no errors raised by the above commands, then we add the below file __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch() else: diff --git a/scrapling/core/_html_utils.py b/scrapling/core/_html_utils.py deleted file mode 100644 index 6b09830b215ef2205e3f5a25cd2fa50f4d3bb38a..0000000000000000000000000000000000000000 --- a/scrapling/core/_html_utils.py +++ /dev/null @@ -1,342 +0,0 @@ -""" -This file is mostly copied from the submodule `w3lib.html` source code to stop downloading the whole library to use a small part of it. -So the goal of doing this is to minimize the memory footprint and keep the library size relatively smaller. 
-Repo source code: https://github.com/scrapy/w3lib/blob/master/w3lib/html.py -""" - -from re import compile as _re_compile, IGNORECASE - -from scrapling.core._types import Iterable, Optional, Match, StrOrBytes - -_ent_re = _re_compile( - r"&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)", - IGNORECASE, -) -# maps HTML4 entity name to the Unicode code point -name2codepoint = { - "AElig": 0x00C6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 - "Aacute": 0x00C1, # latin capital letter A with acute, U+00C1 ISOlat1 - "Acirc": 0x00C2, # latin capital letter A with circumflex, U+00C2 ISOlat1 - "Agrave": 0x00C0, # latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 - "Alpha": 0x0391, # greek capital letter alpha, U+0391 - "Aring": 0x00C5, # latin capital letter A with the ring above = latin capital letter A ring, U+00C5 ISOlat1 - "Atilde": 0x00C3, # latin capital letter A with tilde, U+00C3 ISOlat1 - "Auml": 0x00C4, # latin capital letter A with diaeresis, U+00C4 ISOlat1 - "Beta": 0x0392, # greek capital letter beta, U+0392 - "Ccedil": 0x00C7, # latin capital letter C with cedilla, U+00C7 ISOlat1 - "Chi": 0x03A7, # greek capital letter chi, U+03A7 - "Dagger": 0x2021, # double dagger, U+2021 ISOpub - "Delta": 0x0394, # greek capital letter delta, U+0394 ISOgrk3 - "ETH": 0x00D0, # latin capital letter ETH, U+00D0 ISOlat1 - "Eacute": 0x00C9, # latin capital letter E with acute, U+00C9 ISOlat1 - "Ecirc": 0x00CA, # latin capital letter E with circumflex, U+00CA ISOlat1 - "Egrave": 0x00C8, # latin capital letter E with grave, U+00C8 ISOlat1 - "Epsilon": 0x0395, # greek capital letter epsilon, U+0395 - "Eta": 0x0397, # greek capital letter eta, U+0397 - "Euml": 0x00CB, # latin capital letter E with diaeresis, U+00CB ISOlat1 - "Gamma": 0x0393, # greek capital letter gamma, U+0393 ISOgrk3 - "Iacute": 0x00CD, # latin capital letter I with acute, U+00CD ISOlat1 - "Icirc": 0x00CE, # latin capital letter I with circumflex, U+00CE 
ISOlat1 - "Igrave": 0x00CC, # latin capital letter I with grave, U+00CC ISOlat1 - "Iota": 0x0399, # greek capital letter iota, U+0399 - "Iuml": 0x00CF, # latin capital letter I with diaeresis, U+00CF ISOlat1 - "Kappa": 0x039A, # greek capital letter kappa, U+039A - "Lambda": 0x039B, # greek capital letter lambda, U+039B ISOgrk3 - "Mu": 0x039C, # greek capital letter mu, U+039C - "Ntilde": 0x00D1, # latin capital letter N with tilde, U+00D1 ISOlat1 - "Nu": 0x039D, # greek capital letter nu, U+039D - "OElig": 0x0152, # latin capital ligature OE, U+0152 ISOlat2 - "Oacute": 0x00D3, # latin capital letter O with acute, U+00D3 ISOlat1 - "Ocirc": 0x00D4, # latin capital letter O with circumflex, U+00D4 ISOlat1 - "Ograve": 0x00D2, # latin capital letter O with grave, U+00D2 ISOlat1 - "Omega": 0x03A9, # greek capital letter omega, U+03A9 ISOgrk3 - "Omicron": 0x039F, # greek capital letter omicron, U+039F - "Oslash": 0x00D8, # latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1 - "Otilde": 0x00D5, # latin capital letter O with tilde, U+00D5 ISOlat1 - "Ouml": 0x00D6, # latin capital letter O with diaeresis, U+00D6 ISOlat1 - "Phi": 0x03A6, # greek capital letter phi, U+03A6 ISOgrk3 - "Pi": 0x03A0, # greek capital letter pi, U+03A0 ISOgrk3 - "Prime": 0x2033, # double prime = seconds = inches, U+2033 ISOtech - "Psi": 0x03A8, # greek capital letter psi, U+03A8 ISOgrk3 - "Rho": 0x03A1, # greek capital letter rho, U+03A1 - "Scaron": 0x0160, # latin capital letter S with caron, U+0160 ISOlat2 - "Sigma": 0x03A3, # greek capital letter sigma, U+03A3 ISOgrk3 - "THORN": 0x00DE, # latin capital letter THORN, U+00DE ISOlat1 - "Tau": 0x03A4, # greek capital letter tau, U+03A4 - "Theta": 0x0398, # greek capital letter theta, U+0398 ISOgrk3 - "Uacute": 0x00DA, # latin capital letter U with acute, U+00DA ISOlat1 - "Ucirc": 0x00DB, # latin capital letter U with circumflex, U+00DB ISOlat1 - "Ugrave": 0x00D9, # latin capital letter U with grave, U+00D9 ISOlat1 - 
"Upsilon": 0x03A5, # greek capital letter upsilon, U+03A5 ISOgrk3 - "Uuml": 0x00DC, # latin capital letter U with diaeresis, U+00DC ISOlat1 - "Xi": 0x039E, # greek capital letter xi, U+039E ISOgrk3 - "Yacute": 0x00DD, # latin capital letter Y with acute, U+00DD ISOlat1 - "Yuml": 0x0178, # latin capital letter Y with diaeresis, U+0178 ISOlat2 - "Zeta": 0x0396, # greek capital letter zeta, U+0396 - "aacute": 0x00E1, # latin small letter a with acute, U+00E1 ISOlat1 - "acirc": 0x00E2, # latin small letter a with circumflex, U+00E2 ISOlat1 - "acute": 0x00B4, # acute accent = spacing acute, U+00B4 ISOdia - "aelig": 0x00E6, # latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 - "agrave": 0x00E0, # latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 - "alefsym": 0x2135, # alef symbol = first transfinite cardinal, U+2135 NEW - "alpha": 0x03B1, # greek small letter alpha, U+03B1 ISOgrk3 - "amp": 0x0026, # ampersand, U+0026 ISOnum - "and": 0x2227, # logical and = wedge, U+2227 ISOtech - "ang": 0x2220, # angle, U+2220 ISOamso - "aring": 0x00E5, # latin small letter a with the ring above = latin small letter a ring, U+00E5 ISOlat1 - "asymp": 0x2248, # almost equal to = asymptotic to, U+2248 ISOamsr - "atilde": 0x00E3, # latin small letter a with tilde, U+00E3 ISOlat1 - "auml": 0x00E4, # latin small letter a with diaeresis, U+00E4 ISOlat1 - "bdquo": 0x201E, # double low-9 quotation mark, U+201E NEW - "beta": 0x03B2, # greek small letter beta, U+03B2 ISOgrk3 - "brvbar": 0x00A6, # broken bar = broken vertical bar, U+00A6 ISOnum - "bull": 0x2022, # bullet = black small circle, U+2022 ISOpub - "cap": 0x2229, # intersection = cap, U+2229 ISOtech - "ccedil": 0x00E7, # latin small letter c with cedilla, U+00E7 ISOlat1 - "cedil": 0x00B8, # cedilla = spacing cedilla, U+00B8 ISOdia - "cent": 0x00A2, # cent sign, U+00A2 ISOnum - "chi": 0x03C7, # greek small letter chi, U+03C7 ISOgrk3 - "circ": 0x02C6, # modifier letter circumflex accent, U+02C6 ISOpub 
- "clubs": 0x2663, # black club suit = shamrock, U+2663 ISOpub - "cong": 0x2245, # approximately equal to, U+2245 ISOtech - "copy": 0x00A9, # copyright sign, U+00A9 ISOnum - "crarr": 0x21B5, # downwards arrow with corner leftwards = carriage return, U+21B5 NEW - "cup": 0x222A, # union = cup, U+222A ISOtech - "curren": 0x00A4, # currency sign, U+00A4 ISOnum - "dArr": 0x21D3, # downwards double arrow, U+21D3 ISOamsa - "dagger": 0x2020, # dagger, U+2020 ISOpub - "darr": 0x2193, # downwards arrow, U+2193 ISOnum - "deg": 0x00B0, # degree sign, U+00B0 ISOnum - "delta": 0x03B4, # greek small letter delta, U+03B4 ISOgrk3 - "diams": 0x2666, # black diamond suit, U+2666 ISOpub - "divide": 0x00F7, # division sign, U+00F7 ISOnum - "eacute": 0x00E9, # latin small letter e with acute, U+00E9 ISOlat1 - "ecirc": 0x00EA, # latin small letter e with circumflex, U+00EA ISOlat1 - "egrave": 0x00E8, # latin small letter e with grave, U+00E8 ISOlat1 - "empty": 0x2205, # empty set = null set = diameter, U+2205 ISOamso - "emsp": 0x2003, # em space, U+2003 ISOpub - "ensp": 0x2002, # en space, U+2002 ISOpub - "epsilon": 0x03B5, # greek small letter epsilon, U+03B5 ISOgrk3 - "equiv": 0x2261, # identical to, U+2261 ISOtech - "eta": 0x03B7, # greek small letter eta, U+03B7 ISOgrk3 - "eth": 0x00F0, # latin small letter eth, U+00F0 ISOlat1 - "euml": 0x00EB, # latin small letter e with diaeresis, U+00EB ISOlat1 - "euro": 0x20AC, # euro sign, U+20AC NEW - "exist": 0x2203, # there exists, U+2203 ISOtech - "fnof": 0x0192, # latin small f with hook = function = florin, U+0192 ISOtech - "forall": 0x2200, # for all, U+2200 ISOtech - "frac12": 0x00BD, # vulgar fraction one half = fraction one half, U+00BD ISOnum - "frac14": 0x00BC, # vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum - "frac34": 0x00BE, # vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum - "frasl": 0x2044, # fraction slash, U+2044 NEW - "gamma": 0x03B3, # greek small letter gamma, U+03B3 ISOgrk3 - 
"ge": 0x2265, # greater-than or equal to, U+2265 ISOtech - "gt": 0x003E, # greater-than sign, U+003E ISOnum - "hArr": 0x21D4, # left right double arrow, U+21D4 ISOamsa - "harr": 0x2194, # left right arrow, U+2194 ISOamsa - "hearts": 0x2665, # black heart suit = valentine, U+2665 ISOpub - "hellip": 0x2026, # horizontal ellipsis = three dot leader, U+2026 ISOpub - "iacute": 0x00ED, # latin small letter i with acute, U+00ED ISOlat1 - "icirc": 0x00EE, # latin small letter i with circumflex, U+00EE ISOlat1 - "iexcl": 0x00A1, # inverted exclamation mark, U+00A1 ISOnum - "igrave": 0x00EC, # latin small letter i with grave, U+00EC ISOlat1 - "image": 0x2111, # blackletter capital I = imaginary part, U+2111 ISOamso - "infin": 0x221E, # infinity, U+221E ISOtech - "int": 0x222B, # integral, U+222B ISOtech - "iota": 0x03B9, # greek small letter iota, U+03B9 ISOgrk3 - "iquest": 0x00BF, # inverted question mark = turned question mark, U+00BF ISOnum - "isin": 0x2208, # element of, U+2208 ISOtech - "iuml": 0x00EF, # latin small letter i with diaeresis, U+00EF ISOlat1 - "kappa": 0x03BA, # greek small letter kappa, U+03BA ISOgrk3 - "lArr": 0x21D0, # leftwards double arrow, U+21D0 ISOtech - "lambda": 0x03BB, # greek small letter lambda, U+03BB ISOgrk3 - "lang": 0x2329, # left-pointing angle bracket = bra, U+2329 ISOtech - "laquo": 0x00AB, # left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum - "larr": 0x2190, # leftwards arrow, U+2190 ISOnum - "lceil": 0x2308, # left ceiling = apl upstile, U+2308 ISOamsc - "ldquo": 0x201C, # left double quotation mark, U+201C ISOnum - "le": 0x2264, # less-than or equal to, U+2264 ISOtech - "lfloor": 0x230A, # left floor = apl downstile, U+230A ISOamsc - "lowast": 0x2217, # asterisk operator, U+2217 ISOtech - "loz": 0x25CA, # lozenge, U+25CA ISOpub - "lrm": 0x200E, # left-to-right mark, U+200E NEW RFC 2070 - "lsaquo": 0x2039, # single left-pointing angle quotation mark, U+2039 ISO proposed - "lsquo": 0x2018, # left single 
quotation mark, U+2018 ISOnum - "lt": 0x003C, # less-than sign, U+003C ISOnum - "macr": 0x00AF, # macron = spacing macron = overline = APL overbar, U+00AF ISOdia - "mdash": 0x2014, # em dash, U+2014 ISOpub - "micro": 0x00B5, # micro sign, U+00B5 ISOnum - "middot": 0x00B7, # middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum - "minus": 0x2212, # minus sign, U+2212 ISOtech - "mu": 0x03BC, # greek small letter mu, U+03BC ISOgrk3 - "nabla": 0x2207, # nabla = backward difference, U+2207 ISOtech - "nbsp": 0x00A0, # no-break space = non-breaking space, U+00A0 ISOnum - "ndash": 0x2013, # en dash, U+2013 ISOpub - "ne": 0x2260, # not equal to, U+2260 ISOtech - "ni": 0x220B, # contains as member, U+220B ISOtech - "not": 0x00AC, # not sign, U+00AC ISOnum - "notin": 0x2209, # not an element of, U+2209 ISOtech - "nsub": 0x2284, # not a subset of, U+2284 ISOamsn - "ntilde": 0x00F1, # latin small letter n with tilde, U+00F1 ISOlat1 - "nu": 0x03BD, # greek small letter nu, U+03BD ISOgrk3 - "oacute": 0x00F3, # latin small letter o with acute, U+00F3 ISOlat1 - "ocirc": 0x00F4, # latin small letter o with circumflex, U+00F4 ISOlat1 - "oelig": 0x0153, # latin small ligature oe, U+0153 ISOlat2 - "ograve": 0x00F2, # latin small letter o with grave, U+00F2 ISOlat1 - "oline": 0x203E, # overline = spacing overscore, U+203E NEW - "omega": 0x03C9, # greek small letter omega, U+03C9 ISOgrk3 - "omicron": 0x03BF, # greek small letter omicron, U+03BF NEW - "oplus": 0x2295, # circled plus = direct sum, U+2295 ISOamsb - "or": 0x2228, # logical or = vee, U+2228 ISOtech - "ordf": 0x00AA, # feminine ordinal indicator, U+00AA ISOnum - "ordm": 0x00BA, # masculine ordinal indicator, U+00BA ISOnum - "oslash": 0x00F8, # latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 - "otilde": 0x00F5, # latin small letter o with tilde, U+00F5 ISOlat1 - "otimes": 0x2297, # circled times = vector product, U+2297 ISOamsb - "ouml": 0x00F6, # latin small letter o with diaeresis, 
U+00F6 ISOlat1 - "para": 0x00B6, # pilcrow sign = paragraph sign, U+00B6 ISOnum - "part": 0x2202, # partial differential, U+2202 ISOtech - "permil": 0x2030, # per mille sign, U+2030 ISOtech - "perp": 0x22A5, # up tack = orthogonal to = perpendicular, U+22A5 ISOtech - "phi": 0x03C6, # greek small letter phi, U+03C6 ISOgrk3 - "pi": 0x03C0, # greek small letter pi, U+03C0 ISOgrk3 - "piv": 0x03D6, # greek pi symbol, U+03D6 ISOgrk3 - "plusmn": 0x00B1, # plus-minus sign = plus-or-minus sign, U+00B1 ISOnum - "pound": 0x00A3, # pound sign, U+00A3 ISOnum - "prime": 0x2032, # prime = minutes = feet, U+2032 ISOtech - "prod": 0x220F, # n-ary product = product sign, U+220F ISOamsb - "prop": 0x221D, # proportional to, U+221D ISOtech - "psi": 0x03C8, # greek small letter psi, U+03C8 ISOgrk3 - "quot": 0x0022, # quotation mark = APL quote, U+0022 ISOnum - "rArr": 0x21D2, # rightwards double arrow, U+21D2 ISOtech - "radic": 0x221A, # square root = radical sign, U+221A ISOtech - "rang": 0x232A, # right-pointing angle bracket = ket, U+232A ISOtech - "raquo": 0x00BB, # right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum - "rarr": 0x2192, # rightwards arrow, U+2192 ISOnum - "rceil": 0x2309, # right ceiling, U+2309 ISOamsc - "rdquo": 0x201D, # right double quotation mark, U+201D ISOnum - "real": 0x211C, # blackletter capital R = real part symbol, U+211C ISOamso - "reg": 0x00AE, # registered sign = registered trade mark sign, U+00AE ISOnum - "rfloor": 0x230B, # right floor, U+230B ISOamsc - "rho": 0x03C1, # greek small letter rho, U+03C1 ISOgrk3 - "rlm": 0x200F, # right-to-left mark, U+200F NEW RFC 2070 - "rsaquo": 0x203A, # single right-pointing angle quotation mark, U+203A ISO proposed - "rsquo": 0x2019, # right single quotation mark, U+2019 ISOnum - "sbquo": 0x201A, # single low-9 quotation mark, U+201A NEW - "scaron": 0x0161, # latin small letter s with caron, U+0161 ISOlat2 - "sdot": 0x22C5, # dot operator, U+22C5 ISOamsb - "sect": 0x00A7, # section 
sign, U+00A7 ISOnum - "shy": 0x00AD, # soft hyphen = discretionary hyphen, U+00AD ISOnum - "sigma": 0x03C3, # greek small letter sigma, U+03C3 ISOgrk3 - "sigmaf": 0x03C2, # greek small letter final sigma, U+03C2 ISOgrk3 - "sim": 0x223C, # tilde operator = varies with = similar to, U+223C ISOtech - "spades": 0x2660, # black spade suit, U+2660 ISOpub - "sub": 0x2282, # subset of, U+2282 ISOtech - "sube": 0x2286, # subset of or equal to, U+2286 ISOtech - "sum": 0x2211, # n-ary summation, U+2211 ISOamsb - "sup": 0x2283, # superset of, U+2283 ISOtech - "sup1": 0x00B9, # superscript one = superscript digit one, U+00B9 ISOnum - "sup2": 0x00B2, # superscript two = superscript digit two = squared, U+00B2 ISOnum - "sup3": 0x00B3, # superscript three = superscript digit three = cubed, U+00B3 ISOnum - "supe": 0x2287, # superset of or equal to, U+2287 ISOtech - "szlig": 0x00DF, # latin small letter sharp s = ess-zed, U+00DF ISOlat1 - "tau": 0x03C4, # greek small letter tau, U+03C4 ISOgrk3 - "there4": 0x2234, # therefore, U+2234 ISOtech - "theta": 0x03B8, # greek small letter theta, U+03B8 ISOgrk3 - "thetasym": 0x03D1, # greek small letter theta symbol, U+03D1 NEW - "thinsp": 0x2009, # thin space, U+2009 ISOpub - "thorn": 0x00FE, # latin small letter thorn with, U+00FE ISOlat1 - "tilde": 0x02DC, # small tilde, U+02DC ISOdia - "times": 0x00D7, # multiplication sign, U+00D7 ISOnum - "trade": 0x2122, # trade mark sign, U+2122 ISOnum - "uArr": 0x21D1, # upwards double arrow, U+21D1 ISOamsa - "uacute": 0x00FA, # latin small letter u with acute, U+00FA ISOlat1 - "uarr": 0x2191, # upwards arrow, U+2191 ISOnum - "ucirc": 0x00FB, # latin small letter u with circumflex, U+00FB ISOlat1 - "ugrave": 0x00F9, # latin small letter u with grave, U+00F9 ISOlat1 - "uml": 0x00A8, # diaeresis = spacing diaeresis, U+00A8 ISOdia - "upsih": 0x03D2, # greek upsilon with hook symbol, U+03D2 NEW - "upsilon": 0x03C5, # greek small letter upsilon, U+03C5 ISOgrk3 - "uuml": 0x00FC, # latin small letter u with 
diaeresis, U+00FC ISOlat1 - "weierp": 0x2118, # script capital P = power set = Weierstrass p, U+2118 ISOamso - "xi": 0x03BE, # greek small letter xi, U+03BE ISOgrk3 - "yacute": 0x00FD, # latin small letter y with acute, U+00FD ISOlat1 - "yen": 0x00A5, # yen sign = yuan sign, U+00A5 ISOnum - "yuml": 0x00FF, # latin small letter y with diaeresis, U+00FF ISOlat1 - "zeta": 0x03B6, # greek small letter zeta, U+03B6 ISOgrk3 - "zwj": 0x200D, # zero width joiner, U+200D NEW RFC 2070 - "zwnj": 0x200C, # zero width non-joiner, U+200C NEW RFC 2070 -} - - -def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict") -> str: - """Return the Unicode representation of a bytes object `text`. If `text` - is already a Unicode object, return it as-is.""" - if isinstance(text, str): - return text - if not isinstance(text, (bytes, str)): - raise TypeError(f"to_unicode must receive bytes or str, got {type(text).__name__}") - if encoding is None: - encoding = "utf-8" - return text.decode(encoding, errors) - - -def _replace_entities( - text: StrOrBytes, - keep: Iterable[str] = (), - remove_illegal: bool = True, - encoding: str = "utf-8", -) -> str: - """Remove entities from the given `text` by converting them to their - corresponding Unicode character. - - `text` can be a Unicode string or a byte string encoded in the given - `encoding` (which defaults to 'utf-8'). - - If `keep` is passed (with a list of entity names), those entities will - be kept (they won't be removed). - - It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``) - and named entities (such as `` `` or ``>``). - - If `remove_illegal` is ``True``, entities that can't be converted are removed. - If `remove_illegal` is ``False``, entities that can't be converted are kept "as - is". For more information, see the tests. - - Always returns a Unicode string (with the entities removed). 
- - >>> _replace_entities(b'Price: £100') - 'Price: \\xa3100' - >>> print(_replace_entities(b'Price: £100')) - Price: £100 - >>> - - """ - - def convert_entity(m: Match[str]) -> str: - groups = m.groupdict() - number = None - if groups.get("dec"): - number = int(groups["dec"], 10) - elif groups.get("hex"): - number = int(groups["hex"], 16) - elif groups.get("named"): - entity_name = groups["named"] - if entity_name.lower() in keep: - return m.group(0) - number = name2codepoint.get(entity_name) or name2codepoint.get(entity_name.lower()) - if number is not None: - # Browsers typically - # interpret numeric character references in the 80-9F range as representing the characters mapped - # to bytes 80-9F in the Windows-1252 encoding. For more info - # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML - try: - if 0x80 <= number <= 0x9F: - return bytes((number,)).decode("cp1252") - return chr(number) - except (ValueError, OverflowError): # pragma: no cover - pass - - return "" if remove_illegal and groups.get("semicolon") else m.group(0) - - return _ent_re.sub(convert_entity, to_unicode(text, encoding)) diff --git a/scrapling/core/_types.py b/scrapling/core/_types.py index d7c1f8baa830c869ff24eede0a86b5b4c3678882..f2fc097a1087b42e36c9c0057319641ddb4c8fa8 100644 --- a/scrapling/core/_types.py +++ b/scrapling/core/_types.py @@ -12,12 +12,14 @@ from typing import ( Callable, Dict, Generator, + AsyncGenerator, Generic, Iterable, List, Set, Literal, Optional, + Iterator, Pattern, Sequence, Tuple, @@ -30,34 +32,16 @@ from typing import ( Coroutine, SupportsIndex, ) +from typing_extensions import Self, Unpack +# Proxy can be a string URL or a dict (Playwright format: {"server": "...", "username": "...", "password": "..."}) +ProxyType = Union[str, Dict[str, str]] SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"] SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"] PageLoadStates = Literal["commit", "domcontentloaded", "load", 
"networkidle"] extraction_types = Literal["text", "html", "markdown"] StrOrBytes = Union[str, bytes] -if TYPE_CHECKING: # pragma: no cover - from typing_extensions import Unpack -else: # pragma: no cover - - class _Unpack: - @staticmethod - def __getitem__(*args, **kwargs): - pass - - Unpack = _Unpack() - - -try: - # Python 3.11+ - from typing import Self # novermin -except ImportError: # pragma: no cover - try: - from typing_extensions import Self # Backport - except ImportError: - Self = object - # Copied from `playwright._impl._api_structures.SetCookieParam` class SetCookieParam(TypedDict, total=False): diff --git a/scrapling/core/ai.py b/scrapling/core/ai.py index 4d5929bdd562318f1177b6f9888081ce324a7462..171359a5c366a530a96fd799f171ebb3e928d7bd 100644 --- a/scrapling/core/ai.py +++ b/scrapling/core/ai.py @@ -213,7 +213,7 @@ class ScraplingMCPServer: extraction_type: extraction_types = "markdown", css_selector: Optional[str] = None, main_content_only: bool = True, - headless: bool = False, + headless: bool = True, # noqa: F821 google_search: bool = True, real_chrome: bool = False, wait: int | float = 0, @@ -295,7 +295,7 @@ class ScraplingMCPServer: extraction_type: extraction_types = "markdown", css_selector: Optional[str] = None, main_content_only: bool = True, - headless: bool = False, + headless: bool = True, # noqa: F821 google_search: bool = True, real_chrome: bool = False, wait: int | float = 0, diff --git a/scrapling/core/custom_types.py b/scrapling/core/custom_types.py index d437a239cb8f9f991c7e62ab5ee36bee2e922b9a..d06ac29cb668fdd0e6412eb85bad89f2ac490ce8 100644 --- a/scrapling/core/custom_types.py +++ b/scrapling/core/custom_types.py @@ -3,6 +3,7 @@ from types import MappingProxyType from re import compile as re_compile, UNICODE, IGNORECASE from orjson import dumps, loads +from w3lib.html import replace_entities as _replace_entities from scrapling.core._types import ( Any, @@ -19,7 +20,6 @@ from scrapling.core._types import ( SupportsIndex, ) from 
scrapling.core.utils import _is_iterable, flatten, __CONSECUTIVE_SPACES_REGEX__ -from scrapling.core._html_utils import _replace_entities # Define type variable for AttributeHandler value type _TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler") @@ -35,9 +35,7 @@ class TextHandler(str): lst = super().__getitem__(key) return TextHandler(lst) - def split( - self, sep: str | None = None, maxsplit: SupportsIndex = -1 - ) -> Union[List, "TextHandlers"]: # pragma: no cover + def split(self, sep: str | None = None, maxsplit: SupportsIndex = -1) -> list[Any]: # pragma: no cover return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)]) def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover @@ -61,7 +59,7 @@ class TextHandler(str): def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover return TextHandler(super().expandtabs(tabsize)) - def format(self, *args: object, **kwargs: str) -> Union[str, "TextHandler"]: # pragma: no cover + def format(self, *args: object, **kwargs: object) -> Union[str, "TextHandler"]: # pragma: no cover return TextHandler(super().format(*args, **kwargs)) def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover @@ -291,7 +289,7 @@ class AttributesHandler(Mapping[str, _TextHandlerType]): __slots__ = ("_data",) - def __init__(self, mapping=None, **kwargs): + def __init__(self, mapping: Any = None, **kwargs: Any) -> None: mapping = ( {key: TextHandler(value) if isinstance(value, str) else value for key, value in mapping.items()} if mapping is not None @@ -324,8 +322,8 @@ class AttributesHandler(Mapping[str, _TextHandlerType]): yield AttributesHandler({key: value}) @property - def json_string(self): - """Convert current attributes to JSON string if the attributes are JSON serializable otherwise throws error""" + def json_string(self) -> bytes: + """Convert current attributes to JSON bytes if the attributes are JSON 
serializable otherwise throws error""" return dumps(dict(self._data)) def __getitem__(self, key: str) -> _TextHandlerType: diff --git a/scrapling/core/mixins.py b/scrapling/core/mixins.py index 3a96bdab142dcd29163318c6d6f9b13dc3ba6049..c2e74202179470c0358645f1628ab0406734cf6c 100644 --- a/scrapling/core/mixins.py +++ b/scrapling/core/mixins.py @@ -1,7 +1,4 @@ -from scrapling.core._types import TYPE_CHECKING - -if TYPE_CHECKING: - from scrapling.parser import Selector +from scrapling.core._types import Any, Dict class SelectorsGeneration: @@ -11,10 +8,17 @@ class SelectorsGeneration: Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591 """ - def _general_selection(self: "Selector", selection: str = "css", full_path: bool = False) -> str: # type: ignore[name-defined] + # Note: This is a mixin class meant to be used with Selector. + # The methods access Selector attributes (._root, .parent, .attrib, .tag, etc.) + # through self, which will be a Selector instance at runtime. + + def _general_selection(self: Any, selection: str = "css", full_path: bool = False) -> str: """Generate a selector for the current element. :return: A string of the generated selector. """ + if self._is_text_node(self._root): + return "" + selectorPath = [] target = self css = selection.lower() == "css" @@ -33,7 +37,7 @@ class SelectorsGeneration: # if classes and css: # part += f".{'.'.join(classes)}" # else: - counter = {} + counter: Dict[str, int] = {} for child in target.parent.children: counter.setdefault(child.tag, 0) counter[child.tag] += 1 @@ -53,28 +57,28 @@ class SelectorsGeneration: return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath)) @property - def generate_css_selector(self: "Selector") -> str: # type: ignore[name-defined] + def generate_css_selector(self: Any) -> str: """Generate a CSS selector for the current element :return: A string of the generated selector. 
""" return self._general_selection() @property - def generate_full_css_selector(self: "Selector") -> str: # type: ignore[name-defined] + def generate_full_css_selector(self: Any) -> str: """Generate a complete CSS selector for the current element :return: A string of the generated selector. """ return self._general_selection(full_path=True) @property - def generate_xpath_selector(self: "Selector") -> str: # type: ignore[name-defined] + def generate_xpath_selector(self: Any) -> str: """Generate an XPath selector for the current element :return: A string of the generated selector. """ return self._general_selection("xpath") @property - def generate_full_xpath_selector(self: "Selector") -> str: # type: ignore[name-defined] + def generate_full_xpath_selector(self: Any) -> str: """Generate a complete XPath selector for the current element :return: A string of the generated selector. """ diff --git a/scrapling/core/shell.py b/scrapling/core/shell.py index 0499e814bf0d03680caa8d840ff41aac2a42ee6b..0bd6efcc3d0da38f06c27142ec246c83d7a4ff61 100644 --- a/scrapling/core/shell.py +++ b/scrapling/core/shell.py @@ -30,6 +30,7 @@ from scrapling.core.custom_types import TextHandler from scrapling.engines.toolbelt.custom import Response from scrapling.core.utils._shell import _ParseHeaders, _CookieParser from scrapling.core._types import ( + Callable, Dict, Any, cast, @@ -82,7 +83,7 @@ class NoExitArgumentParser(ArgumentParser): # pragma: no cover class CurlParser: """Builds the argument parser for relevant curl flags from DevTools.""" - def __init__(self): + def __init__(self) -> None: from scrapling.fetchers import Fetcher as __Fetcher self.__fetcher = __Fetcher @@ -467,19 +468,21 @@ Type 'exit' or press Ctrl+D to exit. 
return result - def create_wrapper(self, func, get_signature=True, signature_name=None): + def create_wrapper( + self, func: Callable, get_signature: bool = True, signature_name: Optional[str] = None + ) -> Callable: """Create a wrapper that preserves function signature but updates page""" @wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args: Any, **kwargs: Any) -> Any: result = func(*args, **kwargs) return self.update_page(result) if get_signature: # Explicitly preserve and unpack signature for IPython introspection and autocompletion - wrapper.__signature__ = _unpack_signature(func, signature_name) # pyright: ignore + setattr(wrapper, "__signature__", _unpack_signature(func, signature_name)) else: - wrapper.__signature__ = signature(func) # pyright: ignore + setattr(wrapper, "__signature__", signature(func)) return wrapper @@ -583,7 +586,7 @@ class Convertor: raise ValueError(f"Unknown extraction type: {extraction_type}") else: if main_content_only: - page = cast(Selector, page.css_first("body")) or page + page = cast(Selector, page.css("body").first) or page pages = [page] if not css_selector else cast(Selectors, page.css(css_selector)) for page in pages: @@ -601,7 +604,7 @@ class Convertor: " ", ): # Remove consecutive white-spaces - txt_content = re_sub(f"[{s}]+", s, txt_content) + txt_content = TextHandler(re_sub(f"[{s}]+", s, txt_content)) yield txt_content yield "" diff --git a/scrapling/core/storage.py b/scrapling/core/storage.py index 50258c5a7a4efe054166a3a26ac5dbb0cce7a947..d93a320bc29e5cd4a37fa478ab836a65d16140ab 100644 --- a/scrapling/core/storage.py +++ b/scrapling/core/storage.py @@ -8,7 +8,7 @@ from orjson import dumps, loads from lxml.html import HtmlElement from scrapling.core.utils import _StorageTools, log -from scrapling.core._types import Dict, Optional, Any +from scrapling.core._types import Dict, Optional, Any, cast class StorageSystemMixin(ABC): # pragma: no cover @@ -17,18 +17,24 @@ class StorageSystemMixin(ABC): # pragma: no 
cover """ :param url: URL of the website we are working on to separate it from other websites data """ - self.url = url + # Make the url in lowercase to handle this edge case until it's updated: https://github.com/barseghyanartur/tld/issues/124 + self.url = url.lower() if (url and isinstance(url, str)) else None @lru_cache(64, typed=True) def _get_base_url(self, default_value: str = "default") -> str: - if not self.url or not isinstance(self.url, str): + if not self.url: return default_value try: - from tldextract import extract as tld + from tld import get_tld, Result - extracted = tld(self.url) - return extracted.top_domain_under_public_suffix or extracted.domain or default_value + # Fixing the inaccurate return type hint in `get_tld` + extracted: Result | None = cast( + Result, get_tld(self.url, as_object=True, fail_silently=True, fix_protocol=True) + ) + if not extracted: + return default_value + return extracted.fld or extracted.domain or default_value except AttributeError: return default_value @@ -57,12 +63,11 @@ class StorageSystemMixin(ABC): # pragma: no cover def _get_hash(identifier: str) -> str: """If you want to hash identifier in your storage system, use this safer""" _identifier = identifier.lower().strip() - if isinstance(_identifier, str): - # Hash functions have to take bytes - _identifier = _identifier.encode("utf-8") + # Hash functions have to take bytes + _identifier_bytes = _identifier.encode("utf-8") - hash_value = sha256(_identifier).hexdigest() - return f"{hash_value}_{len(_identifier)}" # Length to reduce collision chance + hash_value = sha256(_identifier_bytes).hexdigest() + return f"{hash_value}_{len(_identifier_bytes)}" # Length to reduce collision chance @lru_cache(1, typed=True) diff --git a/scrapling/core/utils/__init__.py b/scrapling/core/utils/__init__.py index 6ae80fe155545ea1b9466329603bbc8d199267f7..77766c3c7bbaa9cc42237d1a31bde69ac9902972 100644 --- a/scrapling/core/utils/__init__.py +++ b/scrapling/core/utils/__init__.py @@ 
-1,5 +1,7 @@ from ._utils import ( log, + set_logger, + reset_logger, __CONSECUTIVE_SPACES_REGEX__, flatten, _is_iterable, diff --git a/scrapling/core/utils/_utils.py b/scrapling/core/utils/_utils.py index 2f57cfaff82f9aff1cf7b351e2731a5ee6129c04..a2fa6cab9b8fe2683dd4e873ad778b47eaeb87aa 100644 --- a/scrapling/core/utils/_utils.py +++ b/scrapling/core/utils/_utils.py @@ -1,6 +1,7 @@ import logging from itertools import chain from re import compile as re_compile +from contextvars import ContextVar, Token from lxml import html @@ -36,7 +37,25 @@ def setup_logger(): return logger -log = setup_logger() +_current_logger: ContextVar[logging.Logger] = ContextVar("scrapling_logger", default=setup_logger()) + + +class LoggerProxy: + def __getattr__(self, name: str): + return getattr(_current_logger.get(), name) + + +log = LoggerProxy() + + +def set_logger(logger: logging.Logger) -> Token: + """Set the current context logger. Returns token for reset.""" + return _current_logger.set(logger) + + +def reset_logger(token: Token) -> None: + """Reset logger to previous state using token.""" + _current_logger.reset(token) def flatten(lst: Iterable[Any]) -> List[Any]: diff --git a/scrapling/engines/_browsers/_base.py b/scrapling/engines/_browsers/_base.py index bb14b5f00d47c72f6872eeb282b998018d64c6d0..51ce3083d31bbf521300954cb54a2b75756b5807 100644 --- a/scrapling/engines/_browsers/_base.py +++ b/scrapling/engines/_browsers/_base.py @@ -1,61 +1,85 @@ from time import time from asyncio import sleep as asyncio_sleep, Lock +from contextlib import contextmanager, asynccontextmanager from playwright.sync_api._generated import Page from playwright.sync_api import ( Frame, BrowserContext, - Playwright, Response as SyncPlaywrightResponse, ) from playwright.async_api._generated import Page as AsyncPage from playwright.async_api import ( Frame as AsyncFrame, - Playwright as AsyncPlaywright, Response as AsyncPlaywrightResponse, BrowserContext as AsyncBrowserContext, ) from 
playwright._impl._errors import Error as PlaywrightError -from ._page import PageInfo, PagePool from scrapling.parser import Selector -from ._validators import validate, PlaywrightConfig, StealthConfig -from ._config_tools import __default_chrome_useragent__, __default_useragent__ -from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route -from scrapling.core._types import Any, cast, Dict, List, Optional, Callable, TYPE_CHECKING, overload, Tuple -from scrapling.engines.constants import ( - DEFAULT_STEALTH_FLAGS, - HARMFUL_DEFAULT_ARGS, - DEFAULT_FLAGS, +from scrapling.engines._browsers._page import PageInfo, PagePool +from scrapling.engines._browsers._validators import validate, PlaywrightConfig, StealthConfig +from scrapling.engines._browsers._config_tools import __default_chrome_useragent__, __default_useragent__ +from scrapling.engines.toolbelt.navigation import ( + construct_proxy_dict, + create_intercept_handler, + create_async_intercept_handler, ) +from scrapling.core._types import ( + Any, + Dict, + List, + Set, + Optional, + Callable, + TYPE_CHECKING, + cast, + overload, + Tuple, + ProxyType, + Generator, + AsyncGenerator, +) +from scrapling.engines.constants import STEALTH_ARGS, HARMFUL_ARGS, DEFAULT_ARGS class SyncSession: + _config: "PlaywrightConfig | StealthConfig" + _context_options: Dict[str, Any] + + def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]: + raise NotImplementedError # pragma: no cover + def __init__(self, max_pages: int = 1): self.max_pages = max_pages self.page_pool = PagePool(max_pages) self._max_wait_for_page = 60 - self.playwright: Playwright | Any = None - self.context: BrowserContext | Any = None - self._closed = False + self.playwright: Any = None + self.context: Any = None + self.browser: Any = None + self._is_alive = False - def start(self): + def start(self) -> None: pass def close(self): # pragma: no cover """Close all resources""" - if self._closed: + if 
not self._is_alive: return if self.context: self.context.close() self.context = None + if self.browser: + self.browser.close() + self.browser = None + if self.playwright: self.playwright.stop() self.playwright = None # pyright: ignore - self._closed = True + self._is_alive = False def __enter__(self): self.start() @@ -64,25 +88,36 @@ class SyncSession: def __exit__(self, exc_type, exc_val, exc_tb): self.close() + def _initialize_context(self, config: PlaywrightConfig | StealthConfig, ctx: BrowserContext) -> BrowserContext: + """Initialize the browser context.""" + if config.init_script: + ctx.add_init_script(path=config.init_script) + + if config.cookies: # pragma: no cover + ctx.add_cookies(config.cookies) + + return ctx + def _get_page( self, timeout: int | float, extra_headers: Optional[Dict[str, str]], disable_resources: bool, + blocked_domains: Optional[Set[str]] = None, + context: Optional[BrowserContext] = None, ) -> PageInfo[Page]: # pragma: no cover """Get a new page to use""" - # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc. 
- assert self.context is not None, "Browser context not initialized" - page = self.context.new_page() + ctx = context if context is not None else self.context + assert ctx is not None, "Browser context not initialized" + page = ctx.new_page() page.set_default_navigation_timeout(timeout) page.set_default_timeout(timeout) if extra_headers: page.set_extra_http_headers(extra_headers) - if disable_resources: - page.route("**/*", intercept_route) - + if disable_resources or blocked_domains: + page.route("**/*", create_intercept_handler(disable_resources, blocked_domains)) page_info = self.page_pool.add_page(page) page_info.mark_busy() return page_info @@ -129,34 +164,77 @@ class SyncSession: return handle_response + @contextmanager + def _page_generator( + self, + timeout: int | float, + extra_headers: Optional[Dict[str, str]], + disable_resources: bool, + proxy: Optional[ProxyType] = None, + blocked_domains: Optional[Set[str]] = None, + ) -> Generator["PageInfo[Page]", None, None]: + """Acquire a page - either from persistent context or fresh context with proxy.""" + if proxy: + # Rotation mode: create fresh context with the provided proxy + if not self.browser: # pragma: no cover + raise RuntimeError("Browser not initialized for proxy rotation mode") + context_options = self._build_context_with_proxy(proxy) + context: BrowserContext = self.browser.new_context(**context_options) + + try: + context = self._initialize_context(self._config, context) + page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains, context=context) + yield page_info + finally: + context.close() + else: + # Standard mode: use PagePool with persistent context + page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains) + try: + yield page_info + finally: + page_info.page.close() + self.page_pool.pages.remove(page_info) + class AsyncSession: + _config: "PlaywrightConfig | StealthConfig" + _context_options: Dict[str, Any] + + def 
_build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]: + raise NotImplementedError # pragma: no cover + def __init__(self, max_pages: int = 1): self.max_pages = max_pages self.page_pool = PagePool(max_pages) self._max_wait_for_page = 60 - self.playwright: AsyncPlaywright | Any = None - self.context: AsyncBrowserContext | Any = None - self._closed = False + self.playwright: Any = None + self.context: Any = None + self.browser: Any = None + self._is_alive = False self._lock = Lock() - async def start(self): + async def start(self) -> None: pass async def close(self): """Close all resources""" - if self._closed: # pragma: no cover + if not self._is_alive: # pragma: no cover return if self.context: await self.context.close() self.context = None # pyright: ignore + if self.browser: + await self.browser.close() + self.browser = None + if self.playwright: await self.playwright.stop() self.playwright = None # pyright: ignore - self._closed = True + self._is_alive = False async def __aenter__(self): await self.start() @@ -165,19 +243,35 @@ class AsyncSession: async def __aexit__(self, exc_type, exc_val, exc_tb): await self.close() + async def _initialize_context( + self, config: PlaywrightConfig | StealthConfig, ctx: AsyncBrowserContext + ) -> AsyncBrowserContext: + """Initialize the browser context.""" + if config.init_script: # pragma: no cover + await ctx.add_init_script(path=config.init_script) + + if config.cookies: # pragma: no cover + await ctx.add_cookies(config.cookies) + + return ctx + async def _get_page( self, timeout: int | float, extra_headers: Optional[Dict[str, str]], disable_resources: bool, + blocked_domains: Optional[Set[str]] = None, + context: Optional[AsyncBrowserContext] = None, ) -> PageInfo[AsyncPage]: # pragma: no cover """Get a new page to use""" + ctx = context if context is not None else self.context if TYPE_CHECKING: - assert self.context is not None, "Browser context not initialized" + assert ctx is not None, 
"Browser context not initialized" async with self._lock: # If we're at max capacity after cleanup, wait for busy pages to finish - if self.page_pool.pages_count >= self.max_pages: + if context is None and self.page_pool.pages_count >= self.max_pages: + # Only applies when using persistent context start_time = time() while time() - start_time < self._max_wait_for_page: await asyncio_sleep(0.05) @@ -188,14 +282,14 @@ class AsyncSession: f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period" ) - page = await self.context.new_page() + page = await ctx.new_page() page.set_default_navigation_timeout(timeout) page.set_default_timeout(timeout) if extra_headers: await page.set_extra_http_headers(extra_headers) - if disable_resources: - await page.route("**/*", async_intercept_route) + if disable_resources or blocked_domains: + await page.route("**/*", create_async_intercept_handler(disable_resources, blocked_domains)) return self.page_pool.add_page(page) @@ -241,8 +335,44 @@ class AsyncSession: return handle_response + @asynccontextmanager + async def _page_generator( + self, + timeout: int | float, + extra_headers: Optional[Dict[str, str]], + disable_resources: bool, + proxy: Optional[ProxyType] = None, + blocked_domains: Optional[Set[str]] = None, + ) -> AsyncGenerator["PageInfo[AsyncPage]", None]: + """Acquire a page - either from persistent context or fresh context with proxy.""" + if proxy: + # Rotation mode: create fresh context with the provided proxy + if not self.browser: # pragma: no cover + raise RuntimeError("Browser not initialized for proxy rotation mode") + context_options = self._build_context_with_proxy(proxy) + context: AsyncBrowserContext = await self.browser.new_context(**context_options) + + try: + context = await self._initialize_context(self._config, context) + page_info = await self._get_page( + timeout, extra_headers, disable_resources, blocked_domains, context=context + ) + yield page_info + finally: + 
await context.close() + else: + # Standard mode: use PagePool with persistent context + page_info = await self._get_page(timeout, extra_headers, disable_resources, blocked_domains) + try: + yield page_info + finally: + await page_info.page.close() + self.page_pool.pages.remove(page_info) + class BaseSessionMixin: + _config: "PlaywrightConfig | StealthConfig" + @overload def __validate_routine__(self, params: Dict, model: type[StealthConfig]) -> StealthConfig: ... @@ -254,9 +384,9 @@ class BaseSessionMixin: ) -> PlaywrightConfig | StealthConfig: # Dark color scheme bypasses the 'prefersLightColor' check in creepjs self._context_options: Dict[str, Any] = {"color_scheme": "dark", "device_scale_factor": 2} - self._launch_options: Dict[str, Any] = self._context_options | { - "args": DEFAULT_FLAGS, - "ignore_default_args": HARMFUL_DEFAULT_ARGS, + self._browser_options: Dict[str, Any] = { + "args": DEFAULT_ARGS, + "ignore_default_args": HARMFUL_ARGS, } if "__max_pages" in params: params["max_pages"] = params.pop("__max_pages") @@ -269,7 +399,7 @@ class BaseSessionMixin: return config def __generate_options__(self, extra_flags: Tuple | None = None) -> None: - config = cast(PlaywrightConfig, getattr(self, "_config", None)) + config: PlaywrightConfig | StealthConfig = self._config self._context_options.update( { "proxy": config.proxy, @@ -287,28 +417,40 @@ class BaseSessionMixin: ) if not config.cdp_url: - self._launch_options |= self._context_options - self._context_options = {} - flags = self._launch_options["args"] + flags = self._browser_options["args"] if config.extra_flags or extra_flags: flags = list(set(flags + (config.extra_flags or extra_flags))) - self._launch_options.update( + self._browser_options.update( { "args": flags, "headless": config.headless, - "user_data_dir": config.user_data_dir, "channel": "chrome" if config.real_chrome else "chromium", } ) - if config.additional_args: - self._launch_options.update(config.additional_args) + self._user_data_dir = 
config.user_data_dir else: - # while `context_options` is left to be used when cdp mode is enabled - self._launch_options = dict() - if config.additional_args: - self._context_options.update(config.additional_args) + self._browser_options = {} + + if config.additional_args: + self._context_options.update(config.additional_args) + + def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]: + """ + Build context options with a specific proxy for rotation mode. + + :param proxy: Proxy URL string or Playwright-style proxy dict to use for this context. + :return: Dictionary of context options for browser.new_context(). + """ + + context_options = self._context_options.copy() + + # Override proxy if provided + if proxy: + context_options["proxy"] = construct_proxy_dict(proxy) + + return context_options class DynamicSessionMixin(BaseSessionMixin): @@ -319,7 +461,7 @@ class DynamicSessionMixin(BaseSessionMixin): class StealthySessionMixin(BaseSessionMixin): def __validate__(self, **params): - self._config: StealthConfig = self.__validate_routine__(params, model=StealthConfig) + self._config = self.__validate_routine__(params, model=StealthConfig) self._context_options.update( { "is_mobile": False, @@ -335,22 +477,23 @@ class StealthySessionMixin(BaseSessionMixin): self.__generate_stealth_options() def __generate_stealth_options(self) -> None: - flags = tuple() - if not self._config.cdp_url: - flags = DEFAULT_FLAGS + DEFAULT_STEALTH_FLAGS + config = cast(StealthConfig, self._config) + flags: Tuple[str, ...] 
= tuple() + if not config.cdp_url: + flags = DEFAULT_ARGS + STEALTH_ARGS - if self._config.block_webrtc: + if config.block_webrtc: flags += ( "--webrtc-ip-handling-policy=disable_non_proxied_udp", "--force-webrtc-ip-handling-policy", # Ensures the policy is enforced ) - if not self._config.allow_webgl: + if not config.allow_webgl: flags += ( "--disable-webgl", "--disable-webgl-image-chromium", "--disable-webgl2", ) - if self._config.hide_canvas: + if config.hide_canvas: flags += ("--fingerprinting-canvas-image-data-noise",) super(StealthySessionMixin, self).__generate_options__(flags) diff --git a/scrapling/engines/_browsers/_controllers.py b/scrapling/engines/_browsers/_controllers.py index ff214386076f690559b0f07ee07b2a5bab8b6667..5d9e801da96aff97aa3b66fdff33efd592a069d2 100644 --- a/scrapling/engines/_browsers/_controllers.py +++ b/scrapling/engines/_browsers/_controllers.py @@ -1,22 +1,23 @@ +from time import sleep as time_sleep +from asyncio import sleep as asyncio_sleep + from playwright.sync_api import ( Locator, - Playwright, sync_playwright, ) from playwright.async_api import ( async_playwright, Locator as AsyncLocator, - Playwright as AsyncPlaywright, - BrowserContext as AsyncBrowserContext, ) from scrapling.core.utils import log -from scrapling.core._types import Unpack, TYPE_CHECKING -from ._types import PlaywrightSession, PlaywrightFetchParams -from ._base import SyncSession, AsyncSession, DynamicSessionMixin -from ._validators import validate_fetch as _validate, PlaywrightConfig +from scrapling.core._types import Optional, ProxyType, Unpack +from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error from scrapling.engines.toolbelt.convertor import Response, ResponseFactory from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer +from scrapling.engines._browsers._types import PlaywrightSession, PlaywrightFetchParams +from scrapling.engines._browsers._base import SyncSession, AsyncSession, DynamicSessionMixin +from 
scrapling.engines._browsers._validators import validate_fetch as _validate, PlaywrightConfig class DynamicSession(SyncSession, DynamicSessionMixin): @@ -25,13 +26,14 @@ class DynamicSession(SyncSession, DynamicSessionMixin): __slots__ = ( "_config", "_context_options", - "_launch_options", + "_browser_options", + "_user_data_dir", + "_headers_keys", "max_pages", "page_pool", "_max_wait_for_page", "playwright", "context", - "_closed", ) def __init__(self, **kwargs: Unpack[PlaywrightSession]): @@ -40,6 +42,7 @@ class DynamicSession(SyncSession, DynamicSessionMixin): :param headless: Run the browser in headless/hidden (default), or headful/visible mode. :param disable_resources: Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. + :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too). :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it. :param cookies: Set cookies for the next request. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. 
@@ -69,19 +72,30 @@ class DynamicSession(SyncSession, DynamicSessionMixin): def start(self): """Create a browser for this instance and context.""" if not self.playwright: - self.playwright: Playwright = sync_playwright().start() # pyright: ignore [reportAttributeAccessIssue] - - if self._config.cdp_url: # pragma: no cover - browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url) - self.context = browser.new_context(**self._context_options) - else: - self.context = self.playwright.chromium.launch_persistent_context(**self._launch_options) - - if self._config.init_script: # pragma: no cover - self.context.add_init_script(path=self._config.init_script) - - if self._config.cookies: # pragma: no cover - self.context.add_cookies(self._config.cookies) + self.playwright = sync_playwright().start() + + try: + if self._config.cdp_url: # pragma: no cover + self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url) + if not self._config.proxy_rotator and self.browser: + self.context = self.browser.new_context(**self._context_options) + elif self._config.proxy_rotator: + self.browser = self.playwright.chromium.launch(**self._browser_options) + else: + persistent_options = ( + self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir} + ) + self.context = self.playwright.chromium.launch_persistent_context(**persistent_options) + + if self.context: + self.context = self._initialize_context(self._config, self.context) + + self._is_alive = True + except Exception: + # Clean up playwright if browser setup fails + self.playwright.stop() + self.playwright = None + raise else: raise RuntimeError("Session has been already started") @@ -96,76 +110,107 @@ class DynamicSession(SyncSession, DynamicSessionMixin): :param extra_headers: A dictionary of extra headers to add to the request. 
_The referer set by the `google_search` argument takes priority over the referer set here if used together._ :param disable_resources: Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. + :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too). :param wait_selector: Wait for a specific CSS selector to be in a specific state. :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute. :param selector_config: The arguments that will be passed in the end while creating the final Selector's class. + :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it. :return: A `Response` object. 
""" + static_proxy = kwargs.pop("proxy", None) + params = _validate(kwargs, self, PlaywrightConfig) - if self._closed: # pragma: no cover + if not self._is_alive: # pragma: no cover raise RuntimeError("Context manager has been closed") + request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set() referer = ( - generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None + generate_convincing_referer(url) + if (params.google_search and "referer" not in request_headers_keys) + else None ) - page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources) - final_response = [None] - handle_response = self._create_response_handler(page_info, final_response) - - try: # pragma: no cover - # Navigate to URL and wait for a specified state - page_info.page.on("response", handle_response) - first_response = page_info.page.goto(url, referer=referer) - self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle) + for attempt in range(self._config.retries): + proxy: Optional[ProxyType] = None + if self._config.proxy_rotator and static_proxy is None: + proxy = self._config.proxy_rotator.get_proxy() + else: + proxy = static_proxy - if not first_response: - raise RuntimeError(f"Failed to get response for {url}") + with self._page_generator( + params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains + ) as page_info: + final_response = [None] + page = page_info.page + page.on("response", self._create_response_handler(page_info, final_response)) - if params.page_action: try: - _ = params.page_action(page_info.page) - except Exception as e: # pragma: no cover - log.error(f"Error executing page_action: {e}") + first_response = page.goto(url, referer=referer) + self._wait_for_page_stability(page, params.load_dom, params.network_idle) - if params.wait_selector: - try: - waiter: Locator = 
page_info.page.locator(params.wait_selector) - waiter.first.wait_for(state=params.wait_selector_state) - # Wait again after waiting for the selector, helpful with protections like Cloudflare - self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle) - except Exception as e: # pragma: no cover - log.error(f"Error waiting for selector {params.wait_selector}: {e}") + if not first_response: + raise RuntimeError(f"Failed to get response for {url}") - page_info.page.wait_for_timeout(params.wait) + if params.page_action: + try: + _ = params.page_action(page) + except Exception as e: # pragma: no cover + log.error(f"Error executing page_action: {e}") - # Create response object - response = ResponseFactory.from_playwright_response( - page_info.page, first_response, final_response[0], params.selector_config - ) + if params.wait_selector: + try: + waiter: Locator = page.locator(params.wait_selector) + waiter.first.wait_for(state=params.wait_selector_state) + self._wait_for_page_stability(page, params.load_dom, params.network_idle) + except Exception as e: # pragma: no cover + log.error(f"Error waiting for selector {params.wait_selector}: {e}") - # Close the page to free up resources - page_info.page.close() - self.page_pool.pages.remove(page_info) + page.wait_for_timeout(params.wait) - return response + response = ResponseFactory.from_playwright_response( + page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy} + ) + return response - except Exception as e: - page_info.mark_error() - raise e + except Exception as e: + page_info.mark_error() + if attempt < self._config.retries - 1: + if is_proxy_error(e): + log.warning( + f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..." + ) + else: + log.warning( + f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..." 
+ ) + time_sleep(self._config.retry_delay) + else: + log.error(f"Failed after {self._config.retries} attempts: {e}") + raise + + raise RuntimeError("Request failed") # pragma: no cover class AsyncDynamicSession(AsyncSession, DynamicSessionMixin): """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.""" + __slots__ = ( + "_config", + "_context_options", + "_browser_options", + "_user_data_dir", + "_headers_keys", + ) + def __init__(self, **kwargs: Unpack[PlaywrightSession]): """A Browser session manager with page pooling :param headless: Run the browser in headless/hidden (default), or headful/visible mode. :param disable_resources: Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. + :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too). :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it. :param cookies: Set cookies for the next request. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. 
@@ -193,24 +238,32 @@ class AsyncDynamicSession(AsyncSession, DynamicSessionMixin): self.__validate__(**kwargs) super().__init__(max_pages=self._config.max_pages) - async def start(self): + async def start(self) -> None: """Create a browser for this instance and context.""" if not self.playwright: - self.playwright: AsyncPlaywright = await async_playwright().start() # pyright: ignore [reportAttributeAccessIssue] - - if self._config.cdp_url: - browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url) - self.context: AsyncBrowserContext = await browser.new_context(**self._context_options) - else: - self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context( - **self._launch_options - ) - - if self._config.init_script: # pragma: no cover - await self.context.add_init_script(path=self._config.init_script) - - if self._config.cookies: - await self.context.add_cookies(self._config.cookies) # pyright: ignore + self.playwright = await async_playwright().start() + try: + if self._config.cdp_url: + self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url) + if not self._config.proxy_rotator and self.browser: + self.context = await self.browser.new_context(**self._context_options) + elif self._config.proxy_rotator: + self.browser = await self.playwright.chromium.launch(**self._browser_options) + else: + persistent_options = ( + self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir} + ) + self.context = await self.playwright.chromium.launch_persistent_context(**persistent_options) + + if self.context: + self.context = await self._initialize_context(self._config, self.context) + + self._is_alive = True + except Exception: + # Clean up playwright if browser setup fails + await self.playwright.stop() + self.playwright = None + raise else: raise RuntimeError("Session has been already started") @@ -225,68 +278,85 @@ class 
AsyncDynamicSession(AsyncSession, DynamicSessionMixin): :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._ :param disable_resources: Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. + :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too). :param wait_selector: Wait for a specific CSS selector to be in a specific state. :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute. :param selector_config: The arguments that will be passed in the end while creating the final Selector's class. + :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it. :return: A `Response` object. 
""" + static_proxy = kwargs.pop("proxy", None) + params = _validate(kwargs, self, PlaywrightConfig) - if self._closed: # pragma: no cover + if not self._is_alive: # pragma: no cover raise RuntimeError("Context manager has been closed") + request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set() referer = ( - generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None + generate_convincing_referer(url) + if (params.google_search and "referer" not in request_headers_keys) + else None ) - page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources) - final_response = [None] - handle_response = self._create_response_handler(page_info, final_response) - - if TYPE_CHECKING: - from playwright.async_api import Page as async_Page - - if not isinstance(page_info.page, async_Page): - raise TypeError - - try: - # Navigate to URL and wait for a specified state - page_info.page.on("response", handle_response) - first_response = await page_info.page.goto(url, referer=referer) - await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle) + for attempt in range(self._config.retries): + proxy: Optional[ProxyType] = None + if self._config.proxy_rotator and static_proxy is None: + proxy = self._config.proxy_rotator.get_proxy() + else: + proxy = static_proxy - if not first_response: - raise RuntimeError(f"Failed to get response for {url}") + async with self._page_generator( + params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains + ) as page_info: + final_response = [None] + page = page_info.page + page.on("response", self._create_response_handler(page_info, final_response)) - if params.page_action: try: - _ = await params.page_action(page_info.page) - except Exception as e: - log.error(f"Error executing page_action: {e}") + first_response = await page.goto(url, referer=referer) + await 
self._wait_for_page_stability(page, params.load_dom, params.network_idle) - if params.wait_selector: - try: - waiter: AsyncLocator = page_info.page.locator(params.wait_selector) - await waiter.first.wait_for(state=params.wait_selector_state) - # Wait again after waiting for the selector, helpful with protections like Cloudflare - await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle) - except Exception as e: - log.error(f"Error waiting for selector {params.wait_selector}: {e}") + if not first_response: + raise RuntimeError(f"Failed to get response for {url}") + + if params.page_action: + try: + _ = await params.page_action(page) + except Exception as e: # pragma: no cover + log.error(f"Error executing page_action: {e}") - await page_info.page.wait_for_timeout(params.wait) + if params.wait_selector: + try: + waiter: AsyncLocator = page.locator(params.wait_selector) + await waiter.first.wait_for(state=params.wait_selector_state) + await self._wait_for_page_stability(page, params.load_dom, params.network_idle) + except Exception as e: # pragma: no cover + log.error(f"Error waiting for selector {params.wait_selector}: {e}") - # Create response object - response = await ResponseFactory.from_async_playwright_response( - page_info.page, first_response, final_response[0], params.selector_config - ) + await page.wait_for_timeout(params.wait) - # Close the page to free up resources - await page_info.page.close() - self.page_pool.pages.remove(page_info) - return response + response = await ResponseFactory.from_async_playwright_response( + page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy} + ) + return response - except Exception as e: # pragma: no cover - page_info.mark_error() - raise e + except Exception as e: + page_info.mark_error() + if attempt < self._config.retries - 1: + if is_proxy_error(e): + log.warning( + f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in 
{self._config.retry_delay}s..." + ) + else: + log.warning( + f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..." + ) + await asyncio_sleep(self._config.retry_delay) + else: + log.error(f"Failed after {self._config.retries} attempts: {e}") + raise + + raise RuntimeError("Request failed") # pragma: no cover diff --git a/scrapling/engines/_browsers/_page.py b/scrapling/engines/_browsers/_page.py index 655d3d179082c0f6f8e6f7b56a972ad7ca659311..481016e0028e4488306775313da5fc4979f3552c 100644 --- a/scrapling/engines/_browsers/_page.py +++ b/scrapling/engines/_browsers/_page.py @@ -61,7 +61,9 @@ class PagePool: raise RuntimeError(f"Maximum page limit ({self.max_pages}) reached") if isinstance(page, AsyncPage): - page_info = cast(PageInfo[AsyncPage], PageInfo(page, "ready", "")) + page_info: PageInfo[SyncPage] | PageInfo[AsyncPage] = cast( + PageInfo[AsyncPage], PageInfo(page, "ready", "") + ) else: page_info = cast(PageInfo[SyncPage], PageInfo(page, "ready", "")) diff --git a/scrapling/engines/_browsers/_stealth.py b/scrapling/engines/_browsers/_stealth.py index 912d38e050bc5390f3faba394d69aad00854c637..3c9ea58147d45dafe63cb28323e3f3419560737c 100644 --- a/scrapling/engines/_browsers/_stealth.py +++ b/scrapling/engines/_browsers/_stealth.py @@ -1,28 +1,26 @@ from random import randint from re import compile as re_compile +from time import sleep as time_sleep +from asyncio import sleep as asyncio_sleep -from playwright.sync_api import ( - Locator, - Page, - Playwright, -) +from playwright.sync_api import Locator, Page, BrowserContext from playwright.async_api import ( Page as async_Page, Locator as AsyncLocator, - Playwright as AsyncPlaywright, BrowserContext as AsyncBrowserContext, ) from patchright.sync_api import sync_playwright from patchright.async_api import async_playwright from scrapling.core.utils import log -from scrapling.core._types import Any, Unpack -from ._config_tools import _compiled_stealth_scripts -from ._types import 
StealthSession, StealthFetchParams -from ._base import SyncSession, AsyncSession, StealthySessionMixin -from ._validators import validate_fetch as _validate, StealthConfig +from scrapling.core._types import Any, Optional, ProxyType, Unpack +from scrapling.engines.toolbelt.proxy_rotation import is_proxy_error from scrapling.engines.toolbelt.convertor import Response, ResponseFactory from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer +from scrapling.engines._browsers._config_tools import _compiled_stealth_scripts +from scrapling.engines._browsers._types import StealthSession, StealthFetchParams +from scrapling.engines._browsers._base import SyncSession, AsyncSession, StealthySessionMixin +from scrapling.engines._browsers._validators import validate_fetch as _validate, StealthConfig __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*") @@ -33,13 +31,14 @@ class StealthySession(SyncSession, StealthySessionMixin): __slots__ = ( "_config", "_context_options", - "_launch_options", + "_browser_options", + "_user_data_dir", + "_headers_keys", "max_pages", "page_pool", "_max_wait_for_page", "playwright", "context", - "_closed", ) def __init__(self, **kwargs: Unpack[StealthSession]): @@ -48,6 +47,7 @@ class StealthySession(SyncSession, StealthySessionMixin): :param headless: Run the browser in headless/hidden (default), or headful/visible mode. :param disable_resources: Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. + :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too). :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it. :param cookies: Set cookies for the next request. 
:param network_idle: Wait for the page until there are no network connections for at least 500 ms. @@ -78,28 +78,45 @@ class StealthySession(SyncSession, StealthySessionMixin): self.__validate__(**kwargs) super().__init__() - def start(self): + def start(self) -> None: """Create a browser for this instance and context.""" if not self.playwright: - self.playwright: Playwright = sync_playwright().start() # pyright: ignore [reportAttributeAccessIssue] - - if self._config.cdp_url: # pragma: no cover - browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url) - self.context = browser.new_context(**self._context_options) - else: - self.context = self.playwright.chromium.launch_persistent_context(**self._launch_options) - - for script in _compiled_stealth_scripts(): - self.context.add_init_script(script=script) - - if self._config.init_script: # pragma: no cover - self.context.add_init_script(path=self._config.init_script) - - if self._config.cookies: # pragma: no cover - self.context.add_cookies(self._config.cookies) + self.playwright = sync_playwright().start() + + try: + if self._config.cdp_url: # pragma: no cover + self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url) + if not self._config.proxy_rotator: + assert self.browser is not None + self.context = self.browser.new_context(**self._context_options) + elif self._config.proxy_rotator: + self.browser = self.playwright.chromium.launch(**self._browser_options) + else: + persistent_options = ( + self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir} + ) + self.context = self.playwright.chromium.launch_persistent_context(**persistent_options) + + if self.context: + self.context = self._initialize_context(self._config, self.context) + + self._is_alive = True + except Exception: + # Clean up playwright if browser setup fails + self.playwright.stop() + self.playwright = None + raise else: raise RuntimeError("Session has been 
already started") + def _initialize_context(self, config, ctx: BrowserContext) -> BrowserContext: + """Initialize the browser context.""" + for script in _compiled_stealth_scripts(): + ctx.add_init_script(script=script) + + ctx = super()._initialize_context(config, ctx) + return ctx + def _cloudflare_solver(self, page: Page) -> None: # pragma: no cover """Solve the cloudflare challenge displayed on the playwright page passed @@ -129,7 +146,7 @@ class StealthySession(SyncSession, StealthySessionMixin): # Waiting for the verify spinner to disappear, checking every 1s if it disappeared page.wait_for_timeout(500) - outer_box = {} + outer_box: Any = {} iframe = page.frame(url=__CF_PATTERN__) if iframe is not None: self._wait_for_page_stability(iframe, True, False) @@ -139,14 +156,14 @@ class StealthySession(SyncSession, StealthySessionMixin): # Double-checking that the iframe is loaded page.wait_for_timeout(500) - outer_box: Any = iframe.frame_element().bounding_box() + outer_box = iframe.frame_element().bounding_box() if not iframe or not outer_box: if "Just a moment..." not in (ResponseFactory._get_page_content(page)): log.info("Cloudflare captcha is solved") return - outer_box: Any = page.locator(box_selector).last.bounding_box() + outer_box = page.locator(box_selector).last.bounding_box() # Calculate the Captcha coordinates for any viewport captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27) @@ -182,82 +199,113 @@ class StealthySession(SyncSession, StealthySessionMixin): :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._ :param disable_resources: Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. 
+ :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too). :param wait_selector: Wait for a specific CSS selector to be in a specific state. :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute. :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you. :param selector_config: The arguments that will be passed in the end while creating the final Selector's class. + :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it. :return: A `Response` object. """ + static_proxy = kwargs.pop("proxy", None) + params = _validate(kwargs, self, StealthConfig) - if self._closed: # pragma: no cover + if not self._is_alive: # pragma: no cover raise RuntimeError("Context manager has been closed") + request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set() referer = ( - generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None + generate_convincing_referer(url) + if (params.google_search and "referer" not in request_headers_keys) + else None ) - page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources) - final_response = [None] - handle_response = self._create_response_handler(page_info, final_response) - - try: # pragma: no cover - # Navigate to URL and wait for a specified state - page_info.page.on("response", handle_response) - first_response = page_info.page.goto(url, referer=referer) - self._wait_for_page_stability(page_info.page, params.load_dom, 
params.network_idle) - - if not first_response: - raise RuntimeError(f"Failed to get response for {url}") - - if params.solve_cloudflare: - self._cloudflare_solver(page_info.page) - # Make sure the page is fully loaded after the captcha - self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle) + for attempt in range(self._config.retries): + proxy: Optional[ProxyType] = None + if self._config.proxy_rotator and static_proxy is None: + proxy = self._config.proxy_rotator.get_proxy() + else: + proxy = static_proxy - if params.page_action: - try: - _ = params.page_action(page_info.page) - except Exception as e: # pragma: no cover - log.error(f"Error executing page_action: {e}") + with self._page_generator( + params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains + ) as page_info: + final_response = [None] + page = page_info.page + page.on("response", self._create_response_handler(page_info, final_response)) - if params.wait_selector: try: - waiter: Locator = page_info.page.locator(params.wait_selector) - waiter.first.wait_for(state=params.wait_selector_state) - # Wait again after waiting for the selector, helpful with protections like Cloudflare - self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle) - except Exception as e: # pragma: no cover - log.error(f"Error waiting for selector {params.wait_selector}: {e}") - - page_info.page.wait_for_timeout(params.wait) - - # Create response object - response = ResponseFactory.from_playwright_response( - page_info.page, first_response, final_response[0], params.selector_config - ) + first_response = page.goto(url, referer=referer) + self._wait_for_page_stability(page, params.load_dom, params.network_idle) + + if not first_response: + raise RuntimeError(f"Failed to get response for {url}") + + if params.solve_cloudflare: + self._cloudflare_solver(page) + # Make sure the page is fully loaded after the captcha + 
self._wait_for_page_stability(page, params.load_dom, params.network_idle) + + if params.page_action: + try: + _ = params.page_action(page) + except Exception as e: # pragma: no cover + log.error(f"Error executing page_action: {e}") + + if params.wait_selector: + try: + waiter: Locator = page.locator(params.wait_selector) + waiter.first.wait_for(state=params.wait_selector_state) + self._wait_for_page_stability(page, params.load_dom, params.network_idle) + except Exception as e: # pragma: no cover + log.error(f"Error waiting for selector {params.wait_selector}: {e}") + + page.wait_for_timeout(params.wait) + + response = ResponseFactory.from_playwright_response( + page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy} + ) + return response - # Close the page to free up resources - page_info.page.close() - self.page_pool.pages.remove(page_info) - - return response - - except Exception as e: - page_info.mark_error() - raise e + except Exception as e: + page_info.mark_error() + if attempt < self._config.retries - 1: + if is_proxy_error(e): + log.warning( + f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..." + ) + else: + log.warning( + f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..." + ) + time_sleep(self._config.retry_delay) + else: + log.error(f"Failed after {self._config.retries} attempts: {e}") + raise + + raise RuntimeError("Request failed") # pragma: no cover class AsyncStealthySession(AsyncSession, StealthySessionMixin): """An async Stealthy Browser session manager with page pooling.""" + __slots__ = ( + "_config", + "_context_options", + "_browser_options", + "_user_data_dir", + "_headers_keys", + ) + def __init__(self, **kwargs: Unpack[StealthSession]): """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory. 
:param headless: Run the browser in headless/hidden (default), or headful/visible mode. :param disable_resources: Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. + :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too). :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it. :param cookies: Set cookies for the next request. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. @@ -288,30 +336,44 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin): self.__validate__(**kwargs) super().__init__(max_pages=self._config.max_pages) - async def start(self): + async def start(self) -> None: """Create a browser for this instance and context.""" if not self.playwright: - self.playwright: AsyncPlaywright = await async_playwright().start() # pyright: ignore [reportAttributeAccessIssue] - - if self._config.cdp_url: - browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url) - self.context: AsyncBrowserContext = await browser.new_context(**self._context_options) - else: - self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context( - **self._launch_options - ) - - for script in _compiled_stealth_scripts(): - await self.context.add_init_script(script=script) - - if self._config.init_script: # pragma: no cover - await self.context.add_init_script(path=self._config.init_script) - - if self._config.cookies: - await self.context.add_cookies(self._config.cookies) # pyright: ignore + self.playwright = await async_playwright().start() + try: + if self._config.cdp_url: + self.browser = await 
self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url) + if not self._config.proxy_rotator: + assert self.browser is not None + self.context = await self.browser.new_context(**self._context_options) + elif self._config.proxy_rotator: + self.browser = await self.playwright.chromium.launch(**self._browser_options) + else: + persistent_options = ( + self._browser_options | self._context_options | {"user_data_dir": self._user_data_dir} + ) + self.context = await self.playwright.chromium.launch_persistent_context(**persistent_options) + + if self.context: + self.context = await self._initialize_context(self._config, self.context) + + self._is_alive = True + except Exception: + # Clean up playwright if browser setup fails + await self.playwright.stop() + self.playwright = None + raise else: raise RuntimeError("Session has been already started") + async def _initialize_context(self, config: Any, ctx: AsyncBrowserContext) -> AsyncBrowserContext: + """Initialize the browser context.""" + for script in _compiled_stealth_scripts(): + await ctx.add_init_script(script=script) + + ctx = await super()._initialize_context(config, ctx) + return ctx + async def _cloudflare_solver(self, page: async_Page) -> None: # pragma: no cover """Solve the cloudflare challenge displayed on the playwright page passed @@ -341,7 +403,7 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin): # Waiting for the verify spinner to disappear, checking every 1s if it disappeared await page.wait_for_timeout(500) - outer_box = {} + outer_box: Any = {} iframe = page.frame(url=__CF_PATTERN__) if iframe is not None: await self._wait_for_page_stability(iframe, True, False) @@ -351,14 +413,14 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin): # Double-checking that the iframe is loaded await page.wait_for_timeout(500) - outer_box: Any = await (await iframe.frame_element()).bounding_box() + outer_box = await (await iframe.frame_element()).bounding_box() if not 
iframe or not outer_box: if "Just a moment..." not in (await ResponseFactory._get_async_page_content(page)): log.info("Cloudflare captcha is solved") return - outer_box: Any = await page.locator(box_selector).last.bounding_box() + outer_box = await page.locator(box_selector).last.bounding_box() # Calculate the Captcha coordinates for any viewport captcha_x, captcha_y = outer_box["x"] + randint(26, 28), outer_box["y"] + randint(25, 27) @@ -394,68 +456,91 @@ class AsyncStealthySession(AsyncSession, StealthySessionMixin): :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._ :param disable_resources: Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. + :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too). :param wait_selector: Wait for a specific CSS selector to be in a specific state. :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute. :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you. :param selector_config: The arguments that will be passed in the end while creating the final Selector's class. + :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it. :return: A `Response` object. 
""" + static_proxy = kwargs.pop("proxy", None) + params = _validate(kwargs, self, StealthConfig) - if self._closed: # pragma: no cover + if not self._is_alive: # pragma: no cover raise RuntimeError("Context manager has been closed") + request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set() referer = ( - generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None + generate_convincing_referer(url) + if (params.google_search and "referer" not in request_headers_keys) + else None ) - page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources) - final_response = [None] - handle_response = self._create_response_handler(page_info, final_response) - - try: - # Navigate to URL and wait for a specified state - page_info.page.on("response", handle_response) - first_response = await page_info.page.goto(url, referer=referer) - await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle) - - if not first_response: - raise RuntimeError(f"Failed to get response for {url}") + for attempt in range(self._config.retries): + proxy: Optional[ProxyType] = None + if self._config.proxy_rotator and static_proxy is None: + proxy = self._config.proxy_rotator.get_proxy() + else: + proxy = static_proxy - if params.solve_cloudflare: - await self._cloudflare_solver(page_info.page) - # Make sure the page is fully loaded after the captcha - await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle) + async with self._page_generator( + params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains + ) as page_info: + final_response = [None] + page = page_info.page + page.on("response", self._create_response_handler(page_info, final_response)) - if params.page_action: try: - _ = await params.page_action(page_info.page) - except Exception as e: - log.error(f"Error executing 
page_action: {e}") + first_response = await page.goto(url, referer=referer) + await self._wait_for_page_stability(page, params.load_dom, params.network_idle) + + if not first_response: + raise RuntimeError(f"Failed to get response for {url}") + + if params.solve_cloudflare: + await self._cloudflare_solver(page) + # Make sure the page is fully loaded after the captcha + await self._wait_for_page_stability(page, params.load_dom, params.network_idle) + + if params.page_action: + try: + _ = await params.page_action(page) + except Exception as e: # pragma: no cover + log.error(f"Error executing page_action: {e}") + + if params.wait_selector: + try: + waiter: AsyncLocator = page.locator(params.wait_selector) + await waiter.first.wait_for(state=params.wait_selector_state) + await self._wait_for_page_stability(page, params.load_dom, params.network_idle) + except Exception as e: # pragma: no cover + log.error(f"Error waiting for selector {params.wait_selector}: {e}") + + await page.wait_for_timeout(params.wait) + + response = await ResponseFactory.from_async_playwright_response( + page, first_response, final_response[0], params.selector_config, meta={"proxy": proxy} + ) + return response - if params.wait_selector: - try: - waiter: AsyncLocator = page_info.page.locator(params.wait_selector) - await waiter.first.wait_for(state=params.wait_selector_state) - # Wait again after waiting for the selector, helpful with protections like Cloudflare - await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle) except Exception as e: - log.error(f"Error waiting for selector {params.wait_selector}: {e}") - - await page_info.page.wait_for_timeout(params.wait) - - # Create response object - response = await ResponseFactory.from_async_playwright_response( - page_info.page, first_response, final_response[0], params.selector_config - ) - - # Close the page to free up resources - await page_info.page.close() - self.page_pool.pages.remove(page_info) - return 
response - - except Exception as e: # pragma: no cover - page_info.mark_error() - raise e + page_info.mark_error() + if attempt < self._config.retries - 1: + if is_proxy_error(e): + log.warning( + f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s..." + ) + else: + log.warning( + f"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s..." + ) + await asyncio_sleep(self._config.retry_delay) + else: + log.error(f"Failed after {self._config.retries} attempts: {e}") + raise + + raise RuntimeError("Request failed") # pragma: no cover diff --git a/scrapling/engines/_browsers/_types.py b/scrapling/engines/_browsers/_types.py index cb8d760fcba8cadddc45bfc15e38721e9bb5901e..30d6af1ab0eb8d864d58fc2cd5ee0bdabdf6b6bf 100644 --- a/scrapling/engines/_browsers/_types.py +++ b/scrapling/engines/_browsers/_types.py @@ -1,3 +1,5 @@ +from io import BytesIO + from curl_cffi.requests import ( ProxySpec, CookieTypes, @@ -7,6 +9,7 @@ from curl_cffi.requests import ( from scrapling.core._types import ( Dict, List, + Set, Tuple, Mapping, Optional, @@ -16,97 +19,100 @@ from scrapling.core._types import ( TypeAlias, SetCookieParam, SelectorWaitStates, - TYPE_CHECKING, ) +from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator # Type alias for `impersonate` parameter - accepts a single browser or list of browsers ImpersonateType: TypeAlias = BrowserTypeLiteral | List[BrowserTypeLiteral] | None -if TYPE_CHECKING: # pragma: no cover - # Types for session initialization - class RequestsSession(TypedDict, total=False): - impersonate: ImpersonateType - http3: Optional[bool] - stealthy_headers: Optional[bool] - proxies: Optional[ProxySpec] - proxy: Optional[str] - proxy_auth: Optional[Tuple[str, str]] - timeout: Optional[int | float] - headers: Optional[Mapping[str, Optional[str]]] - retries: Optional[int] - retry_delay: Optional[int] - follow_redirects: Optional[bool] - max_redirects: Optional[int] - verify: Optional[bool] - 
cert: Optional[str | Tuple[str, str]] - selector_config: Optional[Dict] - - # Types for GET request method parameters - class GetRequestParams(RequestsSession, total=False): - params: Optional[Dict | List | Tuple] - cookies: Optional[CookieTypes] - auth: Optional[Tuple[str, str]] - - # Types for POST/PUT/DELETE request method parameters - class DataRequestParams(GetRequestParams, total=False): - data: Optional[Dict | str] - json: Optional[Dict | List] - - # Types for browser session - class PlaywrightSession(TypedDict, total=False): - max_pages: int - headless: bool - disable_resources: bool - network_idle: bool - load_dom: bool - wait_selector: Optional[str] - wait_selector_state: SelectorWaitStates - cookies: Sequence[SetCookieParam] | None - google_search: bool - wait: int | float - timezone_id: str | None - page_action: Optional[Callable] - proxy: Optional[str | Dict[str, str] | Tuple] - extra_headers: Optional[Dict[str, str]] - timeout: int | float - init_script: Optional[str] - user_data_dir: str - selector_config: Optional[Dict] - additional_args: Optional[Dict] - locale: Optional[str] - real_chrome: bool - cdp_url: Optional[str] - useragent: Optional[str] - extra_flags: Optional[List[str]] - - class PlaywrightFetchParams(TypedDict, total=False): - load_dom: bool - wait: int | float - network_idle: bool - google_search: bool - timeout: int | float - disable_resources: bool - wait_selector: Optional[str] - page_action: Optional[Callable] - selector_config: Optional[Dict] - extra_headers: Optional[Dict[str, str]] - wait_selector_state: SelectorWaitStates - - class StealthSession(PlaywrightSession, total=False): - allow_webgl: bool - hide_canvas: bool - block_webrtc: bool - solve_cloudflare: bool - - class StealthFetchParams(PlaywrightFetchParams, total=False): - solve_cloudflare: bool - -else: # pragma: no cover - RequestsSession = TypedDict - GetRequestParams = TypedDict - DataRequestParams = TypedDict - PlaywrightSession = TypedDict - PlaywrightFetchParams = 
TypedDict - StealthSession = TypedDict - StealthFetchParams = TypedDict +# Types for session initialization +class RequestsSession(TypedDict, total=False): + impersonate: ImpersonateType + http3: Optional[bool] + stealthy_headers: Optional[bool] + proxies: Optional[ProxySpec] + proxy: Optional[str] + proxy_auth: Optional[Tuple[str, str]] + proxy_rotator: Optional[ProxyRotator] + timeout: Optional[int | float] + headers: Optional[Mapping[str, Optional[str]]] + retries: Optional[int] + retry_delay: Optional[int] + follow_redirects: Optional[bool] + max_redirects: Optional[int] + verify: Optional[bool] + cert: Optional[str | Tuple[str, str]] + selector_config: Optional[Dict] + + +# Types for GET request method parameters +class GetRequestParams(RequestsSession, total=False): + params: Optional[Dict | List | Tuple] + cookies: Optional[CookieTypes] + auth: Optional[Tuple[str, str]] + + +# Types for POST/PUT/DELETE request method parameters +class DataRequestParams(GetRequestParams, total=False): + data: Optional[Dict[str, str] | List[Tuple] | str | BytesIO | bytes] + json: Optional[Dict | List] + + +# Types for browser session +class PlaywrightSession(TypedDict, total=False): + max_pages: int + headless: bool + disable_resources: bool + network_idle: bool + load_dom: bool + wait_selector: Optional[str] + wait_selector_state: SelectorWaitStates + cookies: Sequence[SetCookieParam] | None + google_search: bool + wait: int | float + timezone_id: str | None + page_action: Optional[Callable] + proxy: Optional[str | Dict[str, str] | Tuple] + proxy_rotator: Optional[ProxyRotator] + extra_headers: Optional[Dict[str, str]] + timeout: int | float + init_script: Optional[str] + user_data_dir: str + selector_config: Optional[Dict] + additional_args: Optional[Dict] + locale: Optional[str] + real_chrome: bool + cdp_url: Optional[str] + useragent: Optional[str] + extra_flags: Optional[List[str]] + blocked_domains: Optional[Set[str]] + retries: int + retry_delay: int | float + + +class 
PlaywrightFetchParams(TypedDict, total=False): + load_dom: bool + wait: int | float + network_idle: bool + google_search: bool + timeout: int | float + disable_resources: bool + wait_selector: Optional[str] + page_action: Optional[Callable] + selector_config: Optional[Dict] + extra_headers: Optional[Dict[str, str]] + wait_selector_state: SelectorWaitStates + blocked_domains: Optional[Set[str]] + proxy: Optional[str | Dict[str, str]] + + +class StealthSession(PlaywrightSession, total=False): + allow_webgl: bool + hide_canvas: bool + block_webrtc: bool + solve_cloudflare: bool + + +class StealthFetchParams(PlaywrightFetchParams, total=False): + solve_cloudflare: bool diff --git a/scrapling/engines/_browsers/_validators.py b/scrapling/engines/_browsers/_validators.py index 82270ef119dde18e4fa69bc5a8772f6ab0ab1536..e2424f49b34835cf53387e3f7b7be35882d0b25f 100644 --- a/scrapling/engines/_browsers/_validators.py +++ b/scrapling/engines/_browsers/_validators.py @@ -10,6 +10,7 @@ from scrapling.core._types import ( Any, Dict, List, + Set, Tuple, Optional, Callable, @@ -18,6 +19,7 @@ from scrapling.core._types import ( SetCookieParam, SelectorWaitStates, ) +from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator from scrapling.engines.toolbelt.navigation import construct_proxy_dict from scrapling.engines._browsers._types import PlaywrightFetchParams, StealthFetchParams @@ -50,6 +52,7 @@ def _is_invalid_cdp_url(cdp_url: str) -> bool | str: # Type aliases for cleaner annotations PagesCount = Annotated[int, Meta(ge=1, le=50)] +RetriesCount = Annotated[int, Meta(ge=1, le=10)] Seconds = Annotated[int, float, Meta(ge=0)] @@ -69,6 +72,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True): timezone_id: str | None = "" page_action: Optional[Callable] = None proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None` + proxy_rotator: Optional[ProxyRotator] = None extra_headers: 
Optional[Dict[str, str]] = None timeout: Seconds = 30000 init_script: Optional[str] = None @@ -80,11 +84,19 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True): cdp_url: Optional[str] = None useragent: Optional[str] = None extra_flags: Optional[List[str]] = None + blocked_domains: Optional[Set[str]] = None + retries: RetriesCount = 3 + retry_delay: Seconds = 1 def __post_init__(self): # pragma: no cover """Custom validation after msgspec validation""" if self.page_action and not callable(self.page_action): raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}") + if self.proxy and self.proxy_rotator: + raise ValueError( + "Cannot use 'proxy_rotator' together with 'proxy'. " + "Use either a static proxy or proxy rotation, not both." + ) if self.proxy: self.proxy = construct_proxy_dict(self.proxy) if self.cdp_url: @@ -135,6 +147,7 @@ class _fetch_params: wait_selector_state: SelectorWaitStates network_idle: bool load_dom: bool + blocked_domains: Optional[Set[str]] solve_cloudflare: bool selector_config: Dict @@ -144,15 +157,16 @@ def validate_fetch( session: Any, model: type[PlaywrightConfig] | type[StealthConfig], ) -> _fetch_params: # pragma: no cover - result = {} - overrides = {} + result: Dict[str, Any] = {} + overrides: Dict[str, Any] = {} + kwargs_dict: Dict[str, Any] = dict(method_kwargs) # Get all field names that _fetch_params needs fetch_param_fields = {f.name for f in fields(_fetch_params)} for key in fetch_param_fields: - if key in method_kwargs: - overrides[key] = method_kwargs[key] + if key in kwargs_dict: + overrides[key] = kwargs_dict[key] elif hasattr(session, "_config") and hasattr(session._config, key): result[key] = getattr(session._config, key) @@ -173,6 +187,7 @@ def validate_fetch( # solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig) result.setdefault("solve_cloudflare", False) + result.setdefault("blocked_domains", None) return _fetch_params(**result) 
diff --git a/scrapling/engines/constants.py b/scrapling/engines/constants.py index 39e39074fad9d21824e025204d533adc708bc521..4bc00149f977d3cdcab880428f36f5cad7bedfde 100644 --- a/scrapling/engines/constants.py +++ b/scrapling/engines/constants.py @@ -1,5 +1,5 @@ # Disable loading these resources for speed -DEFAULT_DISABLED_RESOURCES = { +EXTRA_RESOURCES = { "font", "image", "media", @@ -12,7 +12,7 @@ DEFAULT_DISABLED_RESOURCES = { "stylesheet", } -HARMFUL_DEFAULT_ARGS = ( +HARMFUL_ARGS = ( # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884 "--enable-automation", "--disable-popup-blocking", @@ -21,7 +21,7 @@ HARMFUL_DEFAULT_ARGS = ( "--disable-extensions", ) -DEFAULT_FLAGS = ( +DEFAULT_ARGS = ( # Speed up chromium browsers by default "--no-pings", "--no-first-run", @@ -30,12 +30,13 @@ DEFAULT_FLAGS = ( "--no-service-autorun", "--homepage=about:blank", "--password-store=basic", + "--disable-hang-monitor", "--no-default-browser-check", "--disable-session-crashed-bubble", "--disable-search-engine-choice-screen", ) -DEFAULT_STEALTH_FLAGS = ( +STEALTH_ARGS = ( # Explanation: https://peter.sh/experiments/chromium-command-line-switches/ # Generally this will make the browser faster and less detectable # "--incognito", @@ -56,7 +57,6 @@ DEFAULT_STEALTH_FLAGS = ( "--ignore-gpu-blocklist", "--enable-tcp-fast-open", "--enable-web-bluetooth", - "--disable-hang-monitor", "--disable-cloud-import", "--disable-print-preview", "--disable-dev-shm-usage", @@ -83,18 +83,17 @@ DEFAULT_STEALTH_FLAGS = ( "--prerender-from-omnibox=disabled", "--safebrowsing-disable-auto-update", "--disable-offer-upload-credit-cards", - "--disable-features=site-per-process", "--disable-background-timer-throttling", "--disable-new-content-rendering-timeout", "--run-all-compositor-stages-before-draw", "--disable-client-side-phishing-detection", "--disable-backgrounding-occluded-windows", 
"--disable-layer-tree-host-memory-pressure", - "--autoplay-policy=no-user-gesture-required", + "--autoplay-policy=user-gesture-required", "--disable-offer-store-unmasked-wallet-cards", "--disable-blink-features=AutomationControlled", "--disable-component-extensions-with-background-pages", "--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance", "--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4", - "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees", + "--disable-features=AudioServiceOutOfProcess,TranslateUI,BlinkGenPropertyTrees", ) diff --git a/scrapling/engines/static.py b/scrapling/engines/static.py index c7afa79526ddbae65c87e94147ee3e46a3b11372..204c10560ebffbba35ab638b727c6d403f8dc17d 100644 --- a/scrapling/engines/static.py +++ b/scrapling/engines/static.py @@ -22,9 +22,10 @@ from scrapling.core._types import ( SUPPORTED_HTTP_METHODS, ) -from ._browsers._types import RequestsSession, GetRequestParams, DataRequestParams, ImpersonateType from .toolbelt.custom import Response from .toolbelt.convertor import ResponseFactory +from .toolbelt.proxy_rotation import ProxyRotator, is_proxy_error +from ._browsers._types import RequestsSession, GetRequestParams, DataRequestParams, ImpersonateType from .toolbelt.fingerprints import generate_convincing_referer, generate_headers, __default_useragent__ _NO_SESSION: Any = object() @@ -62,6 +63,8 @@ class _ConfigurationLogic(ABC): "_default_cert", "_default_http3", "selector_config", + "_is_alive", + "_proxy_rotator", ) def __init__(self, **kwargs: Unpack[RequestsSession]): @@ -80,6 +83,14 @@ class _ConfigurationLogic(ABC): self._default_cert = kwargs.get("cert") or None self._default_http3 = kwargs.get("http3", False) self.selector_config = kwargs.get("selector_config") or {} + self._is_alive = False + self._proxy_rotator: Optional[ProxyRotator] = 
kwargs.get("proxy_rotator") + + if self._proxy_rotator and (self._default_proxy or self._default_proxies): + raise ValueError( + "Cannot use 'proxy_rotator' together with 'proxy' or 'proxies'. " + "Use either a static proxy or proxy rotation, not both." + ) @staticmethod def _get_param(kwargs: Dict, key: str, default: Any) -> Any: @@ -134,6 +145,9 @@ class _ConfigurationLogic(ABC): "retries", "retry_delay", "selector_config", + # Browser session params (ignored by HTTP sessions) + "extra_headers", + "google_search", } for k, v in method_kwargs.items(): if k not in skip_keys and v is not None: @@ -183,10 +197,11 @@ class _SyncSessionLogic(_ConfigurationLogic): def __enter__(self): """Creates and returns a new synchronous Fetcher Session""" - if self._curl_session: + if self._is_alive: raise RuntimeError("This FetcherSession instance already has an active synchronous session.") self._curl_session = CurlSession() + self._is_alive = True return self def __exit__(self, exc_type, exc_val, exc_tb): @@ -201,7 +216,9 @@ class _SyncSessionLogic(_ConfigurationLogic): self._curl_session.close() self._curl_session = None - def __make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response: + self._is_alive = False + + def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response: """ Perform an HTTP request using the configured session. 
""" @@ -210,7 +227,7 @@ class _SyncSessionLogic(_ConfigurationLogic): selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config max_retries = self._get_param(kwargs, "retries", self._default_retries) retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay) - request_args = self._merge_request_args(stealth=stealth, **kwargs) + static_proxy = kwargs.pop("proxy", None) session = self._curl_session one_off_request = False @@ -220,22 +237,38 @@ class _SyncSessionLogic(_ConfigurationLogic): session = CurlSession() one_off_request = True - if session: + if not session: + raise RuntimeError("No active session available.") # pragma: no cover + + try: for attempt in range(max_retries): + if self._proxy_rotator and static_proxy is None: + proxy = self._proxy_rotator.get_proxy() + else: + proxy = static_proxy + + request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs) try: response = session.request(method, **request_args) - result = ResponseFactory.from_http_request(response, selector_config) + result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy}) return result except CurlError as e: # pragma: no cover if attempt < max_retries - 1: - log.error(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...") + # Now if the rotator is enabled, we will try again with the new proxy + # If it's not enabled, then we will try again with the same proxy + if is_proxy_error(e): + log.warning( + f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds..." + ) + else: + log.warning(f"Attempt {attempt + 1} failed: {e}. 
Retrying in {retry_delay} seconds...") time_sleep(retry_delay) else: log.error(f"Failed after {max_retries} attempts: {e}") raise # Raise the exception if all retries fail - finally: - if session and one_off_request: - session.close() + finally: + if session and one_off_request: + session.close() raise RuntimeError("No active session available.") # pragma: no cover @@ -267,7 +300,7 @@ class _SyncSessionLogic(_ConfigurationLogic): :return: A `Response` object. """ stealthy_headers = kwargs.pop("stealthy_headers", None) - return self.__make_request("GET", stealth=stealthy_headers, url=url, **kwargs) + return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs) def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response: """ @@ -299,7 +332,7 @@ class _SyncSessionLogic(_ConfigurationLogic): :return: A `Response` object. """ stealthy_headers = kwargs.pop("stealthy_headers", None) - return self.__make_request("POST", stealth=stealthy_headers, url=url, **kwargs) + return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs) def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response: """ @@ -331,7 +364,7 @@ class _SyncSessionLogic(_ConfigurationLogic): :return: A `Response` object. """ stealthy_headers = kwargs.pop("stealthy_headers", None) - return self.__make_request("PUT", stealth=stealthy_headers, url=url, **kwargs) + return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs) def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response: """ @@ -365,7 +398,7 @@ class _SyncSessionLogic(_ConfigurationLogic): # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5, # But some websites accept it, it depends on the implementation used. 
stealthy_headers = kwargs.pop("stealthy_headers", None) - return self.__make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs) + return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs) class _ASyncSessionLogic(_ConfigurationLogic): @@ -377,10 +410,11 @@ class _ASyncSessionLogic(_ConfigurationLogic): async def __aenter__(self): # pragma: no cover """Creates and returns a new asynchronous Session.""" - if self._async_curl_session: + if self._is_alive: raise RuntimeError("This FetcherSession instance already has an active asynchronous session.") self._async_curl_session = AsyncCurlSession() + self._is_alive = True return self async def __aexit__(self, exc_type, exc_val, exc_tb): @@ -395,9 +429,9 @@ class _ASyncSessionLogic(_ConfigurationLogic): await self._async_curl_session.close() self._async_curl_session = None - async def __make_request( - self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs - ) -> Response: + self._is_alive = False + + async def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response: """ Perform an HTTP request using the configured session. 
""" @@ -406,7 +440,7 @@ class _ASyncSessionLogic(_ConfigurationLogic): selector_config = self._get_param(kwargs, "selector_config", self.selector_config) or self.selector_config max_retries = self._get_param(kwargs, "retries", self._default_retries) retry_delay = self._get_param(kwargs, "retry_delay", self._default_retry_delay) - request_args = self._merge_request_args(stealth=stealth, **kwargs) + static_proxy = kwargs.pop("proxy", None) session = self._async_curl_session one_off_request = False @@ -418,22 +452,40 @@ class _ASyncSessionLogic(_ConfigurationLogic): session = AsyncCurlSession() one_off_request = True - if session: + if not session: + raise RuntimeError("No active session available.") # pragma: no cover + + try: + # Determine if we should use proxy rotation for attempt in range(max_retries): + if self._proxy_rotator and static_proxy is None: + proxy = self._proxy_rotator.get_proxy() + else: + proxy = static_proxy + + request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs) try: response = await session.request(method, **request_args) - result = ResponseFactory.from_http_request(response, selector_config) + result = ResponseFactory.from_http_request(response, selector_config, meta={"proxy": proxy}) return result except CurlError as e: # pragma: no cover if attempt < max_retries - 1: - log.error(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...") + # Now if the rotator is enabled, we will try again with the new proxy + # If it's not enabled, then we will try again with the same proxy + if is_proxy_error(e): + log.warning( + f"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds..." + ) + else: + log.warning(f"Attempt {attempt + 1} failed: {e}. 
Retrying in {retry_delay} seconds...") + await asyncio_sleep(retry_delay) else: log.error(f"Failed after {max_retries} attempts: {e}") raise # Raise the exception if all retries fail - finally: - if session and one_off_request: - await session.close() + finally: + if session and one_off_request: + await session.close() raise RuntimeError("No active session available.") # pragma: no cover @@ -465,7 +517,7 @@ class _ASyncSessionLogic(_ConfigurationLogic): :return: A `Response` object. """ stealthy_headers = kwargs.pop("stealthy_headers", None) - return self.__make_request("GET", stealth=stealthy_headers, url=url, **kwargs) + return self._make_request("GET", stealth=stealthy_headers, url=url, **kwargs) def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]: """ @@ -497,7 +549,7 @@ class _ASyncSessionLogic(_ConfigurationLogic): :return: A `Response` object. """ stealthy_headers = kwargs.pop("stealthy_headers", None) - return self.__make_request("POST", stealth=stealthy_headers, url=url, **kwargs) + return self._make_request("POST", stealth=stealthy_headers, url=url, **kwargs) def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]: """ @@ -529,7 +581,7 @@ class _ASyncSessionLogic(_ConfigurationLogic): :return: A `Response` object. """ stealthy_headers = kwargs.pop("stealthy_headers", None) - return self.__make_request("PUT", stealth=stealthy_headers, url=url, **kwargs) + return self._make_request("PUT", stealth=stealthy_headers, url=url, **kwargs) def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]: """ @@ -563,7 +615,7 @@ class _ASyncSessionLogic(_ConfigurationLogic): # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5, # But some websites accept it, it depends on the implementation used. 
stealthy_headers = kwargs.pop("stealthy_headers", None) - return self.__make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs) + return self._make_request("DELETE", stealth=stealthy_headers, url=url, **kwargs) class FetcherSession: @@ -594,6 +646,8 @@ class FetcherSession: "_default_http3", "selector_config", "_client", + "_is_alive", + "_proxy_rotator", ) def __init__( @@ -613,6 +667,7 @@ class FetcherSession: verify: bool = True, cert: Optional[str | Tuple[str, str]] = None, selector_config: Optional[Dict] = None, + proxy_rotator: Optional[ProxyRotator] = None, ): """ :param impersonate: Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version) @@ -631,6 +686,7 @@ class FetcherSession: :param verify: Whether to verify HTTPS certificates. Defaults to True. :param cert: Tuple of (cert, key) filenames for the client certificate. :param selector_config: Arguments passed when creating the final Selector class. + :param proxy_rotator: A ProxyRotator instance for automatic proxy rotation. 
""" self._default_impersonate: ImpersonateType = impersonate self._stealth = stealthy_headers @@ -647,7 +703,9 @@ class FetcherSession: self._default_cert = cert self._default_http3 = http3 self.selector_config = selector_config or {} + self._is_alive = False self._client: _SyncSessionLogic | _ASyncSessionLogic | None = None + self._proxy_rotator = proxy_rotator def __enter__(self) -> _SyncSessionLogic: """Creates and returns a new synchronous Fetcher Session""" @@ -656,7 +714,9 @@ class FetcherSession: config = {k.replace("_default_", ""): getattr(self, k) for k in self.__slots__ if k.startswith("_default")} config["stealthy_headers"] = self._stealth config["selector_config"] = self.selector_config + config["proxy_rotator"] = self._proxy_rotator self._client = _SyncSessionLogic(**config) + self._is_alive = True return self._client.__enter__() raise RuntimeError("This FetcherSession instance already has an active synchronous session.") @@ -664,6 +724,7 @@ class FetcherSession: if self._client is not None and isinstance(self._client, _SyncSessionLogic): self._client.__exit__(exc_type, exc_val, exc_tb) self._client = None + self._is_alive = False return raise RuntimeError("Cannot exit invalid session") @@ -674,7 +735,9 @@ class FetcherSession: config = {k.replace("_default_", ""): getattr(self, k) for k in self.__slots__ if k.startswith("_default")} config["stealthy_headers"] = self._stealth config["selector_config"] = self.selector_config + config["proxy_rotator"] = self._proxy_rotator self._client = _ASyncSessionLogic(**config) + self._is_alive = True return await self._client.__aenter__() raise RuntimeError("This FetcherSession instance already has an active asynchronous session.") @@ -682,6 +745,7 @@ class FetcherSession: if self._client is not None and isinstance(self._client, _ASyncSessionLogic): await self._client.__aexit__(exc_type, exc_val, exc_tb) self._client = None + self._is_alive = False return raise RuntimeError("Cannot exit invalid session") @@ -689,7 
+753,7 @@ class FetcherSession: class FetcherClient(_SyncSessionLogic): __slots__ = ("__enter__", "__exit__") - def __init__(self, **kwargs): + def __init__(self, **kwargs: Any) -> None: super().__init__(**kwargs) self.__enter__: Any = None self.__exit__: Any = None @@ -699,7 +763,7 @@ class FetcherClient(_SyncSessionLogic): class AsyncFetcherClient(_ASyncSessionLogic): __slots__ = ("__aenter__", "__aexit__") - def __init__(self, **kwargs): + def __init__(self, **kwargs: Any) -> None: super().__init__(**kwargs) self.__aenter__: Any = None self.__aexit__: Any = None diff --git a/scrapling/engines/toolbelt/__init__.py b/scrapling/engines/toolbelt/__init__.py index 8b137891791fe96927ad78e64b0aad7bded08bdc..fea2781eb9204fd9f2eeb103e39fc96b033ec5c1 100644 --- a/scrapling/engines/toolbelt/__init__.py +++ b/scrapling/engines/toolbelt/__init__.py @@ -1 +1,3 @@ +from .proxy_rotation import ProxyRotator, is_proxy_error, cyclic_rotation +__all__ = ["ProxyRotator", "is_proxy_error", "cyclic_rotation"] diff --git a/scrapling/engines/toolbelt/convertor.py b/scrapling/engines/toolbelt/convertor.py index ef28acfae3240746267516a4f3bc2db55ce57acb..1a8d799b1942691351640e3dd48fd844b53e3636 100644 --- a/scrapling/engines/toolbelt/convertor.py +++ b/scrapling/engines/toolbelt/convertor.py @@ -38,7 +38,7 @@ class ResponseFactory: @classmethod def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]: """Process response history to build a list of `Response` objects""" - history = [] + history: list[Response] = [] current_request = first_response.request.redirected_from try: @@ -85,6 +85,7 @@ class ResponseFactory: first_response: SyncResponse, final_response: Optional[SyncResponse], parser_arguments: Dict, + meta: Optional[Dict] = None, ) -> Response: """ Transforms a Playwright response into an internal `Response` object, encapsulating @@ -100,6 +101,7 @@ class ResponseFactory: :param first_response: An earlier or initial Playwright 
`Response` object that may serve as a fallback response in the absence of the final one. :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into the `Response` object. + :param meta: Additional meta data to be saved with the response. :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata. :rtype: Response @@ -116,12 +118,12 @@ class ResponseFactory: history = cls._process_response_history(first_response, parser_arguments) try: if "html" in final_response.all_headers().get("content-type", ""): - page_content = cls._get_page_content(page) + page_content = cls._get_page_content(page).encode("utf-8") else: page_content = final_response.body() except Exception as e: # pragma: no cover log.error(f"Error getting page content: {e}") - page_content = "" + page_content = b"" return Response( **{ @@ -134,6 +136,7 @@ class ResponseFactory: "headers": first_response.all_headers(), "request_headers": first_response.request.all_headers(), "history": history, + "meta": meta, **parser_arguments, } ) @@ -143,7 +146,7 @@ class ResponseFactory: cls, first_response: AsyncResponse, parser_arguments: Dict ) -> list[Response]: """Process response history to build a list of `Response` objects""" - history = [] + history: list[Response] = [] current_request = first_response.request.redirected_from try: @@ -220,6 +223,7 @@ class ResponseFactory: first_response: AsyncResponse, final_response: Optional[AsyncResponse], parser_arguments: Dict, + meta: Optional[Dict] = None, ) -> Response: """ Transforms a Playwright response into an internal `Response` object, encapsulating @@ -235,6 +239,7 @@ class ResponseFactory: :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one. 
:param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into the `Response` object. + :param meta: Additional meta data to be saved with the response. :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata. :rtype: Response @@ -251,12 +256,12 @@ class ResponseFactory: history = await cls._async_process_response_history(first_response, parser_arguments) try: if "html" in (await final_response.all_headers()).get("content-type", ""): - page_content = await cls._get_async_page_content(page) + page_content = (await cls._get_async_page_content(page)).encode("utf-8") else: page_content = await final_response.body() except Exception as e: # pragma: no cover log.error(f"Error getting page content in async: {e}") - page_content = "" + page_content = b"" return Response( **{ @@ -269,16 +274,18 @@ class ResponseFactory: "headers": await first_response.all_headers(), "request_headers": await first_response.request.all_headers(), "history": history, + "meta": meta, **parser_arguments, } ) @staticmethod - def from_http_request(response: CurlResponse, parser_arguments: Dict) -> Response: + def from_http_request(response: CurlResponse, parser_arguments: Dict, meta: Optional[Dict] = None) -> Response: """Takes `curl_cffi` response and generates `Response` object from it. :param response: `curl_cffi` response object :param parser_arguments: Additional arguments to be passed to the `Response` object constructor. + :param meta: Optional metadata dictionary to attach to the Response. 
:return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers` """ return Response( @@ -293,6 +300,7 @@ class ResponseFactory: "request_headers": dict(response.request.headers) if response.request else {}, "method": response.request.method if response.request else "GET", "history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82 + "meta": meta, **parser_arguments, } ) diff --git a/scrapling/engines/toolbelt/custom.py b/scrapling/engines/toolbelt/custom.py index d1f95b75dc2d5f4eca8b64a1bd09e611e1b3b085..58a748535a7845eb5c9cdf92537e3fd146cbb34c 100644 --- a/scrapling/engines/toolbelt/custom.py +++ b/scrapling/engines/toolbelt/custom.py @@ -10,12 +10,20 @@ from scrapling.core._types import ( Dict, cast, List, - Optional, Tuple, + Union, + Optional, + Callable, + Sequence, + TYPE_CHECKING, + AsyncGenerator, ) from scrapling.core.custom_types import MappingProxyType from scrapling.parser import Selector, SQLiteStorageSystem +if TYPE_CHECKING: + from scrapling.spiders import Request + class Response(Selector): """This class is returned by all engines as a way to unify the response type between different libraries.""" @@ -32,8 +40,12 @@ class Response(Selector): encoding: str = "utf-8", method: str = "GET", history: List | None = None, + meta: Dict[str, Any] | None = None, **selector_config: Any, ): + if isinstance(content, str): + content = content.encode("utf-8") + adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", "")) self.status = status self.reason = reason @@ -50,6 +62,78 @@ class Response(Selector): # For easier debugging while working from a Python shell log.info(f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})") + if meta and not isinstance(meta, dict): + raise TypeError(f"Response meta should be dictionary but got {type(meta).__name__} instead!") + + self.meta: Dict[str, Any] = meta 
or {} + self.request: Optional["Request"] = None # Will be set by crawler + + @property + def body(self) -> bytes: + """Return the raw body of the response as bytes.""" + return cast(bytes, cast(Sequence, self._raw_body)) + + def follow( + self, + url: str, + sid: str = "", + callback: Callable[["Response"], AsyncGenerator[Union[Dict[str, Any], "Request", None], None]] | None = None, + priority: int | None = None, + dont_filter: bool = False, + meta: dict[str, Any] | None = None, + referer_flow: bool = True, + **kwargs: Any, + ) -> Any: + """Create a Request to follow a URL. + + This is a helper method for spiders to easily follow links found in pages. + + **IMPORTANT**: The below arguments if left empty, the corresponding value from the previous request will be used. The only exception is `dont_filter`. + + :param url: The URL to follow (can be relative, will be joined with current URL) + :param sid: The session id to use + :param callback: Spider callback method to use + :param priority: The priority number to use, the higher the number, the higher priority to be processed first. + :param dont_filter: If this request has been done before, disable the filter to allow it again. + :param meta: Additional meta data to included in the request + :param referer_flow: Enabled by default, set the current response url as referer for the new request url. 
+ :param kwargs: Additional Request arguments + :return: Request object ready to be yielded + """ + from scrapling.spiders import Request + + if not self.request or not isinstance(self.request, Request): + raise TypeError("This response has no request set yet.") + + # Merge original session kwargs with new kwargs (new takes precedence) + session_kwargs = {**self.request._session_kwargs, **kwargs} + + if referer_flow: + # For requests + headers = session_kwargs.get("headers", {}) + headers["referer"] = self.url + session_kwargs["headers"] = headers + + # For browsers + extra_headers = session_kwargs.get("extra_headers", {}) + extra_headers["referer"] = self.url + session_kwargs["extra_headers"] = extra_headers + + session_kwargs["google_search"] = False + + return Request( + url=self.urljoin(url), + sid=sid or self.request.sid, + callback=callback or self.request.callback, + priority=priority if priority is not None else self.request.priority, + dont_filter=dont_filter, + meta={**(self.meta or {}), **(meta or {})}, + **session_kwargs, + ) + + def __str__(self) -> str: + return f"<{self.status} {self.url}>" + class BaseFetcher: __slots__ = () diff --git a/scrapling/engines/toolbelt/fingerprints.py b/scrapling/engines/toolbelt/fingerprints.py index 2ddbbe5ff5263477744bbb1f7b41ed0d4ccc7534..677469174e26bce4b2dd4be4a2727580a261f1f4 100644 --- a/scrapling/engines/toolbelt/fingerprints.py +++ b/scrapling/engines/toolbelt/fingerprints.py @@ -5,11 +5,11 @@ Functions related to generating headers and fingerprints generally from functools import lru_cache from platform import system as platform_system -from tldextract import extract +from tld import get_tld, Result from browserforge.headers import Browser, HeaderGenerator from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS -from scrapling.core._types import Dict, Literal, Tuple +from scrapling.core._types import Dict, Literal, Tuple, cast __OS_NAME__ = platform_system() OSName = Literal["linux", "macos", 
"windows"] @@ -28,11 +28,15 @@ def generate_convincing_referer(url: str) -> str | None: :param url: The URL you are about to fetch. :return: Google's search URL of the domain name, or None for localhost/IP addresses """ - extracted = extract(url) + # Fixing the inaccurate return type hint in `get_tld` + extracted: Result | None = cast(Result, get_tld(url, as_object=True, fail_silently=True)) + if not extracted: + return None + website_name = extracted.domain # Skip generating referer for localhost, IP addresses, or when there's no valid domain - if not website_name or not extracted.suffix or website_name in ("localhost", "127.0.0.1", "::1"): + if not website_name or not extracted.tld or website_name in ("localhost", "127.0.0.1", "::1"): return None # Check if it's an IP address (simple check for IPv4) diff --git a/scrapling/engines/toolbelt/navigation.py b/scrapling/engines/toolbelt/navigation.py index 1b44f5ab3e6ea1d82fcb509f01c2b7f6c2d2d535..5c1942763be6c4f92dbb803e4a5c173f9c93f7bc 100644 --- a/scrapling/engines/toolbelt/navigation.py +++ b/scrapling/engines/toolbelt/navigation.py @@ -11,8 +11,8 @@ from msgspec import Struct, structs, convert, ValidationError from playwright.sync_api import Route from scrapling.core.utils import log -from scrapling.core._types import Dict, Tuple -from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES +from scrapling.core._types import Dict, Set, Tuple, Optional, Callable +from scrapling.engines.constants import EXTRA_RESOURCES __BYPASSES_DIR__ = Path(__file__).parent / "bypasses" @@ -23,30 +23,58 @@ class ProxyDict(Struct): password: str = "" -def intercept_route(route: Route): - """This is just a route handler, but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES` +def create_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable: + """Create a route handler that blocks both resource types and specific domains. 
- :param route: PlayWright `Route` object of the current page - :return: PlayWright `Route` object + :param disable_resources: Whether to block default resource types. + :param blocked_domains: Set of domain names to block requests to. + :return: A sync route handler function. """ - if route.request.resource_type in DEFAULT_DISABLED_RESOURCES: - log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"') - route.abort() - else: - route.continue_() - - -async def async_intercept_route(route: async_Route): - """This is just a route handler, but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES` - - :param route: PlayWright `Route` object of the current page - :return: PlayWright `Route` object + disabled_resources = EXTRA_RESOURCES if disable_resources else set() + domains = blocked_domains or set() + + def handler(route: Route): + if route.request.resource_type in disabled_resources: + log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"') + route.abort() + elif domains: + hostname = urlparse(route.request.url).hostname or "" + if any(hostname == d or hostname.endswith("." + d) for d in domains): + log.debug(f'Blocking request to blocked domain "{hostname}" ({route.request.url})') + route.abort() + else: + route.continue_() + else: + route.continue_() + + return handler + + +def create_async_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable: + """Create an async route handler that blocks both resource types and specific domains. + + :param disable_resources: Whether to block default resource types. + :param blocked_domains: Set of domain names to block requests to. + :return: An async route handler function. 
""" - if route.request.resource_type in DEFAULT_DISABLED_RESOURCES: - log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"') - await route.abort() - else: - await route.continue_() + disabled_resources = EXTRA_RESOURCES if disable_resources else set() + domains = blocked_domains or set() + + async def handler(route: async_Route): + if route.request.resource_type in disabled_resources: + log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"') + await route.abort() + elif domains: + hostname = urlparse(route.request.url).hostname or "" + if any(hostname == d or hostname.endswith("." + d) for d in domains): + log.debug(f'Blocking request to blocked domain "{hostname}" ({route.request.url})') + await route.abort() + else: + await route.continue_() + else: + await route.continue_() + + return handler def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict: diff --git a/scrapling/engines/toolbelt/proxy_rotation.py b/scrapling/engines/toolbelt/proxy_rotation.py new file mode 100644 index 0000000000000000000000000000000000000000..37cdba1e734cdf290c061aca23fbfe48e023af83 --- /dev/null +++ b/scrapling/engines/toolbelt/proxy_rotation.py @@ -0,0 +1,104 @@ +from threading import Lock + +from scrapling.core._types import Callable, Dict, List, Tuple, ProxyType + + +RotationStrategy = Callable[[List[ProxyType], int], Tuple[ProxyType, int]] +_PROXY_ERROR_INDICATORS = { + "net::err_proxy", + "net::err_tunnel", + "connection refused", + "connection reset", + "connection timed out", + "failed to connect", + "could not resolve proxy", +} + + +def _get_proxy_key(proxy: ProxyType) -> str: + """Generate a unique key for a proxy (for dicts it's server plus username).""" + if isinstance(proxy, str): + return proxy + server = proxy.get("server", "") + username = proxy.get("username", "") + return f"{server}|{username}" + + +def is_proxy_error(error: Exception) -> bool: + 
"""Check if an error is proxy-related. Works for both HTTP and browser errors.""" + error_msg = str(error).lower() + return any(indicator in error_msg for indicator in _PROXY_ERROR_INDICATORS) + + +def cyclic_rotation(proxies: List[ProxyType], current_index: int) -> Tuple[ProxyType, int]: + """Default cyclic rotation strategy — iterates through proxies sequentially, wrapping around at the end.""" + idx = current_index % len(proxies) + return proxies[idx], (idx + 1) % len(proxies) + + +class ProxyRotator: + """ + A thread-safe proxy rotator with pluggable rotation strategies. + + Supports: + - Cyclic rotation (default) + - Custom rotation strategies via callable + - Both string URLs and Playwright-style dict proxies + """ + + __slots__ = ("_proxies", "_proxy_to_index", "_strategy", "_current_index", "_lock") + + def __init__( + self, + proxies: List[ProxyType], + strategy: RotationStrategy = cyclic_rotation, + ): + """ + Initialize the proxy rotator. + + :param proxies: List of proxy URLs or Playwright-style proxy dicts. + - String format: "http://proxy1:8080" or "http://user:pass@proxy:8080" + - Dict format: {"server": "http://proxy:8080", "username": "user", "password": "pass"} + :param strategy: Rotation strategy function. Takes (proxies, current_index) and returns (proxy, next_index). Defaults to cyclic_rotation. 
+ """ + if not proxies: + raise ValueError("At least one proxy must be provided") + + if not callable(strategy): + raise TypeError(f"strategy must be callable, got {type(strategy).__name__}") + + self._strategy = strategy + self._lock = Lock() + + # Validate and store proxies + self._proxies: List[ProxyType] = [] + self._proxy_to_index: Dict[str, int] = {} # O(1) lookup by unique key (server + username) + for i, proxy in enumerate(proxies): + if isinstance(proxy, (str, dict)): + if isinstance(proxy, dict) and "server" not in proxy: + raise ValueError("Proxy dict must have a 'server' key") + + self._proxy_to_index[_get_proxy_key(proxy)] = i + self._proxies.append(proxy) + else: + raise TypeError(f"Invalid proxy type: {type(proxy)}. Expected str or dict.") + + self._current_index = 0 + + def get_proxy(self) -> ProxyType: + """Get the next proxy according to the rotation strategy.""" + with self._lock: + proxy, self._current_index = self._strategy(self._proxies, self._current_index) + return proxy + + @property + def proxies(self) -> List[ProxyType]: + """Get a copy of all configured proxies.""" + return list(self._proxies) + + def __len__(self) -> int: + """Return the total number of configured proxies.""" + return len(self._proxies) + + def __repr__(self) -> str: + return f"ProxyRotator(proxies={len(self._proxies)})" diff --git a/scrapling/fetchers/__init__.py b/scrapling/fetchers/__init__.py index e2e5866cf12bd79380e1ffd1b5e34feb941ee5b7..7db355d5781e3cb10b2ccc74c1677b06366847a5 100644 --- a/scrapling/fetchers/__init__.py +++ b/scrapling/fetchers/__init__.py @@ -1,4 +1,5 @@ from typing import TYPE_CHECKING, Any +from scrapling.engines.toolbelt import ProxyRotator if TYPE_CHECKING: from scrapling.fetchers.requests import Fetcher, AsyncFetcher, FetcherSession @@ -22,6 +23,7 @@ _LAZY_IMPORTS = { __all__ = [ "Fetcher", "AsyncFetcher", + "ProxyRotator", "FetcherSession", "DynamicFetcher", "DynamicSession", diff --git a/scrapling/fetchers/chrome.py 
b/scrapling/fetchers/chrome.py index 0d50c193cfc5d39dc6e8d7ca9a9699a0e1848787..3b746f4454bd7b0258b80d706760374676c02bdb 100644 --- a/scrapling/fetchers/chrome.py +++ b/scrapling/fetchers/chrome.py @@ -13,7 +13,8 @@ class DynamicFetcher(BaseFetcher): :param url: Target url. :param headless: Run the browser in headless/hidden (default), or headful/visible mode. - :param disable_resources: Drop requests of unnecessary resources for a speed boost. + :param disable_resources: Drop requests for unnecessary resources for a speed boost. + :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too). :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it. :param cookies: Set cookies for the next request. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. @@ -23,12 +24,9 @@ class DynamicFetcher(BaseFetcher): :param page_action: Added for automation. A function that takes the `page` object and does the automation you need. :param wait_selector: Wait for a specific CSS selector to be in a specific state. :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request. - :param locale: Set the locale for the browser if wanted. The default value is `en-US`. + :param locale: Set the locale for the browser if wanted. Defaults to the system default locale. :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`. - :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently. :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. 
- :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting. - :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely. :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP. :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name. :param extra_headers: A dictionary of extra headers to add to the request. @@ -55,7 +53,8 @@ class DynamicFetcher(BaseFetcher): :param url: Target url. :param headless: Run the browser in headless/hidden (default), or headful/visible mode. - :param disable_resources: Drop requests of unnecessary resources for a speed boost. + :param disable_resources: Drop requests for unnecessary resources for a speed boost. + :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too). :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it. :param cookies: Set cookies for the next request. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. @@ -65,12 +64,9 @@ class DynamicFetcher(BaseFetcher): :param page_action: Added for automation. A function that takes the `page` object and does the automation you need. :param wait_selector: Wait for a specific CSS selector to be in a specific state. :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request. - :param locale: Set the locale for the browser if wanted. The default value is `en-US`. + :param locale: Set the locale for the browser if wanted. Defaults to the system default locale. :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`. 
- :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently. :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. - :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting. - :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely. :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP. :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name. :param extra_headers: A dictionary of extra headers to add to the request. diff --git a/scrapling/fetchers/stealth_chrome.py b/scrapling/fetchers/stealth_chrome.py index 58574d3a6e9b82c0bf3799d607a1242e7fb042c4..6a702461dbdc4d6d24c65670a3d0588435d08be1 100644 --- a/scrapling/fetchers/stealth_chrome.py +++ b/scrapling/fetchers/stealth_chrome.py @@ -19,6 +19,7 @@ class StealthyFetcher(BaseFetcher): :param headless: Run the browser in headless/hidden (default), or headful/visible mode. :param disable_resources: Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. + :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too). :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it. :param cookies: Set cookies for the next request. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. 
@@ -67,6 +68,7 @@ class StealthyFetcher(BaseFetcher): :param headless: Run the browser in headless/hidden (default), or headful/visible mode. :param disable_resources: Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. + :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``"example.com"`` blocks ``"sub.example.com"`` too). :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it. :param cookies: Set cookies for the next request. :param network_idle: Wait for the page until there are no network connections for at least 500 ms. diff --git a/scrapling/parser.py b/scrapling/parser.py index f7da3a9bfafdb47a8c3d24fb10b718d0732e9f88..166e6ea66fefa395f9bbc016f87cda14236c6c3f 100644 --- a/scrapling/parser.py +++ b/scrapling/parser.py @@ -4,7 +4,7 @@ from urllib.parse import urljoin from difflib import SequenceMatcher from re import Pattern as re_Pattern -from lxml.html import HtmlElement, HtmlMixin, HTMLParser +from lxml.html import HtmlElement, HTMLParser from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors from lxml.etree import ( XPath, @@ -23,6 +23,7 @@ from scrapling.core._types import ( List, Tuple, Union, + TypeVar, Pattern, Callable, Literal, @@ -51,6 +52,7 @@ _whitelisted = { "class_": "class", "for_": "for", } +_T = TypeVar("_T") # Pre-compiled selectors for efficiency _find_all_elements = XPath(".//*") _find_all_elements_with_spaces = XPath( @@ -74,9 +76,6 @@ class Selector(SelectorsGeneration): "_raw_body", ) - if TYPE_CHECKING: - _storage: StorageSystemMixin - def __init__( self, content: Optional[str | bytes] = None, @@ -118,8 +117,19 @@ class Selector(SelectorsGeneration): if root is None and content is None: raise ValueError("Selector class needs 
HTML content, or root arguments to work") - self.__text = None + self.url = url + self._raw_body: str | bytes = "" + self.encoding = encoding + self.__keep_cdata = keep_cdata + self.__huge_tree_enabled = huge_tree + self.__keep_comments = keep_comments + # For selector stuff + self.__text: Optional[TextHandler] = None + self.__attributes: Optional[AttributesHandler] = None + self.__tag: Optional[str] = None + self._storage: Optional[StorageSystemMixin] = None if root is None: + body: str | bytes if isinstance(content, str): body = content.strip().replace("\x00", "") or "" elif isinstance(content, bytes): @@ -128,30 +138,28 @@ class Selector(SelectorsGeneration): raise TypeError(f"content argument must be str or bytes, got {type(content)}") # https://lxml.de/api/lxml.etree.HTMLParser-class.html - parser = HTMLParser( + _parser_kwargs: Dict[str, Any] = dict( recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding, compact=True, huge_tree=huge_tree, - default_doctype=True, + default_doctype=True, # Supported by lxml but missing from stubs strip_cdata=(not keep_cdata), ) - self._root = cast(HtmlElement, fromstring(body, parser=parser, base_url=url or None)) + parser = HTMLParser(**_parser_kwargs) + self._root = cast(HtmlElement, fromstring(body or "", parser=parser, base_url=url or "")) self._raw_body = content else: - # All HTML types inherit from HtmlMixin so this to check for all at once - if not issubclass(type(root), HtmlMixin): - raise TypeError( - f"Root have to be a valid element of `html` module types to work, not of type {type(root)}" - ) - self._root = cast(HtmlElement, root) - self._raw_body = "" - self.__adaptive_enabled = adaptive + if self._is_text_node(root): + self.__adaptive_enabled = False + return + + self.__adaptive_enabled = bool(adaptive) if self.__adaptive_enabled: if _storage is not None: @@ -171,40 +179,14 @@ class Selector(SelectorsGeneration): self._storage = storage(**storage_args) - self.__keep_comments 
= keep_comments - self.__keep_cdata = keep_cdata - self.__huge_tree_enabled = huge_tree - self.encoding = encoding - self.url = url - # For selector stuff - self.__attributes = None - self.__tag = None - - @property - def __response_data(self): - # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed) - if not hasattr(self, "_cached_response_data"): - self._cached_response_data = ( - { - key: getattr(self, key) - for key in ( - "status", - "reason", - "cookies", - "history", - "headers", - "request_headers", - ) - } - if hasattr(self, "status") - else {} - ) - return self._cached_response_data - def __getitem__(self, key: str) -> TextHandler: + if self._is_text_node(self._root): + raise TypeError("Text nodes do not have attributes") return self.attrib[key] def __contains__(self, key: str) -> bool: + if self._is_text_node(self._root): + return False return key in self.attrib # Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance @@ -220,36 +202,48 @@ class Selector(SelectorsGeneration): # Faster than checking `element.is_attribute or element.is_text or element.is_tail` return issubclass(type(element), _ElementUnicodeResult) - def __element_convertor(self, element: HtmlElement) -> "Selector": - """Used internally to convert a single HtmlElement to Selector directly without checks""" - db_instance = self._storage if (hasattr(self, "_storage") and self._storage) else None + def __element_convertor(self, element: HtmlElement | _ElementUnicodeResult) -> "Selector": + """Used internally to convert a single HtmlElement or text node to Selector directly without checks""" return Selector( root=element, url=self.url, encoding=self.encoding, adaptive=self.__adaptive_enabled, - _storage=db_instance, # Reuse existing storage if it exists otherwise it won't be checked if `adaptive` is turned off + _storage=self._storage, 
keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata, huge_tree=self.__huge_tree_enabled, - **self.__response_data, ) - def __elements_convertor(self, elements: List[HtmlElement]) -> "Selectors": - return Selectors(map(self.__element_convertor, elements)) + def __elements_convertor(self, elements: List[HtmlElement | _ElementUnicodeResult]) -> "Selectors": + # Store them for non-repeated call-ups + url = self.url + encoding = self.encoding + adaptive = self.__adaptive_enabled + storage = self._storage + comments = self.__keep_comments + cdata = self.__keep_cdata + huge_tree = self.__huge_tree_enabled + + return Selectors( + Selector( + root=el, + url=url, + encoding=encoding, + adaptive=adaptive, + _storage=storage, + keep_comments=comments, + keep_cdata=cdata, + huge_tree=huge_tree, + ) + for el in elements + ) - def __handle_elements( - self, result: List[HtmlElement | _ElementUnicodeResult] - ) -> Union["Selectors", "TextHandlers"]: - """Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible""" + def __handle_elements(self, result: List[HtmlElement | _ElementUnicodeResult]) -> "Selectors": + """Used internally in all functions to convert results to Selectors in bulk""" if not result: return Selectors() - # From within the code, this method will always get a list of the same type, - # so we will continue without checks for a slight performance boost - if self._is_text_node(result[0]): - return TextHandlers(map(TextHandler, result)) - return self.__elements_convertor(result) def __getstate__(self) -> Any: @@ -264,13 +258,17 @@ class Selector(SelectorsGeneration): @property def tag(self) -> str: """Get the tag name of the element""" + if self._is_text_node(self._root): + return "#text" if not self.__tag: - self.__tag = self._root.tag - return self.__tag + self.__tag = str(self._root.tag) + return self.__tag or "" @property def text(self) -> TextHandler: """Get text content of the element""" + if 
self._is_text_node(self._root): + return TextHandler(str(self._root)) if self.__text is None: # If you want to escape lxml default behavior and remove comments like this `CONDITION: Excellent` # before extracting text, then keep `keep_comments` set to False while initializing the first class @@ -296,11 +294,14 @@ class Selector(SelectorsGeneration): :return: A TextHandler """ - ignored_elements = set() + if self._is_text_node(self._root): + return TextHandler(str(self._root)) + + ignored_elements: set[Any] = set() if ignore_tags: for element in self._root.iter(*ignore_tags): ignored_elements.add(element) - ignored_elements.update(set(_find_all_elements(element))) + ignored_elements.update(cast(list, _find_all_elements(element))) _all_strings = [] for node in self._root.iter(): @@ -320,6 +321,8 @@ class Selector(SelectorsGeneration): @property def attrib(self) -> AttributesHandler: """Get attributes of the element""" + if self._is_text_node(self._root): + return AttributesHandler({}) if not self.__attributes: self.__attributes = AttributesHandler(self._root.attrib) return self.__attributes @@ -327,6 +330,8 @@ class Selector(SelectorsGeneration): @property def html_content(self) -> TextHandler: """Return the inner HTML code of the element""" + if self._is_text_node(self._root): + return TextHandler(str(self._root)) content = tostring(self._root, encoding=self.encoding, method="html", with_tail=False) if isinstance(content, bytes): content = content.strip().decode(self.encoding) @@ -335,10 +340,14 @@ class Selector(SelectorsGeneration): @property def body(self) -> str | bytes: """Return the raw body of the current `Selector` without any processing. 
Useful for binary and non-HTML requests.""" + if self._is_text_node(self._root): + return "" return self._raw_body def prettify(self) -> TextHandler: """Return a prettified version of the element's inner html-code""" + if self._is_text_node(self._root): + return TextHandler(str(self._root)) content = tostring( self._root, encoding=self.encoding, @@ -355,6 +364,8 @@ class Selector(SelectorsGeneration): :param class_name: The class name to check for :return: True if element has class with that name otherwise False """ + if self._is_text_node(self._root): + return False return class_name in self._root.classes @property @@ -366,12 +377,16 @@ class Selector(SelectorsGeneration): @property def below_elements(self) -> "Selectors": """Return all elements under the current element in the DOM tree""" - below = _find_all_elements(self._root) + if self._is_text_node(self._root): + return Selectors() + below = cast(List, _find_all_elements(self._root)) return self.__elements_convertor(below) if below is not None else Selectors() @property def children(self) -> "Selectors": """Return the children elements of the current element or empty list otherwise""" + if self._is_text_node(self._root): + return Selectors() return Selectors( self.__element_convertor(child) for child in self._root.iterchildren() @@ -387,6 +402,8 @@ class Selector(SelectorsGeneration): def iterancestors(self) -> Generator["Selector", None, None]: """Return a generator that loops over all ancestors of the element, starting with the element's parent.""" + if self._is_text_node(self._root): + return for ancestor in self._root.iterancestors(): yield self.__element_convertor(ancestor) @@ -409,6 +426,8 @@ class Selector(SelectorsGeneration): @property def next(self) -> Optional["Selector"]: """Returns the next element of the current element in the children of the parent or ``None`` otherwise.""" + if self._is_text_node(self._root): + return None next_element = self._root.getnext() while next_element is not None and 
isinstance(next_element, html_forbidden): # Ignore HTML comments and unwanted types @@ -419,6 +438,8 @@ class Selector(SelectorsGeneration): @property def previous(self) -> Optional["Selector"]: """Returns the previous element of the current element in the children of the parent or ``None`` otherwise.""" + if self._is_text_node(self._root): + return None prev_element = self._root.getprevious() while prev_element is not None and isinstance(prev_element, html_forbidden): # Ignore HTML comments and unwanted types @@ -426,26 +447,40 @@ class Selector(SelectorsGeneration): return self.__element_convertor(prev_element) if prev_element is not None else None - # For easy copy-paste from Scrapy/parsel code when needed :) - def get(self, default=None): # pyright: ignore - return self + def get(self) -> TextHandler: + """ + Serialize this element to a string. + For text nodes, returns the text value. For HTML elements, returns the outer HTML. + """ + if self._is_text_node(self._root): + return TextHandler(str(self._root)) + return self.html_content - def get_all(self): - return self + def getall(self) -> TextHandlers: + """Return a single-element list containing this element's serialized string.""" + return TextHandlers([self.get()]) - extract = get_all + extract = getall extract_first = get def __str__(self) -> str: + if self._is_text_node(self._root): + return str(self._root) return self.html_content def __repr__(self) -> str: length_limit = 40 - data = "<" + + if self._is_text_node(self._root): + text = str(self._root) + if len(text) > length_limit: + text = text[:length_limit].strip() + "..." + return f"" + content = clean_spaces(self.html_content) if len(content) > length_limit: content = content[:length_limit].strip() + "..." 
- data += f"data='{content}'" + data = f" Union["Selector", "TextHandler", None]: - """Search the current tree with CSS3 selectors and return the first result if possible, otherwise return `None` - - **Important: - It's recommended to use the identifier argument if you plan to use a different selector later - and want to relocate the same element(s)** - - :param selector: The CSS3 selector to be used. - :param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before - :param identifier: A string that will be used to save/retrieve element's data in adaptive, - otherwise the selector will be used. - :param auto_save: Automatically save new elements for `adaptive` later - :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that. - Be aware that the percentage calculation depends solely on the page structure, so don't play with this - number unless you must know what you are doing! - """ - for element in self.css( - selector, - identifier, - adaptive, - auto_save, - percentage, - _scrapling_first_match=True, - ): - return element - return None - - def xpath_first( - self, - selector: str, - identifier: str = "", - adaptive: bool = False, - auto_save: bool = False, - percentage: int = 0, - **kwargs: Any, - ) -> Union["Selector", "TextHandler", None]: - """Search the current tree with XPath selectors and return the first result if possible, otherwise return `None` - - **Important: - It's recommended to use the identifier argument if you plan to use a different selector later - and want to relocate the same element(s)** - - Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!** - - :param selector: The XPath selector to be used. 
- :param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before - :param identifier: A string that will be used to save/retrieve element's data in adaptive, - otherwise the selector will be used. - :param auto_save: Automatically save new elements for `adaptive` later - :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that. - Be aware that the percentage calculation depends solely on the page structure, so don't play with this - number unless you must know what you are doing! - """ - for element in self.xpath( - selector, - identifier, - adaptive, - auto_save, - percentage, - _scrapling_first_match=True, - **kwargs, - ): - return element - return None - def css( self, selector: str, @@ -591,8 +554,7 @@ class Selector(SelectorsGeneration): adaptive: bool = False, auto_save: bool = False, percentage: int = 0, - **kwargs: Any, - ) -> Union["Selectors", List[Any], "TextHandlers"]: + ) -> "Selectors": """Search the current tree with CSS3 selectors **Important: @@ -610,6 +572,9 @@ class Selector(SelectorsGeneration): :return: `Selectors` class. """ + if self._is_text_node(self._root): + return Selectors() + try: if not self.__adaptive_enabled or "," not in selector: # No need to split selectors in this case, let's save some CPU cycles :) @@ -620,10 +585,9 @@ class Selector(SelectorsGeneration): adaptive, auto_save, percentage, - _scrapling_first_match=kwargs.pop("_scrapling_first_match", False), ) - results = [] + results = Selectors() for single_selector in split_selectors(selector): # I'm doing this only so the `save` function saves data correctly for combined selectors # Like using the ',' to combine two different selectors that point to different elements. 
@@ -634,10 +598,9 @@ class Selector(SelectorsGeneration): adaptive, auto_save, percentage, - _scrapling_first_match=kwargs.pop("_scrapling_first_match", False), ) - return results + return Selectors(results) except ( SelectorError, SelectorSyntaxError, @@ -652,7 +615,7 @@ class Selector(SelectorsGeneration): auto_save: bool = False, percentage: int = 0, **kwargs: Any, - ) -> Union["Selectors", "TextHandlers"]: + ) -> "Selectors": """Search the current tree with XPath selectors **Important: @@ -672,9 +635,9 @@ class Selector(SelectorsGeneration): :return: `Selectors` class. """ - _first_match = kwargs.pop( - "_scrapling_first_match", False - ) # Used internally only to speed up `css_first` and `xpath_first` + if self._is_text_node(self._root): + return Selectors() + try: if elements := self._root.xpath(selector, **kwargs): if not self.__adaptive_enabled and auto_save: @@ -684,7 +647,7 @@ class Selector(SelectorsGeneration): elif self.__adaptive_enabled and auto_save: self.save(elements[0], identifier or selector) - return self.__handle_elements(elements[0:1] if (_first_match and elements) else elements) + return self.__handle_elements(elements) elif self.__adaptive_enabled: if adaptive: element_data = self.retrieve(identifier or selector) @@ -693,7 +656,7 @@ class Selector(SelectorsGeneration): if elements is not None and auto_save: self.save(elements[0], identifier or selector) - return self.__handle_elements(elements[0:1] if (_first_match and elements) else elements) + return self.__handle_elements(elements) else: if adaptive: log.warning( @@ -704,7 +667,7 @@ class Selector(SelectorsGeneration): "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info." 
) - return self.__handle_elements(elements[0:1] if (_first_match and elements) else elements) + return self.__handle_elements(elements) except ( SelectorError, @@ -725,11 +688,13 @@ class Selector(SelectorsGeneration): :param kwargs: The attributes you want to filter elements based on it. :return: The `Selectors` object of the elements or empty list """ + if self._is_text_node(self._root): + return Selectors() if not args and not kwargs: raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.") - attributes = dict() + attributes: Dict[str, Any] = dict() tags: Set[str] = set() patterns: Set[Pattern] = set() results, functions, selectors = Selectors(), [], [] @@ -828,21 +793,19 @@ class Selector(SelectorsGeneration): :param candidate: The element to compare with the original element. :return: A percentage score of how similar is the candidate to the original element """ - score, checks = 0, 0 + score: float = 0 + checks: int = 0 data = _StorageTools.element_to_dict(candidate) - # Possible TODO: - # Study the idea of giving weight to each test below so some are more important than others - # Current results: With weights some websites had better score while it was worse for others - score += 1 if original["tag"] == data["tag"] else 0 # * 0.3 # 30% + score += 1 if original["tag"] == data["tag"] else 0 checks += 1 if original["text"]: - score += SequenceMatcher(None, original["text"], data.get("text") or "").ratio() # * 0.3 # 30% + score += SequenceMatcher(None, original["text"], data.get("text") or "").ratio() checks += 1 # if both don't have attributes, it still counts for something! - score += self.__calculate_dict_diff(original["attributes"], data["attributes"]) # * 0.3 # 30% + score += self.__calculate_dict_diff(original["attributes"], data["attributes"]) checks += 1 # Separate similarity test for class, id, href,... 
this will help in full structural changes @@ -857,23 +820,19 @@ class Selector(SelectorsGeneration): None, original["attributes"][attrib], data["attributes"].get(attrib) or "", - ).ratio() # * 0.3 # 30% + ).ratio() checks += 1 - score += SequenceMatcher(None, original["path"], data["path"]).ratio() # * 0.1 # 10% + score += SequenceMatcher(None, original["path"], data["path"]).ratio() checks += 1 if original.get("parent_name"): # Then we start comparing parents' data if data.get("parent_name"): - score += SequenceMatcher( - None, original["parent_name"], data.get("parent_name") or "" - ).ratio() # * 0.2 # 20% + score += SequenceMatcher(None, original["parent_name"], data.get("parent_name") or "").ratio() checks += 1 - score += self.__calculate_dict_diff( - original["parent_attribs"], data.get("parent_attribs") or {} - ) # * 0.2 # 20% + score += self.__calculate_dict_diff(original["parent_attribs"], data.get("parent_attribs") or {}) checks += 1 if original["parent_text"]: @@ -881,14 +840,14 @@ class Selector(SelectorsGeneration): None, original["parent_text"], data.get("parent_text") or "", - ).ratio() # * 0.1 # 10% + ).ratio() checks += 1 # else: # # The original element has a parent and this one not, this is not a good sign # score -= 0.1 if original.get("siblings"): - score += SequenceMatcher(None, original["siblings"], data.get("siblings") or []).ratio() # * 0.1 # 10% + score += SequenceMatcher(None, original["siblings"], data.get("siblings") or []).ratio() checks += 1 # How % sure? let's see @@ -908,15 +867,15 @@ class Selector(SelectorsGeneration): :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See the docs for more info. 
""" - if self.__adaptive_enabled: - target = element - if isinstance(target, self.__class__): - target: HtmlElement = target._root + if self.__adaptive_enabled and self._storage: + target_element: Any = element + if isinstance(target_element, self.__class__): + target_element = target_element._root - if self._is_text_node(target): - target: HtmlElement = target.getparent() + if self._is_text_node(target_element): + target_element = target_element.getparent() - self._storage.save(target, identifier) + self._storage.save(target_element, identifier) else: raise RuntimeError( "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance." @@ -929,7 +888,7 @@ class Selector(SelectorsGeneration): the docs for more info. :return: A dictionary of the unique properties """ - if self.__adaptive_enabled: + if self.__adaptive_enabled and self._storage: return self._storage.retrieve(identifier) raise RuntimeError( @@ -939,6 +898,8 @@ class Selector(SelectorsGeneration): # Operations on text functions def json(self) -> Dict: """Return JSON response if the response is jsonable otherwise throws error""" + if self._is_text_node(self._root): + return TextHandler(str(self._root)).json() if self._raw_body and isinstance(self._raw_body, (str, bytes)): if isinstance(self._raw_body, str): return TextHandler(self._raw_body).json() @@ -1004,7 +965,8 @@ class Selector(SelectorsGeneration): candidate_attributes = ( self.__get_attributes(candidate, ignore_attributes) if ignore_attributes else candidate.attrib ) - score, checks = 0, 0 + score: float = 0 + checks: int = 0 if original_attributes: score += sum( @@ -1059,6 +1021,9 @@ class Selector(SelectorsGeneration): :return: A ``Selectors`` container of ``Selector`` objects or empty list """ + if self._is_text_node(self._root): + return Selectors() + # We will use the elements' root from now on to get the speed boost of using Lxml directly root = self._root similar_elements = list() @@ -1088,6 +1053,26 @@ 
class Selector(SelectorsGeneration): return Selectors(map(self.__element_convertor, similar_elements)) + @overload + def find_by_text( + self, + text: str, + first_match: Literal[True] = ..., + partial: bool = ..., + case_sensitive: bool = ..., + clean_match: bool = ..., + ) -> "Selector": ... + + @overload + def find_by_text( + self, + text: str, + first_match: Literal[False], + partial: bool = ..., + case_sensitive: bool = ..., + clean_match: bool = ..., + ) -> "Selectors": ... + def find_by_text( self, text: str, @@ -1103,21 +1088,23 @@ class Selector(SelectorsGeneration): :param case_sensitive: if enabled, the letters case will be taken into consideration :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching """ + if self._is_text_node(self._root): + return Selectors() results = Selectors() if not case_sensitive: text = text.lower() - possible_targets = _find_all_elements_with_spaces(self._root) + possible_targets = cast(List, _find_all_elements_with_spaces(self._root)) if possible_targets: for node in self.__elements_convertor(possible_targets): """Check if element matches given text otherwise, traverse the children tree and iterate""" - node_text = node.text + node_text: TextHandler = node.text if clean_match: - node_text = node_text.clean() + node_text = TextHandler(node_text.clean()) if not case_sensitive: - node_text = node_text.lower() + node_text = TextHandler(node_text.lower()) if partial: if text in node_text: @@ -1134,6 +1121,24 @@ class Selector(SelectorsGeneration): return results[0] return results + @overload + def find_by_regex( + self, + query: str | Pattern[str], + first_match: Literal[True] = ..., + case_sensitive: bool = ..., + clean_match: bool = ..., + ) -> "Selector": ... + + @overload + def find_by_regex( + self, + query: str | Pattern[str], + first_match: Literal[False], + case_sensitive: bool = ..., + clean_match: bool = ..., + ) -> "Selectors": ... 
+ def find_by_regex( self, query: str | Pattern[str], @@ -1147,9 +1152,12 @@ class Selector(SelectorsGeneration): :param case_sensitive: If enabled, the letters case will be taken into consideration in the regex. :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching. """ + if self._is_text_node(self._root): + return Selectors() + results = Selectors() - possible_targets = _find_all_elements_with_spaces(self._root) + possible_targets = cast(List, _find_all_elements_with_spaces(self._root)) if possible_targets: for node in self.__elements_convertor(possible_targets): """Check if element matches given regex otherwise, traverse the children tree and iterate""" @@ -1309,31 +1317,39 @@ class Selectors(List[Selector]): """ return self.__class__([element for element in self if func(element)]) - # For easy copy-paste from Scrapy/parsel code when needed :) + @overload + def get(self) -> Optional[TextHandler]: ... + + @overload + def get(self, default: _T) -> Union[TextHandler, _T]: ... + def get(self, default=None): - """Returns the first item of the current list + """Returns the serialized string of the first element, or ``default`` if empty. 
:param default: the default value to return if the current list is empty """ - return self[0] if len(self) > 0 else default + for x in self: + return x.get() + return default - def extract(self): - return self + def getall(self) -> TextHandlers: + """Serialize all elements and return as a TextHandlers list.""" + return TextHandlers([x.get() for x in self]) + extract = getall extract_first = get - get_all = extract @property - def first(self): - """Returns the first item of the current list or `None` if the list is empty""" - return self.get() + def first(self) -> Optional[Selector]: + """Returns the first Selector item of the current list or `None` if the list is empty""" + return self[0] if len(self) > 0 else None @property - def last(self): - """Returns the last item of the current list or `None` if the list is empty""" + def last(self) -> Optional[Selector]: + """Returns the last Selector item of the current list or `None` if the list is empty""" return self[-1] if len(self) > 0 else None @property - def length(self): + def length(self) -> int: """Returns the length of the current list""" return len(self) diff --git a/scrapling/spiders/__init__.py b/scrapling/spiders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..92eb2e9250925e59117f98b33cf6b4dfd885beea --- /dev/null +++ b/scrapling/spiders/__init__.py @@ -0,0 +1,18 @@ +from .request import Request +from .result import CrawlResult +from .scheduler import Scheduler +from .engine import CrawlerEngine +from .session import SessionManager +from .spider import Spider, SessionConfigurationError +from scrapling.engines.toolbelt.custom import Response + +__all__ = [ + "Spider", + "SessionConfigurationError", + "Request", + "CrawlerEngine", + "CrawlResult", + "SessionManager", + "Scheduler", + "Response", +] diff --git a/scrapling/spiders/checkpoint.py b/scrapling/spiders/checkpoint.py new file mode 100644 index 
0000000000000000000000000000000000000000..25de36264241391fd3b004074ac4cdff7551a705 --- /dev/null +++ b/scrapling/spiders/checkpoint.py @@ -0,0 +1,90 @@ +import pickle +from pathlib import Path +from dataclasses import dataclass, field + +import anyio +from anyio import Path as AsyncPath + +from scrapling.core.utils import log +from scrapling.core._types import Set, List, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from scrapling.spiders.request import Request + + +@dataclass +class CheckpointData: + """Container for checkpoint state.""" + + requests: List["Request"] = field(default_factory=list) + seen: Set[bytes] = field(default_factory=set) + + +class CheckpointManager: + """Manages saving and loading checkpoint state to/from disk.""" + + CHECKPOINT_FILE = "checkpoint.pkl" + + def __init__(self, crawldir: str | Path | AsyncPath, interval: float = 300.0): + self.crawldir = AsyncPath(crawldir) + self._checkpoint_path = self.crawldir / self.CHECKPOINT_FILE + self.interval = interval + if not isinstance(interval, (int, float)): + raise TypeError("Checkpoints interval must be integer or float.") + else: + if interval < 0: + raise ValueError("Checkpoints interval must be equal or greater than 0.") + + async def has_checkpoint(self) -> bool: + """Check if a checkpoint exists.""" + return await self._checkpoint_path.exists() + + async def save(self, data: CheckpointData) -> None: + """Save checkpoint data to disk atomically.""" + await self.crawldir.mkdir(parents=True, exist_ok=True) + + temp_path = self._checkpoint_path.with_suffix(".tmp") + + try: + serialized = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL) + async with await anyio.open_file(temp_path, "wb") as f: + await f.write(serialized) + + await temp_path.rename(self._checkpoint_path) + + log.info(f"Checkpoint saved: {len(data.requests)} requests, {len(data.seen)} seen URLs") + except Exception as e: + # Clean up temp file if it exists + if await temp_path.exists(): + await temp_path.unlink() + 
log.error(f"Failed to save checkpoint: {e}") + raise + + async def load(self) -> Optional[CheckpointData]: + """Load checkpoint data from disk. + + Returns None if no checkpoint exists or if loading fails. + """ + if not await self.has_checkpoint(): + return None + + try: + async with await anyio.open_file(self._checkpoint_path, "rb") as f: + content = await f.read() + data: CheckpointData = pickle.loads(content) + + log.info(f"Checkpoint loaded: {len(data.requests)} requests, {len(data.seen)} seen URLs") + return data + + except Exception as e: + log.error(f"Failed to load checkpoint (starting fresh): {e}") + return None + + async def cleanup(self) -> None: + """Delete checkpoint file after successful completion.""" + try: + if await self._checkpoint_path.exists(): + await self._checkpoint_path.unlink() + log.debug("Checkpoint file cleaned up") + except Exception as e: + log.warning(f"Failed to cleanup checkpoint file: {e}") diff --git a/scrapling/spiders/engine.py b/scrapling/spiders/engine.py new file mode 100644 index 0000000000000000000000000000000000000000..416911dba598fe918e154126a4bc05fcbffc6e95 --- /dev/null +++ b/scrapling/spiders/engine.py @@ -0,0 +1,333 @@ +import json +import pprint +from pathlib import Path + +import anyio +from anyio import Path as AsyncPath +from anyio import create_task_group, CapacityLimiter, create_memory_object_stream, EndOfStream + +from scrapling.core.utils import log +from scrapling.spiders.request import Request +from scrapling.spiders.scheduler import Scheduler +from scrapling.spiders.session import SessionManager +from scrapling.spiders.result import CrawlStats, ItemList +from scrapling.spiders.checkpoint import CheckpointManager, CheckpointData +from scrapling.core._types import Dict, Union, Optional, TYPE_CHECKING, Any, AsyncGenerator + +if TYPE_CHECKING: + from scrapling.spiders.spider import Spider + + +def _dump(obj: Dict) -> str: + return json.dumps(obj, indent=4) + + +class CrawlerEngine: + """Orchestrates the 
crawling process.""" + + def __init__( + self, + spider: "Spider", + session_manager: SessionManager, + crawldir: Optional[Union[str, Path, AsyncPath]] = None, + interval: float = 300.0, + ): + self.spider = spider + self.session_manager = session_manager + self.scheduler = Scheduler( + include_kwargs=spider.fp_include_kwargs, + include_headers=spider.fp_include_headers, + keep_fragments=spider.fp_keep_fragments, + ) + self.stats = CrawlStats() + + self._global_limiter = CapacityLimiter(spider.concurrent_requests) + self._domain_limiters: dict[str, CapacityLimiter] = {} + self._allowed_domains: set[str] = spider.allowed_domains or set() + + self._active_tasks: int = 0 + self._running: bool = False + self._items: ItemList = ItemList() + self._item_stream: Any = None + + self._checkpoint_system_enabled = bool(crawldir) + self._checkpoint_manager = CheckpointManager(crawldir or "", interval) + self._last_checkpoint_time: float = 0.0 + self._pause_requested: bool = False + self._force_stop: bool = False + self.paused: bool = False + + def _is_domain_allowed(self, request: Request) -> bool: + """Check if the request's domain is in allowed_domains.""" + if not self._allowed_domains: + return True + + domain = request.domain + for allowed in self._allowed_domains: + if domain == allowed or domain.endswith("." + allowed): + return True + return False + + def _rate_limiter(self, domain: str) -> CapacityLimiter: + """Get or create a per-domain concurrency limiter if enabled, otherwise use the global limiter.""" + if self.spider.concurrent_requests_per_domain: + if domain not in self._domain_limiters: + self._domain_limiters[domain] = CapacityLimiter(self.spider.concurrent_requests_per_domain) + return self._domain_limiters[domain] + return self._global_limiter + + def _normalize_request(self, request: Request) -> None: + """Normalize request fields before enqueueing. + + Resolves empty sid to the session manager's default session ID. 
+ This ensures consistent fingerprinting for requests using the same session. + """ + if not request.sid: + request.sid = self.session_manager.default_session_id + + async def _process_request(self, request: Request) -> None: + """Download and process a single request.""" + async with self._rate_limiter(request.domain): + if self.spider.download_delay: + await anyio.sleep(self.spider.download_delay) + + if request._session_kwargs.get("proxy"): + self.stats.proxies.append(request._session_kwargs["proxy"]) + if request._session_kwargs.get("proxies"): + self.stats.proxies.append(dict(request._session_kwargs["proxies"])) + try: + response = await self.session_manager.fetch(request) + self.stats.increment_requests_count(request.sid or self.session_manager.default_session_id) + self.stats.increment_response_bytes(request.domain, len(response.body)) + self.stats.increment_status(response.status) + + except Exception as e: + self.stats.failed_requests_count += 1 + await self.spider.on_error(request, e) + return + + if await self.spider.is_blocked(response): + self.stats.blocked_requests_count += 1 + if request._retry_count < self.spider.max_blocked_retries: + retry_request = request.copy() + retry_request._retry_count += 1 + retry_request.priority -= 1 # Don't retry immediately + retry_request.dont_filter = True + retry_request._session_kwargs.pop("proxy", None) + retry_request._session_kwargs.pop("proxies", None) + + new_request = await self.spider.retry_blocked_request(retry_request, response) + self._normalize_request(new_request) + await self.scheduler.enqueue(new_request) + log.info( + f"Scheduled blocked request for retry ({retry_request._retry_count}/{self.spider.max_blocked_retries}): {request.url}" + ) + else: + log.warning(f"Max retries exceeded for blocked request: {request.url}") + return + + callback = request.callback if request.callback else self.spider.parse + try: + async for result in callback(response): + if isinstance(result, Request): + if 
self._is_domain_allowed(result): + self._normalize_request(result) + await self.scheduler.enqueue(result) + else: + self.stats.offsite_requests_count += 1 + log.debug(f"Filtered offsite request to: {result.url}") + elif isinstance(result, dict): + processed_result = await self.spider.on_scraped_item(result) + if processed_result: + self.stats.items_scraped += 1 + log.debug(f"Scraped from {str(response)}\n{pprint.pformat(processed_result)}") + if self._item_stream: + await self._item_stream.send(processed_result) + else: + self._items.append(processed_result) + else: + self.stats.items_dropped += 1 + log.warning(f"Dropped from {str(response)}\n{processed_result}") + elif result is not None: + log.error(f"Spider must return Request, dict or None, got '{type(result)}' in {request}") + except Exception as e: + msg = f"Spider error processing {request}:\n {e}" + log.error(msg, exc_info=e) + await self.spider.on_error(request, e) + + async def _task_wrapper(self, request: Request) -> None: + """Wrapper to track active task count.""" + try: + await self._process_request(request) + finally: + self._active_tasks -= 1 + + def request_pause(self) -> None: + """Request a graceful pause of the crawl. + + First call: requests graceful pause (waits for active tasks). + Second call: forces immediate stop. + """ + if self._force_stop: + return # Already forcing stop + + if self._pause_requested: + # Second Ctrl+C - force stop + self._force_stop = True + log.warning("Force stop requested, cancelling immediately...") + else: + self._pause_requested = True + log.info( + "Pause requested, waiting for in-flight requests to complete (press Ctrl+C again to force stop)..." 
+ ) + + async def _save_checkpoint(self) -> None: + """Save current state to checkpoint files.""" + requests, seen = self.scheduler.snapshot() + data = CheckpointData(requests=requests, seen=seen) + await self._checkpoint_manager.save(data) + self._last_checkpoint_time = anyio.current_time() + + def _is_checkpoint_time(self) -> bool: + """Check if it's time for the periodic checkpoint.""" + if not self._checkpoint_system_enabled: + return False + + if self._checkpoint_manager.interval == 0: + return False + + current_time = anyio.current_time() + return (current_time - self._last_checkpoint_time) >= self._checkpoint_manager.interval + + async def _restore_from_checkpoint(self) -> bool: + """Attempt to restore state from checkpoint. + + Returns True if successfully restored, False otherwise. + """ + if not self._checkpoint_system_enabled: + return False + + data = await self._checkpoint_manager.load() + if data is None: + return False + + self.scheduler.restore(data) + + # Restore callbacks from spider after scheduler restore + for request in data.requests: + request._restore_callback(self.spider) + + return True + + async def crawl(self) -> CrawlStats: + """Run the spider and return CrawlStats.""" + self._running = True + self._items.clear() + self.paused = False + self._pause_requested = False + self._force_stop = False + self.stats = CrawlStats(start_time=anyio.current_time()) + + # Check for existing checkpoint + resuming = (await self._restore_from_checkpoint()) if self._checkpoint_system_enabled else False + self._last_checkpoint_time = anyio.current_time() + + async with self.session_manager: + self.stats.concurrent_requests = self.spider.concurrent_requests + self.stats.concurrent_requests_per_domain = self.spider.concurrent_requests_per_domain + self.stats.download_delay = self.spider.download_delay + await self.spider.on_start(resuming=resuming) + + try: + if not resuming: + async for request in self.spider.start_requests(): + self._normalize_request(request) + 
await self.scheduler.enqueue(request) + else: + log.info("Resuming from checkpoint, skipping start_requests()") + + # Process queue + async with create_task_group() as tg: + while self._running: + if self._pause_requested: + if self._active_tasks == 0 or self._force_stop: + if self._force_stop: + log.warning(f"Force stopping with {self._active_tasks} active tasks") + tg.cancel_scope.cancel() + + # Only save checkpoint if checkpoint system is enabled + if self._checkpoint_system_enabled: + await self._save_checkpoint() + self.paused = True + log.info("Spider paused, checkpoint saved") + else: + log.info("Spider stopped gracefully") + + self._running = False + break + + # Wait briefly and check again + await anyio.sleep(0.05) + continue + + if self._checkpoint_system_enabled and self._is_checkpoint_time(): + await self._save_checkpoint() + + if self.scheduler.is_empty: + # Empty queue + no active tasks = done + if self._active_tasks == 0: + self._running = False + log.debug("Spider idle") + break + + # Brief wait for callbacks to enqueue new requests + await anyio.sleep(0.05) + continue + + # Only spawn tasks up to concurrent_requests limit + # This prevents spawning thousands of waiting tasks + if self._active_tasks >= self.spider.concurrent_requests: + await anyio.sleep(0.01) + continue + + request = await self.scheduler.dequeue() + self._active_tasks += 1 + tg.start_soon(self._task_wrapper, request) + + finally: + await self.spider.on_close() + # Clean up checkpoint files on successful completion (not paused) + if not self.paused and self._checkpoint_system_enabled: + await self._checkpoint_manager.cleanup() + + self.stats.log_levels_counter = self.spider._log_counter.get_counts() + self.stats.end_time = anyio.current_time() + log.info(_dump(self.stats.to_dict())) + return self.stats + + @property + def items(self) -> ItemList: + """Access scraped items.""" + return self._items + + def __aiter__(self) -> AsyncGenerator[dict, None]: + return self._stream() + + 
async def _stream(self) -> AsyncGenerator[dict, None]: + """Async generator that runs crawl and yields items.""" + send, recv = create_memory_object_stream[dict](100) + self._item_stream = send + + async def run(): + try: + await self.crawl() + finally: + await send.aclose() + + async with create_task_group() as tg: + tg.start_soon(run) + try: + async for item in recv: + yield item + except EndOfStream: + pass diff --git a/scrapling/spiders/request.py b/scrapling/spiders/request.py new file mode 100644 index 0000000000000000000000000000000000000000..ce728ee2e139897a5d8efe26bb977165195168a1 --- /dev/null +++ b/scrapling/spiders/request.py @@ -0,0 +1,163 @@ +import hashlib +from io import BytesIO +from functools import cached_property +from urllib.parse import urlparse, urlencode + +import orjson +from w3lib.url import canonicalize_url + +from scrapling.engines.toolbelt.custom import Response +from scrapling.core._types import Any, AsyncGenerator, Callable, Dict, Optional, Union, Tuple, TYPE_CHECKING + +if TYPE_CHECKING: + from scrapling.spiders.spider import Spider + + +def _convert_to_bytes(value: str | bytes) -> bytes: + if isinstance(value, bytes): + return value + if not isinstance(value, str): + raise TypeError(f"Can't convert {type(value).__name__} to bytes") + + return value.encode(encoding="utf-8", errors="ignore") + + +class Request: + def __init__( + self, + url: str, + sid: str = "", + callback: Callable[[Response], AsyncGenerator[Union[Dict[str, Any], "Request", None], None]] | None = None, + priority: int = 0, + dont_filter: bool = False, + meta: dict[str, Any] | None = None, + _retry_count: int = 0, + **kwargs: Any, + ) -> None: + self.url: str = url + self.sid: str = sid + self.callback = callback + self.priority: int = priority + self.dont_filter: bool = dont_filter + self.meta: dict[str, Any] = meta if meta else {} + self._retry_count: int = _retry_count + self._session_kwargs = kwargs if kwargs else {} + self._fp: Optional[bytes] = None + + def 
copy(self) -> "Request": + """Create a copy of this request.""" + return Request( + url=self.url, + sid=self.sid, + callback=self.callback, + priority=self.priority, + dont_filter=self.dont_filter, + meta=self.meta.copy(), + _retry_count=self._retry_count, + **self._session_kwargs, + ) + + @cached_property + def domain(self) -> str: + return urlparse(self.url).netloc + + def update_fingerprint( + self, + include_kwargs: bool = False, + include_headers: bool = False, + keep_fragments: bool = False, + ) -> bytes: + """Generate a unique fingerprint for deduplication. + + Caches the result in self._fp after first computation. + """ + if self._fp is not None: + return self._fp + + post_data = self._session_kwargs.get("data", {}) + body = b"" + if post_data: + if isinstance(post_data, dict | list | tuple): + body = urlencode(post_data).encode() + elif isinstance(post_data, str): + body = post_data.encode() + elif isinstance(post_data, BytesIO): + body = post_data.getvalue() + elif isinstance(post_data, bytes): + body = post_data + else: + post_data = self._session_kwargs.get("json", {}) + body = orjson.dumps(post_data) if post_data else b"" + + data: Dict[str, str | Tuple] = { + "sid": self.sid, + "body": body.hex(), + "method": self._session_kwargs.get("method", "GET"), + "url": canonicalize_url(self.url, keep_fragments=keep_fragments), + } + + if include_kwargs: + kwargs = (key.lower() for key in self._session_kwargs.keys() if key.lower() not in ("data", "json")) + data["kwargs"] = "".join(set(_convert_to_bytes(key).hex() for key in kwargs)) + + if include_headers: + headers = self._session_kwargs.get("headers") or self._session_kwargs.get("extra_headers") or {} + processed_headers = {} + # Some header normalization + for key, value in headers.items(): + processed_headers[_convert_to_bytes(key.lower()).hex()] = _convert_to_bytes(value.lower()).hex() + data["headers"] = tuple(processed_headers.items()) + + fp = hashlib.sha1(orjson.dumps(data, 
option=orjson.OPT_SORT_KEYS), usedforsecurity=False).digest() + self._fp = fp + return fp + + def __repr__(self) -> str: + callback_name = getattr(self.callback, "__name__", None) or "None" + return f"<Request url='{self.url}' callback={callback_name}>" + + def __str__(self) -> str: + return self.url + + def __lt__(self, other: object) -> bool: + """Compare requests by priority""" + if not isinstance(other, Request): + return NotImplemented + return self.priority < other.priority + + def __gt__(self, other: object) -> bool: + """Compare requests by priority""" + if not isinstance(other, Request): + return NotImplemented + return self.priority > other.priority + + def __eq__(self, other: object) -> bool: + """Requests are equal if they have the same fingerprint.""" + if not isinstance(other, Request): + return NotImplemented + if self._fp is None or other._fp is None: + raise RuntimeError("Cannot compare requests before generating their fingerprints!") + return self._fp == other._fp + + def __getstate__(self) -> dict[str, Any]: + """Prepare state for pickling - store callback as name string for pickle compatibility.""" + state = self.__dict__.copy() + state["_callback_name"] = getattr(self.callback, "__name__", None) if self.callback is not None else None + state["callback"] = None # Don't pickle the actual callable + return state + + def __setstate__(self, state: dict[str, Any]) -> None: + """Restore state from pickle - callback restored later via _restore_callback().""" + self._callback_name: str | None = state.pop("_callback_name", None) + self.__dict__.update(state) + + def _restore_callback(self, spider: "Spider") -> None: + """Restore callback from spider after unpickling. 
+ + :param spider: Spider instance to look up callback method on + """ + if hasattr(self, "_callback_name") and self._callback_name: + self.callback = getattr(spider, self._callback_name, None) or spider.parse + del self._callback_name + elif hasattr(self, "_callback_name"): + del self._callback_name diff --git a/scrapling/spiders/result.py b/scrapling/spiders/result.py new file mode 100644 index 0000000000000000000000000000000000000000..08a765812f0d50d1645613236f5a9e26a73b5ca0 --- /dev/null +++ b/scrapling/spiders/result.py @@ -0,0 +1,125 @@ +from pathlib import Path +from dataclasses import dataclass, field + +import orjson + +from scrapling.core.utils import log +from scrapling.core._types import Any, Iterator, Dict, List, Tuple, Union + + +class ItemList(list): + """A list of scraped items with export capabilities.""" + + def to_json(self, path: Union[str, Path], *, indent: bool = False): + """Export items to a JSON file. + + :param path: Path to the output file + :param indent: Pretty-print with 2-space indentation (slightly slower) + """ + options = orjson.OPT_SERIALIZE_NUMPY + if indent: + options |= orjson.OPT_INDENT_2 + + file = Path(path) + file.parent.mkdir(parents=True, exist_ok=True) + file.write_bytes(orjson.dumps(list(self), option=options)) + log.info("Saved %d items to %s", len(self), path) + + def to_jsonl(self, path: Union[str, Path]): + """Export items as JSON Lines (one JSON object per line). 
+ + :param path: Path to the output file + """ + Path(path).parent.mkdir(parents=True, exist_ok=True) + with open(path, "wb") as f: + for item in self: + f.write(orjson.dumps(item, option=orjson.OPT_SERIALIZE_NUMPY)) + f.write(b"\n") + log.info("Saved %d items to %s", len(self), path) + + +@dataclass +class CrawlStats: + """Statistics for a crawl run.""" + + requests_count: int = 0 + concurrent_requests: int = 0 + concurrent_requests_per_domain: int = 0 + failed_requests_count: int = 0 + offsite_requests_count: int = 0 + response_bytes: int = 0 + items_scraped: int = 0 + items_dropped: int = 0 + start_time: float = 0.0 + end_time: float = 0.0 + download_delay: float = 0.0 + blocked_requests_count: int = 0 + custom_stats: Dict = field(default_factory=dict) + response_status_count: Dict = field(default_factory=dict) + domains_response_bytes: Dict = field(default_factory=dict) + sessions_requests_count: Dict = field(default_factory=dict) + proxies: List[str | Dict | Tuple] = field(default_factory=list) + log_levels_counter: Dict = field(default_factory=dict) + + @property + def elapsed_seconds(self) -> float: + return self.end_time - self.start_time + + @property + def requests_per_second(self) -> float: + if self.elapsed_seconds == 0: + return 0.0 + return self.requests_count / self.elapsed_seconds + + def increment_status(self, status: int) -> None: + self.response_status_count[f"status_{status}"] = self.response_status_count.get(f"status_{status}", 0) + 1 + + def increment_response_bytes(self, domain: str, count: int) -> None: + self.response_bytes += count + self.domains_response_bytes[domain] = self.domains_response_bytes.get(domain, 0) + count + + def increment_requests_count(self, sid: str) -> None: + self.requests_count += 1 + self.sessions_requests_count[sid] = self.sessions_requests_count.get(sid, 0) + 1 + + def to_dict(self) -> dict[str, Any]: + return { + "items_scraped": self.items_scraped, + "items_dropped": self.items_dropped, + "elapsed_seconds": 
round(self.elapsed_seconds, 2), + "download_delay": round(self.download_delay, 2), + "concurrent_requests": self.concurrent_requests, + "concurrent_requests_per_domain": self.concurrent_requests_per_domain, + "requests_count": self.requests_count, + "requests_per_second": round(self.requests_per_second, 2), + "sessions_requests_count": self.sessions_requests_count, + "failed_requests_count": self.failed_requests_count, + "offsite_requests_count": self.offsite_requests_count, + "blocked_requests_count": self.blocked_requests_count, + "response_status_count": self.response_status_count, + "response_bytes": self.response_bytes, + "domains_response_bytes": self.domains_response_bytes, + "proxies": self.proxies, + "custom_stats": self.custom_stats, + "log_count": self.log_levels_counter, + } + + +@dataclass +class CrawlResult: + """Complete result from a spider run.""" + + stats: CrawlStats + items: ItemList + paused: bool = False + + @property + def completed(self) -> bool: + """True if the crawl completed normally (not paused).""" + return not self.paused + + def __len__(self) -> int: + return len(self.items) + + def __iter__(self) -> Iterator[dict[str, Any]]: + return iter(self.items) diff --git a/scrapling/spiders/scheduler.py b/scrapling/spiders/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..aca32779cc7fd748fea680e2d3f72f180868ba58 --- /dev/null +++ b/scrapling/spiders/scheduler.py @@ -0,0 +1,80 @@ +import asyncio +from itertools import count + +from scrapling.core.utils import log +from scrapling.spiders.request import Request +from scrapling.core._types import List, Set, Tuple, TYPE_CHECKING + +if TYPE_CHECKING: + from scrapling.spiders.checkpoint import CheckpointData + + +class Scheduler: + """ + Priority queue with URL deduplication. (heapq) + + Higher priority requests are processed first. + Duplicate URLs are filtered unless dont_filter=True. 
+ """ + + def __init__(self, include_kwargs: bool = False, include_headers: bool = False, keep_fragments: bool = False): + self._queue: asyncio.PriorityQueue[tuple[int, int, Request]] = asyncio.PriorityQueue() + self._seen: set[bytes] = set() + self._counter = count() + # Mirror dict for snapshot without draining queue + self._pending: dict[int, tuple[int, int, Request]] = {} + self._include_kwargs = include_kwargs + self._include_headers = include_headers + self._keep_fragments = keep_fragments + + async def enqueue(self, request: Request) -> bool: + """Add a request to the queue.""" + fingerprint = request.update_fingerprint(self._include_kwargs, self._include_headers, self._keep_fragments) + + if not request.dont_filter and fingerprint in self._seen: + log.debug("Dropped duplicate request: %s", request) + return False + + self._seen.add(fingerprint) + + # Negative priority so higher priority = dequeued first + counter = next(self._counter) + item = (-request.priority, counter, request) + self._pending[counter] = item + await self._queue.put(item) + return True + + async def dequeue(self) -> Request: + """Get the next request to process.""" + _, counter, request = await self._queue.get() + self._pending.pop(counter, None) + return request + + def __len__(self) -> int: + return self._queue.qsize() + + @property + def is_empty(self) -> bool: + return self._queue.empty() + + def snapshot(self) -> Tuple[List[Request], Set[bytes]]: + """Create a snapshot of the current state for checkpoints.""" + sorted_items = sorted(self._pending.values(), key=lambda x: (x[0], x[1])) # Maintain queue order + requests = [item[2] for item in sorted_items] + return requests, self._seen.copy() + + def restore(self, data: "CheckpointData") -> None: + """Restore scheduler state from checkpoint data. 
+ + :param data: CheckpointData containing requests and seen set + """ + self._seen = data.seen.copy() + + # Restore pending requests in order (they're already sorted by priority) + for request in data.requests: + counter = next(self._counter) + item = (-request.priority, counter, request) + self._pending[counter] = item + self._queue.put_nowait(item) + + log.info(f"Scheduler restored: {len(data.requests)} requests, {len(data.seen)} seen") diff --git a/scrapling/spiders/session.py b/scrapling/spiders/session.py new file mode 100644 index 0000000000000000000000000000000000000000..cc07042bad1c6268cf47068d1d4d2970e2c4e578 --- /dev/null +++ b/scrapling/spiders/session.py @@ -0,0 +1,145 @@ +from asyncio import Lock + +from scrapling.spiders.request import Request +from scrapling.engines.static import _ASyncSessionLogic +from scrapling.engines.toolbelt.convertor import Response +from scrapling.core._types import Set, cast, SUPPORTED_HTTP_METHODS +from scrapling.fetchers import AsyncDynamicSession, AsyncStealthySession, FetcherSession + +Session = FetcherSession | AsyncDynamicSession | AsyncStealthySession + + +class SessionManager: + """Manages pre-configured session instances.""" + + def __init__(self) -> None: + self._sessions: dict[str, Session] = {} + self._default_session_id: str | None = None + self._started: bool = False + self._lazy_sessions: Set[str] = set() + self._lazy_lock = Lock() + + def add(self, session_id: str, session: Session, *, default: bool = False, lazy: bool = False) -> "SessionManager": + """Register a session instance. + + :param session_id: Name to reference this session in requests + :param session: Your pre-configured session instance + :param default: If True, this becomes the default session + :param lazy: If True, the session will be started only when a request uses its ID. 
+ """ + if session_id in self._sessions: + raise ValueError(f"Session '{session_id}' already registered") + + self._sessions[session_id] = session + + if default or self._default_session_id is None: + self._default_session_id = session_id + + if lazy: + self._lazy_sessions.add(session_id) + + return self + + def remove(self, session_id: str) -> None: + """Removes a session. + + :param session_id: ID of session to remove + """ + _ = self.pop(session_id) + + def pop(self, session_id: str) -> Session: + """Remove and returns a session. + + :param session_id: ID of session to remove + """ + if session_id not in self._sessions: + raise KeyError(f"Session '{session_id}' not found") + + session = self._sessions.pop(session_id) + if session_id in self._lazy_sessions: + self._lazy_sessions.remove(session_id) + + if session and self._default_session_id == session_id: + self._default_session_id = next(iter(self._sessions), None) + + return session + + @property + def default_session_id(self) -> str: + if self._default_session_id is None: + raise RuntimeError("No sessions registered") + return self._default_session_id + + @property + def session_ids(self) -> list[str]: + return list(self._sessions.keys()) + + def get(self, session_id: str) -> Session: + if session_id not in self._sessions: + available = ", ".join(self._sessions.keys()) + raise KeyError(f"Session '{session_id}' not found. 
Available: {available}") + return self._sessions[session_id] + + async def start(self) -> None: + """Start all sessions that aren't already alive.""" + if self._started: + return + + for sid, session in self._sessions.items(): + if sid not in self._lazy_sessions and not session._is_alive: + await session.__aenter__() + + self._started = True + + async def close(self) -> None: + """Close all registered sessions.""" + for session in self._sessions.values(): + _ = await session.__aexit__(None, None, None) + + self._started = False + + async def fetch(self, request: Request) -> Response: + sid = request.sid if request.sid else self.default_session_id + session = self.get(sid) + + if session: + if sid in self._lazy_sessions and not session._is_alive: + async with self._lazy_lock: + if not session._is_alive: + await session.__aenter__() + + if isinstance(session, FetcherSession): + client = session._client + + if isinstance(client, _ASyncSessionLogic): + response = await client._make_request( + method=cast(SUPPORTED_HTTP_METHODS, request._session_kwargs.pop("method", "GET")), + url=request.url, + **request._session_kwargs, + ) + else: + # Sync session or other types - shouldn't happen in async context + raise TypeError(f"Session type {type(client)} not supported for async fetch") + else: + response = await session.fetch(url=request.url, **request._session_kwargs) + + response.request = request + # Merge request meta into response meta (response meta takes priority) + response.meta = {**request.meta, **response.meta} + return response + raise RuntimeError("No session found with the request session id") + + async def __aenter__(self) -> "SessionManager": + await self.start() + return self + + async def __aexit__(self, *exc) -> None: + await self.close() + + def __contains__(self, session_id: str) -> bool: + """Check if a session ID is registered.""" + return session_id in self._sessions + + def __len__(self) -> int: + """Number of registered sessions.""" + return 
len(self._sessions) diff --git a/scrapling/spiders/spider.py b/scrapling/spiders/spider.py new file mode 100644 index 0000000000000000000000000000000000000000..4f3891235bafef7a995625ed1f2d8c4caf43062a --- /dev/null +++ b/scrapling/spiders/spider.py @@ -0,0 +1,316 @@ +import signal +import logging +from pathlib import Path +from abc import ABC, abstractmethod + +import anyio +from anyio import Path as AsyncPath + +from scrapling.spiders.request import Request +from scrapling.spiders.engine import CrawlerEngine +from scrapling.spiders.session import SessionManager +from scrapling.core.utils import set_logger, reset_logger +from scrapling.spiders.result import CrawlResult, CrawlStats +from scrapling.core._types import Set, Any, Dict, Optional, Union, TYPE_CHECKING, AsyncGenerator + +BLOCKED_CODES = {401, 403, 407, 429, 444, 500, 502, 503, 504} +if TYPE_CHECKING: + from scrapling.engines.toolbelt.custom import Response + + +class LogCounterHandler(logging.Handler): + """A logging handler that counts log messages by level.""" + + def __init__(self): + super().__init__() + self.counts = { + logging.DEBUG: 0, + logging.INFO: 0, + logging.WARNING: 0, + logging.ERROR: 0, + logging.CRITICAL: 0, + } + + def emit(self, record: logging.LogRecord) -> None: + level = record.levelno + # Map to the closest standard level + if level >= logging.CRITICAL: + self.counts[logging.CRITICAL] += 1 + elif level >= logging.ERROR: + self.counts[logging.ERROR] += 1 + elif level >= logging.WARNING: + self.counts[logging.WARNING] += 1 + elif level >= logging.INFO: + self.counts[logging.INFO] += 1 + else: + self.counts[logging.DEBUG] += 1 + + def get_counts(self) -> Dict[str, int]: + """Return counts as a dictionary with string keys.""" + return { + "debug": self.counts[logging.DEBUG], + "info": self.counts[logging.INFO], + "warning": self.counts[logging.WARNING], + "error": self.counts[logging.ERROR], + "critical": self.counts[logging.CRITICAL], + } + + +class 
SessionConfigurationError(Exception): + """Raised when session configuration fails.""" + + pass + + +class Spider(ABC): + """An abstract base class for creating web spiders. + + Check the documentation website for more information. + """ + + name: Optional[str] = None + start_urls: list[str] = [] + allowed_domains: Set[str] = set() + + # Concurrency settings + concurrent_requests: int = 4 + concurrent_requests_per_domain: int = 0 + download_delay: float = 0.0 + max_blocked_retries: int = 3 + + # Fingerprint adjustments + fp_include_kwargs: bool = False + fp_keep_fragments: bool = False + fp_include_headers: bool = False + + # Logging settings + logging_level: int = logging.DEBUG + logging_format: str = "[%(asctime)s]:({spider_name}) %(levelname)s: %(message)s" + logging_date_format: str = "%Y-%m-%d %H:%M:%S" + log_file: Optional[str] = None + + def __init__(self, crawldir: Optional[Union[str, Path, AsyncPath]] = None, interval: float = 300.0): + """Initialize the spider. + + :param crawldir: Directory for checkpoint files. If provided, enables pause/resume. + :param interval: Seconds between periodic checkpoint saves (default 5 minutes). 
+ """ + if self.name is None: + raise ValueError(f"{self.__class__.__name__} must have a name.") + + self.logger = logging.getLogger(f"scrapling.spiders.{self.name}") + self.logger.setLevel(self.logging_level) + self.logger.handlers.clear() + self.logger.propagate = False # Don't propagate to parent 'scrapling' logger + + formatter = logging.Formatter( + fmt=self.logging_format.format(spider_name=self.name), datefmt=self.logging_date_format + ) + + # Add a log counter handler to track log counts by level + self._log_counter = LogCounterHandler() + self.logger.addHandler(self._log_counter) + + console_handler = logging.StreamHandler() + console_handler.setFormatter(formatter) + self.logger.addHandler(console_handler) + + if self.log_file: + Path(self.log_file).parent.mkdir(parents=True, exist_ok=True) + file_handler = logging.FileHandler(self.log_file) + file_handler.setFormatter(formatter) + self.logger.addHandler(file_handler) + + self.crawldir: Optional[Path] = Path(crawldir) if crawldir else None + self._interval = interval + self._engine: Optional[CrawlerEngine] = None + self._original_sigint_handler: Any = None + + self._session_manager = SessionManager() + + try: + self.configure_sessions(self._session_manager) + except Exception as e: + raise SessionConfigurationError(f"Error in {self.__class__.__name__}.configure_sessions(): {e}") from e + + if len(self._session_manager) == 0: + raise SessionConfigurationError(f"{self.__class__.__name__}.configure_sessions() did not add any sessions") + + self.logger.info("Spider initialized") + + async def start_requests(self) -> AsyncGenerator[Request, None]: + """Generate initial requests to start the crawl. + + By default, this generates Request objects for each URL in `start_urls` + using the session manager's default session and `parse()` as callback. + + Override this method for more control over initial requests + (e.g., to add custom headers, use different callbacks, etc.) 
+ """ + if not self.start_urls: + raise RuntimeError( + "Spider has no starting point, either set `start_urls` or override `start_requests` function." + ) + + for url in self.start_urls: + yield Request(url, sid=self._session_manager.default_session_id) + + @abstractmethod + async def parse(self, response: "Response") -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + """Default callback for processing responses""" + raise NotImplementedError(f"{self.__class__.__name__} must implement parse() method") + yield # Make this a generator for type checkers + + async def on_start(self, resuming: bool = False) -> None: + """Called before crawling starts. Override for setup logic. + + :param resuming: It's enabled if the spider is resuming from a checkpoint, left for the user to use. + """ + if resuming: + self.logger.debug("Resuming spider from checkpoint") + else: + self.logger.debug("Starting spider") + + async def on_close(self) -> None: + """Called after crawling finishes. Override for cleanup logic.""" + self.logger.debug("Spider closed") + + async def on_error(self, request: Request, error: Exception) -> None: + """ + Handle request errors for all spider requests. + + Override for custom error handling. + """ + pass + + async def on_scraped_item(self, item: Dict[str, Any]) -> Dict[str, Any] | None: + """A hook to be overridden by users to do some processing on scraped items, return `None` to drop the item silently.""" + return item + + async def is_blocked(self, response: "Response") -> bool: + """Check if the response is blocked. 
Users should override this for custom detection logic.""" + if response.status in BLOCKED_CODES: + return True + return False + + async def retry_blocked_request(self, request: Request, response: "Response") -> Request: + """Users should override this to prepare the blocked request before retrying, if needed.""" + return request + + def __repr__(self) -> str: + """String representation of the spider.""" + return f"<{self.__class__.__name__} '{self.name}'>" + + def configure_sessions(self, manager: SessionManager) -> None: + """Configure sessions for this spider. + + Override this method to add custom sessions. + The default implementation creates a FetcherSession session. + + The first session added becomes the default for `start_requests()` unless specified otherwise. + + :param manager: SessionManager to configure + """ + from scrapling.fetchers import FetcherSession + + manager.add("default", FetcherSession()) + + def pause(self): + """Request graceful shutdown of the crawling process.""" + if self._engine: + self._engine.request_pause() + else: + raise RuntimeError("No active crawl to stop") + + def _setup_signal_handler(self) -> None: + """Set up SIGINT handler for graceful pause.""" + + def handler(_signum: int, _frame: Any) -> None: + if self._engine: + self._engine.request_pause() + else: + # No engine yet, just raise KeyboardInterrupt + raise KeyboardInterrupt + + try: + self._original_sigint_handler = signal.signal(signal.SIGINT, handler) + except ValueError: + self._original_sigint_handler = None + + def _restore_signal_handler(self) -> None: + """Restore original SIGINT handler.""" + if self._original_sigint_handler is not None: + try: + signal.signal(signal.SIGINT, self._original_sigint_handler) + except ValueError: + pass + + async def __run(self) -> CrawlResult: + token = set_logger(self.logger) + try: + self._engine = CrawlerEngine(self, self._session_manager, self.crawldir, self._interval) + stats = await self._engine.crawl() + paused = 
self._engine.paused + return CrawlResult(stats=stats, items=self._engine.items, paused=paused) + finally: + self._engine = None + reset_logger(token) + # Close any file handlers to release file resources. + if self.log_file: + for handler in self.logger.handlers: + if isinstance(handler, logging.FileHandler): + handler.close() + + def start(self, use_uvloop: bool = False, **backend_options: Any) -> CrawlResult: + """Run the spider and return results. + + This is the main entry point for running a spider. + Handles async execution internally via anyio. + + Pressing Ctrl+C will initiate graceful shutdown (waits for active tasks to complete). + Pressing Ctrl+C a second time will force immediate stop. + + If crawldir is set, a checkpoint will also be saved on graceful shutdown, + allowing you to resume the crawl later by running the spider again. + + :param use_uvloop: Whether to use the faster uvloop/winloop event loop implementation, if available. + :param backend_options: Asyncio backend options to be used with `anyio.run` + """ + backend_options = backend_options or {} + if use_uvloop: + backend_options.update({"use_uvloop": True}) + + # Set up SIGINT handler for graceful shutdown + self._setup_signal_handler() + try: + return anyio.run(self.__run, backend="asyncio", backend_options=backend_options) + finally: + self._restore_signal_handler() + + async def stream(self) -> AsyncGenerator[Dict[str, Any], None]: + """Stream items as they're scraped. Ideal for long-running spiders or building applications on top of the spiders. + + Must be called from an async context. Yields items one by one as they are scraped. + Access `spider.stats` during iteration for real-time statistics. + + Note: SIGINT handling for pause/resume is not available in stream mode. 
+ """ + token = set_logger(self.logger) + try: + self._engine = CrawlerEngine(self, self._session_manager, self.crawldir, self._interval) + async for item in self._engine: + yield item + finally: + self._engine = None + reset_logger(token) + if self.log_file: + for handler in self.logger.handlers: + if isinstance(handler, logging.FileHandler): + handler.close() + + @property + def stats(self) -> CrawlStats: + """Access current crawl stats (works during streaming).""" + if self._engine: + return self._engine.stats + raise RuntimeError("No active crawl. Use this property inside `async for item in spider.stream():`") diff --git a/setup.cfg b/setup.cfg index 23fdc33ed7d0c2d40bf899aadf9230d0249c372f..d8aedf7acda10a9798607ae452a425023f21c7b7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = scrapling -version = 0.3.14 +version = 0.4 author = Karim Shoair author_email = karim.shoair@pm.me description = Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be! 
diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py index e2f86f3cbf408daa5056337f3c48937828638c16..39345d0a06b060a0c75fa6909f31011f2f2d4ff3 100644 --- a/tests/cli/test_cli.py +++ b/tests/cli/test_cli.py @@ -17,7 +17,6 @@ def configure_selector_mock(): mock_response.html_content = "Test content" mock_response.encoding = "utf-8" mock_response.get_all_text.return_value = "Test content" - mock_response.css_first.return_value = mock_response mock_response.css.return_value = [mock_response] return mock_response diff --git a/tests/fetchers/async/test_dynamic_session.py b/tests/fetchers/async/test_dynamic_session.py index c403017f125bb1289da2b4a4a5fef15fc9ea3771..95bb2891049d7aecb210483ce6ec091b515d9ae2 100644 --- a/tests/fetchers/async/test_dynamic_session.py +++ b/tests/fetchers/async/test_dynamic_session.py @@ -43,7 +43,7 @@ class TestAsyncDynamicSession: assert stats["total_pages"] <= 3 # After exit, should be closed - assert session._closed is True + assert session._is_alive is False # Should raise RuntimeError when used after closing with pytest.raises(RuntimeError): diff --git a/tests/fetchers/async/test_stealth_session.py b/tests/fetchers/async/test_stealth_session.py index 140742db63ed620b760f07e2b558888e44749191..db790020ce378b4079272887c8f3055721539ab7 100644 --- a/tests/fetchers/async/test_stealth_session.py +++ b/tests/fetchers/async/test_stealth_session.py @@ -44,7 +44,7 @@ class TestAsyncStealthySession: assert stats["total_pages"] <= 3 # After exit, should be closed - assert session._closed is True + assert session._is_alive is False # Should raise RuntimeError when used after closing with pytest.raises(RuntimeError): diff --git a/tests/fetchers/test_constants.py b/tests/fetchers/test_constants.py index 36662c6b9776602e7b3c4c76169e412f10f22e87..91afd4cc1ac2b49e6f0b20eb12f3114aa62a4be5 100644 --- a/tests/fetchers/test_constants.py +++ b/tests/fetchers/test_constants.py @@ -1,9 +1,4 @@ -from scrapling.engines.constants import ( - 
DEFAULT_DISABLED_RESOURCES, - DEFAULT_STEALTH_FLAGS, - HARMFUL_DEFAULT_ARGS, - DEFAULT_FLAGS, -) +from scrapling.engines.constants import EXTRA_RESOURCES, STEALTH_ARGS, HARMFUL_ARGS, DEFAULT_ARGS class TestConstants: @@ -11,18 +6,18 @@ class TestConstants: def test_default_disabled_resources(self): """Test default disabled resources""" - assert "image" in DEFAULT_DISABLED_RESOURCES - assert "font" in DEFAULT_DISABLED_RESOURCES - assert "stylesheet" in DEFAULT_DISABLED_RESOURCES - assert "media" in DEFAULT_DISABLED_RESOURCES + assert "image" in EXTRA_RESOURCES + assert "font" in EXTRA_RESOURCES + assert "stylesheet" in EXTRA_RESOURCES + assert "media" in EXTRA_RESOURCES def test_harmful_default_args(self): """Test harmful default arguments""" - assert "--enable-automation" in HARMFUL_DEFAULT_ARGS - assert "--disable-popup-blocking" in HARMFUL_DEFAULT_ARGS + assert "--enable-automation" in HARMFUL_ARGS + assert "--disable-popup-blocking" in HARMFUL_ARGS def test_flags(self): """Test default stealth flags""" - assert "--no-pings" in DEFAULT_FLAGS - # assert "--incognito" in DEFAULT_STEALTH_FLAGS - assert "--disable-blink-features=AutomationControlled" in DEFAULT_STEALTH_FLAGS + assert "--no-pings" in DEFAULT_ARGS + # assert "--incognito" in STEALTH_ARGS + assert "--disable-blink-features=AutomationControlled" in STEALTH_ARGS diff --git a/tests/fetchers/test_proxy_rotation.py b/tests/fetchers/test_proxy_rotation.py new file mode 100644 index 0000000000000000000000000000000000000000..e08104db4a20680c3022e33f713386dfc42e2594 --- /dev/null +++ b/tests/fetchers/test_proxy_rotation.py @@ -0,0 +1,303 @@ +import pytest +import random +from threading import Thread +from concurrent.futures import ThreadPoolExecutor + +from scrapling.engines.toolbelt import ProxyRotator, is_proxy_error, cyclic_rotation + + +class TestCyclicRotationStrategy: + """Test the default cyclic_rotation strategy function""" + + def test_cyclic_rotation_cycles_through_proxies(self): + """Test that 
cyclic_rotation returns proxies in order""" + proxies = ["http://p1:8080", "http://p2:8080", "http://p3:8080"] + + proxy, next_idx = cyclic_rotation(proxies, 0) + assert proxy == "http://p1:8080" + assert next_idx == 1 + + proxy, next_idx = cyclic_rotation(proxies, 1) + assert proxy == "http://p2:8080" + assert next_idx == 2 + + proxy, next_idx = cyclic_rotation(proxies, 2) + assert proxy == "http://p3:8080" + assert next_idx == 0 # Wraps around + + def test_cyclic_rotation_wraps_index(self): + """Test that cyclic_rotation handles index overflow""" + proxies = ["http://p1:8080", "http://p2:8080"] + + # Index larger than list length should wrap + proxy, next_idx = cyclic_rotation(proxies, 5) + assert proxy == "http://p2:8080" # 5 % 2 = 1 + assert next_idx == 0 + + +class TestProxyRotatorCreation: + """Test ProxyRotator initialization and validation""" + + def test_create_with_string_proxies(self): + """Test creating rotator with string proxy URLs""" + proxies = ["http://p1:8080", "http://p2:8080"] + rotator = ProxyRotator(proxies) + + assert len(rotator) == 2 + assert rotator.proxies == proxies + + def test_create_with_dict_proxies(self): + """Test creating rotator with dict proxies""" + proxies = [ + {"server": "http://p1:8080", "username": "user1", "password": "pass1"}, + {"server": "http://p2:8080"}, + ] + rotator = ProxyRotator(proxies) + + assert len(rotator) == 2 + assert rotator.proxies == proxies + + def test_create_with_mixed_proxies(self): + """Test creating rotator with mixed string and dict proxies""" + proxies = [ + "http://p1:8080", + {"server": "http://p2:8080", "username": "user"}, + ] + rotator = ProxyRotator(proxies) + + assert len(rotator) == 2 + + def test_empty_proxies_raises_error(self): + """Test that empty proxy list raises ValueError""" + with pytest.raises(ValueError, match="At least one proxy must be provided"): + ProxyRotator([]) + + def test_dict_without_server_raises_error(self): + """Test that dict proxy without 'server' key raises 
ValueError""" + with pytest.raises(ValueError, match="Proxy dict must have a 'server' key"): + ProxyRotator([{"username": "user", "password": "pass"}]) + + def test_invalid_proxy_type_raises_error(self): + """Test that invalid proxy type raises TypeError""" + with pytest.raises(TypeError, match="Invalid proxy type"): + ProxyRotator([123]) + + with pytest.raises(TypeError, match="Invalid proxy type"): + ProxyRotator([None]) + + def test_non_callable_strategy_raises_error(self): + """Test that non-callable strategy raises TypeError""" + with pytest.raises(TypeError, match="strategy must be callable"): + ProxyRotator(["http://p1:8080"], strategy="cyclic_rotation") + + with pytest.raises(TypeError, match="strategy must be callable"): + ProxyRotator(["http://p1:8080"], strategy=123) + + +class TestProxyRotatorRotation: + """Test ProxyRotator rotation behavior""" + + def test_get_proxy_cyclic_rotation(self): + """Test that get_proxy cycles through proxies in order""" + proxies = ["http://p1:8080", "http://p2:8080", "http://p3:8080"] + rotator = ProxyRotator(proxies) + + # First cycle + assert rotator.get_proxy() == "http://p1:8080" + assert rotator.get_proxy() == "http://p2:8080" + assert rotator.get_proxy() == "http://p3:8080" + + # Second cycle - wraps around + assert rotator.get_proxy() == "http://p1:8080" + assert rotator.get_proxy() == "http://p2:8080" + assert rotator.get_proxy() == "http://p3:8080" + + def test_get_proxy_single_proxy(self): + """Test rotation with single proxy always returns the same proxy""" + rotator = ProxyRotator(["http://only:8080"]) + + for _ in range(5): + assert rotator.get_proxy() == "http://only:8080" + + def test_get_proxy_with_dict_proxies(self): + """Test rotation with dict proxies""" + proxies = [ + {"server": "http://p1:8080"}, + {"server": "http://p2:8080"}, + ] + rotator = ProxyRotator(proxies) + + assert rotator.get_proxy() == {"server": "http://p1:8080"} + assert rotator.get_proxy() == {"server": "http://p2:8080"} + assert 
rotator.get_proxy() == {"server": "http://p1:8080"} + + +class TestCustomStrategies: + """Test ProxyRotator with custom rotation strategies""" + + def test_random_strategy(self): + """Test custom random selection strategy""" + def random_strategy(proxies, idx): + return random.choice(proxies), idx + + proxies = ["http://p1:8080", "http://p2:8080", "http://p3:8080"] + rotator = ProxyRotator(proxies, strategy=random_strategy) + + # Get multiple proxies - they should all be valid + results = [rotator.get_proxy() for _ in range(10)] + assert all(p in proxies for p in results) + + def test_sticky_strategy(self): + """Test custom sticky strategy that always returns first proxy""" + def sticky_strategy(proxies, idx): + return proxies[0], idx + + rotator = ProxyRotator( + ["http://p1:8080", "http://p2:8080"], + strategy=sticky_strategy + ) + + for _ in range(5): + assert rotator.get_proxy() == "http://p1:8080" + + def test_weighted_strategy(self): + """Test custom weighted strategy""" + call_count = {"count": 0} + + def alternating_strategy(proxies, idx): + # Returns first proxy twice, then second proxy once + call_count["count"] += 1 + if call_count["count"] % 3 == 0: + return proxies[1], idx + return proxies[0], idx + + rotator = ProxyRotator( + ["http://primary:8080", "http://backup:8080"], + strategy=alternating_strategy + ) + + assert rotator.get_proxy() == "http://primary:8080" + assert rotator.get_proxy() == "http://primary:8080" + assert rotator.get_proxy() == "http://backup:8080" + + def test_lambda_strategy(self): + """Test using lambda as strategy""" + rotator = ProxyRotator( + ["http://p1:8080", "http://p2:8080"], + strategy=lambda proxies, idx: (proxies[-1], idx) # Always last + ) + + assert rotator.get_proxy() == "http://p2:8080" + assert rotator.get_proxy() == "http://p2:8080" + + +class TestProxyRotatorProperties: + """Test ProxyRotator properties and methods""" + + def test_proxies_property_returns_copy(self): + """Test that proxies property returns a 
copy, not the original list""" + original = ["http://p1:8080", "http://p2:8080"] + rotator = ProxyRotator(original) + + proxies_copy = rotator.proxies + proxies_copy.append("http://p3:8080") + + # Original should be unchanged + assert len(rotator) == 2 + assert len(rotator.proxies) == 2 + + def test_len_returns_proxy_count(self): + """Test __len__ returns correct count""" + assert len(ProxyRotator(["http://p1:8080"])) == 1 + assert len(ProxyRotator(["http://p1:8080", "http://p2:8080"])) == 2 + assert len(ProxyRotator(["a", "b", "c", "d", "e"])) == 5 + + def test_repr(self): + """Test __repr__ format""" + rotator = ProxyRotator(["http://p1:8080", "http://p2:8080", "http://p3:8080"]) + assert repr(rotator) == "ProxyRotator(proxies=3)" + + +class TestProxyRotatorThreadSafety: + """Test ProxyRotator thread safety""" + + def test_concurrent_get_proxy(self): + """Test that concurrent get_proxy calls don't cause errors""" + proxies = [f"http://p{i}:8080" for i in range(10)] + rotator = ProxyRotator(proxies) + results = [] + + def get_proxies(n): + for _ in range(n): + results.append(rotator.get_proxy()) + + threads = [Thread(target=get_proxies, args=(100,)) for _ in range(10)] + for t in threads: + t.start() + for t in threads: + t.join() + + # All results should be valid proxies + assert len(results) == 1000 + assert all(p in proxies for p in results) + + def test_thread_pool_concurrent_access(self): + """Test concurrent access using ThreadPoolExecutor""" + proxies = ["http://p1:8080", "http://p2:8080", "http://p3:8080"] + rotator = ProxyRotator(proxies) + + with ThreadPoolExecutor(max_workers=5) as executor: + futures = [executor.submit(rotator.get_proxy) for _ in range(100)] + results = [f.result() for f in futures] + + assert len(results) == 100 + assert all(p in proxies for p in results) + + +class TestIsProxyError: + """Test is_proxy_error utility function""" + + @pytest.mark.parametrize("error_msg", [ + "net::err_proxy_connection_failed", + 
"NET::ERR_PROXY_AUTH_FAILED", + "net::err_tunnel_connection_failed", + "Connection refused by proxy", + "Connection reset by peer", + "Connection timed out while connecting to proxy", + "Failed to connect to proxy server", + "Could not resolve proxy host", + ]) + def test_proxy_errors_detected(self, error_msg): + """Test that proxy-related errors are detected""" + assert is_proxy_error(Exception(error_msg)) is True + + @pytest.mark.parametrize("error_msg", [ + "Page not found", + "404 Not Found", + "Internal server error", + "DNS resolution failed", + "SSL certificate error", + "Timeout waiting for response", + "Invalid JSON response", + ]) + def test_non_proxy_errors_not_detected(self, error_msg): + """Test that non-proxy errors are not detected as proxy errors""" + assert is_proxy_error(Exception(error_msg)) is False + + def test_case_insensitive_detection(self): + """Test that error detection is case-insensitive""" + assert is_proxy_error(Exception("NET::ERR_PROXY")) is True + assert is_proxy_error(Exception("Net::Err_Proxy")) is True + assert is_proxy_error(Exception("CONNECTION REFUSED")) is True + + def test_empty_error_message(self): + """Test handling of empty error message""" + assert is_proxy_error(Exception("")) is False + + def test_custom_exception_types(self): + """Test with custom exception types""" + class CustomError(Exception): + pass + + assert is_proxy_error(CustomError("net::err_proxy_failed")) is True + assert is_proxy_error(CustomError("normal error")) is False diff --git a/tests/fetchers/test_utils.py b/tests/fetchers/test_utils.py index fea3f3ffa7d44269d211d93c2bd3911b0e1e8842..4637a63049d66bbce555f984161e98f5d020ac3e 100644 --- a/tests/fetchers/test_utils.py +++ b/tests/fetchers/test_utils.py @@ -4,7 +4,9 @@ from pathlib import Path from scrapling.engines.toolbelt.custom import StatusText, Response from scrapling.engines.toolbelt.navigation import ( construct_proxy_dict, - js_bypass_path + create_intercept_handler, + 
create_async_intercept_handler, + js_bypass_path, ) from scrapling.engines.toolbelt.fingerprints import ( generate_convincing_referer, @@ -300,3 +302,150 @@ class TestResponse: # Should handle 'bytes' content properly assert response.status == 200 + + +class _MockRequest: + """Minimal mock for Playwright's Request object.""" + def __init__(self, url: str, resource_type: str = "document"): + self.url = url + self.resource_type = resource_type + + +class _MockRoute: + """Minimal mock for Playwright's sync Route object.""" + def __init__(self, url: str, resource_type: str = "document"): + self.request = _MockRequest(url, resource_type) + self.aborted = False + self.continued = False + + def abort(self): + self.aborted = True + + def continue_(self): + self.continued = True + + +class _AsyncMockRoute: + """Minimal mock for Playwright's async Route object.""" + def __init__(self, url: str, resource_type: str = "document"): + self.request = _MockRequest(url, resource_type) + self.aborted = False + self.continued = False + + async def abort(self): + self.aborted = True + + async def continue_(self): + self.continued = True + + +class TestCreateInterceptHandler: + """Test the unified sync route handler factory.""" + + def test_blocks_disabled_resource_types(self): + handler = create_intercept_handler(disable_resources=True) + route = _MockRoute("https://example.com/image.png", resource_type="image") + handler(route) + assert route.aborted + + def test_continues_allowed_resource_types(self): + handler = create_intercept_handler(disable_resources=True) + route = _MockRoute("https://example.com/page", resource_type="document") + handler(route) + assert route.continued + + def test_blocks_exact_domain(self): + handler = create_intercept_handler(disable_resources=False, blocked_domains={"ads.example.com"}) + route = _MockRoute("https://ads.example.com/tracker.js") + handler(route) + assert route.aborted + + def test_blocks_subdomain(self): + handler = 
create_intercept_handler(disable_resources=False, blocked_domains={"example.com"}) + route = _MockRoute("https://sub.example.com/page") + handler(route) + assert route.aborted + + def test_continues_non_blocked_domain(self): + handler = create_intercept_handler(disable_resources=False, blocked_domains={"ads.example.com"}) + route = _MockRoute("https://safe.example.com/page") + handler(route) + assert route.continued + + def test_resource_blocking_takes_priority_over_domain(self): + """When both are active, resource type check comes first.""" + handler = create_intercept_handler(disable_resources=True, blocked_domains={"example.com"}) + route = _MockRoute("https://example.com/style.css", resource_type="stylesheet") + handler(route) + assert route.aborted + + def test_domain_blocking_with_resources_disabled(self): + """Non-blocked resource type from a blocked domain should still be aborted.""" + handler = create_intercept_handler(disable_resources=True, blocked_domains={"tracker.io"}) + route = _MockRoute("https://tracker.io/api", resource_type="document") + handler(route) + assert route.aborted + + def test_no_blocking_continues(self): + handler = create_intercept_handler(disable_resources=False) + route = _MockRoute("https://example.com/page") + handler(route) + assert route.continued + + def test_does_not_block_partial_domain_match(self): + """'example.com' should not block 'notexample.com'.""" + handler = create_intercept_handler(disable_resources=False, blocked_domains={"example.com"}) + route = _MockRoute("https://notexample.com/page") + handler(route) + assert route.continued + + def test_multiple_blocked_domains(self): + handler = create_intercept_handler(disable_resources=False, blocked_domains={"ads.com", "tracker.io"}) + route_ads = _MockRoute("https://ads.com/banner") + route_tracker = _MockRoute("https://cdn.tracker.io/script.js") + route_safe = _MockRoute("https://example.com/page") + handler(route_ads) + handler(route_tracker) + handler(route_safe) + 
assert route_ads.aborted + assert route_tracker.aborted + assert route_safe.continued + + +class TestCreateAsyncInterceptHandler: + """Test the unified async route handler factory.""" + + @pytest.mark.asyncio + async def test_blocks_disabled_resource_types(self): + handler = create_async_intercept_handler(disable_resources=True) + route = _AsyncMockRoute("https://example.com/font.woff", resource_type="font") + await handler(route) + assert route.aborted + + @pytest.mark.asyncio + async def test_blocks_domain(self): + handler = create_async_intercept_handler(disable_resources=False, blocked_domains={"ads.example.com"}) + route = _AsyncMockRoute("https://ads.example.com/track") + await handler(route) + assert route.aborted + + @pytest.mark.asyncio + async def test_continues_non_blocked(self): + handler = create_async_intercept_handler(disable_resources=False, blocked_domains={"ads.example.com"}) + route = _AsyncMockRoute("https://safe.example.com/page") + await handler(route) + assert route.continued + + @pytest.mark.asyncio + async def test_blocks_subdomain(self): + handler = create_async_intercept_handler(disable_resources=False, blocked_domains={"tracker.io"}) + route = _AsyncMockRoute("https://cdn.tracker.io/script.js") + await handler(route) + assert route.aborted + + @pytest.mark.asyncio + async def test_does_not_block_partial_domain_match(self): + handler = create_async_intercept_handler(disable_resources=False, blocked_domains={"example.com"}) + route = _AsyncMockRoute("https://notexample.com/page") + await handler(route) + assert route.continued diff --git a/tests/fetchers/test_validator.py b/tests/fetchers/test_validator.py index 209fae0b66b76ab763fb6598a600b1a4de62ab98..a682f9ef59d4aaf02420fddfe807137ea1179fc9 100644 --- a/tests/fetchers/test_validator.py +++ b/tests/fetchers/test_validator.py @@ -77,3 +77,25 @@ class TestValidators: config = validate(params, StealthConfig) assert config.timeout == 60000 # Should be increased + + def 
test_playwright_config_blocked_domains(self): + """Test PlaywrightConfig with blocked_domains""" + params = {"blocked_domains": {"ads.example.com", "tracker.io"}} + + config = validate(params, PlaywrightConfig) + + assert config.blocked_domains == {"ads.example.com", "tracker.io"} + + def test_playwright_config_blocked_domains_default_none(self): + """Test PlaywrightConfig blocked_domains defaults to None""" + config = validate({}, PlaywrightConfig) + + assert config.blocked_domains is None + + def test_stealth_config_blocked_domains(self): + """Test StealthConfig inherits blocked_domains""" + params = {"blocked_domains": {"ads.example.com"}} + + config = validate(params, StealthConfig) + + assert config.blocked_domains == {"ads.example.com"} diff --git a/tests/parser/test_general.py b/tests/parser/test_general.py index e5f8d5984ae6cba442a269d5aa0e87b7188e23c7..26f6e6c2cee3534e76aeb57c19007da501eadc46 100644 --- a/tests/parser/test_general.py +++ b/tests/parser/test_general.py @@ -147,7 +147,7 @@ class TestTextMatching: class TestSimilarElements: def test_finding_similar_products(self, page): """Test finding similar product elements""" - first_product = page.css_first(".product") + first_product = page.css(".product").first similar_products = first_product.find_similar() assert len(similar_products) == 2 @@ -170,10 +170,6 @@ class TestErrorHandling: with pytest.raises(ValueError): _ = Selector(adaptive=False) - # Invalid argument types - with pytest.raises(TypeError): - _ = Selector(root="ayo", adaptive=False) - with pytest.raises(TypeError): _ = Selector(content=1, adaptive=False) @@ -255,7 +251,7 @@ class TestElementNavigation: class TestJSONAndAttributes: def test_json_conversion(self, page): """Test converting content to JSON""" - script_content = page.css("#page-data::text")[0] + script_content = page.css("#page-data::text")[0].get() assert issubclass(type(script_content.sort()), str) page_data = script_content.json() assert page_data["totalProducts"] == 3 @@ 
-282,7 +278,7 @@ class TestJSONAndAttributes: assert list(key_value[0].keys()) == ["data-id"] # JSON attribute conversion - attr_json = page.css_first("#products").attrib["schema"].json() + attr_json = page.css("#products").first.attrib["schema"].json() assert attr_json == {"jsonable": "data"} assert isinstance(page.css("#products")[0].attrib.json_string, bytes) diff --git a/tests/parser/test_html_utils.py b/tests/parser/test_html_utils.py deleted file mode 100644 index 924ef95ccbc34f64fb2df8d9630ec995fc346553..0000000000000000000000000000000000000000 --- a/tests/parser/test_html_utils.py +++ /dev/null @@ -1,194 +0,0 @@ -import pytest - -from scrapling.core._html_utils import to_unicode, _replace_entities, name2codepoint - - -class TestToUnicode: - def test_string_input(self): - """Test to_unicode with string input""" - text = "hello world" - assert to_unicode(text) == "hello world" - - def test_bytes_input_default_encoding(self): - """Test to_unicode with `bytes` input using default UTF-8""" - text = b"hello world" - assert to_unicode(text) == "hello world" - - def test_bytes_input_custom_encoding(self): - """Test to_unicode with custom encoding""" - text = "café".encode('latin-1') - assert to_unicode(text, encoding='latin-1') == "café" - - def test_bytes_input_with_errors(self): - """Test to_unicode with error handling""" - # Invalid UTF-8 bytes - text = b'\xff\xfe' - assert to_unicode(text, errors='ignore') == "" - assert to_unicode(text, errors='replace') == "��" - - def test_invalid_input_type(self): - """Test to_unicode with an invalid input type""" - with pytest.raises(TypeError, match="to_unicode must receive bytes or str"): - to_unicode(123) - - def test_none_encoding_defaults_to_utf8(self): - """Test that None encoding defaults to UTF-8""" - text = "café".encode('utf-8') - assert to_unicode(text, encoding=None) == "café" - - -class TestReplaceEntities: - def test_named_entities(self): - """Test replacement of named HTML entities""" - text = "& < > "  " - 
result = _replace_entities(text) - assert result == "& < > \" \xa0" - - def test_decimal_entities(self): - """Test replacement of decimal numeric entities""" - text = "& < >" - result = _replace_entities(text) - assert result == "& < >" - - def test_hexadecimal_entities(self): - """Test replacement of hexadecimal numeric entities""" - text = "& < >" - result = _replace_entities(text) - assert result == "& < >" - - def test_mixed_entities(self): - """Test replacement of mixed entity types""" - text = "Price: £100 €50 $25" - result = _replace_entities(text) - assert result == "Price: £100 €50 $25" - - def test_keep_entities(self): - """Test keeping specific entities""" - text = "& < >" - result = _replace_entities(text, keep=['amp', 'lt']) - assert result == "& < >" - - def test_windows_1252_range(self): - """Test handling of Windows-1252 range characters""" - text = "€ ‚ Ÿ" # Windows-1252 range - result = _replace_entities(text) - # These should be decoded using cp1252 - assert "€" in result # 128 -> Euro sign - - def test_remove_illegal_entities_true(self): - """Test removing illegal entities with remove_illegal=True""" - text = "&unknown; 󴈿" - result = _replace_entities(text, remove_illegal=True) - # The function may convert large numbers to Unicode characters or leave them as-is - assert "&unknown;" not in result # Unknown entities should be removed or converted - - def test_remove_illegal_entities_false(self): - """Test keeping illegal entities with remove_illegal=False""" - text = "&unknown; 󴈿" - result = _replace_entities(text, remove_illegal=False) - # Unknown entities should be preserved when remove_illegal=False - assert "&unknown;" in result - # Large numeric entities may be converted to Unicode characters - - def test_bytes_input(self): - """Test with bytes input""" - text = b"& < >" - result = _replace_entities(text) - assert result == "& < >" - - def test_custom_encoding(self): - """Test with custom encoding""" - text = "é".encode('latin-1') - result = 
_replace_entities(text, encoding='latin-1') - assert result == "é" - - def test_entities_without_semicolon(self): - """Test entities without semicolon""" - text = "& < >" - result = _replace_entities(text, remove_illegal=True) - # Should handle entities without a semicolon - assert len(result) <= len(text) - - def test_case_insensitive_named_entities(self): - """Test case-insensitive named-entity handling""" - text = "& ≪ >" - result = _replace_entities(text) - assert result == "& < >" - - def test_edge_cases(self): - """Test edge cases""" - # Empty string - assert _replace_entities("") == "" - - # No entities - assert _replace_entities("plain text") == "plain text" - - # Invalid numeric entity - text = "&#-1;" - result = _replace_entities(text, remove_illegal=True) - # Invalid entities may be left as-is or removed depending on implementation - assert len(result) >= 0 # Ensure no exception is raised - - -class TestName2Codepoint: - def test_common_entities_exist(self): - """Test that common HTML entities exist in mapping""" - common_entities = ['amp', 'lt', 'gt', 'quot', 'nbsp', 'copy', 'reg'] - for entity in common_entities: - assert entity in name2codepoint - - def test_greek_letters_exist(self): - """Test that Greek letter entities exist""" - greek_letters = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] - for letter in greek_letters: - assert letter in name2codepoint - - def test_mathematical_symbols_exist(self): - """Test that mathematical symbol entities exist""" - math_symbols = ['sum', 'prod', 'int', 'infin', 'plusmn'] - for symbol in math_symbols: - assert symbol in name2codepoint - - def test_currency_symbols_exist(self): - """Test that currency symbol entities exist""" - currencies = ['pound', 'yen', 'euro', 'cent'] - for currency in currencies: - assert currency in name2codepoint - - def test_codepoint_values(self): - """Test specific codepoint values""" - assert name2codepoint['amp'] == 0x0026 # & - assert name2codepoint['lt'] == 0x003C # < - assert 
name2codepoint['gt'] == 0x003E # > - assert name2codepoint['nbsp'] == 0x00A0 # non-breaking space - assert name2codepoint['copy'] == 0x00A9 # © - - -class TestIntegration: - def test_real_world_html(self): - """Test with real-world HTML content""" - html = """ - <div class="content"> - © 2024 Company & Associates - Price: £99.99 (€89.99) - Math: α + β = γ - </div> - """ - result = _replace_entities(html) - - assert '
' in result - assert '© 2024 Company & Associates' in result - assert 'Price: £99.99 (€89.99)' in result - assert 'Math: α + β = γ' in result - - def test_performance_with_large_text(self): - """Test performance with large text containing many entities""" - # Create large text with repeated entities - text = ("& < > " " * 1000) - result = _replace_entities(text) - - # Should complete without issues and have correct content - assert result.count("&") == 1000 - assert result.count("<") == 1000 - assert result.count(">") == 1000 - assert result.count('"') == 1000 diff --git a/tests/parser/test_parser_advanced.py b/tests/parser/test_parser_advanced.py index 3ac81cfe03ec6038a05b35c7ba9719157fe095ac..57086ba89dca7eb553dde37aa520091c755e69e9 100644 --- a/tests/parser/test_parser_advanced.py +++ b/tests/parser/test_parser_advanced.py @@ -121,11 +121,12 @@ class TestAdvancedSelectors: # ::text pseudo-element texts = page.css("p::text") assert len(texts) == 2 - assert isinstance(texts[0], TextHandler) + assert isinstance(texts[0], Selector) + assert isinstance(texts[0].get(), TextHandler) # ::attr() pseudo-element attrs = page.css("div::attr(class)") - assert "container" in attrs + assert "container" in attrs.getall() def test_complex_attribute_operations(self, complex_html): """Test complex attribute handling""" diff --git a/tests/spiders/__init__.py b/tests/spiders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/spiders/test_checkpoint.py b/tests/spiders/test_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..fc35f43595cf130370301640f776ef62f6bc9442 --- /dev/null +++ b/tests/spiders/test_checkpoint.py @@ -0,0 +1,341 @@ +"""Tests for the CheckpointManager and CheckpointData classes.""" + +import pickle +import tempfile +from pathlib import Path + +import pytest +import anyio + +from scrapling.spiders.request import Request +from 
scrapling.spiders.checkpoint import CheckpointData, CheckpointManager + + +class TestCheckpointData: + """Test CheckpointData dataclass.""" + + def test_default_values(self): + """Test CheckpointData with default values.""" + data = CheckpointData() + + assert data.requests == [] + assert data.seen == set() + + def test_with_requests_and_seen(self): + """Test CheckpointData with requests and seen URLs.""" + requests = [ + Request("https://example.com/1", priority=10), + Request("https://example.com/2", priority=5), + ] + seen = {"url1", "url2", "url3"} + + data = CheckpointData(requests=requests, seen=seen) + + assert len(data.requests) == 2 + assert data.requests[0].url == "https://example.com/1" + assert data.seen == {"url1", "url2", "url3"} + + def test_pickle_roundtrip(self): + """Test that CheckpointData can be pickled and unpickled.""" + requests = [Request("https://example.com", priority=5)] + seen = {"fingerprint1", "fingerprint2"} + data = CheckpointData(requests=requests, seen=seen) + + pickled = pickle.dumps(data) + restored = pickle.loads(pickled) + + assert len(restored.requests) == 1 + assert restored.requests[0].url == "https://example.com" + assert restored.seen == {"fingerprint1", "fingerprint2"} + + +class TestCheckpointManagerInit: + """Test CheckpointManager initialization.""" + + def test_init_with_string_path(self): + """Test initialization with string path.""" + manager = CheckpointManager("/tmp/test_crawl") + + assert str(manager.crawldir) == "/tmp/test_crawl" + assert manager.interval == 300.0 + + def test_init_with_pathlib_path(self): + """Test initialization with pathlib.Path.""" + path = Path("/tmp/test_crawl") + manager = CheckpointManager(path) + + assert str(manager.crawldir) == "/tmp/test_crawl" + + def test_init_with_custom_interval(self): + """Test initialization with custom interval.""" + manager = CheckpointManager("/tmp/test", interval=60.0) + assert manager.interval == 60.0 + + def test_init_with_zero_interval(self): + """Test 
initialization with zero interval (disable periodic checkpoints).""" + manager = CheckpointManager("/tmp/test", interval=0) + assert manager.interval == 0 + + def test_init_with_negative_interval_raises(self): + """Test that negative interval raises ValueError.""" + with pytest.raises(ValueError, match="greater than 0"): + CheckpointManager("/tmp/test", interval=-1) + + def test_init_with_invalid_interval_type_raises(self): + """Test that invalid interval type raises TypeError.""" + with pytest.raises(TypeError, match="integer or float"): + CheckpointManager("/tmp/test", interval="invalid") # type: ignore + + def test_checkpoint_file_path(self): + """Test that checkpoint file path is correctly constructed.""" + manager = CheckpointManager("/tmp/test_crawl") + + expected_path = "/tmp/test_crawl/checkpoint.pkl" + assert str(manager._checkpoint_path) == expected_path + + +class TestCheckpointManagerOperations: + """Test CheckpointManager save/load/cleanup operations.""" + + @pytest.fixture + def temp_dir(self): + """Create a temporary directory for testing.""" + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.mark.asyncio + async def test_has_checkpoint_false_when_no_file(self, temp_dir: Path): + """Test has_checkpoint returns False when no checkpoint exists.""" + manager = CheckpointManager(temp_dir / "crawl") + + result = await manager.has_checkpoint() + + assert result is False + + @pytest.mark.asyncio + async def test_save_creates_checkpoint_file(self, temp_dir: Path): + """Test that save creates the checkpoint file.""" + crawl_dir = temp_dir / "crawl" + manager = CheckpointManager(crawl_dir) + + data = CheckpointData( + requests=[Request("https://example.com")], + seen={"fp1", "fp2"}, + ) + + await manager.save(data) + + checkpoint_path = crawl_dir / "checkpoint.pkl" + assert checkpoint_path.exists() + + @pytest.mark.asyncio + async def test_save_creates_directory_if_not_exists(self, temp_dir: Path): + """Test that save creates the 
directory if it doesn't exist.""" + crawl_dir = temp_dir / "nested" / "crawl" / "dir" + manager = CheckpointManager(crawl_dir) + + data = CheckpointData() + await manager.save(data) + + assert crawl_dir.exists() + + @pytest.mark.asyncio + async def test_has_checkpoint_true_after_save(self, temp_dir: Path): + """Test has_checkpoint returns True after saving.""" + manager = CheckpointManager(temp_dir / "crawl") + + data = CheckpointData() + await manager.save(data) + + result = await manager.has_checkpoint() + assert result is True + + @pytest.mark.asyncio + async def test_load_returns_none_when_no_checkpoint(self, temp_dir: Path): + """Test load returns None when no checkpoint exists.""" + manager = CheckpointManager(temp_dir / "crawl") + + result = await manager.load() + + assert result is None + + @pytest.mark.asyncio + async def test_save_and_load_roundtrip(self, temp_dir: Path): + """Test saving and loading checkpoint data.""" + manager = CheckpointManager(temp_dir / "crawl") + + original_data = CheckpointData( + requests=[ + Request("https://example.com/1", priority=10), + Request("https://example.com/2", priority=5), + ], + seen={"fp1", "fp2", "fp3"}, + ) + + await manager.save(original_data) + loaded_data = await manager.load() + + assert loaded_data is not None + assert len(loaded_data.requests) == 2 + assert loaded_data.requests[0].url == "https://example.com/1" + assert loaded_data.requests[0].priority == 10 + assert loaded_data.seen == {"fp1", "fp2", "fp3"} + + @pytest.mark.asyncio + async def test_save_is_atomic(self, temp_dir: Path): + """Test that save uses atomic write (temp file + rename).""" + crawl_dir = temp_dir / "crawl" + manager = CheckpointManager(crawl_dir) + + data = CheckpointData(requests=[Request("https://example.com")]) + await manager.save(data) + + # Temp file should not exist after successful save + temp_path = crawl_dir / "checkpoint.tmp" + assert not temp_path.exists() + + # Checkpoint file should exist + checkpoint_path = crawl_dir 
/ "checkpoint.pkl" + assert checkpoint_path.exists() + + @pytest.mark.asyncio + async def test_cleanup_removes_checkpoint_file(self, temp_dir: Path): + """Test that cleanup removes the checkpoint file.""" + crawl_dir = temp_dir / "crawl" + manager = CheckpointManager(crawl_dir) + + # Save a checkpoint first + data = CheckpointData() + await manager.save(data) + + checkpoint_path = crawl_dir / "checkpoint.pkl" + assert checkpoint_path.exists() + + # Cleanup should remove it + await manager.cleanup() + + assert not checkpoint_path.exists() + + @pytest.mark.asyncio + async def test_cleanup_no_error_when_no_file(self, temp_dir: Path): + """Test that cleanup doesn't raise error when no file exists.""" + manager = CheckpointManager(temp_dir / "crawl") + + # Should not raise + await manager.cleanup() + + @pytest.mark.asyncio + async def test_load_returns_none_on_corrupt_file(self, temp_dir: Path): + """Test load returns None when checkpoint file is corrupt.""" + crawl_dir = temp_dir / "crawl" + crawl_dir.mkdir(parents=True) + + checkpoint_path = crawl_dir / "checkpoint.pkl" + checkpoint_path.write_bytes(b"not valid pickle data") + + manager = CheckpointManager(crawl_dir) + + result = await manager.load() + + assert result is None + + @pytest.mark.asyncio + async def test_multiple_saves_overwrite(self, temp_dir: Path): + """Test that multiple saves overwrite the checkpoint.""" + manager = CheckpointManager(temp_dir / "crawl") + + # First save + data1 = CheckpointData( + requests=[Request("https://example.com/1")], + seen={"fp1"}, + ) + await manager.save(data1) + + # Second save + data2 = CheckpointData( + requests=[Request("https://example.com/2"), Request("https://example.com/3")], + seen={"fp2", "fp3"}, + ) + await manager.save(data2) + + # Load should return the second save + loaded = await manager.load() + + assert loaded is not None + assert len(loaded.requests) == 2 + assert loaded.requests[0].url == "https://example.com/2" + assert loaded.seen == {"fp2", "fp3"} + + 
+class TestCheckpointManagerEdgeCases: + """Test edge cases for CheckpointManager.""" + + @pytest.fixture + def temp_dir(self): + """Create a temporary directory for testing.""" + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + @pytest.mark.asyncio + async def test_save_empty_checkpoint(self, temp_dir: Path): + """Test saving empty checkpoint data.""" + manager = CheckpointManager(temp_dir / "crawl") + + data = CheckpointData(requests=[], seen=set()) + await manager.save(data) + + loaded = await manager.load() + + assert loaded is not None + assert loaded.requests == [] + assert loaded.seen == set() + + @pytest.mark.asyncio + async def test_save_large_checkpoint(self, temp_dir: Path): + """Test saving checkpoint with many requests.""" + manager = CheckpointManager(temp_dir / "crawl") + + # Create 1000 requests + requests = [ + Request(f"https://example.com/{i}", priority=i % 10) + for i in range(1000) + ] + seen = {f"fp_{i}" for i in range(2000)} + + data = CheckpointData(requests=requests, seen=seen) + await manager.save(data) + + loaded = await manager.load() + + assert loaded is not None + assert len(loaded.requests) == 1000 + assert len(loaded.seen) == 2000 + + @pytest.mark.asyncio + async def test_requests_preserve_metadata(self, temp_dir: Path): + """Test that request metadata is preserved through checkpoint.""" + manager = CheckpointManager(temp_dir / "crawl") + + original_request = Request( + url="https://example.com", + sid="my_session", + priority=42, + dont_filter=True, + meta={"item_id": 123, "page": 5}, + proxy="http://proxy:8080", + ) + + data = CheckpointData(requests=[original_request], seen=set()) + await manager.save(data) + + loaded = await manager.load() + + assert loaded is not None + restored = loaded.requests[0] + + assert restored.url == "https://example.com" + assert restored.sid == "my_session" + assert restored.priority == 42 + assert restored.dont_filter is True + assert restored.meta == {"item_id": 123, "page": 5} 
+ assert restored._session_kwargs == {"proxy": "http://proxy:8080"} diff --git a/tests/spiders/test_engine.py b/tests/spiders/test_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..ca768bfb780e063d5019ea5a539bc65eafd001db --- /dev/null +++ b/tests/spiders/test_engine.py @@ -0,0 +1,915 @@ +"""Tests for the CrawlerEngine class.""" + +import tempfile +from pathlib import Path + +import anyio +import pytest + +from scrapling.spiders.engine import CrawlerEngine, _dump +from scrapling.spiders.request import Request +from scrapling.spiders.session import SessionManager +from scrapling.spiders.result import CrawlStats, ItemList +from scrapling.spiders.checkpoint import CheckpointData +from scrapling.core._types import Any, Dict, Set, AsyncGenerator + + +# --------------------------------------------------------------------------- +# Mock helpers +# --------------------------------------------------------------------------- + + +class MockResponse: + """Minimal Response stand-in.""" + + def __init__(self, status: int = 200, body: bytes = b"ok", url: str = "https://example.com"): + self.status = status + self.body = body + self.url = url + self.request: Any = None + self.meta: Dict[str, Any] = {} + + def __str__(self) -> str: + return self.url + + +class MockSession: + """Mock session that returns a canned response.""" + + def __init__(self, name: str = "mock", response: MockResponse | None = None): + self.name = name + self._is_alive = False + self._response = response or MockResponse() + self.fetch_calls: list[dict] = [] + + async def __aenter__(self): + self._is_alive = True + return self + + async def __aexit__(self, *args): + self._is_alive = False + + async def fetch(self, url: str, **kwargs): + self.fetch_calls.append({"url": url, **kwargs}) + resp = MockResponse(status=self._response.status, body=self._response.body, url=url) + return resp + + +class ErrorSession(MockSession): + """Session that raises on fetch.""" + + def __init__(self, 
error: Exception | None = None): + super().__init__("error") + self._error = error or RuntimeError("fetch failed") + + async def fetch(self, url: str, **kwargs): + raise self._error + + +class MockSpider: + """Lightweight spider stub for engine tests.""" + + def __init__( + self, + *, + concurrent_requests: int = 4, + concurrent_requests_per_domain: int = 0, + download_delay: float = 0.0, + max_blocked_retries: int = 3, + allowed_domains: Set[str] | None = None, + fp_include_kwargs: bool = False, + fp_include_headers: bool = False, + fp_keep_fragments: bool = False, + is_blocked_fn=None, + on_scraped_item_fn=None, + retry_blocked_request_fn=None, + ): + self.concurrent_requests = concurrent_requests + self.concurrent_requests_per_domain = concurrent_requests_per_domain + self.download_delay = download_delay + self.max_blocked_retries = max_blocked_retries + self.allowed_domains = allowed_domains or set() + self.fp_include_kwargs = fp_include_kwargs + self.fp_include_headers = fp_include_headers + self.fp_keep_fragments = fp_keep_fragments + self.name = "test_spider" + + # Tracking lists + self.on_start_calls: list[dict] = [] + self.on_close_calls: int = 0 + self.on_error_calls: list[tuple[Request, Exception]] = [] + self.scraped_items: list[dict] = [] + self.blocked_responses: list = [] + self.retry_requests: list = [] + + # Pluggable behaviour + self._is_blocked_fn = is_blocked_fn + self._on_scraped_item_fn = on_scraped_item_fn + self._retry_blocked_request_fn = retry_blocked_request_fn + + # Log counter stub + self._log_counter = _LogCounterStub() + + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield {"url": str(response)} + + async def on_start(self, resuming: bool = False) -> None: + self.on_start_calls.append({"resuming": resuming}) + + async def on_close(self) -> None: + self.on_close_calls += 1 + + async def on_error(self, request: Request, error: Exception) -> None: + self.on_error_calls.append((request, 
error)) + + async def on_scraped_item(self, item: Dict[str, Any]) -> Dict[str, Any] | None: + if self._on_scraped_item_fn: + return self._on_scraped_item_fn(item) + self.scraped_items.append(item) + return item + + async def is_blocked(self, response) -> bool: + if self._is_blocked_fn: + return self._is_blocked_fn(response) + return False + + async def retry_blocked_request(self, request: Request, response) -> Request: + self.retry_requests.append(request) + if self._retry_blocked_request_fn: + return self._retry_blocked_request_fn(request, response) + return request + + async def start_requests(self) -> AsyncGenerator[Request, None]: + yield Request("https://example.com", sid="default") + + +class _LogCounterStub: + """Stub for LogCounterHandler.""" + + def get_counts(self) -> Dict[str, int]: + return {"debug": 0, "info": 0, "warning": 0, "error": 0, "critical": 0} + + +def _make_engine( + spider: MockSpider | None = None, + session: MockSession | None = None, + crawldir: str | None = None, + interval: float = 300.0, +) -> CrawlerEngine: + """Create a CrawlerEngine wired to mock objects.""" + spider = spider or MockSpider() + sm = SessionManager() + sm.add("default", session or MockSession()) + return CrawlerEngine(spider, sm, crawldir=crawldir, interval=interval) + + +# --------------------------------------------------------------------------- +# Tests: _dump helper +# --------------------------------------------------------------------------- + + +class TestDumpHelper: + def test_dump_returns_json_string(self): + result = _dump({"key": "value"}) + assert '"key": "value"' in result + + def test_dump_handles_nested(self): + result = _dump({"a": {"b": 1}}) + assert '"a"' in result + assert '"b"' in result + + +# --------------------------------------------------------------------------- +# Tests: __init__ +# --------------------------------------------------------------------------- + + +class TestCrawlerEngineInit: + def test_default_initialisation(self): + 
engine = _make_engine() + + assert engine._running is False + assert engine._active_tasks == 0 + assert engine._pause_requested is False + assert engine._force_stop is False + assert engine.paused is False + assert isinstance(engine.stats, CrawlStats) + assert isinstance(engine.items, ItemList) + + def test_checkpoint_system_disabled_by_default(self): + engine = _make_engine() + assert engine._checkpoint_system_enabled is False + + def test_checkpoint_system_enabled_with_crawldir(self): + with tempfile.TemporaryDirectory() as tmpdir: + engine = _make_engine(crawldir=tmpdir) + assert engine._checkpoint_system_enabled is True + + def test_global_limiter_uses_concurrent_requests(self): + spider = MockSpider(concurrent_requests=8) + engine = _make_engine(spider=spider) + assert engine._global_limiter.total_tokens == 8 + + def test_allowed_domains_from_spider(self): + spider = MockSpider(allowed_domains={"example.com", "test.org"}) + engine = _make_engine(spider=spider) + assert engine._allowed_domains == {"example.com", "test.org"} + + +# --------------------------------------------------------------------------- +# Tests: _is_domain_allowed +# --------------------------------------------------------------------------- + + +class TestIsDomainAllowed: + def test_all_allowed_when_empty(self): + engine = _make_engine() + request = Request("https://anything.com/page") + assert engine._is_domain_allowed(request) is True + + def test_exact_domain_match(self): + spider = MockSpider(allowed_domains={"example.com"}) + engine = _make_engine(spider=spider) + + assert engine._is_domain_allowed(Request("https://example.com/page")) is True + assert engine._is_domain_allowed(Request("https://other.com/page")) is False + + def test_subdomain_match(self): + spider = MockSpider(allowed_domains={"example.com"}) + engine = _make_engine(spider=spider) + + assert engine._is_domain_allowed(Request("https://sub.example.com/page")) is True + assert 
engine._is_domain_allowed(Request("https://deep.sub.example.com/x")) is True + + def test_partial_name_not_matched(self): + spider = MockSpider(allowed_domains={"example.com"}) + engine = _make_engine(spider=spider) + + # "notexample.com" should NOT match "example.com" + assert engine._is_domain_allowed(Request("https://notexample.com/x")) is False + + def test_multiple_allowed_domains(self): + spider = MockSpider(allowed_domains={"a.com", "b.org"}) + engine = _make_engine(spider=spider) + + assert engine._is_domain_allowed(Request("https://a.com/")) is True + assert engine._is_domain_allowed(Request("https://b.org/")) is True + assert engine._is_domain_allowed(Request("https://c.net/")) is False + + +# --------------------------------------------------------------------------- +# Tests: _rate_limiter +# --------------------------------------------------------------------------- + + +class TestRateLimiter: + def test_returns_global_limiter_when_per_domain_disabled(self): + engine = _make_engine() # concurrent_requests_per_domain=0 + limiter = engine._rate_limiter("example.com") + assert limiter is engine._global_limiter + + def test_returns_per_domain_limiter_when_enabled(self): + spider = MockSpider(concurrent_requests_per_domain=2) + engine = _make_engine(spider=spider) + + limiter = engine._rate_limiter("example.com") + assert limiter is not engine._global_limiter + assert limiter.total_tokens == 2 + + def test_same_domain_returns_same_limiter(self): + spider = MockSpider(concurrent_requests_per_domain=2) + engine = _make_engine(spider=spider) + + l1 = engine._rate_limiter("example.com") + l2 = engine._rate_limiter("example.com") + assert l1 is l2 + + def test_different_domains_get_different_limiters(self): + spider = MockSpider(concurrent_requests_per_domain=2) + engine = _make_engine(spider=spider) + + l1 = engine._rate_limiter("a.com") + l2 = engine._rate_limiter("b.com") + assert l1 is not l2 + + +# 
--------------------------------------------------------------------------- +# Tests: _normalize_request +# --------------------------------------------------------------------------- + + +class TestNormalizeRequest: + def test_sets_default_sid_when_empty(self): + engine = _make_engine() + request = Request("https://example.com") + assert request.sid == "" + + engine._normalize_request(request) + assert request.sid == "default" + + def test_preserves_existing_sid(self): + engine = _make_engine() + request = Request("https://example.com", sid="custom") + + engine._normalize_request(request) + assert request.sid == "custom" + + +# --------------------------------------------------------------------------- +# Tests: _process_request +# --------------------------------------------------------------------------- + + +class TestProcessRequest: + @pytest.mark.asyncio + async def test_successful_fetch_updates_stats(self): + spider = MockSpider() + session = MockSession(response=MockResponse(status=200, body=b"hello")) + engine = _make_engine(spider=spider, session=session) + + request = Request("https://example.com", sid="default") + await engine._process_request(request) + + assert engine.stats.requests_count == 1 + assert engine.stats.response_bytes == 5 # len(b"hello") from MockSession + assert "status_200" in engine.stats.response_status_count + + @pytest.mark.asyncio + async def test_failed_fetch_increments_failed_count(self): + spider = MockSpider() + sm = SessionManager() + sm.add("default", ErrorSession()) + engine = CrawlerEngine(spider, sm) + + request = Request("https://example.com", sid="default") + await engine._process_request(request) + + assert engine.stats.failed_requests_count == 1 + assert len(spider.on_error_calls) == 1 + + @pytest.mark.asyncio + async def test_failed_fetch_does_not_increment_requests_count(self): + spider = MockSpider() + sm = SessionManager() + sm.add("default", ErrorSession()) + engine = CrawlerEngine(spider, sm) + + request = 
Request("https://example.com", sid="default") + await engine._process_request(request) + + assert engine.stats.requests_count == 0 + + @pytest.mark.asyncio + async def test_blocked_response_triggers_retry(self): + spider = MockSpider(is_blocked_fn=lambda r: True, max_blocked_retries=2) + engine = _make_engine(spider=spider) + + request = Request("https://example.com", sid="default") + await engine._process_request(request) + + assert engine.stats.blocked_requests_count == 1 + # A retry request should be enqueued + assert not engine.scheduler.is_empty + + @pytest.mark.asyncio + async def test_blocked_response_max_retries_exceeded(self): + spider = MockSpider(is_blocked_fn=lambda r: True, max_blocked_retries=2) + engine = _make_engine(spider=spider) + + request = Request("https://example.com", sid="default") + request._retry_count = 2 # Already at max + await engine._process_request(request) + + assert engine.stats.blocked_requests_count == 1 + # No retry enqueued + assert engine.scheduler.is_empty + + @pytest.mark.asyncio + async def test_retry_request_has_dont_filter(self): + spider = MockSpider(is_blocked_fn=lambda r: True, max_blocked_retries=3) + engine = _make_engine(spider=spider) + + request = Request("https://example.com", sid="default") + await engine._process_request(request) + + retry = await engine.scheduler.dequeue() + assert retry.dont_filter is True + assert retry._retry_count == 1 + + @pytest.mark.asyncio + async def test_retry_clears_proxy_kwargs(self): + spider = MockSpider(is_blocked_fn=lambda r: True, max_blocked_retries=3) + engine = _make_engine(spider=spider) + + request = Request("https://example.com", sid="default", proxy="http://proxy:8080") + await engine._process_request(request) + + retry = await engine.scheduler.dequeue() + assert "proxy" not in retry._session_kwargs + assert "proxies" not in retry._session_kwargs + + @pytest.mark.asyncio + async def test_callback_yielding_dict_increments_items(self): + spider = MockSpider() + engine = 
_make_engine(spider=spider) + + request = Request("https://example.com", sid="default") + await engine._process_request(request) + + assert engine.stats.items_scraped == 1 + assert len(engine.items) == 1 + + @pytest.mark.asyncio + async def test_callback_yielding_request_enqueues(self): + async def callback(response) -> AsyncGenerator: + yield Request("https://example.com/page2", sid="default") + + spider = MockSpider() + engine = _make_engine(spider=spider) + + request = Request("https://example.com", sid="default", callback=callback) + await engine._process_request(request) + + assert not engine.scheduler.is_empty + + @pytest.mark.asyncio + async def test_callback_yielding_offsite_request_filtered(self): + async def callback(response) -> AsyncGenerator: + yield Request("https://other.com/page", sid="default") + + spider = MockSpider(allowed_domains={"example.com"}) + engine = _make_engine(spider=spider) + + request = Request("https://example.com", sid="default", callback=callback) + await engine._process_request(request) + + assert engine.stats.offsite_requests_count == 1 + assert engine.scheduler.is_empty + + @pytest.mark.asyncio + async def test_dropped_item_when_on_scraped_item_returns_none(self): + spider = MockSpider(on_scraped_item_fn=lambda item: None) + engine = _make_engine(spider=spider) + + request = Request("https://example.com", sid="default") + await engine._process_request(request) + + assert engine.stats.items_dropped == 1 + assert engine.stats.items_scraped == 0 + assert len(engine.items) == 0 + + @pytest.mark.asyncio + async def test_callback_exception_calls_on_error(self): + async def bad_callback(response) -> AsyncGenerator: + raise ValueError("callback boom") + yield # noqa: unreachable + + spider = MockSpider() + engine = _make_engine(spider=spider) + + request = Request("https://example.com", sid="default", callback=bad_callback) + await engine._process_request(request) + + assert len(spider.on_error_calls) == 1 + assert 
isinstance(spider.on_error_calls[0][1], ValueError) + + @pytest.mark.asyncio + async def test_proxy_tracked_in_stats(self): + spider = MockSpider() + engine = _make_engine(spider=spider) + + request = Request("https://example.com", sid="default", proxy="http://p:8080") + await engine._process_request(request) + + assert "http://p:8080" in engine.stats.proxies + + @pytest.mark.asyncio + async def test_proxies_dict_tracked_in_stats(self): + spider = MockSpider() + engine = _make_engine(spider=spider) + + proxies = {"http": "http://p:8080", "https": "https://p:8443"} + request = Request("https://example.com", sid="default", proxies=proxies) + await engine._process_request(request) + + assert len(engine.stats.proxies) == 1 + assert engine.stats.proxies[0] == proxies + + @pytest.mark.asyncio + async def test_uses_parse_when_no_callback(self): + items_seen = [] + + async def custom_parse(response) -> AsyncGenerator: + yield {"from": "custom_parse"} + + spider = MockSpider() + spider.parse = custom_parse # type: ignore[assignment] + engine = _make_engine(spider=spider) + + request = Request("https://example.com", sid="default") + # No callback set → should use spider.parse + await engine._process_request(request) + + assert engine.stats.items_scraped == 1 + + +# --------------------------------------------------------------------------- +# Tests: _task_wrapper +# --------------------------------------------------------------------------- + + +class TestTaskWrapper: + @pytest.mark.asyncio + async def test_decrements_active_tasks(self): + engine = _make_engine() + engine._active_tasks = 1 + + request = Request("https://example.com", sid="default") + await engine._task_wrapper(request) + + assert engine._active_tasks == 0 + + @pytest.mark.asyncio + async def test_decrements_even_on_error(self): + spider = MockSpider() + sm = SessionManager() + sm.add("default", ErrorSession()) + engine = CrawlerEngine(spider, sm) + engine._active_tasks = 1 + + request = 
Request("https://example.com", sid="default") + await engine._task_wrapper(request) + + assert engine._active_tasks == 0 + + +# --------------------------------------------------------------------------- +# Tests: request_pause +# --------------------------------------------------------------------------- + + +class TestRequestPause: + def test_first_call_sets_pause_requested(self): + engine = _make_engine() + + engine.request_pause() + + assert engine._pause_requested is True + assert engine._force_stop is False + + def test_second_call_sets_force_stop(self): + engine = _make_engine() + + engine.request_pause() # first + engine.request_pause() # second + + assert engine._pause_requested is True + assert engine._force_stop is True + + def test_third_call_after_force_stop_is_noop(self): + engine = _make_engine() + + engine.request_pause() + engine.request_pause() + engine.request_pause() # should not raise + + assert engine._force_stop is True + + +# --------------------------------------------------------------------------- +# Tests: checkpoint methods +# --------------------------------------------------------------------------- + + +class TestCheckpointMethods: + def test_is_checkpoint_time_false_when_disabled(self): + engine = _make_engine() # no crawldir + assert engine._is_checkpoint_time() is False + + @pytest.mark.asyncio + async def test_save_and_restore_checkpoint(self): + with tempfile.TemporaryDirectory() as tmpdir: + spider = MockSpider() + engine = _make_engine(spider=spider, crawldir=tmpdir) + + # Enqueue a request so snapshot has data + req = Request("https://example.com", sid="default") + engine._normalize_request(req) + await engine.scheduler.enqueue(req) + + await engine._save_checkpoint() + + # Verify checkpoint file exists + checkpoint_path = Path(tmpdir) / "checkpoint.pkl" + assert checkpoint_path.exists() + + @pytest.mark.asyncio + async def test_restore_when_no_checkpoint_returns_false(self): + with tempfile.TemporaryDirectory() as tmpdir: + 
engine = _make_engine(crawldir=tmpdir) + result = await engine._restore_from_checkpoint() + assert result is False + + @pytest.mark.asyncio + async def test_restore_from_checkpoint_raises_when_disabled(self): + engine = _make_engine() # no crawldir → checkpoint disabled + with pytest.raises(RuntimeError): + await engine._restore_from_checkpoint() + + +# --------------------------------------------------------------------------- +# Tests: crawl +# --------------------------------------------------------------------------- + + +class TestCrawl: + @pytest.mark.asyncio + async def test_basic_crawl_returns_stats(self): + spider = MockSpider() + engine = _make_engine(spider=spider) + + stats = await engine.crawl() + + assert isinstance(stats, CrawlStats) + assert stats.requests_count >= 1 + assert stats.items_scraped >= 1 + + @pytest.mark.asyncio + async def test_crawl_calls_on_start_and_on_close(self): + spider = MockSpider() + engine = _make_engine(spider=spider) + + await engine.crawl() + + assert len(spider.on_start_calls) == 1 + assert spider.on_start_calls[0]["resuming"] is False + assert spider.on_close_calls == 1 + + @pytest.mark.asyncio + async def test_crawl_sets_stats_timing(self): + spider = MockSpider() + engine = _make_engine(spider=spider) + + stats = await engine.crawl() + + assert stats.start_time > 0 + assert stats.end_time > 0 + assert stats.end_time >= stats.start_time + + @pytest.mark.asyncio + async def test_crawl_sets_concurrency_stats(self): + spider = MockSpider(concurrent_requests=16, concurrent_requests_per_domain=4) + engine = _make_engine(spider=spider) + + stats = await engine.crawl() + + assert stats.concurrent_requests == 16 + assert stats.concurrent_requests_per_domain == 4 + + @pytest.mark.asyncio + async def test_crawl_processes_multiple_start_urls(self): + spider = MockSpider() + + urls = ["https://example.com/1", "https://example.com/2", "https://example.com/3"] + + async def multi_start_requests() -> AsyncGenerator[Request, None]: + 
for url in urls: + yield Request(url, sid="default") + + spider.start_requests = multi_start_requests # type: ignore[assignment] + engine = _make_engine(spider=spider) + + stats = await engine.crawl() + + assert stats.requests_count == 3 + assert stats.items_scraped == 3 + + @pytest.mark.asyncio + async def test_crawl_follows_yielded_requests(self): + """Test that requests yielded from callbacks are processed.""" + call_count = 0 + + async def parse_with_follow(response) -> AsyncGenerator: + nonlocal call_count + call_count += 1 + if call_count == 1: + yield Request("https://example.com/page2", sid="default") + yield {"page": str(response)} + + spider = MockSpider() + spider.parse = parse_with_follow # type: ignore[assignment] + engine = _make_engine(spider=spider) + + stats = await engine.crawl() + + assert stats.requests_count == 2 + assert stats.items_scraped == 2 + + @pytest.mark.asyncio + async def test_crawl_with_download_delay(self): + spider = MockSpider(download_delay=0.01) + engine = _make_engine(spider=spider) + + stats = await engine.crawl() + + assert stats.download_delay == 0.01 + assert stats.requests_count >= 1 + + @pytest.mark.asyncio + async def test_crawl_filters_offsite_requests(self): + async def parse_offsite(response) -> AsyncGenerator: + yield Request("https://other-domain.com/page", sid="default") + yield {"url": str(response)} + + spider = MockSpider(allowed_domains={"example.com"}) + spider.parse = parse_offsite # type: ignore[assignment] + engine = _make_engine(spider=spider) + + stats = await engine.crawl() + + assert stats.offsite_requests_count == 1 + assert stats.requests_count == 1 # Only the initial request + + @pytest.mark.asyncio + async def test_crawl_cleans_up_checkpoint_on_completion(self): + with tempfile.TemporaryDirectory() as tmpdir: + spider = MockSpider() + engine = _make_engine(spider=spider, crawldir=tmpdir) + + await engine.crawl() + + checkpoint_path = Path(tmpdir) / "checkpoint.pkl" + assert not 
checkpoint_path.exists() # Cleaned up + + @pytest.mark.asyncio + async def test_crawl_handles_fetch_error_gracefully(self): + spider = MockSpider() + sm = SessionManager() + sm.add("default", ErrorSession()) + engine = CrawlerEngine(spider, sm) + + stats = await engine.crawl() + + assert stats.failed_requests_count == 1 + assert len(spider.on_error_calls) == 1 + + @pytest.mark.asyncio + async def test_crawl_log_levels_populated(self): + spider = MockSpider() + engine = _make_engine(spider=spider) + + stats = await engine.crawl() + + assert isinstance(stats.log_levels_counter, dict) + + @pytest.mark.asyncio + async def test_crawl_resets_state_on_each_run(self): + spider = MockSpider() + engine = _make_engine(spider=spider) + + # Run first crawl + await engine.crawl() + assert engine.stats.requests_count >= 1 + + # Run second crawl - stats should reset + stats = await engine.crawl() + # Items are cleared on each crawl + assert engine.paused is False + + +# --------------------------------------------------------------------------- +# Tests: items property +# --------------------------------------------------------------------------- + + +class TestItemsProperty: + def test_items_returns_item_list(self): + engine = _make_engine() + assert isinstance(engine.items, ItemList) + + def test_items_initially_empty(self): + engine = _make_engine() + assert len(engine.items) == 0 + + @pytest.mark.asyncio + async def test_items_populated_after_crawl(self): + engine = _make_engine() + await engine.crawl() + assert len(engine.items) >= 1 + + +# --------------------------------------------------------------------------- +# Tests: streaming (__aiter__ / _stream) +# --------------------------------------------------------------------------- + + +class TestStreaming: + @pytest.mark.asyncio + async def test_stream_yields_items(self): + spider = MockSpider() + engine = _make_engine(spider=spider) + + items = [] + async for item in engine: + items.append(item) + + assert len(items) >= 1 
+ assert isinstance(items[0], dict) + + @pytest.mark.asyncio + async def test_stream_processes_follow_up_requests(self): + call_count = 0 + + async def parse_with_follow(response) -> AsyncGenerator: + nonlocal call_count + call_count += 1 + if call_count == 1: + yield Request("https://example.com/page2", sid="default") + yield {"page": call_count} + + spider = MockSpider() + spider.parse = parse_with_follow # type: ignore[assignment] + engine = _make_engine(spider=spider) + + items = [] + async for item in engine: + items.append(item) + + assert len(items) == 2 + + @pytest.mark.asyncio + async def test_stream_items_not_stored_in_items_list(self): + """When streaming, items go to the stream, not to engine._items.""" + spider = MockSpider() + engine = _make_engine(spider=spider) + + items = [] + async for item in engine: + items.append(item) + + # Items were sent through stream, not appended to _items + assert len(items) >= 1 + assert len(engine.items) == 0 + + +# --------------------------------------------------------------------------- +# Tests: pause during crawl +# --------------------------------------------------------------------------- + + +class TestPauseDuringCrawl: + @pytest.mark.asyncio + async def test_pause_stops_crawl_gracefully(self): + processed = 0 + + async def slow_parse(response) -> AsyncGenerator: + nonlocal processed + processed += 1 + # Yield more requests to keep the crawl going + if processed <= 2: + yield Request(f"https://example.com/p{processed + 1}", sid="default") + yield {"n": processed} + + spider = MockSpider() + spider.parse = slow_parse # type: ignore[assignment] + engine = _make_engine(spider=spider) + + # Request pause immediately - the engine will stop as soon as active tasks complete + engine._pause_requested = True + + stats = await engine.crawl() + # Should stop without processing everything + assert engine._running is False + + @pytest.mark.asyncio + async def test_pause_with_checkpoint_sets_paused(self): + with 
tempfile.TemporaryDirectory() as tmpdir: + parse_count = 0 + + async def parse_and_pause(response) -> AsyncGenerator: + nonlocal parse_count + parse_count += 1 + # Request pause after first request, but yield follow-ups + if parse_count == 1: + engine.request_pause() + yield Request("https://example.com/p2", sid="default") + yield {"n": parse_count} + + spider = MockSpider() + spider.parse = parse_and_pause # type: ignore[assignment] + engine = _make_engine(spider=spider, crawldir=tmpdir) + + await engine.crawl() + + assert engine.paused is True + + @pytest.mark.asyncio + async def test_pause_without_checkpoint_does_not_set_paused(self): + spider = MockSpider() + engine = _make_engine(spider=spider) + + engine._pause_requested = True + + await engine.crawl() + + assert engine.paused is False diff --git a/tests/spiders/test_request.py b/tests/spiders/test_request.py new file mode 100644 index 0000000000000000000000000000000000000000..997a71ba1539fd89090e8183f363db2cdabc726a --- /dev/null +++ b/tests/spiders/test_request.py @@ -0,0 +1,381 @@ +"""Tests for the Request class.""" + +import pickle + +import pytest + +from scrapling.spiders.request import Request +from scrapling.core._types import Any, Dict, AsyncGenerator + + +class TestRequestCreation: + """Test Request initialization and basic attributes.""" + + def test_basic_request_creation(self): + """Test creating a request with just a URL.""" + request = Request("https://example.com") + + assert request.url == "https://example.com" + assert request.sid == "" + assert request.callback is None + assert request.priority == 0 + assert request.dont_filter is False + assert request.meta == {} + assert request._retry_count == 0 + assert request._session_kwargs == {} + + def test_request_with_all_parameters(self): + """Test creating a request with all parameters.""" + + async def my_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield {"test": "data"} + + request = Request( + 
url="https://example.com/page", + sid="my_session", + callback=my_callback, + priority=10, + dont_filter=True, + meta={"key": "value"}, + _retry_count=2, + proxy="http://proxy:8080", + timeout=30, + ) + + assert request.url == "https://example.com/page" + assert request.sid == "my_session" + assert request.callback == my_callback + assert request.priority == 10 + assert request.dont_filter is True + assert request.meta == {"key": "value"} + assert request._retry_count == 2 + assert request._session_kwargs == {"proxy": "http://proxy:8080", "timeout": 30} + + def test_request_meta_default_is_empty_dict(self): + """Test that meta defaults to empty dict, not shared reference.""" + r1 = Request("https://example.com") + r2 = Request("https://example.com") + + r1.meta["key"] = "value" + + assert r1.meta == {"key": "value"} + assert r2.meta == {} + + +class TestRequestProperties: + """Test Request computed properties.""" + + def test_domain_extraction(self): + """Test domain property extracts netloc correctly.""" + request = Request("https://www.example.com/path/page.html?query=1") + assert request.domain == "www.example.com" + + def test_domain_with_port(self): + """Test domain extraction with port number.""" + request = Request("http://localhost:8080/api") + assert request.domain == "localhost:8080" + + def test_domain_with_subdomain(self): + """Test domain extraction with subdomains.""" + request = Request("https://api.v2.example.com/endpoint") + assert request.domain == "api.v2.example.com" + + def test_fingerprint_returns_bytes(self): + """Test fingerprint generation returns bytes.""" + request = Request("https://example.com") + fp = request.update_fingerprint() + assert isinstance(fp, bytes) + assert len(fp) == 20 # SHA1 produces 20 bytes + + def test_fingerprint_is_deterministic(self): + """Test same request produces same fingerprint.""" + r1 = Request("https://example.com", data={"key": "value"}) + r2 = Request("https://example.com", data={"key": "value"}) + assert 
r1.update_fingerprint() == r2.update_fingerprint() + + def test_fingerprint_different_urls(self): + """Test different URLs produce different fingerprints.""" + r1 = Request("https://example.com/page1") + r2 = Request("https://example.com/page2") + assert r1.update_fingerprint() != r2.update_fingerprint() + + +class TestRequestCopy: + """Test Request copy functionality.""" + + def test_copy_creates_independent_request(self): + """Test that copy creates a new independent request.""" + + async def callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + original = Request( + url="https://example.com", + sid="session", + callback=callback, + priority=5, + dont_filter=True, + meta={"original": True}, + _retry_count=1, + proxy="http://proxy:8080", + ) + + copied = original.copy() + + # Check all values are copied + assert copied.url == original.url + assert copied.sid == original.sid + assert copied.callback == original.callback + assert copied.priority == original.priority + assert copied.dont_filter == original.dont_filter + assert copied.meta == original.meta + assert copied._retry_count == original._retry_count + assert copied._session_kwargs == original._session_kwargs + + # Check they are different objects + assert copied is not original + assert copied.meta is not original.meta # Meta should be a copy + + def test_copy_meta_is_independent(self): + """Test that modifying copied meta doesn't affect original.""" + original = Request("https://example.com", meta={"key": "original"}) + copied = original.copy() + + copied.meta["key"] = "modified" + copied.meta["new_key"] = "new_value" + + assert original.meta == {"key": "original"} + assert copied.meta == {"key": "modified", "new_key": "new_value"} + + +class TestRequestComparison: + """Test Request comparison operators.""" + + def test_priority_less_than(self): + """Test less than comparison by priority.""" + low_priority = Request("https://example.com/1", priority=1) + high_priority 
= Request("https://example.com/2", priority=10) + + assert low_priority < high_priority + assert not high_priority < low_priority + + def test_priority_greater_than(self): + """Test greater than comparison by priority.""" + low_priority = Request("https://example.com/1", priority=1) + high_priority = Request("https://example.com/2", priority=10) + + assert high_priority > low_priority + assert not low_priority > high_priority + + def test_equality_by_fingerprint(self): + """Test equality comparison by fingerprint.""" + r1 = Request("https://example.com") + r2 = Request("https://example.com") + r3 = Request("https://example.com/other") + + # Generate fingerprints first (required for equality) + r1.update_fingerprint() + r2.update_fingerprint() + r3.update_fingerprint() + + assert r1 == r2 + assert r1 != r3 + + def test_equality_different_priorities_same_fingerprint(self): + """Test requests with same fingerprint are equal despite different priorities.""" + r1 = Request("https://example.com", priority=1) + r2 = Request("https://example.com", priority=100) + + # Generate fingerprints first + r1.update_fingerprint() + r2.update_fingerprint() + + assert r1 == r2 # Same fingerprint means equal + + def test_comparison_with_non_request(self): + """Test comparison with non-Request types returns NotImplemented.""" + request = Request("https://example.com") + + assert request.__lt__("not a request") == NotImplemented + assert request.__gt__("not a request") == NotImplemented + assert request.__eq__("not a request") == NotImplemented + + +class TestRequestStringRepresentation: + """Test Request string representations.""" + + def test_str_returns_url(self): + """Test __str__ returns the URL.""" + request = Request("https://example.com/page") + assert str(request) == "https://example.com/page" + + def test_repr_without_callback(self): + """Test __repr__ without callback.""" + request = Request("https://example.com", priority=5) + repr_str = repr(request) + + assert "Request" in 
repr_str + assert "https://example.com" in repr_str + assert "priority=5" in repr_str + assert "callback=None" in repr_str + + def test_repr_with_callback(self): + """Test __repr__ with named callback.""" + + async def my_custom_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + request = Request("https://example.com", callback=my_custom_callback) + repr_str = repr(request) + + assert "callback=my_custom_callback" in repr_str + + +class TestRequestPickling: + """Test Request serialization for checkpointing.""" + + def test_pickle_without_callback(self): + """Test pickling request without callback.""" + original = Request( + url="https://example.com", + sid="session", + priority=5, + meta={"key": "value"}, + ) + + pickled = pickle.dumps(original) + restored = pickle.loads(pickled) + + assert restored.url == original.url + assert restored.sid == original.sid + assert restored.priority == original.priority + assert restored.meta == original.meta + assert restored.callback is None + + def test_pickle_with_callback_stores_name(self): + """Test that callback name is stored when pickling.""" + + async def parse_page(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield {"data": "test"} + + original = Request("https://example.com", callback=parse_page) + + # Check getstate stores callback name + state = original.__getstate__() + assert state["_callback_name"] == "parse_page" + assert state["callback"] is None + + def test_pickle_with_none_callback(self): + """Test pickling with None callback.""" + original = Request("https://example.com", callback=None) + + state = original.__getstate__() + assert state["_callback_name"] is None + assert state["callback"] is None + + def test_setstate_stores_callback_name(self): + """Test that setstate correctly handles callback name.""" + request = Request("https://example.com") + state = { + "url": "https://example.com", + "sid": "", + "callback": None, + "priority": 0, + 
"dont_filter": False, + "meta": {}, + "_retry_count": 0, + "_session_kwargs": {}, + "_callback_name": "custom_parse", + } + + request.__setstate__(state) + + assert hasattr(request, "_callback_name") + assert request._callback_name == "custom_parse" + + def test_pickle_roundtrip_preserves_session_kwargs(self): + """Test that session kwargs are preserved through pickle.""" + original = Request( + "https://example.com", + proxy="http://proxy:8080", + timeout=30, + headers={"User-Agent": "test"}, + ) + + pickled = pickle.dumps(original) + restored = pickle.loads(pickled) + + assert restored._session_kwargs == { + "proxy": "http://proxy:8080", + "timeout": 30, + "headers": {"User-Agent": "test"}, + } + + +class TestRequestRestoreCallback: + """Test callback restoration from spider.""" + + def test_restore_callback_from_spider(self): + """Test restoring callback from spider instance.""" + + class MockSpider: + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + async def parse_detail(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield {"detail": True} + + spider = MockSpider() + request = Request("https://example.com") + request._callback_name = "parse_detail" + + request._restore_callback(spider) # type: ignore[arg-type] + + assert request.callback == spider.parse_detail + assert not hasattr(request, "_callback_name") + + def test_restore_callback_falls_back_to_parse(self): + """Test that missing callback falls back to spider.parse.""" + + class MockSpider: + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + spider = MockSpider() + request = Request("https://example.com") + request._callback_name = "nonexistent_method" + + request._restore_callback(spider) # type: ignore[arg-type] + + assert request.callback == spider.parse + assert not hasattr(request, "_callback_name") + + def test_restore_callback_with_none_name(self): + 
"""Test restore callback when _callback_name is None.""" + + class MockSpider: + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + spider = MockSpider() + request = Request("https://example.com") + request._callback_name = None + + request._restore_callback(spider) # type: ignore[arg-type] + + # Should clean up _callback_name attribute + assert not hasattr(request, "_callback_name") + + def test_restore_callback_without_callback_name_attr(self): + """Test restore callback when _callback_name attribute doesn't exist.""" + + class MockSpider: + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + spider = MockSpider() + request = Request("https://example.com") + # Don't set _callback_name + + # Should not raise an error + request._restore_callback(spider) # type: ignore[arg-type] diff --git a/tests/spiders/test_result.py b/tests/spiders/test_result.py new file mode 100644 index 0000000000000000000000000000000000000000..f59f2362de29d1687c73ac6b7f1f418b0cdb3699 --- /dev/null +++ b/tests/spiders/test_result.py @@ -0,0 +1,327 @@ +"""Tests for the result module (ItemList, CrawlStats, CrawlResult).""" + +import json +import tempfile +from pathlib import Path + +import pytest + +from scrapling.spiders.result import ItemList, CrawlStats, CrawlResult + + +class TestItemList: + """Test ItemList functionality.""" + + def test_itemlist_is_list(self): + """Test that ItemList is a list subclass.""" + items = ItemList() + + assert isinstance(items, list) + + def test_itemlist_basic_operations(self): + """Test basic list operations work.""" + items = ItemList() + + items.append({"id": 1}) + items.append({"id": 2}) + + assert len(items) == 2 + assert items[0] == {"id": 1} + + def test_to_json_creates_file(self): + """Test to_json creates JSON file.""" + items = ItemList() + items.append({"name": "test", "value": 123}) + items.append({"name": "test2", "value": 456}) + + with 
tempfile.TemporaryDirectory() as tmpdir: + path = Path(tmpdir) / "output.json" + items.to_json(path) + + assert path.exists() + + content = json.loads(path.read_text()) + assert len(content) == 2 + assert content[0]["name"] == "test" + + def test_to_json_creates_parent_directory(self): + """Test to_json creates parent directories.""" + items = ItemList() + items.append({"data": "test"}) + + with tempfile.TemporaryDirectory() as tmpdir: + path = Path(tmpdir) / "nested" / "dirs" / "output.json" + items.to_json(path) + + assert path.exists() + + def test_to_json_with_indent(self): + """Test to_json with indentation.""" + items = ItemList() + items.append({"key": "value"}) + + with tempfile.TemporaryDirectory() as tmpdir: + path = Path(tmpdir) / "output.json" + items.to_json(path, indent=True) + + content = path.read_text() + # Indented JSON should have newlines + assert "\n" in content + + def test_to_jsonl_creates_file(self): + """Test to_jsonl creates JSON Lines file.""" + items = ItemList() + items.append({"id": 1, "name": "first"}) + items.append({"id": 2, "name": "second"}) + items.append({"id": 3, "name": "third"}) + + with tempfile.TemporaryDirectory() as tmpdir: + path = Path(tmpdir) / "output.jsonl" + items.to_jsonl(path) + + assert path.exists() + + lines = path.read_text().strip().split("\n") + assert len(lines) == 3 + + # Each line should be valid JSON + for line in lines: + parsed = json.loads(line) + assert "id" in parsed + assert "name" in parsed + + def test_to_jsonl_one_object_per_line(self): + """Test that JSONL has one JSON object per line.""" + items = ItemList() + items.append({"line": 1}) + items.append({"line": 2}) + + with tempfile.TemporaryDirectory() as tmpdir: + path = Path(tmpdir) / "output.jsonl" + items.to_jsonl(path) + + lines = path.read_text().strip().split("\n") + + assert json.loads(lines[0])["line"] == 1 + assert json.loads(lines[1])["line"] == 2 + + +class TestCrawlStats: + """Test CrawlStats dataclass.""" + + def 
test_default_values(self): + """Test CrawlStats default values.""" + stats = CrawlStats() + + assert stats.requests_count == 0 + assert stats.concurrent_requests == 0 + assert stats.failed_requests_count == 0 + assert stats.response_bytes == 0 + assert stats.items_scraped == 0 + assert stats.items_dropped == 0 + assert stats.start_time == 0.0 + assert stats.end_time == 0.0 + assert stats.custom_stats == {} + assert stats.response_status_count == {} + assert stats.proxies == [] + + def test_elapsed_seconds(self): + """Test elapsed_seconds property.""" + stats = CrawlStats(start_time=100.0, end_time=150.0) + + assert stats.elapsed_seconds == 50.0 + + def test_requests_per_second(self): + """Test requests_per_second calculation.""" + stats = CrawlStats( + requests_count=100, + start_time=0.0, + end_time=10.0, + ) + + assert stats.requests_per_second == 10.0 + + def test_requests_per_second_zero_elapsed(self): + """Test requests_per_second when elapsed is zero.""" + stats = CrawlStats( + requests_count=100, + start_time=0.0, + end_time=0.0, + ) + + assert stats.requests_per_second == 0.0 + + def test_increment_status(self): + """Test increment_status method.""" + stats = CrawlStats() + + stats.increment_status(200) + stats.increment_status(200) + stats.increment_status(404) + + assert stats.response_status_count == {"status_200": 2, "status_404": 1} + + def test_increment_response_bytes(self): + """Test increment_response_bytes method.""" + stats = CrawlStats() + + stats.increment_response_bytes("example.com", 1000) + stats.increment_response_bytes("example.com", 500) + stats.increment_response_bytes("other.com", 2000) + + assert stats.response_bytes == 3500 + assert stats.domains_response_bytes == { + "example.com": 1500, + "other.com": 2000, + } + + def test_increment_requests_count(self): + """Test increment_requests_count method.""" + stats = CrawlStats() + + stats.increment_requests_count("session1") + stats.increment_requests_count("session1") + 
stats.increment_requests_count("session2") + + assert stats.requests_count == 3 + assert stats.sessions_requests_count == {"session1": 2, "session2": 1} + + def test_to_dict(self): + """Test to_dict method returns all stats.""" + stats = CrawlStats( + items_scraped=10, + items_dropped=2, + requests_count=15, + start_time=0.0, + end_time=5.0, + ) + stats.increment_status(200) + + result = stats.to_dict() + + assert result["items_scraped"] == 10 + assert result["items_dropped"] == 2 + assert result["requests_count"] == 15 + assert result["elapsed_seconds"] == 5.0 + assert result["requests_per_second"] == 3.0 + assert result["response_status_count"] == {"status_200": 1} + + def test_custom_stats(self): + """Test custom_stats can be used.""" + stats = CrawlStats() + stats.custom_stats["my_metric"] = 42 + stats.custom_stats["another"] = "value" + + assert stats.custom_stats["my_metric"] == 42 + assert stats.to_dict()["custom_stats"]["my_metric"] == 42 + + +class TestCrawlResult: + """Test CrawlResult dataclass.""" + + def test_basic_creation(self): + """Test basic CrawlResult creation.""" + stats = CrawlStats(items_scraped=5) + items = ItemList() + items.extend([{"id": i} for i in range(5)]) + + result = CrawlResult(stats=stats, items=items) + + assert result.stats.items_scraped == 5 + assert len(result.items) == 5 + assert result.paused is False + + def test_completed_property_true_when_not_paused(self): + """Test completed is True when not paused.""" + result = CrawlResult( + stats=CrawlStats(), + items=ItemList(), + paused=False, + ) + + assert result.completed is True + + def test_completed_property_false_when_paused(self): + """Test completed is False when paused.""" + result = CrawlResult( + stats=CrawlStats(), + items=ItemList(), + paused=True, + ) + + assert result.completed is False + + def test_len_returns_item_count(self): + """Test len returns number of items.""" + items = ItemList() + items.extend([{"id": i} for i in range(10)]) + + result = 
CrawlResult(stats=CrawlStats(), items=items) + + assert len(result) == 10 + + def test_iter_yields_items(self): + """Test iteration yields items.""" + items = ItemList() + items.extend([{"id": 1}, {"id": 2}, {"id": 3}]) + + result = CrawlResult(stats=CrawlStats(), items=items) + + collected = list(result) + + assert collected == [{"id": 1}, {"id": 2}, {"id": 3}] + + def test_result_with_stats(self): + """Test CrawlResult with populated stats.""" + stats = CrawlStats( + requests_count=100, + items_scraped=50, + failed_requests_count=5, + start_time=0.0, + end_time=10.0, + ) + items = ItemList() + + result = CrawlResult(stats=stats, items=items) + + assert result.stats.requests_count == 100 + assert result.stats.items_scraped == 50 + assert result.stats.requests_per_second == 10.0 + + +class TestCrawlResultIntegration: + """Integration tests for result classes.""" + + def test_full_workflow(self): + """Test realistic workflow with all result classes.""" + # Simulate a crawl + stats = CrawlStats(start_time=1000.0) + + # Simulate requests + for _ in range(10): + stats.increment_requests_count("default") + stats.increment_status(200) + stats.increment_response_bytes("example.com", 5000) + + # Simulate some failures + stats.failed_requests_count = 2 + stats.blocked_requests_count = 1 + + # Collect items + items = ItemList() + for i in range(8): + items.append({"product_id": i, "name": f"Product {i}"}) + stats.items_scraped += 1 + + # Finish crawl + stats.end_time = 1005.0 + + # Create result + result = CrawlResult(stats=stats, items=items, paused=False) + + # Verify + assert result.completed is True + assert len(result) == 8 + assert result.stats.requests_count == 10 + assert result.stats.requests_per_second == 2.0 + assert result.stats.response_bytes == 50000 diff --git a/tests/spiders/test_scheduler.py b/tests/spiders/test_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..a7365aa1743f9d7a8bd3558ef613f245dbb752ea --- /dev/null +++ 
b/tests/spiders/test_scheduler.py @@ -0,0 +1,386 @@ +"""Tests for the Scheduler class.""" + +import pytest + +from scrapling.spiders.request import Request +from scrapling.spiders.scheduler import Scheduler +from scrapling.spiders.checkpoint import CheckpointData + + +class TestSchedulerInit: + """Test Scheduler initialization.""" + + def test_scheduler_starts_empty(self): + """Test that scheduler starts with empty queue.""" + scheduler = Scheduler() + + assert len(scheduler) == 0 + assert scheduler.is_empty is True + + +class TestSchedulerEnqueue: + """Test Scheduler enqueue functionality.""" + + @pytest.mark.asyncio + async def test_enqueue_single_request(self): + """Test enqueueing a single request.""" + scheduler = Scheduler() + request = Request("https://example.com") + + result = await scheduler.enqueue(request) + + assert result is True + assert len(scheduler) == 1 + assert scheduler.is_empty is False + + @pytest.mark.asyncio + async def test_enqueue_multiple_requests(self): + """Test enqueueing multiple requests.""" + scheduler = Scheduler() + + for i in range(5): + request = Request(f"https://example.com/{i}") + await scheduler.enqueue(request) + + assert len(scheduler) == 5 + + @pytest.mark.asyncio + async def test_enqueue_duplicate_filtered(self): + """Test that duplicate requests are filtered by default.""" + scheduler = Scheduler() + + request1 = Request("https://example.com", sid="s1") + request2 = Request("https://example.com", sid="s1") # Same fingerprint + + result1 = await scheduler.enqueue(request1) + result2 = await scheduler.enqueue(request2) + + assert result1 is True + assert result2 is False # Duplicate filtered + assert len(scheduler) == 1 + + @pytest.mark.asyncio + async def test_enqueue_duplicate_allowed_with_dont_filter(self): + """Test that dont_filter allows duplicate requests.""" + scheduler = Scheduler() + + request1 = Request("https://example.com", sid="s1") + request2 = Request("https://example.com", sid="s1", dont_filter=True) + + 
result1 = await scheduler.enqueue(request1) + result2 = await scheduler.enqueue(request2) + + assert result1 is True + assert result2 is True + assert len(scheduler) == 2 + + @pytest.mark.asyncio + async def test_enqueue_different_methods_not_duplicate(self): + """Test that same URL with different methods are not duplicates.""" + scheduler = Scheduler() + + request1 = Request("https://example.com", method="GET") + request2 = Request("https://example.com", method="POST") + + result1 = await scheduler.enqueue(request1) + result2 = await scheduler.enqueue(request2) + + assert result1 is True + assert result2 is True + assert len(scheduler) == 2 + + +class TestSchedulerDequeue: + """Test Scheduler dequeue functionality.""" + + @pytest.mark.asyncio + async def test_dequeue_returns_request(self): + """Test that dequeue returns the enqueued request.""" + scheduler = Scheduler() + original = Request("https://example.com") + + await scheduler.enqueue(original) + dequeued = await scheduler.dequeue() + + assert dequeued.url == original.url + + @pytest.mark.asyncio + async def test_dequeue_respects_priority_order(self): + """Test that higher priority requests are dequeued first.""" + scheduler = Scheduler() + + low = Request("https://example.com/low", priority=1) + high = Request("https://example.com/high", priority=10) + medium = Request("https://example.com/medium", priority=5) + + await scheduler.enqueue(low) + await scheduler.enqueue(high) + await scheduler.enqueue(medium) + + # Should get high priority first + first = await scheduler.dequeue() + assert first.url == "https://example.com/high" + + second = await scheduler.dequeue() + assert second.url == "https://example.com/medium" + + third = await scheduler.dequeue() + assert third.url == "https://example.com/low" + + @pytest.mark.asyncio + async def test_dequeue_fifo_for_same_priority(self): + """Test FIFO ordering for requests with same priority.""" + scheduler = Scheduler() + + for i in range(3): + request = 
Request(f"https://example.com/{i}", priority=5) + await scheduler.enqueue(request) + + first = await scheduler.dequeue() + second = await scheduler.dequeue() + third = await scheduler.dequeue() + + # Should be in FIFO order since same priority + assert first.url == "https://example.com/0" + assert second.url == "https://example.com/1" + assert third.url == "https://example.com/2" + + @pytest.mark.asyncio + async def test_dequeue_updates_length(self): + """Test that dequeue decreases the queue length.""" + scheduler = Scheduler() + + await scheduler.enqueue(Request("https://example.com/1")) + await scheduler.enqueue(Request("https://example.com/2")) + + assert len(scheduler) == 2 + + await scheduler.dequeue() + assert len(scheduler) == 1 + + await scheduler.dequeue() + assert len(scheduler) == 0 + assert scheduler.is_empty is True + + +class TestSchedulerSnapshot: + """Test Scheduler snapshot functionality for checkpointing.""" + + @pytest.mark.asyncio + async def test_snapshot_empty_scheduler(self): + """Test snapshot of empty scheduler.""" + scheduler = Scheduler() + + requests, seen = scheduler.snapshot() + + assert requests == [] + assert seen == set() + + @pytest.mark.asyncio + async def test_snapshot_captures_pending_requests(self): + """Test snapshot captures all pending requests.""" + scheduler = Scheduler() + + await scheduler.enqueue(Request("https://example.com/1", priority=5)) + await scheduler.enqueue(Request("https://example.com/2", priority=10)) + await scheduler.enqueue(Request("https://example.com/3", priority=1)) + + requests, seen = scheduler.snapshot() + + assert len(requests) == 3 + # Should be sorted by priority (highest first due to negative priority in queue) + assert requests[0].url == "https://example.com/2" # priority 10 + assert requests[1].url == "https://example.com/1" # priority 5 + assert requests[2].url == "https://example.com/3" # priority 1 + + @pytest.mark.asyncio + async def test_snapshot_captures_seen_set(self): + """Test 
snapshot captures seen fingerprints.""" + scheduler = Scheduler() + + await scheduler.enqueue(Request("https://example.com/1")) + await scheduler.enqueue(Request("https://example.com/2")) + + requests, seen = scheduler.snapshot() + + assert len(seen) == 2 + # Fingerprints are now bytes (SHA1 hashes) + for fp in seen: + assert isinstance(fp, bytes) + assert len(fp) == 20 # SHA1 produces 20 bytes + + @pytest.mark.asyncio + async def test_snapshot_returns_copies(self): + """Test that snapshot returns copies, not references.""" + scheduler = Scheduler() + + await scheduler.enqueue(Request("https://example.com")) + + requests, seen = scheduler.snapshot() + + # Modifying snapshot shouldn't affect scheduler + requests.append(Request("https://modified.com")) + seen.add(b"new_fingerprint_bytes") + + original_requests, original_seen = scheduler.snapshot() + + assert len(original_requests) == 1 + assert b"new_fingerprint_bytes" not in original_seen + + @pytest.mark.asyncio + async def test_snapshot_excludes_dequeued_requests(self): + """Test snapshot only includes pending requests.""" + scheduler = Scheduler() + + await scheduler.enqueue(Request("https://example.com/1")) + await scheduler.enqueue(Request("https://example.com/2")) + await scheduler.enqueue(Request("https://example.com/3")) + + # Dequeue one + await scheduler.dequeue() + + requests, seen = scheduler.snapshot() + + # Snapshot should only have 2 pending requests + assert len(requests) == 2 + # But seen should still have all 3 (deduplication tracking) + assert len(seen) == 3 + + +class TestSchedulerRestore: + """Test Scheduler restore functionality from checkpoint.""" + + @pytest.mark.asyncio + async def test_restore_requests(self): + """Test restoring requests from checkpoint data.""" + scheduler = Scheduler() + + checkpoint_requests = [ + Request("https://example.com/1", priority=10), + Request("https://example.com/2", priority=5), + ] + checkpoint_seen = {b"fp1_bytes_padded!", b"fp2_bytes_padded!", 
b"fp3_bytes_padded!"} + + data = CheckpointData(requests=checkpoint_requests, seen=checkpoint_seen) + + scheduler.restore(data) + + assert len(scheduler) == 2 + + @pytest.mark.asyncio + async def test_restore_seen_set(self): + """Test that restore sets up seen fingerprints.""" + scheduler = Scheduler() + + data = CheckpointData( + requests=[], + seen={b"fp1_bytes_here_pad", b"fp2_bytes_here_pad"}, # Bytes fingerprints + ) + + scheduler.restore(data) + + # Verify seen set was restored + _, seen = scheduler.snapshot() + assert seen == {b"fp1_bytes_here_pad", b"fp2_bytes_here_pad"} + + @pytest.mark.asyncio + async def test_restore_maintains_priority_order(self): + """Test that restored requests maintain priority order.""" + scheduler = Scheduler() + + # Requests should already be sorted by priority in checkpoint + checkpoint_requests = [ + Request("https://example.com/high", priority=10), + Request("https://example.com/low", priority=1), + ] + + data = CheckpointData(requests=checkpoint_requests, seen=set()) + scheduler.restore(data) + + # Dequeue should return high priority first + first = await scheduler.dequeue() + assert first.url == "https://example.com/high" + + second = await scheduler.dequeue() + assert second.url == "https://example.com/low" + + @pytest.mark.asyncio + async def test_restore_empty_checkpoint(self): + """Test restoring from empty checkpoint.""" + scheduler = Scheduler() + + data = CheckpointData(requests=[], seen=set()) + scheduler.restore(data) + + assert len(scheduler) == 0 + assert scheduler.is_empty is True + + +class TestSchedulerIntegration: + """Integration tests for Scheduler with checkpoint roundtrip.""" + + @pytest.mark.asyncio + async def test_snapshot_and_restore_roundtrip(self): + """Test that snapshot -> restore works correctly.""" + # Create and populate original scheduler + original = Scheduler() + + await original.enqueue(Request("https://example.com/1", sid="s1", priority=10)) + await 
original.enqueue(Request("https://example.com/2", sid="s1", priority=5)) + await original.enqueue(Request("https://example.com/3", sid="s2", priority=7)) + + # Snapshot + requests, seen = original.snapshot() + data = CheckpointData(requests=requests, seen=seen) + + # Restore to new scheduler + restored = Scheduler() + restored.restore(data) + + # Verify state matches + assert len(restored) == len(original) + + # Dequeue from both and compare + for _ in range(3): + orig_req = await original.dequeue() + rest_req = await restored.dequeue() + assert orig_req.url == rest_req.url + assert orig_req.priority == rest_req.priority + + @pytest.mark.asyncio + async def test_partial_processing_then_checkpoint(self): + """Test checkpointing after partial processing.""" + scheduler = Scheduler() + + # Enqueue 5 requests + for i in range(5): + await scheduler.enqueue(Request(f"https://example.com/{i}")) + + # Process 2 + await scheduler.dequeue() + await scheduler.dequeue() + + # Snapshot should show 3 pending, 5 seen + requests, seen = scheduler.snapshot() + + assert len(requests) == 3 + assert len(seen) == 5 + + @pytest.mark.asyncio + async def test_deduplication_after_restore(self): + """Test that deduplication works after restore.""" + scheduler = Scheduler() + + await scheduler.enqueue(Request("https://example.com", sid="s1")) + + requests, seen = scheduler.snapshot() + data = CheckpointData(requests=requests, seen=seen) + + # Restore to new scheduler + new_scheduler = Scheduler() + new_scheduler.restore(data) + + # Try to add duplicate - should be filtered + result = await new_scheduler.enqueue(Request("https://example.com", sid="s1")) + + assert result is False # Duplicate filtered based on restored seen set diff --git a/tests/spiders/test_session.py b/tests/spiders/test_session.py new file mode 100644 index 0000000000000000000000000000000000000000..c1eed5d83f4c42046a028713b888a6ba90da50c6 --- /dev/null +++ b/tests/spiders/test_session.py @@ -0,0 +1,352 @@ +"""Tests for the 
SessionManager class.""" + +from scrapling.core._types import Any +import pytest + +from scrapling.spiders.session import SessionManager + + +class MockSession: # type: ignore[type-arg] + """Mock session for testing without actual network calls.""" + + def __init__(self, name: str = "mock"): + self.name = name + self._is_alive = False + self._started = False + self._closed = False + + async def __aenter__(self): + self._is_alive = True + self._started = True + return self + + async def __aexit__(self, *args): + self._is_alive = False + self._closed = True + + async def fetch(self, url: str, **kwargs): + pass + + +class TestSessionManagerInit: + """Test SessionManager initialization.""" + + def test_manager_starts_empty(self): + """Test that manager starts with no sessions.""" + manager = SessionManager() + + assert len(manager) == 0 + + def test_manager_no_default_session_when_empty(self): + """Test that accessing default_session_id raises when empty.""" + manager = SessionManager() + + with pytest.raises(RuntimeError, match="No sessions registered"): + _ = manager.default_session_id + + +class TestSessionManagerAdd: + """Test SessionManager add functionality.""" + + def test_add_single_session(self): + """Test adding a single session.""" + manager = SessionManager() + session = MockSession() + + manager.add("test", session) + + assert len(manager) == 1 + assert "test" in manager + assert manager.session_ids == ["test"] + + def test_first_session_becomes_default(self): + """Test that first added session becomes default.""" + manager = SessionManager() + session = MockSession() + + manager.add("first", session) + + assert manager.default_session_id == "first" + + def test_add_multiple_sessions(self): + """Test adding multiple sessions.""" + manager = SessionManager() + + manager.add("session1", MockSession("s1")) + manager.add("session2", MockSession("s2")) + manager.add("session3", MockSession("s3")) + + assert len(manager) == 3 + assert "session1" in manager + 
assert "session2" in manager + assert "session3" in manager + + def test_explicit_default_session(self): + """Test setting explicit default session.""" + manager = SessionManager() + + manager.add("first", MockSession()) + manager.add("second", MockSession(), default=True) + + assert manager.default_session_id == "second" + + def test_add_duplicate_id_raises(self): + """Test that adding duplicate session ID raises.""" + manager = SessionManager() + manager.add("test", MockSession()) + + with pytest.raises(ValueError, match="already registered"): + manager.add("test", MockSession()) + + def test_add_returns_self_for_chaining(self): + """Test that add returns self for method chaining.""" + manager = SessionManager() + + result = manager.add("test", MockSession()) + + assert result is manager + + def test_method_chaining(self): + """Test fluent interface for adding sessions.""" + manager = SessionManager() + + manager.add("s1", MockSession()).add("s2", MockSession()).add("s3", MockSession()) + + assert len(manager) == 3 + + def test_add_lazy_session(self): + """Test adding lazy session.""" + manager = SessionManager() + + manager.add("lazy", MockSession(), lazy=True) + + assert "lazy" in manager + assert "lazy" in manager._lazy_sessions + + +class TestSessionManagerRemove: + """Test SessionManager remove/pop functionality.""" + + def test_remove_session(self): + """Test removing a session.""" + manager = SessionManager() + manager.add("test", MockSession()) + + manager.remove("test") + + assert "test" not in manager + assert len(manager) == 0 + + def test_remove_nonexistent_raises(self): + """Test removing nonexistent session raises.""" + manager = SessionManager() + + with pytest.raises(KeyError, match="not found"): + manager.remove("nonexistent") + + def test_pop_returns_session(self): + """Test pop returns the removed session.""" + manager = SessionManager() + session = MockSession("original") + manager.add("test", session) + + popped = manager.pop("test") + + 
assert popped is session + assert "test" not in manager + + def test_remove_default_updates_default(self): + """Test that removing default session updates default.""" + manager = SessionManager() + manager.add("first", MockSession()) + manager.add("second", MockSession()) + + assert manager.default_session_id == "first" + + manager.remove("first") + + assert manager.default_session_id == "second" + + def test_remove_lazy_session_cleans_up(self): + """Test that removing lazy session cleans up lazy set.""" + manager = SessionManager() + manager.add("lazy", MockSession(), lazy=True) + + manager.remove("lazy") + + assert "lazy" not in manager._lazy_sessions + + +class TestSessionManagerGet: + """Test SessionManager get functionality.""" + + def test_get_existing_session(self): + """Test getting an existing session.""" + manager = SessionManager() + session = MockSession("test") + manager.add("test", session) + + retrieved = manager.get("test") + + assert retrieved is session + + def test_get_nonexistent_raises_with_available(self): + """Test getting nonexistent session shows available sessions.""" + manager = SessionManager() + manager.add("session1", MockSession()) + manager.add("session2", MockSession()) + + with pytest.raises(KeyError, match="Available:"): + manager.get("nonexistent") + + +class TestSessionManagerContains: + """Test SessionManager contains functionality.""" + + def test_contains_existing(self): + """Test contains for existing session.""" + manager = SessionManager() + manager.add("test", MockSession()) + + assert "test" in manager + + def test_not_contains_missing(self): + """Test contains for missing session.""" + manager = SessionManager() + manager.add("test", MockSession()) + + assert "other" not in manager + + +class TestSessionManagerAsyncContext: + """Test SessionManager async context manager.""" + + @pytest.mark.asyncio + async def test_start_activates_sessions(self): + """Test that start activates non-lazy sessions.""" + manager = 
SessionManager() + session = MockSession() + manager.add("test", session) + + await manager.start() + + assert session._is_alive is True + assert manager._started is True + + @pytest.mark.asyncio + async def test_start_skips_lazy_sessions(self): + """Test that start skips lazy sessions.""" + manager = SessionManager() + eager_session = MockSession("eager") + lazy_session = MockSession("lazy") + + manager.add("eager", eager_session) + manager.add("lazy", lazy_session, lazy=True) + + await manager.start() + + assert eager_session._is_alive is True + assert lazy_session._is_alive is False + + @pytest.mark.asyncio + async def test_close_deactivates_sessions(self): + """Test that close deactivates all sessions.""" + manager = SessionManager() + session = MockSession() + manager.add("test", session) + + await manager.start() + assert session._is_alive is True + + await manager.close() + assert session._is_alive is False + assert manager._started is False + + @pytest.mark.asyncio + async def test_async_context_manager(self): + """Test using SessionManager as async context manager.""" + manager = SessionManager() + session = MockSession() + manager.add("test", session) + + async with manager: + assert session._is_alive is True + + assert session._is_alive is False + + @pytest.mark.asyncio + async def test_start_idempotent(self): + """Test that calling start multiple times is safe.""" + manager = SessionManager() + session = MockSession() + manager.add("test", session) + + await manager.start() + await manager.start() # Should not raise or double-start + + assert session._started is True + + +class TestSessionManagerProperties: + """Test SessionManager properties.""" + + def test_session_ids_returns_list(self): + """Test session_ids returns list of IDs.""" + manager = SessionManager() + manager.add("a", MockSession()) + manager.add("b", MockSession()) + manager.add("c", MockSession()) + + ids = manager.session_ids + + assert isinstance(ids, list) + assert set(ids) == {"a", 
"b", "c"} + + def test_len_returns_session_count(self): + """Test len returns number of sessions.""" + manager = SessionManager() + + assert len(manager) == 0 + + manager.add("s1", MockSession()) + assert len(manager) == 1 + + manager.add("s2", MockSession()) + assert len(manager) == 2 + + +class TestSessionManagerIntegration: + """Integration tests for SessionManager.""" + + def test_realistic_setup(self): + """Test realistic session manager setup.""" + manager = SessionManager() + + # Add different types of sessions + manager.add("default", MockSession("default")) + manager.add("backup", MockSession("backup")) + manager.add("lazy_special", MockSession("special"), lazy=True) + + assert len(manager) == 3 + assert manager.default_session_id == "default" + assert "lazy_special" in manager._lazy_sessions + + @pytest.mark.asyncio + async def test_lifecycle_management(self): + """Test complete lifecycle of session manager.""" + manager = SessionManager() + sessions = [MockSession(f"s{i}") for i in range(3)] + + for i, session in enumerate(sessions): + manager.add(f"session{i}", session) + + # Before start - no sessions active + assert all(not s._is_alive for s in sessions) + + # After start - all active + await manager.start() + assert all(s._is_alive for s in sessions) + + # After close - all inactive + await manager.close() + assert all(not s._is_alive for s in sessions) diff --git a/tests/spiders/test_spider.py b/tests/spiders/test_spider.py new file mode 100644 index 0000000000000000000000000000000000000000..a95f75d08d950f948f1d0e5f86c27fa5625c4e29 --- /dev/null +++ b/tests/spiders/test_spider.py @@ -0,0 +1,574 @@ +"""Tests for the Spider class and related components.""" + +import logging +import tempfile +from pathlib import Path + +import pytest + +from scrapling.spiders.spider import Spider, SessionConfigurationError, LogCounterHandler, BLOCKED_CODES +from scrapling.spiders.request import Request +from scrapling.spiders.session import SessionManager +from 
scrapling.spiders.result import CrawlStats +from scrapling.core._types import Any, Dict, AsyncGenerator + + +class TestLogCounterHandler: + """Test LogCounterHandler for tracking log counts.""" + + def test_initial_counts_are_zero(self): + """Test that handler starts with zero counts.""" + handler = LogCounterHandler() + counts = handler.get_counts() + + assert counts["debug"] == 0 + assert counts["info"] == 0 + assert counts["warning"] == 0 + assert counts["error"] == 0 + assert counts["critical"] == 0 + + def test_counts_debug_messages(self): + """Test counting debug level messages.""" + handler = LogCounterHandler() + record = logging.LogRecord( + name="test", + level=logging.DEBUG, + pathname="", + lineno=0, + msg="test", + args=(), + exc_info=None, + ) + + handler.emit(record) + handler.emit(record) + + assert handler.get_counts()["debug"] == 2 + + def test_counts_info_messages(self): + """Test counting info level messages.""" + handler = LogCounterHandler() + record = logging.LogRecord( + name="test", + level=logging.INFO, + pathname="", + lineno=0, + msg="test", + args=(), + exc_info=None, + ) + + handler.emit(record) + + assert handler.get_counts()["info"] == 1 + + def test_counts_warning_messages(self): + """Test counting warning level messages.""" + handler = LogCounterHandler() + record = logging.LogRecord( + name="test", + level=logging.WARNING, + pathname="", + lineno=0, + msg="test", + args=(), + exc_info=None, + ) + + handler.emit(record) + + assert handler.get_counts()["warning"] == 1 + + def test_counts_error_messages(self): + """Test counting error level messages.""" + handler = LogCounterHandler() + record = logging.LogRecord( + name="test", + level=logging.ERROR, + pathname="", + lineno=0, + msg="test", + args=(), + exc_info=None, + ) + + handler.emit(record) + + assert handler.get_counts()["error"] == 1 + + def test_counts_critical_messages(self): + """Test counting critical level messages.""" + handler = LogCounterHandler() + record = 
logging.LogRecord( + name="test", + level=logging.CRITICAL, + pathname="", + lineno=0, + msg="test", + args=(), + exc_info=None, + ) + + handler.emit(record) + + assert handler.get_counts()["critical"] == 1 + + def test_counts_multiple_levels(self): + """Test counting messages at different levels.""" + handler = LogCounterHandler() + + levels = [ + logging.DEBUG, + logging.DEBUG, + logging.INFO, + logging.WARNING, + logging.ERROR, + logging.ERROR, + logging.ERROR, + logging.CRITICAL, + ] + + for level in levels: + record = logging.LogRecord( + name="test", + level=level, + pathname="", + lineno=0, + msg="test", + args=(), + exc_info=None, + ) + handler.emit(record) + + counts = handler.get_counts() + assert counts["debug"] == 2 + assert counts["info"] == 1 + assert counts["warning"] == 1 + assert counts["error"] == 3 + assert counts["critical"] == 1 + + +class TestBlockedCodes: + """Test BLOCKED_CODES constant.""" + + def test_blocked_codes_contains_expected_values(self): + """Test that BLOCKED_CODES contains expected HTTP status codes.""" + assert 401 in BLOCKED_CODES # Unauthorized + assert 403 in BLOCKED_CODES # Forbidden + assert 407 in BLOCKED_CODES # Proxy Authentication Required + assert 429 in BLOCKED_CODES # Too Many Requests + assert 444 in BLOCKED_CODES # Connection Closed Without Response (nginx) + assert 500 in BLOCKED_CODES # Internal Server Error + assert 502 in BLOCKED_CODES # Bad Gateway + assert 503 in BLOCKED_CODES # Service Unavailable + assert 504 in BLOCKED_CODES # Gateway Timeout + + def test_blocked_codes_does_not_contain_success(self): + """Test that success codes are not blocked.""" + assert 200 not in BLOCKED_CODES + assert 201 not in BLOCKED_CODES + assert 204 not in BLOCKED_CODES + assert 301 not in BLOCKED_CODES + assert 302 not in BLOCKED_CODES + + +class ConcreteSpider(Spider): + """Concrete spider implementation for testing.""" + + name = "test_spider" + start_urls = ["https://example.com"] + + async def parse(self, response) -> 
AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield {"url": str(response)} + + +class TestSpiderInit: + """Test Spider initialization.""" + + def test_spider_requires_name(self): + """Test that spider without name raises ValueError.""" + + class NoNameSpider(Spider): + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + with pytest.raises(ValueError, match="must have a name"): + NoNameSpider() + + def test_spider_initializes_logger(self): + """Test that spider creates a logger.""" + spider = ConcreteSpider() + + assert spider.logger is not None + assert spider.logger.name == "scrapling.spiders.test_spider" + + def test_spider_logger_has_log_counter(self): + """Test that spider logger has log counter handler.""" + spider = ConcreteSpider() + + assert spider._log_counter is not None + assert isinstance(spider._log_counter, LogCounterHandler) + + def test_spider_with_crawldir(self): + """Test spider initialization with crawldir.""" + with tempfile.TemporaryDirectory() as tmpdir: + spider = ConcreteSpider(crawldir=tmpdir) + + assert spider.crawldir == Path(tmpdir) + + def test_spider_without_crawldir(self): + """Test spider initialization without crawldir.""" + spider = ConcreteSpider() + + assert spider.crawldir is None + + def test_spider_custom_interval(self): + """Test spider with custom checkpoint interval.""" + spider = ConcreteSpider(interval=60.0) + + assert spider._interval == 60.0 + + def test_spider_default_interval(self): + """Test spider has default checkpoint interval.""" + spider = ConcreteSpider() + + assert spider._interval == 300.0 + + def test_spider_repr(self): + """Test spider string representation.""" + spider = ConcreteSpider() + + repr_str = repr(spider) + + assert "ConcreteSpider" in repr_str + assert "test_spider" in repr_str + + +class TestSpiderClassAttributes: + """Test Spider class attribute defaults.""" + + def test_default_concurrent_requests(self): + """Test default 
concurrent_requests is 4.""" + assert ConcreteSpider.concurrent_requests == 4 + + def test_default_concurrent_requests_per_domain(self): + """Test default concurrent_requests_per_domain is 0 (disabled).""" + assert ConcreteSpider.concurrent_requests_per_domain == 0 + + def test_default_download_delay(self): + """Test default download_delay is 0.""" + assert ConcreteSpider.download_delay == 0.0 + + def test_default_max_blocked_retries(self): + """Test default max_blocked_retries is 3.""" + assert ConcreteSpider.max_blocked_retries == 3 + + def test_default_logging_level(self): + """Test default logging level is DEBUG.""" + assert ConcreteSpider.logging_level == logging.DEBUG + + def test_default_allowed_domains_empty(self): + """Test default allowed_domains is empty set.""" + assert ConcreteSpider.allowed_domains == set() + + +class TestSpiderSessionConfiguration: + """Test Spider session configuration.""" + + def test_default_configure_sessions(self): + """Test that default configure_sessions adds a session.""" + spider = ConcreteSpider() + + assert len(spider._session_manager) > 0 + + def test_configure_sessions_error_raises_custom_exception(self): + """Test that errors in configure_sessions raise SessionConfigurationError.""" + + class BadSessionSpider(Spider): + name = "bad_spider" + + def configure_sessions(self, manager: SessionManager) -> None: + raise RuntimeError("Configuration failed!") + + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + with pytest.raises(SessionConfigurationError, match="Configuration failed"): + BadSessionSpider() + + def test_configure_sessions_no_sessions_raises(self): + """Test that not adding any sessions raises SessionConfigurationError.""" + + class NoSessionSpider(Spider): + name = "no_session_spider" + + def configure_sessions(self, manager: SessionManager) -> None: + pass # Don't add any sessions + + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | 
Request | None, None]: + yield None + + with pytest.raises(SessionConfigurationError, match="did not add any sessions"): + NoSessionSpider() + + +class TestSpiderStartRequests: + """Test Spider start_requests method.""" + + @pytest.mark.asyncio + async def test_start_requests_yields_from_start_urls(self): + """Test that start_requests yields requests for start_urls.""" + + class MultiUrlSpider(Spider): + name = "multi_url" + start_urls = [ + "https://example.com/1", + "https://example.com/2", + "https://example.com/3", + ] + + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + spider = MultiUrlSpider() + requests = [r async for r in spider.start_requests()] + + assert len(requests) == 3 + assert requests[0].url == "https://example.com/1" + assert requests[1].url == "https://example.com/2" + assert requests[2].url == "https://example.com/3" + + @pytest.mark.asyncio + async def test_start_requests_no_urls_raises(self): + """Test that start_requests raises when no start_urls.""" + + class NoUrlSpider(Spider): + name = "no_url" + start_urls = [] + + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + spider = NoUrlSpider() + + with pytest.raises(RuntimeError, match="no starting point"): + async for _ in spider.start_requests(): + pass + + @pytest.mark.asyncio + async def test_start_requests_uses_default_session(self): + """Test that start_requests uses default session ID.""" + spider = ConcreteSpider() + requests = [r async for r in spider.start_requests()] + + # Should use the default session from session manager + default_sid = spider._session_manager.default_session_id + assert requests[0].sid == default_sid + + +class TestSpiderHooks: + """Test Spider lifecycle hooks.""" + + @pytest.mark.asyncio + async def test_on_start_default(self): + """Test default on_start doesn't raise.""" + spider = ConcreteSpider() + + # Should not raise + await 
spider.on_start(resuming=False) + await spider.on_start(resuming=True) + + @pytest.mark.asyncio + async def test_on_close_default(self): + """Test default on_close doesn't raise.""" + spider = ConcreteSpider() + + # Should not raise + await spider.on_close() + + @pytest.mark.asyncio + async def test_on_error_default(self): + """Test default on_error logs the error.""" + spider = ConcreteSpider() + request = Request("https://example.com") + error = ValueError("test error") + + # Should not raise + await spider.on_error(request, error) + + @pytest.mark.asyncio + async def test_on_scraped_item_default_returns_item(self): + """Test default on_scraped_item returns the item unchanged.""" + spider = ConcreteSpider() + item = {"key": "value", "nested": {"a": 1}} + + result = await spider.on_scraped_item(item) + + assert result == item + + @pytest.mark.asyncio + async def test_is_blocked_default_checks_status_codes(self): + """Test default is_blocked checks blocked status codes.""" + + class MockResponse: + def __init__(self, status: int): + self.status = status + + spider = ConcreteSpider() + + # Test blocked codes + assert await spider.is_blocked(MockResponse(403)) is True + assert await spider.is_blocked(MockResponse(429)) is True + assert await spider.is_blocked(MockResponse(503)) is True + + # Test non-blocked codes + assert await spider.is_blocked(MockResponse(200)) is False + assert await spider.is_blocked(MockResponse(404)) is False + + @pytest.mark.asyncio + async def test_retry_blocked_request_default_returns_request(self): + """Test default retry_blocked_request returns the request unchanged.""" + + class MockResponse: + status = 429 + + spider = ConcreteSpider() + request = Request("https://example.com", priority=5) + + result = await spider.retry_blocked_request(request, MockResponse()) + + assert result is request + + +class TestSpiderPause: + """Test Spider pause functionality.""" + + def test_pause_without_engine_raises(self): + """Test that pause without 
active engine raises RuntimeError.""" + spider = ConcreteSpider() + + with pytest.raises(RuntimeError, match="No active crawl to stop"): + spider.pause() + + +class TestSpiderStats: + """Test Spider stats property.""" + + def test_stats_without_engine_raises(self): + """Test that accessing stats without active crawl raises.""" + spider = ConcreteSpider() + + with pytest.raises(RuntimeError, match="No active crawl"): + _ = spider.stats + + +class TestSpiderCustomization: + """Test Spider customization patterns.""" + + def test_custom_concurrent_requests(self): + """Test spider with custom concurrent_requests.""" + + class CustomSpider(Spider): + name = "custom" + concurrent_requests = 32 + start_urls = ["https://example.com"] + + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + spider = CustomSpider() + assert spider.concurrent_requests == 32 + + def test_custom_allowed_domains(self): + """Test spider with allowed_domains.""" + + class DomainSpider(Spider): + name = "domain_spider" + start_urls = ["https://example.com"] + allowed_domains = {"example.com", "api.example.com"} + + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + spider = DomainSpider() + assert "example.com" in spider.allowed_domains + assert "api.example.com" in spider.allowed_domains + + def test_custom_download_delay(self): + """Test spider with download delay.""" + + class SlowSpider(Spider): + name = "slow" + download_delay = 1.5 + start_urls = ["https://example.com"] + + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + spider = SlowSpider() + assert spider.download_delay == 1.5 + + +class TestSpiderLogging: + """Test Spider logging configuration.""" + + def test_custom_logging_level(self): + """Test spider with custom logging level.""" + + class QuietSpider(Spider): + name = "quiet" + logging_level = logging.WARNING + 
start_urls = ["https://example.com"] + + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + spider = QuietSpider() + assert spider.logger.level == logging.WARNING + + def test_log_file_creates_handler(self): + """Test spider with log file creates file handler.""" + with tempfile.TemporaryDirectory() as tmpdir: + log_path = Path(tmpdir) / "spider.log" + + class FileLogSpider(Spider): + name = "file_log" + log_file = str(log_path) + start_urls = ["https://example.com"] + + async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]: + yield None + + spider = FileLogSpider() + + # Should have a file handler + file_handlers = [ + h for h in spider.logger.handlers if isinstance(h, logging.FileHandler) + ] + assert len(file_handlers) == 1 + + # Clean up + for h in file_handlers: + h.close() + + def test_logger_does_not_propagate(self): + """Test that spider logger does not propagate to parent.""" + spider = ConcreteSpider() + + assert spider.logger.propagate is False + + +class TestSessionConfigurationError: + """Test SessionConfigurationError exception.""" + + def test_exception_message(self): + """Test that exception preserves message.""" + error = SessionConfigurationError("Custom error message") + + assert str(error) == "Custom error message" + + def test_exception_is_exception(self): + """Test that it's a proper exception.""" + error = SessionConfigurationError("test") + + assert isinstance(error, Exception) diff --git a/zensical.toml b/zensical.toml new file mode 100644 index 0000000000000000000000000000000000000000..3c8d248fdbdd221f06bdfa66afe2a5eab8b828c4 --- /dev/null +++ b/zensical.toml @@ -0,0 +1,229 @@ +[project] +site_name = "Scrapling" +site_description = "Scrapling - Effortless Web Scraping for the Modern Web!" 
+site_author = "Karim Shoair" +repo_url = "https://github.com/D4Vinci/Scrapling" +site_url = "https://scrapling.readthedocs.io/en/latest/" +repo_name = "D4Vinci/Scrapling" +copyright = "Copyright © 2025 Karim Shoair - Change cookie settings" +docs_dir = "docs" +use_directory_urls = false +exclude_docs = """ +README*.md +""" +extra_css = ["stylesheets/extra.css"] + +nav = [ + {Introduction = "index.md"}, + {Overview = "overview.md"}, + {"What's New in v0.3" = "https://github.com/D4Vinci/Scrapling/releases/tag/v0.3"}, + {"What's New in v0.4" = "https://github.com/D4Vinci/Scrapling/releases/tag/v0.4"}, + {"Performance Benchmarks" = "benchmarks.md"}, + {"User Guide" = [ + {Parsing = [ + {"Querying elements" = "parsing/selection.md"}, + {"Main classes" = "parsing/main_classes.md"}, + {"Adaptive scraping" = "parsing/adaptive.md"} + ]}, + {Fetching = [ + {"Fetchers basics" = "fetching/choosing.md"}, + {"HTTP requests" = "fetching/static.md"}, + {"Dynamic websites" = "fetching/dynamic.md"}, + {"Dynamic websites with hard protections" = "fetching/stealthy.md"} + ]}, + {Spiders = [ + {"Architecture" = "spiders/architecture.md"}, + {"Getting started" = "spiders/getting-started.md"}, + {"Requests & Responses" = "spiders/requests-responses.md"}, + {"Sessions" = "spiders/sessions.md"}, + {"Proxy management & Blocking" = "spiders/proxy-blocking.md"}, + {"Advanced features" = "spiders/advanced.md"} + ]}, + {"Command Line Interface" = [ + {Overview = "cli/overview.md"}, + {"Interactive shell" = "cli/interactive-shell.md"}, + {"Extract commands" = "cli/extract-commands.md"} + ]}, + {Integrations = [ + {"AI MCP server" = "ai/mcp-server.md"} + ]} + ]}, + {Tutorials = [ + {"A Free Alternative to AI for Robust Web Scraping" = "tutorials/replacing_ai.md"}, + {"Migrating from BeautifulSoup" = "tutorials/migrating_from_beautifulsoup.md"}, + {"Using Scrapeless browser" = "tutorials/external.md"} + ]}, + {Development = [ + {"API Reference" = [ + {Selector = "api-reference/selector.md"}, + 
{Fetchers = "api-reference/fetchers.md"}, + {"MCP Server" = "api-reference/mcp-server.md"}, + {"Custom Types" = "api-reference/custom-types.md"}, + {Response = "api-reference/response.md"}, + {Spiders = "api-reference/spiders.md"}, + {"Proxy Rotation" = "api-reference/proxy-rotation.md"} + ]}, + {"Writing your retrieval system" = "development/adaptive_storage_system.md"}, + {"Using Scrapling's custom types" = "development/scrapling_custom_types.md"} + ]}, + {"Support and Advertisement" = "donate.md"}, + {Contributing = "https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md"}, + {Changelog = "https://github.com/D4Vinci/Scrapling/releases"} +] + +[project.theme] +language = "en" +logo = "assets/logo.png" +favicon = "assets/favicon.ico" +features = [ + "navigation.path", + "announce.dismiss", + "navigation.top", + "navigation.footer", + "navigation.indexes", + "navigation.sections", + "navigation.tracking", + "navigation.instant", + "navigation.instant.prefetch", + "navigation.instant.progress", +# "navigation.tabs", +# "navigation.expand", +# "toc.integrate", + "search.share", + "search.suggest", + "search.highlight", +] + +[[project.theme.palette]] +media = "(prefers-color-scheme: light)" +scheme = "default" +accent = "green" +primary = "deep purple" +toggle.icon = "lucide/sun" +toggle.name = "Switch to dark mode" + +[[project.theme.palette]] +media = "(prefers-color-scheme: dark)" +scheme = "slate" +accent = "light green" +primary = "deep purple" +toggle.icon = "lucide/moon" +toggle.name = "Switch to light mode" + +# Uncomment if needed: +# [project.theme.font] +# text = "Open Sans" +# code = "JetBrains Mono" + +[project.markdown_extensions.pymdownx.caret] +[project.markdown_extensions.pymdownx.mark] +[project.markdown_extensions.pymdownx.tilde] +[project.markdown_extensions.admonition] +[project.markdown_extensions.abbr] +#[project.markdown_extensions.mkautodoc] +[project.markdown_extensions.pymdownx.details] 
+[project.markdown_extensions.pymdownx.superfences] +custom_fences = [ + {name = "mermaid", class = "mermaid", format = "pymdownx.superfences.fence_code_format"} +] +[project.markdown_extensions.pymdownx.inlinehilite] +[project.markdown_extensions.pymdownx.snippets] +[project.markdown_extensions.tables] + +[project.markdown_extensions.pymdownx.emoji] +emoji_index = "zensical.extensions.emoji.twemoji" +emoji_generator = "zensical.extensions.emoji.to_svg" + +[project.markdown_extensions.pymdownx.highlight] +pygments_lang_class = true +anchor_linenums = true +line_spans = "__span" + +[project.markdown_extensions.pymdownx.tabbed] +alternate_style = true + +[project.markdown_extensions.codehilite] +css_class = "highlight" + +[project.markdown_extensions.toc] +title = "On this page" +permalink = true +toc_depth = 3 + +[project.plugins.mkdocstrings.handlers.python] +inventories = ["https://docs.python.org/3/objects.inv"] +paths = ["scrapling"] + +[project.plugins.mkdocstrings.handlers.python.options] +docstring_style = "sphinx" +show_source = true +show_root_heading = true +show_if_no_docstring = true +inherited_members = true +members_order = "source" +separate_signature = true +unwrap_annotated = true +filters = "public" +merge_init_into_class = true +docstring_section_style = "spacy" +signature_crossrefs = true +show_symbol_type_heading = true +show_symbol_type_toc = true +show_inheritance_diagram = true +modernize_annotations = true +extensions = [ + "griffe_runtime_objects", + "griffe_sphinx", + {griffe_inherited_docstrings = {merge = true}} +] + +[[project.extra.social]] +icon = "fontawesome/brands/github" +link = "https://github.com/D4Vinci/Scrapling" + +[[project.extra.social]] +icon = "fontawesome/brands/x-twitter" +link = "https://x.com/Scrapling_dev" + +[[project.extra.social]] +icon = "fontawesome/brands/discord" +link = "https://discord.gg/EMgGbDceNQ" + +[[project.extra.social]] +icon = "fontawesome/brands/python" +link = "https://pypi.org/project/scrapling/" 
+ +[[project.extra.social]] +icon = "fontawesome/brands/docker" +link = "https://hub.docker.com/r/pyd4vinci/scrapling" + +[project.extra.analytics] +provider = "google" +property = "G-CS3DKLY73Z" + +[project.extra.analytics.feedback] +title = "Was this page helpful?" + +[[project.extra.analytics.feedback.ratings]] +icon = "material/heart" +name = "This page was helpful" +data = 1 +note = "Thanks for your feedback!" + +[[project.extra.analytics.feedback.ratings]] +icon = "material/heart-broken" +name = "This page could be improved" +data = 0 +note = """ +Thanks for your feedback! Help us improve this page by +opening a documentation issue. +""" + +[project.extra.consent] +title = "Cookie consent" +description = """ +We use cookies to recognize your repeated visits and preferences, as well +as to measure the effectiveness of our documentation and whether users +find what they're searching for. With your consent, you're helping us to +make our documentation better. +""" \ No newline at end of file