import os, sys

parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.chunking_strategy import RegexChunking
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

async def test_browser_config_object():
    """Test the new BrowserConfig object with various browser settings"""
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=False,
        viewport_width=1920,
        viewport_height=1080,
        use_managed_browser=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"}
    )

    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
        result = await crawler.arun('https://example.com', cache_mode=CacheMode.BYPASS)
        assert result.success, "Browser config crawl failed"
        assert len(result.html) > 0, "No HTML content retrieved"

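# A quick sanity check for the randomized user agent (a sketch, not part of
# the test; assumes an echo endpoint such as https://httpbin.org/headers that
# reflects request headers back in the response body):
#   result = await crawler.arun('https://httpbin.org/headers', cache_mode=CacheMode.BYPASS)
#   print(result.html)  # should contain the generated desktop/Windows UA string
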
async def test_browser_performance_config():
    """Test browser configurations focused on performance"""
    browser_config = BrowserConfig(
        text_mode=True,
        light_mode=True,
        extra_args=['--disable-gpu', '--disable-software-rasterizer'],
        ignore_https_errors=True,
        java_script_enabled=False
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun('https://example.com')
        assert result.success, "Performance optimized crawl failed"
        assert result.status_code == 200, "Unexpected status code"

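# Per the crawl4ai docs, text_mode skips loading images and other heavy
# assets, and light_mode turns off background browser features; both trade
# fidelity for speed. With java_script_enabled=False, pages that render
# their content client-side will come back mostly empty.
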
async def test_content_extraction_config():
    """Test content extraction with various strategies"""
    crawler_config = CrawlerRunConfig(
        word_count_threshold=300,
        extraction_strategy=JsonCssExtractionStrategy(
            schema={
                "name": "article",
                "baseSelector": "div",
                "fields": [{
                    "name": "title",
                    "selector": "h1",
                    "type": "text"
                }]
            }
        ),
        chunking_strategy=RegexChunking(),
        # The content filter is applied through the markdown generator.
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter()
        )
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            'https://example.com/article',
            config=crawler_config
        )
        assert result.extracted_content is not None, "Content extraction failed"
        assert 'title' in result.extracted_content, "Missing expected content field"

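# JsonCssExtractionStrategy returns extracted_content as a JSON string, so
# the substring assertion above is loose. A stricter check could parse it
# first (a sketch; assumes the strategy yields a list of objects):
#   import json
#   items = json.loads(result.extracted_content)
#   assert any("title" in item for item in items)
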
async def test_cache_and_session_management():
    """Test different cache modes and session handling"""
    browser_config = BrowserConfig(use_persistent_context=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.WRITE_ONLY,
        process_iframes=True,
        remove_overlay_elements=True
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First request: with WRITE_ONLY the page is fetched live and the
        # result is written to the cache.
        result1 = await crawler.arun(
            'https://example.com',
            config=crawler_config
        )

        # Second request: WRITE_ONLY never reads from the cache, so this
        # fetches live again; identical HTML is expected for a static page.
        result2 = await crawler.arun(
            'https://example.com',
            config=crawler_config
        )

        assert result1.success and result2.success, "Cache mode crawl failed"
        assert result1.html == result2.html, "Inconsistent results between requests"

async def test_media_handling_config():
    """Test configurations related to media handling"""
    # Ensure the download directory exists before the browser needs it.
    os.makedirs(os.path.expanduser("~/.crawl4ai/downloads"), exist_ok=True)
    browser_config = BrowserConfig(
        viewport_width=1920,
        viewport_height=1080,
        accept_downloads=True,
        downloads_path=os.path.expanduser("~/.crawl4ai/downloads")
    )
    crawler_config = CrawlerRunConfig(
        screenshot=True,
        pdf=True,
        adjust_viewport_to_content=True,
        wait_for_images=True,
        screenshot_height_threshold=20000
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            'https://example.com',
            config=crawler_config
        )
        assert result.screenshot is not None, "Screenshot capture failed"
        assert result.pdf is not None, "PDF generation failed"

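# Persisting the captured artifacts is straightforward; crawl4ai returns the
# screenshot base64-encoded while result.pdf is raw bytes (a sketch, not part
# of the assertions):
#   import base64
#   with open("page.png", "wb") as f:
#       f.write(base64.b64decode(result.screenshot))
#   with open("page.pdf", "wb") as f:
#       f.write(result.pdf)
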
async def test_antibot_config():
    """Test configurations for handling anti-bot measures"""
    crawler_config = CrawlerRunConfig(
        simulate_user=True,
        override_navigator=True,
        magic=True,
        wait_for="js:()=>document.querySelector('body')",
        delay_before_return_html=1.0,
        log_console=True,
        cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            'https://example.com',
            config=crawler_config
        )
        assert result.success, "Anti-bot measure handling failed"

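# magic=True is crawl4ai's umbrella flag for anti-detection handling, while
# simulate_user adds mouse/keyboard-like activity and override_navigator
# patches navigator properties that commonly betray automation. The "js:"
# wait_for predicate delays capture until the <body> element exists.
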
async def test_parallel_processing():
    """Test parallel processing capabilities"""
    crawler_config = CrawlerRunConfig(
        mean_delay=0.5,
        max_range=1.0,
        semaphore_count=5
    )

    urls = [
        'https://example.com/1',
        'https://example.com/2',
        'https://example.com/3'
    ]

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls,
            config=crawler_config
        )
        assert len(results) == len(urls), "Not all URLs were processed"
        assert all(r.success for r in results), "Some parallel requests failed"

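# semaphore_count caps how many pages arun_many fetches concurrently, while
# mean_delay/max_range introduce a randomized politeness pause between
# requests (roughly mean_delay plus up to max_range seconds).
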
async def test_legacy_parameter_support():
    """Test that legacy parameters still work"""
    async with AsyncWebCrawler(
        headless=True,
        browser_type="chromium",
        viewport_width=1024,
        viewport_height=768
    ) as crawler:
        result = await crawler.arun(
            'https://example.com',
            screenshot=True,
            word_count_threshold=200,
            bypass_cache=True,
            css_selector=".main-content"
        )
        assert result.success, "Legacy parameter support failed"

async def test_mixed_config_usage():
    """Test mixing new config objects with legacy parameters"""
    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(screenshot=True)

    async with AsyncWebCrawler(
        config=browser_config,
        verbose=True
    ) as crawler:
        result = await crawler.arun(
            'https://example.com',
            config=crawler_config,
            cache_mode=CacheMode.BYPASS,
            css_selector="body"
        )
        assert result.success, "Mixed configuration usage failed"

if __name__ == "__main__":
    async def run_tests():
        test_functions = [
            test_browser_config_object,
            test_browser_performance_config,
            test_content_extraction_config,
            test_cache_and_session_management,
            test_media_handling_config,
            test_antibot_config,
            test_parallel_processing,
            test_legacy_parameter_support,
            test_mixed_config_usage,
        ]

        for test in test_functions:
            print(f"\nRunning {test.__name__}...")
            try:
                await test()
                print(f"✓ {test.__name__} passed")
            except AssertionError as e:
                print(f"✗ {test.__name__} failed: {str(e)}")
            except Exception as e:
                print(f"✗ {test.__name__} error: {str(e)}")

    asyncio.run(run_tests())
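
# These coroutines can also be collected by pytest (a sketch, assuming the
# pytest-asyncio plugin is installed with asyncio_mode = "auto" so the bare
# async defs are picked up as test cases):
#   pytest -v <this file>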