# Browser Configuration

Crawl4AI supports multiple browser engines and offers extensive configuration options for browser behavior.

## Browser Types

Choose from three browser engines:

```python
# Chromium (default)
async with AsyncWebCrawler(browser_type="chromium") as crawler:
    result = await crawler.arun(url="https://example.com")

# Firefox
async with AsyncWebCrawler(browser_type="firefox") as crawler:
    result = await crawler.arun(url="https://example.com")

# WebKit
async with AsyncWebCrawler(browser_type="webkit") as crawler:
    result = await crawler.arun(url="https://example.com")
```

## Basic Configuration

Common browser settings:

```python
async with AsyncWebCrawler(
    headless=True,           # Run in headless mode (no GUI)
    verbose=True,           # Enable detailed logging
    sleep_on_close=False    # No delay when closing browser
) as crawler:
    result = await crawler.arun(url="https://example.com")
```

## Identity Management

Control how your crawler appears to websites:

```python
# Custom user agent
async with AsyncWebCrawler(
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
) as crawler:
    result = await crawler.arun(url="https://example.com")

# Custom headers
headers = {
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "no-cache"
}
async with AsyncWebCrawler(headers=headers) as crawler:
    result = await crawler.arun(url="https://example.com")
```

## Screenshot Capabilities

Capture page screenshots with enhanced error handling:

```python
result = await crawler.arun(
    url="https://example.com",
    screenshot=True,                # Enable screenshot
    screenshot_wait_for=2.0        # Wait 2 seconds before capture
)

if result.screenshot:  # Base64 encoded image
    import base64
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))
```

## Timeouts and Waiting

Control page loading behavior:

```python
result = await crawler.arun(
    url="https://example.com",
    page_timeout=60000,              # Page load timeout (ms)
    delay_before_return_html=2.0,    # Wait before content capture
    wait_for="css:.dynamic-content"  # Wait for specific element
)
```

## JavaScript Execution

Execute custom JavaScript before crawling:

```python
# Single JavaScript command
result = await crawler.arun(
    url="https://example.com",
    js_code="window.scrollTo(0, document.body.scrollHeight);"
)

# Multiple commands
js_commands = [
    "window.scrollTo(0, document.body.scrollHeight);",
    "document.querySelector('.load-more').click();"
]
result = await crawler.arun(
    url="https://example.com",
    js_code=js_commands
)
```

## Proxy Configuration

Use proxies for enhanced access:

```python
# Simple proxy
async with AsyncWebCrawler(
    proxy="http://proxy.example.com:8080"
) as crawler:
    result = await crawler.arun(url="https://example.com")

# Proxy with authentication
proxy_config = {
    "server": "http://proxy.example.com:8080",
    "username": "user",
    "password": "pass"
}
async with AsyncWebCrawler(proxy_config=proxy_config) as crawler:
    result = await crawler.arun(url="https://example.com")
```

## Anti-Detection Features

Enable stealth features to avoid bot detection:

```python
result = await crawler.arun(
    url="https://example.com",
    simulate_user=True,        # Simulate human behavior
    override_navigator=True,   # Mask automation signals
    magic=True               # Enable all anti-detection features
)
```

## Handling Dynamic Content

Configure browser to handle dynamic content:

```python
# Wait for dynamic content
result = await crawler.arun(
    url="https://example.com",
    wait_for="js:() => document.querySelector('.content').children.length > 10",
    process_iframes=True     # Process iframe content
)

# Handle lazy-loaded images
result = await crawler.arun(
    url="https://example.com",
    js_code="window.scrollTo(0, document.body.scrollHeight);",
    delay_before_return_html=2.0  # Wait for images to load
)
```

## Comprehensive Example

Here's how to combine various browser configurations:

```python
async def crawl_with_advanced_config(url: str):
    async with AsyncWebCrawler(
        # Browser setup
        browser_type="chromium",
        headless=True,
        verbose=True,
        
        # Identity
        user_agent="Custom User Agent",
        headers={"Accept-Language": "en-US"},
        
        # Proxy setup
        proxy="http://proxy.example.com:8080"
    ) as crawler:
        result = await crawler.arun(
            url=url,
            # Content handling
            process_iframes=True,
            screenshot=True,
            
            # Timing
            page_timeout=60000,
            delay_before_return_html=2.0,
            
            # Anti-detection
            magic=True,
            simulate_user=True,
            
            # Dynamic content
            js_code=[
                "window.scrollTo(0, document.body.scrollHeight);",
                "document.querySelector('.load-more')?.click();"
            ],
            wait_for="css:.dynamic-content"
        )
        
        return {
            "content": result.markdown,
            "screenshot": result.screenshot,
            "success": result.success
        }
```

# Content Selection

Crawl4AI provides multiple ways to select and filter specific content from webpages. Learn how to precisely target the content you need.

## CSS Selectors

The simplest way to extract specific content:

```python
# Extract specific content using CSS selector
result = await crawler.arun(
    url="https://example.com",
    css_selector=".main-article"  # Target main article content
)

# Multiple selectors
result = await crawler.arun(
    url="https://example.com",
    css_selector="article h1, article .content"  # Target heading and content
)
```

## Content Filtering

Control what content is included or excluded:

```python
result = await crawler.arun(
    url="https://example.com",
    # Content thresholds
    word_count_threshold=10,        # Minimum words per block
    
    # Tag exclusions
    excluded_tags=['form', 'header', 'footer', 'nav'],
    
    # Link filtering
    exclude_external_links=True,    # Remove external links
    exclude_social_media_links=True,  # Remove social media links
    
    # Media filtering
    exclude_external_images=True   # Remove external images
)
```

## Iframe Content

Process content inside iframes:

```python
result = await crawler.arun(
    url="https://example.com",
    process_iframes=True,  # Extract iframe content
    remove_overlay_elements=True  # Remove popups/modals that might block iframes
)
```

## Structured Content Selection

### Using LLMs for Smart Selection

Use LLMs to intelligently extract specific types of content:

```python
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class ArticleContent(BaseModel):
    title: str
    main_points: List[str]
    conclusion: str

strategy = LLMExtractionStrategy(
    provider="ollama/nemotron",  # Works with any supported LLM
    schema=ArticleContent.schema(),
    instruction="Extract the main article title, key points, and conclusion"
)

result = await crawler.arun(
    url="https://example.com",
    extraction_strategy=strategy
)
article = json.loads(result.extracted_content)
```

### Pattern-Based Selection

For repeated content patterns (like product listings, news feeds):

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

schema = {
    "name": "News Articles",
    "baseSelector": "article.news-item",  # Repeated element
    "fields": [
        {"name": "headline", "selector": "h2", "type": "text"},
        {"name": "summary", "selector": ".summary", "type": "text"},
        {"name": "category", "selector": ".category", "type": "text"},
        {
            "name": "metadata",
            "type": "nested",
            "fields": [
                {"name": "author", "selector": ".author", "type": "text"},
                {"name": "date", "selector": ".date", "type": "text"}
            ]
        }
    ]
}

strategy = JsonCssExtractionStrategy(schema)
result = await crawler.arun(
    url="https://example.com",
    extraction_strategy=strategy
)
articles = json.loads(result.extracted_content)
```

## Domain-Based Filtering

Control content based on domains:

```python
result = await crawler.arun(
    url="https://example.com",
    exclude_domains=["ads.com", "tracker.com"],
    exclude_social_media_domains=["facebook.com", "twitter.com"],  # Custom social media domains to exclude
    exclude_social_media_links=True
)
```

## Media Selection

Select specific types of media:

```python
result = await crawler.arun(url="https://example.com")

# Access different media types
images = result.media["images"]  # List of image details
videos = result.media["videos"]  # List of video details
audios = result.media["audios"]  # List of audio details

# Image with metadata
for image in images:
    print(f"URL: {image['src']}")
    print(f"Alt text: {image['alt']}")
    print(f"Description: {image['desc']}")
    print(f"Relevance score: {image['score']}")
```

## Comprehensive Example

Here's how to combine different selection methods:

```python
async def extract_article_content(url: str):
    # Define structured extraction
    article_schema = {
        "name": "Article",
        "baseSelector": "article.main",
        "fields": [
            {"name": "title", "selector": "h1", "type": "text"},
            {"name": "content", "selector": ".content", "type": "text"}
        ]
    }
    
    # Define LLM extraction
    class ArticleAnalysis(BaseModel):
        key_points: List[str]
        sentiment: str
        category: str

    async with AsyncWebCrawler() as crawler:
        # Get structured content
        pattern_result = await crawler.arun(
            url=url,
            extraction_strategy=JsonCssExtractionStrategy(article_schema),
            word_count_threshold=10,
            excluded_tags=['nav', 'footer'],
            exclude_external_links=True
        )
        
        # Get semantic analysis
        analysis_result = await crawler.arun(
            url=url,
            extraction_strategy=LLMExtractionStrategy(
                provider="ollama/nemotron",
                schema=ArticleAnalysis.schema(),
                instruction="Analyze the article content"
            )
        )
        
        # Combine results
        return {
            "article": json.loads(pattern_result.extracted_content),
            "analysis": json.loads(analysis_result.extracted_content),
            "media": pattern_result.media
        }
```

# Installation 💻

Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package, use it with Docker, or run it as a local server.

## Option 1: Python Package Installation (Recommended)

Crawl4AI is now available on PyPI, making installation easier than ever. Choose the option that best fits your needs:

### Basic Installation

For basic web crawling and scraping tasks:

```bash
pip install crawl4ai
playwright install # Install Playwright dependencies
```

### Installation with PyTorch

For advanced text clustering (includes CosineSimilarity cluster strategy):

```bash
pip install crawl4ai[torch]
```

### Installation with Transformers

For text summarization and Hugging Face models:

```bash
pip install crawl4ai[transformer]
```

### Full Installation

For all features:

```bash
pip install crawl4ai[all]
```

### Development Installation

For contributors who plan to modify the source code:

```bash
git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
pip install -e ".[all]"
playwright install # Install Playwright dependencies
```

💡 After installation with "torch", "transformer", or "all" options, it's recommended to run the following CLI command to load the required models:

```bash
crawl4ai-download-models
```

This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation.

## Option 2: Using Docker (Coming Soon)

Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems.

## Option 3: Local Server Installation

For those who prefer to run Crawl4AI as a local server, instructions will be provided once the Docker implementation is complete.

## Verifying Your Installation

After installation, you can verify that Crawl4AI is working correctly by running a simple Python script:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.example.com")
        print(result.markdown[:500])  # Print first 500 characters

if __name__ == "__main__":
    asyncio.run(main())
```

This script should successfully crawl the example website and print the first 500 characters of the extracted content.

## Getting Help

If you encounter any issues during installation or usage, please check the [documentation](https://crawl4ai.com/mkdocs/) or raise an issue on the [GitHub repository](https://github.com/unclecode/crawl4ai/issues).

Happy crawling! 🕷️🤖

# Output Formats

Crawl4AI provides multiple output formats to suit different needs, from raw HTML to structured data using LLM or pattern-based extraction.

## Basic Formats

```python
result = await crawler.arun(url="https://example.com")

# Access different formats
raw_html = result.html           # Original HTML
clean_html = result.cleaned_html # Sanitized HTML
markdown = result.markdown       # Standard markdown
fit_md = result.fit_markdown    # Most relevant content in markdown
```

## Raw HTML

Original, unmodified HTML from the webpage. Useful when you need to:
- Preserve the exact page structure
- Process HTML with your own tools
- Debug page issues

```python
result = await crawler.arun(url="https://example.com")
print(result.html)  # Complete HTML including headers, scripts, etc.
```

## Cleaned HTML

Sanitized HTML with unnecessary elements removed. Automatically:
- Removes scripts and styles
- Cleans up formatting
- Preserves semantic structure

```python
result = await crawler.arun(
    url="https://example.com",
    excluded_tags=['form', 'header', 'footer'],  # Additional tags to remove
    keep_data_attributes=False  # Remove data-* attributes
)
print(result.cleaned_html)
```

## Standard Markdown

HTML converted to clean markdown format. Great for:
- Content analysis
- Documentation
- Readability

```python
result = await crawler.arun(
    url="https://example.com",
    include_links_on_markdown=True  # Include links in markdown
)
print(result.markdown)
```

## Fit Markdown

Most relevant content extracted and converted to markdown. Ideal for:
- Article extraction
- Main content focus
- Removing boilerplate

```python
result = await crawler.arun(url="https://example.com")
print(result.fit_markdown)  # Only the main content
```

## Structured Data Extraction

Crawl4AI offers two powerful approaches for structured data extraction:

### 1. LLM-Based Extraction

Use any LLM (OpenAI, HuggingFace, Ollama, etc.) to extract structured data with high accuracy:

```python
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class KnowledgeGraph(BaseModel):
    entities: List[dict]
    relationships: List[dict]

strategy = LLMExtractionStrategy(
    provider="ollama/nemotron",  # or "openai/...", "huggingface/..."
    api_token="your-token",   # not needed for Ollama
    schema=KnowledgeGraph.schema(),
    instruction="Extract entities and relationships from the content"
)

result = await crawler.arun(
    url="https://example.com",
    extraction_strategy=strategy
)
knowledge_graph = json.loads(result.extracted_content)
```

### 2. Pattern-Based Extraction

For pages with repetitive patterns (e.g., product listings, article feeds), use JsonCssExtractionStrategy:

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

schema = {
    "name": "Product Listing",
    "baseSelector": ".product-card",  # Repeated element
    "fields": [
        {"name": "title", "selector": "h2", "type": "text"},
        {"name": "price", "selector": ".price", "type": "text"},
        {"name": "description", "selector": ".desc", "type": "text"}
    ]
}

strategy = JsonCssExtractionStrategy(schema)
result = await crawler.arun(
    url="https://example.com",
    extraction_strategy=strategy
)
products = json.loads(result.extracted_content)
```

## Content Customization

### HTML to Text Options

Configure markdown conversion:

```python
result = await crawler.arun(
    url="https://example.com",
    html2text={
        "escape_dot": False,
        "body_width": 0,
        "protect_links": True,
        "unicode_snob": True
    }
)
```

### Content Filters

Control what content is included:

```python
result = await crawler.arun(
    url="https://example.com",
    word_count_threshold=10,        # Minimum words per block
    exclude_external_links=True,    # Remove external links
    exclude_external_images=True,   # Remove external images
    excluded_tags=['form', 'nav']   # Remove specific HTML tags
)
```

## Comprehensive Example

Here's how to use multiple output formats together:

```python
async def crawl_content(url: str):
    async with AsyncWebCrawler() as crawler:
        # Extract main content with fit markdown
        result = await crawler.arun(
            url=url,
            word_count_threshold=10,
            exclude_external_links=True
        )
        
        # Get structured data using LLM
        llm_result = await crawler.arun(
            url=url,
            extraction_strategy=LLMExtractionStrategy(
                provider="ollama/nemotron",
                schema=YourSchema.schema(),
                instruction="Extract key information"
            )
        )
        
        # Get repeated patterns (if any)
        pattern_result = await crawler.arun(
            url=url,
            extraction_strategy=JsonCssExtractionStrategy(your_schema)
        )
        
        return {
            "main_content": result.fit_markdown,
            "structured_data": json.loads(llm_result.extracted_content),
            "pattern_data": json.loads(pattern_result.extracted_content),
            "media": result.media
        }
```

# Page Interaction

Crawl4AI provides powerful features for interacting with dynamic webpages, handling JavaScript execution, and managing page events.

## JavaScript Execution

### Basic Execution

```python
# Single JavaScript command
result = await crawler.arun(
    url="https://example.com",
    js_code="window.scrollTo(0, document.body.scrollHeight);"
)

# Multiple commands
js_commands = [
    "window.scrollTo(0, document.body.scrollHeight);",
    "document.querySelector('.load-more').click();",
    "document.querySelector('#consent-button').click();"
]
result = await crawler.arun(
    url="https://example.com",
    js_code=js_commands
)
```

## Wait Conditions

### CSS-Based Waiting

Wait for elements to appear:

```python
result = await crawler.arun(
    url="https://example.com",
    wait_for="css:.dynamic-content"  # Wait for element with class 'dynamic-content'
)
```

### JavaScript-Based Waiting

Wait for custom conditions:

```python
# Wait for number of elements
wait_condition = """() => {
    return document.querySelectorAll('.item').length > 10;
}"""

result = await crawler.arun(
    url="https://example.com",
    wait_for=f"js:{wait_condition}"
)

# Wait for dynamic content to load
wait_for_content = """() => {
    const content = document.querySelector('.content');
    return content && content.innerText.length > 100;
}"""

result = await crawler.arun(
    url="https://example.com",
    wait_for=f"js:{wait_for_content}"
)
```

## Handling Dynamic Content

### Load More Content

Handle infinite scroll or load more buttons:

```python
# Scroll and wait pattern
result = await crawler.arun(
    url="https://example.com",
    js_code=[
        # Scroll to bottom
        "window.scrollTo(0, document.body.scrollHeight);",
        # Click load more if exists
        "const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();"
    ],
    # Wait for new content
    wait_for="js:() => document.querySelectorAll('.item').length > previousCount"
)
```

### Form Interaction

Handle forms and inputs:

```python
js_form_interaction = """
    // Fill form fields
    document.querySelector('#search').value = 'search term';
    // Submit form
    document.querySelector('form').submit();
"""

result = await crawler.arun(
    url="https://example.com",
    js_code=js_form_interaction,
    wait_for="css:.results"  # Wait for results to load
)
```

## Timing Control

### Delays and Timeouts

Control timing of interactions:

```python
result = await crawler.arun(
    url="https://example.com",
    page_timeout=60000,              # Page load timeout (ms)
    delay_before_return_html=2.0,    # Wait before capturing content
)
```

## Complex Interactions Example

Here's an example of handling a dynamic page with multiple interactions:

```python
async def crawl_dynamic_content():
    async with AsyncWebCrawler() as crawler:
        # Initial page load
        result = await crawler.arun(
            url="https://example.com",
            # Handle cookie consent
            js_code="document.querySelector('.cookie-accept')?.click();",
            wait_for="css:.main-content"
        )

        # Load more content
        session_id = "dynamic_session"  # Keep session for multiple interactions
        
        for page in range(3):  # Load 3 pages of content
            result = await crawler.arun(
                url="https://example.com",
                session_id=session_id,
                js_code=[
                    # Scroll to bottom
                    "window.scrollTo(0, document.body.scrollHeight);",
                    # Store current item count
                    "window.previousCount = document.querySelectorAll('.item').length;",
                    # Click load more
                    "document.querySelector('.load-more')?.click();"
                ],
                # Wait for new items
                wait_for="""() => {
                    const currentCount = document.querySelectorAll('.item').length;
                    return currentCount > window.previousCount;
                }""",
                # Only execute JS without reloading page
                js_only=True if page > 0 else False
            )
            
            # Process content after each load
            print(f"Page {page + 1} items:", len(result.cleaned_html))
            
        # Clean up session
        await crawler.crawler_strategy.kill_session(session_id)
```

## Using with Extraction Strategies

Combine page interaction with structured extraction:

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy

# Pattern-based extraction after interaction
schema = {
    "name": "Dynamic Items",
    "baseSelector": ".item",
    "fields": [
        {"name": "title", "selector": "h2", "type": "text"},
        {"name": "description", "selector": ".desc", "type": "text"}
    ]
}

result = await crawler.arun(
    url="https://example.com",
    js_code="window.scrollTo(0, document.body.scrollHeight);",
    wait_for="css:.item:nth-child(10)",  # Wait for 10 items
    extraction_strategy=JsonCssExtractionStrategy(schema)
)

# Or use LLM to analyze dynamic content
class ContentAnalysis(BaseModel):
    topics: List[str]
    summary: str

result = await crawler.arun(
    url="https://example.com",
    js_code="document.querySelector('.show-more').click();",
    wait_for="css:.full-content",
    extraction_strategy=LLMExtractionStrategy(
        provider="ollama/nemotron",
        schema=ContentAnalysis.schema(),
        instruction="Analyze the full content"
    )
)
```

# Quick Start Guide 🚀

Welcome to the Crawl4AI Quickstart Guide! In this tutorial, we'll walk you through the basic usage of Crawl4AI with a friendly and humorous tone. We'll cover everything from basic usage to advanced features like chunking and extraction strategies, all with the power of asynchronous programming. Let's dive in! 🌟

## Getting Started 🛠️

First, let's import the necessary modules and create an instance of `AsyncWebCrawler`. We'll use an async context manager, which handles the setup and teardown of the crawler for us.

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # We'll add our crawling code here
        pass

if __name__ == "__main__":
    asyncio.run(main())
```

### Basic Usage

Simply provide a URL and let Crawl4AI do the magic!

```python
async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        print(f"Basic crawl result: {result.markdown[:500]}")  # Print first 500 characters

asyncio.run(main())
```

### Taking Screenshots 📸

Capture screenshots of web pages easily:

```python
async def capture_and_save_screenshot(url: str, output_path: str):
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            screenshot=True,
            bypass_cache=True
        )
        
        if result.success and result.screenshot:
            import base64
            screenshot_data = base64.b64decode(result.screenshot)
            with open(output_path, 'wb') as f:
                f.write(screenshot_data)
            print(f"Screenshot saved successfully to {output_path}")
        else:
            print("Failed to capture screenshot")
```

### Browser Selection 🌐

Crawl4AI supports multiple browser engines. Here's how to use different browsers:

```python
# Use Firefox
async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless=True) as crawler:
    result = await crawler.arun(url="https://www.example.com", bypass_cache=True)

# Use WebKit
async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless=True) as crawler:
    result = await crawler.arun(url="https://www.example.com", bypass_cache=True)

# Use Chromium (default)
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
    result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
```

### User Simulation 🎭

Simulate real user behavior to avoid detection:

```python
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
    result = await crawler.arun(
        url="YOUR-URL-HERE",
        bypass_cache=True,
        simulate_user=True,  # Causes random mouse movements and clicks
        override_navigator=True  # Makes the browser appear more like a real user
    )
```

### Understanding Parameters 🧠

By default, Crawl4AI caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.

```python
async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # First crawl (caches the result)
        result1 = await crawler.arun(url="https://www.nbcnews.com/business")
        print(f"First crawl result: {result1.markdown[:100]}...")

        # Force to crawl again
        result2 = await crawler.arun(url="https://www.nbcnews.com/business", bypass_cache=True)
        print(f"Second crawl result: {result2.markdown[:100]}...")

asyncio.run(main())
```

### Adding a Chunking Strategy 🧩

Let's add a chunking strategy: `RegexChunking`! This strategy splits the text based on a given regex pattern.

```python
from crawl4ai.chunking_strategy import RegexChunking

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            chunking_strategy=RegexChunking(patterns=["\n\n"])
        )
        print(f"RegexChunking result: {result.extracted_content[:200]}...")

asyncio.run(main())
```

### Using LLMExtractionStrategy with Different Providers 🤖

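Crawl4AI supports multiple LLM providers for extraction. The calls below use a helper coroutine, `extract_structured_data_using_llm`, that is not defined in this guide; it is assumed to build an `LLMExtractionStrategy` for the given provider and run the crawl: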

```python
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field

class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

# OpenAI
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))

# Hugging Face
await extract_structured_data_using_llm(
    "huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", 
    os.getenv("HUGGINGFACE_API_KEY")
)

# Ollama
await extract_structured_data_using_llm("ollama/llama3.2")

# With custom headers
custom_headers = {
    "Authorization": "Bearer your-custom-token",
    "X-Custom-Header": "Some-Value"
}
await extract_structured_data_using_llm(extra_headers=custom_headers)
```

### Knowledge Graph Generation 🕸️

Generate knowledge graphs from web content:

```python
from pydantic import BaseModel
from typing import List

class Entity(BaseModel):
    name: str
    description: str
    
class Relationship(BaseModel):
    entity1: Entity
    entity2: Entity
    description: str
    relation_type: str

class KnowledgeGraph(BaseModel):
    entities: List[Entity]
    relationships: List[Relationship]

extraction_strategy = LLMExtractionStrategy(
    provider='openai/gpt-4o-mini',
    api_token=os.getenv('OPENAI_API_KEY'),
    schema=KnowledgeGraph.model_json_schema(),
    extraction_type="schema",
    instruction="Extract entities and relationships from the given text."
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://paulgraham.com/love.html",
        bypass_cache=True,
        extraction_strategy=extraction_strategy
    )
```

### Advanced Session-Based Crawling with Dynamic Content 🔄

For modern web applications with dynamic content loading, here's how to handle pagination and content updates:

```python
async def crawl_dynamic_content():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        
        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        wait_for = """() => {
            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
            if (commits.length === 0) return false;
            const firstCommit = commits[0].textContent.trim();
            return firstCommit !== window.firstCommit;
        }"""
        
        schema = {
            "name": "Commit Extractor",
            "baseSelector": "li.Box-sc-g0xbh4-0",
            "fields": [
                {
                    "name": "title",
                    "selector": "h4.markdown-title",
                    "type": "text",
                    "transform": "strip",
                },
            ],
        }
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page if page > 0 else None,
                wait_for=wait_for if page > 0 else None,
                js_only=page > 0,
                bypass_cache=True,
                headless=False,
            )

        await crawler.crawler_strategy.kill_session(session_id)
```

### Handling Overlays and Fitting Content 📏

Remove overlay elements and fit content appropriately:

```python
# headless=False runs the browser with a visible window (no headless mode).
async with AsyncWebCrawler(headless=False) as crawler:
    result = await crawler.arun(
        url="your-url-here",  # Placeholder: replace with the page to crawl
        bypass_cache=True,  # Do not reuse a cached result
        word_count_threshold=10,  # Minimum words per content block
        remove_overlay_elements=True,  # Remove popups/modals
        screenshot=True  # Enable screenshot
    )
```

## Performance Comparison 🏎️

Crawl4AI offers impressive performance compared to other solutions:

```python
# Firecrawl comparison
from firecrawl import FirecrawlApp
app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
start = time.time()
scrape_status = app.scrape_url(
    'https://www.nbcnews.com/business',
    params={'formats': ['markdown', 'html']}
)
end = time.time()

# Crawl4AI comparison
async with AsyncWebCrawler() as crawler:
    start = time.time()
    result = await crawler.arun(
        url="https://www.nbcnews.com/business",
        word_count_threshold=0,
        bypass_cache=True,
        verbose=False,
    )
    end = time.time()
```

Note: Performance comparisons should be conducted in environments with stable and fast internet connections for accurate results.

## Congratulations! 🎉

You've made it through the updated Crawl4AI Quickstart Guide! Now you're equipped with even more powerful features to crawl the web asynchronously like a pro! 🕸️

Happy crawling! 🚀

# Simple Crawling

This guide covers the basics of web crawling with Crawl4AI. You'll learn how to set up a crawler, make your first request, and understand the response.

## Basic Usage

Here's the simplest way to crawl a webpage:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    """Crawl https://example.com once and print the page as markdown."""
    # The crawler is used as an async context manager for the whole crawl.
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown)  # Print clean markdown content

if __name__ == "__main__":
    asyncio.run(main())
```

## Understanding the Response

The `arun()` method returns a `CrawlResult` object with several useful properties. Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details):

```python
result = await crawler.arun(url="https://example.com")

# Different content formats
print(result.html)         # Raw HTML
print(result.cleaned_html) # Cleaned HTML
print(result.markdown)     # Markdown version
print(result.fit_markdown) # Most relevant content in markdown

# Check success status
print(result.success)      # True if crawl succeeded
print(result.status_code)  # HTTP status code (e.g., 200, 404)

# Access extracted media and links
print(result.media)        # Dictionary of found media (images, videos, audio)
print(result.links)        # Dictionary of internal and external links
```

## Adding Basic Options

Customize your crawl with these common options:

```python
result = await crawler.arun(
    url="https://example.com",
    word_count_threshold=10,        # Minimum words per content block
    exclude_external_links=True,    # Remove external links
    remove_overlay_elements=True,   # Remove popups/modals
    process_iframes=True           # Process iframe content
)
```

## Handling Errors

Always check if the crawl was successful:

```python
result = await crawler.arun(url="https://example.com")
if not result.success:
    print(f"Crawl failed: {result.error_message}")
    print(f"Status code: {result.status_code}")
```

## Logging and Debugging

Enable verbose mode for detailed logging:

```python
async with AsyncWebCrawler(verbose=True) as crawler:
    result = await crawler.arun(url="https://example.com")
```

## Complete Example

Here's a more comprehensive example showing common usage patterns:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    """Crawl example.com with common options and summarize the result.

    On success, prints the first 500 characters of the markdown, then the
    source of every image and the href of every internal link. On failure,
    prints the crawl's error message instead.
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            # Content filtering
            word_count_threshold=10,
            excluded_tags=['form', 'header'],
            exclude_external_links=True,
            # Content processing
            process_iframes=True,
            remove_overlay_elements=True,
            # Cache control
            bypass_cache=False,  # Use cache if available
        )

        # Handle the failure case first so the success path stays flat.
        if not result.success:
            print(f"Crawl failed: {result.error_message}")
            return

        # Clean content, truncated to the first 500 characters
        preview = result.markdown[:500]
        print("Content:", preview)

        # Images found on the page
        for image in result.media["images"]:
            print(f"Found image: {image['src']}")

        # Links that point back into the same site
        for link in result.links["internal"]:
            print(f"Internal link: {link['href']}")

if __name__ == "__main__":
    asyncio.run(main())
```
