| """ |
| Crawl4AI v0.4.24 Feature Walkthrough |
| =================================== |
| |
| This script demonstrates the new features introduced in Crawl4AI v0.4.24. |
| Each section includes detailed examples and explanations of the new capabilities. |
| """ |
|
|
| import asyncio |
| import os |
| import json |
| import re |
from typing import List, Optional
| from pydantic import BaseModel, Field |
| from crawl4ai import ( |
| AsyncWebCrawler, |
| BrowserConfig, |
| CrawlerRunConfig, |
| CacheMode, |
| LLMExtractionStrategy, |
| JsonCssExtractionStrategy |
| ) |
| from crawl4ai.content_filter_strategy import RelevantContentFilter |
| from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator |
| from bs4 import BeautifulSoup |
|
|
| |
| SAMPLE_HTML = """ |
<div class="article-list" data-list-id="blog-main">
    <article class="post" data-post-id="1" data-category="tech" data-author="john">
| <h2 class="title"><a href="/post-1">First Post</a></h2> |
| <div class="meta"> |
| <a href="/author/john" class="author">John Doe</a> |
| <span class="date">2023-12-31</span> |
| </div> |
| <div class="content"> |
| <p>First post content...</p> |
| <a href="/read-more-1" class="read-more">Read More</a> |
| </div> |
| </article> |
    <article class="post" data-post-id="2" data-category="science" data-author="jane">
| <h2 class="title"><a href="/post-2">Second Post</a></h2> |
| <div class="meta"> |
| <a href="/author/jane" class="author">Jane Smith</a> |
| <span class="date">2023-12-30</span> |
| </div> |
| <div class="content"> |
| <p>Second post content...</p> |
| <a href="/read-more-2" class="read-more">Read More</a> |
| </div> |
| </article> |
| </div> |
| """ |
|
|
| async def demo_ssl_features(): |
| """ |
| Enhanced SSL & Security Features Demo |
| ----------------------------------- |
| |
| This example demonstrates the new SSL certificate handling and security features: |
| 1. Custom certificate paths |
| 2. SSL verification options |
| 3. HTTPS error handling |
| 4. Certificate validation configurations |
| |
| These features are particularly useful when: |
| - Working with self-signed certificates |
| - Dealing with corporate proxies |
| - Handling mixed content websites |
| - Managing different SSL security levels |
| """ |
| print("\n1. Enhanced SSL & Security Demo") |
| print("--------------------------------") |
|
|
    # Default browser configuration is sufficient; certificate capture is a run option
    browser_config = BrowserConfig()

    # Enable SSL certificate capture for this run (new in v0.4.24)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        fetch_ssl_certificate=True
    )
|
|
| async with AsyncWebCrawler(config=browser_config) as crawler: |
| result = await crawler.arun( |
| url="https://example.com", |
| config=run_config |
| ) |
| print(f"SSL Crawl Success: {result.success}") |
| result.ssl_certificate.to_json( |
| os.path.join(os.getcwd(), "ssl_certificate.json") |
| ) |
| if not result.success: |
| print(f"SSL Error: {result.error_message}") |
|
|
| async def demo_content_filtering(): |
| """ |
| Smart Content Filtering Demo |
| ---------------------- |
| |
| Demonstrates advanced content filtering capabilities: |
| 1. Custom filter to identify and extract specific content |
| 2. Integration with markdown generation |
| 3. Flexible pruning rules |
| """ |
| print("\n2. Smart Content Filtering Demo") |
| print("--------------------------------") |
|
|
| |
| class CustomNewsFilter(RelevantContentFilter): |
| def __init__(self): |
| super().__init__() |
| |
            # Overriding the inherited pattern set customizes what the base
            # class's is_excluded() helper treats as boilerplate
            self.negative_patterns = re.compile(
                r'nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending',
                re.I
            )
| self.min_word_count = 30 |
|
|
        def filter_content(self, html: str, min_word_threshold: Optional[int] = None) -> List[str]:
| """ |
| Implements news-specific content filtering logic. |
| |
| Args: |
| html (str): HTML content to be filtered |
| min_word_threshold (int, optional): Minimum word count threshold |
| |
| Returns: |
| List[str]: List of filtered HTML content blocks |
| """ |
| if not html or not isinstance(html, str): |
| return [] |
| |
| soup = BeautifulSoup(html, 'lxml') |
| if not soup.body: |
| soup = BeautifulSoup(f'<body>{html}</body>', 'lxml') |
| |
| body = soup.find('body') |
| |
| |
            # Chunk the body into candidate text blocks using the base-class helper
            chunks = self.extract_text_chunks(body, min_word_threshold or self.min_word_count)

            filtered_chunks = []
            for _, text, tag_type, element in chunks:
                # Drop elements whose class/id matches the negative patterns
                if self.is_excluded(element):
                    continue

                # Keep headers unconditionally
                if tag_type == 'header':
                    filtered_chunks.append(self.clean_element(element))
                    continue

                # Enforce the minimum word count on body text
                text = element.get_text(strip=True)
                if len(text.split()) >= (min_word_threshold or self.min_word_count):
                    # Reject link-heavy blocks (menus, "related posts" lists, etc.)
                    links_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a'))
                    link_density = len(links_text) / len(text) if text else 1

                    if link_density < 0.5:
                        filtered_chunks.append(self.clean_element(element))

            return filtered_chunks
|
|
| |
    # Plug the custom filter into markdown generation
    markdown_gen = DefaultMarkdownGenerator(
        content_filter=CustomNewsFilter()
    )
|
|
| run_config = CrawlerRunConfig( |
| markdown_generator=markdown_gen, |
| cache_mode=CacheMode.BYPASS |
| ) |
|
|
| async with AsyncWebCrawler() as crawler: |
| result = await crawler.arun( |
| url="https://news.ycombinator.com", |
| config=run_config |
| ) |
| print("Filtered Content Sample:") |
| print(result.markdown[:500]) |
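

# Writing a RelevantContentFilter subclass is the fully manual route. Crawl4AI
# also ships ready-made filters that plug into the same content_filter slot.
# A minimal sketch, assuming PruningContentFilter and BM25ContentFilter are
# exposed by crawl4ai.content_filter_strategy in your installed version:
#
#     from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter
#
#     # Density-based pruning: drops sparse, link-heavy nodes
#     markdown_gen = DefaultMarkdownGenerator(
#         content_filter=PruningContentFilter(threshold=0.48)
#     )
#
#     # Query-relevance filtering: keeps chunks that score well against a query
#     markdown_gen = DefaultMarkdownGenerator(
#         content_filter=BM25ContentFilter(user_query="technology news")
#     )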
|
|
| async def demo_json_extraction(): |
| """ |
| Improved JSON Extraction Demo |
| --------------------------- |
| |
| Demonstrates the enhanced JSON extraction capabilities: |
| 1. Base element attributes extraction |
| 2. Complex nested structures |
| 3. Multiple extraction patterns |
| |
| Key features shown: |
| - Extracting attributes from base elements (href, data-* attributes) |
| - Processing repeated patterns |
| - Handling optional fields |
| """ |
| print("\n3. Improved JSON Extraction Demo") |
| print("--------------------------------") |
|
|
| |
    # Schema demonstrating baseFields: attribute extraction from the matched
    # base element itself, at both the list level and the per-post level
    json_strategy = JsonCssExtractionStrategy(
| schema={ |
| "name": "Blog Posts", |
| "baseSelector": "div.article-list", |
| "baseFields": [ |
| {"name": "list_id", "type": "attribute", "attribute": "data-list-id"}, |
| {"name": "category", "type": "attribute", "attribute": "data-category"} |
| ], |
| "fields": [ |
| { |
| "name": "posts", |
| "selector": "article.post", |
| "type": "nested_list", |
| "baseFields": [ |
| {"name": "post_id", "type": "attribute", "attribute": "data-post-id"}, |
| {"name": "author_id", "type": "attribute", "attribute": "data-author"} |
| ], |
| "fields": [ |
| { |
| "name": "title", |
| "selector": "h2.title a", |
| "type": "text", |
| "baseFields": [ |
| {"name": "url", "type": "attribute", "attribute": "href"} |
| ] |
| }, |
| { |
| "name": "author", |
| "selector": "div.meta a.author", |
| "type": "text", |
| "baseFields": [ |
| {"name": "profile_url", "type": "attribute", "attribute": "href"} |
| ] |
| }, |
| { |
| "name": "date", |
| "selector": "span.date", |
| "type": "text" |
| }, |
| { |
| "name": "read_more", |
| "selector": "a.read-more", |
| "type": "nested", |
| "fields": [ |
| {"name": "text", "type": "text"}, |
| {"name": "url", "type": "attribute", "attribute": "href"} |
| ] |
| } |
| ] |
| } |
| ] |
| } |
| ) |
|
|
| |
| run_config = CrawlerRunConfig( |
| extraction_strategy=json_strategy, |
| cache_mode=CacheMode.BYPASS |
| ) |
|
|
| async with AsyncWebCrawler() as crawler: |
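        # The "raw:" scheme tells the crawler to parse in-memory HTML instead of fetching a URL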
| result = await crawler.arun( |
| url="raw:" + SAMPLE_HTML, |
| config=run_config |
| ) |
| print("Extracted Content:") |
| print(result.extracted_content) |
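

# With SAMPLE_HTML above, result.extracted_content is a JSON string shaped
# roughly like this (abridged, and the exact nesting of per-field baseFields
# may differ by version):
#
#     [{
#         "list_id": "blog-main",
#         "posts": [
#             {"post_id": "1", "author_id": "john", "category": "tech",
#              "title": "First Post", "date": "2023-12-31", ...},
#             ...
#         ]
#     }]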
|
|
| async def demo_input_formats(): |
| """ |
| Input Format Handling Demo |
| ---------------------- |
| |
| Demonstrates how LLM extraction can work with different input formats: |
| 1. Markdown (default) - Good for simple text extraction |
| 2. HTML - Better when you need structure and attributes |
| |
| This example shows how HTML input can be beneficial when: |
| - You need to understand the DOM structure |
| - You want to extract both visible text and HTML attributes |
| - The content has complex layouts like tables or forms |
| """ |
| print("\n4. Input Format Handling Demo") |
| print("---------------------------") |
|
|
| |
| dummy_html = """ |
| <div class="job-posting" data-post-id="12345"> |
| <header class="job-header"> |
| <h1 class="job-title">Senior AI/ML Engineer</h1> |
| <div class="job-meta"> |
| <span class="department">AI Research Division</span> |
| <span class="location" data-remote="hybrid">San Francisco (Hybrid)</span> |
| </div> |
| <div class="salary-info" data-currency="USD"> |
| <span class="range">$150,000 - $220,000</span> |
| <span class="period">per year</span> |
| </div> |
| </header> |
| |
| <section class="requirements"> |
| <div class="technical-skills"> |
| <h3>Technical Requirements</h3> |
| <ul class="required-skills"> |
| <li class="skill required" data-priority="must-have"> |
| 5+ years experience in Machine Learning |
| </li> |
| <li class="skill required" data-priority="must-have"> |
| Proficiency in Python and PyTorch/TensorFlow |
| </li> |
| <li class="skill preferred" data-priority="nice-to-have"> |
| Experience with distributed training systems |
| </li> |
| </ul> |
| </div> |
| |
| <div class="soft-skills"> |
| <h3>Professional Skills</h3> |
| <ul class="required-skills"> |
| <li class="skill required" data-priority="must-have"> |
| Strong problem-solving abilities |
| </li> |
| <li class="skill preferred" data-priority="nice-to-have"> |
| Experience leading technical teams |
| </li> |
| </ul> |
| </div> |
| </section> |
| |
| <section class="timeline"> |
| <time class="deadline" datetime="2024-02-28"> |
| Application Deadline: February 28, 2024 |
| </time> |
| </section> |
| |
| <footer class="contact-section"> |
| <div class="hiring-manager"> |
| <h4>Hiring Manager</h4> |
| <div class="contact-info"> |
| <span class="name">Dr. Sarah Chen</span> |
| <span class="title">Director of AI Research</span> |
| <span class="email">ai.hiring@example.com</span> |
| </div> |
| </div> |
| <div class="team-info"> |
| <p>Join our team of 50+ researchers working on cutting-edge AI applications</p> |
| </div> |
| </footer> |
| </div> |
| """ |
| |
| |
| url = f"raw://{dummy_html}" |
|
|
|
|
| |
| class JobRequirement(BaseModel): |
| category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)") |
| items: List[str] = Field(description="List of specific requirements in this category") |
| priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context") |
|
|
    class JobPosting(BaseModel):
        title: str = Field(description="Job title")
        department: str = Field(description="Department or team")
        location: str = Field(description="Job location, including remote options")
        salary_range: Optional[str] = Field(None, description="Salary range if specified")
        requirements: List[JobRequirement] = Field(description="Categorized job requirements")
        application_deadline: Optional[str] = Field(None, description="Application deadline if specified")
        contact_info: Optional[dict] = Field(None, description="Contact information from footer or contact section")
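
    # JobPosting.model_json_schema() (passed to the strategies below) renders
    # these models as a JSON Schema dict: nested definitions for JobRequirement,
    # a "properties" entry per field, and non-Optional fields under "required",
    # which is what steers the LLM toward the right output shape.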
|
|
| |
    # Strategy 1: extract from the crawler's markdown rendering of the page
    markdown_strategy = LLMExtractionStrategy(
| provider="openai/gpt-4o", |
| api_token=os.getenv("OPENAI_API_KEY"), |
| schema=JobPosting.model_json_schema(), |
| extraction_type="schema", |
| instruction=""" |
| Extract job posting details into structured data. Focus on the visible text content |
| and organize requirements into categories. |
| """, |
| input_format="markdown" |
| ) |
|
|
| |
    # Strategy 2: feed raw HTML so attributes and CSS classes are visible to the LLM
    html_strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o",  # same model as above so the comparison isolates input_format
| api_token=os.getenv("OPENAI_API_KEY"), |
| schema=JobPosting.model_json_schema(), |
| extraction_type="schema", |
| instruction=""" |
| Extract job posting details, using HTML structure to: |
| 1. Identify requirement priorities from CSS classes (e.g., 'required' vs 'preferred') |
| 2. Extract contact info from the page footer or dedicated contact section |
| 3. Parse salary information from specially formatted elements |
| 4. Determine application deadline from timestamp or date elements |
| |
| Use HTML attributes and classes to enhance extraction accuracy. |
| """, |
| input_format="html" |
| ) |
|
|
| async with AsyncWebCrawler() as crawler: |
| |
| markdown_config = CrawlerRunConfig( |
| extraction_strategy=markdown_strategy |
| ) |
| markdown_result = await crawler.arun( |
| url=url, |
| config=markdown_config |
| ) |
| print("\nMarkdown-based Extraction Result:") |
| items = json.loads(markdown_result.extracted_content) |
| print(json.dumps(items, indent=2)) |
|
|
| |
| html_config = CrawlerRunConfig( |
| extraction_strategy=html_strategy |
| ) |
| html_result = await crawler.arun( |
| url=url, |
| config=html_config |
| ) |
| print("\nHTML-based Extraction Result:") |
| items = json.loads(html_result.extracted_content) |
| print(json.dumps(items, indent=2)) |
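

# LLM strategies accumulate token usage across calls. A hedged sketch, assuming
# the show_usage() helper is available on LLMExtractionStrategy in your
# installed version:
#
#     html_strategy.show_usage()  # prints prompt/completion token totals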
|
|
| |
| async def main(): |
| print("Crawl4AI v0.4.24 Feature Walkthrough") |
| print("====================================") |
|
|
| |
    await demo_ssl_features()
    await demo_content_filtering()
    await demo_json_extraction()
    # The LLM-based demo requires an OpenAI key; skip it gracefully when absent
    if os.getenv("OPENAI_API_KEY"):
        await demo_input_formats()
    else:
        print("\nSkipping Input Format Handling Demo (OPENAI_API_KEY not set)")
| |
|
|
| if __name__ == "__main__": |
| asyncio.run(main()) |
|
|