import os
import sys

# Put the directory three levels above this file on the import path.
sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)

# SECURITY(review): this is a credential committed to source control. It should
# be rotated and removed; supply FIRECRAWL_API_KEY through the environment.
# setdefault (rather than plain assignment) keeps the previous fallback while
# no longer overwriting a key the user has already exported.
os.environ.setdefault("FIRECRAWL_API_KEY", "fc-84b370ccfad44beabc686b38f1769692")
| |
|
| | import asyncio |
| | |
| | |
| |
|
| | import time |
| | import json |
| | import os |
| | import re |
| | from typing import Dict, List |
| | from bs4 import BeautifulSoup |
| | from pydantic import BaseModel, Field |
| | from crawl4ai import AsyncWebCrawler, CacheMode |
| | from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator |
| | from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter |
| | from crawl4ai.extraction_strategy import ( |
| | JsonCssExtractionStrategy, |
| | LLMExtractionStrategy, |
| | ) |
| |
|
# Resolved directory of this script; the examples below write their output
# files relative to it.
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__))
)

# Startup banner: the same four lines, emitted with a single print call.
print(
    "\n".join(
        (
            "Crawl4AI: Advanced Web Crawling and Data Extraction",
            "GitHub Repository: https://github.com/unclecode/crawl4ai",
            "Twitter: @unclecode",
            "Website: https://crawl4ai.com",
        )
    )
)
| |
|
| |
|
async def simple_crawl():
    """Crawl the NBC News business page and print the first 500 markdown characters."""
    print("\n--- Basic Usage ---")
    target_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=target_url, cache_mode=CacheMode.BYPASS)
        preview = result.markdown[:500]
        print(preview)
| |
|
async def simple_example_with_running_js_code():
    """Crawl the NBC News business page after running a "Load More" click script.

    The injected script looks for a button whose text contains "Load More" and
    clicks it if one exists. The first 500 characters of the resulting markdown
    are printed.
    """
    print("\n--- Executing JavaScript and Using CSS Selectors ---")

    # `loadMoreButton && ...` makes the click a no-op when no such button exists.
    js_code = [
        "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
    ]

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            js_code=js_code,
            cache_mode=CacheMode.BYPASS,
        )
        print(result.markdown[:500])
| |
|
async def simple_example_with_css_selector():
    """Crawl the NBC News business page with a CSS selector and print a preview.

    The selector `.wide-tease-item__description` is passed to `arun`; the first
    500 characters of the returned markdown are printed.
    """
    print("\n--- Using CSS Selectors ---")
    crawl_kwargs = {
        "url": "https://www.nbcnews.com/business",
        "css_selector": ".wide-tease-item__description",
        "cache_mode": CacheMode.BYPASS,
    }
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(**crawl_kwargs)
        print(result.markdown[:500])
| |
|
async def use_proxy():
    """Crawl through a (placeholder) proxy and print a preview on success.

    The proxy URL is a placeholder, so the markdown preview is only printed
    when the crawl reports success.
    """
    print("\n--- Using a Proxy ---")
    print(
        "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
    )

    proxy_url = "http://your-proxy-url:port"
    async with AsyncWebCrawler(verbose=True, proxy=proxy_url) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            cache_mode=CacheMode.BYPASS,
        )
        if not result.success:
            return
        print(result.markdown[:500])
| |
|
async def capture_and_save_screenshot(url: str, output_path: str):
    """Crawl `url` with screenshot capture enabled and save the image.

    Args:
        url: Page to crawl.
        output_path: File the decoded screenshot bytes are written to.

    The screenshot is base64-decoded before it is written. A failure message
    is printed when the crawl fails or returns no screenshot.
    """
    import base64

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            screenshot=True,
            cache_mode=CacheMode.BYPASS,
        )

        if not (result.success and result.screenshot):
            print("Failed to capture screenshot")
            return

        image_bytes = base64.b64decode(result.screenshot)
        with open(output_path, "wb") as image_file:
            image_file.write(image_bytes)

        print(f"Screenshot saved successfully to {output_path}")
| |
|
class OpenAIModelFee(BaseModel):
    # One pricing entry; its JSON schema is passed to LLMExtractionStrategy in
    # extract_structured_data_using_llm(). Documented with `#` comments rather
    # than a docstring so the generated schema stays exactly as before.

    # All three fields are required strings (`...` marks them as mandatory).
    model_name: str = Field(
        ...,
        description="Name of the OpenAI model.",
    )
    input_fee: str = Field(
        ...,
        description="Fee for input token for the OpenAI model.",
    )
    output_fee: str = Field(
        ...,
        description="Fee for output token for the OpenAI model.",
    )
| |
|
async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
    """Extract OpenAI pricing rows from the pricing page with an LLM strategy.

    Args:
        provider: Provider string passed to `LLMExtractionStrategy`.
        api_token: Token for the provider; required unless `provider` is
            exactly "ollama".
        extra_headers: Optional headers added to the LLM call arguments under
            the "extra_headers" key.

    Prints the extracted content. When the token is missing for a provider
    other than "ollama", a notice is printed and the example is skipped.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    token_missing = api_token is None
    if token_missing and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    llm_call_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
    if extra_headers:
        llm_call_args["extra_headers"] = extra_headers

    async with AsyncWebCrawler(verbose=True) as crawler:
        strategy = LLMExtractionStrategy(
            provider=provider,
            api_token=api_token,
            schema=OpenAIModelFee.model_json_schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
                Do not miss any models in the entire content. One extracted model JSON format should look like this:
                {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
            extra_args=llm_call_args,
        )
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            word_count_threshold=1,
            extraction_strategy=strategy,
            cache_mode=CacheMode.BYPASS,
        )
        print(result.extracted_content)
| |
|
async def extract_structured_data_using_css_extractor():
    """Extract course data from the KidoCode technology page with CSS selectors.

    A script first clicks through every tab in the "charge-methodology"
    section, pausing 500 ms after each click. `JsonCssExtractionStrategy` then
    applies the schema below. The number of extracted items and the first item
    are printed. If the crawl fails or yields nothing, a message is printed
    instead of raising.
    """
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
    schema = {
        "name": "KidoCode Courses",
        "baseSelector": "section.charge-methodology .w-tab-content > div",
        "fields": [
            {
                "name": "section_title",
                "selector": "h3.heading-50",
                "type": "text",
            },
            {
                "name": "section_description",
                "selector": ".charge-content",
                "type": "text",
            },
            {
                "name": "course_name",
                "selector": ".text-block-93",
                "type": "text",
            },
            {
                "name": "course_description",
                "selector": ".course-content-text",
                "type": "text",
            },
            {
                "name": "course_icon",
                "selector": ".image-92",
                "type": "attribute",
                "attribute": "src",
            },
        ],
    }

    # Click each tab in turn so every tab panel has been activated once.
    js_click_tabs = """
    (async () => {
        const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");

        for(let tab of tabs) {
            // scroll to the tab
            tab.scrollIntoView();
            tab.click();
            // Wait for content to load and animations to complete
            await new Promise(r => setTimeout(r, 500));
        }
    })();
    """

    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology",
            extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
            js_code=[js_click_tabs],
            cache_mode=CacheMode.BYPASS,
        )

        # json.loads(None) would raise TypeError, so bail out early when the
        # crawl failed or produced no extracted content.
        if not result.success or not result.extracted_content:
            print("Extraction failed: the crawl returned no extracted content")
            return

        companies = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(companies)} companies")
        # Only show a sample when at least one item matched the schema.
        if companies:
            print(json.dumps(companies[0], indent=2))
| |
|
| | |
async def crawl_dynamic_content_pages_method_1():
    """Crawl three pages of TypeScript commits using an execution hook.

    An `on_execution_started` hook polls the page until the first commit title
    differs from the one seen on the previous page. Pages 2 and 3 are reached
    by clicking the pagination "next" button inside the same browser session
    (`js_only=True`). Commit `<li>` elements are collected with BeautifulSoup
    and the per-page and total counts are printed.

    Raises:
        AssertionError: if any page fails to crawl.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    first_commit = ""

    async def on_execution_started(page):
        """Wait until the first commit title on `page` changes."""
        nonlocal first_commit
        try:
            while True:
                await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
                heading = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
                title = await heading.evaluate("(element) => element.textContent")
                # Strip all whitespace so layout differences do not register
                # as a content change.
                title = re.sub(r"\s+", "", title)
                if title and title != first_commit:
                    first_commit = title
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            # Best effort: a failed wait is reported but does not abort the crawl.
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        (() => {
            const button = document.querySelector('a[data-testid="pagination-next-button"]');
            if (button) button.click();
        })();
        """

        for page in range(3):
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                # Use `js_code`, the keyword the other pagination examples in
                # this file pass; `js` left the pagination script unused.
                js_code=js_next_page if page > 0 else None,
                cache_mode=CacheMode.BYPASS,
                js_only=page > 0,
                headless=False,
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            soup = BeautifulSoup(result.cleaned_html, "html.parser")
            commits = soup.select("li")
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
| |
|
async def crawl_dynamic_content_pages_method_2():
    """Crawl three pages of TypeScript commits, waiting inside the injected JS.

    For pages 2 and 3 a script clicks the pagination "next" button and then
    polls every 100 ms until the first commit title changes. Commit titles are
    extracted with `JsonCssExtractionStrategy`; per-page and total counts are
    printed.

    Raises:
        AssertionError: if any page fails to crawl.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")

    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        # The poll is bounded: it returns at once when there is no "next"
        # button and gives up after 30 seconds, so the script always finishes.
        js_next_page_and_wait = """
        (async () => {
            const getCurrentCommit = () => {
                const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
                return commits.length > 0 ? commits[0].textContent.trim() : null;
            };

            const initialCommit = getCurrentCommit();
            const button = document.querySelector('a[data-testid="pagination-next-button"]');
            if (!button) return;
            button.click();

            // Poll for changes, giving up after 30 seconds
            const deadline = Date.now() + 30000;
            while (Date.now() < deadline) {
                await new Promise(resolve => setTimeout(resolve, 100)); // Wait 100ms
                const newCommit = getCurrentCommit();
                if (newCommit && newCommit !== initialCommit) {
                    break;
                }
            }
        })();
        """

        schema = {
            "name": "Commit Extractor",
            "baseSelector": "li.Box-sc-g0xbh4-0",
            "fields": [
                {
                    "name": "title",
                    "selector": "h4.markdown-title",
                    "type": "text",
                    "transform": "strip",
                },
            ],
        }
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

        for page in range(3):
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                # The first page is loaded normally; later pages only run JS.
                js_code=js_next_page_and_wait if page > 0 else None,
                js_only=page > 0,
                cache_mode=CacheMode.BYPASS,
                headless=False,
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            commits = json.loads(result.extracted_content)
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
| |
|
async def crawl_dynamic_content_pages_method_3():
    """Crawl three pages of TypeScript commits using a `wait_for` predicate.

    For pages 2 and 3 the injected script stores the current first commit
    title in `window.firstCommit` and clicks the pagination "next" button.
    The `wait_for` predicate is true once the first commit title differs from
    the stored one. Titles are extracted with `JsonCssExtractionStrategy`.

    Raises:
        AssertionError: if any page fails to crawl.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---")

    commit_schema = {
        "name": "Commit Extractor",
        "baseSelector": "li.Box-sc-g0xbh4-0",
        "fields": [
            {
                "name": "title",
                "selector": "h4.markdown-title",
                "type": "text",
                "transform": "strip",
            },
        ],
    }

    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        collected = []

        # Remember the current first commit, then move to the next page.
        js_next_page = """
        const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
        if (commits.length > 0) {
            window.firstCommit = commits[0].textContent.trim();
        }
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        # True once the list shows a first commit other than the remembered one.
        wait_for = """() => {
            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
            if (commits.length === 0) return false;
            const firstCommit = commits[0].textContent.trim();
            return firstCommit !== window.firstCommit;
        }"""

        extraction_strategy = JsonCssExtractionStrategy(commit_schema, verbose=True)

        for page_index in range(3):
            follow_up = page_index > 0  # pages after the first reuse the session
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page if follow_up else None,
                wait_for=wait_for if follow_up else None,
                js_only=follow_up,
                cache_mode=CacheMode.BYPASS,
                headless=False,
            )

            assert result.success, f"Failed to crawl page {page_index + 1}"

            page_commits = json.loads(result.extracted_content)
            collected.extend(page_commits)

            print(f"Page {page_index + 1}: Found {len(page_commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(collected)} commits across 3 pages")
| |
|
async def crawl_custom_browser_type():
    """Crawl example.com with three browser setups and print timings.

    Runs Firefox, then WebKit, then a crawler created without `browser_type`.
    Each run prints the first 500 markdown characters and the seconds elapsed
    since just before the crawler was created.
    """
    # An empty override means no `browser_type` argument is passed.
    browser_overrides = (
        {"browser_type": "firefox"},
        {"browser_type": "webkit"},
        {},
    )

    for override in browser_overrides:
        started_at = time.time()
        async with AsyncWebCrawler(verbose=True, headless=True, **override) as crawler:
            result = await crawler.arun(
                url="https://www.example.com",
                cache_mode=CacheMode.BYPASS,
            )
            print(result.markdown[:500])
            print("Time taken: ", time.time() - started_at)
| |
|
async def crawl_with_user_simultion():
    """Crawl a placeholder URL with `magic=True` and print the full markdown.

    The URL is the literal placeholder "YOUR-URL-HERE" and must be replaced
    before this example can fetch a real page.
    """
    crawl_kwargs = {
        "url": "YOUR-URL-HERE",
        "cache_mode": CacheMode.BYPASS,
        "magic": True,
    }
    async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
        result = await crawler.arun(**crawl_kwargs)

        print(result.markdown)
| |
|
async def speed_comparison():
    """Time Firecrawl against three Crawl4AI configurations on one page.

    Scrapes the NBC News business page with Firecrawl (key read from the
    FIRECRAWL_API_KEY environment variable), then with Crawl4AI three ways:
    a plain crawl, a crawl with a pruning markdown generator, and the same
    with a "Load More" click script. Each run prints elapsed time, content
    length and the number of occurrences of the NBC image host in the
    markdown. A closing note on interpreting the numbers is printed last.
    """
    from firecrawl import FirecrawlApp

    page_url = "https://www.nbcnews.com/business"

    def pruned_markdown_generator():
        # A fresh generator/filter pair for every crawl that needs one.
        return DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48, threshold_type="fixed", min_word_threshold=0
            )
        )

    # --- Firecrawl ---
    firecrawl_app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
    began = time.time()
    scraped = firecrawl_app.scrape_url(
        page_url,
        params={"formats": ["markdown", "html"]},
    )
    elapsed = time.time() - began
    print("Firecrawl:")
    print(f"Time taken: {elapsed:.2f} seconds")
    print(f"Content length: {len(scraped['markdown'])} characters")
    print(f"Images found: {scraped['markdown'].count('cldnry.s-nbcnews.com')}")
    print()

    async with AsyncWebCrawler() as crawler:
        # --- Crawl4AI, plain crawl ---
        began = time.time()
        result = await crawler.arun(
            url=page_url,
            word_count_threshold=0,
            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )
        elapsed = time.time() - began
        print("Crawl4AI (simple crawl):")
        print(f"Time taken: {elapsed:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
        print()

        # --- Crawl4AI with the pruning markdown generator ---
        began = time.time()
        result = await crawler.arun(
            url=page_url,
            word_count_threshold=0,
            markdown_generator=pruned_markdown_generator(),
            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )
        elapsed = time.time() - began
        print("Crawl4AI (Markdown Plus):")
        print(f"Time taken: {elapsed:.2f} seconds")
        print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
        print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
        print()

        # --- Crawl4AI with the pruning generator plus a "Load More" click ---
        load_more_script = [
            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
        ]
        began = time.time()
        result = await crawler.arun(
            url=page_url,
            js_code=load_more_script,
            word_count_threshold=0,
            cache_mode=CacheMode.BYPASS,
            markdown_generator=pruned_markdown_generator(),
            verbose=False,
        )
        elapsed = time.time() - began
        print("Crawl4AI (with JavaScript execution):")
        print(f"Time taken: {elapsed:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
        print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")

    # Closing note, printed in one call; the text matches the original lines.
    print(
        "\n".join(
            (
                "\nNote on Speed Comparison:",
                "The speed test conducted here may not reflect optimal conditions.",
                "When we call Firecrawl's API, we're seeing its best performance,",
                "while Crawl4AI's performance is limited by the local network speed.",
                "For a more accurate comparison, it's recommended to run these tests",
                "on servers with a stable and fast internet connection.",
                "Despite these limitations, Crawl4AI still demonstrates faster performance.",
                "If you run these tests in an environment with better network conditions,",
                "you may observe an even more significant speed advantage for Crawl4AI.",
            )
        )
    )
| |
|
async def generate_knowledge_graph():
    """Extract entities and relationships from an essay and save them as JSON.

    Crawls https://paulgraham.com/love.html with an `LLMExtractionStrategy`
    (provider "openai/gpt-4o-mini", token from OPENAI_API_KEY) whose schema is
    the `KnowledgeGraph` model below. The extracted content is written to
    `kb.json` next to this script.
    """
    # The models are documented with `#` comments, not docstrings, so the
    # JSON schema sent to the LLM stays unchanged.

    # A named thing mentioned in the text.
    class Entity(BaseModel):
        name: str
        description: str

    # A typed, described link between two entities.
    class Relationship(BaseModel):
        entity1: Entity
        entity2: Entity
        description: str
        relation_type: str

    # Top-level container returned by the extraction.
    class KnowledgeGraph(BaseModel):
        entities: List[Entity]
        relationships: List[Relationship]

    graph_schema = KnowledgeGraph.model_json_schema()
    extraction_strategy = LLMExtractionStrategy(
        provider='openai/gpt-4o-mini',
        api_token=os.getenv('OPENAI_API_KEY'),
        schema=graph_schema,
        extraction_type="schema",
        instruction="""Extract entities and relationships from the given text.""",
    )

    output_file = os.path.join(__location__, "kb.json")
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://paulgraham.com/love.html",
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=extraction_strategy,
        )

        with open(output_file, "w") as kb_file:
            kb_file.write(result.extracted_content)
| |
|
async def fit_markdown_remove_overlay():
    """Crawl the KidoCode technology page and save its HTML/markdown variants.

    The crawler runs headless with a random mobile/Android user agent and a
    pruning markdown generator that ignores links. On success the lengths of
    the raw, citation and fit markdown are printed, and four files are written
    under `<script dir>/output/`: the cleaned HTML and the three markdown
    variants. "Done" is printed whether or not the crawl succeeded.
    """
    async with AsyncWebCrawler(
        headless=True,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={
            "device_type": "mobile",
            "os_type": "android",
        },
    ) as crawler:
        result = await crawler.arun(
            url='https://www.kidocode.com/degrees/technology',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(
                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
                ),
                options={
                    "ignore_links": True
                },
            ),
        )

        if result.success:
            print(len(result.markdown_v2.raw_markdown))
            print(len(result.markdown_v2.markdown_with_citations))
            print(len(result.markdown_v2.fit_markdown))

            # Create the output directory first; open(..., "w") does not
            # create missing parent directories.
            output_dir = os.path.join(__location__, "output")
            os.makedirs(output_dir, exist_ok=True)

            artifacts = (
                ("cleaned_html.html", result.cleaned_html),
                ("output_raw_markdown.md", result.markdown_v2.raw_markdown),
                ("output_markdown_with_citations.md", result.markdown_v2.markdown_with_citations),
                ("output_fit_markdown.md", result.markdown_v2.fit_markdown),
            )
            for filename, content in artifacts:
                # Scraped pages can contain characters the platform default
                # encoding cannot represent, so write UTF-8 explicitly.
                with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
                    f.write(content)

        print("Done")
| |
|
| |
|
async def main():
    """Run the enabled example: the `wait_for`-based multi-page commit crawl."""
    # Only this example is invoked; the other examples in this file are not
    # called from here.
    await crawl_dynamic_content_pages_method_3()
| | |
| | |
| | |
| | |
| |
|
| |
|
# Script entry point: run main() to completion when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
| |
|