c4ai

Running

c4ai / docs /examples /extraction_strategies_example.py

amaye15

test

03c0888 about 1 year ago

4.55 kB

	"""
	Example demonstrating different extraction strategies with various input formats.
	This example shows how to:
	1. Use different input formats (markdown, HTML, fit_markdown)
	2. Work with JSON-based extractors (CSS and XPath)
	3. Use LLM-based extraction with different input formats
	4. Configure browser and crawler settings properly
	"""

	import asyncio
	import os
	from typing import Dict, Any

	from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
	from crawl4ai.extraction_strategy import (
	LLMExtractionStrategy,
	JsonCssExtractionStrategy,
	JsonXPathExtractionStrategy
	)
	from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
	from crawl4ai.content_filter_strategy import PruningContentFilter
	from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

	async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str) -> None:
	    """Crawl ``url`` with a single extraction strategy and print a short report.

	    Args:
	        crawler: Crawler instance whose ``arun`` method performs the crawl.
	        url: Address of the page to crawl.
	        strategy: Extraction strategy handed to ``CrawlerRunConfig``.
	        name: Label used to identify this run in the printed output.

	    Returns:
	        None. Results and errors are reported with ``print``.

	    Any exception raised while building the configuration or running the
	    crawl is caught and printed rather than propagated, so the caller can
	    carry on with its next run.
	    """
	    try:
	        # Configure the crawler run settings
	        config = CrawlerRunConfig(
	            cache_mode=CacheMode.BYPASS,
	            extraction_strategy=strategy,
	            markdown_generator=DefaultMarkdownGenerator(
	                content_filter=PruningContentFilter()  # For fit_markdown support
	            ),
	        )

	        # Run the crawler
	        result = await crawler.arun(url=url, config=config)

	        if result.success:
	            print(f"\n=== {name} Results ===")
	            print(f"Extracted Content: {result.extracted_content}")
	            print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
	            print(f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}")
	        else:
	            # Include the crawler's own failure reason; a bare "Crawl failed"
	            # gives the reader nothing to act on.
	            print(f"Error in {name}: Crawl failed: {result.error_message}")

	    except Exception as e:
	        print(f"Error in {name}: {str(e)}")

	async def main():
	    """Build each example extraction strategy and run them one after another.

	    Three LLM-based strategies (one without ``input_format``, one with
	    ``"html"``, one with ``"fit_markdown"``), one CSS-schema strategy and one
	    XPath-schema strategy are created, then each is passed to
	    ``run_extraction`` against the same URL using a single crawler.
	    """
	    # Example URL (replace with actual URL)
	    target_url = "https://example.com/product-page"

	    # Configure browser settings
	    browser_config = BrowserConfig(headless=True, verbose=True)

	    # Initialize extraction strategies

	    # 1. LLM Extraction with different input formats
	    # Provider and API token are the same for all three LLM strategies.
	    llm_common = {
	        "provider": "openai/gpt-4o-mini",
	        "api_token": os.getenv("OPENAI_API_KEY"),
	    }

	    # No input_format is passed here, so the strategy's own default applies.
	    markdown_llm = LLMExtractionStrategy(
	        instruction="Extract product information including name, price, and description",
	        **llm_common,
	    )
	    html_llm = LLMExtractionStrategy(
	        input_format="html",
	        instruction="Extract product information from HTML including structured data",
	        **llm_common,
	    )
	    fit_markdown_llm = LLMExtractionStrategy(
	        input_format="fit_markdown",
	        instruction="Extract product information from cleaned markdown",
	        **llm_common,
	    )

	    # 2. JSON CSS Extraction (automatically uses HTML input)
	    css_extractor = JsonCssExtractionStrategy(
	        schema={
	            "baseSelector": ".product",
	            "fields": [
	                {"name": "title", "selector": "h1.product-title", "type": "text"},
	                {"name": "price", "selector": ".price", "type": "text"},
	                {"name": "description", "selector": ".description", "type": "text"},
	            ],
	        }
	    )

	    # 3. JSON XPath Extraction (automatically uses HTML input)
	    xpath_extractor = JsonXPathExtractionStrategy(
	        schema={
	            "baseSelector": "//div[@class='product']",
	            "fields": [
	                {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
	                {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
	                {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"},
	            ],
	        }
	    )

	    # (strategy, label) pairs in the order they are executed.
	    planned_runs = [
	        (markdown_llm, "Markdown LLM"),
	        (html_llm, "HTML LLM"),
	        (fit_markdown_llm, "Fit Markdown LLM"),
	        (css_extractor, "CSS Extraction"),
	        (xpath_extractor, "XPath Extraction"),
	    ]

	    # Use context manager for proper resource handling
	    async with AsyncWebCrawler(config=browser_config) as crawler:
	        # Run all strategies sequentially on the same crawler
	        for strategy, label in planned_runs:
	            await run_extraction(crawler, target_url, strategy, label)

	if __name__ == "__main__":
	    # Executed as a script: run the async main() coroutine to completion.
	    asyncio.run(main())