| | import json |
| | from pydantic import BaseModel |
| | from typing import List, Set, Tuple |
| | from crawl4ai import ( |
| | AsyncWebCrawler, |
| | BrowserConfig, |
| | CacheMode, |
| | CrawlerRunConfig, |
| | LLMExtractionStrategy, |
| | ) |
| | from utils import is_duplicated |
| | from config import LLM_MODEL, API_TOKEN |
| |
|
| |
|
def get_browser_config() -> BrowserConfig:
    """
    Build the browser configuration used by the crawler.

    Returns:
        BrowserConfig: Chromium engine, headless, with verbose logging.
    """
    browser_config = BrowserConfig(
        browser_type="chromium",  # Playwright browser engine to launch
        headless=True,            # run without a visible browser window
        verbose=True,             # detailed crawler logging
    )
    return browser_config
| |
|
| |
|
def get_llm_strategy(llm_instructions: str, output_format: type[BaseModel]) -> LLMExtractionStrategy:
    """
    Build the language-model extraction strategy for the crawler.

    Args:
        llm_instructions (str): Prompt instructions telling the LLM what to
            extract from each page.
        output_format (type[BaseModel]): Pydantic model *class* describing the
            desired output schema; its JSON schema is passed to the LLM.

    Returns:
        LLMExtractionStrategy: The settings for how to extract data using LLM.
    """
    return LLMExtractionStrategy(
        provider=LLM_MODEL,       # model name/provider from config
        api_token=API_TOKEN,      # credentials from config
        schema=output_format.model_json_schema(),  # pydantic v2 class-level schema
        extraction_type="schema",
        instruction=llm_instructions,
        input_format="markdown",  # feed the page's markdown rendering to the LLM
        verbose=True,
    )
| |
|
async def check_no_results(
    crawler: AsyncWebCrawler,
    url: str,
    session_id: str,
) -> bool:
    """
    Check whether the page at *url* shows the "No Results Found" message.

    Args:
        crawler (AsyncWebCrawler): The web crawler instance.
        url (str): The URL to check.
        session_id (str): The session identifier.

    Returns:
        bool: True if "No Results Found" message is found, False otherwise.
    """
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,  # always fetch a fresh copy
        session_id=session_id,
    )
    result = await crawler.arun(url=url, config=run_config)

    # A failed fetch is reported but treated as "results may exist".
    if not result.success:
        print(
            f"Error fetching page for 'No Results Found' check: {result.error_message}"
        )
        return False

    return "No Results Found" in result.cleaned_html
| |
|
| |
|
async def fetch_and_process_page(
    crawler: AsyncWebCrawler,
    page_number: int,
    base_url: str,
    css_selector: str,
    llm_strategy: LLMExtractionStrategy,
    session_id: str,
    seen_names: Set[str],
) -> Tuple[List[dict], bool]:
    """
    Fetches and processes a single page from yellowpages.

    Args:
        crawler (AsyncWebCrawler): The web crawler instance.
        page_number (int): The page number to fetch.
        base_url (str): The base URL; must contain a ``{page_number}`` placeholder.
        css_selector (str): The CSS selector to target the content.
        llm_strategy (LLMExtractionStrategy): The LLM extraction strategy.
        session_id (str): The session identifier (reuses the browser session).
        seen_names (Set[str]): Business names already seen; used for
            de-duplication and mutated in place.

    Returns:
        Tuple[List[dict], bool]:
            - List[dict]: A list of processed businesses from the page.
            - bool: A flag indicating if the "No Results Found" message was encountered.
    """
    url = base_url.format(page_number=page_number)
    print(f"Loading page {page_number}...")

    # Stop early when the site reports no results for this page.
    no_results = await check_no_results(crawler, url, session_id)
    if no_results:
        return [], True

    result = await crawler.arun(
        url=url,
        config=CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,  # always fetch fresh content
            extraction_strategy=llm_strategy,
            css_selector=css_selector,
            session_id=session_id,
        ),
    )
    print("----------------------------- Result-----------------------------")
    print(result.extracted_content)

    if not (result.success and result.extracted_content):
        print(f"Error fetching page {page_number}: {result.error_message}")
        return [], False

    # The LLM strategy returns a JSON string; guard against malformed output
    # instead of letting JSONDecodeError propagate.
    try:
        extracted_data = json.loads(result.extracted_content)
    except json.JSONDecodeError as exc:
        print(f"Invalid JSON on page {page_number}: {exc}")
        return [], False

    print("----------------------------Extracted Data----------------------------")
    print(extracted_data)
    if not extracted_data:
        print(f"No businesses found on page {page_number}.")
        return [], False

    print("Extracted data:", extracted_data)

    all_businesses = []
    for business in extracted_data:
        print("Processing business:", business)

        # crawl4ai attaches an "error" flag to each extracted item: drop the
        # flag on successful items, and skip items the LLM marked as failed
        # (previously they were kept, polluting the results).
        if business.get("error") is False:
            business.pop("error", None)
        elif business.get("error"):
            print(f"Skipping item flagged with an extraction error: {business}")
            continue

        # Skip malformed items missing the mandatory name instead of raising KeyError.
        name = business.get("name")
        if not name:
            print(f"Skipping item without a name: {business}")
            continue

        if is_duplicated(name, seen_names):
            print(f"Duplicate business '{name}' found. Skipping.")
            continue

        seen_names.add(name)
        all_businesses.append(business)

    if not all_businesses:
        print(f"No complete businesses found on page {page_number}.")
        return [], False

    print(f"Extracted {len(all_businesses)} businesses from page {page_number}.")
    return all_businesses, False
| |
|