import asyncio
import re

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig


def remove_links_and_pics(input_text):
    # Remove bare URLs and angle-bracket autolinks (e.g. <https://example.com>)
    text_without_links = re.sub(r'https?://[^\s<>]+|<[^>]+>', '', input_text)
    # Remove markdown-style image syntax: ![alt](src)
    text_without_images = re.sub(r'!\[.*?\]\(.*?\)', '', text_without_links)
    # Remove any remaining HTML tags
    text_without_html = re.sub(r'<[^>]+>', '', text_without_images)
    # Remove bracketed/parenthesized fragments (e.g. leftover markdown link text).
    # This must run before the special-character pass below, otherwise the
    # brackets have already been stripped and this substitution matches nothing.
    text_without_brackets = re.sub(r'\[.*?\]|\(.*?\)', '', text_without_html)
    # Remove special characters, keeping only letters and whitespace
    text_without_special_chars = re.sub(r'[^a-zA-Z\s]', '', text_without_brackets)
    return text_without_special_chars.strip()


# Crawling for marketing: returns the page as cleaned plain text
async def marketing_crawling(url):
    browser_config = BrowserConfig()  # Default browser configuration
    run_config = CrawlerRunConfig()  # Default crawl run configuration

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=run_config)
        cleaned_text = remove_links_and_pics(result.markdown)  # type: ignore
        return cleaned_text


# Crawling for SEO: returns the raw markdown, with links and tags intact
async def seo_crawling(url):
    browser_config = BrowserConfig()  # Default browser configuration
    run_config = CrawlerRunConfig()  # Default crawl run configuration

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=run_config)
        text = result.markdown  # type: ignore
        return text


# asyncio.run(marketing_crawling("https://allsolarworks.com/"))
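
# A minimal usage sketch for running both crawlers as a standalone script.
# Assumptions: crawl4ai is installed, and the example URL is the one from the
# commented-out call above; swap in your own target as needed.
if __name__ == "__main__":
    async def main():
        url = "https://allsolarworks.com/"
        # Both coroutines share one event loop; each opens its own browser session.
        marketing_text = await marketing_crawling(url)
        seo_text = await seo_crawling(url)
        print("Marketing text preview:", marketing_text[:200])
        print("SEO markdown preview:", seo_text[:200])

    asyncio.run(main())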