File size: 1,826 Bytes
7dc121a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a5ef29
7dc121a
 
 
 
 
 
 
 
 
 
 
 
 
7a5ef29
7dc121a
 
 
 
 
 
 
 
 
 
0777aa4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import re

def remove_links_and_pics(input_text):
    """Strip links, images, HTML tags, and non-letter characters from text.

    Cleaning passes, applied in order:
      1. bare URLs and ``<url>``-wrapped URLs,
      2. markdown image syntax ``![alt](src)``,
      3. HTML tags,
      4. every character that is not an ASCII letter or whitespace.

    NOTE: pass 4 also drops digits and non-ASCII letters — acceptable for
    the keyword-extraction use here, but lossy for general text.

    Args:
        input_text: Raw text, e.g. the markdown produced by a crawl.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Remove bare URLs and angle-bracket-wrapped URLs.
    text = re.sub(r'https?:\/\/[^\s<>]+|<https?:\/\/[^\s<>]+>', '', input_text)

    # Remove markdown-style image references.
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)

    # Remove HTML tags.
    text = re.sub(r'<[^>]+>', '', text)

    # Keep only ASCII letters and whitespace. This pass already removes all
    # brackets and parentheses, so the former trailing bracket-stripping
    # substitution could never match and has been dropped as dead code.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    return text.strip()


# Crawling for marketing
async def marketing_crawling(url):
    """Crawl *url* and return its markdown with links, images, HTML tags,
    and non-letter characters stripped out (via ``remove_links_and_pics``).
    """
    cfg_browser = BrowserConfig()   # default browser settings
    cfg_run = CrawlerRunConfig()    # default per-run settings

    async with AsyncWebCrawler(config=cfg_browser) as crawler:
        page = await crawler.arun(url=url, config=cfg_run)
        # Normalize the raw markdown before handing it back to callers.
        return remove_links_and_pics(page.markdown)  # type: ignore


# Crawling for SEO
async def seo_crawling(url):
    """Crawl *url* and return the page's raw markdown, unmodified —
    SEO analysis needs the original links and markup left intact.
    """
    cfg_browser = BrowserConfig()   # default browser settings
    cfg_run = CrawlerRunConfig()    # default per-run settings

    async with AsyncWebCrawler(config=cfg_browser) as crawler:
        page = await crawler.arun(url=url, config=cfg_run)
        return page.markdown  # type: ignore
    
# asyncio.run(marketing_crawling("https://allsolarworks.com/"))