File size: 1,826 Bytes
7dc121a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a5ef29
7dc121a
 
 
 
 
 
 
 
 
 
 
 
 
7a5ef29
7dc121a
 
 
 
 
 
 
 
 
 
0777aa4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import re

def remove_links_and_pics(input_text):
    """Strip links, images, HTML tags, and non-letter characters from text.

    Cleaning passes, applied in order:
      1. bare URLs and ``<url>``-wrapped URLs,
      2. markdown image syntax ``![alt](src)``,
      3. HTML tags,
      4. every character that is not an ASCII letter or whitespace.

    NOTE: pass 4 also drops digits and non-ASCII letters — acceptable for
    the keyword-extraction use here, but lossy for general text.

    Args:
        input_text: Raw text, e.g. the markdown produced by a crawl.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Remove bare URLs and angle-bracket-wrapped URLs.
    text = re.sub(r'https?:\/\/[^\s<>]+|<https?:\/\/[^\s<>]+>', '', input_text)

    # Remove markdown-style image references.
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)

    # Remove HTML tags.
    text = re.sub(r'<[^>]+>', '', text)

    # Keep only ASCII letters and whitespace. This pass already removes all
    # brackets and parentheses, so the former trailing bracket-stripping
    # substitution could never match and has been dropped as dead code.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    return text.strip()


# Crawling for marketing
async def marketing_crawling(url):
    """Crawl *url* and return its markdown with links, images, HTML tags,
    and non-letter characters stripped out (via ``remove_links_and_pics``).
    """
    cfg_browser = BrowserConfig()   # default browser settings
    cfg_run = CrawlerRunConfig()    # default per-run settings

    async with AsyncWebCrawler(config=cfg_browser) as crawler:
        page = await crawler.arun(url=url, config=cfg_run)
        # Normalize the raw markdown before handing it back to callers.
        return remove_links_and_pics(page.markdown)  # type: ignore


# Crawling for SEO
async def seo_crawling(url):
    """Crawl *url* and return the page's raw markdown, unmodified —
    SEO analysis needs the original links and markup left intact.
    """
    cfg_browser = BrowserConfig()   # default browser settings
    cfg_run = CrawlerRunConfig()    # default per-run settings

    async with AsyncWebCrawler(config=cfg_browser) as crawler:
        page = await crawler.arun(url=url, config=cfg_run)
        return page.markdown  # type: ignore
    
# asyncio.run(marketing_crawling("https://allsolarworks.com/"))