Spaces:

Metaviz-Pro
/

Blog_Post_Generation

Sleeping

App Files Files Community

AhsanRazi committed on Mar 17, 2025

Commit

7dc121a

verified ·

1 Parent(s): 6552629

Update crawl.py

Browse files

Files changed (1) hide show

crawl.py +51 -51

crawl.py CHANGED Viewed

@@ -1,52 +1,52 @@
-import asyncio
-from crawl4ai import AsyncWebCrawler
-from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
-import re
-def remove_links_and_pics(input_text):
-    # Remove all links (URLs)
-    text_without_links = re.sub(r'https?:\/\/[^\s<>]+|<https?:\/\/[^\s<>]+>', '', input_text)
-    # Remove all image references (markdown-style image syntax)
-    text_without_images = re.sub(r'!\[.*?\]\(.*?\)', '', text_without_links)
-    # Remove HTML tags
-    text_without_html = re.sub(r'<[^>]+>', '', text_without_images)
-    # Remove special characters and leave only text
-    text_without_special_chars = re.sub(r'[^a-zA-Z\s]', '', text_without_html)
-    # Remove special characters and leave only text
-    text_without_brackets = re.sub(r'\[.*?\]|\(.*?\)', '', text_without_special_chars)
-    return text_without_brackets.strip()
-# Crawling for marketing
-async def marketing_crawling(url):
-    browser_config = BrowserConfig()  # Default browser configuration
-    run_config = CrawlerRunConfig()   # Default crawl run configuration
-    async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun(
-            url = url,
-            config=run_config
-        )
-        cleaned_text = remove_links_and_pics(result.markdown) # type: ignore
-        return cleaned_text
-# Crawling for SEO
-async def seo_crawling(url):
-    browser_config = BrowserConfig()  # Default browser configuration
-    run_config = CrawlerRunConfig()   # Default crawl run configuration
-    async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun(
-            url = url,
-            config=run_config
-        )
-        text = result.markdown # type: ignore
-        return text
 # asyncio.run(marketing_crawling("https://allsolarworks.com/"))

+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+import re
+def remove_links_and_pics(input_text):
+    # Remove all links (URLs)
+    text_without_links = re.sub(r'https?:\/\/[^\s<>]+|<https?:\/\/[^\s<>]+>', '', input_text)
+    # Remove all image references (markdown-style image syntax)
+    text_without_images = re.sub(r'!\[.*?\]\(.*?\)', '', text_without_links)
+    # Remove HTML tags
+    text_without_html = re.sub(r'<[^>]+>', '', text_without_images)
+    # Remove every character except ASCII letters and whitespace (digits and punctuation are dropped too)
+    text_without_special_chars = re.sub(r'[^a-zA-Z\s]', '', text_without_html)
+    # Remove bracketed [...] and parenthesized (...) spans; the previous substitution already stripped these characters, so this is a no-op
+    text_without_brackets = re.sub(r'\[.*?\]|\(.*?\)', '', text_without_special_chars)
+    return text_without_brackets.strip()
+# Crawling for marketing
+async def marketing_crawling(url):
+    browser_config = BrowserConfig(headless=True)  # Default browser configuration
+    run_config = CrawlerRunConfig()   # Default crawl run configuration
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url = url,
+            config=run_config
+        )
+        cleaned_text = remove_links_and_pics(result.markdown) # type: ignore
+        return cleaned_text
+# Crawling for SEO
+async def seo_crawling(url):
+    browser_config = BrowserConfig(headless=True)  # Default browser configuration
+    run_config = CrawlerRunConfig()   # Default crawl run configuration
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url = url,
+            config=run_config
+        )
+        text = result.markdown # type: ignore
+        return text
 # asyncio.run(marketing_crawling("https://allsolarworks.com/"))