AhsanRazi committed on
Commit
7dc121a
·
verified ·
1 Parent(s): 6552629

Update crawl.py

Browse files
Files changed (1) hide show
  1. crawl.py +51 -51
crawl.py CHANGED
@@ -1,52 +1,52 @@
1
- import asyncio
2
- from crawl4ai import AsyncWebCrawler
3
- from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
4
- import re
5
-
6
- def remove_links_and_pics(input_text):
7
- # Remove all links (URLs)
8
- text_without_links = re.sub(r'https?:\/\/[^\s<>]+|<https?:\/\/[^\s<>]+>', '', input_text)
9
-
10
- # Remove all image references (markdown-style image syntax)
11
- text_without_images = re.sub(r'!\[.*?\]\(.*?\)', '', text_without_links)
12
-
13
- # Remove HTML tags
14
- text_without_html = re.sub(r'<[^>]+>', '', text_without_images)
15
-
16
- # Remove special characters and leave only text
17
- text_without_special_chars = re.sub(r'[^a-zA-Z\s]', '', text_without_html)
18
-
19
- # Remove special characters and leave only text
20
- text_without_brackets = re.sub(r'\[.*?\]|\(.*?\)', '', text_without_special_chars)
21
-
22
- return text_without_brackets.strip()
23
-
24
-
25
- # Crawling for marketing
26
- async def marketing_crawling(url):
27
- browser_config = BrowserConfig() # Default browser configuration
28
- run_config = CrawlerRunConfig() # Default crawl run configuration
29
-
30
- async with AsyncWebCrawler(config=browser_config) as crawler:
31
- result = await crawler.arun(
32
- url = url,
33
- config=run_config
34
- )
35
- cleaned_text = remove_links_and_pics(result.markdown) # type: ignore
36
- return cleaned_text
37
-
38
-
39
- # Crawling for SEO
40
- async def seo_crawling(url):
41
- browser_config = BrowserConfig() # Default browser configuration
42
- run_config = CrawlerRunConfig() # Default crawl run configuration
43
-
44
- async with AsyncWebCrawler(config=browser_config) as crawler:
45
- result = await crawler.arun(
46
- url = url,
47
- config=run_config
48
- )
49
- text = result.markdown # type: ignore
50
- return text
51
-
52
  # asyncio.run(marketing_crawling("https://allsolarworks.com/"))
 
1
+ import asyncio
2
+ from crawl4ai import AsyncWebCrawler
3
+ from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
4
+ import re
5
+
6
def remove_links_and_pics(input_text):
    """Strip URLs, markdown images, HTML tags, and non-letter characters.

    Args:
        input_text: Raw text (typically crawler markdown output).

    Returns:
        The cleaned text containing only letters and whitespace, with
        leading/trailing whitespace removed.
    """
    # Remove all links (bare or angle-bracketed URLs)
    text_without_links = re.sub(r'https?:\/\/[^\s<>]+|<https?:\/\/[^\s<>]+>', '', input_text)

    # Remove all image references (markdown-style image syntax)
    text_without_images = re.sub(r'!\[.*?\]\(.*?\)', '', text_without_links)

    # Remove HTML tags
    text_without_html = re.sub(r'<[^>]+>', '', text_without_images)

    # Remove special characters (digits, punctuation, brackets) and leave only
    # letters and whitespace. NOTE: the original code had one more pass here
    # (r'\[.*?\]|\(.*?\)') — it was dead code, since this pass already deletes
    # every bracket, so that pattern could never match; it has been removed.
    text_without_special_chars = re.sub(r'[^a-zA-Z\s]', '', text_without_html)

    return text_without_special_chars.strip()
23
+
24
+
25
# Crawling for marketing
async def marketing_crawling(url):
    """Crawl ``url`` and return its markdown with links/images/HTML stripped.

    Args:
        url: The page address to crawl.

    Returns:
        Cleaned, letters-and-whitespace-only text from the page.
    """
    browser_cfg = BrowserConfig(headless=True)  # headless browser
    crawl_cfg = CrawlerRunConfig()              # default run settings

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        page = await crawler.arun(url=url, config=crawl_cfg)
        # Strip URLs, images, tags, and punctuation before returning.
        return remove_links_and_pics(page.markdown)  # type: ignore
37
+
38
+
39
# Crawling for SEO
async def seo_crawling(url):
    """Crawl ``url`` and return its raw markdown content, uncleaned.

    Args:
        url: The page address to crawl.

    Returns:
        The page content as markdown, exactly as produced by the crawler.
    """
    browser_cfg = BrowserConfig(headless=True)  # headless browser
    crawl_cfg = CrawlerRunConfig()              # default run settings

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        page = await crawler.arun(url=url, config=crawl_cfg)
        # SEO analysis wants the full markdown, so no cleaning here.
        return page.markdown  # type: ignore
51
+
52
  # asyncio.run(marketing_crawling("https://allsolarworks.com/"))