File size: 1,974 Bytes
afce8a8
9155a62
 
 
 
 
9635653
afce8a8
 
 
 
 
 
 
 
 
 
 
9635653
 
 
 
 
 
 
 
62283c0
9635653
 
 
 
 
 
62283c0
9635653
 
 
 
 
afce8a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9155a62
afce8a8
9635653
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import asyncio
import json

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode


async def main(urls, output_path="output.log.json"):
    """Crawl every URL in *urls* and log the successful pages as JSON Lines.

    A headless Chromium browser with a 1920x1080 viewport is started, all
    URLs are crawled through a single ``arun_many`` call with the cache
    bypassed, and each successful result is written to *output_path* as one
    JSON object per line with the keys ``"url"`` and ``"markdown"``.
    Failed results are reported on stdout instead of being dropped silently.

    Args:
        urls: URLs handed to ``AsyncWebCrawler.arun_many``.
        output_path: File that receives the results. It is overwritten once
            the crawl has finished. Defaults to ``"output.log.json"``.
    """
    base_browser = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=1920,
        viewport_height=1080,
        accept_downloads=True,
    )
    # 160-character rule printed after every successfully crawled page.
    separator = "-" * 160

    async with AsyncWebCrawler(config=base_browser) as crawler:
        # Crawl all URLs in one batch.
        # NOTE(review): the crawl options are passed as plain keyword
        # arguments; newer crawl4ai releases may expect them on a
        # CrawlerRunConfig instead -- confirm against the installed version.
        results = await crawler.arun_many(
            urls=urls,
            screenshot=False,
            cache_mode=CacheMode.BYPASS,
            scan_full_page=True,
            semaphore_count=3,
            wait_for_images=True,
        )

        # Open the log once in write mode: this truncates any previous run
        # and guarantees the handle is closed even if serialisation fails.
        with open(output_path, "w", encoding="utf-8") as log_file:
            for result in results:
                if not result.success:
                    print("[ERROR]", result.url, result.error_message)
                    continue

                json.dump(
                    {"url": result.url, "markdown": result.markdown},
                    log_file,
                )
                # One object per line keeps the file parseable (JSON Lines);
                # back-to-back objects without a separator are not valid JSON.
                log_file.write("\n")

                print("[OK] URL:", result.url)
                print(separator)

        # if result.success:
        #     # Save screenshot
        #     if result.screenshot:
        #         with open("screenshot.png", "wb") as f:
        #             f.write(b64decode(result.screenshot))
        #
        #     # Save PDF
        #     if result.pdf:
        #         with open("download.pdf", "wb") as f:
        #             f.write(result.pdf)
        #
        #     print("[OK] PDF & screenshot captured.")
        # else:
        #     print("[ERROR]", result.error_message)


if __name__ == "__main__":
    # Demo targets crawled when this file is executed directly as a script.
    urls = [
        "https://www.google.com",
        "https://www.amazon.com",
        "https://www.facebook.com",
        "https://www.twitter.com",
        "https://www.instagram.com",
    ]
    # asyncio.run drives the main() coroutine to completion.
    asyncio.run(main(urls=urls))