# Spaces: sohamw03 / knowledge-net (Paused) - File size: 1,974 Bytes

import asyncio
import json

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode


async def main(urls) -> None:
    """Crawl every URL in *urls* and save the extracted markdown as JSON.

    A headless Chromium browser (1920x1080 viewport, downloads accepted) is
    started and all URLs are crawled in one ``arun_many`` call with the cache
    bypassed. Each successful result is reported on stdout and collected;
    each failed result is reported with its error message. Finally the
    collected records are written to ``output.log.json`` as a single JSON
    array of ``{"url": ..., "markdown": ...}`` objects, replacing any
    previous content of that file.

    Args:
        urls: The URLs (strings) to crawl.
    """
    output_path = "output.log.json"
    # Horizontal rule printed after each successfully crawled URL.
    separator = "-" * 160

    base_browser = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=1920,
        viewport_height=1080,
        accept_downloads=True,
    )

    async with AsyncWebCrawler(config=base_browser) as crawler:
        # Crawl all URLs in a single batch call.
        results = await crawler.arun_many(
            urls=urls,
            screenshot=False,
            cache_mode=CacheMode.BYPASS,
            scan_full_page=True,
            semaphore_count=3,
            wait_for_images=True,
        )

        records = []
        for result in results:
            if not result.success:
                # Surface failures instead of dropping them silently.
                print("[ERROR] URL:", result.url, "-", result.error_message)
                continue
            records.append(
                {
                    "url": result.url,
                    "markdown": result.markdown,
                }
            )
            print("[OK] URL:", result.url)
            print(separator)

        # Write once, as one JSON array, so the file is valid JSON; dumping
        # one object per result back-to-back produced an unparseable file.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(records, f)

        # if result.success:
        #     # Save screenshot
        #     if result.screenshot:
        #         with open("screenshot.png", "wb") as f:
        #             f.write(b64decode(result.screenshot))
        #
        #     # Save PDF
        #     if result.pdf:
        #         with open("download.pdf", "wb") as f:
        #             f.write(result.pdf)
        #
        #     print("[OK] PDF & screenshot captured.")
        # else:
        #     print("[ERROR]", result.error_message)


if __name__ == "__main__":
    # Script entry point: crawl a fixed list of well-known home pages.
    demo_urls = [
        "https://www.google.com",
        "https://www.amazon.com",
        "https://www.facebook.com",
        "https://www.twitter.com",
        "https://www.instagram.com",
    ]
    # Drive the async crawl to completion on a fresh event loop.
    crawl_job = main(demo_urls)
    asyncio.run(crawl_job)