# NOTE(review): the lines that followed were Hugging Face Spaces file-viewer
# chrome (status header, commit hashes, line-number gutter) fused into the
# paste — not program text. Converted to this comment so the file parses.
import asyncio
import json
from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode
async def main(urls):
    """Crawl *urls* concurrently and record each successful page's markdown.

    Results are written to ``output.log.json`` as JSON Lines (one JSON
    object per line), so the file stays machine-readable even though it is
    written incrementally.

    Args:
        urls: iterable of URL strings to crawl.
    """
    base_browser = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=1920,
        viewport_height=1080,
        accept_downloads=True,
    )
    # Create an instance of AsyncWebCrawler
    async with AsyncWebCrawler(config=base_browser) as crawler:
        # Run the crawler on all URLs (at most 3 concurrently).
        results = await crawler.arun_many(
            urls=urls,
            screenshot=False,
            cache_mode=CacheMode.BYPASS,
            scan_full_page=True,
            semaphore_count=3,
            wait_for_images=True,
        )
        # Single writer in "w" mode replaces the old truncate-then-append
        # dance; each record gets its own line so the output is valid JSONL
        # (the old code concatenated objects with no separator).
        with open("output.log.json", "w", encoding="utf-8") as f:
            for result in results:
                if result.success:
                    dump_result = {
                        "url": result.url,
                        "markdown": result.markdown,
                    }
                    json.dump(dump_result, f, ensure_ascii=False)
                    f.write("\n")
                    # Print the extracted content marker + a 160-char rule.
                    print("[OK] URL:", result.url)
                    print(("-" * 80) * 2)
                else:
                    # Failed crawls were previously dropped silently.
                    print("[ERROR]", result.error_message)
if __name__ == "__main__":
    # Seed list of sites to crawl when run as a script.
    seed_urls = [
        "https://www.google.com",
        "https://www.amazon.com",
        "https://www.facebook.com",
        "https://www.twitter.com",
        "https://www.instagram.com",
    ]
    asyncio.run(main(seed_urls))