# Batch crawler: fetches a fixed list of Locus (locus.com.np) pages and
# related external links with Crawl4AI, saving each page as markdown in data/.
import asyncio
import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
async def main():
    """Crawl every URL in the list and save each page's markdown to data/.

    Creates the output directory if missing, launches a single headless
    browser session, and writes one ``data/<name>.md`` file per successfully
    crawled URL. Failures are reported to stdout and do not stop the run.
    """
    urls = [
        "https://www.locus.com.np/",
        "https://www.locus.com.np/about-us",
        "https://www.locus.com.np/events",
        "https://www.locus.com.np/zerone",
        "https://www.locus.com.np/sponsors",
        "https://www.locus.com.np/blogs",
        "https://www.locus.com.np/teams",
        "https://www.locus.com.np/contact-us",
        "https://www.locus.com.np/event/HARDWARE-FELLOWSHIP",
        "https://www.locus.com.np/past-locus",
        "https://www.locus.com.np/event/CYBER%20SHIELD",
        "https://www.locus.com.np/event/GIRLS%20LOCUS%20CUP",
        "https://www.locus.com.np/event/HACK-A-WEEK",
        "https://www.locus.com.np/event/CODE-JAM",
        "https://www.locus.com.np/event/GIRLS-TO-CODE",
        "https://www.locus.com.np/event/15%20DAYS%20OF%20LEARNING",
        "https://www.locus.com.np/event/DATAVERSE",
        "https://www.locus.com.np/event/Walkathon",
        "https://www.locus.com.np/event/Flashmob",
        "https://www.locus.com.np/event/Energy%20Hackathon",
        "https://www.locus.com.np/event/LOCUS%20Exhibition",
        "https://www.locus.com.np/event/Dronacharya%202026",
        "https://www.locus.com.np/event/RoboWarz%202026",
        "https://www.locus.com.np/event/RoboSoccer%202026",
        "https://www.locus.com.np/event/RoboPop%202026",
        "https://www.locus.com.np/event/Robo%20Line%20Dash%202026",
        # Important External Links
        "https://medium.com/zerone-magazine/tagged/blogging-competition",
        "https://medium.com/zerone-magazine/tagged/technical-competition",
        "https://drive.google.com/file/u/2/d/1S99bX6EzDScZ2Is4MLLpNWGO4suq-fwj/view?usp=sharing",
        "https://drive.google.com/file/d/1RRV3RALnP9BzUy1vq5KzS0HeH2XGH6HP/view?usp=sharing",
        "https://drive.google.com/drive/folders/14UPc7jaazIt-E7RBRF6IJ9LmFrwzTRjc",
        "https://drive.google.com/file/d/1pJMHmXZUcOCYBG5xt2Do5OMD9xEB7yD5/view",
        "https://drive.google.com/file/d/1mm78B6Hc3oLw3IrT1R6fMyI9C8di5ga5/view",
        "https://drive.google.com/file/d/1wn-QI5akpAgEA2vs8P9jeJDVxRbfNhPm/view?usp=drive_link",
        "https://drive.google.com/file/d/1X-QENlLrFw0GM8NIlmTP4hsKYESjeRbh/view?usp=drive_link",
        "https://drive.google.com/file/d/1b8MlmHNM_0GeYJhxAajb8YC22euZwyZQ/view?usp=drive_link",
        "https://drive.google.com/file/d/14UPRPobB6AXR7YmS1570rtIHgWdVpyDY/view",
    ]
    os.makedirs("data", exist_ok=True)
    browser_config = BrowserConfig(
        headless=True,
        extra_args=["--disable-gpu", "--disable-software-rasterizer", "--no-sandbox"],
    )
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,      # always fetch fresh content
        wait_for="css:body",
        delay_before_return_html=10.0,    # give JS-rendered pages time to settle
        page_timeout=120000,              # ms
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            print(f"Crawling {url}...")
            try:
                result = await crawler.arun(url=url, config=run_config)
                if result.success:
                    # Create a safe filename from the URL
                    filename = url.replace("https://", "").replace("www.", "").replace("/", "_").strip("_")
                    if not filename or filename == "locus.com.np":
                        filename = "index"
                    else:
                        filename = filename.replace("locus.com.np_", "")
                    # BUGFIX: previously a literal "(unknown)" placeholder was
                    # written here, so every page overwrote the same file.
                    filepath = f"data/{filename}.md"
                    with open(filepath, "w", encoding="utf-8") as f:
                        f.write(result.markdown)
                    print(f"Saved to {filepath}")
                else:
                    print(f"Failed to crawl {url}: {result.error_message}")
            except Exception as e:
                # Best-effort batch run: report the failure and keep going.
                print(f"Exception during crawl of {url}: {str(e)}")
# Script entry point: only run the crawler when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())