Spaces:
Sleeping
Sleeping
| import asyncio | |
| import os | |
| from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode | |
# Characters that Windows does not allow in file names. Each one is mapped to
# "_" so that a URL carrying a query string (e.g. ".../view?usp=sharing") still
# produces a name that can be created on any platform.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '<>:"\\|?*'})


def _filename_for_url(url):
    """Return the base file name (without extension) under which *url* is saved.

    The scheme and any "www." are dropped and "/" becomes "_". The site root
    maps to "index"; other locus.com.np pages lose the "locus.com.np_" prefix,
    while external URLs keep their host name as a prefix. Characters that are
    illegal in Windows file names are finally replaced with "_".

    Examples:
        "https://www.locus.com.np/"                       -> "index"
        "https://www.locus.com.np/event/CODE-JAM"         -> "event_CODE-JAM"
        "https://drive.google.com/file/d/X/view?usp=a"    -> "drive.google.com_file_d_X_view_usp=a"
    """
    name = url.replace("https://", "").replace("www.", "").replace("/", "_").strip("_")
    if not name or name == "locus.com.np":
        return "index"
    name = name.replace("locus.com.np_", "")
    return name.translate(_UNSAFE_FILENAME_CHARS)


async def main():
    """Crawl a fixed list of pages and save each one as Markdown in ``data/``.

    The URLs are fetched one after another with a single shared crawler.
    Every successful result is written to ``data/<name>.md`` (UTF-8), where
    ``<name>`` comes from ``_filename_for_url``. A failed crawl, or any
    exception raised while crawling or saving, is printed and the loop moves
    on to the next URL, so one bad page never aborts the whole run.
    """
    urls = [
        "https://www.locus.com.np/",
        "https://www.locus.com.np/about-us",
        "https://www.locus.com.np/events",
        "https://www.locus.com.np/zerone",
        "https://www.locus.com.np/sponsors",
        "https://www.locus.com.np/blogs",
        "https://www.locus.com.np/teams",
        "https://www.locus.com.np/contact-us",
        "https://www.locus.com.np/event/HARDWARE-FELLOWSHIP",
        "https://www.locus.com.np/past-locus",
        "https://www.locus.com.np/event/CYBER%20SHIELD",
        "https://www.locus.com.np/event/GIRLS%20LOCUS%20CUP",
        "https://www.locus.com.np/event/HACK-A-WEEK",
        "https://www.locus.com.np/event/CODE-JAM",
        "https://www.locus.com.np/event/GIRLS-TO-CODE",
        "https://www.locus.com.np/event/15%20DAYS%20OF%20LEARNING",
        "https://www.locus.com.np/event/DATAVERSE",
        "https://www.locus.com.np/event/Walkathon",
        "https://www.locus.com.np/event/Flashmob",
        "https://www.locus.com.np/event/Energy%20Hackathon",
        "https://www.locus.com.np/event/LOCUS%20Exhibition",
        "https://www.locus.com.np/event/Dronacharya%202026",
        "https://www.locus.com.np/event/RoboWarz%202026",
        "https://www.locus.com.np/event/RoboSoccer%202026",
        "https://www.locus.com.np/event/RoboPop%202026",
        "https://www.locus.com.np/event/Robo%20Line%20Dash%202026",
        # Important External Links
        "https://medium.com/zerone-magazine/tagged/blogging-competition",
        "https://medium.com/zerone-magazine/tagged/technical-competition",
        "https://drive.google.com/file/u/2/d/1S99bX6EzDScZ2Is4MLLpNWGO4suq-fwj/view?usp=sharing",
        "https://drive.google.com/file/d/1RRV3RALnP9BzUy1vq5KzS0HeH2XGH6HP/view?usp=sharing",
        "https://drive.google.com/drive/folders/14UPc7jaazIt-E7RBRF6IJ9LmFrwzTRjc",
        "https://drive.google.com/file/d/1pJMHmXZUcOCYBG5xt2Do5OMD9xEB7yD5/view",
        "https://drive.google.com/file/d/1mm78B6Hc3oLw3IrT1R6fMyI9C8di5ga5/view",
        "https://drive.google.com/file/d/1wn-QI5akpAgEA2vs8P9jeJDVxRbfNhPm/view?usp=drive_link",
        "https://drive.google.com/file/d/1X-QENlLrFw0GM8NIlmTP4hsKYESjeRbh/view?usp=drive_link",
        "https://drive.google.com/file/d/1b8MlmHNM_0GeYJhxAajb8YC22euZwyZQ/view?usp=drive_link",
        "https://drive.google.com/file/d/14UPRPobB6AXR7YmS1570rtIHgWdVpyDY/view",
    ]

    os.makedirs("data", exist_ok=True)

    browser_config = BrowserConfig(
        headless=True,
        extra_args=["--disable-gpu", "--disable-software-rasterizer", "--no-sandbox"],
    )
    # NOTE(review): per the crawl4ai docs, delay_before_return_html is in
    # seconds and page_timeout in milliseconds -- confirm against the
    # installed version. The long delay presumably gives client-rendered
    # pages time to finish loading before the HTML is captured.
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        wait_for="css:body",
        delay_before_return_html=10.0,
        page_timeout=120000,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            print(f"Crawling {url}...")
            # Deliberately broad: this is a best-effort batch job, so any
            # error for one URL is reported and the remaining URLs still run.
            try:
                result = await crawler.arun(url=url, config=run_config)
                if not result.success:
                    print(f"Failed to crawl {url}: {result.error_message}")
                    continue

                filepath = f"data/{_filename_for_url(url)}.md"
                with open(filepath, "w", encoding="utf-8") as f:
                    f.write(result.markdown)
                print(f"Saved to {filepath}")
            except Exception as e:
                print(f"Exception during crawl of {url}: {e}")
# Script entry point: run the crawl only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())