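"""Crawl the LOCUS (locus.com.np) site plus selected external links with
crawl4ai, saving each page's extracted markdown as a separate file under
data/."""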
import asyncio
import os
from urllib.parse import unquote

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

async def main():
    urls = [
        "https://www.locus.com.np/",
        "https://www.locus.com.np/about-us",
        "https://www.locus.com.np/events",
        "https://www.locus.com.np/zerone",
        "https://www.locus.com.np/sponsors",
        "https://www.locus.com.np/blogs",
        "https://www.locus.com.np/teams",
        "https://www.locus.com.np/contact-us",
        "https://www.locus.com.np/event/HARDWARE-FELLOWSHIP",
        "https://www.locus.com.np/past-locus",
        "https://www.locus.com.np/event/CYBER%20SHIELD",
        "https://www.locus.com.np/event/GIRLS%20LOCUS%20CUP",
        "https://www.locus.com.np/event/HACK-A-WEEK",
        "https://www.locus.com.np/event/CODE-JAM",
        "https://www.locus.com.np/event/GIRLS-TO-CODE",
        "https://www.locus.com.np/event/15%20DAYS%20OF%20LEARNING",
        "https://www.locus.com.np/event/DATAVERSE",
        "https://www.locus.com.np/event/Walkathon",
        "https://www.locus.com.np/event/Flashmob",
        "https://www.locus.com.np/event/Energy%20Hackathon",
        "https://www.locus.com.np/event/LOCUS%20Exhibition",
        "https://www.locus.com.np/event/Dronacharya%202026",
        "https://www.locus.com.np/event/RoboWarz%202026",
        "https://www.locus.com.np/event/RoboSoccer%202026",
        "https://www.locus.com.np/event/RoboPop%202026",
        "https://www.locus.com.np/event/Robo%20Line%20Dash%202026",
        # Important external links (note: the Google Drive "view" URLs load a
        # JS preview widget, so the extracted markdown may capture the viewer
        # chrome rather than the underlying file's contents)
        "https://medium.com/zerone-magazine/tagged/blogging-competition",
        "https://medium.com/zerone-magazine/tagged/technical-competition",
        "https://drive.google.com/file/u/2/d/1S99bX6EzDScZ2Is4MLLpNWGO4suq-fwj/view?usp=sharing",
        "https://drive.google.com/file/d/1RRV3RALnP9BzUy1vq5KzS0HeH2XGH6HP/view?usp=sharing",
        "https://drive.google.com/drive/folders/14UPc7jaazIt-E7RBRF6IJ9LmFrwzTRjc",
        "https://drive.google.com/file/d/1pJMHmXZUcOCYBG5xt2Do5OMD9xEB7yD5/view",
        "https://drive.google.com/file/d/1mm78B6Hc3oLw3IrT1R6fMyI9C8di5ga5/view",
        "https://drive.google.com/file/d/1wn-QI5akpAgEA2vs8P9jeJDVxRbfNhPm/view?usp=drive_link",
        "https://drive.google.com/file/d/1X-QENlLrFw0GM8NIlmTP4hsKYESjeRbh/view?usp=drive_link",
        "https://drive.google.com/file/d/1b8MlmHNM_0GeYJhxAajb8YC22euZwyZQ/view?usp=drive_link",
        "https://drive.google.com/file/d/14UPRPobB6AXR7YmS1570rtIHgWdVpyDY/view"
    ]
    
    os.makedirs("data", exist_ok=True)
    
    browser_config = BrowserConfig(
        headless=True,
        # Chromium flags commonly needed when running headless inside
        # containers/CI sandboxes
        extra_args=["--disable-gpu", "--disable-software-rasterizer", "--no-sandbox"]
    )
    
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,        # always fetch fresh pages, skip the cache
        wait_for="css:body",                # wait until the <body> element is present
        delay_before_return_html=10.0,      # give client-side JS 10s to finish rendering
        page_timeout=120000                 # per-page timeout, in milliseconds
    )
    
    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            print(f"Crawling {url}...")
            try:
                result = await crawler.arun(url=url, config=run_config)
                
                if result.success:
                    # Build a filesystem-safe filename from the URL: decode
                    # percent-escapes (e.g. "%20"), then replace path and query
                    # characters ("?" is invalid in Windows filenames)
                    filename = unquote(url).replace("https://", "").replace("www.", "")
                    for ch in "/?&= ":
                        filename = filename.replace(ch, "_")
                    filename = filename.strip("_")
                    if not filename or filename == "locus.com.np":
                        filename = "index"
                    else:
                        filename = filename.replace("locus.com.np_", "")
                    
                    filepath = f"data/{filename}.md"
                    # str() keeps this working across crawl4ai versions, where
                    # result.markdown may be a plain string or a MarkdownGenerationResult
                    with open(filepath, "w", encoding="utf-8") as f:
                        f.write(str(result.markdown))
                    print(f"Saved to {filepath}")
                else:
                    print(f"Failed to crawl {url}: {result.error_message}")
            except Exception as e:
                print(f"Exception during crawl of {url}: {str(e)}")

if __name__ == "__main__":
    asyncio.run(main())
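
# Usage sketch (assumes crawl4ai and its Playwright browsers are installed):
#   pip install crawl4ai
#   crawl4ai-setup         # one-time Playwright browser download
#   python <this_file>.py  # substitute whatever this script is named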