khagu committed
Commit fab8f73 · 0 Parent(s):

(feat): add script to fetch data from website

Files changed (1)
  1. scripts/ingest.py +85 -0
scripts/ingest.py ADDED
@@ -0,0 +1,85 @@
+import asyncio
+import os
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+
+async def main():
+    urls = [
+        "https://www.locus.com.np/",
+        "https://www.locus.com.np/about-us",
+        "https://www.locus.com.np/events",
+        "https://www.locus.com.np/zerone",
+        "https://www.locus.com.np/sponsors",
+        "https://www.locus.com.np/blogs",
+        "https://www.locus.com.np/teams",
+        "https://www.locus.com.np/contact-us",
+        "https://www.locus.com.np/event/HARDWARE-FELLOWSHIP",
+        "https://www.locus.com.np/past-locus",
+        "https://www.locus.com.np/event/CYBER%20SHIELD",
+        "https://www.locus.com.np/event/GIRLS%20LOCUS%20CUP",
+        "https://www.locus.com.np/event/HACK-A-WEEK",
+        "https://www.locus.com.np/event/CODE-JAM",
+        "https://www.locus.com.np/event/GIRLS-TO-CODE",
+        "https://www.locus.com.np/event/15%20DAYS%20OF%20LEARNING",
+        "https://www.locus.com.np/event/DATAVERSE",
+        "https://www.locus.com.np/event/Walkathon",
+        "https://www.locus.com.np/event/Flashmob",
+        "https://www.locus.com.np/event/Energy%20Hackathon",
+        "https://www.locus.com.np/event/LOCUS%20Exhibition",
+        "https://www.locus.com.np/event/Dronacharya%202026",
+        "https://www.locus.com.np/event/RoboWarz%202026",
+        "https://www.locus.com.np/event/RoboSoccer%202026",
+        "https://www.locus.com.np/event/RoboPop%202026",
+        "https://www.locus.com.np/event/Robo%20Line%20Dash%202026",
+        # Important External Links
+        "https://medium.com/zerone-magazine/tagged/blogging-competition",
+        "https://medium.com/zerone-magazine/tagged/technical-competition",
+        "https://drive.google.com/file/u/2/d/1S99bX6EzDScZ2Is4MLLpNWGO4suq-fwj/view?usp=sharing",
+        "https://drive.google.com/file/d/1RRV3RALnP9BzUy1vq5KzS0HeH2XGH6HP/view?usp=sharing",
+        "https://drive.google.com/drive/folders/14UPc7jaazIt-E7RBRF6IJ9LmFrwzTRjc",
+        "https://drive.google.com/file/d/1pJMHmXZUcOCYBG5xt2Do5OMD9xEB7yD5/view",
+        "https://drive.google.com/file/d/1mm78B6Hc3oLw3IrT1R6fMyI9C8di5ga5/view",
+        "https://drive.google.com/file/d/1wn-QI5akpAgEA2vs8P9jeJDVxRbfNhPm/view?usp=drive_link",
+        "https://drive.google.com/file/d/1X-QENlLrFw0GM8NIlmTP4hsKYESjeRbh/view?usp=drive_link",
+        "https://drive.google.com/file/d/1b8MlmHNM_0GeYJhxAajb8YC22euZwyZQ/view?usp=drive_link",
+        "https://drive.google.com/file/d/14UPRPobB6AXR7YmS1570rtIHgWdVpyDY/view"
+    ]
+
+    os.makedirs("data", exist_ok=True)
+
+    browser_config = BrowserConfig(
+        headless=True,
+        extra_args=["--disable-gpu", "--disable-software-rasterizer", "--no-sandbox"]
+    )
+
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        wait_for="css:body",
+        delay_before_return_html=10.0,
+        page_timeout=120000
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        for url in urls:
+            print(f"Crawling {url}...")
+            try:
+                result = await crawler.arun(url=url, config=run_config)
+
+                if result.success:
+                    # Create a safe filename from the URL
+                    filename = url.replace("https://", "").replace("www.", "").replace("/", "_").strip("_")
+                    if not filename or filename == "locus.com.np":
+                        filename = "index"
+                    else:
+                        filename = filename.replace("locus.com.np_", "")
+
+                    filepath = f"data/{filename}.md"
+                    with open(filepath, "w", encoding="utf-8") as f:
+                        f.write(result.markdown)
+                    print(f"Saved to {filepath}")
+                else:
+                    print(f"Failed to crawl {url}: {result.error_message}")
+            except Exception as e:
+                print(f"Exception during crawl of {url}: {str(e)}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
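The script writes one markdown file per crawled URL into data/. Below is a minimal sketch, not part of this commit, of how that output could be read back for downstream processing; the helper name load_crawled_pages is illustrative, and it only assumes the script above has already been run so that data/ contains .md files.

# Illustrative only: read the crawled markdown back from data/.
from pathlib import Path

def load_crawled_pages(data_dir: str = "data") -> dict[str, str]:
    """Return a mapping of page name -> markdown text for every crawled file."""
    pages = {}
    for path in Path(data_dir).glob("*.md"):
        pages[path.stem] = path.read_text(encoding="utf-8")
    return pages

if __name__ == "__main__":
    pages = load_crawled_pages()
    print(f"Loaded {len(pages)} pages")

Filenames come from the URL-sanitising logic in ingest.py (e.g. the site root is saved as index.md), so the dictionary keys mirror the site paths.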