Spaces:
Sleeping
Sleeping
Commit ·
fab8f73
0
Parent(s):
(feat): add script to fetch data from website
Browse files- scripts/ingest.py +85 -0
scripts/ingest.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import os
|
| 3 |
+
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
| 4 |
+
|
| 5 |
+
# Characters that are not allowed in file names on Windows. URLs with a query
# string (e.g. "...?usp=sharing") would otherwise produce an unwritable path.
_UNSAFE_FILENAME_CHARS = str.maketrans(dict.fromkeys('<>:"\\|?*', "_"))


def _url_to_filename(url):
    """Return a file-system-safe base name (no extension) for *url*.

    The scheme, any "www." and the "locus.com.np_" site prefix are dropped,
    "/" becomes "_", and the site root maps to "index". Characters that are
    reserved in Windows file names are replaced with "_".
    """
    name = url.replace("https://", "").replace("www.", "").replace("/", "_").strip("_")
    if not name or name == "locus.com.np":
        return "index"
    name = name.replace("locus.com.np_", "")
    return name.translate(_UNSAFE_FILENAME_CHARS)


async def main():
    """Crawl a fixed list of URLs and save each page as Markdown under data/.

    Pages are fetched one at a time with a single shared headless browser.
    A failure on one URL (unsuccessful result or raised exception) is printed
    and the loop moves on to the next URL, so one bad page never aborts the
    whole run.
    """
    urls = [
        "https://www.locus.com.np/",
        "https://www.locus.com.np/about-us",
        "https://www.locus.com.np/events",
        "https://www.locus.com.np/zerone",
        "https://www.locus.com.np/sponsors",
        "https://www.locus.com.np/blogs",
        "https://www.locus.com.np/teams",
        "https://www.locus.com.np/contact-us",
        "https://www.locus.com.np/event/HARDWARE-FELLOWSHIP",
        "https://www.locus.com.np/past-locus",
        "https://www.locus.com.np/event/CYBER%20SHIELD",
        "https://www.locus.com.np/event/GIRLS%20LOCUS%20CUP",
        "https://www.locus.com.np/event/HACK-A-WEEK",
        "https://www.locus.com.np/event/CODE-JAM",
        "https://www.locus.com.np/event/GIRLS-TO-CODE",
        "https://www.locus.com.np/event/15%20DAYS%20OF%20LEARNING",
        "https://www.locus.com.np/event/DATAVERSE",
        "https://www.locus.com.np/event/Walkathon",
        "https://www.locus.com.np/event/Flashmob",
        "https://www.locus.com.np/event/Energy%20Hackathon",
        "https://www.locus.com.np/event/LOCUS%20Exhibition",
        "https://www.locus.com.np/event/Dronacharya%202026",
        "https://www.locus.com.np/event/RoboWarz%202026",
        "https://www.locus.com.np/event/RoboSoccer%202026",
        "https://www.locus.com.np/event/RoboPop%202026",
        "https://www.locus.com.np/event/Robo%20Line%20Dash%202026",
        # Important External Links
        "https://medium.com/zerone-magazine/tagged/blogging-competition",
        "https://medium.com/zerone-magazine/tagged/technical-competition",
        "https://drive.google.com/file/u/2/d/1S99bX6EzDScZ2Is4MLLpNWGO4suq-fwj/view?usp=sharing",
        "https://drive.google.com/file/d/1RRV3RALnP9BzUy1vq5KzS0HeH2XGH6HP/view?usp=sharing",
        "https://drive.google.com/drive/folders/14UPc7jaazIt-E7RBRF6IJ9LmFrwzTRjc",
        "https://drive.google.com/file/d/1pJMHmXZUcOCYBG5xt2Do5OMD9xEB7yD5/view",
        "https://drive.google.com/file/d/1mm78B6Hc3oLw3IrT1R6fMyI9C8di5ga5/view",
        "https://drive.google.com/file/d/1wn-QI5akpAgEA2vs8P9jeJDVxRbfNhPm/view?usp=drive_link",
        "https://drive.google.com/file/d/1X-QENlLrFw0GM8NIlmTP4hsKYESjeRbh/view?usp=drive_link",
        "https://drive.google.com/file/d/1b8MlmHNM_0GeYJhxAajb8YC22euZwyZQ/view?usp=drive_link",
        "https://drive.google.com/file/d/14UPRPobB6AXR7YmS1570rtIHgWdVpyDY/view",
    ]

    os.makedirs("data", exist_ok=True)

    browser_config = BrowserConfig(
        headless=True,
        extra_args=["--disable-gpu", "--disable-software-rasterizer", "--no-sandbox"],
    )

    # Always fetch fresh content (cache bypassed), wait for <body> to exist,
    # then pause before capturing the HTML.
    # NOTE(review): per the crawl4ai docs the delay is in seconds and the
    # page timeout in milliseconds -- confirm against the installed version.
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        wait_for="css:body",
        delay_before_return_html=10.0,
        page_timeout=120000,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            print(f"Crawling {url}...")
            # Best-effort per URL: report the problem and keep going.
            try:
                result = await crawler.arun(url=url, config=run_config)

                if not result.success:
                    print(f"Failed to crawl {url}: {result.error_message}")
                    continue

                filepath = f"data/{_url_to_filename(url)}.md"
                with open(filepath, "w", encoding="utf-8") as f:
                    f.write(result.markdown)
                print(f"Saved to {filepath}")
            except Exception as e:
                print(f"Exception during crawl of {url}: {str(e)}")
|
| 83 |
+
|
| 84 |
+
# Script entry point: run the crawl only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())
|