IJNet-assistant / scraper.py
Mohammad Haris
Deploy IJNet assistant
b87aca1
Raw
History Blame Contribute Delete
6.74 kB
"""
IJNet Public Data Scraper
Scrapes publicly available opportunity listings and articles from ijnet.org
to build a knowledge base for the RAG chatbot.
"""
import json
import re
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path
try:
import requests
from bs4 import BeautifulSoup
except ImportError:
print("Installing required packages...")
import subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install",
"requests", "beautifulsoup4", "--break-system-packages", "-q"])
import requests
from bs4 import BeautifulSoup
# Site root that every listing URL requested by this scraper is built from.
BASE_URL = "https://ijnet.org"
# Headers sent with every HTTP request; the User-Agent labels the client as
# "IJNetBot/1.0" with a "+research-prototype" note.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; IJNetBot/1.0; +research-prototype)"
}
# Class-name patterns for the listing markup, compiled once instead of once per card.
_OPP_CARD_CLASS = re.compile(r"node--type-opportunity|views-row|opportunity")
_OPP_BODY_CLASS = re.compile(r"field--name-body|field--name-field-summary|summary|teaser")
_OPP_DEADLINE_CLASS = re.compile(r"date|deadline")


def _parse_opportunity_card(card, page_url: str) -> dict:
    """Extract title, url, description and deadline from one listing card.

    Returns an empty dict when the card has no heading text, so the caller
    can skip it with a plain truthiness check.
    """
    # Local import keeps this block usable without touching the file's import header.
    from urllib.parse import urljoin

    title_tag = card.find(["h2", "h3", "h4"])
    if title_tag is None:
        return {}

    link = title_tag.find("a")
    title = (link or title_tag).get_text(strip=True)
    if not title:
        return {}

    opp = {"title": title}
    if link and link.get("href"):
        # urljoin leaves absolute URLs untouched and correctly resolves
        # root-relative ("/x"), path-relative ("x") and protocol-relative
        # ("//host/x") hrefs; plain BASE_URL + href only handled the first.
        opp["url"] = urljoin(page_url, link["href"])

    body = card.find("div", class_=_OPP_BODY_CLASS)
    if body:
        # Keep only a short teaser of the description.
        opp["description"] = body.get_text(strip=True)[:500]

    deadline_el = card.find("time") or card.find("span", class_=_OPP_DEADLINE_CLASS)
    if deadline_el:
        opp["deadline"] = deadline_el.get_text(strip=True)

    return opp


def scrape_opportunities(max_pages: int = 3) -> list[dict]:
    """Scrape opportunity listings from IJNet's public opportunities page.

    Args:
        max_pages: Number of listing pages to request, starting at page 0.

    Returns:
        One dict per card that has a title. "title" is always present;
        "url", "description" (first 500 characters) and "deadline" are
        included only when the matching element is found in the card.

    A page that fails to download or parse is reported on stdout and
    skipped; the remaining pages are still attempted.
    """
    opportunities = []
    for page in range(max_pages):
        url = f"{BASE_URL}/opportunities?page={page}"
        print(f" Fetching: {url}")
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")

            # Prefer <article> cards; fall back to <div>s with listing-like classes.
            cards = soup.find_all("article") or soup.find_all("div", class_=_OPP_CARD_CLASS)
            for card in cards:
                opp = _parse_opportunity_card(card, url)
                if opp:
                    opportunities.append(opp)
        except Exception as e:
            # Deliberately broad: scraping is best-effort, so one bad page
            # (network or parsing failure) must not abort the whole run.
            print(f" Warning: Could not fetch {url}: {e}")
        # Pause after every attempt, including failed ones, so an erroring
        # server is not hit again immediately.
        time.sleep(1)  # Be polite
    return opportunities
# Class-name patterns for the listing markup, compiled once instead of once per item.
_ART_ITEM_CLASS = re.compile(r"views-row|node--type")
_ART_BODY_CLASS = re.compile(r"field--name-body|summary|teaser")


def _parse_article_item(item, section: str, page_url: str) -> dict:
    """Extract section, title, url and description from one listing item.

    Returns an empty dict when the item has no heading text, so the caller
    can skip it with a plain truthiness check.
    """
    # Local import keeps this block usable without touching the file's import header.
    from urllib.parse import urljoin

    title_tag = item.find(["h2", "h3", "h4"])
    if title_tag is None:
        return {}

    link = title_tag.find("a")
    title = (link or title_tag).get_text(strip=True)
    if not title:
        return {}

    art = {"section": section.strip("/"), "title": title}
    if link and link.get("href"):
        # urljoin leaves absolute URLs untouched and correctly resolves
        # root-relative ("/x"), path-relative ("x") and protocol-relative
        # ("//host/x") hrefs; plain BASE_URL + href only handled the first.
        art["url"] = urljoin(page_url, link["href"])

    body = item.find("div", class_=_ART_BODY_CLASS)
    if body:
        # Keep only a short teaser of the description.
        art["description"] = body.get_text(strip=True)[:500]

    return art


def scrape_articles(max_pages: int = 2) -> list[dict]:
    """Scrape article/resource listings from IJNet.

    Args:
        max_pages: Maximum number of listing pages to request per section.
            Previously this argument was accepted but ignored and exactly
            one page per section was fetched.

    Returns:
        One dict per distinct titled item, with "section" and "title"
        always present and "url" / "description" (first 500 characters)
        when found. Items are de-duplicated within a section by URL
        (or by title when there is no link).

    A page that fails to download or parse is reported on stdout and ends
    pagination for that section; the next section is still attempted.
    """
    articles_list = []
    # Try different content sections
    sections = [
        "/latest-stories",
        "/resources",
    ]
    for section in sections:
        seen = set()
        for page in range(max_pages):
            # The first page keeps the bare section URL, exactly as before.
            # NOTE(review): later pages assume these listings accept the same
            # "?page=N" query used for /opportunities - confirm against the
            # site. The de-duplication below keeps the result sane if not.
            url = f"{BASE_URL}{section}"
            if page:
                url = f"{url}?page={page}"
            print(f" Fetching: {url}")

            added = 0
            try:
                resp = requests.get(url, headers=HEADERS, timeout=15)
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, "html.parser")

                # Prefer <article> items; fall back to <div>s with listing-like classes.
                items = soup.find_all("article") or soup.find_all("div", class_=_ART_ITEM_CLASS)
                for item in items:
                    art = _parse_article_item(item, section, url)
                    if not art:
                        continue
                    key = art.get("url") or art["title"]
                    if key in seen:
                        continue
                    seen.add(key)
                    articles_list.append(art)
                    added += 1
            except Exception as e:
                # Deliberately broad: scraping is best-effort, so one bad page
                # must not abort the whole run.
                print(f" Warning: Could not fetch {url}: {e}")
            # Pause after every attempt, including failed ones.
            time.sleep(1)

            if not added:
                # Nothing new (empty page, repeated content, or an error):
                # further pages of this section are not worth requesting.
                break
    return articles_list
def scrape_article_detail(url: str) -> str:
    """Download one article page and return its readable text.

    The text comes from the first of these that exists on the page: a
    <div> whose class matches "field--name-body", the first <article>,
    or the first <main>. Script, style, nav and footer elements inside
    that container are dropped, and the result is capped at 3000
    characters. Returns "" when no container is found or when anything
    goes wrong (the failure is reported on stdout).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")

        # Walk the fallbacks from most to least specific container.
        container = page.find("div", class_=re.compile(r"field--name-body"))
        if not container:
            container = page.find("article")
        if not container:
            container = page.find("main")
        if not container:
            return ""

        # Strip non-content elements before extracting the text.
        for unwanted in container.find_all(["script", "style", "nav", "footer"]):
            unwanted.decompose()

        text = container.get_text(separator="\n", strip=True)
        return text[:3000]
    except Exception as e:
        print(f" Warning: Could not fetch article detail {url}: {e}")
        return ""
def run_scraper():
    """Main scraper entry point.

    Runs the three scraping stages (opportunity listings, article/resource
    listings, and the full text of up to five of the listed articles),
    writes the combined result to ``data/scraped_ijnet.json`` next to this
    file, and returns it.

    Returns:
        dict with keys "opportunities", "articles", "articles_with_detail"
        and "scraped_at" (naive local-time ISO 8601 timestamp).
    """
    data_dir = Path(__file__).parent / "data"
    data_dir.mkdir(exist_ok=True)

    print("\n=== IJNet Data Scraper ===\n")

    # --- Scrape opportunities ---
    print("[1/3] Scraping opportunities...")
    opportunities = scrape_opportunities(max_pages=3)
    print(f" Found {len(opportunities)} opportunities from scraping")

    # --- Scrape articles ---
    print("[2/3] Scraping articles & resources...")
    articles = scrape_articles(max_pages=2)
    print(f" Found {len(articles)} articles/resources from scraping")

    # --- Fetch article details for a few ---
    print("[3/3] Fetching article details...")
    articles_with_detail = []
    for art in articles[:5]:  # Limit to avoid hammering
        if not art.get("url"):
            continue
        detail = scrape_article_detail(art["url"])
        if detail:
            # `art` is the same dict object that sits in `articles`, so the
            # entry carries "full_text" in both output lists.
            art["full_text"] = detail
            articles_with_detail.append(art)
        time.sleep(1)  # pause between detail requests

    scraped = {
        "opportunities": opportunities,
        "articles": articles,
        "articles_with_detail": articles_with_detail,
        "scraped_at": datetime.now().isoformat(),
    }

    out_path = data_dir / "scraped_ijnet.json"
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # must be opened as UTF-8 explicitly; the platform default encoding
    # (e.g. cp1252 on Windows) would raise UnicodeEncodeError on such text.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(scraped, f, indent=2, ensure_ascii=False)

    print(f"\nSaved {len(opportunities)} opportunities + {len(articles)} articles to {out_path}")
    return scraped
# Run the full scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_scraper()