"""
IJNet Public Data Scraper

Scrapes publicly available opportunity listings and articles from ijnet.org
to build a knowledge base for the RAG chatbot.
"""

import json
import re
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path
from urllib.parse import urljoin

try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    # NOTE(review): installing packages as an import-time side effect (and
    # with --break-system-packages) is risky; kept as-is so the script's
    # existing "just run it" behaviour does not change.
    print("Installing required packages...")
    import subprocess

    subprocess.check_call(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            "requests",
            "beautifulsoup4",
            "--break-system-packages",
            "-q",
        ]
    )
    import requests
    from bs4 import BeautifulSoup

BASE_URL = "https://ijnet.org"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; IJNetBot/1.0; +research-prototype)"
}

# Tunables shared by every scraper below.
_REQUEST_TIMEOUT_S = 15
_POLITE_DELAY_S = 1  # pause after every request so we do not hammer the site
_SUMMARY_MAX_CHARS = 500
_DETAIL_MAX_CHARS = 3000
_MAX_DETAIL_FETCHES = 5

# CSS-class patterns, compiled once instead of once per scraped card.
_OPPORTUNITY_CARD_RE = re.compile(r"node--type-opportunity|views-row|opportunity")
_OPPORTUNITY_BODY_RE = re.compile(
    r"field--name-body|field--name-field-summary|summary|teaser"
)
_DEADLINE_RE = re.compile(r"date|deadline")
_ARTICLE_CARD_RE = re.compile(r"views-row|node--type")
_ARTICLE_BODY_RE = re.compile(r"field--name-body|summary|teaser")
_DETAIL_BODY_RE = re.compile(r"field--name-body")


def _fetch_soup(url: str) -> BeautifulSoup:
    """GET *url* and return its parsed HTML; raises on network/HTTP errors."""
    resp = requests.get(url, headers=HEADERS, timeout=_REQUEST_TIMEOUT_S)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def _absolute_url(href: str) -> str:
    """Resolve *href* against BASE_URL.

    ``urljoin`` leaves absolute URLs untouched and also handles hrefs without
    a leading slash and protocol-relative (``//host/path``) hrefs, which plain
    string concatenation would turn into broken URLs.
    """
    return urljoin(BASE_URL + "/", href)


def _find_cards(soup, fallback_class_re):
    """Return all ``<article>`` tags, or class-matched ``<div>``s if none exist."""
    return soup.find_all("article") or soup.find_all("div", class_=fallback_class_re)


def _extract_title_and_url(node, record: dict) -> None:
    """Fill ``record["title"]`` / ``record["url"]`` from the first h2-h4 in *node*."""
    heading = node.find(["h2", "h3", "h4"])
    if heading is None:
        return
    link = heading.find("a")
    # Prefer the anchor text; fall back to the heading's own text.
    record["title"] = (link if link is not None else heading).get_text(strip=True)
    if link is not None and link.get("href"):
        record["url"] = _absolute_url(link["href"])


def _extract_description(node, body_class_re, record: dict) -> None:
    """Fill ``record["description"]`` with the truncated text of the summary div."""
    body = node.find("div", class_=body_class_re)
    if body is not None:
        record["description"] = body.get_text(strip=True)[:_SUMMARY_MAX_CHARS]


def scrape_opportunities(max_pages: int = 3) -> list[dict]:
    """Scrape opportunity listings from IJNet's public opportunities page.

    Args:
        max_pages: Number of listing pages to request (``?page=0`` upward).

    Returns:
        One dict per listing that has a title, with keys ``title`` and, when
        found on the card, ``url``, ``description`` (at most 500 characters)
        and ``deadline``. A page that fails to load is reported on stdout and
        skipped; items collected so far are kept.
    """
    opportunities = []

    for page in range(max_pages):
        url = f"{BASE_URL}/opportunities?page={page}"
        print(f" Fetching: {url}")
        try:
            soup = _fetch_soup(url)
            for card in _find_cards(soup, _OPPORTUNITY_CARD_RE):
                opp = {}
                _extract_title_and_url(card, opp)
                _extract_description(card, _OPPORTUNITY_BODY_RE, opp)

                deadline_el = card.find("time") or card.find(
                    "span", class_=_DEADLINE_RE
                )
                if deadline_el is not None:
                    opp["deadline"] = deadline_el.get_text(strip=True)

                if opp.get("title"):
                    opportunities.append(opp)
        except Exception as e:  # best-effort: one bad page must not abort the run
            print(f" Warning: Could not fetch {url}: {e}")

        # Be polite after every request, including failed ones.
        time.sleep(_POLITE_DELAY_S)

    return opportunities


def scrape_articles(max_pages: int = 2) -> list[dict]:
    """Scrape article/resource listings from IJNet.

    Args:
        max_pages: Maximum number of listing pages to request per section.
            The first page is the bare section URL; later pages append
            ``?page=N``.

    Returns:
        One dict per titled item with keys ``section``, ``title`` and, when
        found, ``url`` and ``description`` (at most 500 characters). Items are
        de-duplicated within a section by URL (or title when there is no URL).
    """
    articles_list = []

    # Try different content sections
    sections = [
        "/latest-stories",
        "/resources",
    ]

    for section in sections:
        section_name = section.strip("/")
        seen = set()

        for page in range(max_pages):
            # assumes these listings accept the same zero-based ?page=N query
            # that /opportunities uses - TODO confirm against the live site
            url = f"{BASE_URL}{section}"
            if page:
                url = f"{url}?page={page}"
            print(f" Fetching: {url}")

            added = 0
            try:
                soup = _fetch_soup(url)
                for item in _find_cards(soup, _ARTICLE_CARD_RE):
                    art = {"section": section_name}
                    _extract_title_and_url(item, art)
                    _extract_description(item, _ARTICLE_BODY_RE, art)

                    if not art.get("title"):
                        continue
                    key = art.get("url") or art["title"]
                    if key in seen:
                        continue
                    seen.add(key)
                    articles_list.append(art)
                    added += 1
            except Exception as e:  # best-effort: keep going with other sections
                print(f" Warning: Could not fetch {url}: {e}")

            time.sleep(_POLITE_DELAY_S)

            # Nothing new (failed request, empty page, or the site ignored the
            # page parameter and served the same items): stop this section.
            if not added:
                break

    return articles_list


def scrape_article_detail(url: str) -> str:
    """Fetch full text of a single article page.

    Returns:
        Up to 3000 characters of newline-separated text from the first of: the
        body-field div, the ``<article>`` tag, or ``<main>``. Returns ``""``
        when no such container exists or the request fails (the failure is
        reported on stdout).
    """
    try:
        soup = _fetch_soup(url)

        # Try to find the main content area
        content = (
            soup.find("div", class_=_DETAIL_BODY_RE)
            or soup.find("article")
            or soup.find("main")
        )
        if content is not None:
            # Remove scripts, styles, nav elements
            for tag in content.find_all(["script", "style", "nav", "footer"]):
                tag.decompose()
            return content.get_text(separator="\n", strip=True)[:_DETAIL_MAX_CHARS]
    except Exception as e:
        print(f" Warning: Could not fetch article detail {url}: {e}")
    return ""


def run_scraper():
    """Main scraper entry point.

    Scrapes opportunities and articles, fetches full text for the first few
    articles, writes everything to ``data/scraped_ijnet.json`` next to this
    file, and returns the same dict that was written.
    """
    data_dir = Path(__file__).parent / "data"
    data_dir.mkdir(exist_ok=True)

    print("\n=== IJNet Data Scraper ===\n")

    # --- Scrape opportunities ---
    print("[1/3] Scraping opportunities...")
    opportunities = scrape_opportunities(max_pages=3)
    print(f" Found {len(opportunities)} opportunities from scraping")

    # --- Scrape articles ---
    print("[2/3] Scraping articles & resources...")
    articles = scrape_articles(max_pages=2)
    print(f" Found {len(articles)} articles/resources from scraping")

    # --- Fetch article details for a few ---
    print("[3/3] Fetching article details...")
    articles_with_detail = []
    for art in articles[:_MAX_DETAIL_FETCHES]:  # Limit to avoid hammering
        if not art.get("url"):
            continue
        detail = scrape_article_detail(art["url"])
        if detail:
            # Mutates the dict shared with `articles`, so the full text shows
            # up under both keys of the saved JSON.
            art["full_text"] = detail
            articles_with_detail.append(art)
        time.sleep(_POLITE_DELAY_S)

    scraped = {
        "opportunities": opportunities,
        "articles": articles,
        "articles_with_detail": articles_with_detail,
        "scraped_at": datetime.now().isoformat(),
    }

    out_path = data_dir / "scraped_ijnet.json"
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly; the platform default encoding may not be
    # able to represent them.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(scraped, f, indent=2, ensure_ascii=False)

    print(
        f"\nSaved {len(opportunities)} opportunities + {len(articles)} articles to {out_path}"
    )
    return scraped


if __name__ == "__main__":
    run_scraper()