Spaces:

ThatITGuy
/

IJNet-assistant

Sleeping

Mohammad Haris

Deploy IJNet assistant

b87aca1 about 1 month ago

6.74 kB

	"""
	IJNet Public Data Scraper
	Scrapes publicly available opportunity listings and articles from ijnet.org
	to build a knowledge base for the RAG chatbot.
	"""

	import json
	import re
	import sys
	import time
	from datetime import datetime, timedelta
	from pathlib import Path

	try:
	import requests
	from bs4 import BeautifulSoup
	except ImportError:
	print("Installing required packages...")
	import subprocess
	subprocess.check_call([sys.executable, "-m", "pip", "install",
	"requests", "beautifulsoup4", "--break-system-packages", "-q"])
	import requests
	from bs4 import BeautifulSoup


	# Root of the site; every request URL in this module is built from it.
	BASE_URL = "https://ijnet.org"

	# Headers sent with every request so the scraper identifies itself.
	HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; IJNetBot/1.0; +research-prototype)"}

	def scrape_opportunities(max_pages: int = 3) -> list[dict]:
	    """Scrape opportunity listings from IJNet's public opportunities page.

	    Requests ``{BASE_URL}/opportunities?page=N`` for each ``N`` in
	    ``range(max_pages)`` and builds one record per listing element found.

	    Args:
	        max_pages: Number of listing pages to request, starting at page 0.

	    Returns:
	        A list of dicts. Every dict has a ``"title"``; ``"url"``,
	        ``"description"`` (at most 500 characters) and ``"deadline"`` are
	        present only when the corresponding element exists in the listing.

	    A page that cannot be fetched or parsed is reported with a printed
	    warning and skipped; no exception propagates to the caller.
	    """
	    # Function-scope import keeps this block self-contained.
	    from urllib.parse import urljoin

	    # Alternation is a bare "|". In Python's re syntax "\|" matches a literal
	    # pipe character, so the escaped form could never match a CSS class name.
	    # Compiled once here rather than once per listing item.
	    card_class = re.compile(r"node--type-opportunity|views-row|opportunity")
	    body_class = re.compile(r"field--name-body|field--name-field-summary|summary|teaser")
	    deadline_class = re.compile(r"date|deadline")

	    opportunities = []

	    for page in range(max_pages):
	        url = f"{BASE_URL}/opportunities?page={page}"
	        print(f" Fetching: {url}")
	        try:
	            resp = requests.get(url, headers=HEADERS, timeout=15)
	            resp.raise_for_status()
	            soup = BeautifulSoup(resp.text, "html.parser")

	            # Prefer <article> elements; fall back to class-matched <div>s.
	            cards = soup.find_all("article") or soup.find_all("div", class_=card_class)

	            for card in cards:
	                opp = {}

	                # Title, and the link wrapped inside the heading if any.
	                title_tag = card.find(["h2", "h3", "h4"])
	                if title_tag:
	                    link = title_tag.find("a")
	                    opp["title"] = (link or title_tag).get_text(strip=True)
	                    if link and link.get("href"):
	                        # urljoin resolves relative, root-relative and
	                        # absolute hrefs against the page that was fetched.
	                        opp["url"] = urljoin(url, link["href"])

	                # Description/body, truncated to keep records small.
	                body = card.find("div", class_=body_class)
	                if body:
	                    opp["description"] = body.get_text(strip=True)[:500]

	                # Deadline: a <time> element wins over a class-matched <span>.
	                deadline_el = card.find("time") or card.find("span", class_=deadline_class)
	                if deadline_el:
	                    opp["deadline"] = deadline_el.get_text(strip=True)

	                # Items without a title are not useful to the knowledge base.
	                if opp.get("title"):
	                    opportunities.append(opp)

	            time.sleep(1)  # Be polite
	        except Exception as e:
	            # Deliberately broad: scraping is best-effort, so one bad page
	            # must not abort the remaining pages.
	            print(f" Warning: Could not fetch {url}: {e}")

	    return opportunities


	def scrape_articles(max_pages: int = 2) -> list[dict]:
	    """Scrape article/resource listings from IJNet.

	    Fetches the ``/latest-stories`` and ``/resources`` listing pages once
	    each and builds one record per listing element found.

	    Args:
	        max_pages: Accepted for interface compatibility. This implementation
	            requests a single page per section and does not use the value.

	    Returns:
	        A list of dicts. Every dict has ``"section"`` (the section path
	        without slashes) and ``"title"``; ``"url"`` and ``"description"``
	        (at most 500 characters) are present only when found.

	    A section that cannot be fetched or parsed is reported with a printed
	    warning and skipped; no exception propagates to the caller.
	    """
	    # Function-scope import keeps this block self-contained.
	    from urllib.parse import urljoin

	    # Alternation is a bare "|". In Python's re syntax "\|" matches a literal
	    # pipe character, so the escaped form could never match a CSS class name.
	    item_class = re.compile(r"views-row|node--type")
	    body_class = re.compile(r"field--name-body|summary|teaser")

	    articles_list = []

	    # Try different content sections
	    sections = [
	        "/latest-stories",
	        "/resources",
	    ]

	    for section in sections:
	        url = f"{BASE_URL}{section}"
	        print(f" Fetching: {url}")
	        try:
	            resp = requests.get(url, headers=HEADERS, timeout=15)
	            resp.raise_for_status()
	            soup = BeautifulSoup(resp.text, "html.parser")

	            # Prefer <article> elements; fall back to class-matched <div>s.
	            items = soup.find_all("article") or soup.find_all("div", class_=item_class)

	            for item in items:
	                art = {"section": section.strip("/")}

	                title_tag = item.find(["h2", "h3", "h4"])
	                if title_tag:
	                    link = title_tag.find("a")
	                    art["title"] = (link or title_tag).get_text(strip=True)
	                    if link and link.get("href"):
	                        # urljoin resolves relative, root-relative and
	                        # absolute hrefs against the page that was fetched.
	                        art["url"] = urljoin(url, link["href"])

	                body = item.find("div", class_=body_class)
	                if body:
	                    art["description"] = body.get_text(strip=True)[:500]

	                # Items without a title are dropped.
	                if art.get("title"):
	                    articles_list.append(art)

	            time.sleep(1)  # Pause between section requests.
	        except Exception as e:
	            # Deliberately broad: scraping is best-effort, so one bad section
	            # must not abort the others.
	            print(f" Warning: Could not fetch {url}: {e}")

	    return articles_list


	def scrape_article_detail(url: str, max_chars: int = 3000) -> str:
	    """Fetch the text of a single article page.

	    The main content is taken from the first match of: a ``<div>`` whose
	    class contains ``field--name-body``, an ``<article>`` element, or a
	    ``<main>`` element. Script, style, nav and footer elements inside it
	    are removed before the text is extracted.

	    Args:
	        url: Absolute URL of the article page.
	        max_chars: Maximum number of characters of text to return.

	    Returns:
	        The extracted text with one line per text node, truncated to
	        ``max_chars`` characters, or ``""`` when the page could not be
	        fetched or no content container was found.
	    """
	    try:
	        resp = requests.get(url, headers=HEADERS, timeout=15)
	        resp.raise_for_status()
	        soup = BeautifulSoup(resp.text, "html.parser")

	        # Try to find the main content area, most specific container first.
	        content = (
	            soup.find("div", class_=re.compile(r"field--name-body"))
	            or soup.find("article")
	            or soup.find("main")
	        )
	        if not content:
	            return ""

	        # Remove scripts, styles, nav elements so only readable text remains.
	        for tag in content.find_all(["script", "style", "nav", "footer"]):
	            tag.decompose()
	        return content.get_text(separator="\n", strip=True)[:max_chars]
	    except Exception as e:
	        # Deliberately broad: detail pages are optional enrichment, so a
	        # failure is reported and treated as "no text".
	        print(f" Warning: Could not fetch article detail {url}: {e}")
	    return ""


	def run_scraper():
	    """Main scraper entry point.

	    Scrapes opportunities and article listings, fetches the full text of
	    up to five of the listed articles, and writes everything to
	    ``data/scraped_ijnet.json`` next to this file.

	    Returns:
	        The dict that was written, with keys ``"opportunities"``,
	        ``"articles"``, ``"articles_with_detail"`` and ``"scraped_at"``
	        (ISO-8601 local timestamp).
	    """
	    data_dir = Path(__file__).parent / "data"
	    data_dir.mkdir(exist_ok=True)

	    print("\n=== IJNet Data Scraper ===\n")

	    # --- Scrape opportunities ---
	    print("[1/3] Scraping opportunities...")
	    opportunities = scrape_opportunities(max_pages=3)
	    print(f" Found {len(opportunities)} opportunities from scraping")

	    # --- Scrape articles ---
	    print("[2/3] Scraping articles & resources...")
	    articles = scrape_articles(max_pages=2)
	    print(f" Found {len(articles)} articles/resources from scraping")

	    # --- Fetch article details for a few ---
	    print("[3/3] Fetching article details...")
	    articles_with_detail = []
	    for art in articles[:5]:  # Limit to avoid hammering
	        if art.get("url"):
	            detail = scrape_article_detail(art["url"])
	            if detail:
	                # `art` is the same dict object held in `articles`, so the
	                # "full_text" key also shows up under "articles" in the output.
	                art["full_text"] = detail
	                articles_with_detail.append(art)
	            time.sleep(1)  # Pause after each detail request.

	    scraped = {
	        "opportunities": opportunities,
	        "articles": articles,
	        "articles_with_detail": articles_with_detail,
	        "scraped_at": datetime.now().isoformat(),
	    }

	    out_path = data_dir / "scraped_ijnet.json"
	    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
	    # opened as UTF-8 explicitly instead of relying on the platform default.
	    with open(out_path, "w", encoding="utf-8") as f:
	        json.dump(scraped, f, indent=2, ensure_ascii=False)

	    print(f"\nSaved {len(opportunities)} opportunities + {len(articles)} articles to {out_path}")
	    return scraped


	# Run the full scrape only when this file is executed as a script;
	# importing the module has no side effects from this guard.
	if __name__ == "__main__":
	    run_scraper()