Spaces:
Sleeping
Sleeping
| """ | |
| IJNet Public Data Scraper | |
| Scrapes publicly available opportunity listings and articles from ijnet.org | |
| to build a knowledge base for the RAG chatbot. | |
| """ | |
| import json | |
| import re | |
| import sys | |
| import time | |
| from datetime import datetime, timedelta | |
| from pathlib import Path | |
| try: | |
| import requests | |
| from bs4 import BeautifulSoup | |
| except ImportError: | |
| print("Installing required packages...") | |
| import subprocess | |
| subprocess.check_call([sys.executable, "-m", "pip", "install", | |
| "requests", "beautifulsoup4", "--break-system-packages", "-q"]) | |
| import requests | |
| from bs4 import BeautifulSoup | |
# Root of the site being scraped; listing paths and relative links are
# resolved against it.
BASE_URL = "https://ijnet.org"

# Request headers sent with every HTTP call made by this module.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (compatible; IJNetBot/1.0; "
        "+research-prototype)"
    ),
}
def scrape_opportunities(max_pages: int = 3) -> list[dict]:
    """Scrape opportunity listings from IJNet's public opportunities page.

    Requests ``{BASE_URL}/opportunities?page=N`` for every ``N`` in
    ``range(max_pages)`` and collects one dict per listing card that has a
    non-empty title.

    Args:
        max_pages: Number of listing pages to request, starting at page 0.

    Returns:
        A list of dicts. Each has a ``"title"`` key and, when the matching
        element is present in the card, ``"url"`` (absolute),
        ``"description"`` (at most 500 characters) and ``"deadline"``.
        A page that cannot be fetched is reported on stdout and skipped;
        the remaining pages are still attempted.
    """
    # Function-scope import so this function carries its own dependency.
    from urllib.parse import urljoin

    # Compile the class filters once per call rather than once per card.
    card_classes = re.compile(r"node--type-opportunity|views-row|opportunity")
    body_classes = re.compile(
        r"field--name-body|field--name-field-summary|summary|teaser"
    )
    deadline_classes = re.compile(r"date|deadline")

    opportunities = []
    for page in range(max_pages):
        if page:
            time.sleep(1)  # Be polite: pause between requests only

        url = f"{BASE_URL}/opportunities?page={page}"
        print(f" Fetching: {url}")
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
            resp.raise_for_status()
        except requests.RequestException as e:
            print(f" Warning: Could not fetch {url}: {e}")
            continue

        soup = BeautifulSoup(resp.text, "html.parser")
        # Prefer <article> cards; fall back to class-matched <div> cards.
        cards = soup.find_all("article") or soup.find_all(
            "div", class_=card_classes
        )

        for card in cards:
            # Title: a card without a heading or with an empty title is
            # not recorded at all.
            title_tag = card.find(["h2", "h3", "h4"])
            if title_tag is None:
                continue
            link = title_tag.find("a")
            title = (link or title_tag).get_text(strip=True)
            if not title:
                continue
            opp = {"title": title}

            if link and link.get("href"):
                # urljoin copes with absolute, root-relative,
                # path-relative and protocol-relative hrefs; plain
                # concatenation only handled the first two.
                try:
                    opp["url"] = urljoin(BASE_URL, link["href"])
                except ValueError:
                    # Malformed href: keep the entry, just without a URL.
                    pass

            # Description/body
            body = card.find("div", class_=body_classes)
            if body:
                opp["description"] = body.get_text(strip=True)[:500]

            # Deadline
            deadline_el = card.find("time") or card.find(
                "span", class_=deadline_classes
            )
            if deadline_el:
                opp["deadline"] = deadline_el.get_text(strip=True)

            opportunities.append(opp)

    return opportunities
def scrape_articles(max_pages: int = 2) -> list[dict]:
    """Scrape article/resource listings from IJNet.

    Walks the ``/latest-stories`` and ``/resources`` sections. For each
    section the first page is requested at the bare section URL and
    following pages at ``?page=N``, up to ``max_pages`` pages in total.

    Args:
        max_pages: Maximum number of listing pages to request per section.

    Returns:
        A list of dicts. Each has ``"section"`` (the section path without
        slashes) and ``"title"``, plus ``"url"`` (absolute) and
        ``"description"`` (at most 500 characters) when the matching
        element is present. Within one section an entry is recorded only
        once, keyed by its URL (or its title when it has no URL).
        Fetch failures are reported on stdout and end that section.
    """
    # Function-scope import so this function carries its own dependency.
    from urllib.parse import urljoin

    # Compile the class filters once per call rather than once per item.
    item_classes = re.compile(r"views-row|node--type")
    body_classes = re.compile(r"field--name-body|summary|teaser")

    # Try different content sections
    sections = [
        "/latest-stories",
        "/resources",
    ]

    articles_list = []
    first_request = True
    for section in sections:
        section_name = section.strip("/")
        seen = set()  # URL/title keys already recorded for this section

        for page in range(max_pages):
            url = f"{BASE_URL}{section}"
            if page:
                url = f"{url}?page={page}"

            if not first_request:
                time.sleep(1)  # Be polite: pause between requests
            first_request = False

            print(f" Fetching: {url}")
            try:
                resp = requests.get(url, headers=HEADERS, timeout=15)
                resp.raise_for_status()
            except requests.RequestException as e:
                print(f" Warning: Could not fetch {url}: {e}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")
            items = soup.find_all("article") or soup.find_all(
                "div", class_=item_classes
            )

            added = 0
            for item in items:
                title_tag = item.find(["h2", "h3", "h4"])
                if title_tag is None:
                    continue
                link = title_tag.find("a")
                title = (link or title_tag).get_text(strip=True)
                if not title:
                    continue
                art = {"section": section_name, "title": title}

                if link and link.get("href"):
                    # urljoin copes with absolute, root-relative,
                    # path-relative and protocol-relative hrefs.
                    try:
                        art["url"] = urljoin(BASE_URL, link["href"])
                    except ValueError:
                        # Malformed href: keep the entry without a URL.
                        pass

                body = item.find("div", class_=body_classes)
                if body:
                    art["description"] = body.get_text(strip=True)[:500]

                key = art.get("url") or title
                if key in seen:
                    continue
                seen.add(key)
                articles_list.append(art)
                added += 1

            if not added:
                # Nothing new on this page: either the listing ended or
                # the section repeats itself, so stop requesting more.
                break

    return articles_list
def scrape_article_detail(url: str) -> str:
    """Fetch the main text of a single article page.

    Args:
        url: Address of the article page to download.

    Returns:
        Up to 3000 characters of newline-separated text taken from the
        first of: the ``field--name-body`` div, the first ``<article>``,
        or ``<main>``. Returns an empty string when none of those exists
        or when anything goes wrong (the failure is printed as a warning).
    """
    text = ""
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")

        # Locate the main content area, most specific candidate first.
        container = page.find("div", class_=re.compile(r"field--name-body"))
        if not container:
            container = page.find("article")
        if not container:
            container = page.find("main")

        if container:
            # Drop non-content markup before extracting text.
            unwanted = container.find_all(["script", "style", "nav", "footer"])
            for element in unwanted:
                element.decompose()
            extracted = container.get_text(separator="\n", strip=True)
            text = extracted[:3000]
    except Exception as e:
        print(f" Warning: Could not fetch article detail {url}: {e}")
    return text
def run_scraper():
    """Main scraper entry point.

    Scrapes opportunities and article listings, fetches the full text of
    up to the first five articles, and writes everything to
    ``data/scraped_ijnet.json`` next to this file.

    Returns:
        The dict that was written, with keys ``"opportunities"``,
        ``"articles"``, ``"articles_with_detail"`` and ``"scraped_at"``
        (ISO-format local timestamp).
    """
    data_dir = Path(__file__).parent / "data"
    data_dir.mkdir(exist_ok=True)

    print("\n=== IJNet Data Scraper ===\n")

    # --- Scrape opportunities ---
    print("[1/3] Scraping opportunities...")
    opportunities = scrape_opportunities(max_pages=3)
    print(f" Found {len(opportunities)} opportunities from scraping")

    # --- Scrape articles ---
    print("[2/3] Scraping articles & resources...")
    articles = scrape_articles(max_pages=2)
    print(f" Found {len(articles)} articles/resources from scraping")

    # --- Fetch article details for a few ---
    print("[3/3] Fetching article details...")
    articles_with_detail = []
    for art in articles[:5]:  # Limit to avoid hammering
        if not art.get("url"):
            continue
        detail = scrape_article_detail(art["url"])
        if detail:
            # The same dict object lives in `articles`, so the entry there
            # gains "full_text" as well.
            art["full_text"] = detail
            articles_with_detail.append(art)
        time.sleep(1)

    scraped = {
        "opportunities": opportunities,
        "articles": articles,
        "articles_with_detail": articles_with_detail,
        "scraped_at": datetime.now().isoformat(),
    }

    out_path = data_dir / "scraped_ijnet.json"
    # ensure_ascii=False emits raw non-ASCII characters, so the file must
    # be opened as UTF-8 explicitly; the platform default encoding may not
    # be able to represent them.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(scraped, f, indent=2, ensure_ascii=False)

    print(f"\nSaved {len(opportunities)} opportunities + {len(articles)} articles to {out_path}")
    return scraped
# Script entry point: run the full scrape only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run_scraper()