"""
IJNet Public Data Scraper
Scrapes publicly available opportunity listings and articles from ijnet.org
to build a knowledge base for the RAG chatbot.
"""

import json
import re
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path

try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    print("Installing required packages...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", 
                           "requests", "beautifulsoup4", "--break-system-packages", "-q"])
    import requests
    from bs4 import BeautifulSoup


# Root URL of the site being scraped; every request URL is built from it.
BASE_URL = "https://ijnet.org"

# Sent with every request so the scraper announces itself as a bot.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; IJNetBot/1.0; +research-prototype)"}

def scrape_opportunities(max_pages: int = 3) -> list[dict]:
    """Scrape opportunity listings from IJNet's public opportunities page.

    Requests ``{BASE_URL}/opportunities?page=N`` for every N in
    ``range(max_pages)`` and extracts one record per listing element.

    Args:
        max_pages: Number of listing pages to request, starting at page 0.
            Zero or a negative value performs no requests at all.

    Returns:
        A list of dicts. Each dict always has a non-empty ``"title"``;
        ``"url"`` (absolute), ``"description"`` (at most 500 characters) and
        ``"deadline"`` are present only when the matching element was found.

    A page that cannot be fetched or parsed is reported with a warning on
    stdout and skipped; records collected before the failure are kept and the
    remaining pages are still attempted.
    """
    # Function-scope import so this function does not rely on the module's
    # top-level import list.
    from urllib.parse import urljoin

    # Compiled once and reused for every page and every card.
    card_class = re.compile(r"node--type-opportunity|views-row|opportunity")
    body_class = re.compile(r"field--name-body|field--name-field-summary|summary|teaser")
    deadline_class = re.compile(r"date|deadline")

    opportunities = []

    for page in range(max_pages):
        url = f"{BASE_URL}/opportunities?page={page}"
        print(f"  Fetching: {url}")
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")

            # Prefer <article> cards; fall back to class-matched <div>s.
            cards = soup.find_all("article") or soup.find_all("div", class_=card_class)

            for card in cards:
                # A record without a title is never kept, so skip early.
                title_tag = card.find(["h2", "h3", "h4"])
                if not title_tag:
                    continue
                link = title_tag.find("a")
                title = (link or title_tag).get_text(strip=True)
                if not title:
                    continue

                opp = {"title": title}

                if link and link.get("href"):
                    # Resolve against the page that was actually served, so
                    # relative ("foo"), root-relative ("/foo") and
                    # protocol-relative ("//host/foo") hrefs all become valid
                    # absolute URLs; absolute hrefs pass through unchanged.
                    opp["url"] = urljoin(resp.url, link["href"])

                # Description/body
                body = card.find("div", class_=body_class)
                if body:
                    opp["description"] = body.get_text(strip=True)[:500]

                # Deadline
                deadline_el = card.find("time") or card.find("span", class_=deadline_class)
                if deadline_el:
                    opp["deadline"] = deadline_el.get_text(strip=True)

                opportunities.append(opp)

            time.sleep(1)  # Be polite
        except Exception as e:
            # Best-effort scraping: report the failure and move on to the next page.
            print(f"  Warning: Could not fetch {url}: {e}")

    return opportunities


def scrape_articles(max_pages: int = 2) -> list[dict]:
    """Scrape article/resource listings from IJNet.

    Walks the ``/latest-stories`` and ``/resources`` sections and fetches up
    to ``max_pages`` listing pages from each.

    Args:
        max_pages: Maximum number of listing pages to request per section.
            Zero or a negative value performs no requests at all.

    Returns:
        A list of dicts. Each dict has ``"section"`` (the section path without
        slashes) and a non-empty ``"title"``; ``"url"`` (absolute) and
        ``"description"`` (at most 500 characters) are present only when the
        matching element was found.

    A section stops being paged as soon as a request fails (a warning is
    printed) or a page contributes nothing that earlier pages of the same
    section had not already produced.
    """
    # Function-scope import so this function does not rely on the module's
    # top-level import list.
    from urllib.parse import urljoin

    # Compiled once and reused for every page and every item.
    row_class = re.compile(r"views-row|node--type")
    body_class = re.compile(r"field--name-body|summary|teaser")

    # Try different content sections
    sections = [
        "/latest-stories",
        "/resources",
    ]

    articles_list = []

    for section in sections:
        # (url, title) keys from earlier pages of this section.
        seen = set()

        for page in range(max_pages):
            # The first page keeps the bare section URL; later pages use the
            # same ``?page=N`` pager parameter as scrape_opportunities.
            # NOTE(review): confirm both sections paginate this way. If one
            # does not, the "no new items" check below stops after page 1.
            url = f"{BASE_URL}{section}"
            if page:
                url = f"{url}?page={page}"
            print(f"  Fetching: {url}")

            page_keys = set()
            added = 0
            try:
                resp = requests.get(url, headers=HEADERS, timeout=15)
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, "html.parser")

                items = soup.find_all("article") or soup.find_all("div", class_=row_class)

                for item in items:
                    # A record without a title is never kept, so skip early.
                    title_tag = item.find(["h2", "h3", "h4"])
                    if not title_tag:
                        continue
                    link = title_tag.find("a")
                    title = (link or title_tag).get_text(strip=True)
                    if not title:
                        continue

                    art = {"section": section.strip("/"), "title": title}

                    if link and link.get("href"):
                        # Resolve against the page actually served so relative
                        # and protocol-relative hrefs become valid absolute URLs.
                        art["url"] = urljoin(resp.url, link["href"])

                    body = item.find("div", class_=body_class)
                    if body:
                        art["description"] = body.get_text(strip=True)[:500]

                    key = (art.get("url"), title)
                    page_keys.add(key)
                    # Only drop repeats of *earlier* pages; what a single page
                    # lists is kept exactly as it appears.
                    if key not in seen:
                        articles_list.append(art)
                        added += 1
            except Exception as e:
                # Best-effort scraping: report and give up on this section.
                print(f"  Warning: Could not fetch {url}: {e}")
                break

            time.sleep(1)  # Be polite

            if not added:
                # Empty page, or the pager parameter was ignored and the same
                # listing came back: nothing more to gain from this section.
                break
            seen |= page_keys

    return articles_list


def scrape_article_detail(url: str) -> str:
    """Download a single article page and return its main text.

    The content container is chosen in order of preference: a ``div`` whose
    class matches ``field--name-body``, then the first ``article`` element,
    then ``main``. Script, style, nav and footer elements inside it are
    removed before the text is extracted with newline separators and cut to
    at most 3000 characters.

    Returns an empty string when no container is found, or when fetching or
    parsing fails (in which case a warning is printed).
    """
    text = ""
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")

        # Walk the fallbacks one at a time, most specific first.
        container = page.find("div", class_=re.compile(r"field--name-body"))
        if not container:
            container = page.find("article")
        if not container:
            container = page.find("main")

        if container:
            # Drop non-content elements so they do not leak into the text.
            for unwanted in container.find_all(["script", "style", "nav", "footer"]):
                unwanted.decompose()
            text = container.get_text(separator="\n", strip=True)[:3000]
    except Exception as e:
        print(f"  Warning: Could not fetch article detail {url}: {e}")
    return text


def run_scraper() -> dict:
    """Main scraper entry point.

    Runs the three scraping stages in order (opportunities, article listings,
    full text for up to the first five listed articles) and writes the
    combined result to ``data/scraped_ijnet.json`` next to this file, creating
    the ``data`` directory if needed.

    Returns:
        The dict that was written, with keys ``"opportunities"``,
        ``"articles"``, ``"articles_with_detail"`` and ``"scraped_at"``
        (local time, ISO 8601).
    """
    data_dir = Path(__file__).parent / "data"
    data_dir.mkdir(exist_ok=True)

    print("\n=== IJNet Data Scraper ===\n")

    # --- Scrape opportunities ---
    print("[1/3] Scraping opportunities...")
    opportunities = scrape_opportunities(max_pages=3)
    print(f"  Found {len(opportunities)} opportunities from scraping")

    # --- Scrape articles ---
    print("[2/3] Scraping articles & resources...")
    articles = scrape_articles(max_pages=2)
    print(f"  Found {len(articles)} articles/resources from scraping")

    # --- Fetch article details for a few ---
    print("[3/3] Fetching article details...")
    articles_with_detail = []
    for art in articles[:5]:  # Limit to avoid hammering
        if art.get("url"):
            detail = scrape_article_detail(art["url"])
            if detail:
                # The same dict object lives in `articles`, so the entry there
                # gains "full_text" as well.
                art["full_text"] = detail
                articles_with_detail.append(art)
            time.sleep(1)

    scraped = {
        "opportunities": opportunities,
        "articles": articles,
        "articles_with_detail": articles_with_detail,
        "scraped_at": datetime.now().isoformat(),
    }

    out_path = data_dir / "scraped_ijnet.json"
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly; the platform default encoding may be unable
    # to represent them.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(scraped, f, indent=2, ensure_ascii=False)

    print(f"\nSaved {len(opportunities)} opportunities + {len(articles)} articles to {out_path}")
    return scraped


# Run the full scrape only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    run_scraper()