"""
Multi-Term NEU Course Scraper - merges course data from multiple terms.

Fixes missing courses by scraping the Fall/Spring/Summer catalogs.
"""
import argparse
import logging
import pickle
import time
from collections import defaultdict
from datetime import datetime
from typing import Any, Dict, List, Set

import networkx as nx
import requests
# Configure logging once at import time so every message emitted by the
# scraper carries a timestamp and level; `logger` is the module-level logger
# used by all code in this file.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
class MultiTermScraper:
    """Scrape course listings for several terms and merge them into one graph.

    Courses are fetched per (term, subject) from a GraphQL search endpoint
    (SearchNEU by default), de-duplicated by course ID (``subject`` followed
    by ``classId``), and turned into a directed prerequisite graph.
    """

    # GraphQL document sent with every page request; only the variables change.
    _SEARCH_QUERY = """
    query searchQuery($termId: String!, $query: String!, $first: Int, $offset: Int) {
      search(termId: $termId, query: $query, first: $first, offset: $offset) {
        totalCount
        nodes {
          __typename
          ... on ClassOccurrence {
            subject
            classId
            name
            desc
            prereqs
            coreqs
            minCredits
            maxCredits
          }
        }
      }
    }
    """

    def __init__(self, term_ids: List[str], api_url: str = "https://searchneu.com/graphql"):
        """Create a scraper.

        Args:
            term_ids: Term IDs to scrape, in the order they are merged (records
                from earlier terms win ties; see ``scrape_all_terms``).
            api_url: GraphQL endpoint that receives the search queries.
        """
        self.term_ids = term_ids
        self.api_url = api_url
        self.headers = {"Content-Type": "application/json"}
        self.merged_courses: Dict[str, Dict] = {}  # cid -> course data
        self.graph = nx.DiGraph()

    def get_all_courses_by_subject(self, term_id: str, subject: str, batch_size: int = 100) -> List[Dict]:
        """Fetch all courses for one subject/term via offset pagination.

        Args:
            term_id: Term to search in.
            subject: Text sent as the search query (e.g. ``"CS"``).
            batch_size: Number of results requested per page; must be >= 1.

        Returns:
            The ``ClassOccurrence`` nodes collected so far. On a request,
            HTTP, or GraphQL error the error is logged and the pages fetched
            before the failure are returned (best effort).

        Raises:
            ValueError: If ``batch_size`` is less than 1.
        """
        if batch_size < 1:
            # A zero page size would never satisfy the "short page" exit test.
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        all_courses: List[Dict] = []
        offset = 0
        page = 1
        while True:
            variables = {
                "termId": term_id,
                "query": subject,
                "first": batch_size,
                "offset": offset,
            }
            try:
                resp = requests.post(
                    self.api_url,
                    json={"query": self._SEARCH_QUERY, "variables": variables},
                    headers=self.headers,
                    timeout=10,
                )
                resp.raise_for_status()
                data = resp.json()
                if "errors" in data:
                    logger.error(f"GraphQL errors for {term_id}/{subject}: {data['errors']}")
                    break
                # Any of these levels may be null in a GraphQL response.
                search_data = (data.get("data") or {}).get("search") or {}
                nodes = search_data.get("nodes") or []
                page_courses = [c for c in nodes if c.get("__typename") == "ClassOccurrence"]
            except Exception as e:
                # Deliberately broad: scraping is best effort, so any failure
                # on a page ends this subject but keeps what was collected.
                logger.error(f"Error fetching {term_id}/{subject} page {page}: {e}")
                break

            all_courses.extend(page_courses)
            logger.info(f"[{term_id}] {subject} Page {page}: {len(page_courses)} courses (Total: {len(all_courses)})")

            # Decide on the RAW node count: a full page that happens to
            # contain non-course nodes is not the last page.
            if len(nodes) < batch_size:
                break
            offset += batch_size
            page += 1
            time.sleep(0.1)  # small pause between page requests

        logger.info(f"[{term_id}] {subject}: {len(all_courses)} total courses")
        return all_courses

    def _recursive_parse_prereqs(self, prereq_obj: Any) -> Set[str]:
        """Extract course IDs from a nested prerequisite structure.

        A dict carrying both ``subject`` and ``classId`` is a leaf course; any
        other dict is searched through its ``values`` list. Non-dict entries
        (for example free-text requirements) contribute nothing.
        """
        ids: Set[str] = set()
        if not isinstance(prereq_obj, dict):
            return ids
        if "classId" in prereq_obj and "subject" in prereq_obj:
            ids.add(f"{prereq_obj['subject']}{prereq_obj['classId']}")
            return ids
        for val in prereq_obj.get("values") or []:
            ids |= self._recursive_parse_prereqs(val)
        return ids

    @staticmethod
    def _course_id(course: Dict) -> str:
        """Return the merge key for a course: subject followed by classId."""
        return f"{course['subject']}{course['classId']}"

    @staticmethod
    def _prefer_candidate(existing: Dict, candidate: Dict) -> bool:
        """Return True when ``candidate`` should replace the stored record.

        The stored record is replaced only if it lacks a description and the
        candidate has one.
        """
        return bool(not existing.get("desc") and candidate.get("desc"))

    def _subject_counts(self) -> Dict[str, int]:
        """Count merged courses per subject ('UNKNOWN' when subject is absent)."""
        counts: Dict[str, int] = defaultdict(int)
        for cdata in self.merged_courses.values():
            counts[cdata.get("subject") or "UNKNOWN"] += 1
        return dict(counts)

    def scrape_all_terms(self, subjects: List[str], batch_size: int = 100):
        """Scrape every term/subject pair and merge the results by course ID.

        Terms are processed in ``self.term_ids`` order. The first record seen
        for a course ID is kept; a later record replaces it only when the
        stored one has no description and the later one does.

        Args:
            subjects: Subject queries to run for each term.
            batch_size: Page size forwarded to ``get_all_courses_by_subject``.
        """
        banner = "=" * 70
        for term_id in self.term_ids:
            logger.info(f"\n{banner}")
            logger.info(f"SCRAPING TERM: {term_id}")
            logger.info(f"{banner}")
            for subject in subjects:
                courses = self.get_all_courses_by_subject(term_id, subject, batch_size=batch_size)
                for course in courses:
                    cid = self._course_id(course)
                    existing = self.merged_courses.get(cid)
                    if existing is None:
                        self.merged_courses[cid] = course
                        logger.debug(f"Added {cid} from {term_id}")
                    elif self._prefer_candidate(existing, course):
                        self.merged_courses[cid] = course
                        logger.debug(f"Updated {cid} from {term_id} (better description)")
                time.sleep(0.5)  # pause between subjects

        logger.info(f"\n{banner}")
        logger.info(f"MERGE COMPLETE: {len(self.merged_courses)} unique courses")
        logger.info(f"{banner}")

        # Log subject breakdown
        subject_counts = self._subject_counts()
        logger.info("\nSubject breakdown:")
        for subject in sorted(subject_counts):
            logger.info(f"  {subject}: {subject_counts[subject]} courses")

    def build_graph(self):
        """Build the NetworkX prerequisite graph from ``self.merged_courses``.

        Every merged course becomes a node. An edge ``prereq -> course`` is
        added for each prerequisite that is itself a merged course;
        prerequisites that were not scraped are logged and skipped.
        """
        logger.info("\nBuilding course dependency graph...")

        # Add all courses as nodes first so edge lookups see the full set.
        for cid, cdata in self.merged_courses.items():
            self.graph.add_node(
                cid,
                name=cdata.get("name", ""),
                subject=cdata.get("subject", ""),
                classId=cdata.get("classId", ""),
                description=cdata.get("desc", ""),
                minCredits=cdata.get("minCredits", 0),
                maxCredits=cdata.get("maxCredits", 0),
            )

        # Add prerequisite edges
        edge_count = 0
        for cid, cdata in self.merged_courses.items():
            prereqs = cdata.get("prereqs", {})
            if not prereqs:
                continue
            for pid in self._recursive_parse_prereqs(prereqs):
                if pid in self.graph:
                    self.graph.add_edge(pid, cid, relationship="prerequisite")
                    edge_count += 1
                else:
                    logger.warning(f"Prerequisite {pid} for {cid} not in graph")

        logger.info(f"Graph built: {self.graph.number_of_nodes()} nodes, {edge_count} edges")

    def save_data(self, prefix: str):
        """Pickle the graph and merged courses and write a text merge report.

        Three timestamped files are written, all starting with ``prefix``:
        ``_graph_<ts>.pkl``, ``_courses_<ts>.pkl`` and ``_merge_report_<ts>.txt``.
        """
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        gfile = f"{prefix}_graph_{ts}.pkl"
        cfile = f"{prefix}_courses_{ts}.pkl"
        with open(gfile, "wb") as gf:
            pickle.dump(self.graph, gf)
        with open(cfile, "wb") as cf:
            pickle.dump(self.merged_courses, cf)
        logger.info("\nData saved:")
        logger.info(f"  Graph: {gfile}")
        logger.info(f"  Courses: {cfile}")

        # Save merge report
        report_file = f"{prefix}_merge_report_{ts}.txt"
        subject_counts = self._subject_counts()
        # Explicit encoding so the report does not depend on the platform default.
        with open(report_file, "w", encoding="utf-8") as rf:
            rf.write("Multi-Term Scrape Report\n")
            rf.write(f"{'=' * 70}\n\n")
            rf.write(f"Terms scraped: {', '.join(self.term_ids)}\n")
            rf.write(f"Total unique courses: {len(self.merged_courses)}\n")
            rf.write(f"Total edges: {self.graph.number_of_edges()}\n\n")
            rf.write("Subject breakdown:\n")
            for subject in sorted(subject_counts):
                rf.write(f"  {subject}: {subject_counts[subject]}\n")
        logger.info(f"  Report: {report_file}")


def main() -> None:
    """CLI entry point: scrape the requested terms/subjects, build the graph, save it."""
    parser = argparse.ArgumentParser(description="Multi-Term NEU Catalog Scraper")
    parser.add_argument("--terms", nargs="+", required=True, help="Term IDs (e.g., 202510 202520 202530)")
    parser.add_argument("--subjects", nargs="+", required=True, help="Subjects (e.g., CS DS STAT)")
    parser.add_argument("--prefix", default="neu_merged", help="Output prefix")
    parser.add_argument("--batch-size", type=int, default=100, help="Courses per page")
    args = parser.parse_args()
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    scraper = MultiTermScraper(term_ids=args.terms)
    # Forward the page size; it was previously parsed but never used.
    scraper.scrape_all_terms(args.subjects, batch_size=args.batch_size)
    scraper.build_graph()
    scraper.save_data(args.prefix)
    logger.info("\n✅ Multi-term scraping complete!")


if __name__ == "__main__":
    main()