Spaces:

internationalscholarsprogram
/

handbook_engine

Running

App Files Files Community

handbook_engine / app /services /data_fetcher.py

internationalscholarsprogram

feat: tier-one / tier-two ordering, labels, and grouped PDF summary

fdf5f84 1 day ago

raw

history blame contribute delete

7.92 kB

	"""Data fetcher service — mirrors PHP fetchers.php.

	Fetches handbook data from the two external JSON APIs (source of truth),
	normalises the payloads, and returns typed dicts identical to what the
	PHP code produced.
	"""

	from __future__ import annotations

	import json
	import logging
	from typing import Any

	import httpx

	from app.core.config import get_settings

	logger = logging.getLogger(__name__)


	def _normalize_section_json(raw: Any, context: str, sid: Any = None) -> dict \| list:
	"""Mirrors PHP handbook_normalize_section_json."""
	if isinstance(raw, dict) or isinstance(raw, list):
	return raw
	if isinstance(raw, str):
	raw = raw.strip()
	if not raw:
	return {}
	try:
	decoded = json.loads(raw)
	if isinstance(decoded, (dict, list)):
	return decoded
	except (json.JSONDecodeError, ValueError):
	logger.warning(
	"section_json parse failed ctx=%s id=%s snippet=%.180s",
	context, sid, raw,
	)
	return {}
	return {}


	def _is_truthy(val: Any) -> bool:
	"""Mirrors PHP handbook_true."""
	if isinstance(val, bool):
	return val
	if isinstance(val, int):
	return val != 0
	s = str(val).lower().strip()
	return s not in ("0", "false", "")


	def _tier_section_rank(section_key: str) -> int:
	"""Return sort priority for tier-related section keys.

	Tier One sections sort before Tier Two; non-tier sections get 99 (neutral).
	"""
	k = section_key.lower().replace("-", "_").replace(" ", "_")
	if "tier_one" in k or "non_cosigner" in k:
	return 0
	if "tier_two" in k or k in ("cosigner_schools", "cosigner"):
	return 1
	return 99


	def _sort_sections_stable(sections: list[dict]) -> list[dict]:
	    """Sort section dicts in place and return the same list.

	    Mirrors PHP sortHandbookSectionsStable with tier-aware tiebreaker.

	    Ordering, most significant first: ``sort_order`` (missing values last),
	    the tier rank of ``section_key``, ``id`` (missing values last), and
	    finally the original position. A temporary ``_i`` key carries that
	    position and is removed from every dict before returning.
	    """
	    # Keep a caller-supplied "_i" if there is one; otherwise record the position.
	    for position, section in enumerate(sections):
	        section.setdefault("_i", position)

	    def _missing_last(value):
	        # The leading 0/1 keeps None from being compared with real values.
	        return (1, 0) if value is None else (0, value)

	    def _ordering(section: dict):
	        return (
	            _missing_last(section.get("sort_order")),
	            # Tier One before Tier Two when sort_order ties
	            _tier_section_rank(str(section.get("section_key", ""))),
	            _missing_last(section.get("id")),
	            section.get("_i", 0),
	        )

	    sections.sort(key=_ordering)

	    for section in sections:
	        section.pop("_i", None)
	    return sections


	async def fetch_global_sections(catalog_id: int = 0) -> list[dict[str, Any]]:
	    """Fetch and normalise global handbook sections from the external API.

	    Mirrors PHP fetchGlobalSections().

	    Args:
	        catalog_id: When non-zero, appended to the endpoint URL as the
	            ``catalog_id`` query parameter.

	    Returns:
	        Section dicts with the keys ``section_key``, ``section_title``,
	        ``section_json``, ``sort_order`` and ``id``, ordered by
	        ``_sort_sections_stable``. An empty list is returned when the
	        request fails, the body is not a JSON object, or the API does not
	        report ``ok``.
	    """

	    def _text(value: Any) -> str:
	        # A JSON null must become "", not the literal string "None".
	        return "" if value is None else str(value)

	    def _int_or_none(value: Any) -> int | None:
	        # Tolerant coercion: malformed values yield None instead of raising.
	        if value is None or isinstance(value, bool):
	            return None
	        try:
	            return int(value)
	        except (TypeError, ValueError, OverflowError):
	            return None

	    settings = get_settings()
	    url = settings.general_endpoint_url
	    if catalog_id:
	        sep = "&" if "?" in url else "?"
	        url += f"{sep}catalog_id={catalog_id}"

	    try:
	        # NOTE(review): verify=False disables TLS certificate verification for
	        # this request. Kept as-is to preserve behaviour; confirm it is intended.
	        async with httpx.AsyncClient(verify=False, timeout=settings.http_timeout) as client:
	            resp = await client.get(url)
	            resp.raise_for_status()
	            payload = resp.json()
	    except Exception as exc:
	        logger.error("Global sections fetch failed: %s url=%s", exc, url)
	        return []

	    # A body that is not a JSON object has no "ok" flag; treat it as a failure
	    # rather than letting .get() raise outside the try block.
	    if not isinstance(payload, dict) or not payload.get("ok"):
	        logger.warning("Global sections API returned ok=false: %s", payload)
	        return []

	    # Accept common shapes
	    sections_raw = (
	        payload.get("general_sections")
	        or payload.get("sections")
	        or payload.get("globals")
	        or payload.get("data")
	        or []
	    )
	    if not isinstance(sections_raw, list):
	        sections_raw = []

	    out: list[dict[str, Any]] = []
	    for i, s in enumerate(sections_raw):
	        if not isinstance(s, dict):
	            continue

	        k = _text(s.get("section_key"))
	        t = _text(s.get("section_title"))
	        j = _normalize_section_json(s.get("section_json", {}), "global", s.get("id"))

	        # Skip entries that carry no key, no title and no content.
	        if not (k or t or j):
	            continue

	        # Fall back to the camelCase field only when sort_order is absent or
	        # blank; a plain `or` would also discard a legitimate sort_order of 0.
	        sort_raw = s.get("sort_order")
	        if sort_raw is None or sort_raw == "":
	            sort_raw = s.get("sortOrder")

	        out.append({
	            "section_key": k,
	            "section_title": t,
	            "section_json": j,
	            "sort_order": _int_or_none(sort_raw),
	            "id": _int_or_none(s.get("id")),
	            # Original position; _sort_sections_stable uses it as the final
	            # tiebreaker and removes the key before returning.
	            "_i": i,
	        })

	    out = _sort_sections_stable(out)

	    logger.info(
	        "Global sections fetched catalog_id=%d count=%d keys=%s",
	        catalog_id,
	        len(out),
	        [s.get("section_key") for s in out],
	    )
	    return out


	async def fetch_university_sections() -> dict[int, dict[str, Any]]:
	    """Fetch and normalise university handbook sections.

	    Mirrors PHP fetchUniversitySections().

	    Returns:
	        Dict keyed by ``university_id``. Each value holds
	        ``university_name``, ``sections`` (list of dicts with
	        ``section_key``, ``section_title``, ``section_json``),
	        ``is_active``, ``website``, ``school_category``, ``tier`` and
	        ``tier_label``. Entries are ordered by tier (integer tiers first,
	        ascending), then lower-cased name, then id. An empty dict is
	        returned when the request fails, the body is not a JSON object, or
	        the API does not report ``ok``.
	    """

	    def _text(value: Any) -> str:
	        # A JSON null must become "", not the literal string "None".
	        return "" if value is None else str(value)

	    settings = get_settings()
	    url = settings.university_endpoint_url

	    try:
	        # NOTE(review): verify=False disables TLS certificate verification for
	        # this request. Kept as-is to preserve behaviour; confirm it is intended.
	        async with httpx.AsyncClient(verify=False, timeout=settings.http_timeout) as client:
	            resp = await client.get(url)
	            resp.raise_for_status()
	            payload = resp.json()
	    except Exception as exc:
	        logger.error("University sections fetch failed: %s url=%s", exc, url)
	        return {}

	    # A body that is not a JSON object has no "ok" flag; treat it as a failure
	    # rather than letting .get() raise outside the try block.
	    if not isinstance(payload, dict) or not payload.get("ok"):
	        logger.warning("University sections API returned ok=false")
	        return {}

	    universities = payload.get("universities", [])
	    if not isinstance(universities, list):
	        universities = []

	    by_uni: dict[int, dict[str, Any]] = {}
	    for u in universities:
	        if not isinstance(u, dict):
	            continue

	        # One record with a null or non-numeric id must not abort the whole fetch.
	        try:
	            uid = int(u.get("university_id") or 0)
	        except (TypeError, ValueError):
	            continue
	        if uid <= 0:
	            continue

	        name_raw = u.get("university_name")
	        name = f"University #{uid}" if name_raw is None else str(name_raw)

	        is_active_raw = u.get("is_active", u.get("isActive", 1))
	        is_active = _is_truthy(is_active_raw)

	        website_raw = u.get("website")
	        if website_raw is None:
	            website_raw = u.get("website_url")
	        website = _text(website_raw)

	        sections_raw = u.get("sections", [])
	        if not isinstance(sections_raw, list):
	            sections_raw = []

	        norm_sections: list[dict[str, Any]] = []
	        for s in sections_raw:
	            if not isinstance(s, dict):
	                continue
	            k = _text(s.get("section_key"))
	            t = _text(s.get("section_title"))
	            j = _normalize_section_json(s.get("section_json", {}), "university", s.get("id"))
	            # Skip entries that carry no key, no title and no content.
	            if not (k or t or j):
	                continue
	            norm_sections.append({
	                "section_key": k,
	                "section_title": t,
	                "section_json": j,
	            })

	        # Derive tier from school_category (backward-compatible — older APIs may omit these)
	        school_category = _text(u.get("school_category")).strip()
	        tier = u.get("tier")
	        tier_label = u.get("tier_label", "")

	        if isinstance(tier, str):
	            # A tier sent as a numeric string ("1") would be ranked 99 by the
	            # sorter below; a blank string counts as "not provided".
	            stripped = tier.strip()
	            if not stripped:
	                tier = None
	            else:
	                try:
	                    tier = int(stripped)
	                except ValueError:
	                    pass  # Leave unrecognised values untouched.

	        if tier is None and school_category:
	            # Derive from school_category if tier not explicitly provided
	            if school_category == "non_cosigner":
	                tier, tier_label = 1, "Tier One"
	            elif school_category == "cosigner":
	                tier, tier_label = 2, "Tier Two"

	        by_uni[uid] = {
	            "university_name": name,
	            "sections": norm_sections,
	            "is_active": is_active,
	            "website": website,
	            "school_category": school_category,
	            "tier": tier,
	            "tier_label": tier_label or "",
	        }

	    # Sort: Tier One (non_cosigner) first, then Tier Two (cosigner), then by name
	    def _uni_sort_key(item: tuple[int, dict]) -> tuple:
	        uid, data = item
	        t = data.get("tier")
	        # Universities without an integer tier sort after all tiered ones.
	        tier_rank = t if isinstance(t, int) else 99
	        return (tier_rank, data.get("university_name", "").lower(), uid)

	    return dict(sorted(by_uni.items(), key=_uni_sort_key))