handbook_engine / app /services /data_fetcher.py
internationalscholarsprogram's picture
feat: tier-one / tier-two ordering, labels, and grouped PDF summary
fdf5f84
"""Data fetcher service — mirrors PHP fetchers.php.
Fetches handbook data from the two external JSON APIs (source of truth),
normalises the payloads, and returns typed dicts identical to what the
PHP code produced.
"""
from __future__ import annotations
import json
import logging
from typing import Any
import httpx
from app.core.config import get_settings
logger = logging.getLogger(__name__)
def _normalize_section_json(raw: Any, context: str, sid: Any = None) -> dict | list:
"""Mirrors PHP handbook_normalize_section_json."""
if isinstance(raw, dict) or isinstance(raw, list):
return raw
if isinstance(raw, str):
raw = raw.strip()
if not raw:
return {}
try:
decoded = json.loads(raw)
if isinstance(decoded, (dict, list)):
return decoded
except (json.JSONDecodeError, ValueError):
logger.warning(
"section_json parse failed ctx=%s id=%s snippet=%.180s",
context, sid, raw,
)
return {}
return {}
def _is_truthy(val: Any) -> bool:
"""Mirrors PHP handbook_true."""
if isinstance(val, bool):
return val
if isinstance(val, int):
return val != 0
s = str(val).lower().strip()
return s not in ("0", "false", "")
def _tier_section_rank(section_key: str) -> int:
"""Return sort priority for tier-related section keys.
Tier One sections sort before Tier Two; non-tier sections get 99 (neutral).
"""
k = section_key.lower().replace("-", "_").replace(" ", "_")
if "tier_one" in k or "non_cosigner" in k:
return 0
if "tier_two" in k or k in ("cosigner_schools", "cosigner"):
return 1
return 99
def _sort_sections_stable(sections: list[dict]) -> list[dict]:
    """Sort sections in place with a deterministic, tier-aware ordering.

    Python counterpart of PHP ``sortHandbookSectionsStable``.

    Ordering criteria, in priority order:
      1. ``sort_order`` ascending, with ``None`` after every real value;
      2. tier rank of ``section_key`` (Tier One before Tier Two);
      3. ``id`` ascending, with ``None`` after every real value;
      4. position in the list (or a caller-supplied ``_i``).

    Args:
        sections: Section dicts; the list and its dicts are mutated.

    Returns:
        The same list object, sorted, with the temporary ``_i`` key removed
        from every section.
    """
    # Record each section's arrival position unless one was already supplied.
    for position, section in enumerate(sections):
        section.setdefault("_i", position)

    def _none_last(value):
        # Tuples keep None from being compared with numbers and push it last.
        return (1, 0) if value is None else (0, value)

    sections.sort(
        key=lambda section: (
            _none_last(section.get("sort_order")),
            _tier_section_rank(str(section.get("section_key", ""))),
            _none_last(section.get("id")),
            section.get("_i", 0),
        )
    )

    # The positional marker is bookkeeping only; strip it before returning.
    for section in sections:
        section.pop("_i", None)
    return sections
def _as_text(value: Any) -> str:
    """Return ``value`` as a string, mapping ``None`` to ``""`` instead of ``"None"``."""
    return "" if value is None else str(value)


def _as_int_or_none(value: Any) -> int | None:
    """Return ``value`` as an int when it is integral or an integer-looking string, else ``None``."""
    if value is None or isinstance(value, bool):
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, float):
        return int(value) if value.is_integer() else None
    text = str(value).strip()
    digits = text[1:] if text.startswith("-") else text
    # isdecimal() only accepts characters int() can parse, unlike isdigit(),
    # which also accepts e.g. superscript digits and would let int() raise.
    return int(text) if digits.isdecimal() else None


async def fetch_global_sections(catalog_id: int = 0) -> list[dict[str, Any]]:
    """Fetch and normalise global handbook sections from the external API.

    Mirrors PHP fetchGlobalSections().

    Args:
        catalog_id: When non-zero, appended to the endpoint URL as the
            ``catalog_id`` query parameter.

    Returns:
        A list of dicts with keys ``section_key``, ``section_title``,
        ``section_json``, ``sort_order`` and ``id``, ordered by
        ``_sort_sections_stable``. An empty list is returned when the request
        fails, the response is not a JSON object, or the API reports a falsy
        ``ok``. No exception is propagated for those cases.
    """
    settings = get_settings()
    url = settings.general_endpoint_url
    if catalog_id:
        sep = "&" if "?" in url else "?"
        url += f"{sep}catalog_id={catalog_id}"

    try:
        # NOTE(review): verify=False disables TLS certificate verification for
        # this request. Kept as-is because the endpoint's certificate setup is
        # not visible here; enable verification if the endpoint supports it.
        async with httpx.AsyncClient(verify=False, timeout=settings.http_timeout) as client:
            resp = await client.get(url)
            resp.raise_for_status()
            payload = resp.json()
    except Exception as exc:
        # Deliberate best effort: any transport/HTTP/decoding failure yields [].
        logger.error("Global sections fetch failed: %s url=%s", exc, url)
        return []

    if not isinstance(payload, dict):
        # A JSON array or scalar has no .get(); treat it like a failed call.
        logger.warning(
            "Global sections API returned a non-object payload: %s",
            type(payload).__name__,
        )
        return []

    if not payload.get("ok"):
        logger.warning("Global sections API returned ok=false: %s", payload)
        return []

    # Accept common shapes
    sections_raw = (
        payload.get("general_sections")
        or payload.get("sections")
        or payload.get("globals")
        or payload.get("data")
        or []
    )
    if not isinstance(sections_raw, list):
        sections_raw = []

    out: list[dict[str, Any]] = []
    for i, s in enumerate(sections_raw):
        if not isinstance(s, dict):
            continue

        k = _as_text(s.get("section_key"))
        t = _as_text(s.get("section_title"))
        j = _normalize_section_json(s.get("section_json", {}), "global", s.get("id"))

        # Fall back to the camelCase key only when sort_order is absent/null;
        # using `or` here would discard a legitimate sort_order of 0.
        sort_raw = s.get("sort_order")
        if sort_raw is None:
            sort_raw = s.get("sortOrder")

        # Skip entries that carry no key, no title and no content.
        if not k and not t and not j:
            continue

        out.append({
            "section_key": k,
            "section_title": t,
            "section_json": j,
            "sort_order": _as_int_or_none(sort_raw),
            # A malformed id becomes None instead of aborting the whole fetch.
            "id": _as_int_or_none(s.get("id")),
            "_i": i,  # original position; consumed and removed by the sorter
        })

    out = _sort_sections_stable(out)
    logger.info(
        "Global sections fetched catalog_id=%d count=%d keys=%s",
        catalog_id,
        len(out),
        [s.get("section_key") for s in out],
    )
    return out
async def fetch_university_sections() -> dict[int, dict[str, Any]]:
    """Fetch and normalise university handbook sections.

    Mirrors PHP fetchUniversitySections().

    Returns:
        A dict keyed by ``university_id`` whose values hold
        ``university_name``, ``sections`` (list of dicts with
        ``section_key``, ``section_title``, ``section_json``), ``is_active``,
        ``website``, ``school_category``, ``tier`` and ``tier_label``.
        Entries are inserted ordered by tier, then lower-cased name, then id.
        An empty dict is returned when the request fails, the response is not
        a JSON object, or the API reports a falsy ``ok``.
    """
    settings = get_settings()
    url = settings.university_endpoint_url

    try:
        # NOTE(review): verify=False disables TLS certificate verification for
        # this request. Kept as-is because the endpoint's certificate setup is
        # not visible here; enable verification if the endpoint supports it.
        async with httpx.AsyncClient(verify=False, timeout=settings.http_timeout) as client:
            resp = await client.get(url)
            resp.raise_for_status()
            payload = resp.json()
    except Exception as exc:
        # Deliberate best effort: any transport/HTTP/decoding failure yields {}.
        logger.error("University sections fetch failed: %s url=%s", exc, url)
        return {}

    if not isinstance(payload, dict):
        # A JSON array or scalar has no .get(); treat it like a failed call.
        logger.warning(
            "University sections API returned a non-object payload: %s",
            type(payload).__name__,
        )
        return {}

    if not payload.get("ok"):
        logger.warning("University sections API returned ok=false")
        return {}

    universities = payload.get("universities", [])
    if not isinstance(universities, list):
        universities = []

    by_uni: dict[int, dict[str, Any]] = {}
    for u in universities:
        if not isinstance(u, dict):
            continue

        # One malformed id must not abort the whole fetch: null/empty ids
        # become 0 (skipped below) and non-numeric ids are skipped with a log.
        uid_raw = u.get("university_id")
        try:
            uid = int(uid_raw or 0)
        except (TypeError, ValueError):
            logger.warning("Skipping university with invalid university_id=%r", uid_raw)
            continue
        if uid <= 0:
            continue

        # None is treated like a missing value so it never becomes "None".
        name_raw = u.get("university_name")
        name = str(name_raw) if name_raw is not None else f"University #{uid}"

        website_raw = u.get("website")
        if website_raw is None:
            website_raw = u.get("website_url")
        website = "" if website_raw is None else str(website_raw)

        is_active_raw = u.get("is_active", u.get("isActive", 1))
        is_active = _is_truthy(is_active_raw)

        sections_raw = u.get("sections", [])
        if not isinstance(sections_raw, list):
            sections_raw = []

        norm_sections: list[dict[str, Any]] = []
        for s in sections_raw:
            if not isinstance(s, dict):
                continue
            key_raw = s.get("section_key")
            title_raw = s.get("section_title")
            k = "" if key_raw is None else str(key_raw)
            t = "" if title_raw is None else str(title_raw)
            j = _normalize_section_json(s.get("section_json", {}), "university", s.get("id"))
            # Skip entries that carry no key, no title and no content.
            if not k and not t and not j:
                continue
            norm_sections.append({
                "section_key": k,
                "section_title": t,
                "section_json": j,
            })

        # Derive tier from school_category (backward-compatible — older APIs may omit these)
        category_raw = u.get("school_category")
        school_category = "" if category_raw is None else str(category_raw).strip()
        tier = u.get("tier")
        tier_label = u.get("tier_label", "")
        if tier is None and school_category:
            # Derive from school_category if tier not explicitly provided
            if school_category == "non_cosigner":
                tier, tier_label = 1, "Tier One"
            elif school_category == "cosigner":
                tier, tier_label = 2, "Tier Two"

        by_uni[uid] = {
            "university_name": name,
            "sections": norm_sections,
            "is_active": is_active,
            "website": website,
            "school_category": school_category,
            "tier": tier,
            "tier_label": tier_label or "",
        }

    # Sort: Tier One (non_cosigner) first, then Tier Two (cosigner), then by name
    def _uni_sort_key(item: tuple[int, dict]) -> tuple:
        uid, data = item
        t = data.get("tier")
        if isinstance(t, int):
            tier_rank = t
        elif isinstance(t, str) and t.strip().isdecimal():
            # Rank a numeric-string tier ("1") like its integer form; the
            # stored value is left untouched for downstream consumers.
            tier_rank = int(t.strip())
        else:
            tier_rank = 99
        return (tier_rank, data.get("university_name", "").lower(), uid)

    return dict(sorted(by_uni.items(), key=_uni_sort_key))