""" catalog_loader.py — Loads and validates the SHL catalog from disk. Why a separate module? Separates I/O from business logic. If the catalog source changes (e.g., live API instead of JSON file), only this file changes; retrieval.py and agent.py are untouched. Interview Q: "How would you scale to a live catalog?" A: Replace load_catalog() with an HTTP fetch + TTL cache. The rest of the system is unaware of the source. Trade-off: We load the full catalog into memory at startup. At ~35 items this is trivial. For a catalog with tens of thousands of items, a streaming/lazy approach would be needed. """ import json import os from typing import List, Dict, Any # Resolve path relative to this file so the module works regardless of cwd. _CATALOG_PATH = os.path.join( os.path.dirname(__file__), "..", "data", "shl_catalog.json" ) def load_catalog() -> List[Dict[str, Any]]: """ Load and return the SHL catalog as a list of dicts. Raises FileNotFoundError if the catalog is missing (surfaces at startup, not at request time — fail fast principle). Each item is expected to have at minimum: name, url, test_type, description. Extra fields (duration, languages, keys, seniority, domains) are used for richer retrieval context but are optional. """ catalog_path = os.path.abspath(_CATALOG_PATH) if not os.path.exists(catalog_path): raise FileNotFoundError( f"SHL catalog not found at {catalog_path}. " "Ensure data/shl_catalog.json exists before starting the server." ) with open(catalog_path, "r", encoding="utf-8") as f: catalog = json.load(f) if not isinstance(catalog, list) or len(catalog) == 0: raise ValueError("Catalog must be a non-empty JSON array.") # Basic validation: every item must have the four mandatory fields. required_fields = {"name", "url", "test_type", "description"} for i, item in enumerate(catalog): missing = required_fields - set(item.keys()) if missing: raise ValueError( f"Catalog item {i} is missing required fields: {missing}" ) return catalog