Spaces:
Sleeping
Sleeping
"""
src/utils/validator.py
----------------------
Validates the raw JSON string returned by the LLM before
it is parsed into a Lesson object.

Why this exists:
    LLMs sometimes return malformed JSON, add markdown code fences
    (```json ... ```), or omit required fields. This module catches
    all of that before it causes an error downstream.

What is defined here:
    - strip_code_fences(raw)          : Remove ```json and ``` wrappers
                                        if the LLM added them
    - parse_json_safely(raw)          : Parse the JSON and raise a
                                        descriptive ValidationError
                                        if it fails
    - autocorrect_practice(data)      : Convert a dict-shaped
                                        lesson_flow.practice into a list
    - validate_required_fields(data)  : Check all required lesson fields
                                        are present in the parsed dict
    - validate_against_schema(data)   : Run the post-generation guardrail
                                        checks (run_post_checks) and store
                                        their results in guardrail_flags
    - strip_non_speaking_fields(data) : Remove Speaking-only fields from
                                        lessons in other domains
    - normalize_reflect(data)         : Flatten reflect.feedback_anchors
                                        into reflect
    - validate_llm_output(raw)        : Master function - runs all of the
                                        above in sequence, returns a
                                        validated lesson dict or raises
                                        a descriptive ValidationError

Design note:
    This is the safety net between the LLM and our data layer.
    Nothing touches data/generated/ without passing through here first.
"""
| import json | |
| import re | |
| from src.guardrails.checks import run_post_checks | |
| # ============================================================================= | |
| # CUSTOM EXCEPTION | |
| # Raised when validation fails at any step. | |
| # Always includes a descriptive message so the generator knows | |
| # exactly what went wrong and can log it clearly. | |
| # ============================================================================= | |
class ValidationError(Exception):
    """Signal that LLM output was rejected by the validation pipeline.

    The exception message names the step that failed and the reason,
    so a caller can log it as-is.
    """
| # ============================================================================= | |
| # REQUIRED FIELDS | |
| # The top-level fields every lesson must have. | |
| # Nested field validation is handled by run_post_checks(). | |
| # ============================================================================= | |
# Keys that must exist at the root of every lesson object.
REQUIRED_TOP_LEVEL_FIELDS = ["lesson_id", "metadata", "lesson_flow", "guardrail_flags"]

# Keys that must exist inside lesson["metadata"].
REQUIRED_METADATA_FIELDS = [
    "grade_band", "ela_domain", "lesson_type", "theme",
    "primary_skill", "voice_markers",
    "estimated_duration_minutes", "ccss_anchor",
]

# Keys that must exist inside lesson["lesson_flow"].
REQUIRED_FLOW_FIELDS = ["hook", "model", "practice", "reflect"]
| # ============================================================================= | |
| # STEP 1: STRIP CODE FENCES | |
| # LLMs frequently wrap JSON in markdown code fences even when told not to. | |
| # This strips them before attempting to parse. | |
| # ============================================================================= | |
# A complete fenced block: an opening fence line (optionally tagged "json",
# any letter case), the payload, and a closing fence on a line of its own.
# Both fences must start a line, so backticks that merely appear inside a
# JSON string value are never mistaken for a fence.
_FENCED_BLOCK_RE = re.compile(
    r"^[ \t]*```[ \t]*(?:json)?[ \t]*\r?\n(.*?)^[ \t]*```[ \t]*\r?$",
    flags=re.DOTALL | re.MULTILINE | re.IGNORECASE,
)


def strip_code_fences(raw: str) -> str:
    """
    Remove markdown code fences from a raw LLM response string.

    Handles these shapes:
        ```json ... ```            (tag matched case-insensitively)
        ``` ... ```
        ` ... `
        prose, then a fenced block, then more prose

    When the response contains a complete fenced block, only the text
    between the fences is returned, so any commentary the model wrote
    before or after the block is dropped. Otherwise a leading and/or
    trailing fence is stripped where present (this also covers a
    response that was cut off before the closing fence).

    Args:
        raw: The raw string returned by the LLM.

    Returns:
        The cleaned string with fences removed and whitespace stripped.
    """
    text = raw.strip()

    block = _FENCED_BLOCK_RE.search(text)
    if block:
        # Keep only the payload; surrounding chatter would break json.loads.
        cleaned = block.group(1)
    else:
        # No complete block: strip whatever fence markers are present.
        cleaned = re.sub(
            r"^```(?:json)?\s*", "", text, flags=re.MULTILINE | re.IGNORECASE
        )
        cleaned = re.sub(r"\s*```$", "", cleaned.strip(), flags=re.MULTILINE)

    cleaned = cleaned.strip()

    # Remove single backtick wrapping (less common but possible)
    if cleaned.startswith("`") and cleaned.endswith("`"):
        cleaned = cleaned[1:-1]

    return cleaned.strip()
| # ============================================================================= | |
| # STEP 2: PARSE JSON SAFELY | |
| # Attempt json.loads() and raise a descriptive ValidationError if it fails. | |
| # ============================================================================= | |
def parse_json_safely(raw: str) -> dict:
    """
    Parse a JSON string into a Python dict.

    Raises ValidationError with a clear message if parsing fails or if
    the document's top level is not a JSON object. The later pipeline
    steps call dict methods on the result, so a top-level list, string
    or number is rejected here rather than crashing them.

    Args:
        raw: A (hopefully) valid JSON string.

    Returns:
        A Python dict.

    Raises:
        ValidationError: If the string is not valid JSON, or is valid
            JSON whose top level is not an object.
    """
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as e:
        # Show the first 300 chars of the raw string to help debugging
        preview = raw[:300] + "..." if len(raw) > 300 else raw
        raise ValidationError(
            f"[validator] LLM returned invalid JSON.\n"
            f"JSON error: {e}\n"
            f"Raw output preview:\n{preview}"
        ) from e

    if not isinstance(parsed, dict):
        raise ValidationError(
            "[validator] LLM returned valid JSON, but the top level is "
            f"{type(parsed).__name__} instead of an object."
        )
    return parsed
| # ============================================================================= | |
| # STEP 3: VALIDATE REQUIRED FIELDS | |
| # Check that the parsed dict contains all required top-level, | |
| # metadata, and lesson_flow fields. | |
| # ============================================================================= | |
def autocorrect_practice(lesson_dict: dict) -> dict:
    """
    Auto-correct common LLM mistakes in lesson_flow.practice.

    Only acts when practice is a dict; the input is mutated in place and
    also returned. Three shapes are handled, in this order:
        1. {"prompts": [...]}       -> the "prompts" list is used
        2. any non-empty list value -> the first such list is used
        3. anything else            -> list(practice.values()) is used

    If the lesson, lesson_flow or practice does not have the expected
    type, nothing is changed, so that validate_required_fields() can
    report the problem instead of this function crashing on it.

    Args:
        lesson_dict: The parsed lesson dict.

    Returns:
        The same lesson dict, with practice replaced by a list when a
        correction was applied.
    """
    if not isinstance(lesson_dict, dict):
        return lesson_dict
    flow = lesson_dict.get("lesson_flow")
    if not isinstance(flow, dict):
        return lesson_dict
    practice = flow.get("practice")
    if not isinstance(practice, dict):
        return lesson_dict

    wrapped = practice.get("prompts")
    if isinstance(wrapped, list):
        print("[validator] β οΈ practice was a dict with 'prompts' key β unwrapping...")
        corrected = wrapped
    else:
        # First value that is a non-empty list, if the dict has one.
        first_list = next(
            (value for value in practice.values() if isinstance(value, list) and value),
            None,
        )
        if first_list is not None:
            print("[validator] β οΈ practice was an unknown dict β extracting first list value...")
            corrected = first_list
        else:
            # e.g. {"P1": {...}, "P2": {...}}: keep the values, drop the keys.
            print("[validator] β οΈ practice was a dict β auto-correcting to list...")
            corrected = list(practice.values())

    flow["practice"] = corrected
    print(f"[validator] Auto-corrected practice β {len(corrected)} prompts")
    return lesson_dict
def validate_required_fields(data: dict) -> None:
    """
    Check that all required fields are present in the parsed lesson dict.

    Checks top-level, metadata, and lesson_flow fields, then checks that
    lesson_flow.practice is a list of 1 to 3 prompts.

    The lesson, metadata and lesson_flow must each be a dict. Without
    that check a string or list would be tested by substring/element
    membership, and None would raise TypeError instead of a
    ValidationError.

    Args:
        data: The parsed lesson dict.

    Raises:
        ValidationError: If any required field is missing, a container
            has the wrong type, or practice is not a list of 1-3 items.
    """
    if not isinstance(data, dict):
        raise ValidationError(
            f"[validator] Lesson must be a JSON object. "
            f"Got: {type(data).__name__}"
        )

    # Check top-level fields
    missing_top = [f for f in REQUIRED_TOP_LEVEL_FIELDS if f not in data]
    if missing_top:
        raise ValidationError(
            f"[validator] Missing top-level fields: {missing_top}"
        )

    # Check metadata fields
    metadata = data.get("metadata", {})
    if not isinstance(metadata, dict):
        raise ValidationError(
            f"[validator] metadata must be an object. "
            f"Got: {type(metadata).__name__}"
        )
    missing_meta = [f for f in REQUIRED_METADATA_FIELDS if f not in metadata]
    if missing_meta:
        raise ValidationError(
            f"[validator] Missing metadata fields: {missing_meta}"
        )

    # Check lesson_flow fields
    flow = data.get("lesson_flow", {})
    if not isinstance(flow, dict):
        raise ValidationError(
            f"[validator] lesson_flow must be an object. "
            f"Got: {type(flow).__name__}"
        )
    missing_flow = [f for f in REQUIRED_FLOW_FIELDS if f not in flow]
    if missing_flow:
        raise ValidationError(
            f"[validator] Missing lesson_flow fields: {missing_flow}"
        )

    # Check practice is a non-empty list
    practice = flow.get("practice", [])
    if not isinstance(practice, list) or not practice:
        raise ValidationError(
            f"[validator] lesson_flow.practice must be a non-empty list. "
            f"Got: {type(practice).__name__}"
        )

    # Check practice has no more than 3 prompts
    if len(practice) > 3:
        raise ValidationError(
            f"[validator] lesson_flow.practice has {len(practice)} prompts. "
            f"Maximum allowed is 3."
        )
| # ============================================================================= | |
| # STEP 4: RUN POST-GENERATION GUARDRAIL CHECKS | |
| # Runs all checks from checks.py and embeds the results back | |
| # into the lesson dict. Flags are recorded but do not block saving. | |
| # ============================================================================= | |
def validate_against_schema(data: dict) -> dict:
    """
    Replace the lesson's guardrail_flags with programmatic check results.

    Calls run_post_checks(data) and stores, for every named check, a
    {"status": ..., "message": ...} entry built from the result's
    attributes. Whatever the LLM wrote into guardrail_flags is discarded.
    Nothing in this function inspects the statuses, so a flagged lesson
    is recorded rather than rejected.

    Args:
        data: The parsed and field-validated lesson dict.

    Returns:
        The same dict, with guardrail_flags overwritten.
    """
    flags = {}
    for check_name, outcome in run_post_checks(data).items():
        flags[check_name] = {
            "status": outcome.status,
            "message": outcome.message,
        }

    # Assign only after every result has been read, so a failure while
    # collecting leaves the original flags untouched.
    data["guardrail_flags"] = flags
    return data
def strip_non_speaking_fields(lesson_dict: dict) -> dict:
    """
    Remove Speaking-specific fields from lessons outside the speaking domains.

    A lesson whose metadata.ela_domain is one of the two speaking values
    tested below is returned untouched. For every other lesson:
        - metadata.voice_markers is reset to an empty list
        - "learning_goal_connection" is removed from hook, from each
          practice prompt, and from reflect

    Only dict-shaped sections are modified. A section the LLM returned
    as plain text (or a practice entry that is not a dict) is left as-is
    instead of raising AttributeError.

    Args:
        lesson_dict: The lesson dict; mutated in place.

    Returns:
        The same lesson dict.
    """
    metadata = lesson_dict.get("metadata")
    ela_domain = metadata.get("ela_domain", "") if isinstance(metadata, dict) else ""
    if ela_domain in ("Speaking", "Reading β Speaking"):
        return lesson_dict

    # Strip voice markers (only when there is a metadata dict to hold them)
    if isinstance(metadata, dict):
        metadata["voice_markers"] = []

    flow = lesson_dict.get("lesson_flow")
    if not isinstance(flow, dict):
        return lesson_dict

    practice = flow.get("practice")
    prompts = practice if isinstance(practice, list) else []

    # hook, every practice prompt, and reflect may each carry the field.
    for section in (flow.get("hook"), *prompts, flow.get("reflect")):
        if isinstance(section, dict):
            section.pop("learning_goal_connection", None)

    return lesson_dict
def normalize_reflect(lesson_dict: dict) -> dict:
    """
    Flatten feedback_anchors into reflect if the LLM nested them.

    Normalises: reflect.feedback_anchors.x -> reflect.x
    A key inside feedback_anchors overwrites a same-named key already
    present in reflect.

    The flattening only happens when both reflect and feedback_anchors
    are dicts. Any other shape is left exactly as it was, so a malformed
    feedback_anchors value is neither lost nor the cause of a crash.

    Args:
        lesson_dict: The lesson dict; mutated in place.

    Returns:
        The same lesson dict.
    """
    flow = lesson_dict.get("lesson_flow")
    reflect = flow.get("reflect") if isinstance(flow, dict) else None
    if not isinstance(reflect, dict):
        return lesson_dict

    anchors = reflect.get("feedback_anchors")
    if isinstance(anchors, dict):
        # Remove the wrapper key first, then merge its contents upward.
        del reflect["feedback_anchors"]
        reflect.update(anchors)

    return lesson_dict
# =============================================================================
# MASTER VALIDATION FUNCTION
# Runs every pipeline step in sequence (steps 1-6, plus the practice
# auto-correction between steps 2 and 3).
# This is the only function the generator needs to call.
# =============================================================================
def validate_llm_output(raw: str) -> dict:
    """
    Run the complete validation pipeline on one raw LLM response.

    Stages, in execution order:
        1.   strip_code_fences          - remove markdown fences
        2.   parse_json_safely          - decode the JSON text
        2.5  autocorrect_practice       - repair a dict-shaped practice
        3.   validate_required_fields   - structural field checks
        4.   validate_against_schema    - run guardrail checks, embed flags
        5.   strip_non_speaking_fields  - drop Speaking-only fields
        6.   normalize_reflect          - flatten reflect.feedback_anchors

    A progress line is printed after each numbered stage.

    Args:
        raw: The raw response text produced by the LLM.

    Returns:
        The cleaned lesson dict.

    Raises:
        ValidationError: Raised by the parsing and required-field stages
            when the response is structurally unusable.
    """
    print("[validator] Starting validation pipeline...")

    # Stage 1: fences off
    unfenced = strip_code_fences(raw)
    print("[validator] Step 1 β Code fences stripped β ")

    # Stage 2: text -> dict
    lesson = parse_json_safely(unfenced)
    print("[validator] Step 2 β JSON parsed successfully β ")

    # Stage 2.5: repair known formatting slips before checking structure
    lesson = autocorrect_practice(lesson)

    # Stage 3: raises on missing fields; returns nothing
    validate_required_fields(lesson)
    print("[validator] Step 3 β Required fields present β ")

    # Stage 4: guardrail results are embedded in the lesson
    lesson = validate_against_schema(lesson)
    print("[validator] Step 4 β Guardrail checks complete β ")

    # Stage 5: remove Speaking-only fields where they do not belong
    lesson = strip_non_speaking_fields(lesson)
    print("[validator] Step 5 β Speaking-specific fields stripped β ")

    # Stage 6: flatten nested feedback anchors
    lesson = normalize_reflect(lesson)
    print("[validator] Step 6 β Reflect normalized β ")

    print(f"[validator] Validation passed for lesson: {lesson.get('lesson_id', 'UNKNOWN')}")
    return lesson