Spaces:

aankitdas
/

bantrly-lesson-generator

Sleeping

File size: 12,598 Bytes

"""
src/utils/validator.py
----------------------
Validates the raw JSON string returned by the LLM before
it is parsed into a Lesson object.

Why this exists:
    LLMs sometimes return malformed JSON, add markdown code fences
    (```json ... ```), or omit required fields. This module catches
    all of that before it causes an error downstream.

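What is defined here:
    - strip_code_fences(raw)        : Remove ```json and ``` wrappers
                                      if the LLM added them
    - parse_json_safely(raw)        : Parse the JSON; raise a
                                      ValidationError that includes a
                                      preview of the raw text if it fails
    - autocorrect_practice(data)    : Turn a dict-shaped practice section
                                      into the list the pipeline expects
    - validate_required_fields(data): Check all required lesson fields
                                      are present in the parsed dict
    - validate_against_schema(data) : Run the post-generation guardrail
                                      checks and record their results in
                                      the lesson's guardrail_flags
    - strip_non_speaking_fields(data): Remove Speaking-only fields from
                                      lessons in other ELA domains
    - normalize_reflect(data)       : Flatten reflect.feedback_anchors
                                      into reflect
    - validate_llm_output(raw)      : Master function — runs all of the
                                      above in sequence, returns a
                                      validated lesson dict or raises
                                      a descriptive ValidationError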

Design note:
    This is the safety net between the LLM and our data layer.
    Nothing touches data/generated/ without passing through here first.
"""

import json
import re
from src.guardrails.checks import run_post_checks


# =============================================================================
# CUSTOM EXCEPTION
# Raised when validation fails at any step.
# Always includes a descriptive message so the generator knows
# exactly what went wrong and can log it clearly.
# =============================================================================

class ValidationError(Exception):
    """Signal that LLM output was rejected by the validation pipeline.

    The exception message is expected to name the step that failed and
    the reason, so the caller can log it as-is.
    """


# =============================================================================
# REQUIRED FIELDS
# The top-level fields every lesson must have.
# Nested field validation is handled by run_post_checks().
# =============================================================================

# Keys looked up directly on the parsed lesson dict.
# validate_required_fields() rejects a lesson that lacks any of them.
REQUIRED_TOP_LEVEL_FIELDS = [
    "lesson_id",
    "metadata",
    "lesson_flow",
    "guardrail_flags",
]

# Keys looked up inside lesson["metadata"].
# validate_required_fields() rejects a lesson that lacks any of them.
REQUIRED_METADATA_FIELDS = [
    "grade_band",
    "ela_domain",
    "lesson_type",
    "theme",
    "primary_skill",
    "voice_markers",
    "estimated_duration_minutes",
    "ccss_anchor",
]

# Keys looked up inside lesson["lesson_flow"].
# validate_required_fields() rejects a lesson that lacks any of them.
REQUIRED_FLOW_FIELDS = [
    "hook",
    "model",
    "practice",
    "reflect",
]


# =============================================================================
# STEP 1: STRIP CODE FENCES
# LLMs frequently wrap JSON in markdown code fences even when told not to.
# This strips them before attempting to parse.
# =============================================================================

def strip_code_fences(raw: str) -> str:
    """
    Remove markdown code fences from a raw LLM response string.

    Handles these shapes:
        ```json ... ```                 (any language tag, any letter case)
        ```      ... ```
        prose ```json ... ``` prose     (text around the block is dropped)
        ```json ...                     (truncated reply, no closing fence)
        `        ... `

    A fence only counts as an opening fence when it starts a line, so
    triple backticks that appear in the middle of a JSON string value
    are left untouched.

    Args:
        raw: The raw string returned by the LLM.

    Returns:
        The cleaned string with fences removed and whitespace stripped.
    """
    text = raw.strip()

    # Opening fence plus an optional language tag. A tag is consumed only
    # when it is followed by a newline, or when it is literally "json";
    # this keeps one-line payloads such as ```true``` intact.
    opening = r"```(?:[ \t]*[\w+-]*[ \t]*\r?\n|[ \t]*json\b)?"

    # Prefer the body of the first complete fenced block. Taking the body
    # (instead of deleting the markers) also discards any chatter the LLM
    # wrote before or after the block.
    fenced = re.search(
        r"^[ \t]*" + opening + r"(.*?)```[ \t]*\r?$",
        text,
        flags=re.DOTALL | re.MULTILINE | re.IGNORECASE,
    )
    if fenced:
        cleaned = fenced.group(1).strip()
    else:
        # No complete block: strip a lone opening fence (truncated reply)
        # and/or a lone closing fence at the very end.
        cleaned = re.sub(r"\A" + opening, "", text, flags=re.IGNORECASE)
        cleaned = re.sub(r"\s*```\Z", "", cleaned).strip()

    # Remove single backtick wrapping (less common but possible)
    if cleaned.startswith("`") and cleaned.endswith("`"):
        cleaned = cleaned[1:-1]

    return cleaned.strip()


# =============================================================================
# STEP 2: PARSE JSON SAFELY
# Attempt json.loads() and raise a descriptive ValidationError if it fails.
# =============================================================================

def parse_json_safely(raw: str) -> dict:
    """
    Parse a JSON string into a Python dict.
    Raises ValidationError with a clear message if parsing fails or if
    the parsed value is not a JSON object.

    Args:
        raw: A (hopefully) valid JSON string.

    Returns:
        A Python dict.

    Raises:
        ValidationError: If the string is not valid JSON, or if it is
            valid JSON whose top-level value is not an object (for
            example a list, a string, a number or null).
    """
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as e:
        # Show the first 300 chars of the raw string to help debugging
        preview = raw[:300] + "..." if len(raw) > 300 else raw
        raise ValidationError(
            f"[validator] LLM returned invalid JSON.\n"
            f"JSON error: {e}\n"
            f"Raw output preview:\n{preview}"
        ) from e

    # json.loads() happily returns lists, strings, numbers or None.
    # Everything downstream calls .get() on the result, so reject those
    # here with a descriptive error instead of a later AttributeError.
    if not isinstance(parsed, dict):
        raise ValidationError(
            f"[validator] LLM returned JSON that is not an object. "
            f"Got: {type(parsed).__name__}"
        )

    return parsed


# =============================================================================
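# STEP 2.5 + STEP 3: AUTO-CORRECT, THEN VALIDATE REQUIRED FIELDS
# First repair known LLM formatting mistakes in lesson_flow.practice,
# then check that the parsed dict contains all required top-level,
# metadata, and lesson_flow fields.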
# =============================================================================

def autocorrect_practice(lesson_dict: dict) -> dict:
    """
    Auto-correct common LLM mistakes in the practice field.

    Only acts when lesson_flow.practice is a dict; the cases are tried
    in this order:
        1. practice has a "prompts" key holding a list → unwrap that list
        2. practice has any non-empty list value → use the first one found
        3. otherwise → use the dict's values as the list
           (e.g. {"P1": {...}, "P2": {...}})

    Anything else (practice already a list, practice missing, or
    lesson_flow missing / not a dict) is left untouched so that
    validate_required_fields() can report it.

    Args:
        lesson_dict: The parsed lesson dict. Modified in place.

    Returns:
        The same lesson dict, with lesson_flow.practice replaced by a
        list when one of the cases above applied.
    """
    if not isinstance(lesson_dict, dict):
        return lesson_dict

    # lesson_flow may be present but null / a list / a string; calling
    # .get() on it would crash with AttributeError.
    flow = lesson_dict.get("lesson_flow")
    if not isinstance(flow, dict):
        return lesson_dict

    practice = flow.get("practice")
    if not isinstance(practice, dict):
        return lesson_dict

    prompts = practice.get("prompts")
    if isinstance(prompts, list):
        # Case 1: {"prompts": [...]} — unwrap the list
        print("[validator] ⚠️  practice was a dict with 'prompts' key — unwrapping...")
        flow["practice"] = prompts
    else:
        # Case 2: some other key holds the prompt list — take the first
        # non-empty list value.
        first_list = next(
            (value for value in practice.values() if isinstance(value, list) and value),
            None,
        )
        if first_list is not None:
            print("[validator] ⚠️  practice was an unknown dict — extracting first list value...")
            flow["practice"] = first_list
        else:
            # Case 3: {"P1": {...}, "P2": {...}} — convert values to list
            print("[validator] ⚠️  practice was a dict — auto-correcting to list...")
            flow["practice"] = list(practice.values())

    corrected = flow["practice"]
    print(f"[validator] Auto-corrected practice → {len(corrected)} prompts")

    return lesson_dict


def validate_required_fields(data: dict) -> None:
    """
    Check that all required fields are present in the parsed lesson dict.
    Checks top-level, metadata, and lesson_flow fields, then the shape of
    lesson_flow.practice.

    The checks run in a fixed order and stop at the first failure:
        1. the lesson itself is a dict
        2. every REQUIRED_TOP_LEVEL_FIELDS key is present
        3. metadata is a dict with every REQUIRED_METADATA_FIELDS key
        4. lesson_flow is a dict with every REQUIRED_FLOW_FIELDS key
        5. lesson_flow.practice is a list of 1 to 3 items

    Args:
        data: The parsed lesson dict.

    Raises:
        ValidationError: If any required field is missing, if a section
            that must be an object is something else, or if practice is
            not a list of 1-3 prompts.
    """
    if not isinstance(data, dict):
        raise ValidationError(
            f"[validator] Lesson must be a JSON object. "
            f"Got: {type(data).__name__}"
        )

    # Check top-level fields
    missing_top = [f for f in REQUIRED_TOP_LEVEL_FIELDS if f not in data]
    if missing_top:
        raise ValidationError(
            f"[validator] Missing top-level fields: {missing_top}"
        )

    # Check metadata fields. The type check matters: `in` on a string is
    # a substring test and `in` on None raises TypeError, so a non-dict
    # section must be rejected before looking for keys.
    metadata = data.get("metadata", {})
    if not isinstance(metadata, dict):
        raise ValidationError(
            f"[validator] metadata must be an object. "
            f"Got: {type(metadata).__name__}"
        )
    missing_meta = [f for f in REQUIRED_METADATA_FIELDS if f not in metadata]
    if missing_meta:
        raise ValidationError(
            f"[validator] Missing metadata fields: {missing_meta}"
        )

    # Check lesson_flow fields (same type guard as for metadata)
    flow = data.get("lesson_flow", {})
    if not isinstance(flow, dict):
        raise ValidationError(
            f"[validator] lesson_flow must be an object. "
            f"Got: {type(flow).__name__}"
        )
    missing_flow = [f for f in REQUIRED_FLOW_FIELDS if f not in flow]
    if missing_flow:
        raise ValidationError(
            f"[validator] Missing lesson_flow fields: {missing_flow}"
        )

    # Check practice is a non-empty list
    practice = flow.get("practice", [])
    if not isinstance(practice, list) or not practice:
        raise ValidationError(
            f"[validator] lesson_flow.practice must be a non-empty list. "
            f"Got: {type(practice).__name__}"
        )

    # Check practice has no more than 3 prompts
    if len(practice) > 3:
        raise ValidationError(
            f"[validator] lesson_flow.practice has {len(practice)} prompts. "
            f"Maximum allowed is 3."
        )


# =============================================================================
# STEP 4: RUN POST-GENERATION GUARDRAIL CHECKS
# Runs all checks from checks.py and embeds the results back
# into the lesson dict. Flags are recorded but do not block saving.
# =============================================================================

def validate_against_schema(data: dict) -> dict:
    """
    Run the post-generation guardrail checks and store their outcome
    under the lesson's "guardrail_flags" key.

    Whatever the LLM put in guardrail_flags is replaced: the section is
    rebuilt from the results of run_post_checks(), one entry per check,
    each holding that check's status and message. Flags are recorded,
    not raised, so the lesson documents its own issues.

    Args:
        data: The parsed and field-validated lesson dict. Modified in place.

    Returns:
        The same lesson dict with guardrail_flags rebuilt from the checks.
    """
    flags = {}
    for check_name, outcome in run_post_checks(data).items():
        # Keep only the two attributes read here; the result objects
        # themselves are not stored in the lesson.
        flags[check_name] = {
            "status":  outcome.status,
            "message": outcome.message,
        }

    # Programmatic results take precedence over the LLM's self-assessment.
    data["guardrail_flags"] = flags
    return data

def strip_non_speaking_fields(lesson_dict: dict) -> dict:
    """
    Remove Speaking-specific fields from lessons that are not
    Speaking or Reading → Speaking — the LLM includes them anyway
    despite instructions.

    For any other ela_domain (including a missing one) this:
        - sets metadata.voice_markers to an empty list
        - removes "learning_goal_connection" from lesson_flow.hook,
          from every practice prompt, and from lesson_flow.reflect

    Sections that are missing or are not dicts (for example a hook the
    LLM returned as a plain string) are skipped instead of raising.

    Args:
        lesson_dict: The parsed lesson dict. Modified in place.

    Returns:
        The same lesson dict.
    """
    metadata = lesson_dict.get("metadata")
    if not isinstance(metadata, dict):
        # Detached placeholder: lets the code below run unchanged without
        # adding a metadata section to the lesson.
        metadata = {}

    if metadata.get("ela_domain", "") in ("Speaking", "Reading → Speaking"):
        return lesson_dict

    # Strip voice markers
    metadata["voice_markers"] = []

    flow = lesson_dict.get("lesson_flow")
    if not isinstance(flow, dict):
        return lesson_dict

    # Collect every section that may carry learning_goal_connection:
    # hook, reflect, and each individual practice prompt.
    sections = [flow.get("hook"), flow.get("reflect")]
    practice = flow.get("practice")
    if isinstance(practice, list):
        sections.extend(practice)

    for section in sections:
        # Only dicts have the key to remove; strings/None are left alone.
        if isinstance(section, dict):
            section.pop("learning_goal_connection", None)

    return lesson_dict

def normalize_reflect(lesson_dict: dict) -> dict:
    """
    Flatten feedback_anchors into reflect if the LLM nested them.
    Normalises: reflect.feedback_anchors.x → reflect.x

    Keys coming from feedback_anchors overwrite same-named keys already
    present on reflect. Nothing is changed when lesson_flow or reflect
    is missing or not a dict, or when feedback_anchors is not a dict
    (it is then left in place rather than discarded).

    Args:
        lesson_dict: The parsed lesson dict. Modified in place.

    Returns:
        The same lesson dict.
    """
    flow = lesson_dict.get("lesson_flow")
    reflect = flow.get("reflect") if isinstance(flow, dict) else None

    # A string reflect would turn `in` into a substring test and then
    # fail on .pop(); only a dict can be flattened.
    if not isinstance(reflect, dict):
        return lesson_dict

    anchors = reflect.get("feedback_anchors")
    if isinstance(anchors, dict):
        # Remove the wrapper first so a nested key with the same name
        # is not deleted by the removal afterwards.
        del reflect["feedback_anchors"]
        reflect.update(anchors)

    return lesson_dict
# =============================================================================
# MASTER VALIDATION FUNCTION
# Runs every validation and clean-up step in sequence.
# This is the only function the generator needs to call.
# =============================================================================

def validate_llm_output(raw: str) -> dict:
    """
    Master validation function. Runs the full pipeline:
        1.   Strip code fences
        2.   Parse JSON
        2.5  Auto-correct a dict-shaped practice section into a list
        3.   Validate required fields
        4.   Run guardrail checks and embed results
        5.   Strip Speaking-specific fields if needed
        6.   Normalize reflect

    A progress line is printed after each numbered step, so the last
    line printed before an exception shows how far the pipeline got.

    Args:
        raw: The raw string returned by the Groq API.

    Returns:
        A clean, validated lesson dict ready to be saved.

    Raises:
        ValidationError: If the JSON cannot be parsed (step 2) or the
            required-field checks fail (step 3).
        Guardrail findings from step 4 are recorded in guardrail_flags
        rather than raised.
    """
    print("[validator] Starting validation pipeline...")

    # Step 1: remove markdown fences so the text can be parsed as JSON
    cleaned = strip_code_fences(raw)
    print("[validator] Step 1 — Code fences stripped ✅")

    # Step 2: raises ValidationError on malformed JSON
    data = parse_json_safely(cleaned)
    print("[validator] Step 2 — JSON parsed successfully ✅")

    # Step 2.5: Auto-correct known LLM formatting mistakes.
    # Must run before step 3, which requires practice to be a list.
    data = autocorrect_practice(data)

    # Step 3: raises ValidationError; returns nothing, so data is not rebound
    validate_required_fields(data)
    print("[validator] Step 3 — Required fields present ✅")

    # Step 4: replaces guardrail_flags with the programmatic check results.
    # NOTE(review): the checks see the lesson before steps 5 and 6 modify
    # it — confirm that is the intended order.
    data = validate_against_schema(data)
    print("[validator] Step 4 — Guardrail checks complete ✅")

    # Step 5: the message below is printed for every lesson, including
    # Speaking lessons where nothing is removed
    data = strip_non_speaking_fields(data)
    print("[validator] Step 5 — Speaking-specific fields stripped ✅")

    # Step 6: flatten reflect.feedback_anchors into reflect
    data = normalize_reflect(data)
    print("[validator] Step 6 — Reflect normalized ✅")

    print(f"[validator] Validation passed for lesson: {data.get('lesson_id', 'UNKNOWN')}")
    return data