File size: 12,598 Bytes
1862200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ef3e92
1862200
3ef3e92
1862200
3ef3e92
 
 
1862200
3ef3e92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1862200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6be35e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1862200
6be35e1
 
 
 
 
 
 
 
 
 
1862200
 
 
 
 
 
 
 
 
 
 
 
 
6be35e1
 
1862200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6be35e1
 
 
 
 
 
 
 
1862200
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
"""
src/utils/validator.py
----------------------
Validates the raw JSON string returned by the LLM before
it is parsed into a Lesson object.

Why this exists:
    LLMs sometimes return malformed JSON, add markdown code fences
    (```json ... ```), or omit required fields. This module catches
    all of that before it causes an error downstream.

What will be defined here:
    - strip_code_fences(raw)        : Remove ```json and ``` wrappers
                                      if the LLM added them
    - parse_json_safely(raw)        : Try to parse JSON, return None
                                      and log error if it fails
    - validate_required_fields(data): Check all required lesson fields
                                      are present in the parsed dict
    - validate_against_schema(data) : Run the post-generation guardrail
                                      checks and record their results in
                                      the lesson's guardrail_flags
    - validate_llm_output(raw)      : Master function — runs all of the
                                      above in sequence, returns a
                                      validated lesson dict or raises
                                      a descriptive ValidationError

Design note:
    This is the safety net between the LLM and our data layer.
    Nothing touches data/generated/ without passing through here first.
"""

import json
import re
from src.guardrails.checks import run_post_checks


# =============================================================================
# CUSTOM EXCEPTION
# Raised when validation fails at any step.
# Always includes a descriptive message so the generator knows
# exactly what went wrong and can log it clearly.
# =============================================================================

class ValidationError(Exception):
    """Signal that the LLM output was rejected by a validation step.

    The exception message names the step that failed and the reason,
    so callers can log it without further inspection.
    """


# =============================================================================
# REQUIRED FIELDS
# The top-level fields every lesson must have.
# Nested field validation is handled by run_post_checks().
# =============================================================================

# Keys that must be present at the top level of the parsed lesson dict.
# validate_required_fields() reports every one that is absent.
REQUIRED_TOP_LEVEL_FIELDS = [
    "lesson_id",
    "metadata",
    "lesson_flow",
    "guardrail_flags",
]

# Keys that must be present inside the lesson's "metadata" dict.
REQUIRED_METADATA_FIELDS = [
    "grade_band",
    "ela_domain",
    "lesson_type",
    "theme",
    "primary_skill",
    "voice_markers",
    "estimated_duration_minutes",
    "ccss_anchor",
]

# Keys that must be present inside the lesson's "lesson_flow" dict.
# "practice" is additionally checked for type and length by
# validate_required_fields().
REQUIRED_FLOW_FIELDS = [
    "hook",
    "model",
    "practice",
    "reflect",
]


# =============================================================================
# STEP 1: STRIP CODE FENCES
# LLMs frequently wrap JSON in markdown code fences even when told not to.
# This strips them before attempting to parse.
# =============================================================================

# A complete fenced block: an opening fence at the start of a line (with an
# optional info string such as "json" or "JSON"), the payload, and a closing
# fence that ends a line. Anything before or after the block is ignored, which
# lets us discard chatty prose the LLM puts around the JSON.
_FENCED_BLOCK_RE = re.compile(
    r"^[ \t]*```[ \t]*[\w+-]*[ \t]*\r?\n(.*?)\s*```[ \t]*$",
    re.DOTALL | re.MULTILINE,
)


def strip_code_fences(raw: str) -> str:
    """
    Remove markdown code fences from a raw LLM response string.

    Handles all common fence patterns:
        ```json ... ```   (tag matched case-insensitively)
        ```      ... ```
        `        ... `
    When the response contains a complete multi-line fenced block, only
    the content of the first such block is returned, so explanatory text
    before or after the fence is dropped as well.

    Args:
        raw: The raw string returned by the LLM.

    Returns:
        The cleaned string with fences removed and whitespace stripped.
        Input without any fences is returned stripped but otherwise
        unchanged.
    """
    text = raw.strip()

    fenced = _FENCED_BLOCK_RE.search(text)
    if fenced:
        # Preferred path: a well-formed fenced block exists somewhere in
        # the response; keep only its payload.
        text = fenced.group(1).strip()
    else:
        # Fallback for single-line or unterminated fences. The patterns are
        # anchored to the very start/end of the response (\A, \Z) so that
        # backticks inside the JSON payload are never touched.
        text = re.sub(r"\A```(?:json)?\s*", "", text, flags=re.IGNORECASE)
        text = re.sub(r"\s*```\Z", "", text)

    # Remove single backtick wrapping (less common but possible)
    if text.startswith("`") and text.endswith("`"):
        text = text[1:-1]

    return text.strip()


# =============================================================================
# STEP 2: PARSE JSON SAFELY
# Attempt json.loads() and raise a descriptive ValidationError if it fails.
# =============================================================================

def parse_json_safely(raw: str) -> dict:
    """
    Parse a JSON string into a Python dict.
    Raises ValidationError with a clear message if parsing fails or if
    the parsed value is not a JSON object.

    Args:
        raw: A (hopefully) valid JSON string.

    Returns:
        A Python dict.

    Raises:
        ValidationError: If the string is not valid JSON, or if it is
            valid JSON whose top-level value is not an object (for
            example a list, string, number or null).
    """
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        # Show the first 300 chars of the raw string to help debugging
        preview = raw[:300] + "..." if len(raw) > 300 else raw
        # Chain the original error so the traceback keeps the parser details.
        raise ValidationError(
            f"[validator] LLM returned invalid JSON.\n"
            f"JSON error: {e}\n"
            f"Raw output preview:\n{preview}"
        ) from e

    # json.loads happily returns lists, strings, numbers and None. Later
    # steps call dict methods on the result, so reject anything else here
    # with a descriptive error instead of an AttributeError downstream.
    if not isinstance(data, dict):
        raise ValidationError(
            f"[validator] LLM returned valid JSON, but the top-level value "
            f"is not an object. Got: {type(data).__name__}"
        )

    return data


# =============================================================================
# STEP 3: VALIDATE REQUIRED FIELDS
# Check that the parsed dict contains all required top-level,
# metadata, and lesson_flow fields.
# =============================================================================

def autocorrect_practice(lesson_dict: dict) -> dict:
    """
    Auto-correct common LLM mistakes in the practice field.

    Only acts when lesson_flow.practice is a dict; any other shape
    (including a missing or non-dict lesson_flow) is left untouched so
    that validate_required_fields() can report it.

    Handles three cases, checked in this order:
        1. practice has a "prompts" key holding a list → unwrap that list
        2. practice has some other non-empty list value → use the first one
        3. otherwise → use the dict's values as the list of prompts

    Args:
        lesson_dict: The parsed lesson dict. Modified in place.

    Returns:
        The same lesson_dict object, with lesson_flow.practice replaced
        by a list when a correction was applied.
    """
    flow = lesson_dict.get("lesson_flow")
    if not isinstance(flow, dict):
        # Nothing to correct; a null or non-object lesson_flow would
        # otherwise crash on .get() below.
        return lesson_dict

    practice = flow.get("practice")
    if not isinstance(practice, dict):
        return lesson_dict

    prompts = practice.get("prompts")
    if isinstance(prompts, list):
        # Case 1: {"prompts": [...]} — unwrap the list
        print("[validator] ⚠️  practice was a dict with 'prompts' key — unwrapping...")
        corrected = prompts
    else:
        # Case 2: any other dict — take the first non-empty list value
        first_list = next(
            (value for value in practice.values() if isinstance(value, list) and value),
            None,
        )
        if first_list is not None:
            print("[validator] ⚠️  practice was an unknown dict — extracting first list value...")
            corrected = first_list
        else:
            # Case 3: {"P1": {...}, "P2": {...}} — convert values to list
            print("[validator] ⚠️  practice was a dict — auto-correcting to list...")
            corrected = list(practice.values())

    flow["practice"] = corrected
    print(f"[validator] Auto-corrected practice → {len(corrected)} prompts")

    return lesson_dict


# Upper bound on the number of entries allowed in lesson_flow.practice.
_MAX_PRACTICE_PROMPTS = 3


def validate_required_fields(data: dict) -> None:
    """
    Check that all required fields are present in the parsed lesson dict.
    Checks top-level, metadata, and lesson_flow fields, and that
    lesson_flow.practice is a list with between 1 and 3 entries.

    The "metadata" and "lesson_flow" values must themselves be dicts;
    a string or null there is rejected explicitly rather than being
    probed with the `in` operator (which would do substring matching on
    a string and raise TypeError on None).

    Args:
        data: The parsed lesson dict.

    Raises:
        ValidationError: If data or one of its sections has the wrong
            type, if any required field is missing, or if practice is
            not a list of 1-3 prompts.
    """
    if not isinstance(data, dict):
        raise ValidationError(
            f"[validator] Lesson must be a JSON object. "
            f"Got: {type(data).__name__}"
        )

    # Check top-level fields
    missing_top = [f for f in REQUIRED_TOP_LEVEL_FIELDS if f not in data]
    if missing_top:
        raise ValidationError(
            f"[validator] Missing top-level fields: {missing_top}"
        )

    # Check metadata fields
    metadata = data.get("metadata")
    if not isinstance(metadata, dict):
        raise ValidationError(
            f"[validator] metadata must be an object. "
            f"Got: {type(metadata).__name__}"
        )
    missing_meta = [f for f in REQUIRED_METADATA_FIELDS if f not in metadata]
    if missing_meta:
        raise ValidationError(
            f"[validator] Missing metadata fields: {missing_meta}"
        )

    # Check lesson_flow fields
    flow = data.get("lesson_flow")
    if not isinstance(flow, dict):
        raise ValidationError(
            f"[validator] lesson_flow must be an object. "
            f"Got: {type(flow).__name__}"
        )
    missing_flow = [f for f in REQUIRED_FLOW_FIELDS if f not in flow]
    if missing_flow:
        raise ValidationError(
            f"[validator] Missing lesson_flow fields: {missing_flow}"
        )

    # Check practice is a non-empty list
    practice = flow.get("practice")
    if not isinstance(practice, list) or not practice:
        # Distinguish "wrong type" from "right type but empty" in the message.
        got = "empty list" if isinstance(practice, list) else type(practice).__name__
        raise ValidationError(
            f"[validator] lesson_flow.practice must be a non-empty list. "
            f"Got: {got}"
        )

    # Check practice does not exceed the prompt limit
    if len(practice) > _MAX_PRACTICE_PROMPTS:
        raise ValidationError(
            f"[validator] lesson_flow.practice has {len(practice)} prompts. "
            f"Maximum allowed is {_MAX_PRACTICE_PROMPTS}."
        )


# =============================================================================
# STEP 4: RUN POST-GENERATION GUARDRAIL CHECKS
# Runs all checks from checks.py and embeds the results back
# into the lesson dict. Flags are recorded but do not block saving.
# =============================================================================

def validate_against_schema(data: dict) -> dict:
    """
    Replace the lesson's guardrail_flags with the outcome of
    run_post_checks().

    Each check result is stored under its check name as a
    {"status": ..., "message": ...} entry. Nothing is raised here
    when a check flags a problem — the flag is simply recorded so the
    saved lesson documents its own issues.

    Args:
        data: The parsed and field-validated lesson dict.

    Returns:
        The same lesson dict, with guardrail_flags rebuilt from the
        programmatic checks.
    """
    flags = {}
    for check_name, outcome in run_post_checks(data).items():
        flags[check_name] = {
            "status":  outcome.status,
            "message": outcome.message,
        }

    # Whatever the LLM wrote into guardrail_flags is discarded in favour
    # of the programmatic results, which are treated as more reliable.
    data["guardrail_flags"] = flags
    return data

# ela_domain values for which Speaking-specific fields are kept.
# The arrow is written as an escape so it cannot be corrupted by an
# encoding round-trip. The last entry is the mis-encoded spelling of the
# same label (UTF-8 bytes read through a single-byte codepage); it is still
# accepted so data that matched before keeps matching.
_SPEAKING_DOMAINS = (
    "Speaking",
    "Reading \u2192 Speaking",
    "Reading \u03b2\u2020\u2019 Speaking",
)


def strip_non_speaking_fields(lesson_dict: dict) -> dict:
    """
    Remove Speaking-specific fields from lessons that are not
    Speaking or Reading → Speaking — the LLM includes them anyway
    despite instructions.

    For every other ela_domain this clears metadata.voice_markers and
    removes "learning_goal_connection" from the hook, from each practice
    prompt, and from reflect. Sections or prompts that are not dicts are
    skipped rather than raising.

    Args:
        lesson_dict: The lesson dict. Modified in place.

    Returns:
        The same lesson_dict object.
    """
    metadata = lesson_dict.get("metadata")
    if not isinstance(metadata, dict):
        metadata = {}

    if metadata.get("ela_domain", "") in _SPEAKING_DOMAINS:
        return lesson_dict

    # Strip voice markers (no-op when the lesson has no metadata dict)
    metadata["voice_markers"] = []

    flow = lesson_dict.get("lesson_flow")
    if not isinstance(flow, dict):
        return lesson_dict

    # Strip learning_goal_connection from hook and reflect
    for section_name in ("hook", "reflect"):
        section = flow.get(section_name)
        if isinstance(section, dict):
            section.pop("learning_goal_connection", None)

    # Strip from practice prompts
    practice = flow.get("practice")
    if isinstance(practice, (list, tuple)):
        for prompt in practice:
            if isinstance(prompt, dict):
                prompt.pop("learning_goal_connection", None)

    return lesson_dict

def normalize_reflect(lesson_dict: dict) -> dict:
    """
    Flatten feedback_anchors into reflect if the LLM nested them.
    Normalises: reflect.feedback_anchors.x → reflect.x

    Keys from feedback_anchors overwrite same-named keys already on
    reflect. The flattening only happens when both reflect and
    feedback_anchors are dicts; any other shape is left exactly as it
    was, so no data is dropped on malformed input.

    Args:
        lesson_dict: The lesson dict. Modified in place.

    Returns:
        The same lesson_dict object.
    """
    flow = lesson_dict.get("lesson_flow")
    reflect = flow.get("reflect") if isinstance(flow, dict) else None
    if not isinstance(reflect, dict):
        return lesson_dict

    anchors = reflect.get("feedback_anchors")
    if isinstance(anchors, dict):
        # Remove the wrapper key first so that update() can re-add a key of
        # the same name only if the nested dict itself carries one.
        del reflect["feedback_anchors"]
        reflect.update(anchors)

    return lesson_dict
# =============================================================================
# MASTER VALIDATION FUNCTION
# Runs every validation and clean-up step in sequence.
# This is the only function the generator needs to call.
# =============================================================================

def validate_llm_output(raw: str) -> dict:
    """
    Master validation function. Runs the full pipeline:
        1.   Strip code fences
        2.   Parse JSON
        2.5. Auto-correct a mis-shaped practice field
        3.   Validate required fields
        4.   Run guardrail checks and embed results
        5.   Strip Speaking-specific fields if needed
        6.   Normalize reflect

    Progress is reported with print() after each numbered step.

    Args:
        raw: The raw string returned by the LLM.

    Returns:
        A clean, validated lesson dict ready to be saved.

    Raises:
        ValidationError: If the JSON cannot be parsed (step 2) or the
            required-field checks fail (step 3).
        Step 4 records guardrail results on the lesson instead of raising.
    """
    print("[validator] Starting validation pipeline...")

    # Step 1
    cleaned = strip_code_fences(raw)
    print("[validator] Step 1 — Code fences stripped ✅")

    # Step 2
    data = parse_json_safely(cleaned)
    print("[validator] Step 2 — JSON parsed successfully ✅")

    # Step 2.5: Auto-correct known LLM formatting mistakes
    # (must run before step 3, which requires practice to be a list)
    data = autocorrect_practice(data)

    # Step 3
    validate_required_fields(data)
    print("[validator] Step 3 — Required fields present ✅")

    # Step 4
    data = validate_against_schema(data)
    print("[validator] Step 4 — Guardrail checks complete ✅")

    # Step 5
    data = strip_non_speaking_fields(data)
    print("[validator] Step 5 — Speaking-specific fields stripped ✅")

    # Step 6
    data = normalize_reflect(data)
    print("[validator] Step 6 — Reflect normalized ✅")

    print(f"[validator] Validation passed for lesson: {data.get('lesson_id', 'UNKNOWN')}")
    return data