aankitdas's picture
components connect to learning goals
6be35e1
"""
src/utils/validator.py
----------------------
Validates the raw JSON string returned by the LLM before
it is parsed into a Lesson object.
Why this exists:
LLMs sometimes return malformed JSON, add markdown code fences
(```json ... ```), or omit required fields. This module catches
all of that before it causes an error downstream.
What is defined here:
    - strip_code_fences(raw)         : Remove ```json and ``` wrappers
                                       if the LLM added them
    - parse_json_safely(raw)         : Try to parse JSON, raise a descriptive
                                       ValidationError if it fails
    - validate_required_fields(data) : Check all required lesson fields
                                       are present in the parsed dict
    - validate_against_schema(data)  : Run the post-generation guardrail
                                       checks and embed their results in
                                       the lesson's guardrail_flags
    - validate_llm_output(raw)       : Master function — runs all of the
                                       above in sequence, returns a
                                       validated lesson dict or raises
                                       a descriptive ValidationError
Design note:
This is the safety net between the LLM and our data layer.
Nothing touches data/generated/ without passing through here first.
"""
import json
import re
from src.guardrails.checks import run_post_checks
# =============================================================================
# CUSTOM EXCEPTION
# Raised when validation fails at any step.
# Always includes a descriptive message so the generator knows
# exactly what went wrong and can log it clearly.
# =============================================================================
class ValidationError(Exception):
    """Signal that the LLM output failed one of the validation steps.

    The exception message names the step that failed and explains why,
    so callers can log it without further inspection.
    """
# =============================================================================
# REQUIRED FIELDS
# The top-level fields every lesson must have.
# Nested field validation is handled by run_post_checks().
# =============================================================================
# Keys that must exist at the root of the lesson dict.
REQUIRED_TOP_LEVEL_FIELDS = ["lesson_id", "metadata", "lesson_flow", "guardrail_flags"]

# Keys that must exist inside lesson["metadata"].
REQUIRED_METADATA_FIELDS = [
    "grade_band", "ela_domain", "lesson_type", "theme",
    "primary_skill", "voice_markers", "estimated_duration_minutes", "ccss_anchor",
]

# Keys that must exist inside lesson["lesson_flow"].
REQUIRED_FLOW_FIELDS = ["hook", "model", "practice", "reflect"]
# =============================================================================
# STEP 1: STRIP CODE FENCES
# LLMs frequently wrap JSON in markdown code fences even when told not to.
# This strips them before attempting to parse.
# =============================================================================
def strip_code_fences(raw: str) -> str:
    """
    Remove markdown code fences from a raw LLM response string.

    Handles all common fence patterns:
        ```json ... ```   (the language tag is matched case-insensitively,
                           so ```JSON and ```Json are handled too)
        ``` ... ```
        ` ... `

    Args:
        raw: The raw string returned by the LLM.

    Returns:
        The cleaned string with fences removed and whitespace stripped.
    """
    # IGNORECASE: models sometimes emit ```JSON; without it the tag would be
    # left behind in the payload and break parsing.
    # MULTILINE: a fence is recognised at the start/end of any line, not only
    # at the very start/end of the whole response.
    fence_flags = re.MULTILINE | re.IGNORECASE

    # Remove opening ```json / ``` fences (and any closing fence that sits
    # at the start of its own line).
    cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip(), flags=fence_flags)
    # Remove closing fences that trail other content on the same line.
    cleaned = re.sub(r"\s*```$", "", cleaned.strip(), flags=fence_flags)

    # Remove single backtick wrapping (less common but possible)
    if cleaned.startswith("`") and cleaned.endswith("`"):
        cleaned = cleaned[1:-1]

    return cleaned.strip()
# =============================================================================
# STEP 2: PARSE JSON SAFELY
# Attempt json.loads() and raise a descriptive ValidationError if it fails.
# =============================================================================
def parse_json_safely(raw: str) -> dict:
    """
    Parse a JSON string into a Python dict.

    Raises ValidationError with a clear message if parsing fails, or if the
    text is valid JSON whose top level is not an object (a list, string,
    number, boolean or null) — later steps call dict methods on the result.

    Args:
        raw: A (hopefully) valid JSON string.

    Returns:
        A Python dict.

    Raises:
        ValidationError: If the string is not valid JSON, or if its top-level
            value is not a JSON object.
    """
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as e:
        # Show the first 300 chars of the raw string to help debugging
        preview = raw[:300] + "..." if len(raw) > 300 else raw
        # "from e" keeps the decoder's traceback attached to the new error.
        raise ValidationError(
            "[validator] LLM returned invalid JSON.\n"
            f"JSON error: {e}\n"
            f"Raw output preview:\n{preview}"
        ) from e

    if not isinstance(parsed, dict):
        preview = raw[:300] + "..." if len(raw) > 300 else raw
        raise ValidationError(
            "[validator] LLM returned JSON that is not an object.\n"
            f"Top-level type: {type(parsed).__name__}\n"
            f"Raw output preview:\n{preview}"
        )

    return parsed
# =============================================================================
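# STEP 2.5 / STEP 3: AUTO-CORRECT, THEN VALIDATE REQUIRED FIELDS
# autocorrect_practice() first repairs known mistakes in the shape of
# lesson_flow.practice. validate_required_fields() then checks that the
# parsed dict contains all required top-level, metadata, and lesson_flow
# fields.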
# =============================================================================
def autocorrect_practice(lesson_dict: dict) -> dict:
    """
    Auto-correct common LLM mistakes in the practice field.

    Only acts when lesson_flow.practice is a dict; anything else (including a
    missing or non-dict lesson_flow) is returned untouched so that
    validate_required_fields() can report the problem. Cases, in priority
    order:
        1. practice has a "prompts" key holding a list -> unwrap that list
        2. any other dict with a non-empty list value  -> use the first such list
        3. otherwise (e.g. {"P1": {...}, "P2": {...}}) -> use the dict's values

    Args:
        lesson_dict: The parsed lesson. Modified in place.

    Returns:
        The same lesson_dict object, with lesson_flow.practice replaced by a
        list when a correction was applied.
    """
    # Guard every level: the LLM may emit null / a list where an object is
    # expected, and this step must not crash before validation can report it.
    flow = lesson_dict.get("lesson_flow") if isinstance(lesson_dict, dict) else None
    if not isinstance(flow, dict):
        return lesson_dict

    practice = flow.get("practice")
    if not isinstance(practice, dict):
        return lesson_dict

    prompts = practice.get("prompts")
    if isinstance(prompts, list):
        # Case 1: {"prompts": [...]}
        print("[validator] ⚠️ practice was a dict with 'prompts' key β€” unwrapping...")
        corrected = prompts
    else:
        # Case 2: first value that is a non-empty list, if there is one.
        first_list = next(
            (value for value in practice.values() if isinstance(value, list) and value),
            None,
        )
        if first_list is not None:
            print("[validator] ⚠️ practice was an unknown dict β€” extracting first list value...")
            corrected = first_list
        else:
            # Case 3: treat every value of the dict as one prompt.
            print("[validator] ⚠️ practice was a dict β€” auto-correcting to list...")
            corrected = list(practice.values())

    flow["practice"] = corrected
    print(f"[validator] Auto-corrected practice β†’ {len(corrected)} prompts")
    return lesson_dict
def validate_required_fields(data: dict) -> None:
    """
    Check that all required fields are present in the parsed lesson dict.

    Checks top-level, metadata, and lesson_flow fields, then checks that
    lesson_flow.practice is a list holding between 1 and 3 prompts. A
    container that has the wrong JSON type (e.g. metadata is null or a
    string) is reported as a ValidationError rather than surfacing as a
    TypeError or a misleading substring match.

    Args:
        data: The parsed lesson dict.

    Raises:
        ValidationError: If the lesson, its metadata or its lesson_flow is
            not a dict, if any required field is missing, or if practice is
            not a list of 1-3 prompts.
    """
    if not isinstance(data, dict):
        raise ValidationError(
            f"[validator] Lesson must be a JSON object. Got: {type(data).__name__}"
        )

    # Check top-level fields
    missing_top = [f for f in REQUIRED_TOP_LEVEL_FIELDS if f not in data]
    if missing_top:
        raise ValidationError(
            f"[validator] Missing top-level fields: {missing_top}"
        )

    # Check metadata fields (the key exists; make sure it is an object first,
    # because "in" on a string would silently do a substring test)
    metadata = data["metadata"]
    if not isinstance(metadata, dict):
        raise ValidationError(
            f"[validator] metadata must be a JSON object. Got: {type(metadata).__name__}"
        )
    missing_meta = [f for f in REQUIRED_METADATA_FIELDS if f not in metadata]
    if missing_meta:
        raise ValidationError(
            f"[validator] Missing metadata fields: {missing_meta}"
        )

    # Check lesson_flow fields
    flow = data["lesson_flow"]
    if not isinstance(flow, dict):
        raise ValidationError(
            f"[validator] lesson_flow must be a JSON object. Got: {type(flow).__name__}"
        )
    missing_flow = [f for f in REQUIRED_FLOW_FIELDS if f not in flow]
    if missing_flow:
        raise ValidationError(
            f"[validator] Missing lesson_flow fields: {missing_flow}"
        )

    # Check practice is a non-empty list
    practice = flow["practice"]
    if not isinstance(practice, list) or not practice:
        raise ValidationError(
            "[validator] lesson_flow.practice must be a non-empty list. "
            f"Got: {type(practice).__name__}"
        )

    # Check practice has no more than 3 prompts
    if len(practice) > 3:
        raise ValidationError(
            f"[validator] lesson_flow.practice has {len(practice)} prompts. "
            "Maximum allowed is 3."
        )
# =============================================================================
# STEP 4: RUN POST-GENERATION GUARDRAIL CHECKS
# Runs all checks from checks.py and embeds the results back
# into the lesson dict. Flags are recorded but do not block saving.
# =============================================================================
def validate_against_schema(data: dict) -> dict:
    """
    Run the post-generation guardrail checks and store their outcome in the
    lesson's guardrail_flags section.

    Nothing in this function raises when a check flags something; each
    check's status and message are written into the lesson so the saved
    lesson documents its own issues.

    Args:
        data: The parsed and field-validated lesson dict. Modified in place.

    Returns:
        The same lesson dict, with guardrail_flags replaced by one
        {"status", "message"} entry per check.
    """
    # The LLM's self-assessed flags are discarded on purpose: the
    # programmatic checks are treated as the more reliable source.
    flags = {}
    for check_name, outcome in run_post_checks(data).items():
        flags[check_name] = {
            "status": outcome.status,
            "message": outcome.message,
        }

    data["guardrail_flags"] = flags
    return data
def strip_non_speaking_fields(lesson_dict: dict) -> dict:
    """
    Remove Speaking-specific fields from lessons whose ela_domain is not
    "Speaking" or "Reading → Speaking" — the LLM includes them anyway
    despite instructions.

    For every other domain this empties metadata.voice_markers and removes
    learning_goal_connection from the hook, from each practice prompt and
    from reflect. Sections that are missing or are not dicts are skipped
    instead of raising.

    Args:
        lesson_dict: The lesson dict. Modified in place.

    Returns:
        The same lesson_dict object.
    """
    # The third entry is the arrow as it appears when UTF-8 "→" has been
    # decoded with the wrong code page. It is kept so behavior is unchanged
    # for any data that carries the garbled form; the correctly encoded
    # value is what real lessons are expected to contain.
    speaking_domains = (
        "Speaking",
        "Reading → Speaking",
        "Reading \u03b2\u2020\u2019 Speaking",
    )

    metadata = lesson_dict.get("metadata")
    ela_domain = metadata.get("ela_domain", "") if isinstance(metadata, dict) else ""
    if ela_domain in speaking_domains:
        return lesson_dict

    # Strip voice markers
    if isinstance(metadata, dict):
        metadata["voice_markers"] = []

    flow = lesson_dict.get("lesson_flow")
    if not isinstance(flow, dict):
        return lesson_dict

    # Collect hook, reflect and every practice prompt, then strip the field
    # from those that are actually dicts (the LLM may emit plain strings).
    sections = [flow.get("hook"), flow.get("reflect")]
    practice = flow.get("practice")
    if isinstance(practice, list):
        sections.extend(practice)

    for section in sections:
        if isinstance(section, dict):
            section.pop("learning_goal_connection", None)

    return lesson_dict
def normalize_reflect(lesson_dict: dict) -> dict:
    """
    Flatten feedback_anchors into reflect if the LLM nested them.

    Normalises: reflect.feedback_anchors.x -> reflect.x

    The flattening only happens when both reflect and feedback_anchors are
    dicts. Any other shape is left exactly as it was, so a malformed value
    is neither dropped nor turned into a crash.

    Args:
        lesson_dict: The lesson dict. Modified in place.

    Returns:
        The same lesson_dict object.
    """
    flow = lesson_dict.get("lesson_flow")
    reflect = flow.get("reflect") if isinstance(flow, dict) else None
    if not isinstance(reflect, dict):
        return lesson_dict

    anchors = reflect.get("feedback_anchors")
    # Check the type BEFORE removing the key: update() raises on a list or a
    # string, and by then the original value would already have been lost.
    if isinstance(anchors, dict):
        del reflect["feedback_anchors"]
        # Nested keys win over same-named keys already present on reflect.
        reflect.update(anchors)

    return lesson_dict
# =============================================================================
# MASTER VALIDATION FUNCTION
# Runs all six steps (plus the step-2.5 auto-correction) in sequence.
# This is the only function the generator needs to call.
# =============================================================================
def validate_llm_output(raw: str) -> dict:
    """
    Master validation function. Runs the whole pipeline in order:
        1.   Strip code fences
        2.   Parse JSON
        2.5. Auto-correct known mistakes in the practice field
        3.   Validate required fields
        4.   Run guardrail checks and embed results
        5.   Strip Speaking-specific fields if needed
        6.   Normalize reflect

    Progress is reported on stdout after each numbered step.

    Args:
        raw: The raw string returned by the Groq API.

    Returns:
        A clean, validated lesson dict ready to be saved.

    Raises:
        ValidationError: If steps 1-3 fail (structural problems).
            Step 4 is meant to flag and record issues, not raise.
    """
    print("[validator] Starting validation pipeline...")

    # Steps 1-3 are the structural gate: any failure here aborts the run.
    fence_free = strip_code_fences(raw)
    print("[validator] Step 1 β€” Code fences stripped βœ…")

    lesson = parse_json_safely(fence_free)
    print("[validator] Step 2 β€” JSON parsed successfully βœ…")

    # Step 2.5 runs silently unless it actually changes something.
    lesson = autocorrect_practice(lesson)

    validate_required_fields(lesson)
    print("[validator] Step 3 β€” Required fields present βœ…")

    # Steps 4-6 each take the lesson and hand back the lesson.
    # NOTE(review): the guardrail checks see the lesson before reflect is
    # flattened and before speaking fields are stripped — confirm that
    # run_post_checks expects that shape.
    post_processing = (
        (validate_against_schema, "[validator] Step 4 β€” Guardrail checks complete βœ…"),
        (strip_non_speaking_fields, "[validator] Step 5 β€” Speaking-specific fields stripped βœ…"),
        (normalize_reflect, "[validator] Step 6 β€” Reflect normalized βœ…"),
    )
    for stage, done_message in post_processing:
        lesson = stage(lesson)
        print(done_message)

    print(f"[validator] Validation passed for lesson: {lesson.get('lesson_id', 'UNKNOWN')}")
    return lesson