"""
src/utils/validator.py
----------------------
Validates the raw JSON string returned by the LLM before
it is parsed into a Lesson object.
Why this exists:
LLMs sometimes return malformed JSON, add markdown code fences
(```json ... ```), or omit required fields. This module catches
all of that before it causes an error downstream.
What will be defined here:
- strip_code_fences(raw) : Remove ```json and ``` wrappers
if the LLM added them
- parse_json_safely(raw) : Try to parse JSON; raise a
descriptive ValidationError if it fails
- validate_required_fields(data): Check all required lesson fields
are present in the parsed dict
- validate_against_schema(data) : Run the post-generation guardrail
checks and record their results in guardrail_flags
- validate_llm_output(raw) : Master function — runs all of the
above in sequence, returns a
validated Lesson object or raises
a descriptive ValidationError
Design note:
This is the safety net between the LLM and our data layer.
Nothing touches data/generated/ without passing through here first.
"""
import json
import re
from src.guardrails.checks import run_post_checks
# =============================================================================
# CUSTOM EXCEPTION
# Raised when validation fails at any step.
# Always includes a descriptive message so the generator knows
# exactly what went wrong and can log it clearly.
# =============================================================================
class ValidationError(Exception):
    """Signal that the LLM output failed one of the validation steps.

    The exception message is expected to name the failing step and the
    reason, so the caller can log it as-is.
    """
# =============================================================================
# REQUIRED FIELDS
# The top-level fields every lesson must have.
# Nested field validation is handled by run_post_checks().
# =============================================================================
# Keys looked up directly on the parsed lesson dict.
REQUIRED_TOP_LEVEL_FIELDS = ["lesson_id", "metadata", "lesson_flow", "guardrail_flags"]

# Keys looked up inside lesson["metadata"].
REQUIRED_METADATA_FIELDS = [
    "grade_band",
    "ela_domain",
    "lesson_type",
    "theme",
    "primary_skill",
    "voice_markers",
    "estimated_duration_minutes",
    "ccss_anchor",
]

# Keys looked up inside lesson["lesson_flow"].
REQUIRED_FLOW_FIELDS = ["hook", "model", "practice", "reflect"]
# =============================================================================
# STEP 1: STRIP CODE FENCES
# LLMs frequently wrap JSON in markdown code fences even when told not to.
# This strips them before attempting to parse.
# =============================================================================
def strip_code_fences(raw: str) -> str:
    """
    Strip markdown fence markers from a raw LLM response.

    Patterns removed:
        ```json ... ```
        ``` ... ```
        ` ... `

    Args:
        raw: The raw string returned by the LLM.

    Returns:
        The text without fence markers, with surrounding whitespace trimmed.
    """
    # Both patterns are multiline-anchored: the opener matches at the start
    # of any line, the closer at the end of any line.
    opening_fence = re.compile(r"^```(?:json)?\s*", flags=re.MULTILINE)
    closing_fence = re.compile(r"\s*```$", flags=re.MULTILINE)

    text = opening_fence.sub("", raw.strip())
    text = closing_fence.sub("", text.strip())

    # A response wrapped in a single pair of backticks is rarer, but seen.
    wrapped_in_backticks = text.startswith("`") and text.endswith("`")
    if wrapped_in_backticks:
        text = text[1:-1]

    return text.strip()
# =============================================================================
# STEP 2: PARSE JSON SAFELY
# Attempt json.loads() and raise a descriptive ValidationError if it fails.
# =============================================================================
def parse_json_safely(raw: str) -> dict:
    """
    Parse a JSON string into a Python dict.

    Args:
        raw: A (hopefully) valid JSON string.

    Returns:
        The parsed JSON object as a dict.

    Raises:
        ValidationError: If the string is not valid JSON, or if it is valid
            JSON whose top-level value is not an object (list, string,
            number, null). The rest of the pipeline calls ``.get()`` on the
            result, so anything but a dict would crash later with an
            unrelated AttributeError.
    """
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as e:
        # Show the first 300 chars of the raw string to help debugging
        preview = raw[:300] + "..." if len(raw) > 300 else raw
        raise ValidationError(
            f"[validator] LLM returned invalid JSON.\n"
            f"JSON error: {e}\n"
            f"Raw output preview:\n{preview}"
        ) from e

    if not isinstance(parsed, dict):
        raise ValidationError(
            "[validator] LLM returned valid JSON, but the top-level value "
            f"is a {type(parsed).__name__}, not an object."
        )
    return parsed
# =============================================================================
# STEP 3: VALIDATE REQUIRED FIELDS
# Check that the parsed dict contains all required top-level,
# metadata, and lesson_flow fields.
# =============================================================================
def autocorrect_practice(lesson_dict: dict) -> dict:
    """
    Auto-correct common LLM mistakes in lesson_flow.practice.

    When practice arrives as a dict instead of a list, it is replaced
    in place using the first rule that applies:
        1. {"prompts": [...]}        -> the "prompts" list
        2. any non-empty list value  -> the first such list
        3. anything else             -> list of the dict's values
           (e.g. {"P1": {...}, "P2": {...}})

    Nothing is changed when lesson_flow is missing or not a dict, or when
    practice is not a dict; those shapes are left for the required-field
    validation to report.

    Args:
        lesson_dict: The parsed lesson dict. Mutated in place.

    Returns:
        The same lesson_dict object.
    """
    flow = lesson_dict.get("lesson_flow")
    if not isinstance(flow, dict):
        # e.g. "lesson_flow": null — nothing to correct here.
        return lesson_dict

    practice = flow.get("practice")
    if not isinstance(practice, dict):
        return lesson_dict

    prompts = practice.get("prompts")
    if isinstance(prompts, list):
        print("[validator] ⚠️ practice was a dict with 'prompts' key — unwrapping...")
        flow["practice"] = prompts
    else:
        # Prefer a value that already looks like a prompt list.
        first_list = next(
            (value for value in practice.values() if isinstance(value, list) and value),
            None,
        )
        if first_list is not None:
            print("[validator] ⚠️ practice was an unknown dict — extracting first list value...")
            flow["practice"] = first_list
        else:
            print("[validator] ⚠️ practice was a dict — auto-correcting to list...")
            flow["practice"] = list(practice.values())

    corrected = flow["practice"]
    print(f"[validator] Auto-corrected practice → {len(corrected)} prompts")
    return lesson_dict
def validate_required_fields(data: dict) -> None:
    """
    Check that all required fields are present in the parsed lesson dict.

    Checks, in order: top-level fields, metadata fields, lesson_flow
    fields, then that lesson_flow.practice is a list of 1 to 3 prompts.
    The first failing check raises; later checks are not run.

    Args:
        data: The parsed lesson dict.

    Raises:
        ValidationError: If a required field is missing, if the lesson,
            metadata or lesson_flow is not a JSON object, or if practice
            is not a non-empty list of at most 3 prompts.
    """
    if not isinstance(data, dict):
        raise ValidationError(
            f"[validator] Lesson must be a JSON object. "
            f"Got: {type(data).__name__}"
        )

    # Check top-level fields
    missing_top = [f for f in REQUIRED_TOP_LEVEL_FIELDS if f not in data]
    if missing_top:
        raise ValidationError(
            f"[validator] Missing top-level fields: {missing_top}"
        )

    # Check metadata fields. The type check comes first: `in` on a string
    # would do substring matching, and on None it would raise TypeError.
    metadata = data.get("metadata", {})
    if not isinstance(metadata, dict):
        raise ValidationError(
            f"[validator] metadata must be a JSON object. "
            f"Got: {type(metadata).__name__}"
        )
    missing_meta = [f for f in REQUIRED_METADATA_FIELDS if f not in metadata]
    if missing_meta:
        raise ValidationError(
            f"[validator] Missing metadata fields: {missing_meta}"
        )

    # Check lesson_flow fields (same type guard as metadata)
    flow = data.get("lesson_flow", {})
    if not isinstance(flow, dict):
        raise ValidationError(
            f"[validator] lesson_flow must be a JSON object. "
            f"Got: {type(flow).__name__}"
        )
    missing_flow = [f for f in REQUIRED_FLOW_FIELDS if f not in flow]
    if missing_flow:
        raise ValidationError(
            f"[validator] Missing lesson_flow fields: {missing_flow}"
        )

    # Check practice is a non-empty list
    practice = flow.get("practice", [])
    if not isinstance(practice, list) or not practice:
        raise ValidationError(
            f"[validator] lesson_flow.practice must be a non-empty list. "
            f"Got: {type(practice).__name__}"
        )

    # Check practice has no more than 3 prompts
    if len(practice) > 3:
        raise ValidationError(
            f"[validator] lesson_flow.practice has {len(practice)} prompts. "
            f"Maximum allowed is 3."
        )
# =============================================================================
# STEP 4: RUN POST-GENERATION GUARDRAIL CHECKS
# Runs all checks from checks.py and embeds the results back
# into the lesson dict. Flags are recorded but do not block saving.
# =============================================================================
def validate_against_schema(data: dict) -> dict:
    """
    Run the post-generation guardrail checks and store their outcome
    under data["guardrail_flags"].

    This function itself raises nothing on a flagged check: each result's
    status and message are recorded so the lesson documents its own issues.

    Args:
        data: The parsed and field-validated lesson dict. Mutated in place.

    Returns:
        The same dict, with guardrail_flags replaced by the check results.
    """
    flags = {}
    for check_name, outcome in run_post_checks(data).items():
        flags[check_name] = {
            "status": outcome.status,
            "message": outcome.message,
        }

    # Whatever the LLM self-reported is discarded in favour of the
    # programmatic results, which are considered more reliable.
    data["guardrail_flags"] = flags
    return data
def strip_non_speaking_fields(lesson_dict: dict) -> dict:
    """
    Remove Speaking-specific fields from lessons whose ela_domain is not
    one of the Speaking domains (the LLM includes them anyway despite
    instructions).

    For such lessons, metadata.voice_markers is reset to an empty list and
    the "learning_goal_connection" key is removed from hook, from every
    practice prompt, and from reflect. Sections that are not dicts (for
    example a hook returned as a plain string) are skipped instead of
    raising AttributeError.

    Args:
        lesson_dict: The parsed lesson dict. Mutated in place.

    Returns:
        The same lesson_dict object.
    """
    metadata = lesson_dict.get("metadata")
    ela_domain = metadata.get("ela_domain", "") if isinstance(metadata, dict) else ""

    # NOTE(review): the separator glyph in the second value looks like
    # mis-decoded UTF-8 — confirm it matches the ela_domain values the
    # generator actually emits before relying on this comparison.
    if ela_domain in ("Speaking", "Reading β Speaking"):
        return lesson_dict

    # Strip voice markers
    if isinstance(metadata, dict):
        metadata["voice_markers"] = []

    flow = lesson_dict.get("lesson_flow")
    if not isinstance(flow, dict):
        return lesson_dict

    practice = flow.get("practice")
    prompts = practice if isinstance(practice, list) else []

    # Strip learning_goal_connection from hook, each practice prompt, reflect
    for section in (flow.get("hook"), *prompts, flow.get("reflect")):
        if isinstance(section, dict):
            section.pop("learning_goal_connection", None)

    return lesson_dict
def normalize_reflect(lesson_dict: dict) -> dict:
    """
    Flatten feedback_anchors into reflect if the LLM nested them.

    Normalises: reflect.feedback_anchors.x -> reflect.x
    Keys from feedback_anchors overwrite same-named keys already on reflect.

    Only a dict-valued feedback_anchors inside a dict-valued reflect is
    flattened. Any other shape (reflect as a string, feedback_anchors as a
    list, lesson_flow missing or null) is left untouched rather than
    raising.

    Args:
        lesson_dict: The parsed lesson dict. Mutated in place.

    Returns:
        The same lesson_dict object.
    """
    flow = lesson_dict.get("lesson_flow")
    reflect = flow.get("reflect") if isinstance(flow, dict) else None
    if not isinstance(reflect, dict):
        return lesson_dict

    anchors = reflect.get("feedback_anchors")
    if isinstance(anchors, dict):
        # Remove the wrapper key first so a nested key of the same name
        # (if any) survives the merge.
        del reflect["feedback_anchors"]
        reflect.update(anchors)

    return lesson_dict
# =============================================================================
# MASTER VALIDATION FUNCTION
# Runs every validation step in sequence.
# This is the only function the generator needs to call.
# =============================================================================
def validate_llm_output(raw: str) -> dict:
    """
    Master validation function. Runs the full pipeline:
        1.   Strip code fences
        2.   Parse JSON
        2.5. Auto-correct a mis-shaped lesson_flow.practice
        3.   Validate required fields
        4.   Run guardrail checks and embed results
        5.   Strip Speaking-specific fields if needed
        6.   Normalize reflect

    Progress is reported with print() after each numbered step.

    Args:
        raw: The raw string returned by the Groq API.

    Returns:
        A clean, validated lesson dict ready to be saved.

    Raises:
        ValidationError: If JSON parsing (step 2) or the required-field
            check (step 3) fails. Step 4 records flags in the lesson
            instead of raising.
    """
    print("[validator] Starting validation pipeline...")

    # Step 1
    cleaned = strip_code_fences(raw)
    print("[validator] Step 1 — Code fences stripped ✅")

    # Step 2
    data = parse_json_safely(cleaned)
    print("[validator] Step 2 — JSON parsed successfully ✅")

    # Step 2.5: Auto-correct known LLM formatting mistakes before the
    # required-field check, so a fixable practice shape is not rejected.
    data = autocorrect_practice(data)

    # Step 3
    validate_required_fields(data)
    print("[validator] Step 3 — Required fields present ✅")

    # Step 4
    data = validate_against_schema(data)
    print("[validator] Step 4 — Guardrail checks complete ✅")

    # Step 5
    data = strip_non_speaking_fields(data)
    print("[validator] Step 5 — Speaking-specific fields stripped ✅")

    # Step 6
    data = normalize_reflect(data)
    print("[validator] Step 6 — Reflect normalized ✅")

    print(f"[validator] Validation passed for lesson: {data.get('lesson_id', 'UNKNOWN')}")
    return data