Spaces:

aankitdas
/

bantrly-lesson-generator

Sleeping

App Files Files Community

bantrly-lesson-generator / src /utils /validator.py

aankitdas

components connect to learning goals

6be35e1 3 months ago

raw

history blame contribute delete

12.6 kB

	"""
	src/utils/validator.py
	----------------------
	Validates the raw JSON string returned by the LLM before
	it is parsed into a Lesson object.

	Why this exists:
	LLMs sometimes return malformed JSON, add markdown code fences
	(```json ... ```), or omit required fields. This module catches
	all of that before it causes an error downstream.

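	What is defined here:
	- strip_code_fences(raw) : Remove ```json and ``` wrappers
	  if the LLM added them
	- parse_json_safely(raw) : Parse the JSON string; raise a
	  ValidationError if parsing fails
	- autocorrect_practice(lesson_dict) : Convert a dict-shaped
	  lesson_flow.practice into a list
	- validate_required_fields(data): Check all required lesson fields
	  are present in the parsed dict
	- validate_against_schema(data) : Run the post-generation guardrail
	  checks (run_post_checks) and store their results in guardrail_flags
	- strip_non_speaking_fields(lesson_dict) : Remove Speaking-specific
	  fields from lessons in other ELA domains
	- normalize_reflect(lesson_dict) : Flatten reflect.feedback_anchors
	  into reflect
	- validate_llm_output(raw) : Master function — runs all of the
	  above in sequence, returns a
	  validated lesson dict or raises
	  a descriptive ValidationError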

	Design note:
	This is the safety net between the LLM and our data layer.
	Nothing touches data/generated/ without passing through here first.
	"""

	import json
	import re
	from src.guardrails.checks import run_post_checks


	# =============================================================================
	# CUSTOM EXCEPTION
	# Raised when validation fails at any step.
	# Always includes a descriptive message so the generator knows
	# exactly what went wrong and can log it clearly.
	# =============================================================================

	class ValidationError(Exception):
	    """Signal that the LLM output was rejected by a validation step.

	    The exception message is expected to name the step that failed
	    and explain why, so the caller can log it as-is.
	    """


	# =============================================================================
	# REQUIRED FIELDS
	# The top-level fields every lesson must have.
	# Nested field validation is handled by run_post_checks().
	# =============================================================================

	# Keys that must exist at the root of every lesson dict.
	REQUIRED_TOP_LEVEL_FIELDS = ["lesson_id", "metadata", "lesson_flow", "guardrail_flags"]

	# Keys that must exist inside lesson["metadata"].
	REQUIRED_METADATA_FIELDS = [
	    "grade_band", "ela_domain", "lesson_type", "theme",
	    "primary_skill", "voice_markers", "estimated_duration_minutes", "ccss_anchor",
	]

	# Keys that must exist inside lesson["lesson_flow"].
	REQUIRED_FLOW_FIELDS = ["hook", "model", "practice", "reflect"]


	# =============================================================================
	# STEP 1: STRIP CODE FENCES
	# LLMs frequently wrap JSON in markdown code fences even when told not to.
	# This strips them before attempting to parse.
	# =============================================================================

	def strip_code_fences(raw: str) -> str:
	    """
	    Remove markdown code fences from a raw LLM response string.

	    Handles these fence patterns:
	    ```json ... ```   (the "json" tag is matched in any letter case)
	    ``` ... ```
	    ` ... `

	    Args:
	        raw: The raw string returned by the LLM.

	    Returns:
	        The cleaned string with fences removed and whitespace stripped.
	    """
	    # Opening fence: three backticks at the start of a line, optionally
	    # followed by a "json" tag. IGNORECASE makes "```JSON" / "```Json"
	    # strip cleanly instead of leaving a stray tag in front of the payload.
	    # Because of MULTILINE this also removes a closing fence that sits at
	    # the start of its own line.
	    cleaned = re.sub(
	        r"^```(?:json)?\s*",
	        "",
	        raw.strip(),
	        flags=re.MULTILINE | re.IGNORECASE,
	    )
	    # Closing fence: three backticks at the end of a line.
	    cleaned = re.sub(r"\s*```$", "", cleaned.strip(), flags=re.MULTILINE)

	    # Remove single backtick wrapping (less common but possible)
	    if cleaned.startswith("`") and cleaned.endswith("`"):
	        cleaned = cleaned[1:-1]

	    return cleaned.strip()


	# =============================================================================
	# STEP 2: PARSE JSON SAFELY
	# Attempt json.loads() and raise a descriptive ValidationError if it fails.
	# =============================================================================

	def parse_json_safely(raw: str) -> dict:
	    """
	    Parse a JSON string into a Python dict.
	    Raises ValidationError with a clear message if parsing fails or if
	    the parsed value is not a JSON object.

	    Args:
	        raw: A (hopefully) valid JSON string.

	    Returns:
	        A Python dict.

	    Raises:
	        ValidationError: If the string is not valid JSON, or if its
	            top-level value is not an object (e.g. a list or a string).
	    """
	    try:
	        parsed = json.loads(raw)
	    except json.JSONDecodeError as e:
	        # Show the first 300 chars of the raw string to help debugging
	        preview = raw[:300] + "..." if len(raw) > 300 else raw
	        raise ValidationError(
	            f"[validator] LLM returned invalid JSON.\n"
	            f"JSON error: {e}\n"
	            f"Raw output preview:\n{preview}"
	        ) from e

	    # json.loads() also accepts lists, strings, numbers, booleans and null.
	    # The later steps call dict methods on the result, so reject anything
	    # else here with a descriptive error instead of an AttributeError later.
	    if not isinstance(parsed, dict):
	        raise ValidationError(
	            f"[validator] LLM returned valid JSON, but the top-level value "
	            f"must be an object. Got: {type(parsed).__name__}"
	        )

	    return parsed


	# =============================================================================
	# STEP 3: VALIDATE REQUIRED FIELDS
	# Check that the parsed dict contains all required top-level,
	# metadata, and lesson_flow fields.
	# =============================================================================

	def autocorrect_practice(lesson_dict: dict) -> dict:
	    """
	    Auto-correct common LLM mistakes in the practice field.

	    When lesson_flow.practice is a dict instead of a list, it is replaced by:
	    1. practice["prompts"], if that key holds a list;
	    2. otherwise the first non-empty list found among the dict's values;
	    3. otherwise a list of all the dict's values
	       (e.g. {"P1": {...}, "P2": {...}} → [{...}, {...}]).

	    If lesson_flow is missing or is not a dict, or practice is not a dict,
	    nothing is changed; reporting those problems is left to
	    validate_required_fields().

	    Args:
	        lesson_dict: The parsed lesson dict. It is modified in place.

	    Returns:
	        The same lesson_dict object, for call chaining.
	    """
	    flow = lesson_dict.get("lesson_flow")
	    # The LLM may emit "lesson_flow": null (or a list/string). This function
	    # runs before field validation, so it must not crash on that shape.
	    if not isinstance(flow, dict):
	        return lesson_dict

	    practice = flow.get("practice")
	    if not isinstance(practice, dict):
	        return lesson_dict

	    prompts = practice.get("prompts")
	    if isinstance(prompts, list):
	        # Case 1: {"prompts": [...]} — unwrap the list
	        print("[validator] ⚠️ practice was a dict with 'prompts' key — unwrapping...")
	        corrected = prompts
	    else:
	        # Case 2: some other key holds the prompts — take the first
	        # non-empty list value.
	        corrected = next(
	            (value for value in practice.values() if isinstance(value, list) and value),
	            None,
	        )
	        if corrected is not None:
	            print("[validator] ⚠️ practice was an unknown dict — extracting first list value...")
	        else:
	            # Case 3: {"P1": {...}, "P2": {...}} — convert values to list
	            print("[validator] ⚠️ practice was a dict — auto-correcting to list...")
	            corrected = list(practice.values())

	    flow["practice"] = corrected
	    print(f"[validator] Auto-corrected practice → {len(corrected)} prompts")

	    return lesson_dict


	def validate_required_fields(data: dict) -> None:
	    """
	    Check that all required fields are present in the parsed lesson dict.
	    Checks top-level, metadata, and lesson_flow fields, and that
	    lesson_flow.practice is a list of 1 to 3 prompts.

	    Args:
	        data: The parsed lesson dict.

	    Raises:
	        ValidationError: If the lesson, its metadata or its lesson_flow is
	            not a dict, if any required field is missing, or if
	            lesson_flow.practice is not a list of 1 to 3 items.
	    """
	    if not isinstance(data, dict):
	        raise ValidationError(
	            f"[validator] Lesson must be a JSON object. "
	            f"Got: {type(data).__name__}"
	        )

	    # Check top-level fields
	    missing_top = [f for f in REQUIRED_TOP_LEVEL_FIELDS if f not in data]
	    if missing_top:
	        raise ValidationError(
	            f"[validator] Missing top-level fields: {missing_top}"
	        )

	    # Check metadata fields. The type check comes first: `in` on a string
	    # is a substring test and `in` on None raises TypeError, so a
	    # non-dict value would either pass by accident or crash.
	    metadata = data["metadata"]
	    if not isinstance(metadata, dict):
	        raise ValidationError(
	            f"[validator] metadata must be an object. "
	            f"Got: {type(metadata).__name__}"
	        )
	    missing_meta = [f for f in REQUIRED_METADATA_FIELDS if f not in metadata]
	    if missing_meta:
	        raise ValidationError(
	            f"[validator] Missing metadata fields: {missing_meta}"
	        )

	    # Check lesson_flow fields (same type guard as metadata)
	    flow = data["lesson_flow"]
	    if not isinstance(flow, dict):
	        raise ValidationError(
	            f"[validator] lesson_flow must be an object. "
	            f"Got: {type(flow).__name__}"
	        )
	    missing_flow = [f for f in REQUIRED_FLOW_FIELDS if f not in flow]
	    if missing_flow:
	        raise ValidationError(
	            f"[validator] Missing lesson_flow fields: {missing_flow}"
	        )

	    # Check practice is a non-empty list
	    practice = flow["practice"]
	    if not isinstance(practice, list) or not practice:
	        raise ValidationError(
	            f"[validator] lesson_flow.practice must be a non-empty list. "
	            f"Got: {type(practice).__name__}"
	        )

	    # Check practice has no more than 3 prompts
	    if len(practice) > 3:
	        raise ValidationError(
	            f"[validator] lesson_flow.practice has {len(practice)} prompts. "
	            f"Maximum allowed is 3."
	        )


	# =============================================================================
	# STEP 4: RUN POST-GENERATION GUARDRAIL CHECKS
	# Runs all checks from checks.py and embeds the results back
	# into the lesson dict. Flags are recorded but do not block saving.
	# =============================================================================

	def validate_against_schema(data: dict) -> dict:
	    """
	    Record the post-generation guardrail results on the lesson.

	    Calls run_post_checks(data) and replaces data["guardrail_flags"]
	    with one entry per check, each holding that check's status and
	    message. This function itself does not raise when a check reports
	    a problem; the outcome is only written into the lesson.

	    Args:
	        data: The parsed and field-validated lesson dict. It is
	            modified in place.

	    Returns:
	        The same lesson dict, with guardrail_flags rebuilt from the
	        check results.
	    """
	    flags = {}
	    for check_name, outcome in run_post_checks(data).items():
	        flags[check_name] = {
	            "status": outcome.status,
	            "message": outcome.message,
	        }

	    # Whatever the LLM reported about itself is discarded: the
	    # programmatic checks are treated as the source of truth.
	    data["guardrail_flags"] = flags
	    return data

	def strip_non_speaking_fields(lesson_dict: dict) -> dict:
	    """
	    Remove Speaking-specific fields from lessons that are not
	    Speaking or Reading → Speaking — the LLM includes them anyway
	    despite instructions.

	    For any other ela_domain this empties metadata.voice_markers and
	    removes "learning_goal_connection" from the hook, from every practice
	    prompt and from reflect. Sections that are missing or are not dicts
	    are skipped rather than raising.

	    Args:
	        lesson_dict: The lesson dict. It is modified in place.

	    Returns:
	        The same lesson_dict object.
	    """
	    metadata = lesson_dict.get("metadata")
	    ela_domain = metadata.get("ela_domain", "") if isinstance(metadata, dict) else ""

	    if ela_domain in ("Speaking", "Reading → Speaking"):
	        return lesson_dict

	    # Strip voice markers
	    if isinstance(metadata, dict):
	        metadata["voice_markers"] = []

	    flow = lesson_dict.get("lesson_flow")
	    if not isinstance(flow, dict):
	        return lesson_dict

	    practice = flow.get("practice")
	    prompts = practice if isinstance(practice, list) else []

	    # hook, each practice prompt and reflect get the same treatment. The
	    # LLM sometimes emits a plain string where an object was expected,
	    # so only dict-shaped sections are touched.
	    for section in (flow.get("hook"), *prompts, flow.get("reflect")):
	        if isinstance(section, dict):
	            section.pop("learning_goal_connection", None)

	    return lesson_dict

	def normalize_reflect(lesson_dict: dict) -> dict:
	    """
	    Flatten feedback_anchors into reflect if the LLM nested them.
	    Normalises: reflect.feedback_anchors.x → reflect.x

	    Flattening only happens when both reflect and feedback_anchors are
	    dicts; any other shape is left exactly as it was. A key inside
	    feedback_anchors overwrites a key of the same name already in reflect.

	    Args:
	        lesson_dict: The lesson dict. It is modified in place.

	    Returns:
	        The same lesson_dict object.
	    """
	    flow = lesson_dict.get("lesson_flow")
	    reflect = flow.get("reflect") if isinstance(flow, dict) else None

	    # Guard both levels: a string reflect would satisfy an `in` test by
	    # substring match, and dict.update() rejects None or scalar anchors.
	    if isinstance(reflect, dict) and isinstance(reflect.get("feedback_anchors"), dict):
	        reflect.update(reflect.pop("feedback_anchors"))

	    return lesson_dict
	# =============================================================================
	# MASTER VALIDATION FUNCTION
	# Runs every pipeline step (1–6, plus the 2.5 auto-correction) in sequence.
	# This is the only function the generator needs to call.
	# =============================================================================

	def validate_llm_output(raw: str) -> dict:
	    """
	    Run the complete validation pipeline on a raw LLM response.

	    Stages, in order:
	    1.   strip_code_fences         — drop markdown fences
	    2.   parse_json_safely         — parse the JSON text
	    2.5  autocorrect_practice      — repair a dict-shaped practice field
	    3.   validate_required_fields  — check required fields
	    4.   validate_against_schema   — run guardrail checks, embed results
	    5.   strip_non_speaking_fields — drop Speaking-only fields if needed
	    6.   normalize_reflect         — flatten reflect.feedback_anchors

	    A progress line is printed after each numbered stage.

	    Args:
	        raw: The raw string returned by the Groq API.

	    Returns:
	        A clean, validated lesson dict ready to be saved.

	    Raises:
	        ValidationError: Raised by the JSON-parsing and required-field
	            stages on structural problems. Stage 4 records guardrail
	            results on the lesson instead of raising.
	    """
	    print("[validator] Starting validation pipeline...")

	    # Stage 1
	    unfenced = strip_code_fences(raw)
	    print("[validator] Step 1 — Code fences stripped ✅")

	    # Stage 2
	    lesson = parse_json_safely(unfenced)
	    print("[validator] Step 2 — JSON parsed successfully ✅")

	    # Stage 2.5: repair known LLM formatting mistakes before checking fields
	    lesson = autocorrect_practice(lesson)

	    # Stage 3
	    validate_required_fields(lesson)
	    print("[validator] Step 3 — Required fields present ✅")

	    # Stages 4-6 share one shape: transform the lesson, then report.
	    remaining_stages = (
	        (validate_against_schema, "[validator] Step 4 — Guardrail checks complete ✅"),
	        (strip_non_speaking_fields, "[validator] Step 5 — Speaking-specific fields stripped ✅"),
	        (normalize_reflect, "[validator] Step 6 — Reflect normalized ✅"),
	    )
	    for stage, done_message in remaining_stages:
	        lesson = stage(lesson)
	        print(done_message)

	    lesson_id = lesson.get("lesson_id", "UNKNOWN")
	    print(f"[validator] Validation passed for lesson: {lesson_id}")
	    return lesson