"""Validators for GPT-generated math MCQs and question-topic matching results.

Two pipelines live here:
- MCQValidator: parses raw fine-tuned-model output into a structured MCQ dict,
  then asks GPT to validate/correct it (with a GPT fallback generator when
  parsing fails entirely).
- TopicMatchValidator: fills in a missing "improvements" explanation when the
  topic-matching model returns a non-perfect score without one.
"""

import json
import re

from openai import OpenAI

# Prompt for MCQValidator.validate_with_gpt. Placeholders: {grade},
# {difficulty}, {mcq_json}. Literal braces in examples must stay escaped
# as {{ }} because this string goes through str.format().
PROMPT_TEMPLATE = """
You are a math MCQ validator for Grade {grade} students at {difficulty} difficulty level.

Context:
- Grade {grade} students have age-appropriate math knowledge.
- Difficulty level is {difficulty} — ensure the question and answer reflect this level.

Task: Validate that the correct answer matches the choices, and that the question is appropriate for Grade {grade} at {difficulty} difficulty.

Rules:
1. Check the value of the correct answer.
2. Verify that this value exists in the choices.
3. Ensure the question complexity suits Grade {grade} at {difficulty} difficulty.

Cases:
Case 1 — Valid: If the correct answer value exists in the choices AND the correct_answer letter points to that value, return the JSON unchanged.
Case 2 — Wrong answer letter: If the correct answer value exists in the choices BUT the correct_answer letter is incorrect, update correct_answer to the letter that corresponds to the correct value.
Case 3 — Correct value missing: If the correct answer value does NOT exist in the choices, replace one incorrect choice with the correct value and assign the correct_answer letter to that choice.
Case 4 — Question and answer both incorrect: If the question is not understandable or not appropriate for Grade {grade} at {difficulty} difficulty, rewrite it to be a clear, grade-appropriate question, set the correct answer, and ensure it exists in the choices.

Constraints:
- Keep exactly four choices (A, B, C, D).
- Choices must remain numbers.
- Return ONLY valid JSON.

Input JSON:
{mcq_json}
"""


class MCQValidator:
    """
    Parses raw model output text into structured MCQ JSON,
    then validates and corrects it using GPT.
    """

    def __init__(self, key_string: str, model: str = "gpt-5-nano"):
        """Create an OpenAI client with the given API key and model name."""
        self.client = OpenAI(api_key=key_string)
        self.model = model

    # ------------------------------------------------------------------
    # STEP 1: Parse raw model output into structured dict
    # ------------------------------------------------------------------
    def _extract_choices(self, text: str) -> dict:
        """
        Try multiple strategies to extract choices.
        Normalizes all keys to uppercase A-D and values to rounded floats.

        Supported formats:
          - Formatted list:  A) 2  or  A) 0.33
          - JSON letter key: "A": 2.0
          - Verbose key:     "choice a": 0.418...

        Returns an empty dict when no strategy finds at least two choices.
        """
        letter_map = {
            'a': 'A', 'b': 'B', 'c': 'C', 'd': 'D',
            'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D',
        }

        # Strategy 1: formatted list "A) 2"
        matches = re.findall(r"([A-D])\)\s*(-?\d+(?:\.\d+)?)", text)
        if len(matches) >= 2:
            return {letter_map[k]: round(float(v), 4) for k, v in matches}

        # Strategy 2: JSON letter key "A": 2.0
        matches = re.findall(r'"([A-Da-d])"\s*:\s*(-?\d+(?:\.\d+)?)', text)
        if len(matches) >= 2:
            return {letter_map[k]: round(float(v), 4) for k, v in matches}

        # Strategy 3: verbose key "choice a": 0.418...
        matches = re.findall(r'"[\s]*choice\s+([a-dA-D])"\s*:\s*(-?\d+(?:\.\d+)?)', text)
        if len(matches) >= 2:
            return {letter_map[k]: round(float(v), 4) for k, v in matches}

        return {}

    def _extract_correct_answer(self, text: str, choices: dict | None = None) -> str | None:
        """
        Extract the correct_answer letter, handling formats:
          - "correct_answer": "D"   (either case)
          - "correct_answer": "choice b"
          - Fallback: evaluate the arithmetic expression in the question and
            match it to the choices (handles truncated output where the
            correct_answer field is missing).

        Always returns uppercase A-D, or None when nothing matches.
        """
        letter_map = {'a': 'A', 'b': 'B', 'c': 'C', 'd': 'D'}

        # Format 1: "correct_answer": "D" — accept either case and normalize,
        # so a lowercase "d" does not fall through to the weaker fallbacks.
        m = re.search(r'"correct_answer"\s*:\s*"([A-Da-d])"', text)
        if m:
            return m.group(1).upper()

        # Format 2: "correct_answer": "choice b"
        m = re.search(r'"correct_answer"\s*:\s*"[\s]*choice\s+([a-dA-D])"', text)
        if m:
            return letter_map.get(m.group(1).lower())

        # Format 3: Fallback for truncated output — correct_answer field never
        # appeared. Evaluate the arithmetic expression in the question and
        # match the computed value to the choices.
        if choices:
            q_match = re.search(r'"question":\s*"(.*?)"', text)
            if q_match:
                question_text = q_match.group(1)
                expr = re.search(r'(\d+)\s*([\+\-\*\/])\s*(\d+)', question_text)
                if expr:
                    a, op, b = expr.groups()
                    computed = None
                    if op == '+':
                        computed = int(a) + int(b)
                    elif op == '-':
                        computed = int(a) - int(b)
                    elif op == '*':
                        computed = int(a) * int(b)
                    elif op == '/' and int(b) != 0:
                        # Round to match the 4-decimal normalization in
                        # _extract_choices so equality comparison is stable.
                        computed = round(int(a) / int(b), 4)
                    if computed is not None:
                        for letter, val in choices.items():
                            if round(float(val), 4) == round(float(computed), 4):
                                return letter

        return None

    def parse_raw_output(self, text: str) -> dict:
        """
        Extract question, choices, and correct_answer from raw model output
        string. Returns a dict ready for validation.

        Raises:
            ValueError: if any field cannot be extracted.
        """
        # Extract question from JSON block
        question_match = re.search(r'"question":\s*"(.*?)"', text)
        if not question_match:
            raise ValueError("Could not extract 'question' from model output.")
        question = question_match.group(1).strip()

        # Extract choices using the multi-strategy parser
        choices = self._extract_choices(text)
        if len(choices) < 2:
            raise ValueError(f"Could not extract choices. Found: {choices}")

        # Extract correct answer — pass choices for the arithmetic fallback
        correct = self._extract_correct_answer(text, choices=choices)
        if not correct:
            raise ValueError("Could not extract 'correct_answer' from model output.")

        return {
            "question": question,
            "choices": choices,
            "correct_answer": correct,
        }

    def validate_with_gpt(self, mcq_dict: dict, grade: int = 3,
                          difficulty: str = "easy") -> dict:
        """
        Send the parsed MCQ dict to GPT for validation and correction.
        Returns a validated/corrected MCQ dict (or the input dict unchanged
        when the GPT response cannot be parsed).
        """
        mcq_json_str = json.dumps(mcq_dict, indent=2)
        prompt = PROMPT_TEMPLATE.format(mcq_json=mcq_json_str, grade=grade,
                                        difficulty=difficulty)

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
        )
        raw_response = response.choices[0].message.content.strip()
        return self._parse_gpt_response(raw_response, fallback=mcq_dict)

    def _parse_gpt_response(self, text: str, fallback: dict) -> dict:
        """
        Robustly parse GPT JSON response.
        Falls back to the original parsed MCQ if GPT output cannot be parsed.
        """
        # Strip markdown fences if present
        text = re.sub(r"```json|```", "", text).strip()

        try:
            parsed = json.loads(text)
            if "choices" in parsed:
                parsed["choices"] = {k: round(float(v), 4)
                                     for k, v in parsed["choices"].items()}
            return parsed
        # TypeError/ValueError: a choice value GPT returned was not numeric
        # (e.g. "A": "two") — treat that as unparseable too, rather than crash.
        except (json.JSONDecodeError, TypeError, ValueError):
            pass

        # Try extracting a JSON block embedded in surrounding prose
        try:
            start = text.find('{')
            end = text.rfind('}') + 1
            if start != -1 and end > start:
                parsed = json.loads(text[start:end])
                if "choices" in parsed:
                    parsed["choices"] = {k: round(float(v), 4)
                                         for k, v in parsed["choices"].items()}
                return parsed
        except Exception:
            pass

        return fallback

    # Prompt for _generate_fallback_question. Placeholders: {grade},
    # {difficulty}, {topic}; the doubled braces emit a literal JSON skeleton.
    GPT_FALLBACK_PROMPT = """
You are a math MCQ generator for Grade {grade} students at {difficulty} difficulty level.
The fine-tuned model failed to produce a parseable question for the topic: {topic}.

Generate ONE valid math MCQ appropriate for Grade {grade} at {difficulty} difficulty.

Rules:
- The question must directly test the topic: {topic}
- Keep the question simple and age-appropriate for Grade {grade}
- Provide exactly 4 numeric answer choices labeled A, B, C, D
- Only one choice must be the correct answer
- Return ONLY valid JSON in exactly this format:
{{
  "question": "",
  "choices": {{"A": , "B": , "C": , "D": }},
  "correct_answer": ""
}}
"""

    def _generate_fallback_question(self, topic: str, grade: int,
                                    difficulty: str) -> dict:
        """
        Called when parsing fails. Generates a fresh MCQ via GPT in the same
        format as the fine-tuned model output. If even the GPT fallback
        produces unparseable JSON, returns a static placeholder MCQ carrying
        an "error" key.
        """
        prompt = self.GPT_FALLBACK_PROMPT.format(
            topic=topic,
            grade=grade,
            difficulty=difficulty,
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
        )
        raw = response.choices[0].message.content.strip()
        return self._parse_gpt_response(raw, fallback={
            "error": "GPT fallback also failed to produce valid JSON",
            "question": f"What is a basic {topic} problem?",
            "choices": {"A": 1, "B": 2, "C": 3, "D": 4},
            "correct_answer": "A",
        })

    def validate(self, raw_model_output: str, grade: int = 3,
                 difficulty: str = "easy", topic: str = "") -> dict:
        """
        Full pipeline:
          1. Parse raw model output text into a structured MCQ dict
          2. Validate and correct via GPT using grade and difficulty context
          3. Return the final validated MCQ dict

        Args:
            raw_model_output: The string returned by infer_question_gen()
            grade: Grade level (e.g. 3, 4, 5)
            difficulty: Difficulty level (e.g. "easy", "medium", "hard")
            topic: Topic used only when falling back to GPT generation.

        Returns:
            Validated MCQ dict with keys: question, choices, correct_answer
        """
        try:
            mcq_dict = self.parse_raw_output(raw_model_output)
        except ValueError:
            # Parsing failed — fine-tuned model output was unusable.
            # Fall back to GPT to generate a fresh question for the same
            # topic/grade/difficulty.
            return self._generate_fallback_question(topic=topic, grade=grade,
                                                    difficulty=difficulty)

        validated = self.validate_with_gpt(mcq_dict, grade=grade,
                                           difficulty=difficulty)
        return validated


# Prompt for TopicMatchValidator._generate_improvements. Placeholders:
# {topic}, {grade}, {question}, {score}; doubled braces emit literal JSON.
TOPIC_IMPROVEMENTS_PROMPT = """
You are an educational content reviewer for elementary school mathematics.
A question-topic matching model returned a score but gave no explanation.
Your job is to write the improvements list explaining why the question does not perfectly match the topic.

Use this exact style from examples:
- Score 0.75 → "This question is somewhat related to {topic} but does not focus on its core concept. It [what the question actually tests] rather than {topic}."
- Score 0.5 → "This question is partially related to the topic. It [what the question actually tests] rather than addressing {topic} directly."
- Score 0.0 → "This question does not match the topic because it [what the question actually tests] instead of {topic}."

Rules:
- Write 1 clear improvement sentence explaining the mismatch.
- Be specific about what the question actually tests vs what the topic expects.
- Return ONLY valid JSON in this exact format:
{{"improvements": [""]}}

Input:
Topic: {topic}
Grade: {grade}
Question: {question}
Matching Score: {score}
"""


class TopicMatchValidator:
    """
    Validates output from the question-topic matching model.
    If score < 1.0 and improvements is empty, calls GPT to generate them.
    """

    def __init__(self, key_string: str, model: str = "gpt-5-nano"):
        """Create an OpenAI client with the given API key and model name."""
        self.client = OpenAI(api_key=key_string)
        self.model = model

    def _needs_improvements(self, result: dict) -> bool:
        """
        Return True if the model returned a non-perfect score but left the
        improvements list empty or missing.
        """
        score = result.get("matching_score", 1.0)
        improvements = result.get("improvements", [])
        # `not improvements` already covers [], None, and "".
        return score < 1.0 and not improvements

    def _generate_improvements(self, topic: str, grade: int, question: str,
                               score: float) -> list:
        """
        Call GPT to generate improvements in the dataset style.
        Returns a list with one improvement string.
        """
        prompt = TOPIC_IMPROVEMENTS_PROMPT.format(
            topic=topic,
            grade=grade,
            question=question,
            score=score,
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
        )
        raw = response.choices[0].message.content.strip()
        return self._parse_improvements_response(raw)

    def _parse_improvements_response(self, text: str) -> list:
        """
        Safely parse GPT improvements response.
        Returns a list of improvement strings; never raises.
        """
        # Strip markdown fences if present
        text = re.sub(r"```json|```", "", text).strip()

        try:
            parsed = json.loads(text)
            if "improvements" in parsed and isinstance(parsed["improvements"], list):
                return parsed["improvements"]
        except json.JSONDecodeError:
            pass

        # Try extracting a JSON block embedded in surrounding prose
        try:
            start = text.find('{')
            end = text.rfind('}') + 1
            if start != -1 and end > start:
                parsed = json.loads(text[start:end])
                if "improvements" in parsed and isinstance(parsed["improvements"], list):
                    return parsed["improvements"]
        except Exception:
            pass

        # Last resort: return the raw text (truncated) as a single improvement
        if text:
            return [text[:300]]
        return ["The question does not sufficiently match the stated topic."]

    def validate(self, result: dict, topic: str, grade: int,
                 question: str) -> dict:
        """
        Validate the topic match result. If score < 1.0 and improvements is
        empty, generates improvements via GPT.

        Args:
            result: The dict returned by evaluate_question_topic_match()
            topic: The topic string from the original request
            grade: The grade level from the original request
            question: The question string from the original request

        Returns:
            The result dict, guaranteed to have improvements if score < 1.0
        """
        if self._needs_improvements(result):
            score = result.get("matching_score", 0.0)
            result["improvements"] = self._generate_improvements(
                topic, grade, question, score)
        return result