Spaces:
Running
Running
| from typing import List, Literal, Optional, Any, Dict | |
| from pydantic import BaseModel | |
| import re | |
| import json | |
| import logging | |
| import uuid | |
| from datetime import datetime | |
| logger = logging.getLogger(__name__) | |
class ExampleItem(BaseModel):
    """A worked example that belongs to a module section.

    All three fields are required.
    """

    # Statement of the example problem.
    prompt: str

    # Solution written out step by step, one list entry per step.
    solutionSteps: list[str]

    # Provenance tag. Presumably records which input the example was drawn
    # from (teacher file, DepEd passages, or both) -- confirm with consumers.
    source: Literal[
        "teacher_file",
        "deped_rag",
        "mixed",
    ]
class SectionItem(BaseModel):
    """One content section of a teacher module.

    ``sectionType`` accepts a single value, so it now defaults to that value.
    This mirrors how ``TeacherModule`` treats its own single-valued literals
    and means a payload that omits the key still validates; a payload that
    supplies any other value is still rejected.
    """

    # Identifier of the section.
    id: str

    # Section heading.
    title: str

    # Only "content" is a valid section type at the moment.
    sectionType: Literal["content"] = "content"

    # Main text of the section.
    body: str

    # Short takeaways for the section.
    keyPoints: list[str]

    # Worked examples attached to the section.
    examples: list[ExampleItem]
class PracticeItem(BaseModel):
    """A practice question with its answer key.

    ``choices`` only applies to multiple-choice questions. It defaults to
    ``None`` so that open-ended and numeric items may leave the key out
    entirely; under pydantic v2 a nullable annotation without a default is
    still a *required* key, which rejected such payloads.
    """

    # Identifier of the practice item.
    id: str

    # Kind of question being asked.
    questionType: Literal["multiple_choice", "open_ended", "numeric"]

    # Question text.
    prompt: str

    # Answer options; None (or omitted) for open_ended/numeric questions.
    choices: list[str] | None = None

    # Expected answer, stored as text regardless of question type.
    correctAnswer: str

    # Explanation accompanying the correct answer.
    explanation: str

    # Provenance tag. Presumably records which input the question was drawn
    # from (teacher file, DepEd passages, or both) -- confirm with consumers.
    source: Literal["teacher_file", "deped_rag", "mixed"]
class AiSafety(BaseModel):
    """AI-safety settings stored alongside a teacher module.

    ``requiresGrounding`` defaults to ``True``; the two list fields are
    required.
    """

    # Whether AI output for this module must be grounded in source material.
    requiresGrounding: bool = True

    # Names of the models allowed for this module. The accepted values are
    # not defined in this file.
    allowedModels: list[str]

    # Sources that may be used for grounding.
    groundingSources: list[
        Literal["teacher_file", "deped_rag"]
    ]
class TeacherModule(BaseModel):
    """Schema of a teacher-uploaded curriculum module.

    This is the top-level object that ``generate_teacher_module`` validates
    the model's JSON output against.

    ``strandOrTrack`` defaults to ``None`` so a payload may omit it when no
    strand or track applies; under pydantic v2 a nullable annotation without
    a default is still a *required* key.
    """

    # Identifier of the module.
    moduleId: str

    # Module title.
    title: str

    # Grade level and subject the module targets (free-form text).
    gradeLevel: str
    subject: str

    # School quarter the module belongs to.
    quarter: Literal["Q1", "Q2", "Q3", "Q4", "All", "Unknown"]

    # Strand or track, when one applies; None (or omitted) otherwise.
    strandOrTrack: str | None = None

    # Curriculum competency tags.
    competencyTags: list[str]

    # Fixed markers identifying the module as a teacher upload.
    moduleType: Literal["teacher_uploaded"] = "teacher_uploaded"
    sourceLabel: Literal["Teacher Upload"] = "Teacher Upload"

    # Free-text note about where the module came from.
    originNote: str

    # Short summary of the module.
    summary: str

    # Learning objectives of the module.
    learningObjectives: list[str]

    # Content sections and practice questions.
    sections: list[SectionItem]
    practice: list[PracticeItem]

    # AI-safety settings for the module.
    aiSafety: AiSafety
# System prompt for teacher-module generation.
#
# Rule 2 previously read "If either source does not contain some detail",
# which taken literally tells the model to drop anything missing from just one
# source and so contradicts rule 1 ("COURSE_MATERIAL_TEXT and/or RAG_RESULTS").
# It now says "neither source contains".
#
# NOTE(review): the prompt refers to "the exact schema provided", but the
# schema text is not part of this constant; it has to be supplied in the
# accompanying user message.
TEACHER_MATERIAL_MODULE_SYSTEM_PROMPT = """You are the curriculum ingestion and lesson-design assistant inside MathPulse AI, an AI-powered math education platform aligned with the Philippine DepEd curriculum. A teacher has uploaded a lesson file (PDF or DOCX).
You receive:
- COURSE_MATERIAL_TEXT: text extracted from the teacher's file.
- RAG_RESULTS: passages retrieved from the DepEd curriculum vector store that match the topic, grade level, and subject.
Your job is to output only valid JSON describing a single new teacher_uploaded module for the student-facing Curriculum Modules screen, using the exact schema provided.
Rules:
1. Do not hallucinate content. All explanations, examples, and practice questions must be clearly supported by COURSE_MATERIAL_TEXT and/or RAG_RESULTS.
2. If neither source contains some detail, omit it or explicitly say that the detail is not available.
3. Set "moduleType": "teacher_uploaded" and "sourceLabel": "Teacher Upload".
4. Use the teacher file's topic and structure to decide the module title and sections.
5. Use DepEd passages in RAG_RESULTS only to align competencies, terminology, and phrasing with the official curriculum.
6. Do not mention RAG, embeddings, or internal system components in student-visible text.
7. Respond with JSON only, no extra text.
8. Generate realistic worked examples with step-by-step solution steps.
9. Generate practice questions that assess understanding (multiple choice preferred, with 4 choices A-D).
10. Set competencyTags based on DepEd curriculum alignment.
"""
def generate_module_id(title: str, teacher_id: str) -> str:
    """Build a readable module ID from the title, teacher ID, and today's date.

    Example: ``"quadratic-equations-grace-math-teacher-2026-05-13"``.

    The title is transliterated to ASCII, lower-cased, and collapsed to a
    hyphen-separated slug. The teacher ID keeps its case; only characters
    outside ``[A-Za-z0-9_-]`` are replaced by hyphens, so well-formed IDs pass
    through unchanged.

    Args:
        title: Module title; may contain non-ASCII characters.
        teacher_id: Identifier of the uploading teacher.

    Returns:
        ``"<title-slug>-<teacher-id>-<YYYY-MM-DD>"``. When the title has no
        ASCII letters or digits the slug falls back to ``"module"``, and an
        empty teacher ID falls back to ``"teacher"``, so the result never
        starts with a hyphen or contains an empty segment.

    Note:
        The date (server-local time) is the only time component, so the same
        title uploaded by the same teacher on the same day yields the same ID.
    """
    import unicodedata

    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then drops the marks (and any other non-ASCII char).
    ascii_title = (
        unicodedata.normalize("NFKD", str(title or ""))
        .encode("ascii", "ignore")
        .decode("ascii")
        .lower()
    )
    title_slug = re.sub(r"[^a-z0-9]+", "-", ascii_title).strip("-") or "module"

    # Keep the teacher ID recognisable but free of spaces, slashes, etc.
    teacher_slug = (
        re.sub(r"[^A-Za-z0-9_-]+", "-", str(teacher_id or "")).strip("-")
        or "teacher"
    )

    date_stamp = datetime.now().strftime("%Y-%m-%d")
    return f"{title_slug}-{teacher_slug}-{date_stamp}"
| def _parse_module_json(raw: str) -> Optional[Dict[str, Any]]: | |
| """Robustly extract a JSON object from LLM output.""" | |
| cleaned = raw.strip() | |
| # Remove markdown fences | |
| cleaned = re.sub(r"^```(?:json)?\s*\n?", "", cleaned, flags=re.IGNORECASE) | |
| cleaned = re.sub(r"\n?```\s*$", "", cleaned) | |
| cleaned = cleaned.strip() | |
| # Remove reasoning blocks | |
| cleaned = re.sub(r"<think>[\s\S]*?</think>", "", cleaned, flags=re.IGNORECASE) | |
| cleaned = cleaned.strip() | |
| try: | |
| return json.loads(cleaned) | |
| except json.JSONDecodeError: | |
| # Try to find a JSON object in the string | |
| try: | |
| start_idx = cleaned.find('{') | |
| end_idx = cleaned.rfind('}') | |
| if start_idx != -1 and end_idx != -1 and end_idx >= start_idx: | |
| json_str = cleaned[start_idx:end_idx + 1] | |
| return json.loads(json_str) | |
| except Exception: | |
| pass | |
| return None | |
def _teacher_module_schema_json() -> str:
    """Return the JSON Schema of ``TeacherModule`` as a compact JSON string."""
    # pydantic v2 exposes model_json_schema(); v1 only provides schema().
    build_schema = getattr(TeacherModule, "model_json_schema", None) or TeacherModule.schema
    return json.dumps(build_schema())


def _build_teacher_module_prompt(
    course_material_text: str,
    rag_results: str,
    metadata: dict,
) -> str:
    """Assemble the user message: source texts, metadata, and the output schema."""
    return f"""
COURSE_MATERIAL_TEXT:
{course_material_text}
RAG_RESULTS:
{rag_results}
METADATA:
Grade Level: {metadata.get('grade_level', 'Unknown')}
Subject: {metadata.get('subject', 'Unknown')}
Quarter: {metadata.get('quarter', 'Unknown')}
Strand/Track: {metadata.get('strand', 'Unknown')}
Module Title Hint: {metadata.get('title', 'Unknown')}
JSON_SCHEMA:
{_teacher_module_schema_json()}
Generate the module JSON according to the system prompt rules and schema.
Ensure moduleType is "teacher_uploaded" and sourceLabel is "Teacher Upload".
"""


async def generate_teacher_module(
    course_material_text: str,
    rag_results: str,
    metadata: dict
) -> TeacherModule:
    """Generate and validate a teacher-uploaded module from extracted text.

    Sends the teacher's material, the retrieved curriculum passages, the
    metadata, and the ``TeacherModule`` JSON Schema to the chat model, parses
    the reply as JSON, and validates it against ``TeacherModule``.

    The schema is included in the user message because both prompts refer to
    "the schema" and the model cannot otherwise know the required field names
    and literal values.

    Args:
        course_material_text: Text extracted from the teacher's file.
        rag_results: Retrieved curriculum passages, already rendered as text.
        metadata: Upload metadata. Keys read here: ``grade_level``,
            ``subject``, ``quarter``, ``strand``, ``title``, ``teacher_id``.

    Returns:
        The validated ``TeacherModule``.

    Raises:
        ValueError: If the reply contains no usable JSON object, or the object
            fails schema validation (the original error is chained).
    """
    # Imported inside the function to avoid a circular import when this module
    # is itself imported from main.
    import os
    import sys

    # Make sure the backend root (parent of this file's directory) is importable.
    backend_root = os.path.dirname(os.path.dirname(__file__))
    if backend_root not in sys.path:
        sys.path.append(backend_root)
    from main import call_hf_chat_async

    messages = [
        {"role": "system", "content": TEACHER_MATERIAL_MODULE_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": _build_teacher_module_prompt(
                course_material_text, rag_results, metadata
            ),
        },
    ]

    logger.info("Generating teacher module for %s", metadata.get("title", "Unknown"))

    # A full module with sections and practice items can be long, hence the
    # large max_tokens and timeout.
    raw_content = await call_hf_chat_async(
        messages,
        max_tokens=8192,
        temperature=0.3,
        top_p=0.9,
        timeout=180,
        task_type="chat",
    )

    parsed_json = _parse_module_json(raw_content) if isinstance(raw_content, str) else None
    # Anything other than a non-empty JSON object cannot be a module; rejecting
    # it here keeps the dict operations below from raising TypeError.
    if not isinstance(parsed_json, dict) or not parsed_json:
        logger.error(
            "Failed to parse teacher module JSON. Raw content:\n%s...",
            str(raw_content)[:500],
        )
        raise ValueError("Failed to generate valid JSON for the teacher module")

    # Generate an ID if missing. `or` (rather than a .get default) also covers
    # a title that is present but null or empty.
    if not parsed_json.get("moduleId"):
        title_for_id = parsed_json.get("title") or metadata.get("title") or "module"
        parsed_json["moduleId"] = generate_module_id(
            str(title_for_id),
            str(metadata.get("teacher_id") or "teacher"),
        )

    # The schema permits exactly one value for each of these, so pin them
    # instead of failing validation on a model slip.
    parsed_json["moduleType"] = "teacher_uploaded"
    parsed_json["sourceLabel"] = "Teacher Upload"

    try:
        # Pydantic validates the payload against the schema.
        return TeacherModule(**parsed_json)
    except Exception as exc:
        # Deliberately broad: every validation failure is surfaced to callers
        # as ValueError, with the original exception chained for debugging.
        logger.error("Failed to validate teacher module against schema: %s", exc)
        logger.error("Parsed JSON: %s", json.dumps(parsed_json)[:500])
        raise ValueError(f"Teacher module failed schema validation: {exc}") from exc