Spaces:

Deign86
/

mathpulse-api-v3test

Running

App Files Community

github-actions[bot] committed 29 days ago

Commit

f717a11

1 Parent(s): ac19778

🚀 Auto-deploy backend from GitHub (46778ac)

Browse files

Files changed (9) hide show

backend/main.py +0 -0
backend/rag/curriculum_rag.py +0 -318
backend/routes/rag_routes.py +0 -427
main.py +2 -0
rag/curriculum_rag.py +3 -3
routes/rag_routes.py +115 -58
test_full_rag.py +75 -0
test_retrieval.py +39 -0
tests/test_rag_pipeline.py +16 -10

backend/main.py DELETED Viewed

The diff for this file is too large to render. See raw diff

backend/rag/curriculum_rag.py DELETED Viewed

@@ -1,318 +0,0 @@
-"""
-Updated curriculum RAG with exact match retrieval and 7-section notebook output.
-"""
-from __future__ import annotations
-from typing import Dict, List, Optional, Tuple
-def _to_where(
-    subject: Optional[str] = None,
-    quarter: Optional[int] = None,
-    content_domain: Optional[str] = None,
-    chunk_type: Optional[str] = None,
-    module_id: Optional[str] = None,
-    lesson_id: Optional[str] = None,
-    competency_code: Optional[str] = None,
-    storage_path: Optional[str] = None,
-) -> Optional[Dict[str, object]]:
-    clauses = []
-    if subject:
-        clauses.append({"subject": {"$eq": subject}})
-    if quarter is not None:
-        clauses.append({"quarter": {"$eq": int(quarter)}})
-    if content_domain:
-        clauses.append({"content_domain": {"$eq": content_domain}})
-    if chunk_type:
-        clauses.append({"chunk_type": {"$eq": chunk_type}})
-    if module_id:
-        clauses.append({"module_id": {"$eq": module_id}})
-    if lesson_id:
-        clauses.append({"lesson_id": {"$eq": lesson_id}})
-    if competency_code:
-        clauses.append({"competency_code": {"$eq": competency_code}})
-    if storage_path:
-        clauses.append({"storage_path": {"$eq": storage_path}})
-    if not clauses:
-        return None
-    if len(clauses) == 1:
-        return clauses[0]
-    return {"$and": clauses}
-def _distance_to_score(distance: float) -> float:
-    return round(1.0 / (1.0 + max(distance, 0.0)), 4)
-def retrieve_curriculum_context(
-    query: str,
-    subject: str | None = None,
-    quarter: int | None = None,
-    content_domain: str | None = None,
-    chunk_type: str | None = None,
-    module_id: str | None = None,
-    lesson_id: str | None = None,
-    competency_code: str | None = None,
-    storage_path: str | None = None,
-    top_k: int = 8,
-) -> list[dict]:
-    from rag.vectorstore_loader import get_vectorstore_components
-    _, collection, embedder = get_vectorstore_components()
-    where = _to_where(subject, quarter, content_domain, chunk_type, module_id, lesson_id, competency_code, storage_path)
-    prefixed_query = f"Represent this sentence for searching relevant passages: {query}"
-    query_embedding = embedder.encode(
-        prefixed_query,
-        normalize_embeddings=True,
-    ).tolist()
-    result = collection.query(
-        query_embeddings=[query_embedding],
-        n_results=max(1, top_k),
-        where=where,
-        include=["documents", "metadatas", "distances"],
-    )
-    documents = (result.get("documents") or [[]])[0]
-    metadatas = (result.get("metadatas") or [[]])[0]
-    distances = (result.get("distances") or [[]])[0]
-    rows: List[dict] = []
-    for idx, content in enumerate(documents):
-        md = metadatas[idx] if idx < len(metadatas) and isinstance(metadatas[idx], dict) else {}
-        distance = float(distances[idx]) if idx < len(distances) else 1.0
-        rows.append({
-            "content": str(content or ""),
-            "subject": str(md.get("subject") or "unknown"),
-            "quarter": int(md.get("quarter") or 0),
-            "content_domain": str(md.get("content_domain") or "general"),
-            "chunk_type": str(md.get("chunk_type") or "concept"),
-            "source_file": str(md.get("source_file") or ""),
-            "storage_path": str(md.get("storage_path") or ""),
-            "module_id": str(md.get("module_id") or ""),
-            "lesson_id": str(md.get("lesson_id") or ""),
-            "competency_code": str(md.get("competency_code") or ""),
-            "page": int(md.get("page") or 0),
-            "score": _distance_to_score(distance),
-        })
-    return rows
-def build_exact_lesson_query(
-    topic: str,
-    subject: str,
-    quarter: int,
-    lesson_title: str | None = None,
-    competency: str | None = None,
-    module_unit: str | None = None,
-    learner_level: str | None = None,
-    competency_code: str | None = None,
-) -> str:
-    parts = [topic, subject, f"Quarter {quarter}"]
-    for value in (lesson_title, competency, module_unit, learner_level, competency_code):
-        clean = str(value or "").strip()
-        if clean:
-            parts.append(clean)
-    return " | ".join(parts)
-def build_lesson_query(
-    topic: str,
-    subject: str,
-    quarter: int,
-    *,
-    lesson_title: Optional[str] = None,
-    competency: Optional[str] = None,
-    module_unit: Optional[str] = None,
-    learner_level: Optional[str] = None,
-) -> str:
-    parts = [topic, subject, f"Quarter {quarter}"]
-    for value in (lesson_title, competency, module_unit, learner_level):
-        clean_value = str(value or "").strip()
-        if clean_value:
-            parts.append(clean_value)
-    return " | ".join(parts)
-def retrieve_lesson_pdf_context(
-    topic: str,
-    subject: str,
-    quarter: int,
-    lesson_title: str | None = None,
-    competency: str | None = None,
-    module_id: str | None = None,
-    lesson_id: str | None = None,
-    competency_code: str | None = None,
-    storage_path: str | None = None,
-    top_k: int = 8,
-) -> Tuple[list[dict], str]:
-    """Retrieve chunks by storage_path exact match + semantic ranking; fallback to general query."""
-    if storage_path:
-        exact_chunks = retrieve_curriculum_context(
-            query=topic,
-            subject=subject,
-            quarter=quarter,
-            storage_path=storage_path,
-            top_k=top_k,
-        )
-        if exact_chunks and any(c["score"] >= 0.65 for c in exact_chunks):
-            return exact_chunks, "exact"
-    general_chunks = retrieve_curriculum_context(
-        query=topic,
-        subject=subject,
-        quarter=quarter,
-        top_k=top_k,
-    )
-    if storage_path and exact_chunks:
-        all_chunks = exact_chunks + general_chunks
-        seen = set()
-        deduped = []
-        for c in all_chunks:
-            key = f"{c.get('source_file')}:{c.get('page')}:{c.get('content', '')[:60]}"
-            if key not in seen:
-                seen.add(key)
-                deduped.append(c)
-        deduped.sort(key=lambda x: x.get("score", 0), reverse=True)
-        return deduped[:top_k], "hybrid"
-    return general_chunks, "general"
-def format_retrieved_chunks(curriculum_chunks: list[dict]) -> str:
-    refs = []
-    for i, chunk in enumerate(curriculum_chunks, start=1):
-        refs.append(
-            f"{i}. [{chunk.get('source_file')} p.{chunk.get('page')}] "
-            f"({chunk.get('content_domain')}/{chunk.get('chunk_type')}) score={chunk.get('score')}\n"
-            f"   Excerpt: {chunk.get('content', '')}"
-        )
-    return "\n".join(refs) if refs else "No curriculum context retrieved."
-def summarize_retrieval_confidence(curriculum_chunks: list[dict]) -> Dict[str, any]:
-    if not curriculum_chunks:
-        return {"confidence": 0.0, "band": "low"}
-    top_scores = [float(c.get("score") or 0.0) for c in curriculum_chunks[:5]]
-    score = sum(top_scores) / max(1, len(top_scores))
-    band = "high" if score >= 0.72 else "medium" if score >= 0.5 else "low"
-    return {"confidence": round(score, 3), "band": band}
-def organize_chunks_by_section(chunks: list[dict]) -> Dict[str, List[dict]]:
-    """Organize retrieved chunks into lesson section categories."""
-    sections: Dict[str, List[dict]] = {
-        "introduction": [],
-        "key_concepts": [],
-        "worked_examples": [],
-        "important_notes": [],
-        "practice": [],
-        "summary": [],
-        "assessment": [],
-        "general": [],
-    }
-    domain_priority = {
-        "introduction": 1, "key_concepts": 2, "worked_examples": 3,
-        "important_notes": 4, "practice": 5, "summary": 6,
-        "assessment": 7, "general": 8,
-    }
-    for chunk in chunks:
-        domain = chunk.get("content_domain", "general")
-        if domain in sections:
-            sections[domain].append(chunk)
-        else:
-            sections["general"].append(chunk)
-    return sections
-def build_lesson_prompt(
-    *,
-    lesson_title: str,
-    competency: str,
-    grade_level: str,
-    subject: str,
-    quarter: int,
-    learner_level: Optional[str],
-    module_unit: Optional[str],
-    curriculum_chunks: list[dict],
-    competency_code: Optional[str] = None,
-) -> str:
-    refs_text = format_retrieved_chunks(curriculum_chunks)
-    organized = organize_chunks_by_section(curriculum_chunks)
-    return (
-        "You are a DepEd-aligned Grade 11-12 mathematics instructional designer.\n"
-        "Generate a lesson in JSON format. Use ONLY the retrieved curriculum evidence below.\n"
-        "Do NOT invent content. Do NOT add generic motivational text. All content must be grounded in the retrieved excerpts.\n\n"
-        f"Lesson title: {lesson_title}\n"
-        f"Competency code: {competency_code or 'n/a'}\n"
-        f"Curriculum competency: {competency}\n"
-        f"Grade level: {grade_level}\n"
-        f"Subject: {subject}\n"
-        f"Quarter: Q{quarter}\n"
-        f"Learner level: {learner_level or 'Grade 11-12'}\n"
-        f"Module/unit: {module_unit or 'n/a'}\n\n"
-        "[CURRICULUM CONTEXT]\n"
-        f"{refs_text}\n\n"
-        "Return ONLY valid JSON with this exact structure. All 7 sections are required:\n"
-        "{\n"
-        '  "sections": [\n'
-        '    {"type": "introduction",    "title": "Introduction",       "content": "..."},\n'
-        '    {"type": "key_concepts",    "title": "Key Concepts",      "content": "...", "callouts": [{"type":"important|ti..."}]\n},'
-        '    {"type": "video",           "title": "Video Lesson",      "content": "...", "videoId": "", "videoTitle": "", "videoChannel": "", "embedUrl": "", "thumbnailUrl": ""},\n'
-        '    {"type": "worked_examples",  "title": "Worked Examples",    "examples": [{"problem":"...","steps":["Step 1: ...","Step 2: ..."],"answer":"..."}]},\n'
-        '    {"type": "important_notes",  "title": "Important Notes",   "bulletPoints": ["...","..."]},\n'
-        '    {"type": "try_it_yourself", "title": "Try It Yourself",   "practiceProblems": [{"question":"...","solution":"..."}]},\n'
-        '    {"type": "summary",         "title": "Summary",           "content": "..."}\n'
-        "  ],\n"
-        '  "needsReview": false\n'
-        "}\n\n"
-        "Rules:\n"
-        "- content in introduction, key_concepts, important_notes, summary: use paragraph/bullet text grounded in retrieved chunks\n"
-        "- examples must reflect actual content from the retrieved curriculum (real formulas, real contexts)\n"
-        "- practiceProblems should be derivable from worked examples\n"
-        "- callouts: type is 'important', 'tip', or 'warning'\n"
-        "- video section: content is a brief sentence, leave videoId empty (will be filled by backend)\n"
-        "- Do not use placeholder text like 'placeholder' or 'example text'\n"
-        "- Do not fabricate worked examples - use actual curriculum content\n"
-    )
-def build_problem_generation_prompt(topic: str, difficulty: str, curriculum_chunks: list[dict]) -> str:
-    refs = []
-    for i, chunk in enumerate(curriculum_chunks, start=1):
-        refs.append(
-            f"{i}. [{chunk.get('source_file')} p.{chunk.get('page')}] "
-            f"({chunk.get('content_domain')}/{chunk.get('chunk_type')}) {chunk.get('content', '')}"
-        )
-    refs_text = "\n".join(refs) if refs else "No curriculum context retrieved."
-    return (
-        "Generate one practice problem strictly aligned to the retrieved DepEd competency scope.\n"
-        "Do not include topics outside the competency context.\n\n"
-        f"Topic: {topic}\n"
-        f"Difficulty: {difficulty}\n\n"
-        "[CURRICULUM CONTEXT]\n"
-        f"{refs_text}\n\n"
-        "Return JSON with keys: problem, solution, competencyReference"
-    )
-def build_analysis_curriculum_context(weak_topics: list[str], subject: str) -> list[dict]:
-    dedup: Dict[str, dict] = {}
-    for weak_topic in weak_topics:
-        rows = retrieve_curriculum_context(
-            query=f"DepEd learning competency for {weak_topic}",
-            subject=subject,
-            chunk_type="learning_competency",
-            top_k=2,
-        )
-        for row in rows:
-            key = f"{row.get('source_file')}::{row.get('page')}::{row.get('content', '')[:80]}"
-            if key not in dedup:
-                dedup[key] = row
-    return list(dedup.values())

backend/routes/rag_routes.py DELETED Viewed

@@ -1,427 +0,0 @@
-from __future__ import annotations
-import json
-import logging
-import os
-import re
-from datetime import datetime, timezone
-from threading import Lock
-from typing import Any, Dict, List, Optional
-from fastapi import APIRouter, HTTPException, Request
-from pydantic import BaseModel, Field
-from services.inference_client import (
-    InferenceRequest,
-    create_default_client,
-    is_sequential_model,
-    get_model_for_task,
-)
-from rag.curriculum_rag import (
-    build_analysis_curriculum_context,
-    build_lesson_prompt,
-    build_lesson_query,
-    build_problem_generation_prompt,
-    format_retrieved_chunks,
-    retrieve_curriculum_context,
-    retrieve_lesson_pdf_context,
-    summarize_retrieval_confidence,
-)
-from rag.vectorstore_loader import get_vectorstore_health, reset_vectorstore_singleton
-try:
-    from firebase_admin import firestore as firebase_firestore
-except Exception:
-    firebase_firestore = None
-logger = logging.getLogger("mathpulse.rag")
-router = APIRouter(prefix="/api/rag", tags=["rag"])
-_inference_client = None
-_inference_lock = Lock()
-def _get_inference_client():
-    global _inference_client
-    if _inference_client is None:
-        with _inference_lock:
-            if _inference_client is None:
-                _inference_client = create_default_client()
-    return _inference_client
-async def _generate_text(
-    prompt: str,
-    task_type: str,
-    max_new_tokens: int = 900,
-    enable_thinking: bool = False,
-) -> str:
-    request = InferenceRequest(
-        messages=[
-            {"role": "system", "content": "You are a precise DepEd-aligned curriculum assistant."},
-            {"role": "user", "content": prompt},
-        ],
-        task_type=task_type,
-        max_new_tokens=max_new_tokens,
-        temperature=0.2,
-        top_p=0.9,
-        enable_thinking=enable_thinking,
-    )
-    return _get_inference_client().generate_from_messages(request)
-def _log_rag_usage(
-    request: Request,
-    *,
-    event_type: str,
-    topic: str,
-    subject: str,
-    quarter: Optional[int],
-    chunks: List[Dict[str, Any]],
-) -> None:
-    if firebase_firestore is None:
-        return
-    try:
-        user = getattr(request.state, "user", None)
-        uid = getattr(user, "uid", None)
-        domains = sorted({str(chunk.get("content_domain") or "").strip() for chunk in chunks if chunk.get("content_domain")})
-        top_score = max((float(chunk.get("score") or 0.0) for chunk in chunks), default=0.0)
-        payload = {
-            "userId": uid,
-            "type": event_type,
-            "topic": topic,
-            "subject": subject,
-            "quarter": quarter,
-            "retrievedChunks": len(chunks),
-            "topScore": top_score,
-            "curriculumDomainsHit": domains,
-            "timestamp": firebase_firestore.SERVER_TIMESTAMP,
-            "createdAtIso": datetime.now(timezone.utc).isoformat(),
-        }
-        firebase_firestore.client().collection("rag_usage").add(payload)
-    except Exception as exc:
-        logger.warning("rag_usage logging skipped: %s", exc)
-def _strip_thinking_and_parse(text: str) -> dict:
-    cleaned = text.strip()
-    cleaned = re.sub(r" </think>", "", cleaned, flags=re.DOTALL).strip()
-    if "{" in cleaned and "}" in cleaned:
-        try:
-            start = cleaned.find("{")
-            end = cleaned.rfind("}") + 1
-            parsed = json.loads(cleaned[start:end])
-            if isinstance(parsed, dict):
-                return parsed
-        except Exception:
-            pass
-    return {"explanation": text}
-class RagLessonRequest(BaseModel):
-    topic: str
-    subject: str
-    quarter: int
-    lessonTitle: Optional[str] = None
-    learningCompetency: Optional[str] = None
-    moduleUnit: Optional[str] = None
-    learnerLevel: Optional[str] = None
-    userId: Optional[str] = None
-    moduleId: Optional[str] = None
-    lessonId: Optional[str] = None
-    competencyCode: Optional[str] = None
-    storagePath: Optional[str] = None
-class RagProblemRequest(BaseModel):
-    topic: str
-    subject: str
-    quarter: int
-    difficulty: str = Field(default="medium")
-    userId: Optional[str] = None
-class RagAnalysisContextRequest(BaseModel):
-    weakTopics: List[str]
-    subject: str
-    userId: Optional[str] = None
-@router.get("/health")
-async def rag_health():
-    active_model = get_model_for_task("rag_lesson")
-    is_seq = is_sequential_model(active_model)
-    try:
-        health = get_vectorstore_health()
-        return {
-            "status": "ok",
-            "chunkCount": health["chunkCount"],
-            "subjects": health["subjects"],
-            "lastIngested": datetime.now(timezone.utc).isoformat(),
-            "activeModel": active_model,
-            "isSequentialModel": is_seq,
-        }
-    except Exception as exc:
-        return {
-            "status": "degraded",
-            "chunkCount": 0,
-            "subjects": {},
-            "lastIngested": None,
-            "activeModel": active_model,
-            "isSequentialModel": is_seq,
-            "warning": str(exc),
-        }
-def _fetch_youtube_video(lesson_title: str, subject: str, competency: str, quarter: int) -> dict:
-    try:
-        from backend.services.youtube_service import get_video_for_lesson
-    except ImportError:
-        return {}
-    try:
-        video = get_video_for_lesson(lesson_title, subject, competency, quarter)
-        return video or {}
-    except Exception as e:
-        logger.warning("YouTube search failed: %s", e)
-        return {}
-def _ensure_7_sections(lesson_data: dict, lesson_title: str) -> dict:
-    sections = lesson_data.get("sections", [])
-    section_types = {s.get("type") for s in sections}
-    required = ["introduction", "key_concepts", "video", "worked_examples", "important_notes", "try_it_yourself", "summary"]
-    default_content = {
-        "introduction": {"type": "introduction", "title": "Introduction", "content": f"Welcome to the lesson on {lesson_title}."},
-        "key_concepts": {"type": "key_concepts", "title": "Key Concepts", "content": "Below are the key concepts covered in this lesson.", "callouts": []},
-        "video": {"type": "video", "title": "Video Lesson", "content": "Watch this explanation to understand the concepts visually.", "videoId": "", "videoTitle": "", "videoChannel": "", "embedUrl": "", "thumbnailUrl": ""},
-        "worked_examples": {"type": "worked_examples", "title": "Worked Examples", "examples": []},
-        "important_notes": {"type": "important_notes", "title": "Important Notes", "bulletPoints": []},
-        "try_it_yourself": {"type": "try_it_yourself", "title": "Try It Yourself", "practiceProblems": []},
-        "summary": {"type": "summary", "title": "Summary", "content": f"Great job completing the lesson on {lesson_title}!"},
-    }
-    filled = {}
-    for req_type in required:
-        for existing in sections:
-            if existing.get("type") == req_type:
-                filled[req_type] = existing
-                break
-        else:
-            filled[req_type] = default_content[req_type]
-    ordered = [filled[t] for t in required]
-    for i, section in enumerate(ordered):
-        s_type = section.get("type")
-        if s_type == "key_concepts" and not section.get("callouts"):
-            section["callouts"] = []
-        if s_type == "worked_examples" and not section.get("examples"):
-            section["examples"] = []
-        if s_type == "important_notes" and not section.get("bulletPoints"):
-            section["bulletPoints"] = []
-        if s_type == "try_it_yourself" and not section.get("practiceProblems"):
-            section["practiceProblems"] = []
-        ordered[i] = section
-    return {**lesson_data, "sections": ordered}
-@router.post("/lesson")
-async def rag_lesson(request: Request, payload: RagLessonRequest):
-    try:
-        chunks, retrieval_mode = retrieve_lesson_pdf_context(
-        query=build_lesson_query(
-            payload.topic,
-            payload.subject,
-            payload.quarter,
-            lesson_title=payload.lessonTitle,
-            competency=payload.learningCompetency,
-            module_unit=payload.moduleUnit,
-            learner_level=payload.learnerLevel,
-        ),
-        subject=payload.subject,
-        quarter=payload.quarter,
-        lesson_title=payload.lessonTitle,
-        competency=payload.learningCompetency,
-        module_id=payload.moduleId,
-        lesson_id=payload.lessonId,
-        competency_code=payload.competencyCode,
-        storage_path=payload.storagePath,
-        top_k=8,
-    )
-    if not chunks:
-        raise HTTPException(
-            status_code=404,
-            detail={
-                "error": "no_curriculum_context",
-                "message": f"No curriculum content found for lesson '{payload.lessonTitle}' ({payload.subject} Q{payload.quarter}). Please ensure the PDF has been ingested.",
-                "retrievalBand": "low",
-                "sources": [],
-            },
-        )
-    prompt = build_lesson_prompt(
-        lesson_title=payload.lessonTitle or payload.topic,
-        competency=payload.learningCompetency or payload.topic,
-        grade_level="Grade 11-12",
-        subject=payload.subject,
-        quarter=payload.quarter,
-        learner_level=payload.learnerLevel,
-        module_unit=payload.moduleUnit,
-        curriculum_chunks=chunks,
-        competency_code=payload.competencyCode,
-    )
-    raw_explanation = await _generate_text(
-        prompt,
-        task_type="lesson_generation",
-        max_new_tokens=1800,
-        enable_thinking=True,
-    )
-    parsed_lesson = _strip_thinking_and_parse(raw_explanation)
-    parsed_lesson = _ensure_7_sections(parsed_lesson, payload.lessonTitle or payload.topic)
-    if parsed_lesson.get("sections"):
-        video_section = next((s for s in parsed_lesson["sections"] if s.get("type") == "video"), None)
-        if video_section:
-            video_data = _fetch_youtube_video(
-                payload.lessonTitle or payload.topic,
-                payload.subject,
-                payload.learningCompetency or "",
-                payload.quarter,
-            )
-            if video_data:
-                video_section["videoId"] = video_data.get("videoId", "")
-                video_section["videoTitle"] = video_data.get("videoTitle", "")
-                video_section["videoChannel"] = video_data.get("videoChannel", "")
-                video_section["embedUrl"] = video_data.get("embedUrl", "")
-                video_section["thumbnailUrl"] = video_data.get("thumbnailUrl", "")
-    retrieval_summary = summarize_retrieval_confidence(chunks)
-    _log_rag_usage(
-        request,
-        event_type="lesson",
-        topic=build_lesson_query(payload.topic, payload.subject, payload.quarter, lesson_title=payload.lessonTitle),
-        subject=payload.subject,
-        quarter=payload.quarter,
-        chunks=chunks,
-    )
-    needs_review = parsed_lesson.get("needsReview", False)
-    if retrieval_summary.get("band") == "low":
-        needs_review = True
-    return {
-        **parsed_lesson,
-        "retrievalConfidence": retrieval_summary.get("confidence", 0.0),
-        "retrievalBand": retrieval_summary.get("band", "low"),
-        "retrievalMode": retrieval_mode,
-        "needsReview": needs_review,
-        "sources": [
-            {
-                "subject": row.get("subject"),
-                "quarter": row.get("quarter"),
-                "source_file": row.get("source_file"),
-                "storage_path": row.get("storage_path"),
-                "page": row.get("page"),
-                "score": row.get("score"),
-                "content_domain": row.get("content_domain"),
-                "chunk_type": row.get("chunk_type"),
-                "content": row.get("content"),
-            }
-            for row in chunks
-        ],
-        "activeModel": get_model_for_task("rag_lesson"),
-    }
-    except Exception as exc:
-        import traceback
-        logger.error(f"RAG lesson error: {type(exc).__name__}: {exc}\n{traceback.format_exc()}")
-        raise HTTPException(
-            status_code=500,
-            detail={
-                "error": type(exc).__name__,
-                "message": str(exc),
-                "traceback": traceback.format_exc(),
-            },
-        )
-@router.post("/generate-problem")
-async def rag_generate_problem(request: Request, payload: RagProblemRequest):
-    chunks = retrieve_curriculum_context(
-        query=payload.topic,
-        subject=payload.subject,
-        quarter=payload.quarter,
-        top_k=5,
-    )
-    prompt = build_problem_generation_prompt(payload.topic, payload.difficulty, chunks)
-    raw = await _generate_text(
-        prompt,
-        task_type="quiz_generation",
-        max_new_tokens=600,
-        enable_thinking=False,
-    )
-    parsed = _strip_thinking_and_parse(raw)
-    problem = str(parsed.get("problem") or raw)
-    if not problem or problem.startswith("{"):
-        problem = str(parsed.get("content") or str(parsed))
-    if len(problem) < 3 or problem.startswith("{"):
-        problem = raw
-    solution = str(parsed.get("solution") or "")
-    competency_ref = str(parsed.get("competencyReference") or "DepEd competency-aligned")
-    _log_rag_usage(
-        request,
-        event_type="problem_generation",
-        topic=payload.topic,
-        subject=payload.subject,
-        quarter=payload.quarter,
-        chunks=chunks,
-    )
-    return {
-        "problem": problem,
-        "solution": solution,
-        "competencyReference": competency_ref,
-        "sources": [
-            {
-                "subject": row.get("subject"),
-                "quarter": row.get("quarter"),
-                "source_file": row.get("source_file"),
-                "page": row.get("page"),
-                "score": row.get("score"),
-            }
-            for row in chunks
-        ],
-    }
-@router.post("/analysis-context")
-async def rag_analysis_context(request: Request, payload: RagAnalysisContextRequest):
-    if not payload.weakTopics:
-        raise HTTPException(status_code=400, detail="weakTopics must be a non-empty list")
-    chunks = build_analysis_curriculum_context(payload.weakTopics, payload.subject)
-    lines = ["LEARNING COMPETENCIES:"]
-    for index, row in enumerate(chunks, start=1):
-        lines.append(
-            f"{index}. {row.get('content')} (Source: {row.get('source_file')} p.{row.get('page')}, "
-            f"Q{row.get('quarter')}, {row.get('content_domain')})"
-        )
-    _log_rag_usage(
-        request,
-        event_type="analysis_context",
-        topic=", ".join(payload.weakTopics),
-        subject=payload.subject,
-        quarter=None,
-        chunks=chunks,
-    )
-    return {"curriculumContext": "\n".join(lines)}

main.py CHANGED Viewed

@@ -1000,6 +1000,8 @@ class RequestMiddleware(BaseHTTPMiddleware):
                 status_code=500,
                 content={
                     "detail": "Internal server error",
                     "requestId": request_id,
                 },
                 headers={"X-Request-ID": request_id},

                 status_code=500,
                 content={
                     "detail": "Internal server error",
+                    "error": type(exc).__name__,
+                    "message": str(exc),
                     "requestId": request_id,
                 },
                 headers={"X-Request-ID": request_id},

rag/curriculum_rag.py CHANGED Viewed

@@ -57,7 +57,7 @@ def retrieve_curriculum_context(
     storage_path: str | None = None,
     top_k: int = 8,
 ) -> list[dict]:
-    from backend.rag.vectorstore_loader import get_vectorstore_components
     _, collection, embedder = get_vectorstore_components()
     where = _to_where(subject, quarter, content_domain, chunk_type, module_id, lesson_id, competency_code, storage_path)
@@ -195,12 +195,12 @@ def format_retrieved_chunks(curriculum_chunks: list[dict]) -> str:
 def summarize_retrieval_confidence(curriculum_chunks: list[dict]) -> Dict[str, any]:
     if not curriculum_chunks:
-        return {"confidence": 0.0, "band": "low"}
     top_scores = [float(c.get("score") or 0.0) for c in curriculum_chunks[:5]]
     score = sum(top_scores) / max(1, len(top_scores))
     band = "high" if score >= 0.72 else "medium" if score >= 0.5 else "low"
-    return {"confidence": round(score, 3), "band": band}
 def organize_chunks_by_section(chunks: list[dict]) -> Dict[str, List[dict]]:

     storage_path: str | None = None,
     top_k: int = 8,
 ) -> list[dict]:
+    from rag.vectorstore_loader import get_vectorstore_components
     _, collection, embedder = get_vectorstore_components()
     where = _to_where(subject, quarter, content_domain, chunk_type, module_id, lesson_id, competency_code, storage_path)
 def summarize_retrieval_confidence(curriculum_chunks: list[dict]) -> Dict[str, any]:
     if not curriculum_chunks:
+        return {"confidence": 0.0, "band": "low", "chunkCount": 0}
     top_scores = [float(c.get("score") or 0.0) for c in curriculum_chunks[:5]]
     score = sum(top_scores) / max(1, len(top_scores))
     band = "high" if score >= 0.72 else "medium" if score >= 0.5 else "low"
+    return {"confidence": round(score, 3), "band": band, "chunkCount": len(curriculum_chunks)}
 def organize_chunks_by_section(chunks: list[dict]) -> Dict[str, List[dict]]:

routes/rag_routes.py CHANGED Viewed

@@ -229,26 +229,39 @@ def _ensure_7_sections(lesson_data: dict, lesson_title: str) -> dict:
 @router.post("/lesson")
 async def rag_lesson(request: Request, payload: RagLessonRequest):
-    chunks, retrieval_mode = retrieve_lesson_pdf_context(
-        query=build_lesson_query(
-            payload.topic,
-            payload.subject,
-            payload.quarter,
             lesson_title=payload.lessonTitle,
             competency=payload.learningCompetency,
-            module_unit=payload.moduleUnit,
-            learner_level=payload.learnerLevel,
-        ),
-        subject=payload.subject,
-        quarter=payload.quarter,
-        lesson_title=payload.lessonTitle,
-        competency=payload.learningCompetency,
-        module_id=payload.moduleId,
-        lesson_id=payload.lessonId,
-        competency_code=payload.competencyCode,
-        storage_path=payload.storagePath,
-        top_k=8,
-    )
     if not chunks:
         raise HTTPException(
@@ -261,54 +274,98 @@ async def rag_lesson(request: Request, payload: RagLessonRequest):
             },
         )
-    prompt = build_lesson_prompt(
-        lesson_title=payload.lessonTitle or payload.topic,
-        competency=payload.learningCompetency or payload.topic,
-        grade_level="Grade 11-12",
-        subject=payload.subject,
-        quarter=payload.quarter,
-        learner_level=payload.learnerLevel,
-        module_unit=payload.moduleUnit,
-        curriculum_chunks=chunks,
-        competency_code=payload.competencyCode,
-    )
-    raw_explanation = await _generate_text(
-        prompt,
-        task_type="lesson_generation",
-        max_new_tokens=1800,
-        enable_thinking=True,
-    )
-    parsed_lesson = _strip_thinking_and_parse(raw_explanation)
-    parsed_lesson = _ensure_7_sections(parsed_lesson, payload.lessonTitle or payload.topic)
     if parsed_lesson.get("sections"):
         video_section = next((s for s in parsed_lesson["sections"] if s.get("type") == "video"), None)
         if video_section:
-            video_data = _fetch_youtube_video(
-                payload.lessonTitle or payload.topic,
-                payload.subject,
-                payload.learningCompetency or "",
-                payload.quarter,
-            )
-            if video_data:
-                video_section["videoId"] = video_data.get("videoId", "")
-                video_section["videoTitle"] = video_data.get("videoTitle", "")
-                video_section["videoChannel"] = video_data.get("videoChannel", "")
-                video_section["embedUrl"] = video_data.get("embedUrl", "")
-                video_section["thumbnailUrl"] = video_data.get("thumbnailUrl", "")
     retrieval_summary = summarize_retrieval_confidence(chunks)
-    _log_rag_usage(
-        request,
-        event_type="lesson",
-        topic=build_lesson_query(payload.topic, payload.subject, payload.quarter, lesson_title=payload.lessonTitle),
-        subject=payload.subject,
-        quarter=payload.quarter,
-        chunks=chunks,
-    )
     needs_review = parsed_lesson.get("needsReview", False)
     if retrieval_summary.get("band") == "low":

 @router.post("/lesson")
 async def rag_lesson(request: Request, payload: RagLessonRequest):
+    # ── Step 1: Retrieve curriculum chunks ───────────────────────────────────
+    try:
+        chunks, retrieval_mode = retrieve_lesson_pdf_context(
+            topic=build_lesson_query(
+                payload.topic,
+                payload.subject,
+                payload.quarter,
+                lesson_title=payload.lessonTitle,
+                competency=payload.learningCompetency,
+                module_unit=payload.moduleUnit,
+                learner_level=payload.learnerLevel,
+            ),
+            subject=payload.subject,
+            quarter=payload.quarter,
             lesson_title=payload.lessonTitle,
             competency=payload.learningCompetency,
+            module_id=payload.moduleId,
+            lesson_id=payload.lessonId,
+            competency_code=payload.competencyCode,
+            storage_path=payload.storagePath,
+            top_k=8,
+        )
+    except Exception as exc:
+        import traceback
+        logger.error(f"RAG retrieval error: {type(exc).__name__}: {exc}\n{traceback.format_exc()}")
+        raise HTTPException(
+            status_code=503,
+            detail={
+                "error": "retrieval_failed",
+                "message": f"Curriculum retrieval failed: {exc}",
+                "type": type(exc).__name__,
+            },
+        )
     if not chunks:
         raise HTTPException(
             },
         )
+    # ── Step 2: Build prompt ─────────────────────────────────────────────────
+    try:
+        prompt = build_lesson_prompt(
+            lesson_title=payload.lessonTitle or payload.topic,
+            competency=payload.learningCompetency or payload.topic,
+            grade_level="Grade 11-12",
+            subject=payload.subject,
+            quarter=payload.quarter,
+            learner_level=payload.learnerLevel,
+            module_unit=payload.moduleUnit,
+            curriculum_chunks=chunks,
+            competency_code=payload.competencyCode,
+        )
+    except Exception as exc:
+        logger.error(f"RAG prompt build error: {type(exc).__name__}: {exc}")
+        raise HTTPException(
+            status_code=500,
+            detail={
+                "error": "prompt_build_failed",
+                "message": f"Failed to build lesson prompt: {exc}",
+                "type": type(exc).__name__,
+            },
+        )
+    # ── Step 3: AI inference ─────────────────────────────────────────────────
+    try:
+        raw_explanation = await _generate_text(
+            prompt,
+            task_type="rag_lesson",
+            max_new_tokens=1800,
+            enable_thinking=True,
+        )
+    except Exception as exc:
+        logger.error(f"RAG inference error: {type(exc).__name__}: {exc}")
+        raise HTTPException(
+            status_code=502,
+            detail={
+                "error": "inference_failed",
+                "message": f"AI model call failed: {exc}",
+                "type": type(exc).__name__,
+            },
+        )
+    # ── Step 4: Parse & validate response ────────────────────────────────────
+    try:
+        parsed_lesson = _strip_thinking_and_parse(raw_explanation)
+        parsed_lesson = _ensure_7_sections(parsed_lesson, payload.lessonTitle or payload.topic)
+    except Exception as exc:
+        logger.error(f"RAG parse error: {type(exc).__name__}: {exc}")
+        raise HTTPException(
+            status_code=500,
+            detail={
+                "error": "parse_failed",
+                "message": f"Failed to parse AI response: {exc}",
+                "type": type(exc).__name__,
+            },
+        )
+    # ── Step 5: Enrich with video ────────────────────────────────────────────
     if parsed_lesson.get("sections"):
         video_section = next((s for s in parsed_lesson["sections"] if s.get("type") == "video"), None)
         if video_section:
+            try:
+                video_data = _fetch_youtube_video(
+                    payload.lessonTitle or payload.topic,
+                    payload.subject,
+                    payload.learningCompetency or "",
+                    payload.quarter,
+                )
+                if video_data:
+                    video_section["videoId"] = video_data.get("videoId", "")
+                    video_section["videoTitle"] = video_data.get("videoTitle", "")
+                    video_section["videoChannel"] = video_data.get("videoChannel", "")
+                    video_section["embedUrl"] = video_data.get("embedUrl", "")
+                    video_section["thumbnailUrl"] = video_data.get("thumbnailUrl", "")
+            except Exception as exc:
+                logger.warning("YouTube enrichment skipped: %s", exc)
+    # ── Step 6: Assemble response ────────────────────────────────────────────
     retrieval_summary = summarize_retrieval_confidence(chunks)
+    try:
+        _log_rag_usage(
+            request,
+            event_type="lesson",
+            topic=build_lesson_query(payload.topic, payload.subject, payload.quarter, lesson_title=payload.lessonTitle),
+            subject=payload.subject,
+            quarter=payload.quarter,
+            chunks=chunks,
+        )
+    except Exception as exc:
+        logger.warning("RAG usage logging skipped: %s", exc)
     needs_review = parsed_lesson.get("needsReview", False)
     if retrieval_summary.get("band") == "low":

test_full_rag.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""Manual smoke check of the RAG lesson pipeline: retrieval, prompt build, inference.
+
+Run it directly (``python test_full_rag.py``). Nothing executes on import, so a
+test collector that picks up ``test_*.py`` files does not trigger retrieval or
+an inference call.
+"""
+import os
+import sys
+import traceback
+
+LESSON_TEXT = "Represent real-life relationships as functions and interpret domain/range."
+
+
+def main() -> int:
+    """Run the three pipeline checks in order and return a process exit code.
+
+    Returns 1 as soon as retrieval or prompt building fails. An inference
+    failure is printed with its traceback but still returns 0, because that
+    step is treated as optional.
+    """
+    sys.path.insert(0, 'backend')
+    # Set required env vars
+    os.environ.setdefault('DEEPSEEK_API_KEY', '')
+    os.environ.setdefault('DEEPSEEK_BASE_URL', 'https://api.deepseek.com')
+    # Imported after the path/env setup above, in case these modules depend on it.
+    from rag.curriculum_rag import retrieve_lesson_pdf_context, build_lesson_prompt
+    from services.inference_client import InferenceClient, InferenceRequest
+
+    # Test retrieval
+    print("Testing retrieval...")
+    try:
+        chunks, mode = retrieve_lesson_pdf_context(
+            topic=LESSON_TEXT,
+            subject="General Mathematics",
+            quarter=2,
+            lesson_title=LESSON_TEXT,
+            module_id="gen-math",
+            lesson_id="gm-q2-functions-graphs-l1",
+            competency_code="GM11-FG-1",
+            top_k=8,
+        )
+        print(f"Retrieved {len(chunks)} chunks, mode={mode}")
+    except Exception as e:
+        print(f"Retrieval ERROR: {type(e).__name__}: {e}")
+        traceback.print_exc()
+        return 1
+
+    # Test prompt building
+    print("\nTesting prompt building...")
+    try:
+        prompt = build_lesson_prompt(
+            lesson_title=LESSON_TEXT,
+            competency=LESSON_TEXT,
+            grade_level="Grade 11-12",
+            subject="General Mathematics",
+            quarter=2,
+            learner_level="Grade 11-12",
+            module_unit="n/a",
+            curriculum_chunks=chunks,
+            competency_code="GM11-FG-1",
+        )
+        print(f"Prompt length: {len(prompt)} chars")
+        print(f"Prompt preview: {prompt[:200]}...")
+    except Exception as e:
+        print(f"Prompt building ERROR: {type(e).__name__}: {e}")
+        traceback.print_exc()
+        return 1
+
+    # Test inference (optional - might cost money)
+    print("\nTesting inference...")
+    try:
+        client = InferenceClient()
+        req = InferenceRequest(
+            messages=[
+                {"role": "system", "content": "You are a precise DepEd-aligned curriculum assistant."},
+                {"role": "user", "content": prompt},
+            ],
+            task_type="lesson_generation",
+            max_new_tokens=100,  # Small for testing
+            temperature=0.2,
+            top_p=0.9,
+            enable_thinking=True,
+        )
+        result = client.generate_from_messages(req)
+        print(f"Inference result: {result[:200]}...")
+        print("SUCCESS!")
+    except Exception as e:
+        # Best effort: report the failure without changing the exit code.
+        print(f"Inference ERROR: {type(e).__name__}: {e}")
+        traceback.print_exc()
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

test_retrieval.py ADDED Viewed

	@@ -0,0 +1,39 @@

+"""Manual check of curriculum retrieval, with and without module/lesson filters.
+
+Run it directly (``python test_retrieval.py``). Nothing executes on import, so a
+test collector that picks up ``test_*.py`` files does not hit the vector store.
+"""
+import sys
+import traceback
+
+LESSON_TEXT = "Represent real-life relationships as functions and interpret domain/range."
+
+
+def main() -> int:
+    """Run both retrieval checks and return 0 if both succeed, 1 otherwise.
+
+    A failure in the first check does not stop the second from running; each
+    failure is printed with its traceback.
+    """
+    sys.path.insert(0, '.')
+    from rag.curriculum_rag import retrieve_lesson_pdf_context, retrieve_curriculum_context
+
+    failed = False
+    # Test retrieval with the same params as the frontend
+    try:
+        chunks, mode = retrieve_lesson_pdf_context(
+            topic=LESSON_TEXT,
+            subject="General Mathematics",
+            quarter=2,
+            lesson_title=LESSON_TEXT,
+            module_id="gen-math",
+            lesson_id="gm-q2-functions-graphs-l1",
+            competency_code="GM11-FG-1",
+            top_k=8,
+        )
+        print(f"Retrieved {len(chunks)} chunks, mode={mode}")
+        for i, chunk in enumerate(chunks[:3]):
+            print(f"  Chunk {i}: score={chunk.get('score')}, domain={chunk.get('content_domain')}, source={chunk.get('source_file')}")
+            print(f"    Content: {chunk.get('content', '')[:100]}...")
+    except Exception as e:
+        print(f"ERROR: {type(e).__name__}: {e}")
+        traceback.print_exc()
+        failed = True
+
+    # Also test without module/lesson filters
+    try:
+        chunks2 = retrieve_curriculum_context(
+            query=LESSON_TEXT,
+            subject="General Mathematics",
+            quarter=2,
+            top_k=8,
+        )
+        print(f"\nGeneral retrieval: {len(chunks2)} chunks")
+    except Exception as e:
+        print(f"\nGeneral ERROR: {type(e).__name__}: {e}")
+        traceback.print_exc()
+        failed = True
+
+    # Surface failures to the caller instead of always exiting 0.
+    return 1 if failed else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

tests/test_rag_pipeline.py CHANGED Viewed

@@ -23,13 +23,18 @@ def _mock_vectorstore_components(collection_mock, embedder_mock):
 class TestRetrieveCurriculumContext:
     def test_empty_collection_returns_empty_list(self):
         collection = MagicMock()
-        collection_get_result = collection.get.return_value
-        collection_get_result.__getitem__.return_value = []
         embedder = MagicMock()
         with patch(
-            "rag.curriculum_rag.get_vectorstore_components",
             return_value=(MagicMock(), collection, embedder),
         ):
             result = retrieve_curriculum_context(
@@ -73,14 +78,12 @@ class TestBuildLessonPrompt:
             ],
         )
         assert "JSON" in prompt
-        assert "lessonTitle" in prompt
         assert "needsReview" in prompt
-        ph_context_terms = [
-            "payroll", "VAT", "discounts", "loans", "Pag-IBIG", "school",
-        ]
-        assert any(term in prompt for term in ph_context_terms)
-    def test_contains_thinking_hint(self):
         prompt = build_lesson_prompt(
             lesson_title="Functions",
             competency="M11GM-Ia-1",
@@ -91,7 +94,10 @@ class TestBuildLessonPrompt:
             module_unit=None,
             curriculum_chunks=[],
         )
-        assert "Think step by step" in prompt
 class TestSummarizeRetrievalConfidence:

 class TestRetrieveCurriculumContext:
     def test_empty_collection_returns_empty_list(self):
         collection = MagicMock()
+        collection.query.return_value = {
+            "documents": [[]],
+            "metadatas": [[]],
+            "distances": [[]],
+        }
         embedder = MagicMock()
+        embedder.encode.return_value = MagicMock()
+        embedder.encode.return_value.tolist.return_value = [0.0] * 768
         with patch(
+            "rag.vectorstore_loader.get_vectorstore_components",
             return_value=(MagicMock(), collection, embedder),
         ):
             result = retrieve_curriculum_context(
             ],
         )
         assert "JSON" in prompt
+        assert "Lesson title:" in prompt
         assert "needsReview" in prompt
+        assert "DepEd-aligned" in prompt
+        assert "7 sections" in prompt
+    def test_contains_required_sections_in_prompt(self):
         prompt = build_lesson_prompt(
             lesson_title="Functions",
             competency="M11GM-Ia-1",
             module_unit=None,
             curriculum_chunks=[],
         )
+        assert "introduction" in prompt
+        assert "key_concepts" in prompt
+        assert "worked_examples" in prompt
+        assert "try_it_yourself" in prompt
 class TestSummarizeRetrievalConfidence: