Spaces:

chuckfinca
/

fot-recommender-api

Sleeping

chuckfinca committed on Aug 8, 2025

Commit

280d562

1 Parent(s): 79336f1

style: Format code and fix linter warnings

Applied `ruff format` to the entire codebase for consistency.

Also fixed all outstanding Pyright and Ruff linter warnings:
- Ignored warnings for correct but "private" imports (gradio, genai).
- Added a check for the API key to fix a potential `None` type error.
- Suppressed E402 import errors in the build script where required.

Files changed (9) hide show

app.py +148 -35
notebooks/fot_recommender_poc.ipynb +36 -17
scripts/build_knowledge_base.py +6 -4
src/fot_recommender/config.py +3 -1
src/fot_recommender/main.py +1 -4
src/fot_recommender/rag_pipeline.py +8 -6
src/fot_recommender/utils.py +42 -37
tests/test_chunking.py +23 -6
tests/test_pipeline.py +20 -10

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ from fot_recommender.config import (
     FOT_GOOGLE_API_KEY,
     DEMO_PASSWORD,
     SEARCH_RESULT_COUNT_K,
-    MIN_SIMILARITY_SCORE
 )
 from fot_recommender.utils import load_citations, format_evidence_for_display
 from fot_recommender.rag_pipeline import (
@@ -28,18 +28,18 @@ EXAMPLE_NARRATIVES = [
     {
         "short_title": "Overwhelmed",
         "title": "Overwhelmed Freshman (Academic & Attendance)",
-        "narrative": "A comprehensive support plan is urgently needed for this freshman. Academic performance is a critical concern, with failures in both Math and English leading to a credit deficiency of only 2 out of 4 expected credits. This academic struggle is compounded by a drop in attendance to 85% and a recent behavioral flag for an outburst in class, suggesting the student is significantly overwhelmed by the transition to high school."
     },
     {
         "short_title": "Withdrawn",
         "title": "Withdrawn Freshman (Social-Emotional)",
-        "narrative": "Academically, this freshman appears to be thriving, with a high GPA and perfect attendance. A closer look at classroom performance, however, reveals a student who is completely withdrawn. They do not participate in discussions or engage in any extracurricular activities, and teacher notes repeatedly describe them as 'isolated.' The lack of behavioral flags is a result of non-engagement, not positive conduct, pointing to a clear need for interventions focused on social-emotional learning and school connectedness."
     },
     {
         "short_title": "Disruptive",
         "title": "Disruptive Freshman (Behavioral)",
-        "narrative": "While this student's academics and credits earned are currently on track and attendance is acceptable at 92%, a significant pattern of disruptive behavior is jeopardizing their long-term success. An accumulation of five behavioral flags across multiple classes indicates a primary need for interventions in behavior management and positive conduct. Support should be focused on mentoring and strategies to foster appropriate classroom engagement before these behaviors begin to negatively impact their academic standing."
-    }
 ]
 EXAMPLE_MAP = {ex["short_title"]: ex["narrative"] for ex in EXAMPLE_NARRATIVES}
 EXAMPLE_TITLES = list(EXAMPLE_MAP.keys())
@@ -52,25 +52,66 @@ citations_map = load_citations(str(CITATIONS_PATH))
 embedding_model = initialize_embedding_model()
 print("✅ API initialized successfully.")
 def get_recommendations_api(student_narrative, persona, password):
     """The main function that runs the RAG pipeline and prepares data for export."""
     if password != DEMO_PASSWORD:
-        yield "Authentication failed. Please enter a valid Access Key.", gr.update(interactive=True), gr.update(visible=False), None, gr.update(visible=False)
         return
     if not student_narrative:
-        yield "Please enter a student narrative.", gr.update(interactive=True), gr.update(visible=False), None, gr.update(visible=False)
         return
-    yield "Processing...", gr.update(interactive=False), gr.update(visible=False), None, gr.update(visible=False)
     # 1. RETRIEVE
-    query_embedding = np.asarray(embedding_model.encode([student_narrative])).astype("float32")
     scores, indices = index.search(query_embedding, k=SEARCH_RESULT_COUNT_K)
-    retrieved_chunks_with_scores = [(knowledge_base_chunks[i], score) for i, score in zip(indices[0], scores[0]) if score >= MIN_SIMILARITY_SCORE]
     if not retrieved_chunks_with_scores:
-        yield "Could not find relevant interventions.", gr.update(interactive=True), gr.update(visible=False), None, gr.update(visible=False)
         return
     # 2. GENERATE
@@ -82,7 +123,9 @@ def get_recommendations_api(student_narrative, persona, password):
     )
     # 3. Augment with evidence for UI
-    formatted_evidence = format_evidence_for_display(retrieved_chunks_with_scores, citations_map)
     evidence_header = "\n\n---\n\n### Evidence Base\n"
     evidence_list_str = ""
     for evidence in formatted_evidence:
@@ -90,72 +133,142 @@ def get_recommendations_api(student_narrative, persona, password):
         evidence_list_str += f"  - **Source:** {evidence['source']}\n"
         evidence_list_str += f"  - **Page(s):** {evidence['pages']}\n"
         evidence_list_str += f"  - **Relevance Score:** {evidence['score']}\n"
-        evidence_list_str += f"  - **Content Snippet:**\n  > {evidence['content_snippet']}\n"
     final_output = synthesized_recommendation + evidence_header + evidence_list_str
     # 4. Assemble Evaluation Data
     evaluation_data = {
         "timestamp": datetime.datetime.now().isoformat(),
         "inputs": {"student_narrative": student_narrative, "persona": persona},
         "retrieval_results": [
             {
-                "chunk_title": chunk['title'], "relevance_score": float(score),
-                "source_document": chunk['source_document'], "page_info": chunk.get('fot_pages', 'N/A'),
-                "original_content": chunk.get('original_content', ''), "citation_info": citations_map.get(chunk['source_document'], {})
-            } for chunk, score in retrieved_chunks_with_scores
         ],
         "llm_output": {"synthesized_recommendation": synthesized_recommendation},
-        "final_ui_output": final_output
     }
     # 5. Create a temporary file for download
-    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".json", encoding='utf-8') as f:
         json.dump(evaluation_data, f, indent=4)
         temp_file_path = f.name
-    yield final_output, gr.update(interactive=True), gr.update(visible=True), evaluation_data, gr.update(value=temp_file_path, visible=True)
 # --- UI Helper Functions ---
 def clear_all():
-    return "", None, "", gr.update(visible=False), None, gr.update(visible=False, value=None)
 def update_narrative_from_example(selection):
     return EXAMPLE_MAP.get(selection, "")
 CUSTOM_CSS = """
 .radio-horizontal .gr-form { flex-direction: row; flex-wrap: wrap; gap: 0.5rem; }
 """
 # --- Gradio Interface ---
-with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as interface:
-    gr.Markdown("# Freshman On-Track Intervention Recommender\n*A live API demonstrating the FOT Recommender.*")
     with gr.Row(equal_height=False):
         with gr.Column(scale=1):
             with gr.Group():
-                narrative_input = gr.Textbox(lines=8, label="Student Narrative", placeholder="Describe the student's situation here, or select an example below.")
-                example_radio = gr.Radio(EXAMPLE_TITLES, label="Load an Example Scenario", info="Select one to populate the narrative above. Typing a custom narrative will clear this selection.", elem_classes=["radio-horizontal"])
-                persona_input = gr.Radio(["teacher", "parent", "principal"], label="Who is this recommendation for?", value="teacher", elem_classes=["radio-horizontal"])
-                password_input = gr.Textbox(label="Access Key", type="password", info="Enter the access key for the demo.")
                 with gr.Row():
                     clear_btn = gr.Button("Clear")
                     submit_btn = gr.Button("Submit", variant="primary")
         with gr.Column(scale=2):
-            recommendation_output = gr.Markdown(label="Synthesized Recommendation", show_copy_button=True)
-            with gr.Accordion("Evaluation Data", open=False, visible=False) as eval_accordion:
                 json_viewer = gr.JSON(label="Evaluation JSON")
                 download_btn = gr.DownloadButton("Download JSON", visible=False)
     # --- Event Handlers ---
-    example_radio.change(fn=update_narrative_from_example, inputs=example_radio, outputs=narrative_input)
     narrative_input.input(fn=lambda: None, inputs=None, outputs=example_radio)
-    submit_btn.click(fn=get_recommendations_api, inputs=[narrative_input, persona_input, password_input], outputs=[recommendation_output, submit_btn, eval_accordion, json_viewer, download_btn])
-    clear_btn.click(fn=clear_all, inputs=[], outputs=[narrative_input, example_radio, recommendation_output, eval_accordion, json_viewer, download_btn])
 if __name__ == "__main__":
     # Add project src to the sys.path for when running as a script
     APP_ROOT = Path(__file__).parent
     sys.path.insert(0, str(APP_ROOT / "src"))
-    interface.launch()

     FOT_GOOGLE_API_KEY,
     DEMO_PASSWORD,
     SEARCH_RESULT_COUNT_K,
+    MIN_SIMILARITY_SCORE,
 )
 from fot_recommender.utils import load_citations, format_evidence_for_display
 from fot_recommender.rag_pipeline import (
     {
         "short_title": "Overwhelmed",
         "title": "Overwhelmed Freshman (Academic & Attendance)",
+        "narrative": "A comprehensive support plan is urgently needed for this freshman. Academic performance is a critical concern, with failures in both Math and English leading to a credit deficiency of only 2 out of 4 expected credits. This academic struggle is compounded by a drop in attendance to 85% and a recent behavioral flag for an outburst in class, suggesting the student is significantly overwhelmed by the transition to high school.",
     },
     {
         "short_title": "Withdrawn",
         "title": "Withdrawn Freshman (Social-Emotional)",
+        "narrative": "Academically, this freshman appears to be thriving, with a high GPA and perfect attendance. A closer look at classroom performance, however, reveals a student who is completely withdrawn. They do not participate in discussions or engage in any extracurricular activities, and teacher notes repeatedly describe them as 'isolated.' The lack of behavioral flags is a result of non-engagement, not positive conduct, pointing to a clear need for interventions focused on social-emotional learning and school connectedness.",
     },
     {
         "short_title": "Disruptive",
         "title": "Disruptive Freshman (Behavioral)",
+        "narrative": "While this student's academics and credits earned are currently on track and attendance is acceptable at 92%, a significant pattern of disruptive behavior is jeopardizing their long-term success. An accumulation of five behavioral flags across multiple classes indicates a primary need for interventions in behavior management and positive conduct. Support should be focused on mentoring and strategies to foster appropriate classroom engagement before these behaviors begin to negatively impact their academic standing.",
+    },
 ]
 EXAMPLE_MAP = {ex["short_title"]: ex["narrative"] for ex in EXAMPLE_NARRATIVES}
 EXAMPLE_TITLES = list(EXAMPLE_MAP.keys())
 embedding_model = initialize_embedding_model()
 print("✅ API initialized successfully.")
 def get_recommendations_api(student_narrative, persona, password):
     """The main function that runs the RAG pipeline and prepares data for export."""
     if password != DEMO_PASSWORD:
+        yield (
+            "Authentication failed. Please enter a valid Access Key.",
+            gr.update(interactive=True),
+            gr.update(visible=False),
+            None,
+            gr.update(visible=False),
+        )
+        return
+    if not FOT_GOOGLE_API_KEY:
+        yield (
+            "ERROR: The Google API Key is not configured. Please set the FOT_GOOGLE_API_KEY in the .env file.",
+            gr.update(interactive=True),
+            gr.update(visible=False),
+            None,
+            gr.update(visible=False),
+        )
         return
     if not student_narrative:
+        yield (
+            "Please enter a student narrative.",
+            gr.update(interactive=True),
+            gr.update(visible=False),
+            None,
+            gr.update(visible=False),
+        )
         return
+    yield (
+        "Processing...",
+        gr.update(interactive=False),
+        gr.update(visible=False),
+        None,
+        gr.update(visible=False),
+    )
     # 1. RETRIEVE
+    query_embedding = np.asarray(embedding_model.encode([student_narrative])).astype(
+        "float32"
+    )
     scores, indices = index.search(query_embedding, k=SEARCH_RESULT_COUNT_K)
+    retrieved_chunks_with_scores = [
+        (knowledge_base_chunks[i], score)
+        for i, score in zip(indices[0], scores[0])
+        if score >= MIN_SIMILARITY_SCORE
+    ]
     if not retrieved_chunks_with_scores:
+        yield (
+            "Could not find relevant interventions.",
+            gr.update(interactive=True),
+            gr.update(visible=False),
+            None,
+            gr.update(visible=False),
+        )
         return
     # 2. GENERATE
     )
     # 3. Augment with evidence for UI
+    formatted_evidence = format_evidence_for_display(
+        retrieved_chunks_with_scores, citations_map
+    )
     evidence_header = "\n\n---\n\n### Evidence Base\n"
     evidence_list_str = ""
     for evidence in formatted_evidence:
         evidence_list_str += f"  - **Source:** {evidence['source']}\n"
         evidence_list_str += f"  - **Page(s):** {evidence['pages']}\n"
         evidence_list_str += f"  - **Relevance Score:** {evidence['score']}\n"
+        evidence_list_str += (
+            f"  - **Content Snippet:**\n  > {evidence['content_snippet']}\n"
+        )
     final_output = synthesized_recommendation + evidence_header + evidence_list_str
     # 4. Assemble Evaluation Data
     evaluation_data = {
         "timestamp": datetime.datetime.now().isoformat(),
         "inputs": {"student_narrative": student_narrative, "persona": persona},
         "retrieval_results": [
             {
+                "chunk_title": chunk["title"],
+                "relevance_score": float(score),
+                "source_document": chunk["source_document"],
+                "page_info": chunk.get("fot_pages", "N/A"),
+                "original_content": chunk.get("original_content", ""),
+                "citation_info": citations_map.get(chunk["source_document"], {}),
+            }
+            for chunk, score in retrieved_chunks_with_scores
         ],
         "llm_output": {"synthesized_recommendation": synthesized_recommendation},
+        "final_ui_output": final_output,
     }
     # 5. Create a temporary file for download
+    with tempfile.NamedTemporaryFile(
+        mode="w", delete=False, suffix=".json", encoding="utf-8"
+    ) as f:
         json.dump(evaluation_data, f, indent=4)
         temp_file_path = f.name
+    yield (
+        final_output,
+        gr.update(interactive=True),
+        gr.update(visible=True),
+        evaluation_data,
+        gr.update(value=temp_file_path, visible=True),
+    )
 # --- UI Helper Functions ---
 def clear_all():
+    return (
+        "",
+        None,
+        "",
+        gr.update(visible=False),
+        None,
+        gr.update(visible=False, value=None),
+    )
 def update_narrative_from_example(selection):
     return EXAMPLE_MAP.get(selection, "")
 CUSTOM_CSS = """
 .radio-horizontal .gr-form { flex-direction: row; flex-wrap: wrap; gap: 0.5rem; }
 """
 # --- Gradio Interface ---
+with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as interface:  # type: ignore
+    gr.Markdown(
+        "# Freshman On-Track Intervention Recommender\n*A live API demonstrating the FOT Recommender.*"
+    )
     with gr.Row(equal_height=False):
         with gr.Column(scale=1):
             with gr.Group():
+                narrative_input = gr.Textbox(
+                    lines=8,
+                    label="Student Narrative",
+                    placeholder="Describe the student's situation here, or select an example below.",
+                )
+                example_radio = gr.Radio(
+                    EXAMPLE_TITLES,
+                    label="Load an Example Scenario",
+                    info="Select one to populate the narrative above. Typing a custom narrative will clear this selection.",
+                    elem_classes=["radio-horizontal"],
+                )
+                persona_input = gr.Radio(
+                    ["teacher", "parent", "principal"],
+                    label="Who is this recommendation for?",
+                    value="teacher",
+                    elem_classes=["radio-horizontal"],
+                )
+                password_input = gr.Textbox(
+                    label="Access Key",
+                    type="password",
+                    info="Enter the access key for the demo.",
+                )
                 with gr.Row():
                     clear_btn = gr.Button("Clear")
                     submit_btn = gr.Button("Submit", variant="primary")
         with gr.Column(scale=2):
+            recommendation_output = gr.Markdown(
+                label="Synthesized Recommendation", show_copy_button=True
+            )
+            with gr.Accordion(
+                "Evaluation Data", open=False, visible=False
+            ) as eval_accordion:
                 json_viewer = gr.JSON(label="Evaluation JSON")
                 download_btn = gr.DownloadButton("Download JSON", visible=False)
     # --- Event Handlers ---
+    example_radio.change(
+        fn=update_narrative_from_example, inputs=example_radio, outputs=narrative_input
+    )
     narrative_input.input(fn=lambda: None, inputs=None, outputs=example_radio)
+    submit_btn.click(
+        fn=get_recommendations_api,
+        inputs=[narrative_input, persona_input, password_input],
+        outputs=[
+            recommendation_output,
+            submit_btn,
+            eval_accordion,
+            json_viewer,
+            download_btn,
+        ],
+    )
+    clear_btn.click(
+        fn=clear_all,
+        inputs=[],
+        outputs=[
+            narrative_input,
+            example_radio,
+            recommendation_output,
+            eval_accordion,
+            json_viewer,
+            download_btn,
+        ],
+    )
 if __name__ == "__main__":
     # Add project src to the sys.path for when running as a script
     APP_ROOT = Path(__file__).parent
     sys.path.insert(0, str(APP_ROOT / "src"))
+    interface.launch()

notebooks/fot_recommender_poc.ipynb CHANGED Viewed

@@ -27,7 +27,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "id": "97f37783",
    "metadata": {},
    "outputs": [
@@ -47,7 +47,9 @@
     "\n",
     "# This prevents common, harmless warnings from cluttering the output.\n",
     "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
-    "warnings.filterwarnings(\"ignore\", category=FutureWarning) # Suppress specific torch warning\n",
     "\n",
     "# Clones the project from GitHub if not already present.\n",
     "PROJECT_DIR = \"fot-intervention-recommender\"\n",
@@ -80,7 +82,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "id": "3784865f",
    "metadata": {},
    "outputs": [
@@ -139,28 +141,40 @@
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/charlesfeinn/Developer/job_applications/fot-intervention-recommender/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Initializing embedding model: all-MiniLM-L6-v2...\n",
       "Model initialized successfully.\n",
       "Creating embeddings for 27 chunks...\n"
      ]
     },
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.12s/it]\n"
-     ]
     },
     {
      "name": "stdout",
@@ -292,12 +306,15 @@
     }
    ],
    "source": [
     "from fot_recommender.rag_pipeline import (\n",
     "    load_knowledge_base,\n",
     "    initialize_embedding_model,\n",
     "    create_embeddings,\n",
     "    create_vector_db,\n",
-    "    search_interventions\n",
     ")\n",
     "from fot_recommender.utils import display_recommendations\n",
     "\n",
@@ -320,11 +337,13 @@
     "    index=vector_db,\n",
     "    knowledge_base=knowledge_base_chunks,\n",
     "    k=3,\n",
-    "    min_similarity_score=0.4\n",
     ")\n",
     "\n",
     "# 4. Display a clean summary and the rich results\n",
-    "print(f\"✅ Successfully loaded models and retrieved the top {len(retrieved_interventions)} most relevant interventions from the knowledge base.\")\n",
     "display_recommendations(retrieved_interventions, citations_map)"
    ]
   },

   },
   {
    "cell_type": "code",
+   "execution_count": 1,
    "id": "97f37783",
    "metadata": {},
    "outputs": [
     "\n",
     "# This prevents common, harmless warnings from cluttering the output.\n",
     "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+    "warnings.filterwarnings(\n",
+    "    \"ignore\", category=FutureWarning\n",
+    ")  # Suppress specific torch warning\n",
     "\n",
     "# Clones the project from GitHub if not already present.\n",
     "PROJECT_DIR = \"fot-intervention-recommender\"\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "id": "3784865f",
    "metadata": {},
    "outputs": [
    "metadata": {},
    "outputs": [
     {
+     "data": {
+      "text/markdown": [
+       "🚀 **Starting the retrieval pipeline...**"
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "This may take a moment as the system loads the embedding model, prepares the knowledge base, and performs the search.\n",
       "Initializing embedding model: all-MiniLM-L6-v2...\n",
       "Model initialized successfully.\n",
       "Creating embeddings for 27 chunks...\n"
      ]
     },
     {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "aed4d46c859d4f8a88caf88daa5a38cc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
      "name": "stdout",
     }
    ],
    "source": [
+    "display(Markdown(\"🚀 **Starting the retrieval pipeline...**\"))\n",
+    "print(\"This may take a moment as the system loads the embedding model, prepares the knowledge base, and performs the search.\")\n",
+    "\n",
     "from fot_recommender.rag_pipeline import (\n",
     "    load_knowledge_base,\n",
     "    initialize_embedding_model,\n",
     "    create_embeddings,\n",
     "    create_vector_db,\n",
+    "    search_interventions,\n",
     ")\n",
     "from fot_recommender.utils import display_recommendations\n",
     "\n",
     "    index=vector_db,\n",
     "    knowledge_base=knowledge_base_chunks,\n",
     "    k=3,\n",
+    "    min_similarity_score=0.4,\n",
     ")\n",
     "\n",
     "# 4. Display a clean summary and the rich results\n",
+    "print(\n",
+    "    f\"✅ Successfully loaded models and retrieved the top {len(retrieved_interventions)} most relevant interventions from the knowledge base.\"\n",
+    ")\n",
     "display_recommendations(retrieved_interventions, citations_map)"
    ]
   },

scripts/build_knowledge_base.py CHANGED Viewed

@@ -7,15 +7,17 @@ from pathlib import Path
 project_root = Path(__file__).parent.parent
 sys.path.append(str(project_root))
-from src.fot_recommender.config import (
     PROCESSED_DATA_DIR,
     RAW_KB_PATH,
     FINAL_KB_CHUNKS_PATH,
     FAISS_INDEX_PATH,
     EMBEDDING_MODEL_NAME,
 )
-from src.fot_recommender.semantic_chunker import chunk_by_concept
-from src.fot_recommender.rag_pipeline import (
     initialize_embedding_model,
     create_embeddings,
 )
@@ -61,4 +63,4 @@ def build():
 if __name__ == "__main__":
-    build()

 project_root = Path(__file__).parent.parent
 sys.path.append(str(project_root))
+# We are intentionally ignoring the E402 warning here because the sys.path
+# modification must happen before we can import from our local package.
+from src.fot_recommender.config import (  # noqa: E402
     PROCESSED_DATA_DIR,
     RAW_KB_PATH,
     FINAL_KB_CHUNKS_PATH,
     FAISS_INDEX_PATH,
     EMBEDDING_MODEL_NAME,
 )
+from src.fot_recommender.semantic_chunker import chunk_by_concept  # noqa: E402
+from src.fot_recommender.rag_pipeline import (  # noqa: E402
     initialize_embedding_model,
     create_embeddings,
 )
 if __name__ == "__main__":
+    build()

src/fot_recommender/config.py CHANGED Viewed

@@ -32,4 +32,6 @@ EMBEDDING_CONTENT_KEY = "content_for_embedding"
 # --- Secrets Management ---
 # Load secrets from the environment. The application will import these variables.
 FOT_GOOGLE_API_KEY = os.environ.get("FOT_GOOGLE_API_KEY")
-DEMO_PASSWORD = os.environ.get("DEMO_PASSWORD", "default_password") # Added a default for safety

 # --- Secrets Management ---
 # Load secrets from the environment. The application will import these variables.
 FOT_GOOGLE_API_KEY = os.environ.get("FOT_GOOGLE_API_KEY")
+DEMO_PASSWORD = os.environ.get(
+    "DEMO_PASSWORD", "default_password"
+)  # Added a default for safety

src/fot_recommender/main.py CHANGED Viewed

@@ -88,10 +88,7 @@ def main():
         return "ERROR: FOT_GOOGLE_API_KEY not found. Please create a .env file and add your key."
     synthesized_recommendation = generate_recommendation_summary(
-        top_interventions,
-        student_query,
-        api_key=api_key,
-        persona="teacher"
     )
     # --- 5. Display Final Output ---

         return "ERROR: FOT_GOOGLE_API_KEY not found. Please create a .env file and add your key."
     synthesized_recommendation = generate_recommendation_summary(
+        top_interventions, student_query, api_key=api_key, persona="teacher"
     )
     # --- 5. Display Final Output ---

src/fot_recommender/rag_pipeline.py CHANGED Viewed

@@ -11,7 +11,7 @@ from fot_recommender.config import (
     EMBEDDING_CONTENT_KEY,
     GENERATIVE_MODEL_NAME,
     SEARCH_RESULT_COUNT_K,
-    MIN_SIMILARITY_SCORE
 )
@@ -89,7 +89,7 @@ def search_interventions(
     """
     print(f"\nSearching for top {k} interventions for query: '{query[:80]}...'")
     query_embedding = np.asarray(model.encode([query])).astype("float32")
-    scores, indices = index.search(query_embedding, k)
     results = []
     for i, score in zip(indices[0], scores[0]):
         if i != -1:  # FAISS returns -1 for no result
@@ -108,7 +108,7 @@ def generate_recommendation_summary(
     student_narrative: str,
     api_key: str,
     persona: str = "teacher",
-    model_name: str = GENERATIVE_MODEL_NAME
 ) -> str:
     """
     Generates a synthesized recommendation using the Google Gemini API.
@@ -131,10 +131,12 @@ def generate_recommendation_summary(
     )
     try:
-        print(f"\nSynthesizing recommendation for persona: '{persona}' using {model_name}...")
-        model = genai.GenerativeModel(model_name)
         response = model.generate_content(prompt)
         print("Synthesis complete.")
         return response.text
     except Exception as e:
-        return f"An error occurred while calling the Gemini API: {e}"

     EMBEDDING_CONTENT_KEY,
     GENERATIVE_MODEL_NAME,
     SEARCH_RESULT_COUNT_K,
+    MIN_SIMILARITY_SCORE,
 )
     """
     print(f"\nSearching for top {k} interventions for query: '{query[:80]}...'")
     query_embedding = np.asarray(model.encode([query])).astype("float32")
+    scores, indices = index.search(query_embedding, k)  # type: ignore
     results = []
     for i, score in zip(indices[0], scores[0]):
         if i != -1:  # FAISS returns -1 for no result
     student_narrative: str,
     api_key: str,
     persona: str = "teacher",
+    model_name: str = GENERATIVE_MODEL_NAME,
 ) -> str:
     """
     Generates a synthesized recommendation using the Google Gemini API.
     )
     try:
+        print(
+            f"\nSynthesizing recommendation for persona: '{persona}' using {model_name}..."
+        )
+        model = genai.GenerativeModel(model_name)  # type: ignore
         response = model.generate_content(prompt)
         print("Synthesis complete.")
         return response.text
     except Exception as e:
+        return f"An error occurred while calling the Gemini API: {e}"

src/fot_recommender/utils.py CHANGED Viewed

@@ -2,6 +2,7 @@ import datetime
 import json
 from IPython.display import display, Markdown
 def display_recommendations(results: list, citations_map: dict):
     """
     Displays the retrieved recommendations in a rich, Markdown-formatted output
@@ -15,29 +16,28 @@ def display_recommendations(results: list, citations_map: dict):
     formatted_evidence = format_evidence_for_display(results, citations_map)
     display(Markdown("### Evidence Base"))
     # 2. Loop through the clean data and render it for the notebook
     for evidence in formatted_evidence:
         recommendation_md = f"""
-**{evidence['title']}**
-- **Source:** {evidence['source']}
-- **Page(s):** {evidence['pages']}
-- **Relevance Score:** {evidence['score']}
 - **Content Snippet:**
-> {evidence['content_snippet']}
 ---
 """
         display(Markdown(recommendation_md))
 def create_evaluation_bundle(
     student_narrative: str,
     persona: str,
     retrieved_chunks_with_scores: list,
     synthesized_recommendation: str,
-    citations_map: dict
 ) -> dict:
     """
     Assembles a comprehensive dictionary for evaluation and logging purposes.
@@ -50,20 +50,20 @@ def create_evaluation_bundle(
         },
         "retrieval_results": [
             {
-                "chunk_title": chunk['title'],
                 "relevance_score": float(score),
-                "source_document": chunk['source_document'],
-                "page_info": chunk.get('fot_pages', 'N/A'),
-                "original_content": chunk.get('original_content', ''),
-                "citation_info": citations_map.get(chunk['source_document'], {})
-            } for chunk, score in retrieved_chunks_with_scores
         ],
-        "llm_output": {
-            "synthesized_recommendation": synthesized_recommendation
-        }
     }
     return evaluation_data
 def format_evidence_for_display(results: list, citations_map: dict) -> list:
     """
     Takes raw search results and formats them into a structured list of dictionaries
@@ -71,34 +71,39 @@ def format_evidence_for_display(results: list, citations_map: dict) -> list:
     """
     evidence_list = []
     for chunk, score in results:
-        source_doc = chunk.get('source_document', 'N/A')
         citation_info = citations_map.get(source_doc, {})
         # Consolidate all the formatting logic here
-        title = citation_info.get('title', 'N/A')
-        author = citation_info.get('author', 'N/A')
-        year = citation_info.get('year', 'N/A')
         source_string = f"*{title}* ({author}, {year})."
-        page_info = chunk.get('fot_pages', 'N/A')
-        original_content = chunk.get("original_content", "Content not available.").strip()
-        blockquote_content = original_content.replace('\n', '\n> ')
-        evidence_list.append({
-            "title": chunk['title'],
-            "source": source_string,
-            "pages": page_info,
-            "score": f"{score:.2f}",
-            "content_snippet": blockquote_content
-        })
     return evidence_list
 def load_citations(path):
     try:
         with open(path, "r", encoding="utf-8") as f:
             citations_list = json.load(f)
         return {item["source_document"]: item for item in citations_list}
     except (FileNotFoundError, json.JSONDecodeError):
-        return {}

 import json
 from IPython.display import display, Markdown
 def display_recommendations(results: list, citations_map: dict):
     """
     Displays the retrieved recommendations in a rich, Markdown-formatted output
     formatted_evidence = format_evidence_for_display(results, citations_map)
     display(Markdown("### Evidence Base"))
     # 2. Loop through the clean data and render it for the notebook
     for evidence in formatted_evidence:
         recommendation_md = f"""
+**{evidence["title"]}**
+- **Source:** {evidence["source"]}
+- **Page(s):** {evidence["pages"]}
+- **Relevance Score:** {evidence["score"]}
 - **Content Snippet:**
+> {evidence["content_snippet"]}
 ---
 """
         display(Markdown(recommendation_md))
 def create_evaluation_bundle(
     student_narrative: str,
     persona: str,
     retrieved_chunks_with_scores: list,
     synthesized_recommendation: str,
+    citations_map: dict,
 ) -> dict:
     """
     Assembles a comprehensive dictionary for evaluation and logging purposes.
         },
         "retrieval_results": [
             {
+                "chunk_title": chunk["title"],
                 "relevance_score": float(score),
+                "source_document": chunk["source_document"],
+                "page_info": chunk.get("fot_pages", "N/A"),
+                "original_content": chunk.get("original_content", ""),
+                "citation_info": citations_map.get(chunk["source_document"], {}),
+            }
+            for chunk, score in retrieved_chunks_with_scores
         ],
+        "llm_output": {"synthesized_recommendation": synthesized_recommendation},
     }
     return evaluation_data
 def format_evidence_for_display(results: list, citations_map: dict) -> list:
     """
     Takes raw search results and formats them into a structured list of dictionaries
     """
     evidence_list = []
     for chunk, score in results:
+        source_doc = chunk.get("source_document", "N/A")
         citation_info = citations_map.get(source_doc, {})
         # Consolidate all the formatting logic here
+        title = citation_info.get("title", "N/A")
+        author = citation_info.get("author", "N/A")
+        year = citation_info.get("year", "N/A")
         source_string = f"*{title}* ({author}, {year})."
+        page_info = chunk.get("fot_pages", "N/A")
+        original_content = chunk.get(
+            "original_content", "Content not available."
+        ).strip()
+        blockquote_content = original_content.replace("\n", "\n> ")
+        evidence_list.append(
+            {
+                "title": chunk["title"],
+                "source": source_string,
+                "pages": page_info,
+                "score": f"{score:.2f}",
+                "content_snippet": blockquote_content,
+            }
+        )
     return evidence_list
 def load_citations(path):
     try:
         with open(path, "r", encoding="utf-8") as f:
             citations_list = json.load(f)
         return {item["source_document"]: item for item in citations_list}
     except (FileNotFoundError, json.JSONDecodeError):
+        return {}

tests/test_chunking.py CHANGED Viewed

@@ -7,9 +7,24 @@ def test_chunk_by_concept_groups_correctly():
     # 1. Arrange: Create simple, predictable raw data
     sample_raw_kb = [
-        {"source_document": "doc_A", "concept": "Mentoring", "absolute_page": 1, "content": "First part."},
-        {"source_document": "doc_B", "concept": "Tutoring", "absolute_page": 10, "content": "Tutoring info."},
-        {"source_document": "doc_A", "concept": "Mentoring", "absolute_page": 2, "content": "Second part."},
     ]
     # 2. Act: Run the function we're testing
@@ -20,10 +35,12 @@ def test_chunk_by_concept_groups_correctly():
     # Find the 'Mentoring' chunk for detailed checks
     mentoring_chunk = next(c for c in final_chunks if c["title"] == "Mentoring")
     assert mentoring_chunk is not None
     assert mentoring_chunk["source_document"] == "doc_A"
     assert mentoring_chunk["fot_pages"] == "Pages: 1, 2"
     assert "First part.\n\nSecond part." in mentoring_chunk["original_content"]
-    assert "Title: Mentoring. Content: First part.\n\nSecond part." in mentoring_chunk["content_for_embedding"]

     # 1. Arrange: Create simple, predictable raw data
     sample_raw_kb = [
+        {
+            "source_document": "doc_A",
+            "concept": "Mentoring",
+            "absolute_page": 1,
+            "content": "First part.",
+        },
+        {
+            "source_document": "doc_B",
+            "concept": "Tutoring",
+            "absolute_page": 10,
+            "content": "Tutoring info.",
+        },
+        {
+            "source_document": "doc_A",
+            "concept": "Mentoring",
+            "absolute_page": 2,
+            "content": "Second part.",
+        },
     ]
     # 2. Act: Run the function we're testing
     # Find the 'Mentoring' chunk for detailed checks
     mentoring_chunk = next(c for c in final_chunks if c["title"] == "Mentoring")
     assert mentoring_chunk is not None
     assert mentoring_chunk["source_document"] == "doc_A"
     assert mentoring_chunk["fot_pages"] == "Pages: 1, 2"
     assert "First part.\n\nSecond part." in mentoring_chunk["original_content"]
+    assert (
+        "Title: Mentoring. Content: First part.\n\nSecond part."
+        in mentoring_chunk["content_for_embedding"]
+    )

tests/test_pipeline.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from unittest.mock import MagicMock, patch
 import numpy as np
 def test_search_interventions_filters_by_score():
     """
     Ensures the search function correctly filters out results
@@ -11,7 +12,7 @@ def test_search_interventions_filters_by_score():
     # 1. Arrange: Create mock objects and sample data
     mock_model = MagicMock()
     mock_index = MagicMock()
     # Fake knowledge base
     sample_kb = [{"id": 1, "content": "high score"}, {"id": 2, "content": "low score"}]
@@ -19,7 +20,7 @@ def test_search_interventions_filters_by_score():
     # Let's say it finds two results, one with a high score (0.9) and one low (0.3)
     mock_index.search.return_value = (
         np.array([[0.9, 0.3]]),  # scores
-        np.array([[0, 1]])        # indices
     )
     # 2. Act: Run the search with a minimum score of 0.5
@@ -29,14 +30,14 @@ def test_search_interventions_filters_by_score():
         index=mock_index,
         knowledge_base=sample_kb,
         k=2,
-        min_similarity_score=0.5
     )
     # 3. Assert: Check that only the high-scoring result was returned
     assert len(results) == 1
-    assert results[0][0]["content"] == "high score" # Check the chunk content
-    assert results[0][1] == 0.9                     # Check the score
 def test_generate_recommendation_summary_builds_correct_prompt():
     """
@@ -47,13 +48,22 @@ def test_generate_recommendation_summary_builds_correct_prompt():
     # 1. Arrange: Create sample inputs
     sample_chunks = [
-        ({"title": "Tip 1", "original_content": "Do this.", "source_document": "doc_A"}, 0.9),
     ]
     student_narrative = "Student is struggling."
     # 2. Act & Assert: Use a patch to intercept the API call
     # This temporarily replaces genai.GenerativeModel with our mock
-    with patch("src.fot_recommender.rag_pipeline.genai.GenerativeModel") as mock_gen_model:
         # Create a mock instance that the function will use
         mock_model_instance = MagicMock()
         mock_gen_model.return_value = mock_model_instance
@@ -62,13 +72,13 @@ def test_generate_recommendation_summary_builds_correct_prompt():
             retrieved_chunks=sample_chunks,
             student_narrative=student_narrative,
             api_key="fake_key",
-            persona="teacher"
         )
         # 3. Assert: Check what our function tried to do
         # Was the API call made once?
         mock_model_instance.generate_content.assert_called_once()
         # Get the actual prompt that was passed to the LLM
         actual_prompt = mock_model_instance.generate_content.call_args[0][0]

 from unittest.mock import MagicMock, patch
 import numpy as np
 def test_search_interventions_filters_by_score():
     """
     Ensures the search function correctly filters out results
     # 1. Arrange: Create mock objects and sample data
     mock_model = MagicMock()
     mock_index = MagicMock()
     # Fake knowledge base
     sample_kb = [{"id": 1, "content": "high score"}, {"id": 2, "content": "low score"}]
     # Let's say it finds two results, one with a high score (0.9) and one low (0.3)
     mock_index.search.return_value = (
         np.array([[0.9, 0.3]]),  # scores
+        np.array([[0, 1]]),  # indices
     )
     # 2. Act: Run the search with a minimum score of 0.5
         index=mock_index,
         knowledge_base=sample_kb,
         k=2,
+        min_similarity_score=0.5,
     )
     # 3. Assert: Check that only the high-scoring result was returned
     assert len(results) == 1
+    assert results[0][0]["content"] == "high score"  # Check the chunk content
+    assert results[0][1] == 0.9  # Check the score
 def test_generate_recommendation_summary_builds_correct_prompt():
     """
     # 1. Arrange: Create sample inputs
     sample_chunks = [
+        (
+            {
+                "title": "Tip 1",
+                "original_content": "Do this.",
+                "source_document": "doc_A",
+            },
+            0.9,
+        ),
     ]
     student_narrative = "Student is struggling."
     # 2. Act & Assert: Use a patch to intercept the API call
     # This temporarily replaces genai.GenerativeModel with our mock
+    with patch(
+        "src.fot_recommender.rag_pipeline.genai.GenerativeModel"
+    ) as mock_gen_model:
         # Create a mock instance that the function will use
         mock_model_instance = MagicMock()
         mock_gen_model.return_value = mock_model_instance
             retrieved_chunks=sample_chunks,
             student_narrative=student_narrative,
             api_key="fake_key",
+            persona="teacher",
         )
         # 3. Assert: Check what our function tried to do
         # Was the API call made once?
         mock_model_instance.generate_content.assert_called_once()
         # Get the actual prompt that was passed to the LLM
         actual_prompt = mock_model_instance.generate_content.call_args[0][0]