chuckfinca committed on
Commit
280d562
·
1 Parent(s): 79336f1

style: Format code and fix linter warnings

Browse files

Applied `ruff format` to the entire codebase for consistency.

Also fixed all outstanding Pyright and Ruff linter warnings:
- Ignored warnings for correct but "private" imports (gradio, genai).
- Added a check for the API key to fix a potential `None` type error.
- Suppressed E402 import errors in the build script where required.

app.py CHANGED
@@ -14,7 +14,7 @@ from fot_recommender.config import (
14
  FOT_GOOGLE_API_KEY,
15
  DEMO_PASSWORD,
16
  SEARCH_RESULT_COUNT_K,
17
- MIN_SIMILARITY_SCORE
18
  )
19
  from fot_recommender.utils import load_citations, format_evidence_for_display
20
  from fot_recommender.rag_pipeline import (
@@ -28,18 +28,18 @@ EXAMPLE_NARRATIVES = [
28
  {
29
  "short_title": "Overwhelmed",
30
  "title": "Overwhelmed Freshman (Academic & Attendance)",
31
- "narrative": "A comprehensive support plan is urgently needed for this freshman. Academic performance is a critical concern, with failures in both Math and English leading to a credit deficiency of only 2 out of 4 expected credits. This academic struggle is compounded by a drop in attendance to 85% and a recent behavioral flag for an outburst in class, suggesting the student is significantly overwhelmed by the transition to high school."
32
  },
33
  {
34
  "short_title": "Withdrawn",
35
  "title": "Withdrawn Freshman (Social-Emotional)",
36
- "narrative": "Academically, this freshman appears to be thriving, with a high GPA and perfect attendance. A closer look at classroom performance, however, reveals a student who is completely withdrawn. They do not participate in discussions or engage in any extracurricular activities, and teacher notes repeatedly describe them as 'isolated.' The lack of behavioral flags is a result of non-engagement, not positive conduct, pointing to a clear need for interventions focused on social-emotional learning and school connectedness."
37
  },
38
  {
39
  "short_title": "Disruptive",
40
  "title": "Disruptive Freshman (Behavioral)",
41
- "narrative": "While this student's academics and credits earned are currently on track and attendance is acceptable at 92%, a significant pattern of disruptive behavior is jeopardizing their long-term success. An accumulation of five behavioral flags across multiple classes indicates a primary need for interventions in behavior management and positive conduct. Support should be focused on mentoring and strategies to foster appropriate classroom engagement before these behaviors begin to negatively impact their academic standing."
42
- }
43
  ]
44
  EXAMPLE_MAP = {ex["short_title"]: ex["narrative"] for ex in EXAMPLE_NARRATIVES}
45
  EXAMPLE_TITLES = list(EXAMPLE_MAP.keys())
@@ -52,25 +52,66 @@ citations_map = load_citations(str(CITATIONS_PATH))
52
  embedding_model = initialize_embedding_model()
53
  print("✅ API initialized successfully.")
54
 
 
55
  def get_recommendations_api(student_narrative, persona, password):
56
  """The main function that runs the RAG pipeline and prepares data for export."""
57
  if password != DEMO_PASSWORD:
58
- yield "Authentication failed. Please enter a valid Access Key.", gr.update(interactive=True), gr.update(visible=False), None, gr.update(visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  return
60
 
61
  if not student_narrative:
62
- yield "Please enter a student narrative.", gr.update(interactive=True), gr.update(visible=False), None, gr.update(visible=False)
 
 
 
 
 
 
63
  return
64
 
65
- yield "Processing...", gr.update(interactive=False), gr.update(visible=False), None, gr.update(visible=False)
 
 
 
 
 
 
66
 
67
  # 1. RETRIEVE
68
- query_embedding = np.asarray(embedding_model.encode([student_narrative])).astype("float32")
 
 
69
  scores, indices = index.search(query_embedding, k=SEARCH_RESULT_COUNT_K)
70
- retrieved_chunks_with_scores = [(knowledge_base_chunks[i], score) for i, score in zip(indices[0], scores[0]) if score >= MIN_SIMILARITY_SCORE]
 
 
 
 
71
 
72
  if not retrieved_chunks_with_scores:
73
- yield "Could not find relevant interventions.", gr.update(interactive=True), gr.update(visible=False), None, gr.update(visible=False)
 
 
 
 
 
 
74
  return
75
 
76
  # 2. GENERATE
@@ -82,7 +123,9 @@ def get_recommendations_api(student_narrative, persona, password):
82
  )
83
 
84
  # 3. Augment with evidence for UI
85
- formatted_evidence = format_evidence_for_display(retrieved_chunks_with_scores, citations_map)
 
 
86
  evidence_header = "\n\n---\n\n### Evidence Base\n"
87
  evidence_list_str = ""
88
  for evidence in formatted_evidence:
@@ -90,72 +133,142 @@ def get_recommendations_api(student_narrative, persona, password):
90
  evidence_list_str += f" - **Source:** {evidence['source']}\n"
91
  evidence_list_str += f" - **Page(s):** {evidence['pages']}\n"
92
  evidence_list_str += f" - **Relevance Score:** {evidence['score']}\n"
93
- evidence_list_str += f" - **Content Snippet:**\n > {evidence['content_snippet']}\n"
94
-
 
 
95
  final_output = synthesized_recommendation + evidence_header + evidence_list_str
96
-
97
  # 4. Assemble Evaluation Data
98
  evaluation_data = {
99
  "timestamp": datetime.datetime.now().isoformat(),
100
  "inputs": {"student_narrative": student_narrative, "persona": persona},
101
  "retrieval_results": [
102
  {
103
- "chunk_title": chunk['title'], "relevance_score": float(score),
104
- "source_document": chunk['source_document'], "page_info": chunk.get('fot_pages', 'N/A'),
105
- "original_content": chunk.get('original_content', ''), "citation_info": citations_map.get(chunk['source_document'], {})
106
- } for chunk, score in retrieved_chunks_with_scores
 
 
 
 
107
  ],
108
  "llm_output": {"synthesized_recommendation": synthesized_recommendation},
109
- "final_ui_output": final_output
110
  }
111
 
112
  # 5. Create a temporary file for download
113
- with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".json", encoding='utf-8') as f:
 
 
114
  json.dump(evaluation_data, f, indent=4)
115
  temp_file_path = f.name
116
 
117
- yield final_output, gr.update(interactive=True), gr.update(visible=True), evaluation_data, gr.update(value=temp_file_path, visible=True)
 
 
 
 
 
 
118
 
119
 
120
  # --- UI Helper Functions ---
121
  def clear_all():
122
- return "", None, "", gr.update(visible=False), None, gr.update(visible=False, value=None)
 
 
 
 
 
 
 
 
123
 
124
  def update_narrative_from_example(selection):
125
  return EXAMPLE_MAP.get(selection, "")
126
 
 
127
  CUSTOM_CSS = """
128
  .radio-horizontal .gr-form { flex-direction: row; flex-wrap: wrap; gap: 0.5rem; }
129
  """
130
 
131
  # --- Gradio Interface ---
132
- with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as interface:
133
- gr.Markdown("# Freshman On-Track Intervention Recommender\n*A live API demonstrating the FOT Recommender.*")
 
 
134
  with gr.Row(equal_height=False):
135
  with gr.Column(scale=1):
136
  with gr.Group():
137
- narrative_input = gr.Textbox(lines=8, label="Student Narrative", placeholder="Describe the student's situation here, or select an example below.")
138
- example_radio = gr.Radio(EXAMPLE_TITLES, label="Load an Example Scenario", info="Select one to populate the narrative above. Typing a custom narrative will clear this selection.", elem_classes=["radio-horizontal"])
139
- persona_input = gr.Radio(["teacher", "parent", "principal"], label="Who is this recommendation for?", value="teacher", elem_classes=["radio-horizontal"])
140
- password_input = gr.Textbox(label="Access Key", type="password", info="Enter the access key for the demo.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  with gr.Row():
142
  clear_btn = gr.Button("Clear")
143
  submit_btn = gr.Button("Submit", variant="primary")
144
  with gr.Column(scale=2):
145
- recommendation_output = gr.Markdown(label="Synthesized Recommendation", show_copy_button=True)
146
- with gr.Accordion("Evaluation Data", open=False, visible=False) as eval_accordion:
 
 
 
 
147
  json_viewer = gr.JSON(label="Evaluation JSON")
148
  download_btn = gr.DownloadButton("Download JSON", visible=False)
149
 
150
  # --- Event Handlers ---
151
- example_radio.change(fn=update_narrative_from_example, inputs=example_radio, outputs=narrative_input)
 
 
152
  narrative_input.input(fn=lambda: None, inputs=None, outputs=example_radio)
153
- submit_btn.click(fn=get_recommendations_api, inputs=[narrative_input, persona_input, password_input], outputs=[recommendation_output, submit_btn, eval_accordion, json_viewer, download_btn])
154
- clear_btn.click(fn=clear_all, inputs=[], outputs=[narrative_input, example_radio, recommendation_output, eval_accordion, json_viewer, download_btn])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
 
157
  if __name__ == "__main__":
158
  # Add project src to the sys.path for when running as a script
159
  APP_ROOT = Path(__file__).parent
160
  sys.path.insert(0, str(APP_ROOT / "src"))
161
- interface.launch()
 
14
  FOT_GOOGLE_API_KEY,
15
  DEMO_PASSWORD,
16
  SEARCH_RESULT_COUNT_K,
17
+ MIN_SIMILARITY_SCORE,
18
  )
19
  from fot_recommender.utils import load_citations, format_evidence_for_display
20
  from fot_recommender.rag_pipeline import (
 
28
  {
29
  "short_title": "Overwhelmed",
30
  "title": "Overwhelmed Freshman (Academic & Attendance)",
31
+ "narrative": "A comprehensive support plan is urgently needed for this freshman. Academic performance is a critical concern, with failures in both Math and English leading to a credit deficiency of only 2 out of 4 expected credits. This academic struggle is compounded by a drop in attendance to 85% and a recent behavioral flag for an outburst in class, suggesting the student is significantly overwhelmed by the transition to high school.",
32
  },
33
  {
34
  "short_title": "Withdrawn",
35
  "title": "Withdrawn Freshman (Social-Emotional)",
36
+ "narrative": "Academically, this freshman appears to be thriving, with a high GPA and perfect attendance. A closer look at classroom performance, however, reveals a student who is completely withdrawn. They do not participate in discussions or engage in any extracurricular activities, and teacher notes repeatedly describe them as 'isolated.' The lack of behavioral flags is a result of non-engagement, not positive conduct, pointing to a clear need for interventions focused on social-emotional learning and school connectedness.",
37
  },
38
  {
39
  "short_title": "Disruptive",
40
  "title": "Disruptive Freshman (Behavioral)",
41
+ "narrative": "While this student's academics and credits earned are currently on track and attendance is acceptable at 92%, a significant pattern of disruptive behavior is jeopardizing their long-term success. An accumulation of five behavioral flags across multiple classes indicates a primary need for interventions in behavior management and positive conduct. Support should be focused on mentoring and strategies to foster appropriate classroom engagement before these behaviors begin to negatively impact their academic standing.",
42
+ },
43
  ]
44
  EXAMPLE_MAP = {ex["short_title"]: ex["narrative"] for ex in EXAMPLE_NARRATIVES}
45
  EXAMPLE_TITLES = list(EXAMPLE_MAP.keys())
 
52
  embedding_model = initialize_embedding_model()
53
  print("✅ API initialized successfully.")
54
 
55
+
56
  def get_recommendations_api(student_narrative, persona, password):
57
  """The main function that runs the RAG pipeline and prepares data for export."""
58
  if password != DEMO_PASSWORD:
59
+ yield (
60
+ "Authentication failed. Please enter a valid Access Key.",
61
+ gr.update(interactive=True),
62
+ gr.update(visible=False),
63
+ None,
64
+ gr.update(visible=False),
65
+ )
66
+ return
67
+
68
+ if not FOT_GOOGLE_API_KEY:
69
+ yield (
70
+ "ERROR: The Google API Key is not configured. Please set the FOT_GOOGLE_API_KEY in the .env file.",
71
+ gr.update(interactive=True),
72
+ gr.update(visible=False),
73
+ None,
74
+ gr.update(visible=False),
75
+ )
76
  return
77
 
78
  if not student_narrative:
79
+ yield (
80
+ "Please enter a student narrative.",
81
+ gr.update(interactive=True),
82
+ gr.update(visible=False),
83
+ None,
84
+ gr.update(visible=False),
85
+ )
86
  return
87
 
88
+ yield (
89
+ "Processing...",
90
+ gr.update(interactive=False),
91
+ gr.update(visible=False),
92
+ None,
93
+ gr.update(visible=False),
94
+ )
95
 
96
  # 1. RETRIEVE
97
+ query_embedding = np.asarray(embedding_model.encode([student_narrative])).astype(
98
+ "float32"
99
+ )
100
  scores, indices = index.search(query_embedding, k=SEARCH_RESULT_COUNT_K)
101
+ retrieved_chunks_with_scores = [
102
+ (knowledge_base_chunks[i], score)
103
+ for i, score in zip(indices[0], scores[0])
104
+ if score >= MIN_SIMILARITY_SCORE
105
+ ]
106
 
107
  if not retrieved_chunks_with_scores:
108
+ yield (
109
+ "Could not find relevant interventions.",
110
+ gr.update(interactive=True),
111
+ gr.update(visible=False),
112
+ None,
113
+ gr.update(visible=False),
114
+ )
115
  return
116
 
117
  # 2. GENERATE
 
123
  )
124
 
125
  # 3. Augment with evidence for UI
126
+ formatted_evidence = format_evidence_for_display(
127
+ retrieved_chunks_with_scores, citations_map
128
+ )
129
  evidence_header = "\n\n---\n\n### Evidence Base\n"
130
  evidence_list_str = ""
131
  for evidence in formatted_evidence:
 
133
  evidence_list_str += f" - **Source:** {evidence['source']}\n"
134
  evidence_list_str += f" - **Page(s):** {evidence['pages']}\n"
135
  evidence_list_str += f" - **Relevance Score:** {evidence['score']}\n"
136
+ evidence_list_str += (
137
+ f" - **Content Snippet:**\n > {evidence['content_snippet']}\n"
138
+ )
139
+
140
  final_output = synthesized_recommendation + evidence_header + evidence_list_str
141
+
142
  # 4. Assemble Evaluation Data
143
  evaluation_data = {
144
  "timestamp": datetime.datetime.now().isoformat(),
145
  "inputs": {"student_narrative": student_narrative, "persona": persona},
146
  "retrieval_results": [
147
  {
148
+ "chunk_title": chunk["title"],
149
+ "relevance_score": float(score),
150
+ "source_document": chunk["source_document"],
151
+ "page_info": chunk.get("fot_pages", "N/A"),
152
+ "original_content": chunk.get("original_content", ""),
153
+ "citation_info": citations_map.get(chunk["source_document"], {}),
154
+ }
155
+ for chunk, score in retrieved_chunks_with_scores
156
  ],
157
  "llm_output": {"synthesized_recommendation": synthesized_recommendation},
158
+ "final_ui_output": final_output,
159
  }
160
 
161
  # 5. Create a temporary file for download
162
+ with tempfile.NamedTemporaryFile(
163
+ mode="w", delete=False, suffix=".json", encoding="utf-8"
164
+ ) as f:
165
  json.dump(evaluation_data, f, indent=4)
166
  temp_file_path = f.name
167
 
168
+ yield (
169
+ final_output,
170
+ gr.update(interactive=True),
171
+ gr.update(visible=True),
172
+ evaluation_data,
173
+ gr.update(value=temp_file_path, visible=True),
174
+ )
175
 
176
 
177
  # --- UI Helper Functions ---
178
  def clear_all():
179
+ return (
180
+ "",
181
+ None,
182
+ "",
183
+ gr.update(visible=False),
184
+ None,
185
+ gr.update(visible=False, value=None),
186
+ )
187
+
188
 
189
  def update_narrative_from_example(selection):
190
  return EXAMPLE_MAP.get(selection, "")
191
 
192
+
193
  CUSTOM_CSS = """
194
  .radio-horizontal .gr-form { flex-direction: row; flex-wrap: wrap; gap: 0.5rem; }
195
  """
196
 
197
  # --- Gradio Interface ---
198
+ with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as interface: # type: ignore
199
+ gr.Markdown(
200
+ "# Freshman On-Track Intervention Recommender\n*A live API demonstrating the FOT Recommender.*"
201
+ )
202
  with gr.Row(equal_height=False):
203
  with gr.Column(scale=1):
204
  with gr.Group():
205
+ narrative_input = gr.Textbox(
206
+ lines=8,
207
+ label="Student Narrative",
208
+ placeholder="Describe the student's situation here, or select an example below.",
209
+ )
210
+ example_radio = gr.Radio(
211
+ EXAMPLE_TITLES,
212
+ label="Load an Example Scenario",
213
+ info="Select one to populate the narrative above. Typing a custom narrative will clear this selection.",
214
+ elem_classes=["radio-horizontal"],
215
+ )
216
+ persona_input = gr.Radio(
217
+ ["teacher", "parent", "principal"],
218
+ label="Who is this recommendation for?",
219
+ value="teacher",
220
+ elem_classes=["radio-horizontal"],
221
+ )
222
+ password_input = gr.Textbox(
223
+ label="Access Key",
224
+ type="password",
225
+ info="Enter the access key for the demo.",
226
+ )
227
  with gr.Row():
228
  clear_btn = gr.Button("Clear")
229
  submit_btn = gr.Button("Submit", variant="primary")
230
  with gr.Column(scale=2):
231
+ recommendation_output = gr.Markdown(
232
+ label="Synthesized Recommendation", show_copy_button=True
233
+ )
234
+ with gr.Accordion(
235
+ "Evaluation Data", open=False, visible=False
236
+ ) as eval_accordion:
237
  json_viewer = gr.JSON(label="Evaluation JSON")
238
  download_btn = gr.DownloadButton("Download JSON", visible=False)
239
 
240
  # --- Event Handlers ---
241
+ example_radio.change(
242
+ fn=update_narrative_from_example, inputs=example_radio, outputs=narrative_input
243
+ )
244
  narrative_input.input(fn=lambda: None, inputs=None, outputs=example_radio)
245
+ submit_btn.click(
246
+ fn=get_recommendations_api,
247
+ inputs=[narrative_input, persona_input, password_input],
248
+ outputs=[
249
+ recommendation_output,
250
+ submit_btn,
251
+ eval_accordion,
252
+ json_viewer,
253
+ download_btn,
254
+ ],
255
+ )
256
+ clear_btn.click(
257
+ fn=clear_all,
258
+ inputs=[],
259
+ outputs=[
260
+ narrative_input,
261
+ example_radio,
262
+ recommendation_output,
263
+ eval_accordion,
264
+ json_viewer,
265
+ download_btn,
266
+ ],
267
+ )
268
 
269
 
270
  if __name__ == "__main__":
271
  # Add project src to the sys.path for when running as a script
272
  APP_ROOT = Path(__file__).parent
273
  sys.path.insert(0, str(APP_ROOT / "src"))
274
+ interface.launch()
notebooks/fot_recommender_poc.ipynb CHANGED
@@ -27,7 +27,7 @@
27
  },
28
  {
29
  "cell_type": "code",
30
- "execution_count": 2,
31
  "id": "97f37783",
32
  "metadata": {},
33
  "outputs": [
@@ -47,7 +47,9 @@
47
  "\n",
48
  "# This prevents common, harmless warnings from cluttering the output.\n",
49
  "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
50
- "warnings.filterwarnings(\"ignore\", category=FutureWarning) # Suppress specific torch warning\n",
 
 
51
  "\n",
52
  "# Clones the project from GitHub if not already present.\n",
53
  "PROJECT_DIR = \"fot-intervention-recommender\"\n",
@@ -80,7 +82,7 @@
80
  },
81
  {
82
  "cell_type": "code",
83
- "execution_count": 3,
84
  "id": "3784865f",
85
  "metadata": {},
86
  "outputs": [
@@ -139,28 +141,40 @@
139
  "metadata": {},
140
  "outputs": [
141
  {
142
- "name": "stderr",
143
- "output_type": "stream",
144
- "text": [
145
- "/Users/charlesfeinn/Developer/job_applications/fot-intervention-recommender/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
146
- " from .autonotebook import tqdm as notebook_tqdm\n"
147
- ]
 
 
 
 
148
  },
149
  {
150
  "name": "stdout",
151
  "output_type": "stream",
152
  "text": [
 
153
  "Initializing embedding model: all-MiniLM-L6-v2...\n",
154
  "Model initialized successfully.\n",
155
  "Creating embeddings for 27 chunks...\n"
156
  ]
157
  },
158
  {
159
- "name": "stderr",
160
- "output_type": "stream",
161
- "text": [
162
- "Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00, 2.12s/it]\n"
163
- ]
 
 
 
 
 
 
 
164
  },
165
  {
166
  "name": "stdout",
@@ -292,12 +306,15 @@
292
  }
293
  ],
294
  "source": [
 
 
 
295
  "from fot_recommender.rag_pipeline import (\n",
296
  " load_knowledge_base,\n",
297
  " initialize_embedding_model,\n",
298
  " create_embeddings,\n",
299
  " create_vector_db,\n",
300
- " search_interventions\n",
301
  ")\n",
302
  "from fot_recommender.utils import display_recommendations\n",
303
  "\n",
@@ -320,11 +337,13 @@
320
  " index=vector_db,\n",
321
  " knowledge_base=knowledge_base_chunks,\n",
322
  " k=3,\n",
323
- " min_similarity_score=0.4\n",
324
  ")\n",
325
  "\n",
326
  "# 4. Display a clean summary and the rich results\n",
327
- "print(f\"✅ Successfully loaded models and retrieved the top {len(retrieved_interventions)} most relevant interventions from the knowledge base.\")\n",
 
 
328
  "display_recommendations(retrieved_interventions, citations_map)"
329
  ]
330
  },
 
27
  },
28
  {
29
  "cell_type": "code",
30
+ "execution_count": 1,
31
  "id": "97f37783",
32
  "metadata": {},
33
  "outputs": [
 
47
  "\n",
48
  "# This prevents common, harmless warnings from cluttering the output.\n",
49
  "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
50
+ "warnings.filterwarnings(\n",
51
+ " \"ignore\", category=FutureWarning\n",
52
+ ") # Suppress specific torch warning\n",
53
  "\n",
54
  "# Clones the project from GitHub if not already present.\n",
55
  "PROJECT_DIR = \"fot-intervention-recommender\"\n",
 
82
  },
83
  {
84
  "cell_type": "code",
85
+ "execution_count": 2,
86
  "id": "3784865f",
87
  "metadata": {},
88
  "outputs": [
 
141
  "metadata": {},
142
  "outputs": [
143
  {
144
+ "data": {
145
+ "text/markdown": [
146
+ "🚀 **Starting the retrieval pipeline...**"
147
+ ],
148
+ "text/plain": [
149
+ "<IPython.core.display.Markdown object>"
150
+ ]
151
+ },
152
+ "metadata": {},
153
+ "output_type": "display_data"
154
  },
155
  {
156
  "name": "stdout",
157
  "output_type": "stream",
158
  "text": [
159
+ "This may take a moment as the system loads the embedding model, prepares the knowledge base, and performs the search.\n",
160
  "Initializing embedding model: all-MiniLM-L6-v2...\n",
161
  "Model initialized successfully.\n",
162
  "Creating embeddings for 27 chunks...\n"
163
  ]
164
  },
165
  {
166
+ "data": {
167
+ "application/vnd.jupyter.widget-view+json": {
168
+ "model_id": "aed4d46c859d4f8a88caf88daa5a38cc",
169
+ "version_major": 2,
170
+ "version_minor": 0
171
+ },
172
+ "text/plain": [
173
+ "Batches: 0%| | 0/1 [00:00<?, ?it/s]"
174
+ ]
175
+ },
176
+ "metadata": {},
177
+ "output_type": "display_data"
178
  },
179
  {
180
  "name": "stdout",
 
306
  }
307
  ],
308
  "source": [
309
+ "display(Markdown(\"🚀 **Starting the retrieval pipeline...**\"))\n",
310
+ "print(\"This may take a moment as the system loads the embedding model, prepares the knowledge base, and performs the search.\")\n",
311
+ "\n",
312
  "from fot_recommender.rag_pipeline import (\n",
313
  " load_knowledge_base,\n",
314
  " initialize_embedding_model,\n",
315
  " create_embeddings,\n",
316
  " create_vector_db,\n",
317
+ " search_interventions,\n",
318
  ")\n",
319
  "from fot_recommender.utils import display_recommendations\n",
320
  "\n",
 
337
  " index=vector_db,\n",
338
  " knowledge_base=knowledge_base_chunks,\n",
339
  " k=3,\n",
340
+ " min_similarity_score=0.4,\n",
341
  ")\n",
342
  "\n",
343
  "# 4. Display a clean summary and the rich results\n",
344
+ "print(\n",
345
+ " f\"✅ Successfully loaded models and retrieved the top {len(retrieved_interventions)} most relevant interventions from the knowledge base.\"\n",
346
+ ")\n",
347
  "display_recommendations(retrieved_interventions, citations_map)"
348
  ]
349
  },
scripts/build_knowledge_base.py CHANGED
@@ -7,15 +7,17 @@ from pathlib import Path
7
  project_root = Path(__file__).parent.parent
8
  sys.path.append(str(project_root))
9
 
10
- from src.fot_recommender.config import (
 
 
11
  PROCESSED_DATA_DIR,
12
  RAW_KB_PATH,
13
  FINAL_KB_CHUNKS_PATH,
14
  FAISS_INDEX_PATH,
15
  EMBEDDING_MODEL_NAME,
16
  )
17
- from src.fot_recommender.semantic_chunker import chunk_by_concept
18
- from src.fot_recommender.rag_pipeline import (
19
  initialize_embedding_model,
20
  create_embeddings,
21
  )
@@ -61,4 +63,4 @@ def build():
61
 
62
 
63
  if __name__ == "__main__":
64
- build()
 
7
  project_root = Path(__file__).parent.parent
8
  sys.path.append(str(project_root))
9
 
10
+ # We are intentionally ignoring the E402 warning here because the sys.path
11
+ # modification must happen before we can import from our local package.
12
+ from src.fot_recommender.config import ( # noqa: E402
13
  PROCESSED_DATA_DIR,
14
  RAW_KB_PATH,
15
  FINAL_KB_CHUNKS_PATH,
16
  FAISS_INDEX_PATH,
17
  EMBEDDING_MODEL_NAME,
18
  )
19
+ from src.fot_recommender.semantic_chunker import chunk_by_concept # noqa: E402
20
+ from src.fot_recommender.rag_pipeline import ( # noqa: E402
21
  initialize_embedding_model,
22
  create_embeddings,
23
  )
 
63
 
64
 
65
  if __name__ == "__main__":
66
+ build()
src/fot_recommender/config.py CHANGED
@@ -32,4 +32,6 @@ EMBEDDING_CONTENT_KEY = "content_for_embedding"
32
  # --- Secrets Management ---
33
  # Load secrets from the environment. The application will import these variables.
34
  FOT_GOOGLE_API_KEY = os.environ.get("FOT_GOOGLE_API_KEY")
35
- DEMO_PASSWORD = os.environ.get("DEMO_PASSWORD", "default_password") # Added a default for safety
 
 
 
32
  # --- Secrets Management ---
33
  # Load secrets from the environment. The application will import these variables.
34
  FOT_GOOGLE_API_KEY = os.environ.get("FOT_GOOGLE_API_KEY")
35
+ DEMO_PASSWORD = os.environ.get(
36
+ "DEMO_PASSWORD", "default_password"
37
+ ) # Added a default for safety
src/fot_recommender/main.py CHANGED
@@ -88,10 +88,7 @@ def main():
88
  return "ERROR: FOT_GOOGLE_API_KEY not found. Please create a .env file and add your key."
89
 
90
  synthesized_recommendation = generate_recommendation_summary(
91
- top_interventions,
92
- student_query,
93
- api_key=api_key,
94
- persona="teacher"
95
  )
96
 
97
  # --- 5. Display Final Output ---
 
88
  return "ERROR: FOT_GOOGLE_API_KEY not found. Please create a .env file and add your key."
89
 
90
  synthesized_recommendation = generate_recommendation_summary(
91
+ top_interventions, student_query, api_key=api_key, persona="teacher"
 
 
 
92
  )
93
 
94
  # --- 5. Display Final Output ---
src/fot_recommender/rag_pipeline.py CHANGED
@@ -11,7 +11,7 @@ from fot_recommender.config import (
11
  EMBEDDING_CONTENT_KEY,
12
  GENERATIVE_MODEL_NAME,
13
  SEARCH_RESULT_COUNT_K,
14
- MIN_SIMILARITY_SCORE
15
  )
16
 
17
 
@@ -89,7 +89,7 @@ def search_interventions(
89
  """
90
  print(f"\nSearching for top {k} interventions for query: '{query[:80]}...'")
91
  query_embedding = np.asarray(model.encode([query])).astype("float32")
92
- scores, indices = index.search(query_embedding, k)
93
  results = []
94
  for i, score in zip(indices[0], scores[0]):
95
  if i != -1: # FAISS returns -1 for no result
@@ -108,7 +108,7 @@ def generate_recommendation_summary(
108
  student_narrative: str,
109
  api_key: str,
110
  persona: str = "teacher",
111
- model_name: str = GENERATIVE_MODEL_NAME
112
  ) -> str:
113
  """
114
  Generates a synthesized recommendation using the Google Gemini API.
@@ -131,10 +131,12 @@ def generate_recommendation_summary(
131
  )
132
 
133
  try:
134
- print(f"\nSynthesizing recommendation for persona: '{persona}' using {model_name}...")
135
- model = genai.GenerativeModel(model_name)
 
 
136
  response = model.generate_content(prompt)
137
  print("Synthesis complete.")
138
  return response.text
139
  except Exception as e:
140
- return f"An error occurred while calling the Gemini API: {e}"
 
11
  EMBEDDING_CONTENT_KEY,
12
  GENERATIVE_MODEL_NAME,
13
  SEARCH_RESULT_COUNT_K,
14
+ MIN_SIMILARITY_SCORE,
15
  )
16
 
17
 
 
89
  """
90
  print(f"\nSearching for top {k} interventions for query: '{query[:80]}...'")
91
  query_embedding = np.asarray(model.encode([query])).astype("float32")
92
+ scores, indices = index.search(query_embedding, k) # type: ignore
93
  results = []
94
  for i, score in zip(indices[0], scores[0]):
95
  if i != -1: # FAISS returns -1 for no result
 
108
  student_narrative: str,
109
  api_key: str,
110
  persona: str = "teacher",
111
+ model_name: str = GENERATIVE_MODEL_NAME,
112
  ) -> str:
113
  """
114
  Generates a synthesized recommendation using the Google Gemini API.
 
131
  )
132
 
133
  try:
134
+ print(
135
+ f"\nSynthesizing recommendation for persona: '{persona}' using {model_name}..."
136
+ )
137
+ model = genai.GenerativeModel(model_name) # type: ignore
138
  response = model.generate_content(prompt)
139
  print("Synthesis complete.")
140
  return response.text
141
  except Exception as e:
142
+ return f"An error occurred while calling the Gemini API: {e}"
src/fot_recommender/utils.py CHANGED
@@ -2,6 +2,7 @@ import datetime
2
  import json
3
  from IPython.display import display, Markdown
4
 
 
5
  def display_recommendations(results: list, citations_map: dict):
6
  """
7
  Displays the retrieved recommendations in a rich, Markdown-formatted output
@@ -15,29 +16,28 @@ def display_recommendations(results: list, citations_map: dict):
15
  formatted_evidence = format_evidence_for_display(results, citations_map)
16
 
17
  display(Markdown("### Evidence Base"))
18
-
19
  # 2. Loop through the clean data and render it for the notebook
20
  for evidence in formatted_evidence:
21
  recommendation_md = f"""
22
- **{evidence['title']}**
23
- - **Source:** {evidence['source']}
24
- - **Page(s):** {evidence['pages']}
25
- - **Relevance Score:** {evidence['score']}
26
  - **Content Snippet:**
27
- > {evidence['content_snippet']}
28
 
29
  ---
30
  """
31
  display(Markdown(recommendation_md))
32
 
33
 
34
-
35
  def create_evaluation_bundle(
36
  student_narrative: str,
37
  persona: str,
38
  retrieved_chunks_with_scores: list,
39
  synthesized_recommendation: str,
40
- citations_map: dict
41
  ) -> dict:
42
  """
43
  Assembles a comprehensive dictionary for evaluation and logging purposes.
@@ -50,20 +50,20 @@ def create_evaluation_bundle(
50
  },
51
  "retrieval_results": [
52
  {
53
- "chunk_title": chunk['title'],
54
  "relevance_score": float(score),
55
- "source_document": chunk['source_document'],
56
- "page_info": chunk.get('fot_pages', 'N/A'),
57
- "original_content": chunk.get('original_content', ''),
58
- "citation_info": citations_map.get(chunk['source_document'], {})
59
- } for chunk, score in retrieved_chunks_with_scores
 
60
  ],
61
- "llm_output": {
62
- "synthesized_recommendation": synthesized_recommendation
63
- }
64
  }
65
  return evaluation_data
66
 
 
67
  def format_evidence_for_display(results: list, citations_map: dict) -> list:
68
  """
69
  Takes raw search results and formats them into a structured list of dictionaries
@@ -71,34 +71,39 @@ def format_evidence_for_display(results: list, citations_map: dict) -> list:
71
  """
72
  evidence_list = []
73
  for chunk, score in results:
74
- source_doc = chunk.get('source_document', 'N/A')
75
  citation_info = citations_map.get(source_doc, {})
76
-
77
  # Consolidate all the formatting logic here
78
- title = citation_info.get('title', 'N/A')
79
- author = citation_info.get('author', 'N/A')
80
- year = citation_info.get('year', 'N/A')
81
  source_string = f"*{title}* ({author}, {year})."
82
-
83
- page_info = chunk.get('fot_pages', 'N/A')
84
-
85
- original_content = chunk.get("original_content", "Content not available.").strip()
86
- blockquote_content = original_content.replace('\n', '\n> ')
87
-
88
- evidence_list.append({
89
- "title": chunk['title'],
90
- "source": source_string,
91
- "pages": page_info,
92
- "score": f"{score:.2f}",
93
- "content_snippet": blockquote_content
94
- })
95
-
 
 
 
 
96
  return evidence_list
97
 
 
98
  def load_citations(path):
99
  try:
100
  with open(path, "r", encoding="utf-8") as f:
101
  citations_list = json.load(f)
102
  return {item["source_document"]: item for item in citations_list}
103
  except (FileNotFoundError, json.JSONDecodeError):
104
- return {}
 
2
  import json
3
  from IPython.display import display, Markdown
4
 
5
+
6
  def display_recommendations(results: list, citations_map: dict):
7
  """
8
  Displays the retrieved recommendations in a rich, Markdown-formatted output
 
16
  formatted_evidence = format_evidence_for_display(results, citations_map)
17
 
18
  display(Markdown("### Evidence Base"))
19
+
20
  # 2. Loop through the clean data and render it for the notebook
21
  for evidence in formatted_evidence:
22
  recommendation_md = f"""
23
+ **{evidence["title"]}**
24
+ - **Source:** {evidence["source"]}
25
+ - **Page(s):** {evidence["pages"]}
26
+ - **Relevance Score:** {evidence["score"]}
27
  - **Content Snippet:**
28
+ > {evidence["content_snippet"]}
29
 
30
  ---
31
  """
32
  display(Markdown(recommendation_md))
33
 
34
 
 
35
  def create_evaluation_bundle(
36
  student_narrative: str,
37
  persona: str,
38
  retrieved_chunks_with_scores: list,
39
  synthesized_recommendation: str,
40
+ citations_map: dict,
41
  ) -> dict:
42
  """
43
  Assembles a comprehensive dictionary for evaluation and logging purposes.
 
50
  },
51
  "retrieval_results": [
52
  {
53
+ "chunk_title": chunk["title"],
54
  "relevance_score": float(score),
55
+ "source_document": chunk["source_document"],
56
+ "page_info": chunk.get("fot_pages", "N/A"),
57
+ "original_content": chunk.get("original_content", ""),
58
+ "citation_info": citations_map.get(chunk["source_document"], {}),
59
+ }
60
+ for chunk, score in retrieved_chunks_with_scores
61
  ],
62
+ "llm_output": {"synthesized_recommendation": synthesized_recommendation},
 
 
63
  }
64
  return evaluation_data
65
 
66
+
67
  def format_evidence_for_display(results: list, citations_map: dict) -> list:
68
  """
69
  Takes raw search results and formats them into a structured list of dictionaries
 
71
  """
72
  evidence_list = []
73
  for chunk, score in results:
74
+ source_doc = chunk.get("source_document", "N/A")
75
  citation_info = citations_map.get(source_doc, {})
76
+
77
  # Consolidate all the formatting logic here
78
+ title = citation_info.get("title", "N/A")
79
+ author = citation_info.get("author", "N/A")
80
+ year = citation_info.get("year", "N/A")
81
  source_string = f"*{title}* ({author}, {year})."
82
+
83
+ page_info = chunk.get("fot_pages", "N/A")
84
+
85
+ original_content = chunk.get(
86
+ "original_content", "Content not available."
87
+ ).strip()
88
+ blockquote_content = original_content.replace("\n", "\n> ")
89
+
90
+ evidence_list.append(
91
+ {
92
+ "title": chunk["title"],
93
+ "source": source_string,
94
+ "pages": page_info,
95
+ "score": f"{score:.2f}",
96
+ "content_snippet": blockquote_content,
97
+ }
98
+ )
99
+
100
  return evidence_list
101
 
102
+
103
  def load_citations(path):
104
  try:
105
  with open(path, "r", encoding="utf-8") as f:
106
  citations_list = json.load(f)
107
  return {item["source_document"]: item for item in citations_list}
108
  except (FileNotFoundError, json.JSONDecodeError):
109
+ return {}
tests/test_chunking.py CHANGED
@@ -7,9 +7,24 @@ def test_chunk_by_concept_groups_correctly():
7
 
8
  # 1. Arrange: Create simple, predictable raw data
9
  sample_raw_kb = [
10
- {"source_document": "doc_A", "concept": "Mentoring", "absolute_page": 1, "content": "First part."},
11
- {"source_document": "doc_B", "concept": "Tutoring", "absolute_page": 10, "content": "Tutoring info."},
12
- {"source_document": "doc_A", "concept": "Mentoring", "absolute_page": 2, "content": "Second part."},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  ]
14
 
15
  # 2. Act: Run the function we're testing
@@ -20,10 +35,12 @@ def test_chunk_by_concept_groups_correctly():
20
 
21
  # Find the 'Mentoring' chunk for detailed checks
22
  mentoring_chunk = next(c for c in final_chunks if c["title"] == "Mentoring")
23
-
24
  assert mentoring_chunk is not None
25
  assert mentoring_chunk["source_document"] == "doc_A"
26
  assert mentoring_chunk["fot_pages"] == "Pages: 1, 2"
27
  assert "First part.\n\nSecond part." in mentoring_chunk["original_content"]
28
- assert "Title: Mentoring. Content: First part.\n\nSecond part." in mentoring_chunk["content_for_embedding"]
29
-
 
 
 
7
 
8
  # 1. Arrange: Create simple, predictable raw data
9
  sample_raw_kb = [
10
+ {
11
+ "source_document": "doc_A",
12
+ "concept": "Mentoring",
13
+ "absolute_page": 1,
14
+ "content": "First part.",
15
+ },
16
+ {
17
+ "source_document": "doc_B",
18
+ "concept": "Tutoring",
19
+ "absolute_page": 10,
20
+ "content": "Tutoring info.",
21
+ },
22
+ {
23
+ "source_document": "doc_A",
24
+ "concept": "Mentoring",
25
+ "absolute_page": 2,
26
+ "content": "Second part.",
27
+ },
28
  ]
29
 
30
  # 2. Act: Run the function we're testing
 
35
 
36
  # Find the 'Mentoring' chunk for detailed checks
37
  mentoring_chunk = next(c for c in final_chunks if c["title"] == "Mentoring")
38
+
39
  assert mentoring_chunk is not None
40
  assert mentoring_chunk["source_document"] == "doc_A"
41
  assert mentoring_chunk["fot_pages"] == "Pages: 1, 2"
42
  assert "First part.\n\nSecond part." in mentoring_chunk["original_content"]
43
+ assert (
44
+ "Title: Mentoring. Content: First part.\n\nSecond part."
45
+ in mentoring_chunk["content_for_embedding"]
46
+ )
tests/test_pipeline.py CHANGED
@@ -1,6 +1,7 @@
1
  from unittest.mock import MagicMock, patch
2
  import numpy as np
3
 
 
4
  def test_search_interventions_filters_by_score():
5
  """
6
  Ensures the search function correctly filters out results
@@ -11,7 +12,7 @@ def test_search_interventions_filters_by_score():
11
  # 1. Arrange: Create mock objects and sample data
12
  mock_model = MagicMock()
13
  mock_index = MagicMock()
14
-
15
  # Fake knowledge base
16
  sample_kb = [{"id": 1, "content": "high score"}, {"id": 2, "content": "low score"}]
17
 
@@ -19,7 +20,7 @@ def test_search_interventions_filters_by_score():
19
  # Let's say it finds two results, one with a high score (0.9) and one low (0.3)
20
  mock_index.search.return_value = (
21
  np.array([[0.9, 0.3]]), # scores
22
- np.array([[0, 1]]) # indices
23
  )
24
 
25
  # 2. Act: Run the search with a minimum score of 0.5
@@ -29,14 +30,14 @@ def test_search_interventions_filters_by_score():
29
  index=mock_index,
30
  knowledge_base=sample_kb,
31
  k=2,
32
- min_similarity_score=0.5
33
  )
34
 
35
  # 3. Assert: Check that only the high-scoring result was returned
36
  assert len(results) == 1
37
- assert results[0][0]["content"] == "high score" # Check the chunk content
38
- assert results[0][1] == 0.9 # Check the score
39
-
40
 
41
  def test_generate_recommendation_summary_builds_correct_prompt():
42
  """
@@ -47,13 +48,22 @@ def test_generate_recommendation_summary_builds_correct_prompt():
47
 
48
  # 1. Arrange: Create sample inputs
49
  sample_chunks = [
50
- ({"title": "Tip 1", "original_content": "Do this.", "source_document": "doc_A"}, 0.9),
 
 
 
 
 
 
 
51
  ]
52
  student_narrative = "Student is struggling."
53
 
54
  # 2. Act & Assert: Use a patch to intercept the API call
55
  # This temporarily replaces genai.GenerativeModel with our mock
56
- with patch("src.fot_recommender.rag_pipeline.genai.GenerativeModel") as mock_gen_model:
 
 
57
  # Create a mock instance that the function will use
58
  mock_model_instance = MagicMock()
59
  mock_gen_model.return_value = mock_model_instance
@@ -62,13 +72,13 @@ def test_generate_recommendation_summary_builds_correct_prompt():
62
  retrieved_chunks=sample_chunks,
63
  student_narrative=student_narrative,
64
  api_key="fake_key",
65
- persona="teacher"
66
  )
67
 
68
  # 3. Assert: Check what our function tried to do
69
  # Was the API call made once?
70
  mock_model_instance.generate_content.assert_called_once()
71
-
72
  # Get the actual prompt that was passed to the LLM
73
  actual_prompt = mock_model_instance.generate_content.call_args[0][0]
74
 
 
1
  from unittest.mock import MagicMock, patch
2
  import numpy as np
3
 
4
+
5
  def test_search_interventions_filters_by_score():
6
  """
7
  Ensures the search function correctly filters out results
 
12
  # 1. Arrange: Create mock objects and sample data
13
  mock_model = MagicMock()
14
  mock_index = MagicMock()
15
+
16
  # Fake knowledge base
17
  sample_kb = [{"id": 1, "content": "high score"}, {"id": 2, "content": "low score"}]
18
 
 
20
  # Let's say it finds two results, one with a high score (0.9) and one low (0.3)
21
  mock_index.search.return_value = (
22
  np.array([[0.9, 0.3]]), # scores
23
+ np.array([[0, 1]]), # indices
24
  )
25
 
26
  # 2. Act: Run the search with a minimum score of 0.5
 
30
  index=mock_index,
31
  knowledge_base=sample_kb,
32
  k=2,
33
+ min_similarity_score=0.5,
34
  )
35
 
36
  # 3. Assert: Check that only the high-scoring result was returned
37
  assert len(results) == 1
38
+ assert results[0][0]["content"] == "high score" # Check the chunk content
39
+ assert results[0][1] == 0.9 # Check the score
40
+
41
 
42
  def test_generate_recommendation_summary_builds_correct_prompt():
43
  """
 
48
 
49
  # 1. Arrange: Create sample inputs
50
  sample_chunks = [
51
+ (
52
+ {
53
+ "title": "Tip 1",
54
+ "original_content": "Do this.",
55
+ "source_document": "doc_A",
56
+ },
57
+ 0.9,
58
+ ),
59
  ]
60
  student_narrative = "Student is struggling."
61
 
62
  # 2. Act & Assert: Use a patch to intercept the API call
63
  # This temporarily replaces genai.GenerativeModel with our mock
64
+ with patch(
65
+ "src.fot_recommender.rag_pipeline.genai.GenerativeModel"
66
+ ) as mock_gen_model:
67
  # Create a mock instance that the function will use
68
  mock_model_instance = MagicMock()
69
  mock_gen_model.return_value = mock_model_instance
 
72
  retrieved_chunks=sample_chunks,
73
  student_narrative=student_narrative,
74
  api_key="fake_key",
75
+ persona="teacher",
76
  )
77
 
78
  # 3. Assert: Check what our function tried to do
79
  # Was the API call made once?
80
  mock_model_instance.generate_content.assert_called_once()
81
+
82
  # Get the actual prompt that was passed to the LLM
83
  actual_prompt = mock_model_instance.generate_content.call_args[0][0]
84