Spaces:
Sleeping
Sleeping
Commit
·
280d562
1
Parent(s):
79336f1
style: Format code and fix linter warnings
Browse files
Applied `ruff format` to the entire codebase for consistency.
Also fixed all outstanding Pyright and Ruff linter warnings:
- Ignored warnings for correct but "private" imports (gradio, genai).
- Added a check for the API key to fix a potential `None` type error.
- Suppressed E402 (module-level import not at top of file) lint warnings in the build script, where the `sys.path` modification must precede the local imports.
- app.py +148 -35
- notebooks/fot_recommender_poc.ipynb +36 -17
- scripts/build_knowledge_base.py +6 -4
- src/fot_recommender/config.py +3 -1
- src/fot_recommender/main.py +1 -4
- src/fot_recommender/rag_pipeline.py +8 -6
- src/fot_recommender/utils.py +42 -37
- tests/test_chunking.py +23 -6
- tests/test_pipeline.py +20 -10
app.py
CHANGED
|
@@ -14,7 +14,7 @@ from fot_recommender.config import (
|
|
| 14 |
FOT_GOOGLE_API_KEY,
|
| 15 |
DEMO_PASSWORD,
|
| 16 |
SEARCH_RESULT_COUNT_K,
|
| 17 |
-
MIN_SIMILARITY_SCORE
|
| 18 |
)
|
| 19 |
from fot_recommender.utils import load_citations, format_evidence_for_display
|
| 20 |
from fot_recommender.rag_pipeline import (
|
|
@@ -28,18 +28,18 @@ EXAMPLE_NARRATIVES = [
|
|
| 28 |
{
|
| 29 |
"short_title": "Overwhelmed",
|
| 30 |
"title": "Overwhelmed Freshman (Academic & Attendance)",
|
| 31 |
-
"narrative": "A comprehensive support plan is urgently needed for this freshman. Academic performance is a critical concern, with failures in both Math and English leading to a credit deficiency of only 2 out of 4 expected credits. This academic struggle is compounded by a drop in attendance to 85% and a recent behavioral flag for an outburst in class, suggesting the student is significantly overwhelmed by the transition to high school."
|
| 32 |
},
|
| 33 |
{
|
| 34 |
"short_title": "Withdrawn",
|
| 35 |
"title": "Withdrawn Freshman (Social-Emotional)",
|
| 36 |
-
"narrative": "Academically, this freshman appears to be thriving, with a high GPA and perfect attendance. A closer look at classroom performance, however, reveals a student who is completely withdrawn. They do not participate in discussions or engage in any extracurricular activities, and teacher notes repeatedly describe them as 'isolated.' The lack of behavioral flags is a result of non-engagement, not positive conduct, pointing to a clear need for interventions focused on social-emotional learning and school connectedness."
|
| 37 |
},
|
| 38 |
{
|
| 39 |
"short_title": "Disruptive",
|
| 40 |
"title": "Disruptive Freshman (Behavioral)",
|
| 41 |
-
"narrative": "While this student's academics and credits earned are currently on track and attendance is acceptable at 92%, a significant pattern of disruptive behavior is jeopardizing their long-term success. An accumulation of five behavioral flags across multiple classes indicates a primary need for interventions in behavior management and positive conduct. Support should be focused on mentoring and strategies to foster appropriate classroom engagement before these behaviors begin to negatively impact their academic standing."
|
| 42 |
-
}
|
| 43 |
]
|
| 44 |
EXAMPLE_MAP = {ex["short_title"]: ex["narrative"] for ex in EXAMPLE_NARRATIVES}
|
| 45 |
EXAMPLE_TITLES = list(EXAMPLE_MAP.keys())
|
|
@@ -52,25 +52,66 @@ citations_map = load_citations(str(CITATIONS_PATH))
|
|
| 52 |
embedding_model = initialize_embedding_model()
|
| 53 |
print("✅ API initialized successfully.")
|
| 54 |
|
|
|
|
| 55 |
def get_recommendations_api(student_narrative, persona, password):
|
| 56 |
"""The main function that runs the RAG pipeline and prepares data for export."""
|
| 57 |
if password != DEMO_PASSWORD:
|
| 58 |
-
yield
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
return
|
| 60 |
|
| 61 |
if not student_narrative:
|
| 62 |
-
yield
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
return
|
| 64 |
|
| 65 |
-
yield
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
# 1. RETRIEVE
|
| 68 |
-
query_embedding = np.asarray(embedding_model.encode([student_narrative])).astype(
|
|
|
|
|
|
|
| 69 |
scores, indices = index.search(query_embedding, k=SEARCH_RESULT_COUNT_K)
|
| 70 |
-
retrieved_chunks_with_scores = [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
if not retrieved_chunks_with_scores:
|
| 73 |
-
yield
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
return
|
| 75 |
|
| 76 |
# 2. GENERATE
|
|
@@ -82,7 +123,9 @@ def get_recommendations_api(student_narrative, persona, password):
|
|
| 82 |
)
|
| 83 |
|
| 84 |
# 3. Augment with evidence for UI
|
| 85 |
-
formatted_evidence = format_evidence_for_display(
|
|
|
|
|
|
|
| 86 |
evidence_header = "\n\n---\n\n### Evidence Base\n"
|
| 87 |
evidence_list_str = ""
|
| 88 |
for evidence in formatted_evidence:
|
|
@@ -90,72 +133,142 @@ def get_recommendations_api(student_narrative, persona, password):
|
|
| 90 |
evidence_list_str += f" - **Source:** {evidence['source']}\n"
|
| 91 |
evidence_list_str += f" - **Page(s):** {evidence['pages']}\n"
|
| 92 |
evidence_list_str += f" - **Relevance Score:** {evidence['score']}\n"
|
| 93 |
-
evidence_list_str +=
|
| 94 |
-
|
|
|
|
|
|
|
| 95 |
final_output = synthesized_recommendation + evidence_header + evidence_list_str
|
| 96 |
-
|
| 97 |
# 4. Assemble Evaluation Data
|
| 98 |
evaluation_data = {
|
| 99 |
"timestamp": datetime.datetime.now().isoformat(),
|
| 100 |
"inputs": {"student_narrative": student_narrative, "persona": persona},
|
| 101 |
"retrieval_results": [
|
| 102 |
{
|
| 103 |
-
"chunk_title": chunk[
|
| 104 |
-
"
|
| 105 |
-
"
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
],
|
| 108 |
"llm_output": {"synthesized_recommendation": synthesized_recommendation},
|
| 109 |
-
"final_ui_output": final_output
|
| 110 |
}
|
| 111 |
|
| 112 |
# 5. Create a temporary file for download
|
| 113 |
-
with tempfile.NamedTemporaryFile(
|
|
|
|
|
|
|
| 114 |
json.dump(evaluation_data, f, indent=4)
|
| 115 |
temp_file_path = f.name
|
| 116 |
|
| 117 |
-
yield
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
|
| 120 |
# --- UI Helper Functions ---
|
| 121 |
def clear_all():
|
| 122 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
def update_narrative_from_example(selection):
|
| 125 |
return EXAMPLE_MAP.get(selection, "")
|
| 126 |
|
|
|
|
| 127 |
CUSTOM_CSS = """
|
| 128 |
.radio-horizontal .gr-form { flex-direction: row; flex-wrap: wrap; gap: 0.5rem; }
|
| 129 |
"""
|
| 130 |
|
| 131 |
# --- Gradio Interface ---
|
| 132 |
-
with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as interface:
|
| 133 |
-
gr.Markdown(
|
|
|
|
|
|
|
| 134 |
with gr.Row(equal_height=False):
|
| 135 |
with gr.Column(scale=1):
|
| 136 |
with gr.Group():
|
| 137 |
-
narrative_input = gr.Textbox(
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
with gr.Row():
|
| 142 |
clear_btn = gr.Button("Clear")
|
| 143 |
submit_btn = gr.Button("Submit", variant="primary")
|
| 144 |
with gr.Column(scale=2):
|
| 145 |
-
recommendation_output = gr.Markdown(
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
json_viewer = gr.JSON(label="Evaluation JSON")
|
| 148 |
download_btn = gr.DownloadButton("Download JSON", visible=False)
|
| 149 |
|
| 150 |
# --- Event Handlers ---
|
| 151 |
-
example_radio.change(
|
|
|
|
|
|
|
| 152 |
narrative_input.input(fn=lambda: None, inputs=None, outputs=example_radio)
|
| 153 |
-
submit_btn.click(
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
|
| 157 |
if __name__ == "__main__":
|
| 158 |
# Add project src to the sys.path for when running as a script
|
| 159 |
APP_ROOT = Path(__file__).parent
|
| 160 |
sys.path.insert(0, str(APP_ROOT / "src"))
|
| 161 |
-
interface.launch()
|
|
|
|
| 14 |
FOT_GOOGLE_API_KEY,
|
| 15 |
DEMO_PASSWORD,
|
| 16 |
SEARCH_RESULT_COUNT_K,
|
| 17 |
+
MIN_SIMILARITY_SCORE,
|
| 18 |
)
|
| 19 |
from fot_recommender.utils import load_citations, format_evidence_for_display
|
| 20 |
from fot_recommender.rag_pipeline import (
|
|
|
|
| 28 |
{
|
| 29 |
"short_title": "Overwhelmed",
|
| 30 |
"title": "Overwhelmed Freshman (Academic & Attendance)",
|
| 31 |
+
"narrative": "A comprehensive support plan is urgently needed for this freshman. Academic performance is a critical concern, with failures in both Math and English leading to a credit deficiency of only 2 out of 4 expected credits. This academic struggle is compounded by a drop in attendance to 85% and a recent behavioral flag for an outburst in class, suggesting the student is significantly overwhelmed by the transition to high school.",
|
| 32 |
},
|
| 33 |
{
|
| 34 |
"short_title": "Withdrawn",
|
| 35 |
"title": "Withdrawn Freshman (Social-Emotional)",
|
| 36 |
+
"narrative": "Academically, this freshman appears to be thriving, with a high GPA and perfect attendance. A closer look at classroom performance, however, reveals a student who is completely withdrawn. They do not participate in discussions or engage in any extracurricular activities, and teacher notes repeatedly describe them as 'isolated.' The lack of behavioral flags is a result of non-engagement, not positive conduct, pointing to a clear need for interventions focused on social-emotional learning and school connectedness.",
|
| 37 |
},
|
| 38 |
{
|
| 39 |
"short_title": "Disruptive",
|
| 40 |
"title": "Disruptive Freshman (Behavioral)",
|
| 41 |
+
"narrative": "While this student's academics and credits earned are currently on track and attendance is acceptable at 92%, a significant pattern of disruptive behavior is jeopardizing their long-term success. An accumulation of five behavioral flags across multiple classes indicates a primary need for interventions in behavior management and positive conduct. Support should be focused on mentoring and strategies to foster appropriate classroom engagement before these behaviors begin to negatively impact their academic standing.",
|
| 42 |
+
},
|
| 43 |
]
|
| 44 |
EXAMPLE_MAP = {ex["short_title"]: ex["narrative"] for ex in EXAMPLE_NARRATIVES}
|
| 45 |
EXAMPLE_TITLES = list(EXAMPLE_MAP.keys())
|
|
|
|
| 52 |
embedding_model = initialize_embedding_model()
|
| 53 |
print("✅ API initialized successfully.")
|
| 54 |
|
| 55 |
+
|
| 56 |
def get_recommendations_api(student_narrative, persona, password):
|
| 57 |
"""The main function that runs the RAG pipeline and prepares data for export."""
|
| 58 |
if password != DEMO_PASSWORD:
|
| 59 |
+
yield (
|
| 60 |
+
"Authentication failed. Please enter a valid Access Key.",
|
| 61 |
+
gr.update(interactive=True),
|
| 62 |
+
gr.update(visible=False),
|
| 63 |
+
None,
|
| 64 |
+
gr.update(visible=False),
|
| 65 |
+
)
|
| 66 |
+
return
|
| 67 |
+
|
| 68 |
+
if not FOT_GOOGLE_API_KEY:
|
| 69 |
+
yield (
|
| 70 |
+
"ERROR: The Google API Key is not configured. Please set the FOT_GOOGLE_API_KEY in the .env file.",
|
| 71 |
+
gr.update(interactive=True),
|
| 72 |
+
gr.update(visible=False),
|
| 73 |
+
None,
|
| 74 |
+
gr.update(visible=False),
|
| 75 |
+
)
|
| 76 |
return
|
| 77 |
|
| 78 |
if not student_narrative:
|
| 79 |
+
yield (
|
| 80 |
+
"Please enter a student narrative.",
|
| 81 |
+
gr.update(interactive=True),
|
| 82 |
+
gr.update(visible=False),
|
| 83 |
+
None,
|
| 84 |
+
gr.update(visible=False),
|
| 85 |
+
)
|
| 86 |
return
|
| 87 |
|
| 88 |
+
yield (
|
| 89 |
+
"Processing...",
|
| 90 |
+
gr.update(interactive=False),
|
| 91 |
+
gr.update(visible=False),
|
| 92 |
+
None,
|
| 93 |
+
gr.update(visible=False),
|
| 94 |
+
)
|
| 95 |
|
| 96 |
# 1. RETRIEVE
|
| 97 |
+
query_embedding = np.asarray(embedding_model.encode([student_narrative])).astype(
|
| 98 |
+
"float32"
|
| 99 |
+
)
|
| 100 |
scores, indices = index.search(query_embedding, k=SEARCH_RESULT_COUNT_K)
|
| 101 |
+
retrieved_chunks_with_scores = [
|
| 102 |
+
(knowledge_base_chunks[i], score)
|
| 103 |
+
for i, score in zip(indices[0], scores[0])
|
| 104 |
+
if score >= MIN_SIMILARITY_SCORE
|
| 105 |
+
]
|
| 106 |
|
| 107 |
if not retrieved_chunks_with_scores:
|
| 108 |
+
yield (
|
| 109 |
+
"Could not find relevant interventions.",
|
| 110 |
+
gr.update(interactive=True),
|
| 111 |
+
gr.update(visible=False),
|
| 112 |
+
None,
|
| 113 |
+
gr.update(visible=False),
|
| 114 |
+
)
|
| 115 |
return
|
| 116 |
|
| 117 |
# 2. GENERATE
|
|
|
|
| 123 |
)
|
| 124 |
|
| 125 |
# 3. Augment with evidence for UI
|
| 126 |
+
formatted_evidence = format_evidence_for_display(
|
| 127 |
+
retrieved_chunks_with_scores, citations_map
|
| 128 |
+
)
|
| 129 |
evidence_header = "\n\n---\n\n### Evidence Base\n"
|
| 130 |
evidence_list_str = ""
|
| 131 |
for evidence in formatted_evidence:
|
|
|
|
| 133 |
evidence_list_str += f" - **Source:** {evidence['source']}\n"
|
| 134 |
evidence_list_str += f" - **Page(s):** {evidence['pages']}\n"
|
| 135 |
evidence_list_str += f" - **Relevance Score:** {evidence['score']}\n"
|
| 136 |
+
evidence_list_str += (
|
| 137 |
+
f" - **Content Snippet:**\n > {evidence['content_snippet']}\n"
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
final_output = synthesized_recommendation + evidence_header + evidence_list_str
|
| 141 |
+
|
| 142 |
# 4. Assemble Evaluation Data
|
| 143 |
evaluation_data = {
|
| 144 |
"timestamp": datetime.datetime.now().isoformat(),
|
| 145 |
"inputs": {"student_narrative": student_narrative, "persona": persona},
|
| 146 |
"retrieval_results": [
|
| 147 |
{
|
| 148 |
+
"chunk_title": chunk["title"],
|
| 149 |
+
"relevance_score": float(score),
|
| 150 |
+
"source_document": chunk["source_document"],
|
| 151 |
+
"page_info": chunk.get("fot_pages", "N/A"),
|
| 152 |
+
"original_content": chunk.get("original_content", ""),
|
| 153 |
+
"citation_info": citations_map.get(chunk["source_document"], {}),
|
| 154 |
+
}
|
| 155 |
+
for chunk, score in retrieved_chunks_with_scores
|
| 156 |
],
|
| 157 |
"llm_output": {"synthesized_recommendation": synthesized_recommendation},
|
| 158 |
+
"final_ui_output": final_output,
|
| 159 |
}
|
| 160 |
|
| 161 |
# 5. Create a temporary file for download
|
| 162 |
+
with tempfile.NamedTemporaryFile(
|
| 163 |
+
mode="w", delete=False, suffix=".json", encoding="utf-8"
|
| 164 |
+
) as f:
|
| 165 |
json.dump(evaluation_data, f, indent=4)
|
| 166 |
temp_file_path = f.name
|
| 167 |
|
| 168 |
+
yield (
|
| 169 |
+
final_output,
|
| 170 |
+
gr.update(interactive=True),
|
| 171 |
+
gr.update(visible=True),
|
| 172 |
+
evaluation_data,
|
| 173 |
+
gr.update(value=temp_file_path, visible=True),
|
| 174 |
+
)
|
| 175 |
|
| 176 |
|
| 177 |
# --- UI Helper Functions ---
|
| 178 |
def clear_all():
|
| 179 |
+
return (
|
| 180 |
+
"",
|
| 181 |
+
None,
|
| 182 |
+
"",
|
| 183 |
+
gr.update(visible=False),
|
| 184 |
+
None,
|
| 185 |
+
gr.update(visible=False, value=None),
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
|
| 189 |
def update_narrative_from_example(selection):
|
| 190 |
return EXAMPLE_MAP.get(selection, "")
|
| 191 |
|
| 192 |
+
|
| 193 |
CUSTOM_CSS = """
|
| 194 |
.radio-horizontal .gr-form { flex-direction: row; flex-wrap: wrap; gap: 0.5rem; }
|
| 195 |
"""
|
| 196 |
|
| 197 |
# --- Gradio Interface ---
|
| 198 |
+
with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as interface: # type: ignore
|
| 199 |
+
gr.Markdown(
|
| 200 |
+
"# Freshman On-Track Intervention Recommender\n*A live API demonstrating the FOT Recommender.*"
|
| 201 |
+
)
|
| 202 |
with gr.Row(equal_height=False):
|
| 203 |
with gr.Column(scale=1):
|
| 204 |
with gr.Group():
|
| 205 |
+
narrative_input = gr.Textbox(
|
| 206 |
+
lines=8,
|
| 207 |
+
label="Student Narrative",
|
| 208 |
+
placeholder="Describe the student's situation here, or select an example below.",
|
| 209 |
+
)
|
| 210 |
+
example_radio = gr.Radio(
|
| 211 |
+
EXAMPLE_TITLES,
|
| 212 |
+
label="Load an Example Scenario",
|
| 213 |
+
info="Select one to populate the narrative above. Typing a custom narrative will clear this selection.",
|
| 214 |
+
elem_classes=["radio-horizontal"],
|
| 215 |
+
)
|
| 216 |
+
persona_input = gr.Radio(
|
| 217 |
+
["teacher", "parent", "principal"],
|
| 218 |
+
label="Who is this recommendation for?",
|
| 219 |
+
value="teacher",
|
| 220 |
+
elem_classes=["radio-horizontal"],
|
| 221 |
+
)
|
| 222 |
+
password_input = gr.Textbox(
|
| 223 |
+
label="Access Key",
|
| 224 |
+
type="password",
|
| 225 |
+
info="Enter the access key for the demo.",
|
| 226 |
+
)
|
| 227 |
with gr.Row():
|
| 228 |
clear_btn = gr.Button("Clear")
|
| 229 |
submit_btn = gr.Button("Submit", variant="primary")
|
| 230 |
with gr.Column(scale=2):
|
| 231 |
+
recommendation_output = gr.Markdown(
|
| 232 |
+
label="Synthesized Recommendation", show_copy_button=True
|
| 233 |
+
)
|
| 234 |
+
with gr.Accordion(
|
| 235 |
+
"Evaluation Data", open=False, visible=False
|
| 236 |
+
) as eval_accordion:
|
| 237 |
json_viewer = gr.JSON(label="Evaluation JSON")
|
| 238 |
download_btn = gr.DownloadButton("Download JSON", visible=False)
|
| 239 |
|
| 240 |
# --- Event Handlers ---
|
| 241 |
+
example_radio.change(
|
| 242 |
+
fn=update_narrative_from_example, inputs=example_radio, outputs=narrative_input
|
| 243 |
+
)
|
| 244 |
narrative_input.input(fn=lambda: None, inputs=None, outputs=example_radio)
|
| 245 |
+
submit_btn.click(
|
| 246 |
+
fn=get_recommendations_api,
|
| 247 |
+
inputs=[narrative_input, persona_input, password_input],
|
| 248 |
+
outputs=[
|
| 249 |
+
recommendation_output,
|
| 250 |
+
submit_btn,
|
| 251 |
+
eval_accordion,
|
| 252 |
+
json_viewer,
|
| 253 |
+
download_btn,
|
| 254 |
+
],
|
| 255 |
+
)
|
| 256 |
+
clear_btn.click(
|
| 257 |
+
fn=clear_all,
|
| 258 |
+
inputs=[],
|
| 259 |
+
outputs=[
|
| 260 |
+
narrative_input,
|
| 261 |
+
example_radio,
|
| 262 |
+
recommendation_output,
|
| 263 |
+
eval_accordion,
|
| 264 |
+
json_viewer,
|
| 265 |
+
download_btn,
|
| 266 |
+
],
|
| 267 |
+
)
|
| 268 |
|
| 269 |
|
| 270 |
if __name__ == "__main__":
|
| 271 |
# Add project src to the sys.path for when running as a script
|
| 272 |
APP_ROOT = Path(__file__).parent
|
| 273 |
sys.path.insert(0, str(APP_ROOT / "src"))
|
| 274 |
+
interface.launch()
|
notebooks/fot_recommender_poc.ipynb
CHANGED
|
@@ -27,7 +27,7 @@
|
|
| 27 |
},
|
| 28 |
{
|
| 29 |
"cell_type": "code",
|
| 30 |
-
"execution_count":
|
| 31 |
"id": "97f37783",
|
| 32 |
"metadata": {},
|
| 33 |
"outputs": [
|
|
@@ -47,7 +47,9 @@
|
|
| 47 |
"\n",
|
| 48 |
"# This prevents common, harmless warnings from cluttering the output.\n",
|
| 49 |
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
|
| 50 |
-
"warnings.filterwarnings(\
|
|
|
|
|
|
|
| 51 |
"\n",
|
| 52 |
"# Clones the project from GitHub if not already present.\n",
|
| 53 |
"PROJECT_DIR = \"fot-intervention-recommender\"\n",
|
|
@@ -80,7 +82,7 @@
|
|
| 80 |
},
|
| 81 |
{
|
| 82 |
"cell_type": "code",
|
| 83 |
-
"execution_count":
|
| 84 |
"id": "3784865f",
|
| 85 |
"metadata": {},
|
| 86 |
"outputs": [
|
|
@@ -139,28 +141,40 @@
|
|
| 139 |
"metadata": {},
|
| 140 |
"outputs": [
|
| 141 |
{
|
| 142 |
-
"
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
"
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
},
|
| 149 |
{
|
| 150 |
"name": "stdout",
|
| 151 |
"output_type": "stream",
|
| 152 |
"text": [
|
|
|
|
| 153 |
"Initializing embedding model: all-MiniLM-L6-v2...\n",
|
| 154 |
"Model initialized successfully.\n",
|
| 155 |
"Creating embeddings for 27 chunks...\n"
|
| 156 |
]
|
| 157 |
},
|
| 158 |
{
|
| 159 |
-
"
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
},
|
| 165 |
{
|
| 166 |
"name": "stdout",
|
|
@@ -292,12 +306,15 @@
|
|
| 292 |
}
|
| 293 |
],
|
| 294 |
"source": [
|
|
|
|
|
|
|
|
|
|
| 295 |
"from fot_recommender.rag_pipeline import (\n",
|
| 296 |
" load_knowledge_base,\n",
|
| 297 |
" initialize_embedding_model,\n",
|
| 298 |
" create_embeddings,\n",
|
| 299 |
" create_vector_db,\n",
|
| 300 |
-
" search_interventions
|
| 301 |
")\n",
|
| 302 |
"from fot_recommender.utils import display_recommendations\n",
|
| 303 |
"\n",
|
|
@@ -320,11 +337,13 @@
|
|
| 320 |
" index=vector_db,\n",
|
| 321 |
" knowledge_base=knowledge_base_chunks,\n",
|
| 322 |
" k=3,\n",
|
| 323 |
-
" min_similarity_score=0.4
|
| 324 |
")\n",
|
| 325 |
"\n",
|
| 326 |
"# 4. Display a clean summary and the rich results\n",
|
| 327 |
-
"print(
|
|
|
|
|
|
|
| 328 |
"display_recommendations(retrieved_interventions, citations_map)"
|
| 329 |
]
|
| 330 |
},
|
|
|
|
| 27 |
},
|
| 28 |
{
|
| 29 |
"cell_type": "code",
|
| 30 |
+
"execution_count": 1,
|
| 31 |
"id": "97f37783",
|
| 32 |
"metadata": {},
|
| 33 |
"outputs": [
|
|
|
|
| 47 |
"\n",
|
| 48 |
"# This prevents common, harmless warnings from cluttering the output.\n",
|
| 49 |
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
|
| 50 |
+
"warnings.filterwarnings(\n",
|
| 51 |
+
" \"ignore\", category=FutureWarning\n",
|
| 52 |
+
") # Suppress specific torch warning\n",
|
| 53 |
"\n",
|
| 54 |
"# Clones the project from GitHub if not already present.\n",
|
| 55 |
"PROJECT_DIR = \"fot-intervention-recommender\"\n",
|
|
|
|
| 82 |
},
|
| 83 |
{
|
| 84 |
"cell_type": "code",
|
| 85 |
+
"execution_count": 2,
|
| 86 |
"id": "3784865f",
|
| 87 |
"metadata": {},
|
| 88 |
"outputs": [
|
|
|
|
| 141 |
"metadata": {},
|
| 142 |
"outputs": [
|
| 143 |
{
|
| 144 |
+
"data": {
|
| 145 |
+
"text/markdown": [
|
| 146 |
+
"🚀 **Starting the retrieval pipeline...**"
|
| 147 |
+
],
|
| 148 |
+
"text/plain": [
|
| 149 |
+
"<IPython.core.display.Markdown object>"
|
| 150 |
+
]
|
| 151 |
+
},
|
| 152 |
+
"metadata": {},
|
| 153 |
+
"output_type": "display_data"
|
| 154 |
},
|
| 155 |
{
|
| 156 |
"name": "stdout",
|
| 157 |
"output_type": "stream",
|
| 158 |
"text": [
|
| 159 |
+
"This may take a moment as the system loads the embedding model, prepares the knowledge base, and performs the search.\n",
|
| 160 |
"Initializing embedding model: all-MiniLM-L6-v2...\n",
|
| 161 |
"Model initialized successfully.\n",
|
| 162 |
"Creating embeddings for 27 chunks...\n"
|
| 163 |
]
|
| 164 |
},
|
| 165 |
{
|
| 166 |
+
"data": {
|
| 167 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 168 |
+
"model_id": "aed4d46c859d4f8a88caf88daa5a38cc",
|
| 169 |
+
"version_major": 2,
|
| 170 |
+
"version_minor": 0
|
| 171 |
+
},
|
| 172 |
+
"text/plain": [
|
| 173 |
+
"Batches: 0%| | 0/1 [00:00<?, ?it/s]"
|
| 174 |
+
]
|
| 175 |
+
},
|
| 176 |
+
"metadata": {},
|
| 177 |
+
"output_type": "display_data"
|
| 178 |
},
|
| 179 |
{
|
| 180 |
"name": "stdout",
|
|
|
|
| 306 |
}
|
| 307 |
],
|
| 308 |
"source": [
|
| 309 |
+
"display(Markdown(\"🚀 **Starting the retrieval pipeline...**\"))\n",
|
| 310 |
+
"print(\"This may take a moment as the system loads the embedding model, prepares the knowledge base, and performs the search.\")\n",
|
| 311 |
+
"\n",
|
| 312 |
"from fot_recommender.rag_pipeline import (\n",
|
| 313 |
" load_knowledge_base,\n",
|
| 314 |
" initialize_embedding_model,\n",
|
| 315 |
" create_embeddings,\n",
|
| 316 |
" create_vector_db,\n",
|
| 317 |
+
" search_interventions,\n",
|
| 318 |
")\n",
|
| 319 |
"from fot_recommender.utils import display_recommendations\n",
|
| 320 |
"\n",
|
|
|
|
| 337 |
" index=vector_db,\n",
|
| 338 |
" knowledge_base=knowledge_base_chunks,\n",
|
| 339 |
" k=3,\n",
|
| 340 |
+
" min_similarity_score=0.4,\n",
|
| 341 |
")\n",
|
| 342 |
"\n",
|
| 343 |
"# 4. Display a clean summary and the rich results\n",
|
| 344 |
+
"print(\n",
|
| 345 |
+
" f\"✅ Successfully loaded models and retrieved the top {len(retrieved_interventions)} most relevant interventions from the knowledge base.\"\n",
|
| 346 |
+
")\n",
|
| 347 |
"display_recommendations(retrieved_interventions, citations_map)"
|
| 348 |
]
|
| 349 |
},
|
scripts/build_knowledge_base.py
CHANGED
|
@@ -7,15 +7,17 @@ from pathlib import Path
|
|
| 7 |
project_root = Path(__file__).parent.parent
|
| 8 |
sys.path.append(str(project_root))
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
| 11 |
PROCESSED_DATA_DIR,
|
| 12 |
RAW_KB_PATH,
|
| 13 |
FINAL_KB_CHUNKS_PATH,
|
| 14 |
FAISS_INDEX_PATH,
|
| 15 |
EMBEDDING_MODEL_NAME,
|
| 16 |
)
|
| 17 |
-
from src.fot_recommender.semantic_chunker import chunk_by_concept
|
| 18 |
-
from src.fot_recommender.rag_pipeline import (
|
| 19 |
initialize_embedding_model,
|
| 20 |
create_embeddings,
|
| 21 |
)
|
|
@@ -61,4 +63,4 @@ def build():
|
|
| 61 |
|
| 62 |
|
| 63 |
if __name__ == "__main__":
|
| 64 |
-
build()
|
|
|
|
| 7 |
project_root = Path(__file__).parent.parent
|
| 8 |
sys.path.append(str(project_root))
|
| 9 |
|
| 10 |
+
# We are intentionally ignoring the E402 warning here because the sys.path
|
| 11 |
+
# modification must happen before we can import from our local package.
|
| 12 |
+
from src.fot_recommender.config import ( # noqa: E402
|
| 13 |
PROCESSED_DATA_DIR,
|
| 14 |
RAW_KB_PATH,
|
| 15 |
FINAL_KB_CHUNKS_PATH,
|
| 16 |
FAISS_INDEX_PATH,
|
| 17 |
EMBEDDING_MODEL_NAME,
|
| 18 |
)
|
| 19 |
+
from src.fot_recommender.semantic_chunker import chunk_by_concept # noqa: E402
|
| 20 |
+
from src.fot_recommender.rag_pipeline import ( # noqa: E402
|
| 21 |
initialize_embedding_model,
|
| 22 |
create_embeddings,
|
| 23 |
)
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
if __name__ == "__main__":
|
| 66 |
+
build()
|
src/fot_recommender/config.py
CHANGED
|
@@ -32,4 +32,6 @@ EMBEDDING_CONTENT_KEY = "content_for_embedding"
|
|
| 32 |
# --- Secrets Management ---
|
| 33 |
# Load secrets from the environment. The application will import these variables.
|
| 34 |
FOT_GOOGLE_API_KEY = os.environ.get("FOT_GOOGLE_API_KEY")
|
| 35 |
-
DEMO_PASSWORD = os.environ.get(
|
|
|
|
|
|
|
|
|
| 32 |
# --- Secrets Management ---
|
| 33 |
# Load secrets from the environment. The application will import these variables.
|
| 34 |
FOT_GOOGLE_API_KEY = os.environ.get("FOT_GOOGLE_API_KEY")
|
| 35 |
+
DEMO_PASSWORD = os.environ.get(
|
| 36 |
+
"DEMO_PASSWORD", "default_password"
|
| 37 |
+
) # Added a default for safety
|
src/fot_recommender/main.py
CHANGED
|
@@ -88,10 +88,7 @@ def main():
|
|
| 88 |
return "ERROR: FOT_GOOGLE_API_KEY not found. Please create a .env file and add your key."
|
| 89 |
|
| 90 |
synthesized_recommendation = generate_recommendation_summary(
|
| 91 |
-
top_interventions,
|
| 92 |
-
student_query,
|
| 93 |
-
api_key=api_key,
|
| 94 |
-
persona="teacher"
|
| 95 |
)
|
| 96 |
|
| 97 |
# --- 5. Display Final Output ---
|
|
|
|
| 88 |
return "ERROR: FOT_GOOGLE_API_KEY not found. Please create a .env file and add your key."
|
| 89 |
|
| 90 |
synthesized_recommendation = generate_recommendation_summary(
|
| 91 |
+
top_interventions, student_query, api_key=api_key, persona="teacher"
|
|
|
|
|
|
|
|
|
|
| 92 |
)
|
| 93 |
|
| 94 |
# --- 5. Display Final Output ---
|
src/fot_recommender/rag_pipeline.py
CHANGED
|
@@ -11,7 +11,7 @@ from fot_recommender.config import (
|
|
| 11 |
EMBEDDING_CONTENT_KEY,
|
| 12 |
GENERATIVE_MODEL_NAME,
|
| 13 |
SEARCH_RESULT_COUNT_K,
|
| 14 |
-
MIN_SIMILARITY_SCORE
|
| 15 |
)
|
| 16 |
|
| 17 |
|
|
@@ -89,7 +89,7 @@ def search_interventions(
|
|
| 89 |
"""
|
| 90 |
print(f"\nSearching for top {k} interventions for query: '{query[:80]}...'")
|
| 91 |
query_embedding = np.asarray(model.encode([query])).astype("float32")
|
| 92 |
-
scores, indices = index.search(query_embedding, k)
|
| 93 |
results = []
|
| 94 |
for i, score in zip(indices[0], scores[0]):
|
| 95 |
if i != -1: # FAISS returns -1 for no result
|
|
@@ -108,7 +108,7 @@ def generate_recommendation_summary(
|
|
| 108 |
student_narrative: str,
|
| 109 |
api_key: str,
|
| 110 |
persona: str = "teacher",
|
| 111 |
-
model_name: str = GENERATIVE_MODEL_NAME
|
| 112 |
) -> str:
|
| 113 |
"""
|
| 114 |
Generates a synthesized recommendation using the Google Gemini API.
|
|
@@ -131,10 +131,12 @@ def generate_recommendation_summary(
|
|
| 131 |
)
|
| 132 |
|
| 133 |
try:
|
| 134 |
-
print(
|
| 135 |
-
|
|
|
|
|
|
|
| 136 |
response = model.generate_content(prompt)
|
| 137 |
print("Synthesis complete.")
|
| 138 |
return response.text
|
| 139 |
except Exception as e:
|
| 140 |
-
return f"An error occurred while calling the Gemini API: {e}"
|
|
|
|
| 11 |
EMBEDDING_CONTENT_KEY,
|
| 12 |
GENERATIVE_MODEL_NAME,
|
| 13 |
SEARCH_RESULT_COUNT_K,
|
| 14 |
+
MIN_SIMILARITY_SCORE,
|
| 15 |
)
|
| 16 |
|
| 17 |
|
|
|
|
| 89 |
"""
|
| 90 |
print(f"\nSearching for top {k} interventions for query: '{query[:80]}...'")
|
| 91 |
query_embedding = np.asarray(model.encode([query])).astype("float32")
|
| 92 |
+
scores, indices = index.search(query_embedding, k) # type: ignore
|
| 93 |
results = []
|
| 94 |
for i, score in zip(indices[0], scores[0]):
|
| 95 |
if i != -1: # FAISS returns -1 for no result
|
|
|
|
| 108 |
student_narrative: str,
|
| 109 |
api_key: str,
|
| 110 |
persona: str = "teacher",
|
| 111 |
+
model_name: str = GENERATIVE_MODEL_NAME,
|
| 112 |
) -> str:
|
| 113 |
"""
|
| 114 |
Generates a synthesized recommendation using the Google Gemini API.
|
|
|
|
| 131 |
)
|
| 132 |
|
| 133 |
try:
|
| 134 |
+
print(
|
| 135 |
+
f"\nSynthesizing recommendation for persona: '{persona}' using {model_name}..."
|
| 136 |
+
)
|
| 137 |
+
model = genai.GenerativeModel(model_name) # type: ignore
|
| 138 |
response = model.generate_content(prompt)
|
| 139 |
print("Synthesis complete.")
|
| 140 |
return response.text
|
| 141 |
except Exception as e:
|
| 142 |
+
return f"An error occurred while calling the Gemini API: {e}"
|
src/fot_recommender/utils.py
CHANGED
|
@@ -2,6 +2,7 @@ import datetime
|
|
| 2 |
import json
|
| 3 |
from IPython.display import display, Markdown
|
| 4 |
|
|
|
|
| 5 |
def display_recommendations(results: list, citations_map: dict):
|
| 6 |
"""
|
| 7 |
Displays the retrieved recommendations in a rich, Markdown-formatted output
|
|
@@ -15,29 +16,28 @@ def display_recommendations(results: list, citations_map: dict):
|
|
| 15 |
formatted_evidence = format_evidence_for_display(results, citations_map)
|
| 16 |
|
| 17 |
display(Markdown("### Evidence Base"))
|
| 18 |
-
|
| 19 |
# 2. Loop through the clean data and render it for the notebook
|
| 20 |
for evidence in formatted_evidence:
|
| 21 |
recommendation_md = f"""
|
| 22 |
-
**{evidence[
|
| 23 |
-
- **Source:** {evidence[
|
| 24 |
-
- **Page(s):** {evidence[
|
| 25 |
-
- **Relevance Score:** {evidence[
|
| 26 |
- **Content Snippet:**
|
| 27 |
-
> {evidence[
|
| 28 |
|
| 29 |
---
|
| 30 |
"""
|
| 31 |
display(Markdown(recommendation_md))
|
| 32 |
|
| 33 |
|
| 34 |
-
|
| 35 |
def create_evaluation_bundle(
|
| 36 |
student_narrative: str,
|
| 37 |
persona: str,
|
| 38 |
retrieved_chunks_with_scores: list,
|
| 39 |
synthesized_recommendation: str,
|
| 40 |
-
citations_map: dict
|
| 41 |
) -> dict:
|
| 42 |
"""
|
| 43 |
Assembles a comprehensive dictionary for evaluation and logging purposes.
|
|
@@ -50,20 +50,20 @@ def create_evaluation_bundle(
|
|
| 50 |
},
|
| 51 |
"retrieval_results": [
|
| 52 |
{
|
| 53 |
-
"chunk_title": chunk[
|
| 54 |
"relevance_score": float(score),
|
| 55 |
-
"source_document": chunk[
|
| 56 |
-
"page_info": chunk.get(
|
| 57 |
-
"original_content": chunk.get(
|
| 58 |
-
"citation_info": citations_map.get(chunk[
|
| 59 |
-
}
|
|
|
|
| 60 |
],
|
| 61 |
-
"llm_output": {
|
| 62 |
-
"synthesized_recommendation": synthesized_recommendation
|
| 63 |
-
}
|
| 64 |
}
|
| 65 |
return evaluation_data
|
| 66 |
|
|
|
|
| 67 |
def format_evidence_for_display(results: list, citations_map: dict) -> list:
|
| 68 |
"""
|
| 69 |
Takes raw search results and formats them into a structured list of dictionaries
|
|
@@ -71,34 +71,39 @@ def format_evidence_for_display(results: list, citations_map: dict) -> list:
|
|
| 71 |
"""
|
| 72 |
evidence_list = []
|
| 73 |
for chunk, score in results:
|
| 74 |
-
source_doc = chunk.get(
|
| 75 |
citation_info = citations_map.get(source_doc, {})
|
| 76 |
-
|
| 77 |
# Consolidate all the formatting logic here
|
| 78 |
-
title = citation_info.get(
|
| 79 |
-
author = citation_info.get(
|
| 80 |
-
year = citation_info.get(
|
| 81 |
source_string = f"*{title}* ({author}, {year})."
|
| 82 |
-
|
| 83 |
-
page_info = chunk.get(
|
| 84 |
-
|
| 85 |
-
original_content = chunk.get(
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
return evidence_list
|
| 97 |
|
|
|
|
| 98 |
def load_citations(path):
|
| 99 |
try:
|
| 100 |
with open(path, "r", encoding="utf-8") as f:
|
| 101 |
citations_list = json.load(f)
|
| 102 |
return {item["source_document"]: item for item in citations_list}
|
| 103 |
except (FileNotFoundError, json.JSONDecodeError):
|
| 104 |
-
return {}
|
|
|
|
| 2 |
import json
|
| 3 |
from IPython.display import display, Markdown
|
| 4 |
|
| 5 |
+
|
| 6 |
def display_recommendations(results: list, citations_map: dict):
|
| 7 |
"""
|
| 8 |
Displays the retrieved recommendations in a rich, Markdown-formatted output
|
|
|
|
| 16 |
formatted_evidence = format_evidence_for_display(results, citations_map)
|
| 17 |
|
| 18 |
display(Markdown("### Evidence Base"))
|
| 19 |
+
|
| 20 |
# 2. Loop through the clean data and render it for the notebook
|
| 21 |
for evidence in formatted_evidence:
|
| 22 |
recommendation_md = f"""
|
| 23 |
+
**{evidence["title"]}**
|
| 24 |
+
- **Source:** {evidence["source"]}
|
| 25 |
+
- **Page(s):** {evidence["pages"]}
|
| 26 |
+
- **Relevance Score:** {evidence["score"]}
|
| 27 |
- **Content Snippet:**
|
| 28 |
+
> {evidence["content_snippet"]}
|
| 29 |
|
| 30 |
---
|
| 31 |
"""
|
| 32 |
display(Markdown(recommendation_md))
|
| 33 |
|
| 34 |
|
|
|
|
| 35 |
def create_evaluation_bundle(
|
| 36 |
student_narrative: str,
|
| 37 |
persona: str,
|
| 38 |
retrieved_chunks_with_scores: list,
|
| 39 |
synthesized_recommendation: str,
|
| 40 |
+
citations_map: dict,
|
| 41 |
) -> dict:
|
| 42 |
"""
|
| 43 |
Assembles a comprehensive dictionary for evaluation and logging purposes.
|
|
|
|
| 50 |
},
|
| 51 |
"retrieval_results": [
|
| 52 |
{
|
| 53 |
+
"chunk_title": chunk["title"],
|
| 54 |
"relevance_score": float(score),
|
| 55 |
+
"source_document": chunk["source_document"],
|
| 56 |
+
"page_info": chunk.get("fot_pages", "N/A"),
|
| 57 |
+
"original_content": chunk.get("original_content", ""),
|
| 58 |
+
"citation_info": citations_map.get(chunk["source_document"], {}),
|
| 59 |
+
}
|
| 60 |
+
for chunk, score in retrieved_chunks_with_scores
|
| 61 |
],
|
| 62 |
+
"llm_output": {"synthesized_recommendation": synthesized_recommendation},
|
|
|
|
|
|
|
| 63 |
}
|
| 64 |
return evaluation_data
|
| 65 |
|
| 66 |
+
|
| 67 |
def format_evidence_for_display(results: list, citations_map: dict) -> list:
|
| 68 |
"""
|
| 69 |
Takes raw search results and formats them into a structured list of dictionaries
|
|
|
|
| 71 |
"""
|
| 72 |
evidence_list = []
|
| 73 |
for chunk, score in results:
|
| 74 |
+
source_doc = chunk.get("source_document", "N/A")
|
| 75 |
citation_info = citations_map.get(source_doc, {})
|
| 76 |
+
|
| 77 |
# Consolidate all the formatting logic here
|
| 78 |
+
title = citation_info.get("title", "N/A")
|
| 79 |
+
author = citation_info.get("author", "N/A")
|
| 80 |
+
year = citation_info.get("year", "N/A")
|
| 81 |
source_string = f"*{title}* ({author}, {year})."
|
| 82 |
+
|
| 83 |
+
page_info = chunk.get("fot_pages", "N/A")
|
| 84 |
+
|
| 85 |
+
original_content = chunk.get(
|
| 86 |
+
"original_content", "Content not available."
|
| 87 |
+
).strip()
|
| 88 |
+
blockquote_content = original_content.replace("\n", "\n> ")
|
| 89 |
+
|
| 90 |
+
evidence_list.append(
|
| 91 |
+
{
|
| 92 |
+
"title": chunk["title"],
|
| 93 |
+
"source": source_string,
|
| 94 |
+
"pages": page_info,
|
| 95 |
+
"score": f"{score:.2f}",
|
| 96 |
+
"content_snippet": blockquote_content,
|
| 97 |
+
}
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
return evidence_list
|
| 101 |
|
| 102 |
+
|
| 103 |
def load_citations(path):
|
| 104 |
try:
|
| 105 |
with open(path, "r", encoding="utf-8") as f:
|
| 106 |
citations_list = json.load(f)
|
| 107 |
return {item["source_document"]: item for item in citations_list}
|
| 108 |
except (FileNotFoundError, json.JSONDecodeError):
|
| 109 |
+
return {}
|
tests/test_chunking.py
CHANGED
|
@@ -7,9 +7,24 @@ def test_chunk_by_concept_groups_correctly():
|
|
| 7 |
|
| 8 |
# 1. Arrange: Create simple, predictable raw data
|
| 9 |
sample_raw_kb = [
|
| 10 |
-
{
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
]
|
| 14 |
|
| 15 |
# 2. Act: Run the function we're testing
|
|
@@ -20,10 +35,12 @@ def test_chunk_by_concept_groups_correctly():
|
|
| 20 |
|
| 21 |
# Find the 'Mentoring' chunk for detailed checks
|
| 22 |
mentoring_chunk = next(c for c in final_chunks if c["title"] == "Mentoring")
|
| 23 |
-
|
| 24 |
assert mentoring_chunk is not None
|
| 25 |
assert mentoring_chunk["source_document"] == "doc_A"
|
| 26 |
assert mentoring_chunk["fot_pages"] == "Pages: 1, 2"
|
| 27 |
assert "First part.\n\nSecond part." in mentoring_chunk["original_content"]
|
| 28 |
-
assert
|
| 29 |
-
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# 1. Arrange: Create simple, predictable raw data
|
| 9 |
sample_raw_kb = [
|
| 10 |
+
{
|
| 11 |
+
"source_document": "doc_A",
|
| 12 |
+
"concept": "Mentoring",
|
| 13 |
+
"absolute_page": 1,
|
| 14 |
+
"content": "First part.",
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"source_document": "doc_B",
|
| 18 |
+
"concept": "Tutoring",
|
| 19 |
+
"absolute_page": 10,
|
| 20 |
+
"content": "Tutoring info.",
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"source_document": "doc_A",
|
| 24 |
+
"concept": "Mentoring",
|
| 25 |
+
"absolute_page": 2,
|
| 26 |
+
"content": "Second part.",
|
| 27 |
+
},
|
| 28 |
]
|
| 29 |
|
| 30 |
# 2. Act: Run the function we're testing
|
|
|
|
| 35 |
|
| 36 |
# Find the 'Mentoring' chunk for detailed checks
|
| 37 |
mentoring_chunk = next(c for c in final_chunks if c["title"] == "Mentoring")
|
| 38 |
+
|
| 39 |
assert mentoring_chunk is not None
|
| 40 |
assert mentoring_chunk["source_document"] == "doc_A"
|
| 41 |
assert mentoring_chunk["fot_pages"] == "Pages: 1, 2"
|
| 42 |
assert "First part.\n\nSecond part." in mentoring_chunk["original_content"]
|
| 43 |
+
assert (
|
| 44 |
+
"Title: Mentoring. Content: First part.\n\nSecond part."
|
| 45 |
+
in mentoring_chunk["content_for_embedding"]
|
| 46 |
+
)
|
tests/test_pipeline.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from unittest.mock import MagicMock, patch
|
| 2 |
import numpy as np
|
| 3 |
|
|
|
|
| 4 |
def test_search_interventions_filters_by_score():
|
| 5 |
"""
|
| 6 |
Ensures the search function correctly filters out results
|
|
@@ -11,7 +12,7 @@ def test_search_interventions_filters_by_score():
|
|
| 11 |
# 1. Arrange: Create mock objects and sample data
|
| 12 |
mock_model = MagicMock()
|
| 13 |
mock_index = MagicMock()
|
| 14 |
-
|
| 15 |
# Fake knowledge base
|
| 16 |
sample_kb = [{"id": 1, "content": "high score"}, {"id": 2, "content": "low score"}]
|
| 17 |
|
|
@@ -19,7 +20,7 @@ def test_search_interventions_filters_by_score():
|
|
| 19 |
# Let's say it finds two results, one with a high score (0.9) and one low (0.3)
|
| 20 |
mock_index.search.return_value = (
|
| 21 |
np.array([[0.9, 0.3]]), # scores
|
| 22 |
-
np.array([[0, 1]])
|
| 23 |
)
|
| 24 |
|
| 25 |
# 2. Act: Run the search with a minimum score of 0.5
|
|
@@ -29,14 +30,14 @@ def test_search_interventions_filters_by_score():
|
|
| 29 |
index=mock_index,
|
| 30 |
knowledge_base=sample_kb,
|
| 31 |
k=2,
|
| 32 |
-
min_similarity_score=0.5
|
| 33 |
)
|
| 34 |
|
| 35 |
# 3. Assert: Check that only the high-scoring result was returned
|
| 36 |
assert len(results) == 1
|
| 37 |
-
assert results[0][0]["content"] == "high score"
|
| 38 |
-
assert results[0][1] == 0.9
|
| 39 |
-
|
| 40 |
|
| 41 |
def test_generate_recommendation_summary_builds_correct_prompt():
|
| 42 |
"""
|
|
@@ -47,13 +48,22 @@ def test_generate_recommendation_summary_builds_correct_prompt():
|
|
| 47 |
|
| 48 |
# 1. Arrange: Create sample inputs
|
| 49 |
sample_chunks = [
|
| 50 |
-
(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
]
|
| 52 |
student_narrative = "Student is struggling."
|
| 53 |
|
| 54 |
# 2. Act & Assert: Use a patch to intercept the API call
|
| 55 |
# This temporarily replaces genai.GenerativeModel with our mock
|
| 56 |
-
with patch(
|
|
|
|
|
|
|
| 57 |
# Create a mock instance that the function will use
|
| 58 |
mock_model_instance = MagicMock()
|
| 59 |
mock_gen_model.return_value = mock_model_instance
|
|
@@ -62,13 +72,13 @@ def test_generate_recommendation_summary_builds_correct_prompt():
|
|
| 62 |
retrieved_chunks=sample_chunks,
|
| 63 |
student_narrative=student_narrative,
|
| 64 |
api_key="fake_key",
|
| 65 |
-
persona="teacher"
|
| 66 |
)
|
| 67 |
|
| 68 |
# 3. Assert: Check what our function tried to do
|
| 69 |
# Was the API call made once?
|
| 70 |
mock_model_instance.generate_content.assert_called_once()
|
| 71 |
-
|
| 72 |
# Get the actual prompt that was passed to the LLM
|
| 73 |
actual_prompt = mock_model_instance.generate_content.call_args[0][0]
|
| 74 |
|
|
|
|
| 1 |
from unittest.mock import MagicMock, patch
|
| 2 |
import numpy as np
|
| 3 |
|
| 4 |
+
|
| 5 |
def test_search_interventions_filters_by_score():
|
| 6 |
"""
|
| 7 |
Ensures the search function correctly filters out results
|
|
|
|
| 12 |
# 1. Arrange: Create mock objects and sample data
|
| 13 |
mock_model = MagicMock()
|
| 14 |
mock_index = MagicMock()
|
| 15 |
+
|
| 16 |
# Fake knowledge base
|
| 17 |
sample_kb = [{"id": 1, "content": "high score"}, {"id": 2, "content": "low score"}]
|
| 18 |
|
|
|
|
| 20 |
# Let's say it finds two results, one with a high score (0.9) and one low (0.3)
|
| 21 |
mock_index.search.return_value = (
|
| 22 |
np.array([[0.9, 0.3]]), # scores
|
| 23 |
+
np.array([[0, 1]]), # indices
|
| 24 |
)
|
| 25 |
|
| 26 |
# 2. Act: Run the search with a minimum score of 0.5
|
|
|
|
| 30 |
index=mock_index,
|
| 31 |
knowledge_base=sample_kb,
|
| 32 |
k=2,
|
| 33 |
+
min_similarity_score=0.5,
|
| 34 |
)
|
| 35 |
|
| 36 |
# 3. Assert: Check that only the high-scoring result was returned
|
| 37 |
assert len(results) == 1
|
| 38 |
+
assert results[0][0]["content"] == "high score" # Check the chunk content
|
| 39 |
+
assert results[0][1] == 0.9 # Check the score
|
| 40 |
+
|
| 41 |
|
| 42 |
def test_generate_recommendation_summary_builds_correct_prompt():
|
| 43 |
"""
|
|
|
|
| 48 |
|
| 49 |
# 1. Arrange: Create sample inputs
|
| 50 |
sample_chunks = [
|
| 51 |
+
(
|
| 52 |
+
{
|
| 53 |
+
"title": "Tip 1",
|
| 54 |
+
"original_content": "Do this.",
|
| 55 |
+
"source_document": "doc_A",
|
| 56 |
+
},
|
| 57 |
+
0.9,
|
| 58 |
+
),
|
| 59 |
]
|
| 60 |
student_narrative = "Student is struggling."
|
| 61 |
|
| 62 |
# 2. Act & Assert: Use a patch to intercept the API call
|
| 63 |
# This temporarily replaces genai.GenerativeModel with our mock
|
| 64 |
+
with patch(
|
| 65 |
+
"src.fot_recommender.rag_pipeline.genai.GenerativeModel"
|
| 66 |
+
) as mock_gen_model:
|
| 67 |
# Create a mock instance that the function will use
|
| 68 |
mock_model_instance = MagicMock()
|
| 69 |
mock_gen_model.return_value = mock_model_instance
|
|
|
|
| 72 |
retrieved_chunks=sample_chunks,
|
| 73 |
student_narrative=student_narrative,
|
| 74 |
api_key="fake_key",
|
| 75 |
+
persona="teacher",
|
| 76 |
)
|
| 77 |
|
| 78 |
# 3. Assert: Check what our function tried to do
|
| 79 |
# Was the API call made once?
|
| 80 |
mock_model_instance.generate_content.assert_called_once()
|
| 81 |
+
|
| 82 |
# Get the actual prompt that was passed to the LLM
|
| 83 |
actual_prompt = mock_model_instance.generate_content.call_args[0][0]
|
| 84 |
|