lesson-agent-dev / libs /agent /tests /test_education_sources.py
MSG
Feat/monday 3 sprint (#20)
bd75839
Raw
History Blame Contribute Delete
8.68 kB
from __future__ import annotations
import json
import numpy as np
import pytest
from agent.models import EducationPptxInput, ResearchIngestResult
from agent.prompts import education_outline_user
from agent.runner import AgentRunner
from researchmind.config import ResearchMindConfig
from researchmind.extract import ExtractedDocument
def _outline_json(slide_count: int = 3) -> str:
slides = [
{
"title": f"Slide {i}",
"bullets": ["Key point"],
"speaker_note": "Note",
}
for i in range(1, slide_count + 1)
]
return json.dumps({"title": "Test Lesson", "slides": slides})
class OutlineBackend:
    """Stub model backend that always answers with a canned three-slide outline.

    It exposes ``load``, ``chat`` and ``generate`` so it can be passed as the
    ``backend`` argument in the tests below (presumably mirroring the real
    backend interface -- confirm against the runner).
    """

    def load(self) -> None:
        """Do nothing; the stub has no model to load."""
        return None

    def chat(self, messages, *, max_tokens=2048, temperature=0.3):
        """Return the canned outline JSON; all arguments are ignored."""
        return _outline_json(3)

    def generate(self, prompt, *, max_tokens=512, temperature=0.7):
        """Wrap ``prompt`` as a single user message and delegate to ``chat``.

        Both sampling arguments are forwarded so an overriding ``chat`` sees
        the values the caller actually requested.
        """
        return self.chat(
            [{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
        )
@pytest.fixture
def research_env(tmp_path, monkeypatch):
    """Build a ``ResearchMindConfig`` under ``tmp_path`` and stub research I/O.

    Sets ``RESEARCHMIND_DATA_DIR`` and ``AGENT_OUTPUTS_DIR`` to locations
    below ``tmp_path`` and swaps the embedding, scraping, search and URL
    validation callables for deterministic fakes. Returns the config.
    """
    config = ResearchMindConfig(
        data_dir=tmp_path / "rm",
        embed_model="test",
        auto_search=False,
        top_k=2,
        max_context_chunks=8,
        chunk_size=50,
        chunk_overlap=10,
    )

    env_overrides = {
        "RESEARCHMIND_DATA_DIR": str(config.data_dir),
        "AGENT_OUTPUTS_DIR": str(tmp_path / "outputs"),
    }
    for variable, value in env_overrides.items():
        monkeypatch.setenv(variable, value)

    def constant_embed(texts, *, model_name):
        # One identical unit vector [1, 0, 0] per input text; shape (n, 3).
        row_count = sum(1 for _ in texts)
        matrix = np.zeros((row_count, 3), dtype=np.float32)
        matrix[:, 0] = 1.0
        return matrix

    def canned_scrape(url: str):
        # Every URL resolves to the same short document.
        return ExtractedDocument(
            source_type="web",
            uri=url,
            title="Example",
            text="Photosynthesis converts light to energy in plants.",
        )

    def canned_search(topic, *, n=5, check_reachable=True):
        # A single example.com URL derived from the topic, spaces dashed.
        slug = topic.replace(' ', '-')
        return [f"https://example.com/{slug}"]

    def accept_any_url(url, *, check_reachable=True):
        # Always valid; only prepend a scheme when the URL lacks one.
        if url.startswith("http"):
            normalized = url
        else:
            normalized = f"https://{url}"
        return True, "ok", normalized

    replacements = (
        ("researchmind.ingest.embed_texts", constant_embed),
        ("researchmind.retrieve.embed_texts", constant_embed),
        ("agent.tools.research_tools.fetch_and_extract", canned_scrape),
        ("agent.tools.research_tools.search_urls", canned_search),
        ("researchmind.url_validate.validate_url", accept_any_url),
    )
    for dotted_target, fake in replacements:
        monkeypatch.setattr(dotted_target, fake)

    return config
def test_education_outline_user_includes_source_context():
    """The user prompt must mention and embed the supplied source excerpts."""
    request = EducationPptxInput(topic="Photosynthesis", grade="6", slide_count=3)
    prompt = education_outline_user(
        request,
        source_context="[1] Plants use chlorophyll.",
    )
    for fragment in ("retrieved source excerpts", "chlorophyll"):
        assert fragment in prompt
def test_education_outline_user_includes_conversation_context():
    """The user prompt must carry the conversation transcript from the request."""
    transcript = "User: What is photosynthesis?\n\nAssistant: Plants use sunlight."
    request = EducationPptxInput(
        topic="Photosynthesis",
        grade="6",
        slide_count=3,
        conversation_context=transcript,
    )
    prompt = education_outline_user(request)
    for fragment in ("conversation transcript", "What is photosynthesis?"):
        assert fragment in prompt
def test_none_mode_skips_source_summary(research_env):
    """With ``source_mode="none"`` the outline is built and no summary is produced."""
    request_kwargs = {
        "topic": "Photosynthesis",
        "grade": "6",
        "slide_count": 3,
        "model_key": "test",
        "backend": OutlineBackend(),
        "source_mode": "none",
    }
    outcome = AgentRunner().run_education_pptx(**request_kwargs)
    assert outcome.outline.title == "Test Lesson"
    assert outcome.source_summary == ""
def test_web_auto_calls_ingest_with_auto_search(research_env, monkeypatch):
    """Web mode with the ``auto`` workflow ingests once with ``auto_search=True``."""
    recorded: list[dict] = []

    def record_ingest(self, **kwargs):
        # Capture the keyword arguments, then hand back a canned ingest result.
        recorded.append(kwargs)
        canned_fields = {
            "session_id": "sess-auto",
            "ingested": ["https://example.com/photosynthesis"],
            "skipped": [],
            "failures": [],
            "doc_count": 1,
            "chunk_count": 1,
            "trace_path": "/tmp/trace.json",
            "message": "Ingested 1 source(s)",
        }
        return ResearchIngestResult(**canned_fields)

    monkeypatch.setattr(AgentRunner, "run_researchmind_ingest", record_ingest)

    outcome = AgentRunner().run_education_pptx(
        topic="Photosynthesis",
        grade="6",
        slide_count=3,
        model_key="test",
        backend=OutlineBackend(),
        source_mode="web",
        search_workflow="auto",
    )

    assert len(recorded) == 1
    (only_call,) = recorded
    assert only_call["auto_search"] is True
    assert "Ingested 1 source(s)" in outcome.source_summary
def test_web_two_step_requires_urls(research_env):
    """The two-step web workflow raises ``ValueError`` when no URLs or files are given."""
    request_kwargs = {
        "topic": "Photosynthesis",
        "grade": "6",
        "slide_count": 3,
        "model_key": "test",
        "backend": OutlineBackend(),
        "source_mode": "web",
        "search_workflow": "two_step",
        "urls": [],
        "files": [],
    }
    agent = AgentRunner()
    with pytest.raises(ValueError, match="Two-step web search requires"):
        agent.run_education_pptx(**request_kwargs)
def test_web_two_step_ingests_without_auto_search(research_env, monkeypatch):
    """Web mode with the ``two_step`` workflow ingests with ``auto_search=False``.

    ``AgentRunner.run_researchmind_ingest`` is replaced by a recorder that
    returns a canned ``ResearchIngestResult``; the test then inspects the
    keyword arguments of the first recorded call.
    """
    calls: list[dict] = []

    def fake_ingest(self, **kwargs):
        # Record the keyword arguments so the assertions can inspect them.
        calls.append(kwargs)
        return ResearchIngestResult(
            session_id="sess-two",
            ingested=["https://example.com/a"],
            skipped=[],
            failures=[],
            doc_count=1,
            chunk_count=1,
            trace_path="/tmp/trace.json",
            message="Ingested 1 source(s)",
        )

    monkeypatch.setattr(AgentRunner, "run_researchmind_ingest", fake_ingest)
    runner = AgentRunner()
    runner.run_education_pptx(
        topic="Photosynthesis",
        grade="6",
        slide_count=3,
        model_key="test",
        backend=OutlineBackend(),
        source_mode="web",
        search_workflow="two_step",
        urls=["https://example.com/a"],
    )
    # Fail with a readable message instead of an IndexError if ingest never ran.
    assert calls, "run_researchmind_ingest was not called"
    assert calls[0]["auto_search"] is False
def test_rag_requires_indexed_sources(research_env):
    """RAG mode raises ``ValueError`` when given an empty session id and no sources."""
    request_kwargs = {
        "topic": "Photosynthesis",
        "grade": "6",
        "slide_count": 3,
        "model_key": "test",
        "backend": OutlineBackend(),
        "source_mode": "rag",
        "session_id": "",
        "urls": [],
        "files": [],
    }
    agent = AgentRunner()
    with pytest.raises(ValueError, match="RAG mode requires indexed sources"):
        agent.run_education_pptx(**request_kwargs)
def test_web_two_step_uses_duplicate_doc_ids(research_env):
    """Re-ingesting a known URL in a new session is skipped but keeps its doc ids.

    The same URL is ingested twice (second time into a freshly discovered
    session); the second ingest must report it as skipped with the same
    ``doc_ids``, and a following two-step outline run must still report
    retrieved sources.
    """
    agent = AgentRunner()
    source_url = "https://example.com/a"

    def ingest_into(session):
        # Both ingests are identical apart from the target session.
        return agent.run_researchmind_ingest(
            topic="Photosynthesis",
            urls=[source_url],
            files=[],
            auto_search=False,
            session_id=session,
            model_key="test",
            backend=OutlineBackend(),
        )

    initial = ingest_into(None)
    assert initial.doc_ids

    discovery = agent.run_researchmind_discover(
        topic="Photosynthesis",
        auto_search=False,
        session_id=None,
        model_key="test",
        backend=OutlineBackend(),
    )
    fresh_session = discovery.session_id

    repeat = ingest_into(fresh_session)
    assert repeat.ingested == []
    assert len(repeat.skipped) == 1
    assert repeat.doc_ids == initial.doc_ids

    outcome = agent.run_education_pptx(
        topic="Photosynthesis",
        grade="6",
        slide_count=3,
        model_key="test",
        backend=OutlineBackend(),
        source_mode="web",
        search_workflow="two_step",
        urls=[source_url],
        session_id=fresh_session,
    )
    summary = outcome.source_summary
    assert "Retrieved" in summary
    assert summary.count("model knowledge only") == 0
def test_rag_uses_session_without_auto_search(research_env, monkeypatch):
    """RAG mode on an existing session retrieves sources without ingesting again."""
    # Populate a session through the real ingest path before it gets patched.
    seeded = AgentRunner().run_researchmind_ingest(
        topic="Photosynthesis",
        urls=["https://example.com/a"],
        files=[],
        auto_search=False,
        session_id=None,
        model_key="test",
        backend=OutlineBackend(),
    )

    recorded: list[dict] = []

    def record_ingest(self, **kwargs):
        # Any call made from here on is captured so the test can detect it.
        recorded.append(kwargs)
        return seeded

    monkeypatch.setattr(AgentRunner, "run_researchmind_ingest", record_ingest)

    request_kwargs = {
        "topic": "Photosynthesis",
        "grade": "6",
        "slide_count": 3,
        "model_key": "test",
        "backend": OutlineBackend(),
        "source_mode": "rag",
        "session_id": seeded.session_id,
    }
    outcome = AgentRunner().run_education_pptx(**request_kwargs)

    assert recorded == []
    assert "Retrieved" in outcome.source_summary