Spaces:

MSGEncrypted
/

lesson-agent-dev

Sleeping

lesson-agent-dev/libs/agent/tests/test_education_sources.py

MSG

Feat/monday 3 sprint (#20)

bd75839 20 days ago

8.68 kB

	from __future__ import annotations

	import json
	import numpy as np
	import pytest

	from agent.models import EducationPptxInput, ResearchIngestResult
	from agent.prompts import education_outline_user
	from agent.runner import AgentRunner
	from researchmind.config import ResearchMindConfig
	from researchmind.extract import ExtractedDocument


	def _outline_json(slide_count: int = 3) -> str:
	    """Return a JSON-encoded lesson outline with ``slide_count`` slides.

	    Slides are titled "Slide 1" through "Slide N"; each carries the single
	    bullet "Key point" and the speaker note "Note". The outline title is
	    always "Test Lesson".
	    """
	    outline = {"title": "Test Lesson", "slides": []}
	    for number in range(1, slide_count + 1):
	        outline["slides"].append(
	            {
	                "title": f"Slide {number}",
	                "bullets": ["Key point"],
	                "speaker_note": "Note",
	            }
	        )
	    return json.dumps(outline)


	class OutlineBackend:
	    """Stand-in model backend whose replies are always a canned 3-slide outline."""

	    def load(self) -> None:
	        """Do nothing; there is no model to load."""
	        return

	    def chat(self, messages, *, max_tokens=2048, temperature=0.3):
	        """Return the canned outline JSON; every argument is ignored."""
	        return _outline_json(slide_count=3)

	    def generate(self, prompt, *, max_tokens=512, temperature=0.7):
	        """Wrap ``prompt`` in a single user message and delegate to ``chat``.

	        Only ``max_tokens`` is forwarded; ``temperature`` is not passed on.
	        """
	        user_turn = {"role": "user", "content": prompt}
	        return self.chat([user_turn], max_tokens=max_tokens)


	@pytest.fixture
	def research_env(tmp_path, monkeypatch):
	    """Build a ResearchMind config under ``tmp_path`` and install deterministic stubs.

	    Sets the ``RESEARCHMIND_DATA_DIR`` and ``AGENT_OUTPUTS_DIR`` environment
	    variables to directories below ``tmp_path`` and replaces, via
	    ``monkeypatch``:

	    * ``embed_texts`` in ``researchmind.ingest`` and ``researchmind.retrieve``
	    * ``fetch_and_extract`` and ``search_urls`` in ``agent.tools.research_tools``
	    * ``validate_url`` in ``researchmind.url_validate``

	    Returns the ``ResearchMindConfig`` instance.
	    """
	    config = ResearchMindConfig(
	        data_dir=tmp_path / "rm",
	        embed_model="test",
	        auto_search=False,
	        top_k=2,
	        max_context_chunks=8,
	        chunk_size=50,
	        chunk_overlap=10,
	    )
	    monkeypatch.setenv("RESEARCHMIND_DATA_DIR", str(config.data_dir))
	    monkeypatch.setenv("AGENT_OUTPUTS_DIR", str(tmp_path / "outputs"))

	    def _constant_embed(texts, *, model_name):
	        # Every text maps to the same 3-dimensional unit vector.
	        rows = [np.array([1.0, 0.0, 0.0], dtype=np.float32) for _ in texts]
	        if not rows:
	            return np.zeros((0, 3), dtype=np.float32)
	        return np.stack(rows)

	    def _canned_page(url: str):
	        # Same document body regardless of which URL is requested.
	        return ExtractedDocument(
	            source_type="web",
	            uri=url,
	            title="Example",
	            text="Photosynthesis converts light to energy in plants.",
	        )

	    def _single_hit_search(topic, *, n=5, check_reachable=True):
	        slug = topic.replace(" ", "-")
	        return [f"https://example.com/{slug}"]

	    def _accept_any_url(url, *, check_reachable=True):
	        if url.startswith("http"):
	            normalized = url
	        else:
	            normalized = f"https://{url}"
	        return True, "ok", normalized

	    # Patched in this fixed order.
	    replacements = (
	        ("researchmind.ingest.embed_texts", _constant_embed),
	        ("researchmind.retrieve.embed_texts", _constant_embed),
	        ("agent.tools.research_tools.fetch_and_extract", _canned_page),
	        ("agent.tools.research_tools.search_urls", _single_hit_search),
	        ("researchmind.url_validate.validate_url", _accept_any_url),
	    )
	    for target, stub in replacements:
	        monkeypatch.setattr(target, stub)

	    return config


	def test_education_outline_user_includes_source_context():
	    """The user prompt mentions retrieved excerpts and contains the supplied source text."""
	    request = EducationPptxInput(topic="Photosynthesis", grade="6", slide_count=3)
	    excerpt = "[1] Plants use chlorophyll."
	    prompt = education_outline_user(request, source_context=excerpt)
	    for expected in ("retrieved source excerpts", "chlorophyll"):
	        assert expected in prompt


	def test_education_outline_user_includes_conversation_context():
	    """A request carrying ``conversation_context`` yields a prompt that includes the transcript."""
	    transcript = "User: What is photosynthesis?\n\nAssistant: Plants use sunlight."
	    request = EducationPptxInput(
	        topic="Photosynthesis",
	        grade="6",
	        slide_count=3,
	        conversation_context=transcript,
	    )
	    prompt = education_outline_user(request)
	    for expected in ("conversation transcript", "What is photosynthesis?"):
	        assert expected in prompt


	def test_none_mode_skips_source_summary(research_env):
	    """With ``source_mode="none"`` the outline is built and the source summary is empty."""
	    agent = AgentRunner()
	    request = dict(
	        topic="Photosynthesis",
	        grade="6",
	        slide_count=3,
	        model_key="test",
	        backend=OutlineBackend(),
	        source_mode="none",
	    )
	    outcome = agent.run_education_pptx(**request)
	    assert outcome.outline.title == "Test Lesson"
	    assert outcome.source_summary == ""


	def test_web_auto_calls_ingest_with_auto_search(research_env, monkeypatch):
	    """Web mode with the "auto" workflow calls ingest exactly once with ``auto_search=True``.

	    ``AgentRunner.run_researchmind_ingest`` is swapped for a recorder that
	    stores its keyword arguments and returns a canned result whose message
	    must appear in the resulting source summary.
	    """
	    recorded: list[dict] = []

	    def _record_ingest(self, **kwargs):
	        recorded.append(kwargs)
	        canned = ResearchIngestResult(
	            session_id="sess-auto",
	            ingested=["https://example.com/photosynthesis"],
	            skipped=[],
	            failures=[],
	            doc_count=1,
	            chunk_count=1,
	            trace_path="/tmp/trace.json",
	            message="Ingested 1 source(s)",
	        )
	        return canned

	    monkeypatch.setattr(AgentRunner, "run_researchmind_ingest", _record_ingest)

	    agent = AgentRunner()
	    request = dict(
	        topic="Photosynthesis",
	        grade="6",
	        slide_count=3,
	        model_key="test",
	        backend=OutlineBackend(),
	        source_mode="web",
	        search_workflow="auto",
	    )
	    outcome = agent.run_education_pptx(**request)

	    assert len(recorded) == 1
	    (only_call,) = recorded
	    assert only_call["auto_search"] is True
	    assert "Ingested 1 source(s)" in outcome.source_summary


	def test_web_two_step_requires_urls(research_env):
	    """Two-step web mode with empty ``urls`` and ``files`` raises ``ValueError``."""
	    agent = AgentRunner()
	    request = dict(
	        topic="Photosynthesis",
	        grade="6",
	        slide_count=3,
	        model_key="test",
	        backend=OutlineBackend(),
	        source_mode="web",
	        search_workflow="two_step",
	        urls=[],
	        files=[],
	    )
	    with pytest.raises(ValueError, match="Two-step web search requires"):
	        agent.run_education_pptx(**request)


	def test_web_two_step_ingests_without_auto_search(research_env, monkeypatch):
	    """Two-step web mode with explicit URLs must ingest with ``auto_search=False``.

	    ``AgentRunner.run_researchmind_ingest`` is replaced by a recorder that
	    captures its keyword arguments and returns a canned result.
	    """
	    calls: list[dict] = []

	    def fake_ingest(self, **kwargs):
	        calls.append(kwargs)
	        return ResearchIngestResult(
	            session_id="sess-two",
	            ingested=["https://example.com/a"],
	            skipped=[],
	            failures=[],
	            doc_count=1,
	            chunk_count=1,
	            trace_path="/tmp/trace.json",
	            message="Ingested 1 source(s)",
	        )

	    monkeypatch.setattr(AgentRunner, "run_researchmind_ingest", fake_ingest)

	    runner = AgentRunner()
	    runner.run_education_pptx(
	        topic="Photosynthesis",
	        grade="6",
	        slide_count=3,
	        model_key="test",
	        backend=OutlineBackend(),
	        source_mode="web",
	        search_workflow="two_step",
	        urls=["https://example.com/a"],
	    )
	    # Guard the index below: if ingest never ran, fail with a readable
	    # assertion message instead of an IndexError.
	    assert calls, "run_researchmind_ingest was not called"
	    assert calls[0]["auto_search"] is False


	def test_rag_requires_indexed_sources(research_env):
	    """RAG mode with an empty session id and no URLs or files raises ``ValueError``."""
	    agent = AgentRunner()
	    request = dict(
	        topic="Photosynthesis",
	        grade="6",
	        slide_count=3,
	        model_key="test",
	        backend=OutlineBackend(),
	        source_mode="rag",
	        session_id="",
	        urls=[],
	        files=[],
	    )
	    with pytest.raises(ValueError, match="RAG mode requires indexed sources"):
	        agent.run_education_pptx(**request)


	def test_web_two_step_uses_duplicate_doc_ids(research_env):
	    """Ingesting the same URL twice skips the second copy but keeps its doc ids usable.

	    Steps:
	    1. Ingest one URL with no session id and check doc ids are reported.
	    2. Obtain a session id from ``run_researchmind_discover``.
	    3. Ingest the same URL under that session: nothing is ingested, one
	       entry is skipped, and the doc ids equal those from step 1.
	    4. Run the two-step deck generation with that session and URL; the
	       source summary must mention "Retrieved" and must not contain
	       "model knowledge only".
	    """
	    source_url = "https://example.com/a"
	    agent = AgentRunner()

	    initial = agent.run_researchmind_ingest(
	        topic="Photosynthesis",
	        urls=[source_url],
	        files=[],
	        auto_search=False,
	        session_id=None,
	        model_key="test",
	        backend=OutlineBackend(),
	    )
	    assert initial.doc_ids

	    discovery = agent.run_researchmind_discover(
	        topic="Photosynthesis",
	        auto_search=False,
	        session_id=None,
	        model_key="test",
	        backend=OutlineBackend(),
	    )
	    fresh_session = discovery.session_id

	    repeat = agent.run_researchmind_ingest(
	        topic="Photosynthesis",
	        urls=[source_url],
	        files=[],
	        auto_search=False,
	        session_id=fresh_session,
	        model_key="test",
	        backend=OutlineBackend(),
	    )
	    assert repeat.ingested == []
	    assert len(repeat.skipped) == 1
	    assert repeat.doc_ids == initial.doc_ids

	    outcome = agent.run_education_pptx(
	        topic="Photosynthesis",
	        grade="6",
	        slide_count=3,
	        model_key="test",
	        backend=OutlineBackend(),
	        source_mode="web",
	        search_workflow="two_step",
	        urls=[source_url],
	        session_id=fresh_session,
	    )
	    summary = outcome.source_summary
	    assert "Retrieved" in summary
	    assert "model knowledge only" not in summary


	def test_rag_uses_session_without_auto_search(research_env, monkeypatch):
	    """RAG mode with an existing session id retrieves sources without calling ingest again.

	    A real ingest is performed first to obtain a session. After that,
	    ``AgentRunner.run_researchmind_ingest`` is replaced by a recorder, and
	    the test asserts the recorder is never invoked during deck generation.
	    """
	    seeded = AgentRunner().run_researchmind_ingest(
	        topic="Photosynthesis",
	        urls=["https://example.com/a"],
	        files=[],
	        auto_search=False,
	        session_id=None,
	        model_key="test",
	        backend=OutlineBackend(),
	    )

	    recorded: list[dict] = []

	    def _record_ingest(self, **kwargs):
	        recorded.append(kwargs)
	        return seeded

	    monkeypatch.setattr(AgentRunner, "run_researchmind_ingest", _record_ingest)

	    agent = AgentRunner()
	    request = dict(
	        topic="Photosynthesis",
	        grade="6",
	        slide_count=3,
	        model_key="test",
	        backend=OutlineBackend(),
	        source_mode="rag",
	        session_id=seeded.session_id,
	    )
	    outcome = agent.run_education_pptx(**request)

	    assert recorded == []
	    assert "Retrieved" in outcome.source_summary