from __future__ import annotations import json import numpy as np import pytest from agent.models import EducationPptxInput, ResearchIngestResult from agent.prompts import education_outline_user from agent.runner import AgentRunner from researchmind.config import ResearchMindConfig from researchmind.extract import ExtractedDocument def _outline_json(slide_count: int = 3) -> str: slides = [ { "title": f"Slide {i}", "bullets": ["Key point"], "speaker_note": "Note", } for i in range(1, slide_count + 1) ] return json.dumps({"title": "Test Lesson", "slides": slides}) class OutlineBackend: def load(self) -> None: return None def chat(self, messages, *, max_tokens=2048, temperature=0.3): return _outline_json(3) def generate(self, prompt, *, max_tokens=512, temperature=0.7): return self.chat([{"role": "user", "content": prompt}], max_tokens=max_tokens) @pytest.fixture def research_env(tmp_path, monkeypatch): cfg = ResearchMindConfig( data_dir=tmp_path / "rm", embed_model="test", auto_search=False, top_k=2, max_context_chunks=8, chunk_size=50, chunk_overlap=10, ) monkeypatch.setenv("RESEARCHMIND_DATA_DIR", str(cfg.data_dir)) monkeypatch.setenv("AGENT_OUTPUTS_DIR", str(tmp_path / "outputs")) def fake_embed(texts, *, model_name): vecs = [np.array([1.0, 0.0, 0.0], dtype=np.float32) for _ in texts] return np.stack(vecs) if vecs else np.zeros((0, 3), dtype=np.float32) monkeypatch.setattr("researchmind.ingest.embed_texts", fake_embed) monkeypatch.setattr("researchmind.retrieve.embed_texts", fake_embed) def fake_scrape(url: str): return ExtractedDocument( source_type="web", uri=url, title="Example", text="Photosynthesis converts light to energy in plants.", ) monkeypatch.setattr("agent.tools.research_tools.fetch_and_extract", fake_scrape) def fake_search(topic, *, n=5, check_reachable=True): return [f"https://example.com/{topic.replace(' ', '-')}"] monkeypatch.setattr("agent.tools.research_tools.search_urls", fake_search) def fake_validate(url, *, check_reachable=True): normalized = url if url.startswith("http") else f"https://{url}" return True, "ok", normalized monkeypatch.setattr("researchmind.url_validate.validate_url", fake_validate) return cfg def test_education_outline_user_includes_source_context(): req = EducationPptxInput(topic="Photosynthesis", grade="6", slide_count=3) user = education_outline_user(req, source_context="[1] Plants use chlorophyll.") assert "retrieved source excerpts" in user assert "chlorophyll" in user def test_education_outline_user_includes_conversation_context(): req = EducationPptxInput( topic="Photosynthesis", grade="6", slide_count=3, conversation_context="User: What is photosynthesis?\n\nAssistant: Plants use sunlight.", ) user = education_outline_user(req) assert "conversation transcript" in user assert "What is photosynthesis?" in user def test_none_mode_skips_source_summary(research_env): runner = AgentRunner() result = runner.run_education_pptx( topic="Photosynthesis", grade="6", slide_count=3, model_key="test", backend=OutlineBackend(), source_mode="none", ) assert result.outline.title == "Test Lesson" assert result.source_summary == "" def test_web_auto_calls_ingest_with_auto_search(research_env, monkeypatch): calls: list[dict] = [] def fake_ingest(self, **kwargs): calls.append(kwargs) return ResearchIngestResult( session_id="sess-auto", ingested=["https://example.com/photosynthesis"], skipped=[], failures=[], doc_count=1, chunk_count=1, trace_path="/tmp/trace.json", message="Ingested 1 source(s)", ) monkeypatch.setattr(AgentRunner, "run_researchmind_ingest", fake_ingest) runner = AgentRunner() result = runner.run_education_pptx( topic="Photosynthesis", grade="6", slide_count=3, model_key="test", backend=OutlineBackend(), source_mode="web", search_workflow="auto", ) assert len(calls) == 1 assert calls[0]["auto_search"] is True assert "Ingested 1 source(s)" in result.source_summary def test_web_two_step_requires_urls(research_env): runner = AgentRunner() with pytest.raises(ValueError, match="Two-step web search requires"): runner.run_education_pptx( topic="Photosynthesis", grade="6", slide_count=3, model_key="test", backend=OutlineBackend(), source_mode="web", search_workflow="two_step", urls=[], files=[], ) def test_web_two_step_ingests_without_auto_search(research_env, monkeypatch): calls: list[dict] = [] def fake_ingest(self, **kwargs): calls.append(kwargs) return ResearchIngestResult( session_id="sess-two", ingested=["https://example.com/a"], skipped=[], failures=[], doc_count=1, chunk_count=1, trace_path="/tmp/trace.json", message="Ingested 1 source(s)", ) monkeypatch.setattr(AgentRunner, "run_researchmind_ingest", fake_ingest) runner = AgentRunner() runner.run_education_pptx( topic="Photosynthesis", grade="6", slide_count=3, model_key="test", backend=OutlineBackend(), source_mode="web", search_workflow="two_step", urls=["https://example.com/a"], ) assert calls[0]["auto_search"] is False def test_rag_requires_indexed_sources(research_env): runner = AgentRunner() with pytest.raises(ValueError, match="RAG mode requires indexed sources"): runner.run_education_pptx( topic="Photosynthesis", grade="6", slide_count=3, model_key="test", backend=OutlineBackend(), source_mode="rag", session_id="", urls=[], files=[], ) def test_web_two_step_uses_duplicate_doc_ids(research_env): runner = AgentRunner() first = runner.run_researchmind_ingest( topic="Photosynthesis", urls=["https://example.com/a"], files=[], auto_search=False, session_id=None, model_key="test", backend=OutlineBackend(), ) assert first.doc_ids new_session = runner.run_researchmind_discover( topic="Photosynthesis", auto_search=False, session_id=None, model_key="test", backend=OutlineBackend(), ).session_id second = runner.run_researchmind_ingest( topic="Photosynthesis", urls=["https://example.com/a"], files=[], auto_search=False, session_id=new_session, model_key="test", backend=OutlineBackend(), ) assert second.ingested == [] assert len(second.skipped) == 1 assert second.doc_ids == first.doc_ids result = runner.run_education_pptx( topic="Photosynthesis", grade="6", slide_count=3, model_key="test", backend=OutlineBackend(), source_mode="web", search_workflow="two_step", urls=["https://example.com/a"], session_id=new_session, ) assert "Retrieved" in result.source_summary assert result.source_summary.count("model knowledge only") == 0 def test_rag_uses_session_without_auto_search(research_env, monkeypatch): ingest = AgentRunner().run_researchmind_ingest( topic="Photosynthesis", urls=["https://example.com/a"], files=[], auto_search=False, session_id=None, model_key="test", backend=OutlineBackend(), ) calls: list[dict] = [] def fake_ingest(self, **kwargs): calls.append(kwargs) return ingest monkeypatch.setattr(AgentRunner, "run_researchmind_ingest", fake_ingest) runner = AgentRunner() result = runner.run_education_pptx( topic="Photosynthesis", grade="6", slide_count=3, model_key="test", backend=OutlineBackend(), source_mode="rag", session_id=ingest.session_id, ) assert calls == [] assert "Retrieved" in result.source_summary