""" Tests for paragraph generation via convergence-driven sentence retrieval. Verifies: 1. Planning convergence finds relevant concept clusters 2. Sentence retrieval returns taught sentences in correct order 3. Relevance floor filters noise sentences 4. Multi-sentence output maintains coherence 5. Configurable sentence separator """ import sys import os import numpy as np import pytest sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) from engine import Engine def make_engine(tmp_path, seed=42): """Create engine with vocabulary and taught sentences.""" engine = Engine(data_dir=str(tmp_path), dim=300) rng = np.random.RandomState(seed) words = {} word_list = [ "shakespeare", "wrote", "hamlet", "macbeth", "playwright", "english", "was", "an", "a", "is", "the", "of", "einstein", "discovered", "relativity", "physicist", "german", "newton", "invented", "calculus", "gravity", "paris", "capital", "france", "london", "england", "python", "programming", "language", "created", "guido", "famous", "tragedy", "who", "what", "in", "by", ] for w in word_list: vec = rng.randn(300).astype(np.float32) vec = vec / (np.linalg.norm(vec) + 1e-10) words[w] = vec engine.load_embeddings_from_dict(words) return engine class TestParagraphGeneration: def test_single_topic_paragraph(self, tmp_path): """Teaching multiple sentences about one topic → multi-sentence output.""" engine = make_engine(tmp_path) engine.teach_sentence("shakespeare wrote hamlet") engine.teach_sentence("shakespeare wrote macbeth") result = engine.query_paragraph("shakespeare") assert result.strategy != "abstain" # Should mention both works answer = result.answer.lower() assert "shakespeare" in answer engine.close() def test_paragraph_preserves_word_order(self, tmp_path): """Sentences should appear in their taught word order.""" engine = make_engine(tmp_path) engine.teach_sentence("paris is the capital of france") result = engine.query_paragraph("capital of france") if result.strategy == "paragraph": # The sentence should be reproduced in order assert "paris" in result.answer.lower() assert "capital" in result.answer.lower() engine.close() def test_relevance_floor_filters_noise(self, tmp_path): """Low-relevance sentences should be excluded.""" engine = make_engine(tmp_path) engine.teach_sentence("shakespeare wrote hamlet") engine.teach_sentence("paris is the capital of france") # Query about shakespeare — paris sentence should be filtered result = engine.query_paragraph("shakespeare hamlet") answer = result.answer.lower() if result.strategy == "paragraph": # Should have shakespeare content but ideally not paris assert "shakespeare" in answer or "hamlet" in answer engine.close() def test_multi_topic_retrieval(self, tmp_path): """Query spanning multiple topics returns relevant sentences from each.""" engine = make_engine(tmp_path) engine.teach_sentence("einstein discovered relativity") engine.teach_sentence("newton invented calculus") result = engine.query_paragraph("einstein newton") answer = result.answer.lower() if result.strategy == "paragraph": has_einstein = "einstein" in answer or "relativity" in answer has_newton = "newton" in answer or "calculus" in answer assert has_einstein or has_newton engine.close() def test_configurable_separator(self, tmp_path): """Sentence separator should be configurable.""" engine = make_engine(tmp_path) engine.teach_sentence("shakespeare wrote hamlet") engine.teach_sentence("shakespeare wrote macbeth") # Use custom separator via generator directly from convergence import ConvergenceLoop query_vec = engine.encoder.encode_sentence("shakespeare") result = engine.generator.generate_paragraph( query_vector=query_vec, convergence_loop=engine.convergence, query_words=["shakespeare"], sentence_separator=" | ", ) if result.strategy == "paragraph" and "|" in result.text: # Custom separator used assert " | " in result.text engine.close() def test_max_sentences_limit(self, tmp_path): """max_sentences should cap the output.""" engine = make_engine(tmp_path) engine.teach_sentence("shakespeare wrote hamlet") engine.teach_sentence("shakespeare wrote macbeth") engine.teach_sentence("einstein discovered relativity") engine.teach_sentence("newton invented calculus") result = engine.query_paragraph("shakespeare einstein newton", max_sentences=2) if result.strategy == "paragraph": # Count sentences in output — should be <= 2 # (approximate: count by separator) parts = result.answer.split(". ") assert len(parts) <= 3 # 2 sentences + possible trailing engine.close() def test_empty_kb_abstains(self, tmp_path): """No taught sentences → abstain.""" engine = make_engine(tmp_path) result = engine.query_paragraph("shakespeare") assert result.strategy == "abstain" engine.close() def test_paragraph_confidence(self, tmp_path): """Paragraph confidence should reflect concept quality.""" engine = make_engine(tmp_path) engine.teach_sentence("shakespeare wrote hamlet") result = engine.query_paragraph("shakespeare hamlet") assert result.confidence >= 0 assert result.confidence <= 1.0 engine.close() def test_no_duplicate_sentences(self, tmp_path): """Same sentence taught twice shouldn't appear twice in output.""" engine = make_engine(tmp_path) engine.teach_sentence("shakespeare wrote hamlet") engine.teach_sentence("shakespeare wrote hamlet") # duplicate result = engine.query_paragraph("shakespeare") if result.strategy == "paragraph": # Count occurrences of "shakespeare wrote hamlet" count = result.answer.lower().count("shakespeare wrote hamlet") # Allow 1 (deduplicated) — the point is no duplicates assert count <= 1 engine.close() def test_trace_shows_planning(self, tmp_path): """Trace should show the planning phase.""" engine = make_engine(tmp_path) engine.teach_sentence("shakespeare wrote hamlet") result = engine.query_paragraph("shakespeare") if result.strategy == "paragraph" and hasattr(result, 'generation') and result.generation: trace_text = "\n".join(result.generation.trace) assert "Plan" in trace_text engine.close()