guru / tests /test_paragraph.py

Upload folder using huggingface_hub

a5ae1ac verified about 1 month ago

7.11 kB

	"""
	Tests for paragraph generation via convergence-driven sentence retrieval.

	Verifies:
	1. Planning convergence finds relevant concept clusters
	2. Sentence retrieval returns taught sentences in correct order
	3. Relevance floor filters noise sentences
	4. Multi-sentence output maintains coherence
	5. Configurable sentence separator
	"""

	import sys
	import os
	import numpy as np
	import pytest

	sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))

	from engine import Engine


	def make_engine(tmp_path, seed=42, dim=300):
	    """Create an engine preloaded with a small random-embedding vocabulary.

	    No sentences are taught here; each test teaches whatever it needs.

	    Args:
	        tmp_path: Directory handed to ``Engine`` as ``data_dir`` (converted
	            with ``str``), typically pytest's ``tmp_path`` fixture.
	        seed: Seed for the ``numpy.random.RandomState`` that draws the word
	            vectors, so the vocabulary is reproducible across runs.
	        dim: Embedding dimensionality, used both for the ``Engine`` and for
	            every generated vector so the two can never disagree.

	    Returns:
	        The ``Engine`` instance after ``load_embeddings_from_dict`` has been
	        called with one normalized float32 vector per vocabulary word.
	    """
	    engine = Engine(data_dir=str(tmp_path), dim=dim)
	    rng = np.random.RandomState(seed)
	    word_list = [
	        "shakespeare", "wrote", "hamlet", "macbeth", "playwright",
	        "english", "was", "an", "a", "is", "the", "of",
	        "einstein", "discovered", "relativity", "physicist", "german",
	        "newton", "invented", "calculus", "gravity",
	        "paris", "capital", "france", "london", "england",
	        "python", "programming", "language", "created", "guido",
	        "famous", "tragedy", "who", "what", "in", "by",
	    ]
	    words = {}
	    # Draw one vector per word, in list order, so a given seed always maps
	    # each word to the same vector.
	    for w in word_list:
	        vec = rng.randn(dim).astype(np.float32)
	        # The epsilon guards the division against a zero-length vector.
	        vec = vec / (np.linalg.norm(vec) + 1e-10)
	        words[w] = vec
	    engine.load_embeddings_from_dict(words)
	    return engine


	class TestParagraphGeneration:
	    """Exercise ``Engine.query_paragraph`` and ``generator.generate_paragraph``.

	    Every test builds its own engine with ``make_engine`` and closes it in a
	    ``finally`` clause, so a failing assertion cannot leave the engine open.

	    Several tests only assert when the result strategy is ``"paragraph"``;
	    for any other strategy they pass without checking the answer.
	    """

	    def test_single_topic_paragraph(self, tmp_path):
	        """Teaching multiple sentences about one topic → multi-sentence output."""
	        engine = make_engine(tmp_path)
	        try:
	            engine.teach_sentence("shakespeare wrote hamlet")
	            engine.teach_sentence("shakespeare wrote macbeth")

	            result = engine.query_paragraph("shakespeare")
	            assert result.strategy != "abstain"
	            # Only the shared subject is asserted; which of the two works
	            # appears in the answer is not pinned by this test.
	            answer = result.answer.lower()
	            assert "shakespeare" in answer
	        finally:
	            engine.close()

	    def test_paragraph_preserves_word_order(self, tmp_path):
	        """Sentences should appear in their taught word order."""
	        engine = make_engine(tmp_path)
	        try:
	            engine.teach_sentence("paris is the capital of france")

	            result = engine.query_paragraph("capital of france")
	            if result.strategy == "paragraph":
	                # Checks that words of the taught sentence are present;
	                # relative order is not asserted.
	                answer = result.answer.lower()
	                assert "paris" in answer
	                assert "capital" in answer
	        finally:
	            engine.close()

	    def test_relevance_floor_filters_noise(self, tmp_path):
	        """Low-relevance sentences should be excluded."""
	        engine = make_engine(tmp_path)
	        try:
	            engine.teach_sentence("shakespeare wrote hamlet")
	            engine.teach_sentence("paris is the capital of france")

	            # Query about shakespeare — paris sentence should be filtered
	            result = engine.query_paragraph("shakespeare hamlet")
	            answer = result.answer.lower()
	            if result.strategy == "paragraph":
	                # Should have shakespeare content but ideally not paris
	                assert "shakespeare" in answer or "hamlet" in answer
	        finally:
	            engine.close()

	    def test_multi_topic_retrieval(self, tmp_path):
	        """Query spanning multiple topics returns relevant sentences from each."""
	        engine = make_engine(tmp_path)
	        try:
	            engine.teach_sentence("einstein discovered relativity")
	            engine.teach_sentence("newton invented calculus")

	            result = engine.query_paragraph("einstein newton")
	            answer = result.answer.lower()
	            if result.strategy == "paragraph":
	                has_einstein = "einstein" in answer or "relativity" in answer
	                has_newton = "newton" in answer or "calculus" in answer
	                # At least one of the two topics must be represented.
	                assert has_einstein or has_newton
	        finally:
	            engine.close()

	    def test_configurable_separator(self, tmp_path):
	        """Sentence separator should be configurable."""
	        engine = make_engine(tmp_path)
	        try:
	            engine.teach_sentence("shakespeare wrote hamlet")
	            engine.teach_sentence("shakespeare wrote macbeth")

	            # Use custom separator via generator directly
	            separator = " | "
	            query_vec = engine.encoder.encode_sentence("shakespeare")
	            result = engine.generator.generate_paragraph(
	                query_vector=query_vec,
	                convergence_loop=engine.convergence,
	                query_words=["shakespeare"],
	                sentence_separator=separator,
	            )
	            # A single-sentence paragraph contains no separator at all, so
	            # the padded form is only required once a bar is present.
	            if result.strategy == "paragraph" and "|" in result.text:
	                assert separator in result.text
	        finally:
	            engine.close()

	    def test_max_sentences_limit(self, tmp_path):
	        """max_sentences should cap the output."""
	        engine = make_engine(tmp_path)
	        try:
	            engine.teach_sentence("shakespeare wrote hamlet")
	            engine.teach_sentence("shakespeare wrote macbeth")
	            engine.teach_sentence("einstein discovered relativity")
	            engine.teach_sentence("newton invented calculus")

	            result = engine.query_paragraph(
	                "shakespeare einstein newton", max_sentences=2
	            )
	            if result.strategy == "paragraph":
	                # Count sentences in output — should be <= 2
	                # (approximate: count by separator)
	                parts = result.answer.split(". ")
	                assert len(parts) <= 3  # 2 sentences + possible trailing
	        finally:
	            engine.close()

	    def test_empty_kb_abstains(self, tmp_path):
	        """No taught sentences → abstain."""
	        engine = make_engine(tmp_path)
	        try:
	            result = engine.query_paragraph("shakespeare")
	            assert result.strategy == "abstain"
	        finally:
	            engine.close()

	    def test_paragraph_confidence(self, tmp_path):
	        """Paragraph confidence should reflect concept quality."""
	        engine = make_engine(tmp_path)
	        try:
	            engine.teach_sentence("shakespeare wrote hamlet")
	            result = engine.query_paragraph("shakespeare hamlet")
	            # Only the range is checked, not any particular value.
	            assert 0 <= result.confidence <= 1.0
	        finally:
	            engine.close()

	    def test_no_duplicate_sentences(self, tmp_path):
	        """Same sentence taught twice shouldn't appear twice in output."""
	        engine = make_engine(tmp_path)
	        try:
	            engine.teach_sentence("shakespeare wrote hamlet")
	            engine.teach_sentence("shakespeare wrote hamlet")  # duplicate

	            result = engine.query_paragraph("shakespeare")
	            if result.strategy == "paragraph":
	                # Count occurrences of "shakespeare wrote hamlet"
	                count = result.answer.lower().count("shakespeare wrote hamlet")
	                # Allow 1 (deduplicated) — the point is no duplicates
	                assert count <= 1
	        finally:
	            engine.close()

	    def test_trace_shows_planning(self, tmp_path):
	        """Trace should show the planning phase."""
	        engine = make_engine(tmp_path)
	        try:
	            engine.teach_sentence("shakespeare wrote hamlet")
	            result = engine.query_paragraph("shakespeare")
	            # The result may lack a generation record; skip the check then.
	            generation = getattr(result, "generation", None)
	            if result.strategy == "paragraph" and generation:
	                trace_text = "\n".join(generation.trace)
	                assert "Plan" in trace_text
	        finally:
	            engine.close()