# guru/tests/test_paragraph.py
# tejadabheja: "Upload folder using huggingface_hub" (commit a5ae1ac, verified)
"""
Tests for paragraph generation via convergence-driven sentence retrieval.
Verifies:
1. Planning convergence finds relevant concept clusters
2. Sentence retrieval returns taught sentences in correct order
3. Relevance floor filters noise sentences
4. Multi-sentence output maintains coherence
5. Configurable sentence separator
"""
import os
import sys
from contextlib import closing

import numpy as np
import pytest

sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
from engine import Engine
def make_engine(tmp_path, seed=42, dim=300):
    """Create an Engine preloaded with random unit-norm word embeddings.

    No sentences are taught here; each test teaches the sentences it needs.

    Args:
        tmp_path: Directory for the engine's data, passed on as
            ``data_dir=str(tmp_path)``.
        seed: Seed for ``np.random.RandomState`` so the generated vectors are
            reproducible across runs.
        dim: Embedding dimensionality, used both for the Engine and for the
            generated vectors so the two can never disagree. Defaults to 300.

    Returns:
        The Engine after ``load_embeddings_from_dict`` has been called with
        one float32 vector per vocabulary word. The tests in this file call
        ``close()`` on it when they are done.
    """
    engine = Engine(data_dir=str(tmp_path), dim=dim)
    rng = np.random.RandomState(seed)
    word_list = [
        "shakespeare", "wrote", "hamlet", "macbeth", "playwright",
        "english", "was", "an", "a", "is", "the", "of",
        "einstein", "discovered", "relativity", "physicist", "german",
        "newton", "invented", "calculus", "gravity",
        "paris", "capital", "france", "london", "england",
        "python", "programming", "language", "created", "guido",
        "famous", "tragedy", "who", "what", "in", "by",
    ]
    words = {}
    # One draw per word, in list order, keeps the vectors tied to the seed.
    for w in word_list:
        vec = rng.randn(dim).astype(np.float32)
        # The epsilon guards against division by zero for a degenerate draw.
        vec = vec / (np.linalg.norm(vec) + 1e-10)
        words[w] = vec
    engine.load_embeddings_from_dict(words)
    return engine
class TestParagraphGeneration:
    """Tests for ``Engine.query_paragraph`` and ``generator.generate_paragraph``.

    Every test builds a fresh engine with ``make_engine`` and wraps it in
    ``contextlib.closing`` so that ``engine.close()`` runs even when an
    assertion fails part-way through the test.

    Several tests only check their assertions when the result strategy is
    ``"paragraph"``; with any other strategy they pass without asserting.
    """

    def test_single_topic_paragraph(self, tmp_path):
        """Two taught sentences on one topic must not lead to an abstain."""
        with closing(make_engine(tmp_path)) as engine:
            engine.teach_sentence("shakespeare wrote hamlet")
            engine.teach_sentence("shakespeare wrote macbeth")
            result = engine.query_paragraph("shakespeare")
            assert result.strategy != "abstain"
            # Only the topic word is checked; which of the two works appear
            # in the answer is not asserted.
            answer = result.answer.lower()
            assert "shakespeare" in answer

    def test_paragraph_preserves_word_order(self, tmp_path):
        """A paragraph answer should contain the words of the taught sentence."""
        with closing(make_engine(tmp_path)) as engine:
            engine.teach_sentence("paris is the capital of france")
            result = engine.query_paragraph("capital of france")
            if result.strategy == "paragraph":
                # Presence of the key words is checked, not their positions.
                assert "paris" in result.answer.lower()
                assert "capital" in result.answer.lower()

    def test_relevance_floor_filters_noise(self, tmp_path):
        """Low-relevance sentences should be excluded."""
        with closing(make_engine(tmp_path)) as engine:
            engine.teach_sentence("shakespeare wrote hamlet")
            engine.teach_sentence("paris is the capital of france")
            # Query about shakespeare; the paris sentence is the noise.
            result = engine.query_paragraph("shakespeare hamlet")
            answer = result.answer.lower()
            if result.strategy == "paragraph":
                # Requires on-topic content; absence of the paris sentence
                # is desirable but not asserted.
                assert "shakespeare" in answer or "hamlet" in answer

    def test_multi_topic_retrieval(self, tmp_path):
        """A query spanning two topics returns content from at least one."""
        with closing(make_engine(tmp_path)) as engine:
            engine.teach_sentence("einstein discovered relativity")
            engine.teach_sentence("newton invented calculus")
            result = engine.query_paragraph("einstein newton")
            answer = result.answer.lower()
            if result.strategy == "paragraph":
                has_einstein = "einstein" in answer or "relativity" in answer
                has_newton = "newton" in answer or "calculus" in answer
                assert has_einstein or has_newton

    def test_configurable_separator(self, tmp_path):
        """Sentence separator should be configurable."""
        with closing(make_engine(tmp_path)) as engine:
            engine.teach_sentence("shakespeare wrote hamlet")
            engine.teach_sentence("shakespeare wrote macbeth")
            # Call the generator directly so a custom separator can be passed.
            query_vec = engine.encoder.encode_sentence("shakespeare")
            result = engine.generator.generate_paragraph(
                query_vector=query_vec,
                convergence_loop=engine.convergence,
                query_words=["shakespeare"],
                sentence_separator=" | ",
            )
            # Checked only when the output contains a pipe at all: the
            # separator must then appear with its surrounding spaces.
            if result.strategy == "paragraph" and "|" in result.text:
                assert " | " in result.text

    def test_max_sentences_limit(self, tmp_path):
        """max_sentences should cap the output."""
        with closing(make_engine(tmp_path)) as engine:
            engine.teach_sentence("shakespeare wrote hamlet")
            engine.teach_sentence("shakespeare wrote macbeth")
            engine.teach_sentence("einstein discovered relativity")
            engine.teach_sentence("newton invented calculus")
            result = engine.query_paragraph(
                "shakespeare einstein newton", max_sentences=2
            )
            if result.strategy == "paragraph":
                # Approximate sentence count: split on ". " and allow one
                # extra part for a possible trailing fragment.
                parts = result.answer.split(". ")
                assert len(parts) <= 3

    def test_empty_kb_abstains(self, tmp_path):
        """With no taught sentences the engine must abstain."""
        with closing(make_engine(tmp_path)) as engine:
            result = engine.query_paragraph("shakespeare")
            assert result.strategy == "abstain"

    def test_paragraph_confidence(self, tmp_path):
        """Paragraph confidence must lie in the closed range [0, 1]."""
        with closing(make_engine(tmp_path)) as engine:
            engine.teach_sentence("shakespeare wrote hamlet")
            result = engine.query_paragraph("shakespeare hamlet")
            assert 0 <= result.confidence <= 1.0

    def test_no_duplicate_sentences(self, tmp_path):
        """Same sentence taught twice shouldn't appear twice in output."""
        with closing(make_engine(tmp_path)) as engine:
            engine.teach_sentence("shakespeare wrote hamlet")
            engine.teach_sentence("shakespeare wrote hamlet")  # duplicate
            result = engine.query_paragraph("shakespeare")
            if result.strategy == "paragraph":
                # At most one occurrence; zero is tolerated as well.
                count = result.answer.lower().count("shakespeare wrote hamlet")
                assert count <= 1

    def test_trace_shows_planning(self, tmp_path):
        """Trace should show the planning phase."""
        with closing(make_engine(tmp_path)) as engine:
            engine.teach_sentence("shakespeare wrote hamlet")
            result = engine.query_paragraph("shakespeare")
            # The trace is inspected only when the result carries a
            # non-empty ``generation`` attribute.
            if (
                result.strategy == "paragraph"
                and hasattr(result, 'generation')
                and result.generation
            ):
                trace_text = "\n".join(result.generation.trace)
                assert "Plan" in trace_text