import numpy as np

from toxra_core.nlp_pipeline import (
    expand_regulatory_queries,
    extract_evidence_span,
    hybrid_rank_text_items,
)


def test_expand_regulatory_queries_adds_families():
    """Expansion returns more than one query and a non-empty "endpoint" family."""
    expanded, family_map = expand_regulatory_queries(
        base_queries=["genotoxicity risk"],
        endpoint_modules=["Genotoxicity (OECD TG)"],
        frameworks=["FDA CTP"],
    )

    assert len(expanded) > 1
    assert "endpoint" in family_map
    assert family_map["endpoint"]


def test_extract_evidence_span_hit_and_fallback():
    """A matching term appears in the span text; a non-matching term still yields text."""
    document = (
        "Sentence one. AMES test showed equivocal response. "
        "Sentence three. Sentence four."
    )

    # Term present in the document: the returned span text must contain it.
    matched_span = extract_evidence_span(document, "AMES")
    assert "AMES" in matched_span["text"]

    # Term absent from the document: the returned span text must still be non-empty.
    fallback_span = extract_evidence_span("Alpha. Beta.", "nonexistenttoken")
    assert fallback_span["text"]


def test_hybrid_rank_text_items_lexical_only():
    """Ranking without embeddings returns items and reports a known ranking method."""
    candidates = [
        {"text": "This section discusses liver toxicity and NOAEL values."},
        {"text": "Completely unrelated formulation text."},
    ]

    ranked, diagnostics = hybrid_rank_text_items(candidates, query="NOAEL liver")

    assert ranked
    assert diagnostics["ranking_method"] in {"lexical_only", "hybrid_rrf"}


def test_hybrid_rank_text_items_with_embeddings():
    """Ranking with item and query embeddings reports the "hybrid_rrf" method."""
    candidates = [{"text": "A"}, {"text": "B"}, {"text": "C"}]
    # One 2-d float32 vector per candidate; the query vector equals the first row.
    candidate_vectors = np.array(
        [[1.0, 0.0], [0.5, 0.5], [0.0, 1.0]],
        dtype=np.float32,
    )
    query_vector = np.array([1.0, 0.0], dtype=np.float32)

    ranked, diagnostics = hybrid_rank_text_items(
        candidates,
        query="A",
        item_embeddings=candidate_vectors,
        query_embedding=query_vector,
    )

    assert ranked
    assert diagnostics["ranking_method"] == "hybrid_rrf"