# NLP_Project/tests/test_nlp_pipeline.py
# Uploaded by hchevva: "Upload 43 files" (commit 630d650, verified)
import numpy as np
from toxra_core.nlp_pipeline import (
expand_regulatory_queries,
extract_evidence_span,
hybrid_rank_text_items,
)
def test_expand_regulatory_queries_adds_families():
    """Check that query expansion returns extra queries and an "endpoint" family.

    A single base query is expanded with one endpoint module and one
    framework; the result must contain more than one query, and the
    families mapping must hold a truthy value under the "endpoint" key.
    """
    expansion_kwargs = {
        "base_queries": ["genotoxicity risk"],
        "endpoint_modules": ["Genotoxicity (OECD TG)"],
        "frameworks": ["FDA CTP"],
    }
    expanded_queries, family_map = expand_regulatory_queries(**expansion_kwargs)

    assert len(expanded_queries) > 1
    assert "endpoint" in family_map
    assert family_map["endpoint"]
def test_extract_evidence_span_hit_and_fallback():
    """Check evidence-span extraction for a matching term and a missing term.

    When the term occurs in the text, the returned span's "text" must
    contain it. When the term does not occur, the returned "text" must
    still be truthy (i.e. some fallback span is produced).
    """
    passage = "Sentence one. AMES test showed equivocal response. Sentence three. Sentence four."

    # Term present in the passage: the span text must include it.
    matched_span = extract_evidence_span(passage, "AMES")
    assert "AMES" in matched_span["text"]

    # Term absent from the passage: a non-empty span is still expected.
    fallback_span = extract_evidence_span("Alpha. Beta.", "nonexistenttoken")
    assert fallback_span["text"]
def test_hybrid_rank_text_items_lexical_only():
    """Check ranking when no embeddings are supplied.

    Two text items are ranked against a query with no embedding
    arguments; the selection must be non-empty and the reported
    "ranking_method" must be one of the two accepted method names.
    """
    relevant_item = {"text": "This section discusses liver toxicity and NOAEL values."}
    unrelated_item = {"text": "Completely unrelated formulation text."}
    corpus = [relevant_item, unrelated_item]

    ranked, diagnostics = hybrid_rank_text_items(corpus, query="NOAEL liver")

    accepted_methods = {"lexical_only", "hybrid_rrf"}
    assert ranked
    assert diagnostics["ranking_method"] in accepted_methods
def test_hybrid_rank_text_items_with_embeddings():
    """Check ranking when item and query embeddings are supplied.

    Three single-letter items are paired with 2-D float32 vectors and a
    float32 query vector; the selection must be non-empty and the
    reported "ranking_method" must be "hybrid_rrf".
    """
    corpus = [{"text": label} for label in ("A", "B", "C")]
    item_vectors = np.array(
        [[1.0, 0.0], [0.5, 0.5], [0.0, 1.0]],
        dtype=np.float32,
    )
    query_vector = np.array([1.0, 0.0], dtype=np.float32)

    ranked, diagnostics = hybrid_rank_text_items(
        corpus,
        query="A",
        item_embeddings=item_vectors,
        query_embedding=query_vector,
    )

    assert ranked
    assert diagnostics["ranking_method"] == "hybrid_rrf"