Spaces:

hchevva
/

NLP_Project

Sleeping

NLP_Project / tests / test_nlp_pipeline.py

Upload 43 files

630d650 verified 7 days ago

1.57 kB

	import numpy as np

	from toxra_core.nlp_pipeline import (
	expand_regulatory_queries,
	extract_evidence_span,
	hybrid_rank_text_items,
	)


	def test_expand_regulatory_queries_adds_families():
	    """Check that query expansion yields extra queries and an "endpoint" family.

	    One base query, one endpoint module and one framework are supplied; the
	    test expects more than one query back and a non-empty "endpoint" entry
	    in the returned families mapping.
	    """
	    expansion_inputs = {
	        "base_queries": ["genotoxicity risk"],
	        "endpoint_modules": ["Genotoxicity (OECD TG)"],
	        "frameworks": ["FDA CTP"],
	    }
	    expanded, family_map = expand_regulatory_queries(**expansion_inputs)

	    # More than one query back means something was added to the single input.
	    assert len(expanded) > 1
	    # The key must be present before its value is checked for emptiness.
	    assert "endpoint" in family_map
	    assert family_map["endpoint"]


	def test_extract_evidence_span_hit_and_fallback():
	    """Check span extraction for a matching token and for a token with no match.

	    With a token that occurs in the text, the returned "text" must contain
	    it. With a token that occurs nowhere, "text" must still be non-empty.
	    """
	    document = "Sentence one. AMES test showed equivocal response. Sentence three. Sentence four."
	    matched_span = extract_evidence_span(document, "AMES")
	    assert "AMES" in matched_span["text"]

	    # The token is absent from the input, yet some text is still expected back.
	    fallback_span = extract_evidence_span("Alpha. Beta.", "nonexistenttoken")
	    assert fallback_span["text"]


	def test_hybrid_rank_text_items_lexical_only():
	    """Check ranking when no embeddings are passed in.

	    Two text items and a query are supplied without embeddings; the test
	    expects a non-empty selection and a recognised "ranking_method" value
	    in the diagnostics.
	    """
	    on_topic = {"text": "This section discusses liver toxicity and NOAEL values."}
	    off_topic = {"text": "Completely unrelated formulation text."}

	    ranked, diagnostics = hybrid_rank_text_items([on_topic, off_topic], query="NOAEL liver")

	    assert ranked
	    # NOTE(review): "hybrid_rrf" is also accepted although the test name says
	    # lexical only - presumably the pipeline may obtain embeddings on its own;
	    # confirm against the implementation.
	    accepted_methods = {"lexical_only", "hybrid_rrf"}
	    assert diagnostics["ranking_method"] in accepted_methods


	def test_hybrid_rank_text_items_with_embeddings():
	    """Check that supplying embeddings makes the ranker report "hybrid_rrf".

	    Three one-letter items are ranked with a float32 embedding per item and
	    a float32 query embedding; the selection must be non-empty.
	    """
	    candidates = [{"text": label} for label in ("A", "B", "C")]
	    candidate_vectors = np.array(
	        [
	            [1.0, 0.0],
	            [0.5, 0.5],
	            [0.0, 1.0],
	        ],
	        dtype=np.float32,
	    )
	    query_vector = np.array([1.0, 0.0], dtype=np.float32)

	    ranked, diagnostics = hybrid_rank_text_items(
	        candidates,
	        query="A",
	        item_embeddings=candidate_vectors,
	        query_embedding=query_vector,
	    )

	    assert ranked
	    assert diagnostics["ranking_method"] == "hybrid_rrf"