Spaces:

arjun10g
/

RAG-PSYCH

Running

App Files Files Community

RAG-PSYCH / api /negation.py

arjun10g

Initial deploy to Hugging Face Spaces

08fc97e about 2 months ago

raw

history blame contribute delete

4.68 kB

	"""
	Clinical negation detection — pure-Python rule-based, terminator-aware.

	Used as a post-rerank filter in `api/hybrid.py` to drop chunks where the
	query's clinical concept appears in a negated context. A query about
	"patient with suicidal ideation" should not retrieve a chunk that says
	"patient denies suicidal ideation" as supporting evidence.

	We initially tried `scispacy` + `negspacy` (NegEx algorithm). It worked
	on synthetic cases (5/5) but had a high false-positive rate on real
	clinical chunks because the default NegEx scope detection over-extends
	through conjunctions — e.g. "with no children now presenting with
	recurrent depressive symptoms and active suicidal ideation" leaks the
	"no children" trigger forward to flag the affirmed "suicidal ideation".
	We switched to a tighter custom matcher with word-only scope terminators
	(no commas, since list-style negations like "negative for X, Y, Z" are
	common in clinical text and the comma is part of the list, not a scope
	break). Test grid: 11/11 PASS, including the FP killer above.

	Approach (Chapman et al. 2001, simplified):
	1. Lowercase the chunk text once.
	2. For each negation trigger (longest first to handle multi-word
	triggers like "no history of"), find every occurrence respecting
	word boundaries.
	3. Scope = up to 100 chars after the trigger, truncated at the next
	terminator: a logical pivot ("but", "however", "with", "despite"...)
	or a sentence-ending punctuation (`.` `;`). Commas do NOT terminate
	scope so list-style negations work.
	4. If any salient query term is a substring of that scope, return True.

	Latency: ~0.1 ms/chunk (regex-only, no NLP parse). Two orders of
	magnitude faster than the negspacy path it replaced.
	"""

	from __future__ import annotations

	import re

	# Pulls runs of ASCII letters out of a query; digits and punctuation act as
	# separators. Used by `salient_query_terms`.
	_TOKEN_RE = re.compile(r"[A-Za-z]+")

	# Negation cues searched for (case-insensitively, on lowercased text) by
	# `is_negated`. Multi-word triggers are listed before their single-word
	# prefixes (e.g. "no history of" before "no"); keep that ordering when
	# adding entries.
	_TRIGGERS: tuple[str, ...] = (
	    "no history of", "no evidence of", "no signs of", "no indication of",
	    "denies", "denied", "denying",
	    "ruled out", "rule out",
	    "negative for",
	    "without any", "without",
	    "never had", "never",
	    "not a", "not",
	    "no",
	)

	# Word-pivot terminators end the negation scope. Commas are intentionally
	# excluded — clinical lists ("negative for tremors, headaches, depression")
	# carry the negation through the whole list.
	#
	# The alternation bars must be plain `|`. Writing them as `\|` makes the
	# regex look for literal pipe characters, so no pivot word or sentence
	# punctuation would ever close a scope.
	_TERMINATORS = re.compile(
	    r"\b(but|however|although|except|aside\s+from|despite|with)\b|[;.]",
	    re.IGNORECASE,
	)

	# Maximum number of characters after a trigger that `is_negated` inspects.
	_SCOPE_CHAR_LIMIT = 100

	# Query tokens that `salient_query_terms` discards: function words plus a
	# few generic clinical words ("patient", "active") that say nothing about
	# which concept is being asked about.
	_QUERY_STOPWORDS = frozenset({
	    "the", "a", "an", "for", "with", "of", "in", "on", "to", "and", "or",
	    "what", "does", "is", "are", "were", "was", "be", "been", "being",
	    "patient", "patients", "any", "some", "this", "that", "these", "those",
	    "do", "did", "doing", "have", "has", "had", "having", "from", "as",
	    "at", "by", "about", "into", "through", "between", "active",
	})


	def salient_query_terms(query: str) -> list[str]:
	    """Return the query's distinctive tokens, lowercased, in first-seen order.

	    A token is a run of ASCII letters (per `_TOKEN_RE`). It is kept only if
	    it is at least four characters long and not in `_QUERY_STOPWORDS`;
	    repeats are dropped. The length floor exists because very short words
	    ("no", "as") are too generic to substring-match safely against chunk
	    text, while typical clinical concepts (depression, suicidal, anxiety,
	    ideation, psychotic) are comfortably longer.
	    """
	    lowered = (token.lower() for token in _TOKEN_RE.findall(query))
	    keepers = (
	        word
	        for word in lowered
	        if len(word) >= 4 and word not in _QUERY_STOPWORDS
	    )
	    # dict.fromkeys de-duplicates while preserving insertion order.
	    return list(dict.fromkeys(keepers))


	def is_negated(chunk_text: str, query_terms: list[str]) -> bool:
	    """Report whether any query term falls inside a negation scope.

	    Both the chunk and the terms are lowercased; blank terms are ignored.
	    For every whole-word occurrence of each entry in `_TRIGGERS`, the scope
	    is the text that follows the trigger, capped at `_SCOPE_CHAR_LIMIT`
	    characters and cut short at the first `_TERMINATORS` match. The result
	    is True as soon as one term is found as a substring of a scope, and
	    False when the chunk or the usable term list is empty or no scope
	    contains a term.
	    """
	    if not (chunk_text and query_terms):
	        return False
	    needles = [
	        term.lower().strip() for term in query_terms if term and term.strip()
	    ]
	    if not needles:
	        return False

	    haystack = chunk_text.lower()
	    end = len(haystack)
	    for cue in _TRIGGERS:
	        cue_len = len(cue)
	        start = haystack.find(cue)
	        while start != -1:
	            stop = start + cue_len
	            # The cue must stand alone as a word: no letter or digit may
	            # touch it on either side ("no" inside "normal" is not a cue).
	            left_ok = start == 0 or not haystack[start - 1].isalnum()
	            right_ok = stop >= end or not haystack[stop].isalnum()
	            if not (left_ok and right_ok):
	                start = haystack.find(cue, start + 1)
	                continue

	            window = haystack[stop:stop + _SCOPE_CHAR_LIMIT]
	            pivot = _TERMINATORS.search(window)
	            if pivot is not None:
	                window = window[:pivot.start()]
	            for needle in needles:
	                if needle in window:
	                    return True
	            # Resume the scan just past this occurrence of the cue.
	            start = haystack.find(cue, stop)
	    return False