""" Clinical negation detection — pure-Python rule-based, terminator-aware. Used as a post-rerank filter in `api/hybrid.py` to drop chunks where the query's clinical concept appears in a negated context. A query about "patient with suicidal ideation" should not retrieve a chunk that says "patient denies suicidal ideation" as supporting evidence. We initially tried `scispacy` + `negspacy` (NegEx algorithm). It worked on synthetic cases (5/5) but had a high false-positive rate on real clinical chunks because the default NegEx scope detection over-extends through conjunctions — e.g. "with no children now presenting with recurrent depressive symptoms and active suicidal ideation" leaks the "no children" trigger forward to flag the affirmed "suicidal ideation". We switched to a tighter custom matcher with word-only scope terminators (no commas, since list-style negations like "negative for X, Y, Z" are common in clinical text and the comma is part of the list, not a scope break). Test grid: 11/11 PASS, including the FP killer above. Approach (Chapman et al. 2001, simplified): 1. Lowercase the chunk text once. 2. For each negation trigger (longest first to handle multi-word triggers like "no history of"), find every occurrence respecting word boundaries. 3. Scope = up to 100 chars after the trigger, truncated at the next terminator: a logical pivot ("but", "however", "with", "despite"...) or a sentence-ending punctuation (`.` `;`). Commas do NOT terminate scope so list-style negations work. 4. If any salient query term is a substring of that scope, return True. Latency: ~0.1 ms/chunk (regex-only, no NLP parse). Two orders of magnitude faster than the negspacy path it replaced. """ from __future__ import annotations import re _TOKEN_RE = re.compile(r"[A-Za-z]+") # Multi-word triggers must come before their single-word prefixes # (e.g. "no history of" before "no") so the longest match wins. _TRIGGERS: tuple[str, ...] = ( "no history of", "no evidence of", "no signs of", "no indication of", "denies", "denied", "denying", "ruled out", "rule out", "negative for", "without any", "without", "never had", "never", "not a", "not", "no", ) # Word-pivot terminators end the negation scope. Commas are intentionally # excluded — clinical lists ("negative for tremors, headaches, depression") # carry the negation through the whole list. _TERMINATORS = re.compile( r"\b(but|however|although|except|aside\s+from|despite|with)\b|[;.]", re.IGNORECASE, ) _SCOPE_CHAR_LIMIT = 100 _QUERY_STOPWORDS = frozenset({ "the", "a", "an", "for", "with", "of", "in", "on", "to", "and", "or", "what", "does", "is", "are", "were", "was", "be", "been", "being", "patient", "patients", "any", "some", "this", "that", "these", "those", "do", "did", "doing", "have", "has", "had", "having", "from", "as", "at", "by", "about", "into", "through", "between", "active", }) def salient_query_terms(query: str) -> list[str]: """Tokens worth checking for negation: ≥4-char alphabetic, non-stopword. Short words ("no", "as") are too generic to safely substring-match against chunk text. Most clinical concepts of interest (depression, suicidal, anxiety, ideation, psychotic) are well above the 4-char floor. """ out: list[str] = [] seen: set[str] = set() for raw in _TOKEN_RE.findall(query): low = raw.lower() if len(low) < 4 or low in _QUERY_STOPWORDS or low in seen: continue out.append(low) seen.add(low) return out def is_negated(chunk_text: str, query_terms: list[str]) -> bool: """True iff any salient query term appears within a negation scope.""" if not chunk_text or not query_terms: return False terms = [t.lower().strip() for t in query_terms if t and t.strip()] if not terms: return False text = chunk_text.lower() for trigger in _TRIGGERS: idx = 0 while True: pos = text.find(trigger, idx) if pos < 0: break if pos > 0 and text[pos - 1].isalnum(): idx = pos + 1 continue after = pos + len(trigger) if after < len(text) and text[after].isalnum(): idx = pos + 1 continue scope_text = text[after:after + _SCOPE_CHAR_LIMIT] terminator = _TERMINATORS.search(scope_text) scope = scope_text[:terminator.start()] if terminator else scope_text if any(t in scope for t in terms): return True idx = after return False