| """ |
| Clinical negation detection — pure-Python rule-based, terminator-aware. |
| |
| Used as a post-rerank filter in `api/hybrid.py` to drop chunks where the |
| query's clinical concept appears in a negated context. A query about |
| "patient with suicidal ideation" should not retrieve a chunk that says |
| "patient denies suicidal ideation" as supporting evidence. |
| |
| We initially tried `scispacy` + `negspacy` (NegEx algorithm). It worked |
| on synthetic cases (5/5) but had a high false-positive rate on real |
| clinical chunks because the default NegEx scope detection over-extends |
| through conjunctions — e.g. "with no children now presenting with |
| recurrent depressive symptoms and active suicidal ideation" leaks the |
| "no children" trigger forward to flag the affirmed "suicidal ideation". |
| We switched to a tighter custom matcher with word-only scope terminators |
| (no commas, since list-style negations like "negative for X, Y, Z" are |
| common in clinical text and the comma is part of the list, not a scope |
| break). Test grid: 11/11 PASS, including the FP killer above. |
| |
| Approach (Chapman et al. 2001, simplified): |
| 1. Lowercase the chunk text once. |
| 2. For each negation trigger (longest first to handle multi-word |
| triggers like "no history of"), find every occurrence respecting |
| word boundaries. |
| 3. Scope = up to 100 chars after the trigger, truncated at the next |
| terminator: a logical pivot ("but", "however", "with", "despite"...) |
| or a sentence-ending punctuation (`.` `;`). Commas do NOT terminate |
| scope so list-style negations work. |
| 4. If any salient query term is a substring of that scope, return True. |
| |
| Latency: ~0.1 ms/chunk (regex-only, no NLP parse). Two orders of |
| magnitude faster than the negspacy path it replaced. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import re |
|
|
# Query tokenizer: maximal runs of ASCII letters. Digits, hyphens and any
# other character act as separators.
_TOKEN_RE = re.compile(r"[A-Za-z]+")


# Negation cues, matched as literal lowercase phrases against the lowercased
# chunk. Multi-word phrases are listed before the shorter cues they contain.
_TRIGGERS: tuple[str, ...] = (
    "no history of", "no evidence of", "no signs of", "no indication of",
    "denies", "denied", "denying",
    "ruled out", "rule out",
    "negative for",
    "without any", "without",
    "never had", "never",
    "not a", "not",
    "no",
)


# Scope terminators: a pivot word, a semicolon, or a full stop. Commas are
# deliberately absent so list-style negations ("negative for X, Y, Z") keep
# their whole list in scope. A "." sitting between two digits is a decimal
# point ("37.2", "2.5 cm"), not a sentence end, so it does not close the
# scope. The \b anchors keep "with" from firing inside "without".
_TERMINATORS = re.compile(
    r"\b(but|however|although|except|aside\s+from|despite|with)\b"
    r"|;|(?<!\d)\.|\.(?!\d)",
    re.IGNORECASE,
)


# Maximum number of characters after a trigger that can belong to its scope.
_SCOPE_CHAR_LIMIT = 100


# Lowercase words that are never treated as salient query terms: function
# words plus a few generic clinical words ("patient", "active").
_QUERY_STOPWORDS = frozenset({
    "the", "a", "an", "for", "with", "of", "in", "on", "to", "and", "or",
    "what", "does", "is", "are", "were", "was", "be", "been", "being",
    "patient", "patients", "any", "some", "this", "that", "these", "those",
    "do", "did", "doing", "have", "has", "had", "having", "from", "as",
    "at", "by", "about", "into", "through", "between", "active",
})
|
|
|
|
def salient_query_terms(query: str) -> list[str]:
    """Extract the query words that are worth testing for negation.

    The query is split into alphabetic tokens and lowercased. A token is
    kept only if it is at least four characters long and is not a query
    stopword. Shorter words are too generic to substring-match safely
    against chunk text, while typical clinical concepts (depression,
    suicidal, anxiety, ideation, psychotic) clear the four-character floor.

    Args:
        query: Free-text user query.

    Returns:
        The kept tokens, lowercased, de-duplicated, in order of first
        appearance.
    """
    lowered = (token.lower() for token in _TOKEN_RE.findall(query))
    salient = (
        word
        for word in lowered
        if len(word) >= 4 and word not in _QUERY_STOPWORDS
    )
    # dict keys keep insertion order, giving a first-occurrence de-dup.
    return list(dict.fromkeys(salient))
|
|
|
|
def is_negated(chunk_text: str, query_terms: list[str]) -> bool:
    """Return True iff any query term falls inside a negation scope.

    The chunk is lowercased and every run of whitespace (spaces, tabs, line
    breaks) is collapsed to a single space. This lets multi-word triggers
    such as "negative for" match when the source text was hard-wrapped or
    double-spaced between the words, and stops padding from eating into the
    scope budget. Each entry of ``_TRIGGERS`` is then located on word
    boundaries; its scope is the next ``_SCOPE_CHAR_LIMIT`` characters, cut
    at the first ``_TERMINATORS`` match. A term is negated when it is a
    substring of any such scope.

    Args:
        chunk_text: Raw chunk text to inspect.
        query_terms: Terms to look for, e.g. the output of
            ``salient_query_terms``. Blank entries are ignored and matching
            is case-insensitive.

    Returns:
        True if at least one term occurs inside a negation scope, otherwise
        False (including when either argument is empty).
    """
    if not chunk_text or not query_terms:
        return False
    # Normalise terms exactly like the text so multi-word terms line up.
    terms = [
        " ".join(t.lower().split())
        for t in query_terms
        if t and t.strip()
    ]
    if not terms:
        return False
    text = " ".join(chunk_text.lower().split())
    text_len = len(text)
    for trigger in _TRIGGERS:
        idx = 0
        while True:
            pos = text.find(trigger, idx)
            if pos < 0:
                break
            after = pos + len(trigger)
            # Skip hits embedded in a longer word ("no" inside "normal").
            starts_mid_word = pos > 0 and text[pos - 1].isalnum()
            ends_mid_word = after < text_len and text[after].isalnum()
            if starts_mid_word or ends_mid_word:
                idx = pos + 1
                continue
            window = text[after:after + _SCOPE_CHAR_LIMIT]
            terminator = _TERMINATORS.search(window)
            scope = window[:terminator.start()] if terminator else window
            if any(term in scope for term in terms):
                return True
            # Resume after this trigger; later occurrences get their own scope.
            idx = after
    return False
|
|