Spaces:
Sleeping
RAG: expand PubMed query with protein forms; full-text on by default
Browse files
The PubMed query was exact-phrase matching on coding HGVS only
(c.413G>A) which excluded every paper that cites the variant by
protein change. For older Mendelian disease genes — NPHS2, COL4A5,
GLA, podocinopathies, collagenopathies — papers virtually never use
the canonical HGVS coding form. They use R138Q, Arg138Gln, p.R138Q.
Result: RAG returned no hits even for well-characterized variants.
build_query now emits five alternate spellings per variant, each
quoted so PubMed treats them as exact phrases:
- coding HGVS (existing)
- p.Arg138Gln, Arg138Gln (three-letter, with/without prefix)
- R138Q, p.R138Q (one-letter, with/without prefix)
Nonsense variants emit R###* in addition to Arg###Ter. The protein
HGVS is parsed once at the build_query layer; the previous "lone p
token" regression is guarded by an explicit unit test.
Also flips rag_fetch_fulltext to default True. Live demo now runs the
full FullTextFetcher strategy chain (EuropePMC → PMC → bioRxiv →
Unpaywall → Semantic Scholar → OpenAlex) for every paper, giving
Claude body text to cite from rather than abstracts only. Adds
~30-60s per query but materially improves PS3/PP1/PM3 citation
specificity.
7 new tests cover query expansion across coding/protein/nonsense
forms plus the regression guard.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- backend/app/config.py +6 -4
- backend/app/services/rag/fetcher.py +93 -18
- backend/tests/test_rag_query.py +103 -0
|
@@ -61,10 +61,12 @@ class Settings(BaseSettings):
|
|
| 61 |
jwt_algorithm: str = "HS256"
|
| 62 |
jwt_expire_minutes: int = 480
|
| 63 |
|
| 64 |
-
#
|
| 65 |
-
#
|
| 66 |
-
#
|
| 67 |
-
|
|
|
|
|
|
|
| 68 |
# Toggle pypdf-based text extraction from Unpaywall / Wiley / Semantic
|
| 69 |
# Scholar / OpenAlex PDFs. Slim deployments that don't ship pypdf hit
|
| 70 |
# the soft-import guard in `_extract_pdf_text` and degrade gracefully
|
|
|
|
| 61 |
jwt_algorithm: str = "HS256"
|
| 62 |
jwt_expire_minutes: int = 480
|
| 63 |
|
| 64 |
+
# Full-text fetch is on by default — every paper that comes back from
|
| 65 |
+
# PubMed gets the FullTextFetcher strategy chain (EuropePMC → PMC →
|
| 66 |
+
# bioRxiv → Unpaywall → Semantic Scholar → OpenAlex) so Claude has
|
| 67 |
+
# body text to cite from, not just an abstract. Set
|
| 68 |
+
# RAG_FETCH_FULLTEXT=false to revert to abstracts-only.
|
| 69 |
+
rag_fetch_fulltext: bool = True
|
| 70 |
# Toggle pypdf-based text extraction from Unpaywall / Wiley / Semantic
|
| 71 |
# Scholar / OpenAlex PDFs. Slim deployments that don't ship pypdf hit
|
| 72 |
# the soft-import guard in `_extract_pdf_text` and degrade gracefully
|
|
@@ -90,33 +90,108 @@ class LiteratureFetcher:
|
|
| 90 |
def build_query(
|
| 91 |
self, gene: str, hgvs: str, protein: str | None, raw_hgvs: str | None = None
|
| 92 |
) -> str:
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
for h in (hgvs, raw_hgvs):
|
| 106 |
if not h:
|
| 107 |
continue
|
| 108 |
short = self._strip_transcript_prefix(h)
|
| 109 |
if not short or short.startswith("p."):
|
| 110 |
continue
|
| 111 |
-
if short not in
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
return f'"{gene}"'
|
| 117 |
-
quoted = [f'"{t}"' for t in
|
| 118 |
return f'("{gene}") AND ({" OR ".join(quoted)})'
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10), reraise=True)
|
| 121 |
async def search_pubmed(self, query: str, retmax: int | None = None) -> list[str]:
|
| 122 |
cap = retmax if retmax is not None else self.max_results
|
|
|
|
| 90 |
def build_query(
|
| 91 |
self, gene: str, hgvs: str, protein: str | None, raw_hgvs: str | None = None
|
| 92 |
) -> str:
|
| 93 |
+
"""Build a PubMed eutils query that matches the variant whether the
|
| 94 |
+
paper uses HGVS coding, HGVS protein, three-letter codon notation,
|
| 95 |
+
or one-letter codon notation. Each variant identifier is a quoted
|
| 96 |
+
phrase to avoid the previous "bare p token" bug — PubMed will only
|
| 97 |
+
match the literal phrase, never a substring.
|
| 98 |
+
|
| 99 |
+
Coverage strategy:
|
| 100 |
+
- HGVS coding: `c.413G>A`, `c.5266dupC` (both Mutalyzer-normalized
|
| 101 |
+
and user-raw, because papers often retain the trailing nucleotide).
|
| 102 |
+
- HGVS protein: `p.Arg138Gln`, `p.Arg138Gln` (paren-stripped).
|
| 103 |
+
- Three-letter short: `Arg138Gln`.
|
| 104 |
+
- One-letter short: `R138Q` — by far the most common form in older
|
| 105 |
+
literature, especially channelopathy / collagenopathy / Fabry /
|
| 106 |
+
podocinopathy papers that predate ClinVar's HGVS-first convention.
|
| 107 |
+
|
| 108 |
+
The risk addressed in the previous version (bare `p` matching
|
| 109 |
+
everything) only happens when a phrase isn't fully quoted. Here
|
| 110 |
+
every term is wrapped in double-quotes and joined with OR.
|
| 111 |
+
"""
|
| 112 |
+
terms: list[str] = []
|
| 113 |
+
|
| 114 |
+
# --- Coding HGVS terms (Mutalyzer + raw) ---
|
| 115 |
for h in (hgvs, raw_hgvs):
|
| 116 |
if not h:
|
| 117 |
continue
|
| 118 |
short = self._strip_transcript_prefix(h)
|
| 119 |
if not short or short.startswith("p."):
|
| 120 |
continue
|
| 121 |
+
if short not in terms:
|
| 122 |
+
terms.append(short)
|
| 123 |
+
|
| 124 |
+
# --- Protein HGVS terms ---
|
| 125 |
+
protein_forms = self._expand_protein_forms(protein, hgvs, raw_hgvs)
|
| 126 |
+
for pf in protein_forms:
|
| 127 |
+
if pf not in terms:
|
| 128 |
+
terms.append(pf)
|
| 129 |
+
|
| 130 |
+
if not terms:
|
| 131 |
return f'"{gene}"'
|
| 132 |
+
quoted = [f'"{t}"' for t in terms]
|
| 133 |
return f'("{gene}") AND ({" OR ".join(quoted)})'
|
| 134 |
|
| 135 |
+
# Standard amino-acid three-letter → one-letter table. Stop codons
|
| 136 |
+
# are represented as `*` / `Ter` / `X` in different journals; emit
|
| 137 |
+
# both common variants when we hit one.
|
| 138 |
+
_AA3_TO_1: dict[str, str] = {
|
| 139 |
+
"Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C",
|
| 140 |
+
"Glu": "E", "Gln": "Q", "Gly": "G", "His": "H", "Ile": "I",
|
| 141 |
+
"Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P",
|
| 142 |
+
"Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V",
|
| 143 |
+
"Ter": "*", "Sec": "U", "Pyl": "O",
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
@classmethod
|
| 147 |
+
def _expand_protein_forms(
|
| 148 |
+
cls,
|
| 149 |
+
protein: str | None,
|
| 150 |
+
hgvs: str | None,
|
| 151 |
+
raw_hgvs: str | None,
|
| 152 |
+
) -> list[str]:
|
| 153 |
+
"""Return every alternate protein-change spelling we can derive
|
| 154 |
+
from the canonical protein HGVS. All entries are quoted-phrase
|
| 155 |
+
safe (no bare single-letter tokens) since the caller wraps each
|
| 156 |
+
in double quotes."""
|
| 157 |
+
import re
|
| 158 |
+
out: list[str] = []
|
| 159 |
+
sources = [protein] + [
|
| 160 |
+
cls._strip_transcript_prefix(h) for h in (hgvs, raw_hgvs) if h
|
| 161 |
+
]
|
| 162 |
+
for src in sources:
|
| 163 |
+
if not src:
|
| 164 |
+
continue
|
| 165 |
+
# Accept p.(Arg138Gln), p.Arg138Gln, Arg138Gln (any leading prefix).
|
| 166 |
+
m = re.search(r"p\.?\(?([A-Za-z]{3})(\d+)([A-Za-z]{3}|=|\*|Ter|fs\w*)\)?", src)
|
| 167 |
+
if not m:
|
| 168 |
+
continue
|
| 169 |
+
ref3, pos, alt3 = m.group(1), m.group(2), m.group(3)
|
| 170 |
+
ref3_t = ref3.title()
|
| 171 |
+
alt3_t = alt3.title() if alt3.isalpha() else alt3
|
| 172 |
+
# Three-letter (HGVS canonical and stripped)
|
| 173 |
+
three = f"{ref3_t}{pos}{alt3_t}"
|
| 174 |
+
out.append(f"p.{three}")
|
| 175 |
+
out.append(three)
|
| 176 |
+
# One-letter (common in older literature)
|
| 177 |
+
r1 = cls._AA3_TO_1.get(ref3_t)
|
| 178 |
+
a1 = cls._AA3_TO_1.get(alt3_t) if alt3_t in cls._AA3_TO_1 else (
|
| 179 |
+
"*" if alt3_t in ("Ter", "*") else None
|
| 180 |
+
)
|
| 181 |
+
if r1 and a1:
|
| 182 |
+
out.append(f"{r1}{pos}{a1}")
|
| 183 |
+
out.append(f"p.{r1}{pos}{a1}")
|
| 184 |
+
elif r1 and alt3_t.startswith("fs"):
|
| 185 |
+
out.append(f"{r1}{pos}fs")
|
| 186 |
+
# De-dupe preserving order
|
| 187 |
+
seen: set[str] = set()
|
| 188 |
+
unique = []
|
| 189 |
+
for t in out:
|
| 190 |
+
if t not in seen:
|
| 191 |
+
seen.add(t)
|
| 192 |
+
unique.append(t)
|
| 193 |
+
return unique
|
| 194 |
+
|
| 195 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10), reraise=True)
|
| 196 |
async def search_pubmed(self, query: str, retmax: int | None = None) -> list[str]:
|
| 197 |
cap = retmax if retmax is not None else self.max_results
|
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the PubMed query builder.
|
| 2 |
+
|
| 3 |
+
The fetcher only sees papers PubMed indexes under the exact phrases in
|
| 4 |
+
its `term` parameter. The previous coding-only query missed every paper
|
| 5 |
+
that cites the variant by protein change (R138Q, Arg138Gln) — the
|
| 6 |
+
expansion below adds those forms.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from backend.app.services.rag.fetcher import LiteratureFetcher
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _terms_in_query(q: str) -> set[str]:
|
| 13 |
+
"""Pull all quoted phrases out of `(GENE) AND ("a" OR "b" OR ...)`."""
|
| 14 |
+
import re
|
| 15 |
+
return set(re.findall(r'"([^"]+)"', q))
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_query_includes_coding_hgvs() -> None:
    """The gene symbol and the prefix-stripped coding HGVS are quoted terms."""
    fetcher = LiteratureFetcher()
    query = fetcher.build_query(
        gene="NPHS2",
        hgvs="NM_014625.4:c.413G>A",
        protein="NM_014625.4(NP_055440.1):p.(Arg138Gln)",
    )
    phrases = _terms_in_query(query)
    for expected in ("NPHS2", "c.413G>A"):
        assert expected in phrases
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def test_query_includes_three_letter_and_one_letter_protein() -> None:
    """A missense change appears in all four protein spellings.

    Papers on older Mendelian disease genes tend to cite the protein
    change rather than the coding HGVS, so the query must carry the
    three-letter and one-letter forms, each with and without ``p.``.
    """
    fetcher = LiteratureFetcher()
    query = fetcher.build_query(
        gene="NPHS2",
        hgvs="NM_014625.4:c.413G>A",
        protein="NM_014625.4(NP_055440.1):p.(Arg138Gln)",
    )
    phrases = _terms_in_query(query)
    for expected in ("p.Arg138Gln", "Arg138Gln", "p.R138Q", "R138Q"):
        assert expected in phrases
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def test_query_handles_collagenopathy_glycine_substitution() -> None:
    """A COL4A5 (Alport) glycine substitution yields ``Gly953Val`` and ``G953V``.

    The one-letter form is the spelling usually seen in the
    collagenopathy literature.
    """
    fetcher = LiteratureFetcher()
    query = fetcher.build_query(
        gene="COL4A5",
        hgvs="NM_033380.3:c.2858G>T",
        protein="NP_203699.1:p.(Gly953Val)",
    )
    phrases = _terms_in_query(query)
    for expected in ("Gly953Val", "G953V"):
        assert expected in phrases
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def test_query_no_bare_p_token() -> None:
    """Regression guard: no quoted term may be a bare ``p``.

    A lone ``p`` in the OR clause once matched far too many papers. Using
    a frameshift input, check that ``p`` is absent and that every quoted
    term is at least two characters long.
    """
    fetcher = LiteratureFetcher()
    query = fetcher.build_query(
        gene="BRCA1",
        hgvs="NM_007294.4:c.5266dup",
        protein="NM_007294.4(NP_009225.1):p.(Gln1756ProfsTer74)",
    )
    phrases = _terms_in_query(query)
    assert "p" not in phrases
    too_short = [phrase for phrase in phrases if len(phrase) < 2]
    assert not too_short
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def test_query_falls_back_to_gene_when_no_hgvs() -> None:
    """With an empty HGVS and no protein, the query is just the quoted gene."""
    fetcher = LiteratureFetcher()
    query = fetcher.build_query(gene="BRCA1", hgvs="", protein=None)
    assert query == '"BRCA1"'
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def test_query_handles_nonsense_variants() -> None:
    """A stop-gained change yields both ``Arg306Ter`` and ``R306*``.

    The literature cites such variants interchangeably as R306X,
    p.R306* or p.Arg306Ter.
    """
    fetcher = LiteratureFetcher()
    query = fetcher.build_query(
        gene="PKD2",
        hgvs="NM_000297.4:c.916C>T",
        protein="NP_000288.1:p.(Arg306Ter)",
    )
    phrases = _terms_in_query(query)
    for expected in ("Arg306Ter", "R306*"):
        assert expected in phrases
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def test_expand_protein_forms_dedup() -> None:
    """The same protein change given through two fields yields no duplicates."""
    spellings = LiteratureFetcher._expand_protein_forms(
        protein="NP_055440.1:p.(Arg138Gln)",
        hgvs="NM_014625.4:c.413G>A",
        raw_hgvs="NM_014625.4(NP_055440.1):p.(Arg138Gln)",
    )
    distinct = set(spellings)
    assert len(distinct) == len(spellings)
|