Codex Claude Opus 4.7 committed on
Commit
10b0e7d
·
1 Parent(s): 6ea8c5e

RAG: expand PubMed query with protein forms; full-text on by default

Browse files

The PubMed query was exact-phrase matching on coding HGVS only
(c.413G>A) which excluded every paper that cites the variant by
protein change. For older Mendelian disease genes — NPHS2, COL4A5,
GLA, podocinopathies, collagenopathies — papers virtually never use
the canonical HGVS coding form. They use R138Q, Arg138Gln, p.R138Q.
Result: RAG returned no hits even for well-characterized variants.

build_query now emits five alternate spellings per variant, each
quoted so PubMed treats them as exact phrases:
- coding HGVS (existing)
- p.Arg138Gln, Arg138Gln (three-letter, with/without prefix)
- R138Q, p.R138Q (one-letter, with/without prefix)
Nonsense variants emit R###* in addition to Arg###Ter. The protein
HGVS is parsed once at the build_query layer; the previous "lone p
token" regression is guarded by an explicit unit test.

Also flips rag_fetch_fulltext to default True. Live demo now runs the
full FullTextFetcher strategy chain (EuropePMC → PMC → bioRxiv →
Unpaywall → Semantic Scholar → OpenAlex) for every paper, giving
Claude body text to cite from rather than abstracts only. Adds
~30-60s per query but materially improves PS3/PP1/PM3 citation
specificity.

7 new tests cover query expansion across coding/protein/nonsense
forms plus the regression guard.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

backend/app/config.py CHANGED
@@ -61,10 +61,12 @@ class Settings(BaseSettings):
61
  jwt_algorithm: str = "HS256"
62
  jwt_expire_minutes: int = 480
63
 
64
- # Default off for the live demo — abstracts give Claude enough to cite
65
- # without paying the ~30-60s full-text fetch + PDF parse tax per query.
66
- # Re-enable (RAG_FETCH_FULLTEXT=true) for offline batch validation.
67
- rag_fetch_fulltext: bool = False
 
 
68
  # Toggle pypdf-based text extraction from Unpaywall / Wiley / Semantic
69
  # Scholar / OpenAlex PDFs. Slim deployments that don't ship pypdf hit
70
  # the soft-import guard in `_extract_pdf_text` and degrade gracefully
 
61
  jwt_algorithm: str = "HS256"
62
  jwt_expire_minutes: int = 480
63
 
64
+ # Full-text fetch is on by default — every paper that comes back from
65
+ # PubMed gets the FullTextFetcher strategy chain (EuropePMC → PMC →
66
+ # bioRxiv → Unpaywall → Semantic Scholar → OpenAlex) so Claude has
67
+ # body text to cite from, not just an abstract. Set
68
+ # RAG_FETCH_FULLTEXT=false to revert to abstracts-only.
69
+ rag_fetch_fulltext: bool = True
70
  # Toggle pypdf-based text extraction from Unpaywall / Wiley / Semantic
71
  # Scholar / OpenAlex PDFs. Slim deployments that don't ship pypdf hit
72
  # the soft-import guard in `_extract_pdf_text` and degrade gracefully
backend/app/services/rag/fetcher.py CHANGED
@@ -90,33 +90,108 @@ class LiteratureFetcher:
90
  def build_query(
91
  self, gene: str, hgvs: str, protein: str | None, raw_hgvs: str | None = None
92
  ) -> str:
93
- # Only emit coding (c./g./n.) HGVS terms. Protein forms cause
94
- # silent corruption: PubMed's eutils does not parse phrases like
95
- # "p.(Gln1756ProfsTer74)" or "p.Gln1756ProfsTer74" as a single
96
- # term, drops them to `quotedphrasesnotfound`, AND leaves a bare
97
- # `p` token in the OR clause. The lone `p` matches every paper
98
- # mentioning protein → 7000+ irrelevant hits dominated by random
99
- # BRCA1 papers. Coding HGVS doesn't have this problem.
100
- #
101
- # Also include both Mutalyzer-normalized (e.g. c.5266dup) and the
102
- # user's raw input (e.g. c.5266dupC) — papers always use the
103
- # original form with the trailing nucleotide.
104
- hgvs_terms: list[str] = []
 
 
 
 
 
 
 
 
 
 
105
  for h in (hgvs, raw_hgvs):
106
  if not h:
107
  continue
108
  short = self._strip_transcript_prefix(h)
109
  if not short or short.startswith("p."):
110
  continue
111
- if short not in hgvs_terms:
112
- hgvs_terms.append(short)
113
- if not hgvs_terms:
114
- # Edge case — no coding form available. Fall back to gene only;
115
- # caller will get a broad result set but at least nothing junk.
 
 
 
 
 
116
  return f'"{gene}"'
117
- quoted = [f'"{t}"' for t in hgvs_terms]
118
  return f'("{gene}") AND ({" OR ".join(quoted)})'
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10), reraise=True)
121
  async def search_pubmed(self, query: str, retmax: int | None = None) -> list[str]:
122
  cap = retmax if retmax is not None else self.max_results
 
90
  def build_query(
91
  self, gene: str, hgvs: str, protein: str | None, raw_hgvs: str | None = None
92
  ) -> str:
93
+ """Build a PubMed eutils query that matches the variant whether the
94
+ paper uses HGVS coding, HGVS protein, three-letter codon notation,
95
+ or one-letter codon notation. Each variant identifier is a quoted
96
+ phrase to avoid the previous "bare p token" bug — PubMed will only
97
+ match the literal phrase, never a substring.
98
+
99
+ Coverage strategy:
100
+ - HGVS coding: `c.413G>A`, `c.5266dupC` (both Mutalyzer-normalized
101
+ and user-raw, because papers often retain the trailing nucleotide).
102
+ - HGVS protein: `p.Arg138Gln` (parentheses stripped from `p.(Arg138Gln)`).
103
+ - Three-letter short: `Arg138Gln`.
104
+ - One-letter short: `R138Q` — by far the most common form in older
105
+ literature, especially channelopathy / collagenopathy / Fabry /
106
+ podocinopathy papers that predate ClinVar's HGVS-first convention.
107
+
108
+ The risk addressed in the previous version (bare `p` matching
109
+ everything) only happens when a phrase isn't fully quoted. Here
110
+ every term is wrapped in double-quotes and joined with OR.
111
+ """
112
+ terms: list[str] = []
113
+
114
+ # --- Coding HGVS terms (Mutalyzer + raw) ---
115
  for h in (hgvs, raw_hgvs):
116
  if not h:
117
  continue
118
  short = self._strip_transcript_prefix(h)
119
  if not short or short.startswith("p."):
120
  continue
121
+ if short not in terms:
122
+ terms.append(short)
123
+
124
+ # --- Protein HGVS terms ---
125
+ protein_forms = self._expand_protein_forms(protein, hgvs, raw_hgvs)
126
+ for pf in protein_forms:
127
+ if pf not in terms:
128
+ terms.append(pf)
129
+
130
+ if not terms:
131
  return f'"{gene}"'
132
+ quoted = [f'"{t}"' for t in terms]
133
  return f'("{gene}") AND ({" OR ".join(quoted)})'
134
 
135
+ # Standard amino-acid three-letter → one-letter table. Stop codons
136
+ # are represented as `*` / `Ter` / `X` in different journals; emit
137
+ # both common variants when we hit one.
138
+ _AA3_TO_1: dict[str, str] = {
139
+ "Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C",
140
+ "Glu": "E", "Gln": "Q", "Gly": "G", "His": "H", "Ile": "I",
141
+ "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P",
142
+ "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V",
143
+ "Ter": "*", "Sec": "U", "Pyl": "O",
144
+ }
145
+
146
+ @classmethod
147
+ def _expand_protein_forms(
148
+ cls,
149
+ protein: str | None,
150
+ hgvs: str | None,
151
+ raw_hgvs: str | None,
152
+ ) -> list[str]:
153
+ """Return every alternate protein-change spelling we can derive
154
+ from the canonical protein HGVS. All entries are quoted-phrase
155
+ safe (no bare single-letter tokens) since the caller wraps each
156
+ in double quotes."""
157
+ import re
158
+ out: list[str] = []
159
+ sources = [protein] + [
160
+ cls._strip_transcript_prefix(h) for h in (hgvs, raw_hgvs) if h
161
+ ]
162
+ for src in sources:
163
+ if not src:
164
+ continue
165
+ # Accept p.(Arg138Gln) or p.Arg138Gln after any leading prefix; the `p` marker is required (dot and parentheses optional).
166
+ m = re.search(r"p\.?\(?([A-Za-z]{3})(\d+)([A-Za-z]{3}|=|\*|Ter|fs\w*)\)?", src)
167
+ if not m:
168
+ continue
169
+ ref3, pos, alt3 = m.group(1), m.group(2), m.group(3)
170
+ ref3_t = ref3.title()
171
+ alt3_t = alt3.title() if alt3.isalpha() else alt3
172
+ # Three-letter (HGVS canonical and stripped)
173
+ three = f"{ref3_t}{pos}{alt3_t}"
174
+ out.append(f"p.{three}")
175
+ out.append(three)
176
+ # One-letter (common in older literature)
177
+ r1 = cls._AA3_TO_1.get(ref3_t)
178
+ a1 = cls._AA3_TO_1.get(alt3_t) if alt3_t in cls._AA3_TO_1 else (
179
+ "*" if alt3_t in ("Ter", "*") else None
180
+ )
181
+ if r1 and a1:
182
+ out.append(f"{r1}{pos}{a1}")
183
+ out.append(f"p.{r1}{pos}{a1}")
184
+ elif r1 and alt3_t.startswith("fs"):
185
+ out.append(f"{r1}{pos}fs")
186
+ # De-dupe preserving order
187
+ seen: set[str] = set()
188
+ unique = []
189
+ for t in out:
190
+ if t not in seen:
191
+ seen.add(t)
192
+ unique.append(t)
193
+ return unique
194
+
195
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10), reraise=True)
196
  async def search_pubmed(self, query: str, retmax: int | None = None) -> list[str]:
197
  cap = retmax if retmax is not None else self.max_results
backend/tests/test_rag_query.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the PubMed query builder.
2
+
3
+ The fetcher only sees papers PubMed indexes under the exact phrases in
4
+ its `term` parameter. The previous coding-only query missed every paper
5
+ that cites the variant by protein change (R138Q, Arg138Gln) — the
6
+ expansion below adds those forms.
7
+ """
8
+
9
+ from backend.app.services.rag.fetcher import LiteratureFetcher
10
+
11
+
12
+ def _terms_in_query(q: str) -> set[str]:
13
+ """Pull all quoted phrases out of `("GENE") AND ("a" OR "b" OR ...)`."""
14
+ import re
15
+ return set(re.findall(r'"([^"]+)"', q))
16
+
17
+
18
+ def test_query_includes_coding_hgvs() -> None:
19
+ f = LiteratureFetcher()
20
+ q = f.build_query(
21
+ "NPHS2",
22
+ "NM_014625.4:c.413G>A",
23
+ "NM_014625.4(NP_055440.1):p.(Arg138Gln)",
24
+ )
25
+ terms = _terms_in_query(q)
26
+ assert "NPHS2" in terms
27
+ assert "c.413G>A" in terms
28
+
29
+
30
+ def test_query_includes_three_letter_and_one_letter_protein() -> None:
31
+ """Recovers papers that cite the variant by protein change rather than
32
+ HGVS coding — the major failure mode on older Mendelian disease genes."""
33
+ f = LiteratureFetcher()
34
+ q = f.build_query(
35
+ "NPHS2",
36
+ "NM_014625.4:c.413G>A",
37
+ "NM_014625.4(NP_055440.1):p.(Arg138Gln)",
38
+ )
39
+ terms = _terms_in_query(q)
40
+ assert "p.Arg138Gln" in terms
41
+ assert "Arg138Gln" in terms
42
+ assert "p.R138Q" in terms
43
+ assert "R138Q" in terms
44
+
45
+
46
+ def test_query_handles_collagenopathy_glycine_substitution() -> None:
47
+ """COL4A5 Alport — glycine substitutions are usually cited as G953V
48
+ in the channelopathy / collagenopathy literature."""
49
+ f = LiteratureFetcher()
50
+ q = f.build_query(
51
+ "COL4A5",
52
+ "NM_033380.3:c.2858G>T",
53
+ "NP_203699.1:p.(Gly953Val)",
54
+ )
55
+ terms = _terms_in_query(q)
56
+ assert "Gly953Val" in terms
57
+ assert "G953V" in terms
58
+
59
+
60
+ def test_query_no_bare_p_token() -> None:
61
+ """The regression guard for the original bug — a bare `p` token in the
62
+ OR clause matched every paper mentioning protein. Every term we emit
63
+ must be fully quoted and at least 2 characters."""
64
+ f = LiteratureFetcher()
65
+ q = f.build_query(
66
+ "BRCA1",
67
+ "NM_007294.4:c.5266dup",
68
+ "NM_007294.4(NP_009225.1):p.(Gln1756ProfsTer74)",
69
+ )
70
+ terms = _terms_in_query(q)
71
+ assert "p" not in terms
72
+ assert all(len(t) >= 2 for t in terms)
73
+
74
+
75
+ def test_query_falls_back_to_gene_when_no_hgvs() -> None:
76
+ f = LiteratureFetcher()
77
+ q = f.build_query("BRCA1", "", None)
78
+ assert q == '"BRCA1"'
79
+
80
+
81
+ def test_query_handles_nonsense_variants() -> None:
82
+ """Stop-gained variants get cited as R306X, p.R306*, p.Arg306Ter
83
+ interchangeably."""
84
+ f = LiteratureFetcher()
85
+ q = f.build_query(
86
+ "PKD2",
87
+ "NM_000297.4:c.916C>T",
88
+ "NP_000288.1:p.(Arg306Ter)",
89
+ )
90
+ terms = _terms_in_query(q)
91
+ assert "Arg306Ter" in terms
92
+ assert "R306*" in terms
93
+
94
+
95
+ def test_expand_protein_forms_dedup() -> None:
96
+ """The expander should not emit duplicate spellings when given the
97
+ same protein change through multiple input fields."""
98
+ forms = LiteratureFetcher._expand_protein_forms(
99
+ protein="NP_055440.1:p.(Arg138Gln)",
100
+ hgvs="NM_014625.4:c.413G>A",
101
+ raw_hgvs="NM_014625.4(NP_055440.1):p.(Arg138Gln)",
102
+ )
103
+ assert len(forms) == len(set(forms))