hchevva committed on
Commit
c3e44ad
·
verified ·
1 Parent(s): 426090f

Update core/sources/cdc.py

Browse files
Files changed (1) hide show
  1. core/sources/cdc.py +71 -12
core/sources/cdc.py CHANGED
@@ -1,9 +1,22 @@
1
- """CDC / ATSDR ToxProfiles lookup (offline list)."""
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
5
  import re
6
- from typing import Dict, List
7
 
8
  # Each entry: {"name": str, "cas": str, "url": str}
9
  TOXPROFILES: List[Dict[str, str]] = [
@@ -258,21 +271,67 @@ TOXPROFILES: List[Dict[str, str]] = [
258
 
259
# Shape of a CAS Registry Number: 2-7 digits, 2 digits, then 1 check digit.
# re.ASCII restricts \d to 0-9; without it \d also matches non-ASCII decimal
# digits, and such text would be routed to the exact-CAS lookup by mistake.
_CAS_RE = re.compile(r"^\d{2,7}-\d{2}-\d$", re.ASCII)


def is_cas(s: str) -> bool:
    """Return True when *s* is shaped like a CAS Registry Number.

    Surrounding whitespace is ignored. Only the ``digits-digits-digit``
    pattern is checked; the check digit itself is not validated.

    Args:
        s: Candidate text. ``None`` or any non-string value yields False
            instead of raising.

    Returns:
        True if the stripped text matches the CAS pattern, else False.
    """
    if not isinstance(s, str):
        return False
    return _CAS_RE.match(s.strip()) is not None
264
 
265
 
266
def search(query: str, *, limit: int = 8) -> Dict[str, object]:
    """Look up ToxProfile entries by CAS number or by name fragment.

    A CAS-shaped query selects entries whose ``"cas"`` equals it exactly;
    any other query selects entries whose ``"name"`` contains it, compared
    case-insensitively.

    Args:
        query: CAS number or (part of) a substance name.
        limit: Maximum number of entries returned in ``"matches"``.

    Returns:
        ``{"ok": False, "error": "empty query", "matches": []}`` for a blank
        query; otherwise ``{"ok": True, "query", "matches", "total"}`` where
        ``"total"`` counts all hits before ``limit`` is applied.
    """
    needle = (query or "").strip()
    if not needle:
        return {"ok": False, "error": "empty query", "matches": []}

    if is_cas(needle):
        def wanted(entry: Dict[str, str]) -> bool:
            return entry.get("cas") == needle
    else:
        folded = needle.casefold()

        def wanted(entry: Dict[str, str]) -> bool:
            return folded in (entry.get("name") or "").casefold()

    hits = list(filter(wanted, TOXPROFILES))
    cap = max(0, int(limit))
    return {"ok": True, "query": needle, "matches": hits[:cap], "total": len(hits)}
 
 
 
 
 
 
 
 
1
+ """CDC/ATSDR ToxProfiles local index + search.
2
+
3
+ Why local?
4
+ - CDC ToxProfiles don't have a simple public search API.
5
+ - Production code typically uses a prebuilt index (CAS/name -> URL).
6
+
7
+ This file mirrors that approach: you maintain TOXPROFILES as a list of dicts.
8
+
9
+ Each item MUST look like:
10
+ {"name": "Acetone", "cas": "67-64-1", "url": "https://wwwn.cdc.gov/TSP/ToxProfiles/ToxProfiles.aspx?id=5&tid=1"}
11
+
12
+ Return shape is stable for Gradio rendering:
13
+ {"ok": True, "query": "...", "cas": "...", "matches": [...], "total": N}
14
+ """
15
 
16
  from __future__ import annotations
17
 
18
  import re
19
+ from typing import Any, Dict, List, Optional
20
 
21
  # Each entry: {"name": str, "cas": str, "url": str}
22
  TOXPROFILES: List[Dict[str, str]] = [
 
271
 
272
# Shape of a CAS Registry Number: 2-7 digits, 2 digits, then 1 check digit.
# re.ASCII restricts \d to 0-9; without it \d also matches non-ASCII decimal
# digits, and such text would be routed to the exact-CAS lookup by mistake.
_CAS_RE = re.compile(r"^\d{2,7}-\d{2}-\d$", re.ASCII)


def _norm(s: Optional[str]) -> str:
    """Return *s* trimmed, whitespace-collapsed and case-folded.

    Used on both sides of name comparisons so that differences in case or
    spacing do not prevent a match. ``None`` normalizes to ``""``.
    """
    # casefold() is the caseless-matching form of lower(): identical for
    # ASCII names, but it also equates forms such as "ß" and "ss".
    return re.sub(r"\s+", " ", (s or "").strip()).casefold()


def is_cas(s: Optional[str]) -> bool:
    """Return True when *s* is shaped like a CAS Registry Number.

    Surrounding whitespace is ignored. Only the ``digits-digits-digit``
    pattern is checked; the check digit itself is not validated.

    Args:
        s: Candidate text. ``None`` or any non-string value yields False
            instead of raising.

    Returns:
        True if the stripped text matches the CAS pattern, else False.
    """
    if not isinstance(s, str):
        return False
    return _CAS_RE.match(s.strip()) is not None
279
 
280
 
281
def _as_match(item: Dict[str, str], cas_default: str = "") -> Dict[str, str]:
    """Copy one index entry into the result shape, filling missing fields."""
    return {
        "name": item.get("name") or "ToxProfile",
        "cas": item.get("cas") or cas_default,
        "url": item.get("url") or "",
    }


def search(
    query: str,
    *,
    cas: Optional[str] = None,
    limit: int = 8,
) -> Dict[str, object]:
    """Search the local toxprofile index.

    Behavior:
      1) If `cas` is provided and CAS-like, use exact CAS matches.
      2) Else if `query` is CAS-like, use exact CAS matches.
      3) If no CAS matches, fall back to a case-insensitive,
         whitespace-normalized substring match on the entry name.

    Entries whose normalized name equals the normalized query are listed
    first; the remaining entries keep their index order.

    Args:
        query: Original user query (CAS or name).
        cas: Resolved CAS (preferred) if you have it. Keyword-only and
            optional, so existing ``search(query, limit=...)`` calls still work.
        limit: Max results returned in ``"matches"``.

    Returns:
        ``{"ok": True, "query": str, "cas": str, "total": int, "matches": list}``.
        ``"total"`` counts every hit before ``limit`` is applied. ``"cas"`` is
        the CAS key used for the lookup, or the stripped `cas` argument when
        no CAS-like key was available.
    """
    q = (query or "").strip()
    cas_q = (cas or "").strip()
    qn = _norm(q)

    # CAS-first: an explicitly resolved CAS beats a CAS-looking query.
    cas_key = cas_q if is_cas(cas_q) else (q if is_cas(q) else "")

    matches: List[Dict[str, str]] = []
    if cas_key:
        matches = [
            _as_match(item, cas_key)
            for item in TOXPROFILES
            if (item.get("cas") or "").strip() == cas_key
        ]

    # Name fallback when no CAS key was available or it matched nothing.
    if not matches and qn:
        matches = [
            _as_match(item)
            for item in TOXPROFILES
            if qn in _norm(item.get("name") or "")
        ]

    # Deterministic order: exact name matches first. list.sort is stable, so
    # ties keep index order (False sorts before True).
    if qn:
        matches.sort(key=lambda m: _norm(m["name"]) != qn)

    return {
        "ok": True,
        "query": q,
        "cas": cas_key or cas_q,
        "total": len(matches),
        "matches": matches[: max(0, int(limit))],
    }