hchevva committed on
Commit
02835d5
·
verified ·
1 Parent(s): 01ce8ad

Upload 4 files

Browse files
core/sources/ctx.py CHANGED
@@ -60,9 +60,18 @@ def _extract_dtxsid_any(data: Any) -> Optional[str]:
60
 
61
  def _ctx_headers() -> Dict[str, str]:
62
  headers = {"accept": "application/json"}
63
- key = settings.ctx_api_key or os.getenv("CTX_API_KEY") or os.getenv("COMPTOX_API_KEY") or os.getenv("CTX_KEY")
 
 
 
 
 
 
 
64
  if key:
65
  headers["x-api-key"] = key
 
 
66
  return headers
67
 
68
 
@@ -88,6 +97,11 @@ async def _resolve_from_cas(cas: str, http: httpx.AsyncClient) -> Optional[str]:
88
  (f"/chemical/identifiers/by-cas/{quote(clean)}", None),
89
  (f"/chemical/identifiers/search/by-cas/{quote(clean)}", None),
90
  ("/chemical/identifiers", {"cas": clean}),
 
 
 
 
 
91
  ("/chemical/search/equal", {"word": clean}),
92
  ("/chemical/search/contains", {"word": clean}),
93
  ("/chemical/search", {"matchType": "equal", "word": clean}),
@@ -137,6 +151,10 @@ async def _resolve_from_name(name: str, http: httpx.AsyncClient) -> Optional[str
137
  (f"/chemical/identifiers/by-name/{quote(q)}", None),
138
  (f"/chemical/identifiers/search/by-name/{quote(q)}", None),
139
  ("/chemical/identifiers", {"name": q}),
 
 
 
 
140
  ("/chemical/search/equal", {"word": q}),
141
  ("/chemical/search/contains", {"word": q}),
142
  ("/chemical/search", {"matchType": "equal", "word": q}),
@@ -304,6 +322,25 @@ async def fetch_ctx_genetox(cas_or_query: str, http: httpx.AsyncClient) -> Dict[
304
  "dashboard_url": dashboard_details_url(found or q),
305
  }
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  return {
308
  "ok": False,
309
  "error": "No DTXSID found for this query.",
 
60
 
61
def _ctx_headers() -> Dict[str, str]:
    """Build the request headers for CTX (CompTox) API calls.

    The API key is taken from application settings first, then from a
    chain of environment-variable fallbacks. When a key is found it is
    attached under both a lowercase and a mixed-case header name, and a
    custom user-agent is added.
    """
    headers = {"accept": "application/json"}

    api_key = (
        settings.ctx_api_key
        or os.getenv("CTX_API_KEY")
        or os.getenv("COMPTOX_API_KEY")
        or os.getenv("CTX_KEY")
    )
    if isinstance(api_key, str):
        api_key = api_key.strip()

    if not api_key:
        # No key configured anywhere: send only the accept header.
        return headers

    # Key is sent under both spellings of the header name — presumably to
    # satisfy a case-sensitive gateway; NOTE(review): confirm this is needed.
    headers["x-api-key"] = api_key
    headers["X-Api-Key"] = api_key
    headers["user-agent"] = "toxrai-hf-demo"
    return headers
76
 
77
 
 
97
  (f"/chemical/identifiers/by-cas/{quote(clean)}", None),
98
  (f"/chemical/identifiers/search/by-cas/{quote(clean)}", None),
99
  ("/chemical/identifiers", {"cas": clean}),
100
+ ("/chemical/identifiers", {"casrn": clean}),
101
+ ("/chemical/identifiers/search", {"casrn": clean}),
102
+ ("/chemical/search", {"query": clean, "type": "equals"}),
103
+ ("/chemical/search", {"query": clean, "type": "contains"}),
104
+ ("/chemical/search", {"searchType": "equals", "query": clean}),
105
  ("/chemical/search/equal", {"word": clean}),
106
  ("/chemical/search/contains", {"word": clean}),
107
  ("/chemical/search", {"matchType": "equal", "word": clean}),
 
151
  (f"/chemical/identifiers/by-name/{quote(q)}", None),
152
  (f"/chemical/identifiers/search/by-name/{quote(q)}", None),
153
  ("/chemical/identifiers", {"name": q}),
154
+ ("/chemical/identifiers/search", {"name": q}),
155
+ ("/chemical/search", {"query": q, "type": "equals"}),
156
+ ("/chemical/search", {"query": q, "type": "contains"}),
157
+ ("/chemical/search", {"searchType": "equals", "query": q}),
158
  ("/chemical/search/equal", {"word": q}),
159
  ("/chemical/search/contains", {"word": q}),
160
  ("/chemical/search", {"matchType": "equal", "word": q}),
 
322
  "dashboard_url": dashboard_details_url(found or q),
323
  }
324
 
325
+ # Try one direct identifier call to surface CTX errors (auth, etc.)
326
+ try:
327
+ if is_cas(q):
328
+ await _ctx_get(f"/chemical/identifiers/by-cas/{quote(q)}", http)
329
+ else:
330
+ await _ctx_get(f"/chemical/identifiers/by-name/{quote(q)}", http)
331
+ except httpx.HTTPStatusError as e:
332
+ return {
333
+ "ok": False,
334
+ "error": f"CTX API error {e.response.status_code}: {e.response.text[:200]}",
335
+ "dashboard_search": dashboard_search_url(q),
336
+ }
337
+ except Exception as e:
338
+ return {
339
+ "ok": False,
340
+ "error": f"CTX request failed: {e}",
341
+ "dashboard_search": dashboard_search_url(q),
342
+ }
343
+
344
  return {
345
  "ok": False,
346
  "error": "No DTXSID found for this query.",
core/sources/fema.py CHANGED
@@ -1,5 +1,48 @@
1
  import os
2
- from urllib.parse import quote_plus
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  def fema_link(cas_or_query: str, name_query: str | None = None) -> dict:
@@ -15,15 +58,29 @@ def fema_link(cas_or_query: str, name_query: str | None = None) -> dict:
15
 
16
  # NOTE: domain spelling matters; the older '...materialssafety...' variant often 404s.
17
  base = os.getenv("FEMA_BASE_URL", "https://fragrancematerialsafetyresource.elsevier.com/")
18
- cas_param = os.getenv("FEMA_CAS_PARAM", "field_cas_tid_1")
19
- name_param = os.getenv("FEMA_NAME_PARAM", "field_chemical_synonym_tid")
20
 
21
  cas_value = quote_plus(q) if q else ""
22
  name_value = quote_plus(name_q or q)
23
 
24
- cas_url = f"{base}?{cas_param}={cas_value}&{name_param}=" if cas_value else ""
25
- name_url = f"{base}?{cas_param}=&{name_param}={name_value}" if name_value else ""
 
 
 
 
 
26
  # Generic search fallback (some deployments ignore filter params)
27
  search_term = name_q or q
28
  search_url = f"{base}search/node?keys={quote_plus(search_term)}" if search_term else ""
29
- return {"ok": True, "cas_url": cas_url, "name_url": name_url, "alt_url": search_url}
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ from urllib.parse import quote_plus, urljoin
3
+
4
+ import httpx
5
+ from bs4 import BeautifulSoup
6
+
7
_PARAM_CACHE: dict[str, tuple[str, str, str]] = {}  # base URL -> (cas_param, name_param, action)


def _discover_params(base: str) -> tuple[str, str, str]:
    """Discover the FEMA search form's query-parameter names and action URL.

    Starts from env-var (or hard-coded) defaults, then best-effort scrapes
    the page at *base*: the first ``<form>``'s ``action`` (resolved against
    *base*) replaces the default action, and each ``<input>`` whose name or
    placeholder mentions "cas" (or "synonym"/"chemical") overrides the CAS
    (or name) parameter default. Any network or parse failure is swallowed
    and the defaults are returned. Results are cached per base URL for the
    lifetime of the process.

    Args:
        base: Base URL of the FEMA resource site.

    Returns:
        ``(cas_param, name_param, action)`` — the CAS query-parameter name,
        the chemical-name query-parameter name, and the form submission URL.
    """
    if base in _PARAM_CACHE:
        return _PARAM_CACHE[base]

    cas_param = os.getenv("FEMA_CAS_PARAM", "field_cas_tid_1")
    name_param = os.getenv("FEMA_NAME_PARAM", "field_chemical_synonym_tid")
    action = os.getenv("FEMA_FORM_ACTION", base)

    try:
        resp = httpx.get(base, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
        if resp.status_code < 400:
            soup = BeautifulSoup(resp.text, "lxml")
            form = soup.find("form")
            # Read the action once instead of calling form.get() twice.
            form_action = form.get("action") if form else None
            if form_action:
                action = urljoin(base, form_action)
            for inp in soup.find_all("input"):
                name = (inp.get("name") or "").strip()
                if not name:
                    continue
                placeholder = (inp.get("placeholder") or "").lower()
                lower_name = name.lower()
                # "cas" in lower_name subsumes the redundant lower_name == "cas" check.
                if "cas" in placeholder or "cas" in lower_name:
                    cas_param = name
                if (
                    "synonym" in placeholder
                    or "chemical" in placeholder
                    or "synonym" in lower_name
                    or "chemical" in lower_name
                ):
                    name_param = name
    except Exception:
        # Best-effort discovery: on any failure, fall back to the defaults.
        pass

    _PARAM_CACHE[base] = (cas_param, name_param, action)
    return cas_param, name_param, action
46
 
47
 
48
  def fema_link(cas_or_query: str, name_query: str | None = None) -> dict:
 
58
 
59
  # NOTE: domain spelling matters; the older '...materialssafety...' variant often 404s.
60
  base = os.getenv("FEMA_BASE_URL", "https://fragrancematerialsafetyresource.elsevier.com/")
61
+ cas_param, name_param, action = _discover_params(base)
 
62
 
63
  cas_value = quote_plus(q) if q else ""
64
  name_value = quote_plus(name_q or q)
65
 
66
+ cas_url = f"{action}?{cas_param}={cas_value}&{name_param}=" if cas_value else ""
67
+ name_url = f"{action}?{cas_param}=&{name_param}={name_value}" if name_value else ""
68
+ combo_url = (
69
+ f"{action}?{cas_param}={cas_value}&{name_param}={name_value}"
70
+ if cas_value and name_value
71
+ else ""
72
+ )
73
  # Generic search fallback (some deployments ignore filter params)
74
  search_term = name_q or q
75
  search_url = f"{base}search/node?keys={quote_plus(search_term)}" if search_term else ""
76
+ search_api_url = (
77
+ f"{base}search/node?search_api_fulltext={quote_plus(search_term)}" if search_term else ""
78
+ )
79
+ return {
80
+ "ok": True,
81
+ "cas_url": cas_url,
82
+ "name_url": name_url,
83
+ "combo_url": combo_url,
84
+ "alt_url": search_url,
85
+ "search_api_url": search_api_url,
86
+ }
core/sources/ntp.py CHANGED
@@ -141,6 +141,8 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
141
 
142
  lines = index_html.splitlines()
143
  q_low = q.lower()
 
 
144
  results: List[Dict[str, Any]] = []
145
 
146
  for i, line in enumerate(lines):
@@ -149,9 +151,16 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
149
  continue
150
 
151
  snippet = " ".join(lines[i : min(i + 12, len(lines))])
 
152
  low_text = _strip_tags(snippet).lower()
153
- if q_low not in low_text:
154
- continue
 
 
 
 
 
 
155
 
156
  tr_id = m.group(1)
157
  hrefs = HREF_RE.findall(snippet)
@@ -178,6 +187,8 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
178
  pdf_url = urljoin(REPORTS_URL, pdf_match.group(1)) if pdf_match else None
179
 
180
  text_block = _strip_tags(snippet)
 
 
181
  title_match = re.search(
182
  r"TR-\d{3,}\s+PDF\s+(.*?)\s+\b(20\d{2}|19\d{2})\b\s+Technical Report",
183
  text_block,
 
141
 
142
  lines = index_html.splitlines()
143
  q_low = q.lower()
144
+ is_cas = bool(re.match(r"^\d{2,7}-\d{2}-\d$", q))
145
+ q_digits = re.sub(r"\D", "", q) if is_cas else ""
146
  results: List[Dict[str, Any]] = []
147
 
148
  for i, line in enumerate(lines):
 
151
  continue
152
 
153
  snippet = " ".join(lines[i : min(i + 12, len(lines))])
154
+ mini_snippet = " ".join(lines[i : min(i + 3, len(lines))])
155
  low_text = _strip_tags(snippet).lower()
156
+ mini_text = _strip_tags(mini_snippet)
157
+
158
+ if is_cas:
159
+ if q_digits not in re.sub(r"\D", "", mini_text):
160
+ continue
161
+ else:
162
+ if q_low not in low_text:
163
+ continue
164
 
165
  tr_id = m.group(1)
166
  hrefs = HREF_RE.findall(snippet)
 
187
  pdf_url = urljoin(REPORTS_URL, pdf_match.group(1)) if pdf_match else None
188
 
189
  text_block = _strip_tags(snippet)
190
+ if is_cas and q_digits not in re.sub(r"\D", "", text_block):
191
+ continue
192
  title_match = re.search(
193
  r"TR-\d{3,}\s+PDF\s+(.*?)\s+\b(20\d{2}|19\d{2})\b\s+Technical Report",
194
  text_block,
core/sources/pubchem.py CHANGED
@@ -1,4 +1,5 @@
1
  import html
 
2
  import re
3
  from typing import Any, Dict, List, Optional
4
  from urllib.parse import quote
@@ -9,6 +10,7 @@ PUBCHEM_REST = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
9
  PUBCHEM_VIEW = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
10
 
11
  CAS_RE = re.compile(r"^\d{2,7}-\d{2}-\d$")
 
12
 
13
 
14
  def is_cas(s: str) -> bool:
@@ -196,6 +198,14 @@ async def pubchem_by_query(query: str, http: httpx.AsyncClient) -> Dict[str, Any
196
  seen.add(key)
197
  uniq_haz.append(h)
198
 
 
 
 
 
 
 
 
 
199
  return {
200
  "ok": True,
201
  "query": q,
@@ -206,4 +216,5 @@ async def pubchem_by_query(query: str, http: httpx.AsyncClient) -> Dict[str, Any
206
  "url": _compound_url(cid),
207
  "synonyms": synonyms[:50],
208
  "hazards": uniq_haz,
 
209
  }
 
1
  import html
2
+ import json
3
  import re
4
  from typing import Any, Dict, List, Optional
5
  from urllib.parse import quote
 
10
  PUBCHEM_VIEW = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
11
 
12
  CAS_RE = re.compile(r"^\d{2,7}-\d{2}-\d$")
13
+ DTXSID_RE = re.compile(r"DTXSID\d{7,}")
14
 
15
 
16
  def is_cas(s: str) -> bool:
 
198
  seen.add(key)
199
  uniq_haz.append(h)
200
 
201
+ dtxsid = None
202
+ try:
203
+ m = DTXSID_RE.search(json.dumps(record_json))
204
+ if m:
205
+ dtxsid = m.group(0)
206
+ except Exception:
207
+ dtxsid = None
208
+
209
  return {
210
  "ok": True,
211
  "query": q,
 
216
  "url": _compound_url(cid),
217
  "synonyms": synonyms[:50],
218
  "hazards": uniq_haz,
219
+ "dtxsid": dtxsid,
220
  }