Upload 4 files
Browse files
- core/sources/ctx.py +38 -1
- core/sources/fema.py +63 -6
- core/sources/ntp.py +13 -2
- core/sources/pubchem.py +11 -0
core/sources/ctx.py
CHANGED
|
@@ -60,9 +60,18 @@ def _extract_dtxsid_any(data: Any) -> Optional[str]:
|
|
| 60 |
|
| 61 |
def _ctx_headers() -> Dict[str, str]:
|
| 62 |
headers = {"accept": "application/json"}
|
| 63 |
-
key =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
if key:
|
| 65 |
headers["x-api-key"] = key
|
|
|
|
|
|
|
| 66 |
return headers
|
| 67 |
|
| 68 |
|
|
@@ -88,6 +97,11 @@ async def _resolve_from_cas(cas: str, http: httpx.AsyncClient) -> Optional[str]:
|
|
| 88 |
(f"/chemical/identifiers/by-cas/{quote(clean)}", None),
|
| 89 |
(f"/chemical/identifiers/search/by-cas/{quote(clean)}", None),
|
| 90 |
("/chemical/identifiers", {"cas": clean}),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
("/chemical/search/equal", {"word": clean}),
|
| 92 |
("/chemical/search/contains", {"word": clean}),
|
| 93 |
("/chemical/search", {"matchType": "equal", "word": clean}),
|
|
@@ -137,6 +151,10 @@ async def _resolve_from_name(name: str, http: httpx.AsyncClient) -> Optional[str
|
|
| 137 |
(f"/chemical/identifiers/by-name/{quote(q)}", None),
|
| 138 |
(f"/chemical/identifiers/search/by-name/{quote(q)}", None),
|
| 139 |
("/chemical/identifiers", {"name": q}),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
("/chemical/search/equal", {"word": q}),
|
| 141 |
("/chemical/search/contains", {"word": q}),
|
| 142 |
("/chemical/search", {"matchType": "equal", "word": q}),
|
|
@@ -304,6 +322,25 @@ async def fetch_ctx_genetox(cas_or_query: str, http: httpx.AsyncClient) -> Dict[
|
|
| 304 |
"dashboard_url": dashboard_details_url(found or q),
|
| 305 |
}
|
| 306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
return {
|
| 308 |
"ok": False,
|
| 309 |
"error": "No DTXSID found for this query.",
|
|
|
|
| 60 |
|
| 61 |
def _ctx_headers() -> Dict[str, str]:
    """Build the HTTP request headers for CTX API calls.

    The API key is the first truthy value among, in order:
    ``settings.ctx_api_key`` and the ``CTX_API_KEY``, ``COMPTOX_API_KEY`` and
    ``CTX_KEY`` environment variables. A string key is stripped of surrounding
    whitespace; a key that is empty after stripping is treated as absent.

    Returns:
        A dict that always contains ``accept`` and ``user-agent``, plus
        ``x-api-key`` when a key is available.
    """
    headers = {"accept": "application/json"}
    key = (
        settings.ctx_api_key
        or os.getenv("CTX_API_KEY")
        or os.getenv("COMPTOX_API_KEY")
        or os.getenv("CTX_KEY")
    )
    if isinstance(key, str):
        key = key.strip()
    if key:
        # HTTP field names are case-insensitive, so the key is set under one
        # spelling only. A second "X-Api-Key" entry in this plain dict would be
        # a distinct dict key and end up as a duplicated header field, which an
        # intermediary may merge into "key, key" and thereby invalidate.
        headers["x-api-key"] = key
    # NOTE(review): indentation was lost in the view this was reconstructed
    # from; the user-agent is set unconditionally here — confirm against the
    # real file that it was not meant to be sent only when a key is present.
    headers["user-agent"] = "toxrai-hf-demo"
    return headers
|
| 76 |
|
| 77 |
|
|
|
|
| 97 |
(f"/chemical/identifiers/by-cas/{quote(clean)}", None),
|
| 98 |
(f"/chemical/identifiers/search/by-cas/{quote(clean)}", None),
|
| 99 |
("/chemical/identifiers", {"cas": clean}),
|
| 100 |
+
("/chemical/identifiers", {"casrn": clean}),
|
| 101 |
+
("/chemical/identifiers/search", {"casrn": clean}),
|
| 102 |
+
("/chemical/search", {"query": clean, "type": "equals"}),
|
| 103 |
+
("/chemical/search", {"query": clean, "type": "contains"}),
|
| 104 |
+
("/chemical/search", {"searchType": "equals", "query": clean}),
|
| 105 |
("/chemical/search/equal", {"word": clean}),
|
| 106 |
("/chemical/search/contains", {"word": clean}),
|
| 107 |
("/chemical/search", {"matchType": "equal", "word": clean}),
|
|
|
|
| 151 |
(f"/chemical/identifiers/by-name/{quote(q)}", None),
|
| 152 |
(f"/chemical/identifiers/search/by-name/{quote(q)}", None),
|
| 153 |
("/chemical/identifiers", {"name": q}),
|
| 154 |
+
("/chemical/identifiers/search", {"name": q}),
|
| 155 |
+
("/chemical/search", {"query": q, "type": "equals"}),
|
| 156 |
+
("/chemical/search", {"query": q, "type": "contains"}),
|
| 157 |
+
("/chemical/search", {"searchType": "equals", "query": q}),
|
| 158 |
("/chemical/search/equal", {"word": q}),
|
| 159 |
("/chemical/search/contains", {"word": q}),
|
| 160 |
("/chemical/search", {"matchType": "equal", "word": q}),
|
|
|
|
| 322 |
"dashboard_url": dashboard_details_url(found or q),
|
| 323 |
}
|
| 324 |
|
| 325 |
+
# Try one direct identifier call to surface CTX errors (auth, etc.)
|
| 326 |
+
try:
|
| 327 |
+
if is_cas(q):
|
| 328 |
+
await _ctx_get(f"/chemical/identifiers/by-cas/{quote(q)}", http)
|
| 329 |
+
else:
|
| 330 |
+
await _ctx_get(f"/chemical/identifiers/by-name/{quote(q)}", http)
|
| 331 |
+
except httpx.HTTPStatusError as e:
|
| 332 |
+
return {
|
| 333 |
+
"ok": False,
|
| 334 |
+
"error": f"CTX API error {e.response.status_code}: {e.response.text[:200]}",
|
| 335 |
+
"dashboard_search": dashboard_search_url(q),
|
| 336 |
+
}
|
| 337 |
+
except Exception as e:
|
| 338 |
+
return {
|
| 339 |
+
"ok": False,
|
| 340 |
+
"error": f"CTX request failed: {e}",
|
| 341 |
+
"dashboard_search": dashboard_search_url(q),
|
| 342 |
+
}
|
| 343 |
+
|
| 344 |
return {
|
| 345 |
"ok": False,
|
| 346 |
"error": "No DTXSID found for this query.",
|
core/sources/fema.py
CHANGED
|
@@ -1,5 +1,48 @@
|
|
| 1 |
import os
|
| 2 |
-
from urllib.parse import quote_plus
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def fema_link(cas_or_query: str, name_query: str | None = None) -> dict:
|
|
@@ -15,15 +58,29 @@ def fema_link(cas_or_query: str, name_query: str | None = None) -> dict:
|
|
| 15 |
|
| 16 |
# NOTE: domain spelling matters; the older '...materialssafety...' variant often 404s.
|
| 17 |
base = os.getenv("FEMA_BASE_URL", "https://fragrancematerialsafetyresource.elsevier.com/")
|
| 18 |
-
cas_param =
|
| 19 |
-
name_param = os.getenv("FEMA_NAME_PARAM", "field_chemical_synonym_tid")
|
| 20 |
|
| 21 |
cas_value = quote_plus(q) if q else ""
|
| 22 |
name_value = quote_plus(name_q or q)
|
| 23 |
|
| 24 |
-
cas_url = f"{
|
| 25 |
-
name_url = f"{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# Generic search fallback (some deployments ignore filter params)
|
| 27 |
search_term = name_q or q
|
| 28 |
search_url = f"{base}search/node?keys={quote_plus(search_term)}" if search_term else ""
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
from urllib.parse import quote_plus, urljoin
|
| 3 |
+
|
| 4 |
+
import httpx
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
|
| 7 |
+
# Cache keyed by base URL; _discover_params stores its
# (cas_param, name_param, action) result here and returns it on later calls.
_PARAM_CACHE: dict[str, tuple[str, str, str]] = {}
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _discover_params(base: str) -> tuple[str, str, str]:
    """Determine the search form's parameter names and action URL for ``base``.

    Starts from defaults taken from the ``FEMA_CAS_PARAM``, ``FEMA_NAME_PARAM``
    and ``FEMA_FORM_ACTION`` environment variables (falling back to built-in
    names and to ``base`` itself), then fetches ``base`` and, if the response
    status is below 400, refines them from the HTML:

    * ``action`` becomes the first ``<form>``'s ``action`` attribute, resolved
      against ``base``;
    * ``cas_param`` becomes the ``name`` of an ``<input>`` whose name or
      placeholder contains "cas";
    * ``name_param`` becomes the ``name`` of an ``<input>`` whose name or
      placeholder contains "synonym" or "chemical".

    When several inputs match, the last one in document order wins.

    The result is stored in ``_PARAM_CACHE`` under ``base`` — including the
    defaults-only result of a failed lookup — so later calls with the same
    ``base`` return the cached tuple without another request.

    Args:
        base: URL of the page to inspect.

    Returns:
        ``(cas_param, name_param, action)``.
    """
    import logging

    cached = _PARAM_CACHE.get(base)
    if cached is not None:
        return cached

    cas_param = os.getenv("FEMA_CAS_PARAM", "field_cas_tid_1")
    name_param = os.getenv("FEMA_NAME_PARAM", "field_chemical_synonym_tid")
    action = os.getenv("FEMA_FORM_ACTION", base)

    try:
        r = httpx.get(base, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
        if r.status_code < 400:
            soup = BeautifulSoup(r.text, "lxml")
            form = soup.find("form")
            form_action = form.get("action") if form else None
            if form_action:
                action = urljoin(base, form_action)
            for inp in soup.find_all("input"):
                name = (inp.get("name") or "").strip()
                if not name:
                    continue
                placeholder = (inp.get("placeholder") or "").lower()
                lower_name = name.lower()
                # Substring match already covers a name that is exactly "cas".
                if "cas" in placeholder or "cas" in lower_name:
                    cas_param = name
                if (
                    "synonym" in placeholder
                    or "chemical" in placeholder
                    or "synonym" in lower_name
                    or "chemical" in lower_name
                ):
                    name_param = name
    except Exception as exc:
        # Discovery is best-effort: keep whatever values are set so far, but
        # leave a trace so a persistent failure (network error, unavailable
        # HTML parser, ...) is not completely invisible.
        logging.getLogger(__name__).warning(
            "FEMA parameter discovery failed for %s; using fallback values: %r",
            base,
            exc,
        )

    _PARAM_CACHE[base] = (cas_param, name_param, action)
    return cas_param, name_param, action
|
| 46 |
|
| 47 |
|
| 48 |
def fema_link(cas_or_query: str, name_query: str | None = None) -> dict:
|
|
|
|
| 58 |
|
| 59 |
# NOTE: domain spelling matters; the older '...materialssafety...' variant often 404s.
|
| 60 |
base = os.getenv("FEMA_BASE_URL", "https://fragrancematerialsafetyresource.elsevier.com/")
|
| 61 |
+
cas_param, name_param, action = _discover_params(base)
|
|
|
|
| 62 |
|
| 63 |
cas_value = quote_plus(q) if q else ""
|
| 64 |
name_value = quote_plus(name_q or q)
|
| 65 |
|
| 66 |
+
cas_url = f"{action}?{cas_param}={cas_value}&{name_param}=" if cas_value else ""
|
| 67 |
+
name_url = f"{action}?{cas_param}=&{name_param}={name_value}" if name_value else ""
|
| 68 |
+
combo_url = (
|
| 69 |
+
f"{action}?{cas_param}={cas_value}&{name_param}={name_value}"
|
| 70 |
+
if cas_value and name_value
|
| 71 |
+
else ""
|
| 72 |
+
)
|
| 73 |
# Generic search fallback (some deployments ignore filter params)
|
| 74 |
search_term = name_q or q
|
| 75 |
search_url = f"{base}search/node?keys={quote_plus(search_term)}" if search_term else ""
|
| 76 |
+
search_api_url = (
|
| 77 |
+
f"{base}search/node?search_api_fulltext={quote_plus(search_term)}" if search_term else ""
|
| 78 |
+
)
|
| 79 |
+
return {
|
| 80 |
+
"ok": True,
|
| 81 |
+
"cas_url": cas_url,
|
| 82 |
+
"name_url": name_url,
|
| 83 |
+
"combo_url": combo_url,
|
| 84 |
+
"alt_url": search_url,
|
| 85 |
+
"search_api_url": search_api_url,
|
| 86 |
+
}
|
core/sources/ntp.py
CHANGED
|
@@ -141,6 +141,8 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
|
|
| 141 |
|
| 142 |
lines = index_html.splitlines()
|
| 143 |
q_low = q.lower()
|
|
|
|
|
|
|
| 144 |
results: List[Dict[str, Any]] = []
|
| 145 |
|
| 146 |
for i, line in enumerate(lines):
|
|
@@ -149,9 +151,16 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
|
|
| 149 |
continue
|
| 150 |
|
| 151 |
snippet = " ".join(lines[i : min(i + 12, len(lines))])
|
|
|
|
| 152 |
low_text = _strip_tags(snippet).lower()
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
tr_id = m.group(1)
|
| 157 |
hrefs = HREF_RE.findall(snippet)
|
|
@@ -178,6 +187,8 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
|
|
| 178 |
pdf_url = urljoin(REPORTS_URL, pdf_match.group(1)) if pdf_match else None
|
| 179 |
|
| 180 |
text_block = _strip_tags(snippet)
|
|
|
|
|
|
|
| 181 |
title_match = re.search(
|
| 182 |
r"TR-\d{3,}\s+PDF\s+(.*?)\s+\b(20\d{2}|19\d{2})\b\s+Technical Report",
|
| 183 |
text_block,
|
|
|
|
| 141 |
|
| 142 |
lines = index_html.splitlines()
|
| 143 |
q_low = q.lower()
|
| 144 |
+
is_cas = bool(re.match(r"^\d{2,7}-\d{2}-\d$", q))
|
| 145 |
+
q_digits = re.sub(r"\D", "", q) if is_cas else ""
|
| 146 |
results: List[Dict[str, Any]] = []
|
| 147 |
|
| 148 |
for i, line in enumerate(lines):
|
|
|
|
| 151 |
continue
|
| 152 |
|
| 153 |
snippet = " ".join(lines[i : min(i + 12, len(lines))])
|
| 154 |
+
mini_snippet = " ".join(lines[i : min(i + 3, len(lines))])
|
| 155 |
low_text = _strip_tags(snippet).lower()
|
| 156 |
+
mini_text = _strip_tags(mini_snippet)
|
| 157 |
+
|
| 158 |
+
if is_cas:
|
| 159 |
+
if q_digits not in re.sub(r"\D", "", mini_text):
|
| 160 |
+
continue
|
| 161 |
+
else:
|
| 162 |
+
if q_low not in low_text:
|
| 163 |
+
continue
|
| 164 |
|
| 165 |
tr_id = m.group(1)
|
| 166 |
hrefs = HREF_RE.findall(snippet)
|
|
|
|
| 187 |
pdf_url = urljoin(REPORTS_URL, pdf_match.group(1)) if pdf_match else None
|
| 188 |
|
| 189 |
text_block = _strip_tags(snippet)
|
| 190 |
+
if is_cas and q_digits not in re.sub(r"\D", "", text_block):
|
| 191 |
+
continue
|
| 192 |
title_match = re.search(
|
| 193 |
r"TR-\d{3,}\s+PDF\s+(.*?)\s+\b(20\d{2}|19\d{2})\b\s+Technical Report",
|
| 194 |
text_block,
|
core/sources/pubchem.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import html
|
|
|
|
| 2 |
import re
|
| 3 |
from typing import Any, Dict, List, Optional
|
| 4 |
from urllib.parse import quote
|
|
@@ -9,6 +10,7 @@ PUBCHEM_REST = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
|
|
| 9 |
PUBCHEM_VIEW = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
|
| 10 |
|
| 11 |
CAS_RE = re.compile(r"^\d{2,7}-\d{2}-\d$")
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
def is_cas(s: str) -> bool:
|
|
@@ -196,6 +198,14 @@ async def pubchem_by_query(query: str, http: httpx.AsyncClient) -> Dict[str, Any
|
|
| 196 |
seen.add(key)
|
| 197 |
uniq_haz.append(h)
|
| 198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
return {
|
| 200 |
"ok": True,
|
| 201 |
"query": q,
|
|
@@ -206,4 +216,5 @@ async def pubchem_by_query(query: str, http: httpx.AsyncClient) -> Dict[str, Any
|
|
| 206 |
"url": _compound_url(cid),
|
| 207 |
"synonyms": synonyms[:50],
|
| 208 |
"hazards": uniq_haz,
|
|
|
|
| 209 |
}
|
|
|
|
| 1 |
import html
|
| 2 |
+
import json
|
| 3 |
import re
|
| 4 |
from typing import Any, Dict, List, Optional
|
| 5 |
from urllib.parse import quote
|
|
|
|
| 10 |
PUBCHEM_VIEW = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
|
| 11 |
|
| 12 |
CAS_RE = re.compile(r"^\d{2,7}-\d{2}-\d$")
|
| 13 |
+
# Matches the literal "DTXSID" followed by seven or more digits, anywhere in a string.
DTXSID_RE = re.compile(r"DTXSID\d{7,}")
|
| 14 |
|
| 15 |
|
| 16 |
def is_cas(s: str) -> bool:
|
|
|
|
| 198 |
seen.add(key)
|
| 199 |
uniq_haz.append(h)
|
| 200 |
|
| 201 |
+
dtxsid = None
|
| 202 |
+
try:
|
| 203 |
+
m = DTXSID_RE.search(json.dumps(record_json))
|
| 204 |
+
if m:
|
| 205 |
+
dtxsid = m.group(0)
|
| 206 |
+
except Exception:
|
| 207 |
+
dtxsid = None
|
| 208 |
+
|
| 209 |
return {
|
| 210 |
"ok": True,
|
| 211 |
"query": q,
|
|
|
|
| 216 |
"url": _compound_url(cid),
|
| 217 |
"synonyms": synonyms[:50],
|
| 218 |
"hazards": uniq_haz,
|
| 219 |
+
"dtxsid": dtxsid,
|
| 220 |
}
|