Upload 3 files
Browse files- core/sources/ctx.py +28 -1
- core/sources/fema.py +17 -7
- core/sources/ntp.py +99 -79
core/sources/ctx.py
CHANGED
|
@@ -38,10 +38,14 @@ def _as_rows(data: Any) -> List[Any]:
|
|
| 38 |
if isinstance(data, list):
|
| 39 |
return data
|
| 40 |
if isinstance(data, dict):
|
|
|
|
|
|
|
| 41 |
for key in ("data", "results", "items"):
|
| 42 |
v = data.get(key)
|
| 43 |
if isinstance(v, list):
|
| 44 |
return v
|
|
|
|
|
|
|
| 45 |
return []
|
| 46 |
|
| 47 |
|
|
@@ -275,8 +279,31 @@ async def fetch_ctx_genetox(cas_or_query: str, http: httpx.AsyncClient) -> Dict[
|
|
| 275 |
"dashboard_search": dashboard_search_url(q),
|
| 276 |
}
|
| 277 |
|
| 278 |
-
|
|
|
|
|
|
|
|
|
|
| 279 |
if not dtxsid:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
return {
|
| 281 |
"ok": False,
|
| 282 |
"error": "No DTXSID found for this query.",
|
|
|
|
| 38 |
if isinstance(data, list):
|
| 39 |
return data
|
| 40 |
if isinstance(data, dict):
|
| 41 |
+
if "identifier" in data or "chemical" in data or "DTXSID" in data or "dtxsid" in data:
|
| 42 |
+
return [data]
|
| 43 |
for key in ("data", "results", "items"):
|
| 44 |
v = data.get(key)
|
| 45 |
if isinstance(v, list):
|
| 46 |
return v
|
| 47 |
+
if isinstance(v, dict):
|
| 48 |
+
return [v]
|
| 49 |
return []
|
| 50 |
|
| 51 |
|
|
|
|
| 279 |
"dashboard_search": dashboard_search_url(q),
|
| 280 |
}
|
| 281 |
|
| 282 |
+
if q.upper().startswith("DTXSID"):
|
| 283 |
+
dtxsid = q.strip()
|
| 284 |
+
else:
|
| 285 |
+
dtxsid = await resolve_dtxsid(q, http)
|
| 286 |
if not dtxsid:
|
| 287 |
+
# Attempt direct hazard search by CAS or name (some deployments return summary directly)
|
| 288 |
+
try:
|
| 289 |
+
data = await _ctx_get("/hazard/genetox/summary/search", http, params={"cas": q})
|
| 290 |
+
except Exception:
|
| 291 |
+
data = None
|
| 292 |
+
if not data:
|
| 293 |
+
try:
|
| 294 |
+
data = await _ctx_get("/hazard/genetox/summary/search", http, params={"name": q})
|
| 295 |
+
except Exception:
|
| 296 |
+
data = None
|
| 297 |
+
|
| 298 |
+
if data:
|
| 299 |
+
found = _extract_dtxsid_any(data)
|
| 300 |
+
return {
|
| 301 |
+
"ok": True,
|
| 302 |
+
"dtxsid": found,
|
| 303 |
+
"summary": data,
|
| 304 |
+
"dashboard_url": dashboard_details_url(found or q),
|
| 305 |
+
}
|
| 306 |
+
|
| 307 |
return {
|
| 308 |
"ok": False,
|
| 309 |
"error": "No DTXSID found for this query.",
|
core/sources/fema.py
CHANGED
|
@@ -1,19 +1,29 @@
|
|
|
|
|
| 1 |
from urllib.parse import quote_plus
|
| 2 |
|
| 3 |
|
| 4 |
-
def fema_link(cas_or_query: str) -> dict:
|
| 5 |
"""Build the FEMA / Fragrance Materials Safety Resource search URL.
|
| 6 |
|
| 7 |
Production uses Elsevier's Fragrance Materials Safety Resource with CAS query params.
|
| 8 |
"""
|
| 9 |
|
| 10 |
q = (cas_or_query or "").strip()
|
| 11 |
-
|
|
|
|
| 12 |
return {"ok": False, "error": "Empty query"}
|
| 13 |
|
| 14 |
# NOTE: domain spelling matters; the older '...materialssafety...' variant often 404s.
|
| 15 |
-
base = "https://fragrancematerialsafetyresource.elsevier.com/"
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
from urllib.parse import quote_plus
|
| 3 |
|
| 4 |
|
| 5 |
+
def fema_link(cas_or_query: str, name_query: str | None = None) -> dict:
|
| 6 |
"""Build the FEMA / Fragrance Materials Safety Resource search URL.
|
| 7 |
|
| 8 |
Production uses Elsevier's Fragrance Materials Safety Resource with CAS query params.
|
| 9 |
"""
|
| 10 |
|
| 11 |
q = (cas_or_query or "").strip()
|
| 12 |
+
name_q = (name_query or "").strip()
|
| 13 |
+
if not q and not name_q:
|
| 14 |
return {"ok": False, "error": "Empty query"}
|
| 15 |
|
| 16 |
# NOTE: domain spelling matters; the older '...materialssafety...' variant often 404s.
|
| 17 |
+
base = os.getenv("FEMA_BASE_URL", "https://fragrancematerialsafetyresource.elsevier.com/")
|
| 18 |
+
cas_param = os.getenv("FEMA_CAS_PARAM", "field_cas_tid_1")
|
| 19 |
+
name_param = os.getenv("FEMA_NAME_PARAM", "field_chemical_synonym_tid")
|
| 20 |
+
|
| 21 |
+
cas_value = quote_plus(q) if q else ""
|
| 22 |
+
name_value = quote_plus(name_q or q)
|
| 23 |
+
|
| 24 |
+
cas_url = f"{base}?{cas_param}={cas_value}&{name_param}=" if cas_value else ""
|
| 25 |
+
name_url = f"{base}?{cas_param}=&{name_param}={name_value}" if name_value else ""
|
| 26 |
+
# Generic search fallback (some deployments ignore filter params)
|
| 27 |
+
search_term = name_q or q
|
| 28 |
+
search_url = f"{base}search/node?keys={quote_plus(search_term)}" if search_term else ""
|
| 29 |
+
return {"ok": True, "cas_url": cas_url, "name_url": name_url, "alt_url": search_url}
|
core/sources/ntp.py
CHANGED
|
@@ -14,6 +14,7 @@ INDEX_URL = "https://ntp.niehs.nih.gov/data/tr"
|
|
| 14 |
TR_RE = re.compile(r"\bTR-?(\d{3,4})\b", re.IGNORECASE)
|
| 15 |
HREF_RE = re.compile(r"href=[\"']([^\"']+)[\"']", re.IGNORECASE)
|
| 16 |
TITLE_RE = re.compile(r"<title[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
def _strip_tags(html_text: str) -> str:
|
|
@@ -138,91 +139,110 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
|
|
| 138 |
except Exception as e:
|
| 139 |
return {"ok": False, "error": f"Failed to fetch NTP index: {e}", "items": []}
|
| 140 |
|
| 141 |
-
|
| 142 |
q_low = q.lower()
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
end = min(len(plain), m.end() + 250)
|
| 154 |
-
neighborhood = plain[start:end]
|
| 155 |
-
if is_cas:
|
| 156 |
-
if q_digits not in re.sub(r"\\D", "", neighborhood):
|
| 157 |
-
continue
|
| 158 |
-
else:
|
| 159 |
-
if q_low not in neighborhood.lower():
|
| 160 |
-
continue
|
| 161 |
-
if num in seen:
|
| 162 |
continue
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
break
|
| 167 |
|
| 168 |
-
|
| 169 |
-
|
| 170 |
try:
|
| 171 |
-
r2 = await http.get(
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
except Exception:
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
for i, line in enumerate(idx_lines):
|
| 183 |
-
if not TR_RE.search(line):
|
| 184 |
-
continue
|
| 185 |
-
block = " ".join(idx_lines[i : i + 6])
|
| 186 |
-
block_text = _strip_tags(block)
|
| 187 |
-
if is_cas:
|
| 188 |
-
if q_digits not in re.sub(r"\\D", "", block_text):
|
| 189 |
continue
|
| 190 |
-
|
| 191 |
-
|
|
|
|
|
|
|
| 192 |
continue
|
| 193 |
-
m = TR_RE.search(block_text)
|
| 194 |
-
if not m:
|
| 195 |
-
continue
|
| 196 |
-
num = m.group(1)
|
| 197 |
-
if num in seen:
|
| 198 |
-
continue
|
| 199 |
-
seen.add(num)
|
| 200 |
-
|
| 201 |
-
# Derive a best-effort title from the block text
|
| 202 |
-
title = re.sub(TR_RE, "", block_text).strip()
|
| 203 |
-
title = re.sub(r"\\b\\d{2,7}-\\d{2}-\\d\\b", "", title).strip()
|
| 204 |
-
|
| 205 |
-
items.append(
|
| 206 |
-
{
|
| 207 |
-
"num": num,
|
| 208 |
-
"tr": f"TR-{num}",
|
| 209 |
-
"report_page": INDEX_URL,
|
| 210 |
-
"title": title,
|
| 211 |
-
"year": None,
|
| 212 |
-
"pdf": None,
|
| 213 |
-
}
|
| 214 |
-
)
|
| 215 |
-
if len(items) >= max(1, int(limit)):
|
| 216 |
-
break
|
| 217 |
-
|
| 218 |
-
return {"ok": True, "query": q, "items": items}
|
| 219 |
-
|
| 220 |
-
items: List[Dict[str, Any]] = []
|
| 221 |
-
for num in nums:
|
| 222 |
-
item = await _fetch_tr_page(num, http)
|
| 223 |
-
if item:
|
| 224 |
-
items.append(item)
|
| 225 |
-
if len(items) >= max(1, int(limit)):
|
| 226 |
-
break
|
| 227 |
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
TR_RE = re.compile(r"\bTR-?(\d{3,4})\b", re.IGNORECASE)
|
| 15 |
HREF_RE = re.compile(r"href=[\"']([^\"']+)[\"']", re.IGNORECASE)
|
| 16 |
TITLE_RE = re.compile(r"<title[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)
|
| 17 |
+
TR_MARKER_RE = re.compile(r">(TR-\d{3,})<", re.IGNORECASE)
|
| 18 |
|
| 19 |
|
| 20 |
def _strip_tags(html_text: str) -> str:
|
|
|
|
| 139 |
except Exception as e:
|
| 140 |
return {"ok": False, "error": f"Failed to fetch NTP index: {e}", "items": []}
|
| 141 |
|
| 142 |
+
lines = index_html.splitlines()
|
| 143 |
q_low = q.lower()
|
| 144 |
+
results: List[Dict[str, Any]] = []
|
| 145 |
+
|
| 146 |
+
for i, line in enumerate(lines):
|
| 147 |
+
m = TR_MARKER_RE.search(line)
|
| 148 |
+
if not m:
|
| 149 |
+
continue
|
| 150 |
+
|
| 151 |
+
snippet = " ".join(lines[i : min(i + 12, len(lines))])
|
| 152 |
+
low_text = _strip_tags(snippet).lower()
|
| 153 |
+
if q_low not in low_text:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
continue
|
| 155 |
+
|
| 156 |
+
tr_id = m.group(1)
|
| 157 |
+
hrefs = HREF_RE.findall(snippet)
|
| 158 |
+
|
| 159 |
+
# Prefer a non-PDF link under /publications/ or /go/
|
| 160 |
+
candidates = [urljoin(REPORTS_URL, h) for h in hrefs if not re.search(r"\.pdf(\?|$)", h, re.I)]
|
| 161 |
+
|
| 162 |
+
def score(u: str) -> int:
|
| 163 |
+
s = 0
|
| 164 |
+
if "/publications/" in u:
|
| 165 |
+
s += 3
|
| 166 |
+
if "/go/" in u:
|
| 167 |
+
s += 3
|
| 168 |
+
if tr_id and tr_id.lower() in u.lower():
|
| 169 |
+
s += 2
|
| 170 |
+
if re.search(r"/reports?", u):
|
| 171 |
+
s += 1
|
| 172 |
+
return s
|
| 173 |
+
|
| 174 |
+
candidates.sort(key=score, reverse=True)
|
| 175 |
+
page_href = candidates[0] if candidates else None
|
| 176 |
+
|
| 177 |
+
pdf_match = re.search(r'href="([^"]+\.pdf)"', snippet, re.I)
|
| 178 |
+
pdf_url = urljoin(REPORTS_URL, pdf_match.group(1)) if pdf_match else None
|
| 179 |
+
|
| 180 |
+
text_block = _strip_tags(snippet)
|
| 181 |
+
title_match = re.search(
|
| 182 |
+
r"TR-\d{3,}\s+PDF\s+(.*?)\s+\b(20\d{2}|19\d{2})\b\s+Technical Report",
|
| 183 |
+
text_block,
|
| 184 |
+
re.I,
|
| 185 |
+
)
|
| 186 |
+
year_match = re.search(r"\b(20\d{2}|19\d{2})\b", text_block)
|
| 187 |
+
|
| 188 |
+
results.append(
|
| 189 |
+
{
|
| 190 |
+
"tr": tr_id,
|
| 191 |
+
"title": title_match.group(1) if title_match else "",
|
| 192 |
+
"year": year_match.group(1) if year_match else "",
|
| 193 |
+
"pdf": pdf_url,
|
| 194 |
+
"report_page": page_href or REPORTS_URL,
|
| 195 |
+
}
|
| 196 |
+
)
|
| 197 |
+
if len(results) >= int(limit):
|
| 198 |
break
|
| 199 |
|
| 200 |
+
# Fallback: scan the TR index page (data/tr)
|
| 201 |
+
if not results:
|
| 202 |
try:
|
| 203 |
+
r2 = await http.get(
|
| 204 |
+
INDEX_URL,
|
| 205 |
+
timeout=25,
|
| 206 |
+
follow_redirects=True,
|
| 207 |
+
headers={"User-Agent": "Mozilla/5.0"},
|
| 208 |
+
)
|
| 209 |
+
if r2.status_code < 400:
|
| 210 |
+
idx_html = r2.text
|
| 211 |
+
else:
|
| 212 |
+
idx_html = ""
|
| 213 |
except Exception:
|
| 214 |
+
idx_html = ""
|
| 215 |
+
|
| 216 |
+
if idx_html:
|
| 217 |
+
idx_lines = idx_html.splitlines()
|
| 218 |
+
for i, row in enumerate(idx_lines):
|
| 219 |
+
if not re.search(r"TR-\d{3,}", row, re.I):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
continue
|
| 221 |
+
block = " ".join(idx_lines[i : min(i + 6, len(idx_lines))])
|
| 222 |
+
block_text = _strip_tags(block)
|
| 223 |
+
low = block_text.lower()
|
| 224 |
+
if q_low not in low:
|
| 225 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
+
tr = re.search(r"TR-\d{3,}", block_text, re.I)
|
| 228 |
+
cas = re.search(r"\b\d{2,7}-\d{2}-\d\b", block_text)
|
| 229 |
+
name = block_text
|
| 230 |
+
if tr:
|
| 231 |
+
name = re.sub(r"TR-\d{3,}", "", name, flags=re.I)
|
| 232 |
+
if cas:
|
| 233 |
+
name = re.sub(r"\b\d{2,7}-\d{2}-\d\b", "", name)
|
| 234 |
+
name = name.strip()
|
| 235 |
+
|
| 236 |
+
results.append(
|
| 237 |
+
{
|
| 238 |
+
"tr": tr.group(0) if tr else "",
|
| 239 |
+
"title": name or "",
|
| 240 |
+
"year": "",
|
| 241 |
+
"pdf": None,
|
| 242 |
+
"report_page": INDEX_URL,
|
| 243 |
+
}
|
| 244 |
+
)
|
| 245 |
+
if len(results) >= int(limit):
|
| 246 |
+
break
|
| 247 |
+
|
| 248 |
+
return {"ok": True, "query": q, "items": results}
|