hchevva committed
Commit 01ce8ad · verified · 1 Parent(s): d59d94c

Upload 3 files

Files changed (3)
  1. core/sources/ctx.py +28 -1
  2. core/sources/fema.py +17 -7
  3. core/sources/ntp.py +99 -79
core/sources/ctx.py CHANGED
@@ -38,10 +38,14 @@ def _as_rows(data: Any) -> List[Any]:
     if isinstance(data, list):
         return data
     if isinstance(data, dict):
+        if "identifier" in data or "chemical" in data or "DTXSID" in data or "dtxsid" in data:
+            return [data]
         for key in ("data", "results", "items"):
            v = data.get(key)
            if isinstance(v, list):
                return v
+           if isinstance(v, dict):
+               return [v]
     return []
 
 
@@ -275,8 +279,31 @@ async def fetch_ctx_genetox(cas_or_query: str, http: httpx.AsyncClient) -> Dict[
             "dashboard_search": dashboard_search_url(q),
         }
 
-    dtxsid = await resolve_dtxsid(q, http)
+    if q.upper().startswith("DTXSID"):
+        dtxsid = q.strip()
+    else:
+        dtxsid = await resolve_dtxsid(q, http)
     if not dtxsid:
+        # Attempt direct hazard search by CAS or name (some deployments return summary directly)
+        try:
+            data = await _ctx_get("/hazard/genetox/summary/search", http, params={"cas": q})
+        except Exception:
+            data = None
+        if not data:
+            try:
+                data = await _ctx_get("/hazard/genetox/summary/search", http, params={"name": q})
+            except Exception:
+                data = None
+
+        if data:
+            found = _extract_dtxsid_any(data)
+            return {
+                "ok": True,
+                "dtxsid": found,
+                "summary": data,
+                "dashboard_url": dashboard_details_url(found or q),
+            }
+
         return {
             "ok": False,
             "error": "No DTXSID found for this query.",
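
For context on the ctx.py change above, here is a minimal sketch of how the reworked fetch_ctx_genetox could be driven. The function name, module path, and fallback behaviour come from the diff; the driver itself, the client settings, and the example identifiers are illustrative only, and real calls may additionally need whatever API key _ctx_get is configured to use.

# Illustrative driver; fetch_ctx_genetox comes from core/sources/ctx.py above,
# the queries below are placeholder examples.
import asyncio
import httpx

from core.sources.ctx import fetch_ctx_genetox


async def demo() -> None:
    async with httpx.AsyncClient(timeout=30) as http:
        # A DTXSID query now bypasses resolve_dtxsid entirely.
        by_id = await fetch_ctx_genetox("DTXSID1234567", http)
        # A CAS query that cannot be resolved to a DTXSID now falls back to
        # /hazard/genetox/summary/search with cas=..., then name=...
        by_cas = await fetch_ctx_genetox("50-00-0", http)
        print(by_id.get("ok"), by_cas.get("ok"))


asyncio.run(demo())
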
core/sources/fema.py CHANGED
@@ -1,19 +1,29 @@
+import os
 from urllib.parse import quote_plus
 
 
-def fema_link(cas_or_query: str) -> dict:
+def fema_link(cas_or_query: str, name_query: str | None = None) -> dict:
     """Build the FEMA / Fragrance Materials Safety Resource search URL.
 
     Production uses Elsevier's Fragrance Materials Safety Resource with CAS query params.
     """
 
     q = (cas_or_query or "").strip()
-    if not q:
+    name_q = (name_query or "").strip()
+    if not q and not name_q:
         return {"ok": False, "error": "Empty query"}
 
     # NOTE: domain spelling matters; the older '...materialssafety...' variant often 404s.
-    base = "https://fragrancematerialsafetyresource.elsevier.com/"
-    cas_url = f"{base}?field_cas_tid_1={quote_plus(q)}&field_chemical_synonym_tid="
-    # Generic search fallback (some deployments ignore CAS filter params)
-    search_url = f"{base}search/node?keys={quote_plus(q)}"
-    return {"ok": True, "url": cas_url, "alt_url": search_url}
+    base = os.getenv("FEMA_BASE_URL", "https://fragrancematerialsafetyresource.elsevier.com/")
+    cas_param = os.getenv("FEMA_CAS_PARAM", "field_cas_tid_1")
+    name_param = os.getenv("FEMA_NAME_PARAM", "field_chemical_synonym_tid")
+
+    cas_value = quote_plus(q) if q else ""
+    name_value = quote_plus(name_q or q)
+
+    cas_url = f"{base}?{cas_param}={cas_value}&{name_param}=" if cas_value else ""
+    name_url = f"{base}?{cas_param}=&{name_param}={name_value}" if name_value else ""
+    # Generic search fallback (some deployments ignore filter params)
+    search_term = name_q or q
+    search_url = f"{base}search/node?keys={quote_plus(search_term)}" if search_term else ""
+    return {"ok": True, "cas_url": cas_url, "name_url": name_url, "alt_url": search_url}
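
Since fema_link is a pure URL builder, the new signature and environment overrides are easy to exercise. The env var names and return keys come from the diff above; the CAS/name pair below is just an example value.

# Illustrative usage of the new fema_link signature; values are examples only.
import os

from core.sources.fema import fema_link

# Optional overrides; omit them to keep the defaults baked into the module.
os.environ.setdefault("FEMA_BASE_URL", "https://fragrancematerialsafetyresource.elsevier.com/")

links = fema_link("121-33-5", name_query="vanillin")
print(links["cas_url"])   # CAS-filtered URL (empty string when no CAS was given)
print(links["name_url"])  # synonym-filtered URL
print(links["alt_url"])   # generic search/node fallback
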
core/sources/ntp.py CHANGED
@@ -14,6 +14,7 @@ INDEX_URL = "https://ntp.niehs.nih.gov/data/tr"
 TR_RE = re.compile(r"\bTR-?(\d{3,4})\b", re.IGNORECASE)
 HREF_RE = re.compile(r"href=[\"']([^\"']+)[\"']", re.IGNORECASE)
 TITLE_RE = re.compile(r"<title[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)
+TR_MARKER_RE = re.compile(r">(TR-\d{3,})<", re.IGNORECASE)
 
 
 def _strip_tags(html_text: str) -> str:
@@ -138,91 +139,110 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
     except Exception as e:
         return {"ok": False, "error": f"Failed to fetch NTP index: {e}", "items": []}
 
-    plain = _strip_tags(index_html)
+    lines = index_html.splitlines()
     q_low = q.lower()
-    is_cas = bool(re.match(r"^\d{2,7}-\d{2}-\d$", q))
-    q_digits = re.sub(r"\D", "", q) if is_cas else ""
-
-    nums: List[str] = []
-    seen = set()
-
-    for m in TR_RE.finditer(plain):
-        num = m.group(1)
-        # neighborhood window similar to production
-        start = max(0, m.start() - 250)
-        end = min(len(plain), m.end() + 250)
-        neighborhood = plain[start:end]
-        if is_cas:
-            if q_digits not in re.sub(r"\D", "", neighborhood):
-                continue
-        else:
-            if q_low not in neighborhood.lower():
-                continue
-        if num in seen:
+    results: List[Dict[str, Any]] = []
+
+    for i, line in enumerate(lines):
+        m = TR_MARKER_RE.search(line)
+        if not m:
+            continue
+
+        snippet = " ".join(lines[i : min(i + 12, len(lines))])
+        low_text = _strip_tags(snippet).lower()
+        if q_low not in low_text:
             continue
-        seen.add(num)
-        nums.append(num)
-        if len(nums) >= max(1, int(limit)):
+
+        tr_id = m.group(1)
+        hrefs = HREF_RE.findall(snippet)
+
+        # Prefer a non-PDF link under /publications/ or /go/
+        candidates = [urljoin(REPORTS_URL, h) for h in hrefs if not re.search(r"\.pdf(\?|$)", h, re.I)]
+
+        def score(u: str) -> int:
+            s = 0
+            if "/publications/" in u:
+                s += 3
+            if "/go/" in u:
+                s += 3
+            if tr_id and tr_id.lower() in u.lower():
+                s += 2
+            if re.search(r"/reports?", u):
+                s += 1
+            return s
+
+        candidates.sort(key=score, reverse=True)
+        page_href = candidates[0] if candidates else None
+
+        pdf_match = re.search(r'href="([^"]+\.pdf)"', snippet, re.I)
+        pdf_url = urljoin(REPORTS_URL, pdf_match.group(1)) if pdf_match else None
+
+        text_block = _strip_tags(snippet)
+        title_match = re.search(
+            r"TR-\d{3,}\s+PDF\s+(.*?)\s+\b(20\d{2}|19\d{2})\b\s+Technical Report",
+            text_block,
+            re.I,
+        )
+        year_match = re.search(r"\b(20\d{2}|19\d{2})\b", text_block)
+
+        results.append(
+            {
+                "tr": tr_id,
+                "title": title_match.group(1) if title_match else "",
+                "year": year_match.group(1) if year_match else "",
+                "pdf": pdf_url,
+                "report_page": page_href or REPORTS_URL,
+            }
+        )
+        if len(results) >= int(limit):
             break
 
-    if not nums:
-        # Fallback: scan the TR index page (data/tr)
+    # Fallback: scan the TR index page (data/tr)
+    if not results:
         try:
-            r2 = await http.get(INDEX_URL, timeout=25, follow_redirects=True)
-            if r2.status_code >= 400:
-                return {"ok": True, "query": q, "items": []}
-            idx_html = r2.text
+            r2 = await http.get(
+                INDEX_URL,
+                timeout=25,
+                follow_redirects=True,
+                headers={"User-Agent": "Mozilla/5.0"},
+            )
+            if r2.status_code < 400:
+                idx_html = r2.text
+            else:
+                idx_html = ""
         except Exception:
-            return {"ok": True, "query": q, "items": []}
-
-        idx_lines = idx_html.splitlines()
-        items: List[Dict[str, Any]] = []
-        seen = set()
-
-        for i, line in enumerate(idx_lines):
-            if not TR_RE.search(line):
-                continue
-            block = " ".join(idx_lines[i : i + 6])
-            block_text = _strip_tags(block)
-            if is_cas:
-                if q_digits not in re.sub(r"\D", "", block_text):
+            idx_html = ""
+
+        if idx_html:
+            idx_lines = idx_html.splitlines()
+            for i, row in enumerate(idx_lines):
+                if not re.search(r"TR-\d{3,}", row, re.I):
                     continue
-            else:
-                if q_low not in block_text.lower():
+                block = " ".join(idx_lines[i : min(i + 6, len(idx_lines))])
+                block_text = _strip_tags(block)
+                low = block_text.lower()
+                if q_low not in low:
                     continue
-            m = TR_RE.search(block_text)
-            if not m:
-                continue
-            num = m.group(1)
-            if num in seen:
-                continue
-            seen.add(num)
-
-            # Derive a best-effort title from the block text
-            title = re.sub(TR_RE, "", block_text).strip()
-            title = re.sub(r"\b\d{2,7}-\d{2}-\d\b", "", title).strip()
-
-            items.append(
-                {
-                    "num": num,
-                    "tr": f"TR-{num}",
-                    "report_page": INDEX_URL,
-                    "title": title,
-                    "year": None,
-                    "pdf": None,
-                }
-            )
-            if len(items) >= max(1, int(limit)):
-                break
-
-        return {"ok": True, "query": q, "items": items}
-
-    items: List[Dict[str, Any]] = []
-    for num in nums:
-        item = await _fetch_tr_page(num, http)
-        if item:
-            items.append(item)
-        if len(items) >= max(1, int(limit)):
-            break
 
-    return {"ok": True, "query": q, "items": items}
+                tr = re.search(r"TR-\d{3,}", block_text, re.I)
+                cas = re.search(r"\b\d{2,7}-\d{2}-\d\b", block_text)
+                name = block_text
+                if tr:
+                    name = re.sub(r"TR-\d{3,}", "", name, flags=re.I)
+                if cas:
+                    name = re.sub(r"\b\d{2,7}-\d{2}-\d\b", "", name)
+                name = name.strip()
+
+                results.append(
+                    {
+                        "tr": tr.group(0) if tr else "",
+                        "title": name or "",
+                        "year": "",
+                        "pdf": None,
+                        "report_page": INDEX_URL,
+                    }
+                )
+                if len(results) >= int(limit):
+                    break
+
+    return {"ok": True, "query": q, "items": results}
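
Finally, a minimal sketch of driving the rewritten NTP search end to end. The module path, function name, and returned item keys follow the diff; the query string, limit, and client setup are placeholders.

# Illustrative driver for search_technical_reports; the query is an example only.
import asyncio
import httpx

from core.sources.ntp import search_technical_reports


async def demo() -> None:
    async with httpx.AsyncClient(follow_redirects=True) as http:
        out = await search_technical_reports("benzene", http, limit=5)
        for item in out.get("items", []):
            # Each hit now carries tr, title, year, pdf and report_page.
            print(item["tr"], item["year"], item["title"], item["report_page"])


asyncio.run(demo())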