hchevva committed on
Commit
0f2aafd
·
verified ·
1 Parent(s): ee87da6

Upload 3 files

Browse files
Files changed (3) hide show
  1. core/sources/ctx.py +38 -28
  2. core/sources/fema.py +4 -2
  3. core/sources/ntp.py +29 -8
core/sources/ctx.py CHANGED
@@ -1,4 +1,6 @@
 
1
  import re
 
2
  from typing import Any, Dict, List, Optional
3
  from urllib.parse import quote
4
 
@@ -43,11 +45,26 @@ def _as_rows(data: Any) -> List[Any]:
43
  return []
44
 
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  async def _ctx_get(path: str, http: httpx.AsyncClient, params: Dict[str, Any] | None = None) -> Any:
47
  url = settings.ctx_base_url.rstrip("/") + path
48
- headers = {"accept": "application/json"}
49
- if settings.ctx_api_key:
50
- headers["x-api-key"] = settings.ctx_api_key
51
 
52
  r = await http.get(url, params=params, headers=headers, timeout=25.0, follow_redirects=True)
53
  r.raise_for_status()
@@ -81,6 +98,9 @@ async def _resolve_from_cas(cas: str, http: httpx.AsyncClient) -> Optional[str]:
81
  dtxsid = _pick_dtxsid(rows)
82
  if dtxsid:
83
  return dtxsid
 
 
 
84
  except Exception:
85
  pass
86
 
@@ -95,6 +115,9 @@ async def _resolve_from_cas(cas: str, http: httpx.AsyncClient) -> Optional[str]:
95
  dtxsid = _pick_dtxsid(rows)
96
  if dtxsid:
97
  return dtxsid
 
 
 
98
  except Exception:
99
  pass
100
 
@@ -123,6 +146,9 @@ async def _resolve_from_name(name: str, http: httpx.AsyncClient) -> Optional[str
123
  dtxsid = _pick_dtxsid(rows)
124
  if dtxsid:
125
  return dtxsid
 
 
 
126
  except Exception:
127
  pass
128
 
@@ -137,6 +163,9 @@ async def _resolve_from_name(name: str, http: httpx.AsyncClient) -> Optional[str
137
  dtxsid = _pick_dtxsid(rows)
138
  if dtxsid:
139
  return dtxsid
 
 
 
140
  except Exception:
141
  pass
142
 
@@ -239,31 +268,12 @@ async def fetch_ctx_genetox(cas_or_query: str, http: httpx.AsyncClient) -> Dict[
239
  if not q:
240
  return {"ok": False, "error": "Empty query"}
241
 
242
- # Prefer worker proxy if configured (matches production behavior)
243
- if settings.worker_base_url:
244
- try:
245
- worker_url = settings.worker_base_url.rstrip("/") + "/ctx-genetox"
246
- payload = {"dtxsid": q} if q.upper().startswith("DTXSID") else {"query": q}
247
- r = await http.post(worker_url, json=payload, timeout=25.0)
248
- if r.status_code < 400:
249
- data = r.json()
250
- if data.get("summary"):
251
- dtxsid = data.get("dtxsid")
252
- return {
253
- "ok": True,
254
- "dtxsid": dtxsid,
255
- "summary": data.get("summary"),
256
- "dashboard_url": dashboard_details_url(dtxsid or q),
257
- }
258
- if data.get("resolveUrl"):
259
- return {
260
- "ok": False,
261
- "error": data.get("message") or "No DTXSID found for this query.",
262
- "dashboard_search": data.get("resolveUrl"),
263
- }
264
- # If worker errors, fall through to direct CTX
265
- except Exception:
266
- pass
267
 
268
  dtxsid = await resolve_dtxsid(q, http)
269
  if not dtxsid:
 
1
+ import os
2
  import re
3
+ import json
4
  from typing import Any, Dict, List, Optional
5
  from urllib.parse import quote
6
 
 
45
  return []
46
 
47
 
48
+ def _extract_dtxsid_any(data: Any) -> Optional[str]:
49
+ try:
50
+ text = json.dumps(data)
51
+ except Exception:
52
+ text = str(data)
53
+ m = DTXSID_RE.search(text)
54
+ return m.group(0) if m else None
55
+
56
+
57
+ def _ctx_headers() -> Dict[str, str]:
58
+ headers = {"accept": "application/json"}
59
+ key = settings.ctx_api_key or os.getenv("CTX_API_KEY") or os.getenv("COMPTOX_API_KEY") or os.getenv("CTX_KEY")
60
+ if key:
61
+ headers["x-api-key"] = key
62
+ return headers
63
+
64
+
65
  async def _ctx_get(path: str, http: httpx.AsyncClient, params: Dict[str, Any] | None = None) -> Any:
66
  url = settings.ctx_base_url.rstrip("/") + path
67
+ headers = _ctx_headers()
 
 
68
 
69
  r = await http.get(url, params=params, headers=headers, timeout=25.0, follow_redirects=True)
70
  r.raise_for_status()
 
98
  dtxsid = _pick_dtxsid(rows)
99
  if dtxsid:
100
  return dtxsid
101
+ dtxsid = _extract_dtxsid_any(data)
102
+ if dtxsid:
103
+ return dtxsid
104
  except Exception:
105
  pass
106
 
 
115
  dtxsid = _pick_dtxsid(rows)
116
  if dtxsid:
117
  return dtxsid
118
+ dtxsid = _extract_dtxsid_any(data)
119
+ if dtxsid:
120
+ return dtxsid
121
  except Exception:
122
  pass
123
 
 
146
  dtxsid = _pick_dtxsid(rows)
147
  if dtxsid:
148
  return dtxsid
149
+ dtxsid = _extract_dtxsid_any(data)
150
+ if dtxsid:
151
+ return dtxsid
152
  except Exception:
153
  pass
154
 
 
163
  dtxsid = _pick_dtxsid(rows)
164
  if dtxsid:
165
  return dtxsid
166
+ dtxsid = _extract_dtxsid_any(data)
167
+ if dtxsid:
168
+ return dtxsid
169
  except Exception:
170
  pass
171
 
 
268
  if not q:
269
  return {"ok": False, "error": "Empty query"}
270
 
271
+ if not _ctx_headers().get("x-api-key"):
272
+ return {
273
+ "ok": False,
274
+ "error": "CTX_API_KEY not configured. Please set it in HF Secrets.",
275
+ "dashboard_search": dashboard_search_url(q),
276
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
 
278
  dtxsid = await resolve_dtxsid(q, http)
279
  if not dtxsid:
core/sources/fema.py CHANGED
@@ -13,5 +13,7 @@ def fema_link(cas_or_query: str) -> dict:
13
 
14
  # NOTE: domain spelling matters; the older '...materialssafety...' variant often 404s.
15
  base = "https://fragrancematerialsafetyresource.elsevier.com/"
16
- url = f"{base}?field_cas_tid_1={quote_plus(q)}&field_chemical_synonym_tid="
17
- return {"ok": True, "url": url}
 
 
 
13
 
14
  # NOTE: domain spelling matters; the older '...materialssafety...' variant often 404s.
15
  base = "https://fragrancematerialsafetyresource.elsevier.com/"
16
+ cas_url = f"{base}?field_cas_tid_1={quote_plus(q)}&field_chemical_synonym_tid="
17
+ # Generic search fallback (some deployments ignore CAS filter params)
18
+ search_url = f"{base}search/node?keys={quote_plus(q)}"
19
+ return {"ok": True, "url": cas_url, "alt_url": search_url}
core/sources/ntp.py CHANGED
@@ -52,7 +52,12 @@ def _extract_pdf_url(page_html: str, page_url: str) -> Optional[str]:
52
  async def _fetch_tr_page(num: str, http: httpx.AsyncClient) -> Optional[Dict[str, Any]]:
53
  page_url = f"{BASE}/publications/reports/tr{num}"
54
  try:
55
- r = await http.get(page_url, timeout=25, follow_redirects=True)
 
 
 
 
 
56
  if r.status_code >= 400:
57
  return None
58
  page_html = r.text
@@ -116,12 +121,18 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
116
  "pdf": row.get("pdf"),
117
  }
118
  )
119
- return {"ok": True, "query": q, "items": items}
 
120
  except Exception:
121
  pass
122
 
123
  try:
124
- r = await http.get(REPORTS_URL, timeout=25, follow_redirects=True)
 
 
 
 
 
125
  r.raise_for_status()
126
  index_html = r.text
127
  except Exception as e:
@@ -129,6 +140,8 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
129
 
130
  plain = _strip_tags(index_html)
131
  q_low = q.lower()
 
 
132
 
133
  nums: List[str] = []
134
  seen = set()
@@ -138,9 +151,13 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
138
  # neighborhood window similar to production
139
  start = max(0, m.start() - 250)
140
  end = min(len(plain), m.end() + 250)
141
- neighborhood = plain[start:end].lower()
142
- if q_low not in neighborhood:
143
- continue
 
 
 
 
144
  if num in seen:
145
  continue
146
  seen.add(num)
@@ -167,8 +184,12 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
167
  continue
168
  block = " ".join(idx_lines[i : i + 6])
169
  block_text = _strip_tags(block)
170
- if q_low not in block_text.lower():
171
- continue
 
 
 
 
172
  m = TR_RE.search(block_text)
173
  if not m:
174
  continue
 
52
  async def _fetch_tr_page(num: str, http: httpx.AsyncClient) -> Optional[Dict[str, Any]]:
53
  page_url = f"{BASE}/publications/reports/tr{num}"
54
  try:
55
+ r = await http.get(
56
+ page_url,
57
+ timeout=25,
58
+ follow_redirects=True,
59
+ headers={"User-Agent": "Mozilla/5.0"},
60
+ )
61
  if r.status_code >= 400:
62
  return None
63
  page_html = r.text
 
121
  "pdf": row.get("pdf"),
122
  }
123
  )
124
+ if items:
125
+ return {"ok": True, "query": q, "items": items}
126
  except Exception:
127
  pass
128
 
129
  try:
130
+ r = await http.get(
131
+ REPORTS_URL,
132
+ timeout=25,
133
+ follow_redirects=True,
134
+ headers={"User-Agent": "Mozilla/5.0"},
135
+ )
136
  r.raise_for_status()
137
  index_html = r.text
138
  except Exception as e:
 
140
 
141
  plain = _strip_tags(index_html)
142
  q_low = q.lower()
143
 + is_cas = bool(re.match(r"^\d{2,7}-\d{2}-\d$", q))
144
 + q_digits = re.sub(r"\D", "", q) if is_cas else ""
145
 
146
  nums: List[str] = []
147
  seen = set()
 
151
  # neighborhood window similar to production
152
  start = max(0, m.start() - 250)
153
  end = min(len(plain), m.end() + 250)
154
+ neighborhood = plain[start:end]
155
+ if is_cas:
156
 + if q_digits not in re.sub(r"\D", "", neighborhood):
157
+ continue
158
+ else:
159
+ if q_low not in neighborhood.lower():
160
+ continue
161
  if num in seen:
162
  continue
163
  seen.add(num)
 
184
  continue
185
  block = " ".join(idx_lines[i : i + 6])
186
  block_text = _strip_tags(block)
187
+ if is_cas:
188
 + if q_digits not in re.sub(r"\D", "", block_text):
189
+ continue
190
+ else:
191
+ if q_low not in block_text.lower():
192
+ continue
193
  m = TR_RE.search(block_text)
194
  if not m:
195
  continue