Spaces:

hchevva
/

TOXRA.AI

Sleeping

App Files Files Community

hchevva committed 15 days ago

Commit

02835d5

verified ·

1 Parent(s): 01ce8ad

Upload 4 files

Browse files

Files changed (4) hide show

core/sources/ctx.py +38 -1
core/sources/fema.py +63 -6
core/sources/ntp.py +13 -2
core/sources/pubchem.py +11 -0

core/sources/ctx.py CHANGED Viewed

@@ -60,9 +60,18 @@ def _extract_dtxsid_any(data: Any) -> Optional[str]:
 def _ctx_headers() -> Dict[str, str]:
     headers = {"accept": "application/json"}
-    key = settings.ctx_api_key or os.getenv("CTX_API_KEY") or os.getenv("COMPTOX_API_KEY") or os.getenv("CTX_KEY")
     if key:
         headers["x-api-key"] = key
     return headers
@@ -88,6 +97,11 @@ async def _resolve_from_cas(cas: str, http: httpx.AsyncClient) -> Optional[str]:
         (f"/chemical/identifiers/by-cas/{quote(clean)}", None),
         (f"/chemical/identifiers/search/by-cas/{quote(clean)}", None),
         ("/chemical/identifiers", {"cas": clean}),
         ("/chemical/search/equal", {"word": clean}),
         ("/chemical/search/contains", {"word": clean}),
         ("/chemical/search", {"matchType": "equal", "word": clean}),
@@ -137,6 +151,10 @@ async def _resolve_from_name(name: str, http: httpx.AsyncClient) -> Optional[str
         (f"/chemical/identifiers/by-name/{quote(q)}", None),
         (f"/chemical/identifiers/search/by-name/{quote(q)}", None),
         ("/chemical/identifiers", {"name": q}),
         ("/chemical/search/equal", {"word": q}),
         ("/chemical/search/contains", {"word": q}),
         ("/chemical/search", {"matchType": "equal", "word": q}),
@@ -304,6 +322,25 @@ async def fetch_ctx_genetox(cas_or_query: str, http: httpx.AsyncClient) -> Dict[
                 "dashboard_url": dashboard_details_url(found or q),
             }
         return {
             "ok": False,
             "error": "No DTXSID found for this query.",

 def _ctx_headers() -> Dict[str, str]:
     headers = {"accept": "application/json"}
+    key = (
+        settings.ctx_api_key
+        or os.getenv("CTX_API_KEY")
+        or os.getenv("COMPTOX_API_KEY")
+        or os.getenv("CTX_KEY")
+    )
+    if isinstance(key, str):
+        key = key.strip()
     if key:
         headers["x-api-key"] = key
+        headers["X-Api-Key"] = key
+    headers["user-agent"] = "toxrai-hf-demo"
     return headers
         (f"/chemical/identifiers/by-cas/{quote(clean)}", None),
         (f"/chemical/identifiers/search/by-cas/{quote(clean)}", None),
         ("/chemical/identifiers", {"cas": clean}),
+        ("/chemical/identifiers", {"casrn": clean}),
+        ("/chemical/identifiers/search", {"casrn": clean}),
+        ("/chemical/search", {"query": clean, "type": "equals"}),
+        ("/chemical/search", {"query": clean, "type": "contains"}),
+        ("/chemical/search", {"searchType": "equals", "query": clean}),
         ("/chemical/search/equal", {"word": clean}),
         ("/chemical/search/contains", {"word": clean}),
         ("/chemical/search", {"matchType": "equal", "word": clean}),
         (f"/chemical/identifiers/by-name/{quote(q)}", None),
         (f"/chemical/identifiers/search/by-name/{quote(q)}", None),
         ("/chemical/identifiers", {"name": q}),
+        ("/chemical/identifiers/search", {"name": q}),
+        ("/chemical/search", {"query": q, "type": "equals"}),
+        ("/chemical/search", {"query": q, "type": "contains"}),
+        ("/chemical/search", {"searchType": "equals", "query": q}),
         ("/chemical/search/equal", {"word": q}),
         ("/chemical/search/contains", {"word": q}),
         ("/chemical/search", {"matchType": "equal", "word": q}),
                 "dashboard_url": dashboard_details_url(found or q),
             }
+        # Try one direct identifier call to surface CTX errors (auth, etc.)
+        try:
+            if is_cas(q):
+                await _ctx_get(f"/chemical/identifiers/by-cas/{quote(q)}", http)
+            else:
+                await _ctx_get(f"/chemical/identifiers/by-name/{quote(q)}", http)
+        except httpx.HTTPStatusError as e:
+            return {
+                "ok": False,
+                "error": f"CTX API error {e.response.status_code}: {e.response.text[:200]}",
+                "dashboard_search": dashboard_search_url(q),
+            }
+        except Exception as e:
+            return {
+                "ok": False,
+                "error": f"CTX request failed: {e}",
+                "dashboard_search": dashboard_search_url(q),
+            }
         return {
             "ok": False,
             "error": "No DTXSID found for this query.",

core/sources/fema.py CHANGED Viewed

@@ -1,5 +1,48 @@
 import os
-from urllib.parse import quote_plus
 def fema_link(cas_or_query: str, name_query: str | None = None) -> dict:
@@ -15,15 +58,29 @@ def fema_link(cas_or_query: str, name_query: str | None = None) -> dict:
     # NOTE: domain spelling matters; the older '...materialssafety...' variant often 404s.
     base = os.getenv("FEMA_BASE_URL", "https://fragrancematerialsafetyresource.elsevier.com/")
-    cas_param = os.getenv("FEMA_CAS_PARAM", "field_cas_tid_1")
-    name_param = os.getenv("FEMA_NAME_PARAM", "field_chemical_synonym_tid")
     cas_value = quote_plus(q) if q else ""
     name_value = quote_plus(name_q or q)
-    cas_url = f"{base}?{cas_param}={cas_value}&{name_param}=" if cas_value else ""
-    name_url = f"{base}?{cas_param}=&{name_param}={name_value}" if name_value else ""
     # Generic search fallback (some deployments ignore filter params)
     search_term = name_q or q
     search_url = f"{base}search/node?keys={quote_plus(search_term)}" if search_term else ""
-    return {"ok": True, "cas_url": cas_url, "name_url": name_url, "alt_url": search_url}

 import os
+from urllib.parse import quote_plus, urljoin
+import httpx
+from bs4 import BeautifulSoup
+_PARAM_CACHE: dict[str, tuple[str, str, str]] = {}
+def _discover_params(base: str) -> tuple[str, str, str]:
+    if base in _PARAM_CACHE:
+        return _PARAM_CACHE[base]
+    cas_param = os.getenv("FEMA_CAS_PARAM", "field_cas_tid_1")
+    name_param = os.getenv("FEMA_NAME_PARAM", "field_chemical_synonym_tid")
+    action = os.getenv("FEMA_FORM_ACTION", base)
+    try:
+        r = httpx.get(base, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
+        if r.status_code < 400:
+            soup = BeautifulSoup(r.text, "lxml")
+            form = soup.find("form")
+            if form and form.get("action"):
+                action = urljoin(base, form.get("action"))
+            inputs = soup.find_all("input")
+            for inp in inputs:
+                name = (inp.get("name") or "").strip()
+                if not name:
+                    continue
+                placeholder = (inp.get("placeholder") or "").lower()
+                lower_name = name.lower()
+                if "cas" in placeholder or lower_name == "cas" or "cas" in lower_name:
+                    cas_param = name
+                if (
+                    "synonym" in placeholder
+                    or "chemical" in placeholder
+                    or "synonym" in lower_name
+                    or "chemical" in lower_name
+                ):
+                    name_param = name
+    except Exception:
+        pass
+    _PARAM_CACHE[base] = (cas_param, name_param, action)
+    return cas_param, name_param, action
 def fema_link(cas_or_query: str, name_query: str | None = None) -> dict:
     # NOTE: domain spelling matters; the older '...materialssafety...' variant often 404s.
     base = os.getenv("FEMA_BASE_URL", "https://fragrancematerialsafetyresource.elsevier.com/")
+    cas_param, name_param, action = _discover_params(base)
     cas_value = quote_plus(q) if q else ""
     name_value = quote_plus(name_q or q)
+    cas_url = f"{action}?{cas_param}={cas_value}&{name_param}=" if cas_value else ""
+    name_url = f"{action}?{cas_param}=&{name_param}={name_value}" if name_value else ""
+    combo_url = (
+        f"{action}?{cas_param}={cas_value}&{name_param}={name_value}"
+        if cas_value and name_value
+        else ""
+    )
     # Generic search fallback (some deployments ignore filter params)
     search_term = name_q or q
     search_url = f"{base}search/node?keys={quote_plus(search_term)}" if search_term else ""
+    search_api_url = (
+        f"{base}search/node?search_api_fulltext={quote_plus(search_term)}" if search_term else ""
+    )
+    return {
+        "ok": True,
+        "cas_url": cas_url,
+        "name_url": name_url,
+        "combo_url": combo_url,
+        "alt_url": search_url,
+        "search_api_url": search_api_url,
+    }

core/sources/ntp.py CHANGED Viewed

@@ -141,6 +141,8 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
     lines = index_html.splitlines()
     q_low = q.lower()
     results: List[Dict[str, Any]] = []
     for i, line in enumerate(lines):
@@ -149,9 +151,16 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
             continue
         snippet = " ".join(lines[i : min(i + 12, len(lines))])
         low_text = _strip_tags(snippet).lower()
-        if q_low not in low_text:
-            continue
         tr_id = m.group(1)
         hrefs = HREF_RE.findall(snippet)
@@ -178,6 +187,8 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
         pdf_url = urljoin(REPORTS_URL, pdf_match.group(1)) if pdf_match else None
         text_block = _strip_tags(snippet)
         title_match = re.search(
             r"TR-\d{3,}\s+PDF\s+(.*?)\s+\b(20\d{2}|19\d{2})\b\s+Technical Report",
             text_block,

     lines = index_html.splitlines()
     q_low = q.lower()
+    is_cas = bool(re.match(r"^\d{2,7}-\d{2}-\d$", q))
+    q_digits = re.sub(r"\D", "", q) if is_cas else ""
     results: List[Dict[str, Any]] = []
     for i, line in enumerate(lines):
             continue
         snippet = " ".join(lines[i : min(i + 12, len(lines))])
+        mini_snippet = " ".join(lines[i : min(i + 3, len(lines))])
         low_text = _strip_tags(snippet).lower()
+        mini_text = _strip_tags(mini_snippet)
+        if is_cas:
+            if q_digits not in re.sub(r"\D", "", mini_text):
+                continue
+        else:
+            if q_low not in low_text:
+                continue
         tr_id = m.group(1)
         hrefs = HREF_RE.findall(snippet)
         pdf_url = urljoin(REPORTS_URL, pdf_match.group(1)) if pdf_match else None
         text_block = _strip_tags(snippet)
+        if is_cas and q_digits not in re.sub(r"\D", "", text_block):
+            continue
         title_match = re.search(
             r"TR-\d{3,}\s+PDF\s+(.*?)\s+\b(20\d{2}|19\d{2})\b\s+Technical Report",
             text_block,

core/sources/pubchem.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import html
 import re
 from typing import Any, Dict, List, Optional
 from urllib.parse import quote
@@ -9,6 +10,7 @@ PUBCHEM_REST = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
 PUBCHEM_VIEW = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
 CAS_RE = re.compile(r"^\d{2,7}-\d{2}-\d$")
 def is_cas(s: str) -> bool:
@@ -196,6 +198,14 @@ async def pubchem_by_query(query: str, http: httpx.AsyncClient) -> Dict[str, Any
         seen.add(key)
         uniq_haz.append(h)
     return {
         "ok": True,
         "query": q,
@@ -206,4 +216,5 @@ async def pubchem_by_query(query: str, http: httpx.AsyncClient) -> Dict[str, Any
         "url": _compound_url(cid),
         "synonyms": synonyms[:50],
         "hazards": uniq_haz,
     }

 import html
+import json
 import re
 from typing import Any, Dict, List, Optional
 from urllib.parse import quote
 PUBCHEM_VIEW = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
 CAS_RE = re.compile(r"^\d{2,7}-\d{2}-\d$")
+DTXSID_RE = re.compile(r"DTXSID\d{7,}")
 def is_cas(s: str) -> bool:
         seen.add(key)
         uniq_haz.append(h)
+    dtxsid = None
+    try:
+        m = DTXSID_RE.search(json.dumps(record_json))
+        if m:
+            dtxsid = m.group(0)
+    except Exception:
+        dtxsid = None
     return {
         "ok": True,
         "query": q,
         "url": _compound_url(cid),
         "synonyms": synonyms[:50],
         "hazards": uniq_haz,
+        "dtxsid": dtxsid,
     }