hchevva committed
Commit 01ce8ad · verified · 1 Parent(s): d59d94c

Upload 3 files

Files changed (3)
  1. core/sources/ctx.py +28 -1
  2. core/sources/fema.py +17 -7
  3. core/sources/ntp.py +99 -79
core/sources/ctx.py CHANGED
@@ -38,10 +38,14 @@ def _as_rows(data: Any) -> List[Any]:
     if isinstance(data, list):
         return data
     if isinstance(data, dict):
+        if "identifier" in data or "chemical" in data or "DTXSID" in data or "dtxsid" in data:
+            return [data]
         for key in ("data", "results", "items"):
            v = data.get(key)
            if isinstance(v, list):
                return v
+           if isinstance(v, dict):
+               return [v]
     return []
 
 
@@ -275,8 +279,31 @@ async def fetch_ctx_genetox(cas_or_query: str, http: httpx.AsyncClient) -> Dict[
             "dashboard_search": dashboard_search_url(q),
         }
 
-    dtxsid = await resolve_dtxsid(q, http)
+    if q.upper().startswith("DTXSID"):
+        dtxsid = q.strip()
+    else:
+        dtxsid = await resolve_dtxsid(q, http)
     if not dtxsid:
+        # Attempt direct hazard search by CAS or name (some deployments return summary directly)
+        try:
+            data = await _ctx_get("/hazard/genetox/summary/search", http, params={"cas": q})
+        except Exception:
+            data = None
+        if not data:
+            try:
+                data = await _ctx_get("/hazard/genetox/summary/search", http, params={"name": q})
+            except Exception:
+                data = None
+
+        if data:
+            found = _extract_dtxsid_any(data)
+            return {
+                "ok": True,
+                "dtxsid": found,
+                "summary": data,
+                "dashboard_url": dashboard_details_url(found or q),
+            }
+
         return {
             "ok": False,
             "error": "No DTXSID found for this query.",
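
For context on the ctx.py change above, here is a minimal sketch of how the reworked fetch_ctx_genetox could be driven. The function name, module path, and fallback behaviour come from the diff; the driver itself, the client settings, and the example identifiers are illustrative only, and real calls may additionally need whatever API key _ctx_get is configured to use.

# Illustrative driver; fetch_ctx_genetox comes from core/sources/ctx.py above,
# the queries below are placeholder examples.
import asyncio
import httpx

from core.sources.ctx import fetch_ctx_genetox


async def demo() -> None:
    async with httpx.AsyncClient(timeout=30) as http:
        # A DTXSID query now bypasses resolve_dtxsid entirely.
        by_id = await fetch_ctx_genetox("DTXSID1234567", http)
        # A CAS query that cannot be resolved to a DTXSID now falls back to
        # /hazard/genetox/summary/search with cas=..., then name=...
        by_cas = await fetch_ctx_genetox("50-00-0", http)
        print(by_id.get("ok"), by_cas.get("ok"))


asyncio.run(demo())
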
core/sources/fema.py CHANGED
@@ -1,19 +1,29 @@
+import os
 from urllib.parse import quote_plus
 
 
-def fema_link(cas_or_query: str) -> dict:
+def fema_link(cas_or_query: str, name_query: str | None = None) -> dict:
     """Build the FEMA / Fragrance Materials Safety Resource search URL.
 
     Production uses Elsevier's Fragrance Materials Safety Resource with CAS query params.
     """
 
     q = (cas_or_query or "").strip()
-    if not q:
+    name_q = (name_query or "").strip()
+    if not q and not name_q:
         return {"ok": False, "error": "Empty query"}
 
     # NOTE: domain spelling matters; the older '...materialssafety...' variant often 404s.
-    base = "https://fragrancematerialsafetyresource.elsevier.com/"
-    cas_url = f"{base}?field_cas_tid_1={quote_plus(q)}&field_chemical_synonym_tid="
-    # Generic search fallback (some deployments ignore CAS filter params)
-    search_url = f"{base}search/node?keys={quote_plus(q)}"
-    return {"ok": True, "url": cas_url, "alt_url": search_url}
+    base = os.getenv("FEMA_BASE_URL", "https://fragrancematerialsafetyresource.elsevier.com/")
+    cas_param = os.getenv("FEMA_CAS_PARAM", "field_cas_tid_1")
+    name_param = os.getenv("FEMA_NAME_PARAM", "field_chemical_synonym_tid")
+
+    cas_value = quote_plus(q) if q else ""
+    name_value = quote_plus(name_q or q)
+
+    cas_url = f"{base}?{cas_param}={cas_value}&{name_param}=" if cas_value else ""
+    name_url = f"{base}?{cas_param}=&{name_param}={name_value}" if name_value else ""
+    # Generic search fallback (some deployments ignore filter params)
+    search_term = name_q or q
+    search_url = f"{base}search/node?keys={quote_plus(search_term)}" if search_term else ""
+    return {"ok": True, "cas_url": cas_url, "name_url": name_url, "alt_url": search_url}
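
Since fema_link is a pure URL builder, the new signature and environment overrides are easy to exercise. The env var names and return keys come from the diff above; the CAS/name pair below is just an example value.

# Illustrative usage of the new fema_link signature; values are examples only.
import os

from core.sources.fema import fema_link

# Optional overrides; omit them to keep the defaults baked into the module.
os.environ.setdefault("FEMA_BASE_URL", "https://fragrancematerialsafetyresource.elsevier.com/")

links = fema_link("121-33-5", name_query="vanillin")
print(links["cas_url"])   # CAS-filtered URL (empty string when no CAS was given)
print(links["name_url"])  # synonym-filtered URL
print(links["alt_url"])   # generic search/node fallback
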
core/sources/ntp.py CHANGED
@@ -14,6 +14,7 @@ INDEX_URL = "https://ntp.niehs.nih.gov/data/tr"
 TR_RE = re.compile(r"\bTR-?(\d{3,4})\b", re.IGNORECASE)
 HREF_RE = re.compile(r"href=[\"']([^\"']+)[\"']", re.IGNORECASE)
 TITLE_RE = re.compile(r"<title[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)
+TR_MARKER_RE = re.compile(r">(TR-\d{3,})<", re.IGNORECASE)
 
 
 def _strip_tags(html_text: str) -> str:
@@ -138,91 +139,110 @@ async def search_technical_reports(query: str, http: httpx.AsyncClient, limit: i
     except Exception as e:
         return {"ok": False, "error": f"Failed to fetch NTP index: {e}", "items": []}
 
-    plain = _strip_tags(index_html)
+    lines = index_html.splitlines()
     q_low = q.lower()
-    is_cas = bool(re.match(r"^\d{2,7}-\d{2}-\d$", q))
-    q_digits = re.sub(r"\D", "", q) if is_cas else ""
-
-    nums: List[str] = []
-    seen = set()
-
-    for m in TR_RE.finditer(plain):
-        num = m.group(1)
-        # neighborhood window similar to production
-        start = max(0, m.start() - 250)
-        end = min(len(plain), m.end() + 250)
-        neighborhood = plain[start:end]
-        if is_cas:
-            if q_digits not in re.sub(r"\D", "", neighborhood):
-                continue
-        else:
-            if q_low not in neighborhood.lower():
-                continue
-        if num in seen:
+    results: List[Dict[str, Any]] = []
+
+    for i, line in enumerate(lines):
+        m = TR_MARKER_RE.search(line)
+        if not m:
+            continue
+
+        snippet = " ".join(lines[i : min(i + 12, len(lines))])
+        low_text = _strip_tags(snippet).lower()
+        if q_low not in low_text:
             continue
-        seen.add(num)
-        nums.append(num)
-        if len(nums) >= max(1, int(limit)):
+
+        tr_id = m.group(1)
+        hrefs = HREF_RE.findall(snippet)
+
+        # Prefer a non-PDF link under /publications/ or /go/
+        candidates = [urljoin(REPORTS_URL, h) for h in hrefs if not re.search(r"\.pdf(\?|$)", h, re.I)]
+
+        def score(u: str) -> int:
+            s = 0
+            if "/publications/" in u:
+                s += 3
+            if "/go/" in u:
+                s += 3
+            if tr_id and tr_id.lower() in u.lower():
+                s += 2
+            if re.search(r"/reports?", u):
+                s += 1
+            return s
+
+        candidates.sort(key=score, reverse=True)
+        page_href = candidates[0] if candidates else None
+
+        pdf_match = re.search(r'href="([^"]+\.pdf)"', snippet, re.I)
+        pdf_url = urljoin(REPORTS_URL, pdf_match.group(1)) if pdf_match else None
+
+        text_block = _strip_tags(snippet)
+        title_match = re.search(
+            r"TR-\d{3,}\s+PDF\s+(.*?)\s+\b(20\d{2}|19\d{2})\b\s+Technical Report",
+            text_block,
+            re.I,
+        )
+        year_match = re.search(r"\b(20\d{2}|19\d{2})\b", text_block)
+
+        results.append(
+            {
+                "tr": tr_id,
+                "title": title_match.group(1) if title_match else "",
+                "year": year_match.group(1) if year_match else "",
+                "pdf": pdf_url,
+                "report_page": page_href or REPORTS_URL,
+            }
+        )
+        if len(results) >= int(limit):
             break
 
-    if not nums:
-        # Fallback: scan the TR index page (data/tr)
+    # Fallback: scan the TR index page (data/tr)
+    if not results:
         try:
-            r2 = await http.get(INDEX_URL, timeout=25, follow_redirects=True)
-            if r2.status_code >= 400:
-                return {"ok": True, "query": q, "items": []}
-            idx_html = r2.text
+            r2 = await http.get(
+                INDEX_URL,
+                timeout=25,
+                follow_redirects=True,
+                headers={"User-Agent": "Mozilla/5.0"},
+            )
+            if r2.status_code < 400:
+                idx_html = r2.text
+            else:
+                idx_html = ""
         except Exception:
-            return {"ok": True, "query": q, "items": []}
-
-        idx_lines = idx_html.splitlines()
-        items: List[Dict[str, Any]] = []
-        seen = set()
-
-        for i, line in enumerate(idx_lines):
-            if not TR_RE.search(line):
-                continue
-            block = " ".join(idx_lines[i : i + 6])
-            block_text = _strip_tags(block)
-            if is_cas:
-                if q_digits not in re.sub(r"\D", "", block_text):
+            idx_html = ""
+
+        if idx_html:
+            idx_lines = idx_html.splitlines()
+            for i, row in enumerate(idx_lines):
+                if not re.search(r"TR-\d{3,}", row, re.I):
                     continue
-            else:
-                if q_low not in block_text.lower():
+                block = " ".join(idx_lines[i : min(i + 6, len(idx_lines))])
+                block_text = _strip_tags(block)
+                low = block_text.lower()
+                if q_low not in low:
                     continue
-            m = TR_RE.search(block_text)
-            if not m:
-                continue
-            num = m.group(1)
-            if num in seen:
-                continue
-            seen.add(num)
-
-            # Derive a best-effort title from the block text
-            title = re.sub(TR_RE, "", block_text).strip()
-            title = re.sub(r"\b\d{2,7}-\d{2}-\d\b", "", title).strip()
-
-            items.append(
-                {
-                    "num": num,
-                    "tr": f"TR-{num}",
-                    "report_page": INDEX_URL,
-                    "title": title,
-                    "year": None,
-                    "pdf": None,
-                }
-            )
-            if len(items) >= max(1, int(limit)):
-                break
-
-        return {"ok": True, "query": q, "items": items}
-
-    items: List[Dict[str, Any]] = []
-    for num in nums:
-        item = await _fetch_tr_page(num, http)
-        if item:
-            items.append(item)
-        if len(items) >= max(1, int(limit)):
-            break
 
-    return {"ok": True, "query": q, "items": items}
+                tr = re.search(r"TR-\d{3,}", block_text, re.I)
+                cas = re.search(r"\b\d{2,7}-\d{2}-\d\b", block_text)
+                name = block_text
+                if tr:
+                    name = re.sub(r"TR-\d{3,}", "", name, flags=re.I)
+                if cas:
+                    name = re.sub(r"\b\d{2,7}-\d{2}-\d\b", "", name)
+                name = name.strip()
+
+                results.append(
+                    {
+                        "tr": tr.group(0) if tr else "",
+                        "title": name or "",
+                        "year": "",
+                        "pdf": None,
+                        "report_page": INDEX_URL,
+                    }
+                )
+                if len(results) >= int(limit):
+                    break
+
+    return {"ok": True, "query": q, "items": results}
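
Finally, a minimal sketch of driving the rewritten NTP search end to end. The module path, function name, and returned item keys follow the diff; the query string, limit, and client setup are placeholders.

# Illustrative driver for search_technical_reports; the query is an example only.
import asyncio
import httpx

from core.sources.ntp import search_technical_reports


async def demo() -> None:
    async with httpx.AsyncClient(follow_redirects=True) as http:
        out = await search_technical_reports("benzene", http, limit=5)
        for item in out.get("items", []):
            # Each hit now carries tr, title, year, pdf and report_page.
            print(item["tr"], item["year"], item["title"], item["report_page"])


asyncio.run(demo())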