Brain_Cancer_Trails_Finder / ctgov_client.py
PrazNeuro's picture
Upload 12 files
b950dbe verified
# Shared client for ClinicalTrials.gov v2 API and scoring
import logging
import re
from typing import Any, Dict, List, Tuple

import requests
# Search/matching synonyms for each supported diagnosis label.
# Keys are the labels looked up by build_terms() and score_trial();
# values are the phrases queried and matched for that diagnosis.
DEFAULT_DIAG_TERMS = {
    "Glioblastoma": ["glioblastoma", "GBM", "glioblastoma multiforme"],
    "Diffuse midline glioma": ["diffuse midline glioma", "DMG", "H3 K27M"],
    "Anaplastic astrocytoma": ["anaplastic astrocytoma", "grade 3 astrocytoma"],
    "Astrocytoma": ["astrocytoma", "grade 2 astrocytoma", "grade 4 astrocytoma"],
    "Oligodendroglioma": ["oligodendroglioma", "1p19q codeleted"],
    "Meningioma": ["meningioma"],
    "Medulloblastoma": ["medulloblastoma"],
    "Ependymoma": ["ependymoma"],
    "Spinal cord tumor": ["spinal cord tumor", "spinal cord neoplasm"],
}

# Endpoint that ctgov_search_one() sends its GET requests to.
API_BASE = "https://clinicaltrials.gov/api/v2/studies"

# Default headers installed on the HTTP session used for API calls.
UA = {"User-Agent": "BrainTrialsFinder-Desktop/1.0 (+https://clinicaltrials.gov)"}
def build_terms(diagnosis: str, keywords: str) -> List[str]:
    """Build the ordered list of search terms for one patient intake.

    The list starts with the synonyms registered for ``diagnosis`` in
    ``DEFAULT_DIAG_TERMS`` (or three generic neuro-oncology terms when the
    diagnosis is not registered) and is followed by the user's own keywords.

    Args:
        diagnosis: Diagnosis label; looked up verbatim in
            ``DEFAULT_DIAG_TERMS``.
        keywords: Comma-separated free-text keywords. ``None`` or an empty
            string contributes nothing; blank entries are dropped.

    Returns:
        Search terms in priority order with case-insensitive duplicates
        removed (the first spelling wins).
    """
    generic = ["brain tumor", "spinal cord tumor", "CNS tumor"]
    candidates = list(DEFAULT_DIAG_TERMS.get(diagnosis, generic))
    candidates += [k.strip() for k in (keywords or "").split(",") if k.strip()]

    # Every term becomes its own paginated API query, so a keyword that merely
    # repeats a default synonym (e.g. "gbm") would double the network work
    # without adding a single study to the merged result.
    seen = set()
    terms: List[str] = []
    for term in candidates:
        folded = term.casefold()
        if folded not in seen:
            seen.add(folded)
            terms.append(term)
    return terms
def ctgov_search_one(term: str, statuses: List[str], page_size: int = 100, max_pages: int = 5) -> List[Dict[str, Any]]:
    """Fetch the studies matching a single search term.

    Issues GET requests against ``API_BASE`` and follows ``nextPageToken``
    until the response carries no token, a page comes back without studies,
    or ``max_pages`` requests have been made.

    Args:
        term: Expression sent as the ``query.term`` parameter.
        statuses: Overall-status values, sent comma-joined as
            ``filter.overallStatus``. A single string is treated as one
            status; when empty or ``None`` the filter is not sent at all.
        page_size: Value sent as ``pageSize``.
        max_pages: Maximum number of requests. ``None`` or ``0`` performs no
            request and yields an empty list.

    Returns:
        The raw study dicts of all fetched pages, in response order.

    Raises:
        requests.HTTPError: If a response has an error status code.
    """
    if isinstance(statuses, str):
        # ",".join() on a bare string would join its characters.
        statuses = [statuses]

    all_studies: List[Dict[str, Any]] = []
    page_token = None
    page_limit = max_pages or 0
    pages_read = 0

    # The context manager closes the session (and its pooled connections)
    # even when a request raises.
    with requests.Session() as session:
        session.headers.update(UA)
        while pages_read < page_limit:
            params = {"query.term": term, "pageSize": page_size}
            if statuses:
                # Only send the filter when there is something to filter on,
                # rather than an empty parameter value.
                params["filter.overallStatus"] = ",".join(statuses)
            if page_token:
                params["pageToken"] = page_token

            response = session.get(API_BASE, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()

            studies = data.get("studies", [])
            if not studies:
                break
            all_studies.extend(studies)

            page_token = data.get("nextPageToken")
            if not page_token:
                break
            pages_read += 1
    return all_studies
def fetch_all_terms(terms: List[str], statuses: List[str], page_size=100, max_pages=5) -> List[Dict[str, Any]]:
    """Run one search per term and merge the results without duplicates.

    Studies are keyed by their ``nctId`` (read from
    ``protocolSection.identificationModule``); the first occurrence of each
    id is kept, in the order encountered. A study without an id is keyed by
    its object identity, so it is always kept.

    Args:
        terms: Search terms, each passed to ``ctgov_search_one``. ``None`` is
            treated as no terms.
        statuses: Forwarded unchanged to ``ctgov_search_one``.
        page_size: Forwarded unchanged to ``ctgov_search_one``.
        max_pages: Forwarded unchanged to ``ctgov_search_one``.

    Returns:
        The merged list of raw study dicts.

    A term whose query fails with ``requests.HTTPError`` is skipped so the
    remaining terms can still contribute; the failure is logged as a warning
    instead of disappearing silently. Other exceptions propagate.
    """
    unique: Dict[Any, Dict[str, Any]] = {}
    for term in terms or []:
        # Keep the try body down to the one call that can raise HTTPError.
        try:
            studies = ctgov_search_one(term, statuses, page_size=page_size, max_pages=max_pages)
        except requests.HTTPError as exc:
            logging.getLogger(__name__).warning(
                "ClinicalTrials.gov query for %r failed: %s", term, exc
            )
            continue

        for study in studies:
            ident = (study.get("protocolSection", {}) or {}).get("identificationModule", {}) or {}
            key = ident.get("nctId") or id(study)
            unique.setdefault(key, study)
    return list(unique.values())
def mentions(txt: str, term: str) -> bool:
    """Return True when ``term`` occurs in ``txt`` as a whole word or phrase.

    Matching ignores case. Any run of whitespace in ``txt`` (including line
    breaks) satisfies the whitespace between the words of ``term``.

    Args:
        txt: Text to search; ``None`` is treated as empty.
        term: Word or phrase to look for. An empty or whitespace-only term
            never matches.

    Returns:
        Whether the term was found.
    """
    words = (term or "").split()
    if not words:
        return False
    phrase = r"\s+".join(re.escape(word) for word in words)
    # Lookarounds instead of \b: \b needs a word character on one side, so a
    # term that starts or ends with punctuation (e.g. "IDH1+") would not
    # match when followed by a space. For terms that start and end with word
    # characters the two forms are equivalent.
    return re.search(rf"(?<!\w){phrase}(?!\w)", txt or "", re.I) is not None
def as_text(obj: Any) -> str:
    """Flatten a value into a plain string.

    * ``None`` becomes ``""``.
    * A list is flattened element by element and joined with ``"; "``.
    * A dict yields the value stored under the first of ``textblock``,
      ``textBlock`` or ``value`` that is present (falsy values become
      ``""``); without any of these keys, its non-``None`` values are
      stringified and joined with single spaces.
    * Anything else goes through ``str()``.
    """
    if isinstance(obj, list):
        return "; ".join(map(as_text, obj))

    if isinstance(obj, dict):
        preferred = next(
            (name for name in ("textblock", "textBlock", "value") if name in obj),
            None,
        )
        if preferred is not None:
            return str(obj.get(preferred) or "")
        return " ".join(str(item) for item in obj.values() if item is not None)

    return "" if obj is None else str(obj)
def parse_age_to_int(v: Any):
    """Parse an age value into whole years.

    Args:
        v: ``None``, a number, a string such as ``"18 Years"`` or
            ``"6 Months"``, or a dict whose ``"value"`` entry holds one of
            those.

    Returns:
        The age in completed years as an ``int``, or ``None`` when ``v`` is
        ``None`` or contains no digits. Numbers are taken to be years
        already. In strings, the word after the first number selects the
        unit: months, weeks, days, hours and minutes are converted to years
        (rounded down); any other or missing unit is read as years.
    """
    if v is None:
        return None
    if isinstance(v, dict):
        return parse_age_to_int(v.get("value"))
    if isinstance(v, (int, float)):
        return int(v)

    m = re.search(r"(\d+)\s*([A-Za-z]*)", str(v))
    if not m:
        return None
    amount = int(m.group(1))
    unit = m.group(2).lower()

    # Without the conversion "6 Months" would be read as 6 years and a
    # toddler would be flagged as below the minimum age.
    units_per_year = (
        ("month", 12),
        ("week", 52),
        ("day", 365),
        ("hour", 8760),
        ("minute", 525600),
    )
    for prefix, per_year in units_per_year:
        if unit.startswith(prefix):
            return amount // per_year
    return amount
def ensure_list(v: Any):
    """Coerce ``v`` to a list.

    A list is returned unchanged (the same object), ``None`` becomes an
    empty list, and any other value is wrapped in a one-element list.
    """
    if isinstance(v, list):
        return v
    return [] if v is None else [v]
def _coerce_number(value: Any):
    """Return ``value`` as a float, or None when it is missing or not numeric."""
    if value is None or isinstance(value, bool):
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def score_trial(t: Dict[str, Any], intake: Dict[str, Any]) -> Tuple[int, List[str]]:
    """Score how well a study fits a patient intake.

    Args:
        t: Raw study dict; everything is read from its ``protocolSection``.
        intake: Patient answers. Keys read: ``age``, ``kps``, ``prior_bev``,
            ``setting``, ``keywords`` (comma-separated) and ``diagnosis``.
            ``None`` is treated as an empty intake. ``age`` and ``kps`` may
            be numbers or numeric strings; non-numeric values count as
            "not provided".

    Returns:
        ``(score, reasons)`` where ``score`` is clamped to 0..100 and
        ``reasons`` lists the human-readable notes collected on the way.

    Scoring rules, applied in this order:
        * +30 diagnosis term found in a condition or the brief title
        * +8 phase 2, +12 phase 3 (both apply to a combined phase 2/3 study)
        * -30 patient age below the minimum, -30 above the maximum
        * -15 criteria mention "ECOG 0-1" and KPS is unknown or below 80
        * -10 criteria mention "Karnofsky" and KPS is unknown or below 70
        * -25 patient had bevacizumab and criteria say "no prior bevacizumab"
        * +8 setting "Recurrent" and criteria mention "recurrent"
        * +8 setting "Newly diagnosed" and criteria mention "newly diagnosed"
          or the title mentions "adjuvant"
        * +3 for every intake keyword found in the title or the criteria
    """
    intake = intake or {}
    # Coerce once up front so that string input ("45") is compared correctly
    # instead of raising TypeError in the comparisons below.
    age_local = _coerce_number(intake.get("age"))
    kps_local = _coerce_number(intake.get("kps"))
    prior_bev_local = bool(intake.get("prior_bev", False))
    setting_local = intake.get("setting") or ""
    keywords_local = intake.get("keywords") or ""
    diagnosis_local = intake.get("diagnosis") or ""

    if diagnosis_local in DEFAULT_DIAG_TERMS:
        diag_terms = DEFAULT_DIAG_TERMS[diagnosis_local]
    elif diagnosis_local and diagnosis_local != "Other":
        # Free-text diagnosis: match on the label itself.
        diag_terms = [diagnosis_local]
    else:
        diag_terms = ["brain tumor", "CNS tumor", "spinal cord tumor"]

    ps = (t or {}).get("protocolSection") or {}

    elig = ps.get("eligibilityModule")
    crit = ""
    min_age = None
    max_age = None
    if isinstance(elig, dict):
        # Fall back to the whole module when no criteria field is present.
        crit_raw = elig.get("eligibilityCriteria") or elig.get("criteria") or elig
        crit = as_text(crit_raw)
        min_age = parse_age_to_int(elig.get("minimumAge"))
        max_age = parse_age_to_int(elig.get("maximumAge"))
    elif isinstance(elig, str):
        crit = as_text(elig)

    # "or {}" (not a .get default) so a module that is present but null does
    # not raise AttributeError.
    design = ps.get("designModule") or {}
    phases_up = [str(p).upper() for p in ensure_list(design.get("phases"))]
    conds_module = ps.get("conditionsModule") or {}
    conds_list = ensure_list(conds_module.get("conditions"))
    title = (ps.get("identificationModule") or {}).get("briefTitle") or ""

    s = 0
    reasons: List[str] = []

    def matches_diagnosis(text: Any) -> bool:
        return any(mentions(as_text(text), term) for term in diag_terms)

    if any(matches_diagnosis(c) for c in conds_list) or matches_diagnosis(title):
        s += 30
        reasons.append(f"Matches diagnosis: {diagnosis_local or 'neuro-oncology'}.")

    if any("PHASE 2" in p or "PHASE2" in p for p in phases_up):
        s += 8
    if any("PHASE 3" in p or "PHASE3" in p for p in phases_up):
        s += 12

    if age_local is not None:
        if min_age is not None and age_local < min_age:
            reasons.append(f"Age below minimum ({min_age}).")
            s -= 30
        if max_age is not None and age_local > max_age:
            reasons.append(f"Age above maximum ({max_age}).")
            s -= 30

    if mentions(crit, "ECOG 0-1") and (kps_local is None or kps_local < 80):
        s -= 15
        reasons.append("Requires ECOG 0–1 (KPS ~≥80).")
    if mentions(crit, "Karnofsky") and (kps_local is None or kps_local < 70):
        s -= 10
        reasons.append("Requires KPS ≥70.")

    if prior_bev_local and mentions(crit, "no prior bevacizumab"):
        s -= 25
        reasons.append("Excludes prior bevacizumab.")

    if setting_local == "Recurrent" and mentions(crit, "recurrent"):
        s += 8
    if setting_local == "Newly diagnosed" and (mentions(crit, "newly diagnosed") or mentions(title, "adjuvant")):
        s += 8

    for kw in [k.strip() for k in (keywords_local or "").split(",") if k.strip()]:
        if mentions(title, kw) or mentions(crit, kw):
            s += 3

    return max(0, min(100, s)), reasons
# python
def extract_row(study: dict) -> dict:
    """Return a flat row dict for the table/PDF. Safe against missing fields.

    Args:
        study: Raw study dict; all values are read from its
            ``protocolSection``. ``None`` yields a row of empty strings.

    Returns:
        A dict with the string fields ``title`` (official title, else brief
        title), ``nct``, ``status`` (e.g. ``NOT_YET_RECRUITING`` ->
        ``Not Yet Recruiting``), ``phases`` and ``conditions``
        (comma-joined), ``sponsor`` (lead sponsor name) and ``city_country``
        (taken from the first listed location only).
    """
    ps = (study or {}).get("protocolSection") or {}
    idm = ps.get("identificationModule") or {}
    scm = ps.get("statusModule") or {}
    dsm = ps.get("designModule") or {}
    cdnm = ps.get("conditionsModule") or {}
    slm = ps.get("sponsorCollaboratorsModule") or {}
    clm = ps.get("contactsLocationsModule") or {}

    title = (idm.get("officialTitle") or idm.get("briefTitle") or "").strip()
    nct = (idm.get("nctId") or "").strip()

    status_raw = (scm.get("overallStatus") or "").strip()
    # e.g., RECRUITING -> Recruiting
    status = status_raw.replace("_", " ").title() if status_raw else ""

    # str() keeps the join from failing on a non-string list item.
    phases = ", ".join(str(p) for p in ensure_list(dsm.get("phases")))
    conditions = ", ".join(str(c) for c in ensure_list(cdnm.get("conditions")))

    sponsor = ""
    lead = slm.get("leadSponsor") or {}
    if isinstance(lead, dict):
        sponsor = (lead.get("name") or "").strip()

    city_country = ""
    locs = ensure_list(clm.get("locations"))
    first = locs[0] if locs else None
    if isinstance(first, dict):
        # v2 JSON locations carry "city"/"country"; the older
        # "locationCity"/"locationCountry" names are kept as a fallback for
        # callers that pass pre-normalized data.
        city = (first.get("city") or first.get("locationCity") or "").strip()
        country = (first.get("country") or first.get("locationCountry") or "").strip()
        city_country = ", ".join(p for p in (city, country) if p)

    return {
        "title": title,
        "nct": nct,
        "status": status,
        "phases": phases,
        "conditions": conditions,
        "sponsor": sponsor,
        "city_country": city_country,
    }