Brain_Cancer_Trails_Finder / ctgov_client.py

Upload 12 files

b950dbe verified about 1 month ago

8.54 kB

	# Shared client for ClinicalTrials.gov v2 API and scoring
	import re
	import requests
	from typing import Any, Dict, List, Tuple

	# Synonym lists keyed by diagnosis label.  build_terms() returns the list for
	# a known diagnosis as search terms, and score_trial() matches the same terms
	# against a trial's conditions and brief title.
	DEFAULT_DIAG_TERMS = {
	"Glioblastoma": ["glioblastoma", "GBM", "glioblastoma multiforme"],
	"Diffuse midline glioma": ["diffuse midline glioma", "DMG", "H3 K27M"],
	"Anaplastic astrocytoma": ["anaplastic astrocytoma", "grade 3 astrocytoma"],
	"Astrocytoma": ["astrocytoma", "grade 2 astrocytoma", "grade 4 astrocytoma"],
	"Oligodendroglioma": ["oligodendroglioma", "1p19q codeleted"],
	"Meningioma": ["meningioma"],
	"Medulloblastoma": ["medulloblastoma"],
	"Ependymoma": ["ependymoma"],
	"Spinal cord tumor": ["spinal cord tumor", "spinal cord neoplasm"],
	}

	# URL that ctgov_search_one() issues its GET requests against.
	API_BASE = "https://clinicaltrials.gov/api/v2/studies"
	# Headers installed on the requests session used by ctgov_search_one().
	UA = {"User-Agent": "BrainTrialsFinder-Desktop/1.0 (+https://clinicaltrials.gov)"}


	def build_terms(diagnosis: str, keywords: str) -> List[str]:
	    """Assemble the search terms for a diagnosis plus free-text keywords.

	    A diagnosis present in ``DEFAULT_DIAG_TERMS`` contributes its synonym
	    list; any other value falls back to three generic CNS-tumor phrases.
	    ``keywords`` is a comma-separated string (``None`` or empty is allowed);
	    each non-blank piece is stripped and appended after the diagnosis terms.
	    A new list is returned on every call.
	    """
	    if diagnosis in DEFAULT_DIAG_TERMS:
	        base = list(DEFAULT_DIAG_TERMS[diagnosis])
	    else:
	        base = ["brain tumor", "spinal cord tumor", "CNS tumor"]
	    stripped = (piece.strip() for piece in (keywords or "").split(","))
	    return base + [piece for piece in stripped if piece]


	def ctgov_search_one(term: str, statuses: List[str], page_size: int = 100, max_pages: int = 5) -> List[Dict[str, Any]]:
	    """Fetch up to ``max_pages`` pages of studies matching ``term``.

	    Args:
	        term: Free-text query, sent as ``query.term``.
	        statuses: Overall-status values, sent comma-joined as
	            ``filter.overallStatus``.  When empty or ``None`` the filter is
	            left out of the request instead of being sent as an empty string.
	        page_size: Value sent as ``pageSize``.
	        max_pages: Upper bound on the number of requests; ``0`` or ``None``
	            performs no request at all.

	    Returns:
	        The ``studies`` arrays of all fetched pages, concatenated in order.

	    Raises:
	        requests.HTTPError: If a response carries an error status.  Other
	            ``requests`` exceptions (connection errors, timeouts) and JSON
	            decoding errors propagate unchanged.
	    """
	    all_studies: List[Dict[str, Any]] = []
	    page_token = None
	    count = 0
	    max_iters = max_pages or 0
	    # The context manager closes the session (and its pooled connections)
	    # on every exit path, including when a request raises.
	    with requests.Session() as session:
	        session.headers.update(UA)
	        while count < max_iters:
	            params = {"query.term": term}
	            if statuses:
	                params["filter.overallStatus"] = ",".join(statuses)
	            params["pageSize"] = page_size
	            if page_token:
	                params["pageToken"] = page_token
	            r = session.get(API_BASE, params=params, timeout=30)
	            r.raise_for_status()
	            data = r.json()
	            studies = data.get("studies", [])
	            if not studies:
	                break
	            all_studies.extend(studies)
	            # No token means this was the last page.
	            page_token = data.get("nextPageToken")
	            if not page_token:
	                break
	            count += 1
	    return all_studies


	def fetch_all_terms(terms: List[str], statuses: List[str], page_size=100, max_pages=5) -> List[Dict[str, Any]]:
	    """Search every term and return the combined studies without duplicates.

	    Args:
	        terms: Query terms; each distinct, non-blank term is searched once
	            via ``ctgov_search_one``.  Blank terms and exact repeats are
	            skipped so they do not cost extra requests.
	        statuses: Forwarded to ``ctgov_search_one``.
	        page_size: Forwarded to ``ctgov_search_one``.
	        max_pages: Forwarded to ``ctgov_search_one``.

	    Returns:
	        Studies in first-seen order.  Studies are de-duplicated by
	        ``protocolSection.identificationModule.nctId``; a study without an
	        NCT id is always kept.

	    A ``requests.HTTPError`` for one term is logged as a warning and that
	    term is skipped; any other exception propagates to the caller.
	    """
	    import logging

	    # Keys are NCT ids (str) or, for studies lacking one, id(study) (int).
	    dedup: Dict[Any, Dict[str, Any]] = {}
	    searched = set()
	    for raw_term in terms:
	        term = (raw_term or "").strip()
	        if not term or term in searched:
	            continue
	        searched.add(term)
	        try:
	            studies = ctgov_search_one(term, statuses, page_size=page_size, max_pages=max_pages)
	        except requests.HTTPError as exc:
	            # Best effort: one failing term must not discard the others.
	            logging.getLogger(__name__).warning("Search for %r failed: %s", term, exc)
	            continue
	        for study in studies:
	            ident = (study.get("protocolSection", {}) or {}).get("identificationModule", {}) or {}
	            key = ident.get("nctId") or id(study)
	            if key not in dedup:
	                dedup[key] = study
	    return list(dedup.values())


	def mentions(txt: str, term: str) -> bool:
	    """Return True if ``term`` occurs in ``txt`` as a whole word or phrase.

	    Matching is case-insensitive.  Words of a multi-word term may be
	    separated by any run of whitespace in ``txt`` (spaces, newlines).  A word
	    boundary is required only on a side of the term that starts or ends with
	    a word character, so terms wrapped in punctuation such as ``(GBM)`` can
	    still match.  An empty or blank ``term``, or empty/``None`` ``txt``,
	    never matches.
	    """
	    words = (term or "").split()
	    if not words or not txt:
	        return False
	    pattern = r"\s+".join(re.escape(word) for word in words)
	    # For word-character edges these lookarounds are equivalent to ``\b``;
	    # ``\b`` next to punctuation would wrongly demand an adjacent word char.
	    if re.match(r"\w", words[0][0]):
	        pattern = r"(?<!\w)" + pattern
	    if re.match(r"\w", words[-1][-1]):
	        pattern = pattern + r"(?!\w)"
	    return re.search(pattern, txt, re.I) is not None


	def as_text(obj: Any) -> str:
	    """Flatten a JSON-like value into plain text.

	    * ``None`` becomes ``""``.
	    * A dict yields the value of the first key present among ``textblock``,
	      ``textBlock`` and ``value`` (a falsy value becomes ``""``); without any
	      of those keys, all non-``None`` values are joined with a space.
	    * A list yields its items joined with ``"; "``.
	    * Anything else is passed through ``str``.

	    Nested dicts and lists are flattened recursively at every level, so no
	    Python ``repr`` of a container ends up in the result.
	    """
	    if obj is None:
	        return ""
	    if isinstance(obj, dict):
	        for key in ("textblock", "textBlock", "value"):
	            if key in obj:
	                return as_text(obj[key] or "")
	        return " ".join(as_text(value) for value in obj.values() if value is not None)
	    if isinstance(obj, list):
	        return "; ".join(as_text(item) for item in obj)
	    return str(obj)


	def parse_age_to_int(v: Any):
	    """Convert an age value to a whole number of years.

	    Accepted inputs:
	      * ``None`` -> ``None``.
	      * dict -> its ``"value"`` entry, parsed recursively.
	      * int/float -> truncated to ``int``; NaN and infinity give ``None``.
	      * anything else -> ``str(v)`` is searched for the first run of digits.
	        If the word right after the number starts with ``month``, ``week``,
	        ``day``, ``hour`` or ``minute`` the amount is converted to years and
	        rounded down (``"6 Months"`` -> ``0``); otherwise the number is taken
	        to be years.

	    Returns:
	        The age in years as ``int``, or ``None`` when no number is found.
	    """
	    if v is None:
	        return None
	    if isinstance(v, dict):
	        return parse_age_to_int(v.get("value"))
	    if isinstance(v, (int, float)):
	        try:
	            return int(v)
	        except (ValueError, OverflowError):
	            # int() rejects NaN and +/-inf.
	            return None
	    m = re.search(r"(\d+)\s*([A-Za-z]*)", str(v))
	    if not m:
	        return None
	    amount = int(m.group(1))
	    unit = m.group(2).lower()
	    # Callers compare the result against an age in years, so sub-year units
	    # must not be read as years.
	    units_per_year = (
	        ("month", 12),
	        ("week", 52),
	        ("day", 365),
	        ("hour", 8760),
	        ("minute", 525600),
	    )
	    for prefix, per_year in units_per_year:
	        if unit.startswith(prefix):
	            return amount // per_year
	    return amount


	def ensure_list(v: Any):
	    """Coerce ``v`` to a list.

	    A list is returned unchanged (same object), ``None`` becomes an empty
	    list, and any other value is wrapped in a one-element list.
	    """
	    if isinstance(v, list):
	        return v
	    return [] if v is None else [v]


	def score_trial(t: Dict[str, Any], intake: Dict[str, Any]) -> Tuple[int, List[str]]:
	    """Heuristically score how well trial ``t`` fits the patient ``intake``.

	    Args:
	        t: A study dict; only its ``protocolSection`` is read.  Missing or
	            ``None`` modules are treated as empty.
	        intake: Patient data with optional keys ``age``, ``kps``,
	            ``prior_bev``, ``setting``, ``keywords`` (comma-separated) and
	            ``diagnosis``.  ``age`` and ``kps`` may be numbers or numeric
	            strings; values that cannot be read as a number count as unknown.

	    Returns:
	        ``(score, reasons)`` where ``score`` is clamped to 0..100 and
	        ``reasons`` lists human-readable explanations in evaluation order.

	    Scoring: +30 diagnosis match in conditions or brief title; +8 phase 2;
	    +12 phase 3; -30 for each violated age bound; -15 if the criteria mention
	    "ECOG 0-1" and KPS is unknown or below 80; -10 if they mention
	    "Karnofsky" and KPS is unknown or below 70; -25 if the patient had prior
	    bevacizumab and the criteria mention "no prior bevacizumab"; +8 when the
	    treatment setting matches; +3 per keyword found in title or criteria.
	    """
	    intake = intake or {}
	    # Coerce once so numeric strings compare correctly and bad input cannot
	    # raise in the comparisons below.
	    age_local = _score_number(intake.get("age"))
	    kps_local = _score_number(intake.get("kps"))
	    prior_bev_local = bool(intake.get("prior_bev", False))
	    setting_local = intake.get("setting") or ""
	    keywords_local = intake.get("keywords") or ""
	    diagnosis_local = intake.get("diagnosis") or ""

	    if diagnosis_local in DEFAULT_DIAG_TERMS:
	        diag_terms = DEFAULT_DIAG_TERMS[diagnosis_local]
	    elif diagnosis_local and diagnosis_local != "Other":
	        diag_terms = [diagnosis_local]
	    else:
	        diag_terms = ["brain tumor", "CNS tumor", "spinal cord tumor"]

	    ps = (t or {}).get("protocolSection") or {}
	    elig = ps.get("eligibilityModule")
	    crit = ""
	    min_age = None
	    max_age = None
	    if isinstance(elig, dict):
	        # Fall back to the whole module when no criteria field is present.
	        crit_raw = elig.get("eligibilityCriteria") or elig.get("criteria") or elig
	        crit = as_text(crit_raw)
	        min_age = parse_age_to_int(elig.get("minimumAge"))
	        max_age = parse_age_to_int(elig.get("maximumAge"))
	    elif isinstance(elig, str):
	        crit = as_text(elig)

	    # ``or {}`` also covers modules that are present but explicitly null.
	    design = ps.get("designModule") or {}
	    conditions_module = ps.get("conditionsModule") or {}
	    identification = ps.get("identificationModule") or {}
	    phases_up = [str(p).upper() for p in ensure_list(design.get("phases"))]
	    conds_list = ensure_list(conditions_module.get("conditions"))
	    title = identification.get("briefTitle") or ""

	    s = 0
	    reasons: List[str] = []

	    in_conditions = any(mentions(c, term) for c in conds_list for term in diag_terms)
	    if in_conditions or any(mentions(title, term) for term in diag_terms):
	        s += 30
	        reasons.append(f"Matches diagnosis: {diagnosis_local or 'neuro-oncology'}.")

	    if any("PHASE 2" in p or "PHASE2" in p for p in phases_up):
	        s += 8
	    if any("PHASE 3" in p or "PHASE3" in p for p in phases_up):
	        s += 12

	    if age_local is not None:
	        if min_age is not None and age_local < min_age:
	            reasons.append(f"Age below minimum ({min_age}).")
	            s -= 30
	        if max_age is not None and age_local > max_age:
	            reasons.append(f"Age above maximum ({max_age}).")
	            s -= 30

	    # An unknown KPS is penalised the same way as one that is too low.
	    if mentions(crit, "ECOG 0-1") and (kps_local is None or kps_local < 80):
	        s -= 15
	        reasons.append("Requires ECOG 0–1 (KPS ~≥80).")
	    if mentions(crit, "Karnofsky") and (kps_local is None or kps_local < 70):
	        s -= 10
	        reasons.append("Requires KPS ≥70.")

	    if prior_bev_local and mentions(crit, "no prior bevacizumab"):
	        s -= 25
	        reasons.append("Excludes prior bevacizumab.")

	    if setting_local == "Recurrent" and mentions(crit, "recurrent"):
	        s += 8
	    if setting_local == "Newly diagnosed" and (mentions(crit, "newly diagnosed") or mentions(title, "adjuvant")):
	        s += 8

	    for kw in (k.strip() for k in keywords_local.split(",")):
	        if kw and (mentions(title, kw) or mentions(crit, kw)):
	            s += 3

	    return max(0, min(100, s)), reasons


	def _score_number(value: Any):
	    """Return ``value`` as a float, or ``None`` if it is missing or not numeric."""
	    if value is None:
	        return None
	    try:
	        return float(value)
	    except (TypeError, ValueError):
	        return None
	# python
	def extract_row(study: dict) -> dict:
	    """Return a flat row dict for the table/PDF. Safe against missing fields.

	    Args:
	        study: A study dict; only its ``protocolSection`` is read.  ``None``,
	            missing modules and modules set to ``None`` all yield empty
	            strings for the affected columns.

	    Returns:
	        A dict with the string-valued keys ``title`` (official title,
	        falling back to the brief title), ``nct``, ``status`` (e.g.
	        ``NOT_YET_RECRUITING`` -> ``Not Yet Recruiting``), ``phases`` and
	        ``conditions`` (comma-joined), ``sponsor`` (lead sponsor name) and
	        ``city_country`` (taken from the first location entry that is a
	        dict).
	    """
	    ps = (study or {}).get("protocolSection") or {}
	    idm = ps.get("identificationModule") or {}
	    scm = ps.get("statusModule") or {}
	    dsm = ps.get("designModule") or {}
	    cdnm = ps.get("conditionsModule") or {}
	    slm = ps.get("sponsorCollaboratorsModule") or {}
	    clm = ps.get("contactsLocationsModule") or {}

	    title = (idm.get("officialTitle") or idm.get("briefTitle") or "").strip()
	    nct = (idm.get("nctId") or "").strip()

	    status_raw = (scm.get("overallStatus") or "").strip()
	    # e.g., RECRUITING -> Recruiting
	    status = status_raw.replace("_", " ").title() if status_raw else ""

	    # str() keeps join() from raising on a non-string item.
	    phases = ", ".join(str(p) for p in ensure_list(dsm.get("phases")) if p is not None)
	    conditions = ", ".join(str(c) for c in ensure_list(cdnm.get("conditions")) if c is not None)

	    sponsor = ""
	    lead = slm.get("leadSponsor") or {}
	    if isinstance(lead, dict):
	        sponsor = (lead.get("name") or "").strip()

	    city_country = ""
	    locations = ensure_list(clm.get("locations"))
	    first = next((loc for loc in locations if isinstance(loc, dict)), None)
	    if first is not None:
	        # API v2 location objects name these fields ``city`` / ``country``;
	        # the ``location*`` spellings are kept as a fallback.
	        city = (first.get("city") or first.get("locationCity") or "").strip()
	        country = (first.get("country") or first.get("locationCountry") or "").strip()
	        city_country = ", ".join(part for part in (city, country) if part)

	    return {
	        "title": title,
	        "nct": nct,
	        "status": status,
	        "phases": phases,
	        "conditions": conditions,
	        "sponsor": sponsor,
	        "city_country": city_country,
	    }