Spaces:

rcai
/

doctr_test

No application file

App Files Files Community

rcai committed on Sep 9, 2025

Commit

710de3c

verified ·

1 Parent(s): 3fa4af9

Update test.py

Browse files

Files changed (1) hide show

test.py +116 -0

test.py CHANGED Viewed

	@@ -392,3 +392,119 @@ Step 11. **HARD CONSTRAINT – Secondary cancer aggregation (windowed by the sel
392
393	"secondary_cancer_types_within_30d_of_progression": []
394

 "secondary_cancer_types_within_30d_of_progression": []
+                                                  from datetime import datetime, timedelta
+from typing import List, Dict, Any, Optional, Tuple
+import re
+# --- helpers ---------------------------------------------------------------
def parse_date(s: Optional[str]) -> Optional[datetime]:
    """Parse a date string written in one of several common layouts.

    Layouts are tried in this order, and the first that fits wins:
    ``YYYY-MM-DD``, ``YYYY/MM/DD``, ``M/D/YYYY``, ``D-Mon-YYYY``,
    ``D-Month-YYYY`` (each must match the whole stripped string), then a
    trailing ``M/D/YY`` token. Two-digit years below 50 map to 20YY, all
    others to 19YY.

    Args:
        s: Raw date text. Leading/trailing whitespace is ignored.

    Returns:
        A naive ``datetime`` (time part 00:00:00), or ``None`` when ``s`` is
        empty, is not a string, or fits none of the supported layouts.
    """
    if not s or not isinstance(s, str):
        return None
    s = s.strip()
    for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y", "%d-%b-%Y", "%d-%B-%Y"):
        try:
            return datetime.strptime(s, fmt)
        except ValueError:
            # Wrong layout or impossible calendar date; try the next layout.
            continue
    # M/D/YY fallback. The lookbehind stops the pattern from starting in the
    # middle of a longer digit/slash run; without it an invalid long date such
    # as "2012/02/30" was misread as "12/02/30" and returned 2030-12-02.
    m = re.search(r"(?<![\d/])(\d{1,2})/(\d{1,2})/(\d{2})$", s)
    if m is None:
        return None
    mm, dd, yy = map(int, m.groups())
    yy += 2000 if yy < 50 else 1900
    try:
        return datetime(yy, mm, dd)
    except ValueError:
        # Month or day out of range (e.g. 13/45/24).
        return None
def iso(d: Optional[datetime]) -> str:
    """Format ``d`` as ``YYYY-MM-DD``; a missing (falsy) value gives ``""``."""
    if not d:
        return ""
    return d.strftime("%Y-%m-%d")
def anchor_date_for_event(ev: Dict[str, Any]) -> Optional[datetime]:
    """Choose the anchor date D for one event (Step-13 rule).

    Candidates are evaluated lazily in priority order; the first one that
    parses is returned:

      1. ``date_of_disease_progression_assessment``
      2. ``treatment_change.start_date_of_treatment``
      3. ``date_of_secondary_cancer_diagnosis``

    Returns ``None`` when none of the three yields a parseable date.
    """
    # A missing or empty ``treatment_change`` is treated as an empty mapping.
    return (
        parse_date(ev.get("date_of_disease_progression_assessment"))
        or parse_date((ev.get("treatment_change") or {}).get("start_date_of_treatment"))
        or parse_date(ev.get("date_of_secondary_cancer_diagnosis"))
    )
def collect_secondary_pool(observations: List[Dict[str, Any]]) -> List[Tuple[str, datetime]]:
    """Gather every usable (secondary_cancer_type, diagnosis date) pair.

    An observation contributes a pair only when its ``secondary_cancer_type``
    is non-blank after stripping and its ``date_of_secondary_cancer_diagnosis``
    parses. Pairs keep the order of ``observations``; duplicates are kept.
    """
    pairs: List[Tuple[str, datetime]] = []
    for record in observations:
        cancer_type = (record.get("secondary_cancer_type") or "").strip()
        diagnosed_on = parse_date(record.get("date_of_secondary_cancer_diagnosis"))
        if not cancer_type or not diagnosed_on:
            continue
        pairs.append((cancer_type, diagnosed_on))
    return pairs
+# --- main aggregation ------------------------------------------------------
def aggregate_secondary_types_within_30d(
    observations: List[Dict[str, Any]],
    id_field: str = "segment_id",
) -> List[Dict[str, Any]]:
    """Attach to each event the secondary tumor types diagnosed near its anchor date.

    For every event in ``observations`` the anchor date D is taken from
    ``anchor_date_for_event``. The distinct secondary cancer types (pooled from
    ALL observations) whose diagnosis date lies within D ± 30 days, both ends
    inclusive, are attached to a shallow copy of the event.

    Args:
        observations: Event dicts; the input list and its dicts are not mutated.
        id_field: Currently unused; kept so existing callers keep working.

    Returns:
        A new list of shallow-copied events, in input order, each with:
          - ``secondary_cancer_types_within_30d_of_progression``: list of type
            names sorted case-insensitively (empty when no anchor date exists)
          - ``secondary_cancer_types_within_30d_of_progression_csv``: the same
            names joined with ``", "``
          - ``date_of_disease_progression_assessment`` rewritten as YYYY-MM-DD
            when it parses; an unparseable value is left as it was.
    """
    progression_key = "date_of_disease_progression_assessment"
    types_key = "secondary_cancer_types_within_30d_of_progression"
    window = timedelta(days=30)

    pool = collect_secondary_pool(observations)
    out: List[Dict[str, Any]] = []
    for ev in observations:
        ev2 = dict(ev)  # shallow copy so the caller's dict is left alone
        D = anchor_date_for_event(ev2)

        # Normalize only when parsing succeeds. Overwriting an unparseable
        # value with "" would silently destroy the raw source data.
        progression = parse_date(ev2.get(progression_key))
        if progression is not None:
            ev2[progression_key] = iso(progression)

        if D is None:
            hits: List[str] = []
        else:
            lo, hi = D - window, D + window
            # The exact string is a tie-breaker: names differing only in case
            # would otherwise come out in set-iteration order, which can change
            # between interpreter runs.
            hits = sorted(
                {t for t, dt in pool if lo <= dt <= hi},
                key=lambda s: (s.lower(), s),
            )
        ev2[types_key] = hits
        ev2[types_key + "_csv"] = ", ".join(hits)
        out.append(ev2)
    return out
+# --- optional convenience: aggregate for a single assessment date 'y' -----
def aggregate_for_assessment_date_y(
    observations: List[Dict[str, Any]],
    y: str,  # e.g., "2024-04-13" or "4/13/2024"
) -> List[str]:
    """Return the secondary tumor types diagnosed within ±30 days of date ``y``.

    Args:
        observations: Event dicts from which (type, diagnosis date) pairs are
            pooled via ``collect_secondary_pool``.
        y: Disease-progression assessment date in any layout ``parse_date``
            accepts.

    Returns:
        Distinct type names whose diagnosis date lies in [y - 30d, y + 30d],
        sorted case-insensitively; an empty list when ``y`` cannot be parsed.
    """
    D = parse_date(y)
    if not D:
        return []
    window = timedelta(days=30)
    lo, hi = D - window, D + window
    in_window = {t for t, dt in collect_secondary_pool(observations) if lo <= dt <= hi}
    # The exact string is a tie-breaker so names that differ only in case come
    # out in a reproducible order rather than set-iteration order.
    return sorted(in_window, key=lambda s: (s.lower(), s))
# Demo call: list the secondary tumor types found around one assessment date.
# NOTE(review): `observations` is not defined in the visible part of this
# file — confirm it is set before this line runs.
_demo_types = aggregate_for_assessment_date_y(observations, "2024-04-13")
print("For assessment date 2024-04-13:", _demo_types)