Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Sleeping

App Files Files Community

Rajan Sharma committed on Sep 19

Commit

979b614

verified ·

1 Parent(s): ee530c2

Update app.py

Browse files

Files changed (1) hide show

app.py +193 -93

app.py CHANGED Viewed

@@ -5,19 +5,13 @@ from typing import List, Dict, Any, Tuple
 import gradio as gr
 import torch
 import regex as re2  # robust control-char sanitizer
 from settings import SNAPSHOT_PATH, PERSIST_CONTENT
 from audit_log import log_event, hash_summary
 from privacy import redact_text
-# NEW: dynamic plan & profiling imports
-from plan_extractor import draft_plan_from_scenario
-from schema_profiler import profile_csv, build_dynamic_label_space, soft_bind_inputs_to_columns
-from analysis_runtime import ExecContext, op_summary_table, op_rank_top_n, op_delta_over_time, op_capacity_calc, op_cost_total
-from clarifier import missing_inputs_questions, render_phase1_markdown
-# ------------------------------------------------------
 # ---------- Writable caches (HF Spaces-safe) ----------
 HOME = pathlib.Path.home()
 HF_HOME = str(HOME / ".cache" / "huggingface")
@@ -224,6 +218,9 @@ def _load_snapshot(path=SNAPSHOT_PATH):
 init_retriever()
 _session_rag = SessionRAG()
 # ---------- Executive pre-compute (MDSi block) ----------
 def _mdsi_block():
     base_capacity = capacity_projection(18, 48, 6)
@@ -237,31 +234,164 @@ def _mdsi_block():
         "outcomes_summary": outcomes
     }, indent=2)
-# ---------- Core chat logic (auto scenario, dynamic Phase 1) ----------
-def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answers=False,
-                     session_profiles=None, session_frames=None):
     """
-    awaiting_answers:
-      - False: If scenario triggered -> Phase 1 (dynamic questions). Else normal chat.
-      - True:  If scenario triggered -> Phase 2 (structured analysis). Else normal chat.
-    session_profiles: list of CSV profiles built at upload-time
-    session_frames: dict of {filename: DataFrame}
     """
     try:
         log_event("user_message", None, {"sizes": {"chars": len(user_msg or "")}})
-        # Safety (input)
         safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
         if blocked_in:
             ans = refusal_reply(reason_in)
             return history + [(user_msg, ans)], awaiting_answers
-        # Identity short-circuit
         if is_identity_query(safe_in, history):
             ans = "I am ClarityOps, your strategic decision making AI partner."
             return history + [(user_msg, ans)], awaiting_answers
-        # Ingest uploads (text for RAG; CSVs already profiled by UI handler)
         artifacts = []
         if uploaded_files_paths:
             ing = extract_text_from_files(uploaded_files_paths)
@@ -271,15 +401,26 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
                 _session_rag.add_docs(chunks)
             if artifacts:
                 _session_rag.register_artifacts(artifacts)
-            log_event("uploads_added", None, {"chunks": len(chunks), "artifacts": len(artifacts)})
-        # Column helper
         if re.search(r"\b(columns?|headers?)\b", (safe_in or "").lower()):
             cols = _session_rag.get_latest_csv_columns()
             if cols:
                 return history + [(user_msg, "Here are the column names from your most recent CSV upload:\n\n- " + "\n- ".join(cols))], awaiting_answers
-        # Decide mode
         scenario_mode = is_scenario_triggered(safe_in, uploaded_files_paths)
         if not scenario_mode:
@@ -307,34 +448,19 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
             return history + [(user_msg, safe_out)], awaiting_answers
         # ---------- Scenario Mode ----------
-        # Build a dynamic column bag from uploaded profiles
-        column_bag: List[str] = []
-        for prof in (session_profiles or []):
-            for c in prof.get("columns", []):
-                nm = c.get("raw")
-                if nm:
-                    column_bag.append(str(nm))
-        column_bag = list(dict.fromkeys(column_bag))
         if not awaiting_answers:
-            # PHASE 1: draft plan, bind inputs to real columns, and ask only for missing bits
-            hf_tuple = None if USE_HOSTED_COHERE else load_local_model()
-            plan = draft_plan_from_scenario(safe_in, column_bag, cohere_client=None, hf_tuple=hf_tuple)
-            required_names = [r.get("input") or r.get("name") or "" for r in (plan.get("requires") or [])]
-            scenario_labels = build_dynamic_label_space(safe_in)
-            binding = soft_bind_inputs_to_columns(required_names, column_bag, scenario_labels)
-            questions = missing_inputs_questions(plan, binding)
-            phase1_md = render_phase1_markdown(questions)
-            phase1_md = _sanitize_text(phase1_md)
             log_event("assistant_reply", None, {
                 **hash_summary("prompt", safe_in if not PERSIST_CONTENT else ""),
-                **hash_summary("reply", phase1_md if not PERSIST_CONTENT else ""),
                 "mode": "scenario_phase1",
                 "awaiting_next_phase": True
             })
-            return history + [(user_msg, phase1_md)], True
-        # PHASE 2: keep your existing structured generation (now with better file summary)
         session_snips = "\n---\n".join(_session_rag.retrieve(
             "diabetes screening Indigenous Métis mobile program cost throughput outcomes logistics",
             k=6
@@ -349,15 +475,9 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
         user_lower = (safe_in or "").lower()
         mdsi_extra = _mdsi_block() if ("diabetes" in user_lower or "mdsi" in user_lower or "mobile screening" in user_lower) else ""
-        # Summarize actual DataFrames for provenance
-        prov_lines = []
-        for name, df in (session_frames or {}).items():
-            try:
-                cols = ", ".join(map(str, list(df.columns)[:12]))
-            except Exception:
-                cols = "<unavailable>"
-            prov_lines.append(f"- {name}: {cols}")
-        artifact_block = "Uploaded Data Files (summarized):\n" + ("\n".join(prov_lines) if prov_lines else "- <none>")
         scenario_block = safe_in if len((safe_in or "")) > 0 else ""
         system_preamble = build_system_preamble(
@@ -372,7 +492,8 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
             "\n\n[INSTRUCTION TO MODEL]\n"
             "Produce **Phase 2** only now: start with 'Structured Analysis' and follow the exact section order "
             "(Prioritization, Capacity, Cost, Clinical Benefits, ClarityOps Top 3 Recommendations). "
-            "Use uploaded files + the user's latest answers as authoritative. Show calculations, units, and a brief Provenance.\n"
         )
         augmented_user = SYSTEM_MASTER + "\n\n" + system_preamble + "\n\nUser scenario & answers:\n" + safe_in + directive
@@ -479,45 +600,23 @@ with gr.Blocks(theme=theme, css=custom_css, analytics_enabled=False) as demo:
     state_history = gr.State(value=[])
     state_uploaded = gr.State(value=[])
     state_awaiting = gr.State(value=False)
-    # NEW: store CSV profiles and dataframes
-    state_profiles = gr.State(value=[])
-    state_frames = gr.State(value={})
-    # ---- Uploads (now: store paths + build CSV profiles/frames)
-    def _ingest_uploads(files, current_paths, current_profiles, current_frames):
-        paths = list(current_paths or [])
-        profiles = list(current_profiles or [])
-        frames = dict(current_frames or {})
         for f in (files or []):
-            p = getattr(f, "name", None) or f
-            if not p:
-                continue
-            paths.append(p)
-            # Build CSV profile+df when applicable
-            try:
-                if str(p).lower().endswith(".csv"):
-                    prof = profile_csv(p)
-                    profiles.append({k:v for k,v in prof.items() if k != "df"})
-                    frames[prof["name"]] = prof["df"]
-            except Exception:
-                # Non-fatal; keep going
-                pass
-        return paths, profiles, frames
-    uploads.change(
-        fn=_ingest_uploads,
-        inputs=[uploads, state_uploaded, state_profiles, state_frames],
-        outputs=[state_uploaded, state_profiles, state_frames]
-    )
     # ---- Core send (used by both hero input and chat input)
-    def _on_send(user_msg, history, up_paths, awaiting, profiles, frames):
         try:
             if not user_msg or not user_msg.strip():
                 return history, "", history, awaiting
             new_history, new_awaiting = clarityops_reply(
-                user_msg.strip(), history or [], None, up_paths or [], awaiting_answers=awaiting,
-                session_profiles=profiles or [], session_frames=frames or {}
             )
             return new_history, "", new_history, new_awaiting
         except Exception as e:
@@ -528,8 +627,8 @@ with gr.Blocks(theme=theme, css=custom_css, analytics_enabled=False) as demo:
             return new_hist, "", new_hist, awaiting
     # ---- Hero -> App transition + first send
-    def _hero_start(user_msg, history, up_paths, awaiting, profiles, frames):
-        chat_o, msg_o, hist_o, await_o = _on_send(user_msg, history, up_paths, awaiting, profiles, frames)
         return (
             chat_o, msg_o, hist_o, await_o,
             gr.update(visible=False),
@@ -539,28 +638,28 @@ with gr.Blocks(theme=theme, css=custom_css, analytics_enabled=False) as demo:
     hero_send.click(
         _hero_start,
-        inputs=[hero_msg, state_history, state_uploaded, state_awaiting, state_profiles, state_frames],
         outputs=[chat, msg, state_history, state_awaiting, hero_wrap, app_wrap, hero_msg],
         concurrency_limit=2, queue=True
     )
     hero_msg.submit(
         _hero_start,
-        inputs=[hero_msg, state_history, state_uploaded, state_awaiting, state_profiles, state_frames],
         outputs=[chat, msg, state_history, state_awaiting, hero_wrap, app_wrap, hero_msg],
         concurrency_limit=2, queue=True
     )
     # ---- Normal chat interactions after hero is gone
-    send.click(_on_send,
-               inputs=[msg, state_history, state_uploaded, state_awaiting, state_profiles, state_frames],
                outputs=[chat, msg, state_history, state_awaiting],
                concurrency_limit=2, queue=True)
-    msg.submit(_on_send,
-               inputs=[msg, state_history, state_uploaded, state_awaiting, state_profiles, state_frames],
                outputs=[chat, msg, state_history, state_awaiting],
                concurrency_limit=2, queue=True)
     def _on_clear():
         return (
             [], "", [], False,
             gr.update(visible=True),
@@ -574,3 +673,4 @@ if __name__ == "__main__":
     port = int(os.environ.get("PORT", "7860"))
     demo.launch(server_name="0.0.0.0", server_port=port, show_api=False, max_threads=8)

 import gradio as gr
 import torch
+import pandas as pd               # <-- NEW: for real CSV analytics
 import regex as re2  # robust control-char sanitizer
 from settings import SNAPSHOT_PATH, PERSIST_CONTENT
 from audit_log import log_event, hash_summary
 from privacy import redact_text
 # ---------- Writable caches (HF Spaces-safe) ----------
 HOME = pathlib.Path.home()
 HF_HOME = str(HOME / ".cache" / "huggingface")
 init_retriever()
 _session_rag = SessionRAG()
+# In-memory stash of uploaded DataFrames (name -> pd.DataFrame)
+_SESSION_FRAMES: Dict[str, pd.DataFrame] = {}   # <-- NEW
 # ---------- Executive pre-compute (MDSi block) ----------
 def _mdsi_block():
     base_capacity = capacity_projection(18, 48, 6)
         "outcomes_summary": outcomes
     }, indent=2)
+# ---------- DataFrame -> JSON summary (generic, schema-free) ----------
+def _summarize_frames_for_prompt(frames: Dict[str, pd.DataFrame], max_cols: int = 12, max_groups: int = 10) -> str:
     """
+    Build a JSON block with concrete, generic stats from uploaded DataFrames.
+    Works for arbitrary CSVs (no static schema).
     """
+    def safe_num_cols(df: pd.DataFrame):
+        return [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
+    def likely_group_cols(df: pd.DataFrame):
+        cand = [c for c in df.columns if any(k in str(c).lower()
+                for k in ["settlement", "community", "facility", "site", "region", "zone", "program", "service", "specialty", "hospital"])]
+        return cand[:2]
+    out = {"files": []}
+    for name, df in (frames or {}).items():
+        try:
+            rec = {"name": name, "shape": [int(df.shape[0]), int(df.shape[1])], "columns": list(map(str, df.columns[:max_cols]))}
+            num_cols = safe_num_cols(df)
+            if num_cols:
+                # count, mean, std, min, 25%, 50%, 75%, max for each numeric column
+                desc = df[num_cols].describe().to_dict()
+                # convert numpy types to natives for JSON
+                for k, v in desc.items():
+                    for m, val in v.items():
+                        try:
+                            v[m] = float(val)
+                        except Exception:
+                            try:
+                                v[m] = int(val)
+                            except Exception:
+                                pass
+                rec["numeric_summary"] = desc
+            groups = []
+            for gcol in likely_group_cols(df):
+                try:
+                    gb = df.groupby(gcol).size().sort_values(ascending=False).head(max_groups)
+                    # ensure JSON-serializable
+                    groups.append({"by": str(gcol), "size_top": {str(k): int(v) for k, v in gb.to_dict().items()}})
+                except Exception:
+                    pass
+            if groups:
+                rec["groups"] = groups
+            out["files"].append(rec)
+        except Exception:
+            continue
+    return json.dumps(out, indent=2)
+# ---------- Dynamic Phase 1 question generator ----------
+def _extract_present_domains(artifacts: List[Dict[str, Any]]) -> Dict[str, bool]:
+    flags = dict(population=False, cost=False, clinical=False, capacity=False)
+    for a in artifacts or []:
+        name = (a.get("name") or "").lower()
+        cols = [c.lower() for c in (a.get("columns") or [])]
+        if any(k in name for k in ["population", "census", "membership"]) or any(
+            k in ",".join(cols) for k in ["population", "census", "residence", "settlement", "age"]
+        ):
+            flags["population"] = True
+        if any(k in name for k in ["cost", "finance", "budget"]) or any(
+            k in ",".join(cols) for k in ["cost", "startup", "ongoing", "per_client", "per-visit"]
+        ):
+            flags["cost"] = True
+        if any(k in name for k in ["a1c", "outcome", "bp", "chol"]) or any(
+            k in ",".join(cols) for k in ["a1c", "bmi", "bp", "chol", "outcome"]
+        ):
+            flags["clinical"] = True
+        if any(k in name for k in ["ops", "capacity", "throughput", "volume"]) or any(
+            k in ",".join(cols) for k in ["clients_per_day", "teams", "visits", "throughput"]
+        ):
+            flags["capacity"] = True
+    return flags
+def _domain_from_text(text: str) -> Dict[str, bool]:
+    t = (text or "").lower()
+    return {
+        "population": any(k in t for k in ["population", "census", "settlement", "membership"]),
+        "cost": any(k in t for k in ["cost", "budget", "startup", "per client", "per-client", "ongoing"]),
+        "clinical": any(k in t for k in ["a1c", "bmi", "blood pressure", "bp", "cholesterol", "outcome"]),
+        "capacity": any(k in t for k in ["capacity", "throughput", "clients per day", "teams", "screen", "volume"]),
+    }
+def _is_mdsi_diabetes(text: str) -> bool:
+    t = (text or "").lower()
+    return any(k in t for k in ["mdsi", "mobile diabetes", "diabetes", "metabolic", "a1c", "metis"])
def build_dynamic_clarifications(scenario_text: str, artifacts: List[Dict[str, Any]]) -> str:
    """
    Render the Phase 1 clarification questions as Markdown.

    A domain question is asked only when neither the uploaded artifacts nor the
    scenario text cover that domain; the closing "Recommendations" question is
    always asked. Each question has a generic wording and an MDSi/diabetes
    wording, chosen from the scenario text.

    Args:
        scenario_text: The user's scenario description.
        artifacts: Artifact mappings describing the uploaded files.

    Returns:
        Markdown starting with a "Clarification Questions" title, followed by a
        bold group heading and a bullet for each question.
    """
    file_flags = _extract_present_domains(artifacts)
    text_flags = _domain_from_text(scenario_text)
    mdsi = _is_mdsi_diabetes(scenario_text)

    # (domain key or None for "always ask", group heading, generic wording, MDSi wording)
    catalogue = [
        (
            "population",
            "Prioritization",
            "Which population/risk indicators should drive prioritization (size, prevalence, access, equity factors)?",
            "Confirm prioritization inputs: settlement membership living on-settlement (latest), obesity/metabolic syndrome prevalence, and any access-to-care constraints to weigh.",
        ),
        (
            "capacity",
            "Capacity",
            "What per-team throughput and operating schedule should be used for capacity calculations?",
            "What is the realistic per-team screening rate (clients/day) and operating schedule (days/week, weeks/3-month window)?",
        ),
        (
            "cost",
            "Cost",
            "Provide fixed setup costs and variable cost per client to model total program spend.",
            "Provide startup cost per client and ongoing cost per client/visit (or total program costs) to price scenarios like 1,200 screens.",
        ),
        (
            "clinical",
            "Clinical",
            "Which clinical indicators and expected effect sizes should be tracked for outcomes?",
            "What longitudinal deltas should we expect (e.g., ΔA1c, ΔBP, ΔBMI, lipids) from repeat screenings, and over what interval?",
        ),
        (
            None,
            "Recommendations",
            "Any operational constraints (scheduling, staffing, partnerships) we should incorporate into deployment modeling?",
            "Are there community constraints (events/seasonality/cultural protocols) that should shape routing and visit cadence?",
        ),
    ]

    selected: List[Tuple[str, str]] = []
    for domain, heading, generic_q, mdsi_q in catalogue:
        covered = domain is not None and (file_flags.get(domain) or text_flags.get(domain))
        if covered:
            continue
        selected.append((heading, mdsi_q if mdsi else generic_q))
    del selected[5:]  # never ask more than five questions

    lines = ["**Clarification Questions**"]
    # Pair every entry with the heading before it so a heading is printed only when it changes.
    preceding = [None] + [heading for heading, _ in selected]
    for (heading, question), previous in zip(selected, preceding):
        if heading != previous:
            lines.append("\n**" + heading + ":**")
        lines.append("- " + question)
    return "\n".join(lines)
+# ---------- Core chat logic (auto scenario, dynamic Phase 1) ----------
+def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answers=False):
     try:
         log_event("user_message", None, {"sizes": {"chars": len(user_msg or "")}})
         safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
         if blocked_in:
             ans = refusal_reply(reason_in)
             return history + [(user_msg, ans)], awaiting_answers
         if is_identity_query(safe_in, history):
             ans = "I am ClarityOps, your strategic decision making AI partner."
             return history + [(user_msg, ans)], awaiting_answers
+        # ---- Ingest uploads FIRST (files alone can trigger scenario mode)
         artifacts = []
         if uploaded_files_paths:
             ing = extract_text_from_files(uploaded_files_paths)
                 _session_rag.add_docs(chunks)
             if artifacts:
                 _session_rag.register_artifacts(artifacts)
+            # NEW: Read CSVs into DataFrames and stash in-memory for analytics
+            for a in (artifacts or []):
+                try:
+                    if a.get("kind") == "csv" and a.get("path") and a.get("name"):
+                        # read the whole CSV with automatic dtype inference; fallback to strings
+                        try:
+                            df = pd.read_csv(a["path"])
+                        except Exception:
+                            df = pd.read_csv(a["path"], dtype=str, low_memory=False)
+                        _SESSION_FRAMES[str(a["name"])] = df
+                except Exception:
+                    pass
+            log_event("uploads_added", None, {"chunks": len(chunks), "artifacts": len(artifacts), "dfs": len(_SESSION_FRAMES)})
+        # CSV columns helper (works in both modes)
         if re.search(r"\b(columns?|headers?)\b", (safe_in or "").lower()):
             cols = _session_rag.get_latest_csv_columns()
             if cols:
                 return history + [(user_msg, "Here are the column names from your most recent CSV upload:\n\n- " + "\n- ".join(cols))], awaiting_answers
         scenario_mode = is_scenario_triggered(safe_in, uploaded_files_paths)
         if not scenario_mode:
             return history + [(user_msg, safe_out)], awaiting_answers
         # ---------- Scenario Mode ----------
         if not awaiting_answers:
+            # PHASE 1: dynamic questions (no assumptions)
+            phase1 = build_dynamic_clarifications(scenario_text=safe_in, artifacts=artifacts or _session_rag.artifacts)
+            phase1 = _sanitize_text(phase1)
             log_event("assistant_reply", None, {
                 **hash_summary("prompt", safe_in if not PERSIST_CONTENT else ""),
+                **hash_summary("reply", phase1 if not PERSIST_CONTENT else ""),
                 "mode": "scenario_phase1",
                 "awaiting_next_phase": True
             })
+            return history + [(user_msg, phase1)], True
+        # PHASE 2: build rich system preamble + feed to LLM
         session_snips = "\n---\n".join(_session_rag.retrieve(
             "diabetes screening Indigenous Métis mobile program cost throughput outcomes logistics",
             k=6
         user_lower = (safe_in or "").lower()
         mdsi_extra = _mdsi_block() if ("diabetes" in user_lower or "mdsi" in user_lower or "mobile screening" in user_lower) else ""
+        # NEW: Real computed stats from CSVs for the model to use
+        computed_from_csvs = _summarize_frames_for_prompt(_SESSION_FRAMES)
+        artifact_block = "Computed Blocks From Uploaded Data (JSON):\n" + computed_from_csvs
         scenario_block = safe_in if len((safe_in or "")) > 0 else ""
         system_preamble = build_system_preamble(
             "\n\n[INSTRUCTION TO MODEL]\n"
             "Produce **Phase 2** only now: start with 'Structured Analysis' and follow the exact section order "
             "(Prioritization, Capacity, Cost, Clinical Benefits, ClarityOps Top 3 Recommendations). "
+            "Use the JSON computed blocks from the uploaded files + the user's latest answers as authoritative. "
+            "Show calculations, units, and a brief Provenance. If required data is still missing, output INSUFFICIENT_DATA.\n"
         )
         augmented_user = SYSTEM_MASTER + "\n\n" + system_preamble + "\n\nUser scenario & answers:\n" + safe_in + directive
     state_history = gr.State(value=[])
     state_uploaded = gr.State(value=[])
     state_awaiting = gr.State(value=False)
+    # ---- Uploads
+    def _store_uploads(files, current):
+        paths = []
         for f in (files or []):
+            paths.append(getattr(f, "name", None) or f)
+        return (current or []) + paths
+    uploads.change(fn=_store_uploads, inputs=[uploads, state_uploaded], outputs=state_uploaded)
     # ---- Core send (used by both hero input and chat input)
+    def _on_send(user_msg, history, up_paths, awaiting):
         try:
             if not user_msg or not user_msg.strip():
                 return history, "", history, awaiting
             new_history, new_awaiting = clarityops_reply(
+                user_msg.strip(), history or [], None, up_paths or [], awaiting_answers=awaiting
             )
             return new_history, "", new_history, new_awaiting
         except Exception as e:
             return new_hist, "", new_hist, awaiting
     # ---- Hero -> App transition + first send
+    def _hero_start(user_msg, history, up_paths, awaiting):
+        chat_o, msg_o, hist_o, await_o = _on_send(user_msg, history, up_paths, awaiting)
         return (
             chat_o, msg_o, hist_o, await_o,
             gr.update(visible=False),
     hero_send.click(
         _hero_start,
+        inputs=[hero_msg, state_history, state_uploaded, state_awaiting],
         outputs=[chat, msg, state_history, state_awaiting, hero_wrap, app_wrap, hero_msg],
         concurrency_limit=2, queue=True
     )
     hero_msg.submit(
         _hero_start,
+        inputs=[hero_msg, state_history, state_uploaded, state_awaiting],
         outputs=[chat, msg, state_history, state_awaiting, hero_wrap, app_wrap, hero_msg],
         concurrency_limit=2, queue=True
     )
     # ---- Normal chat interactions after hero is gone
+    send.click(_on_send, inputs=[msg, state_history, state_uploaded, state_awaiting],
                outputs=[chat, msg, state_history, state_awaiting],
                concurrency_limit=2, queue=True)
+    msg.submit(_on_send, inputs=[msg, state_history, state_uploaded, state_awaiting],
                outputs=[chat, msg, state_history, state_awaiting],
                concurrency_limit=2, queue=True)
     def _on_clear():
+        # also clear in-memory DataFrames
+        _SESSION_FRAMES.clear()
         return (
             [], "", [], False,
             gr.update(visible=True),
     port = int(os.environ.get("PORT", "7860"))
     demo.launch(server_name="0.0.0.0", server_port=port, show_api=False, max_threads=8)