Suraj Prasai commited on
Commit
458c8e2
·
0 Parent(s):

added initial

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
__init__.py ADDED
File without changes
__pycache__/app.cpython-311.pyc ADDED
Binary file (23.3 kB). View file
 
__pycache__/test_app.cpython-311.pyc ADDED
Binary file (11.8 kB). View file
 
agent/ChatbotHandler.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import pandas as pd
5
+ from langchain_core.messages import HumanMessage
6
+ from langchain_google_genai import ChatGoogleGenerativeAI
7
+
8
+ from agent.agent_graph import build_app
9
+ from pipeline.utils_cool import df_to_payload, parse_user_choice
10
+
11
+ from .runtime_ctx import get_df_summary
12
+ from dotenv import load_dotenv
13
+ load_dotenv()
14
+
15
class ChatbotHandler:
    """Glue between the Gradio chat UI and the LangGraph data-quality agent.

    Owns the compiled graph app plus a dict mirror of the agent state,
    prepares the first ("boot") reply when a dataset is uploaded, and
    translates each chat message into exactly one graph invocation.
    """

    # State keys copied back from the graph output after every invocation.
    _STATE_KEYS = (
        "df_payload", "results", "steps_taken", "confirmed_step",
        "confirmed_params", "last_task", "plan", "messages",
    )

    def __init__(self):
        # Mirror of the agent's working state, persisted between turns.
        self.ctx: Dict[str, Any] = {
            "graph_app": None,  # compiled LangGraph app
            "state": {
                "df_payload": None,
                "results": [],
                "steps_taken": 0,
                "confirmed_step": None,
                "confirmed_params": {},
                "last_task": None,
                "plan": None,
                "messages": [],
                "max_steps": 8,
            },
        }
        # The chat component keeps its own history; we only return reply
        # strings to it. The boot reply built on upload is parked here until
        # the first turn flushes it into an empty chat.
        self._boot_text: Optional[str] = None

        # LLM driving the graph.
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash-lite",
            temperature=0,
            api_key=os.getenv("GOOGLE_API_KEY"),
        )

    def _format_summary(self, s: Dict[str, Any]) -> str:
        """Render the stored dataset-summary dict as a short markdown blurb."""
        columns = s.get("columns") or []
        dtype_map = s.get("dtypes") or {}
        shape = s.get("shape") or (None, None)
        label = s.get("label_guess") or "None"
        task = s.get("task_guess") or "Unknown"
        problems = s.get("issues") or []

        # Concise but helpful: at most 8 dtype pairs, ellipsis when truncated.
        shown = [f"{name}: {dt}" for name, dt in list(dtype_map.items())[:8]]
        if len(dtype_map) > 8:
            shown.append("…")

        col_part = ", ".join(map(str, columns[:10]))
        col_more = "…" if len(columns) > 10 else ""

        out = ["### Dataset summary"]
        out.append(f"- Shape: {shape[0]} rows × {shape[1]} columns")
        out.append(f"- Columns: {col_part}{col_more}")
        out.append(f"- Dtypes: {', '.join(shown)}")
        out.append(f"- Label guess: {label}")
        out.append(f"- Task guess: {task}")
        if problems:
            more = "…" if len(problems) > 3 else ""
            out.append(f"- Potential issues: {'; '.join(problems[:3])}{more}")
        return "\n".join(out)

    # ---------------- Boot on upload: run inspect + SOTA + plan ----------------
    def update_context(self, file_path: Optional[str], data_type: Optional[str], df: Optional["pd.DataFrame"]):
        """Boot the agent on a freshly uploaded dataframe; return the first reply."""
        if df is None:
            return ""

        # (Re)build the graph and seed a fresh state for the new dataset.
        self.ctx["graph_app"] = build_app(self.llm)
        state = self.ctx["state"]
        state.update({
            "df_payload": df_to_payload(df),
            "results": [],
            "steps_taken": 0,
            "confirmed_step": None,
            "confirmed_params": {},
            "last_task": None,
            "plan": None,
            "messages": [HumanMessage(content="A new dataset was uploaded. Start the workflow.")],
            "max_steps": 8,
        })

        final = self.ctx["graph_app"].invoke(state)
        for key in self._STATE_KEYS:
            state[key] = final.get(key, state.get(key))

        # Build the boot text from the stored summary (authoritative + consistent).
        summary = get_df_summary() or {}
        text = self._format_summary(summary)

        # Decide whether to ask the user for confirmation.
        task = (summary.get("task_guess") or "").lower()
        label = summary.get("label_guess")
        unknown_task = task not in {"classification", "regression", "unsupervised"}
        missing_label = (task in {"classification", "regression"}) and (not label)

        if unknown_task or missing_label:
            prompt = ("\n\nPlease confirm the task"
                      + (" and label column" if missing_label else "")
                      + ". For example: `task=classification label=noisy_letter_grade`.")
        else:
            prompt = (f"\n\nIf that looks right, say `confirm task={task}"
                      + (f" label={label}`" if label else "`")
                      + " and I’ll fetch SOTA and propose a plan.")

        self._boot_text = text + prompt
        return self._boot_text

    # ---------------- One chat turn → graph turn ----------------
    def respond(self, message: str, history: List):
        """Handle one chat turn: run the graph, append (user, reply) to history."""
        if history is None:
            history = []
        text = (message or "").strip()
        if not text:
            return history, ""

        # Flush the prepared upload reply into an empty chat before the
        # user's first message is processed.
        if self._boot_text and len(history) == 0:
            history.append(("[system]", self._boot_text))
            self._boot_text = None

        # A booted graph is required beyond this point.
        if self.ctx.get("graph_app") is None:
            history.append((text, "Please upload a dataset first."))
            return history, ""

        state = self.ctx["state"]

        # Shortcut parsing for "run X a=b"-style commands before the graph runs.
        step, params = parse_user_choice(text)
        if step:
            state["confirmed_step"] = step
            state["confirmed_params"] = {**(state.get("confirmed_params") or {}), **params}

        # Assemble this turn's input state.
        turn = {
            "messages": (state.get("messages") or []) + [HumanMessage(content=text)],
            "df_payload": state.get("df_payload"),
            "results": state.get("results", []),
            "steps_taken": state.get("steps_taken", 0),
            "max_steps": max(8, state.get("steps_taken", 0) + 4),
            "confirmed_step": state.get("confirmed_step"),
            "confirmed_params": state.get("confirmed_params", {}),
            "last_task": state.get("last_task"),
            "plan": state.get("plan"),
        }

        final = self.ctx["graph_app"].invoke(turn)

        # Persist the graph's output back into the mirrored state.
        for key in self._STATE_KEYS:
            state[key] = final.get(key, turn.get(key, state.get(key)))

        reply = self._extract_ai_text(final.get("messages", [])) or "Done."
        history.append((text, reply))
        return history, ""

    # ---------------- helper: extract last AI string ----------------
    def _extract_ai_text(self, messages: List[Any]) -> str:
        """Walk the transcript backwards and return the newest assistant text."""

        def flatten(content: Any) -> str:
            # Content may be None, a plain string, or a list of parts.
            if content is None:
                return ""
            if isinstance(content, str):
                return content
            if isinstance(content, list):
                pieces: List[str] = []
                for part in content:
                    if isinstance(part, dict):
                        pieces.append(str(part.get("text") or part.get("content") or part.get("data") or ""))
                    else:
                        pieces.append(str(part))
                return " ".join(x for x in pieces if x)
            return str(content)

        for entry in reversed(messages or []):
            role = getattr(entry, "type", None) or getattr(entry, "role", None)
            if role in ("ai", "assistant", "aimessage"):
                return flatten(getattr(entry, "content", None))
            if isinstance(entry, dict):
                tag = (entry.get("role") or entry.get("type") or "").lower()
                if tag in ("assistant", "ai", "aimessage"):
                    return flatten(entry.get("content"))
        return ""
agent/__init__.py ADDED
File without changes
agent/__pycache__/ChatbotHandler.cpython-311.pyc ADDED
Binary file (10.4 kB). View file
 
agent/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (162 Bytes). View file
 
agent/__pycache__/agent_graph.cpython-311.pyc ADDED
Binary file (12.8 kB). View file
 
agent/__pycache__/runtime_ctx.cpython-311.pyc ADDED
Binary file (9.63 kB). View file
 
agent/__pycache__/tools.cpython-311.pyc ADDED
Binary file (16.7 kB). View file
 
agent/agent_graph.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # agent/agent_graph.py
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from typing import Any, Dict, List, Optional, TypedDict
6
+
7
+ from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
8
+ from langgraph.graph import END, START, StateGraph
9
+ from langgraph.prebuilt import ToolNode
10
+
11
+ from .runtime_ctx import set_df_payload, set_df_summary, set_sota_bundled
12
+ from .tools import (
13
+ tool_describe_step,
14
+ tool_inspect_dataset,
15
+ tool_list_steps,
16
+ tool_list_versions,
17
+ tool_propose_plan,
18
+ tool_reset_to_version,
19
+ tool_run_step,
20
+ tool_sota_preprocessing,
21
+ )
22
+
23
+
24
+ def _to_text(content: Any, limit: int = 4000) -> str:
25
+ """Coerce any message content to a string that Gemini will accept."""
26
+ if content is None:
27
+ return ""
28
+ if isinstance(content, str):
29
+ return content
30
+ try:
31
+ s = json.dumps(content, default=str, ensure_ascii=False)
32
+ except Exception:
33
+ s = str(content)
34
+ # lightly truncate huge tool dumps
35
+ return (s[:limit] + " …") if len(s) > limit else s
36
+
37
+ def _sanitize_messages(msgs: list[Any]) -> list[Any]:
38
+ """Keep only system/human/assistant messages and ensure content is str."""
39
+ clean = []
40
+ for m in msgs or []:
41
+ # Drop raw ToolMessage or unknown roles (Gemini doesn't accept them)
42
+ role = getattr(m, "type", None) or getattr(m, "role", None) or ""
43
+ if isinstance(m, ToolMessage) or role == "tool":
44
+ # Optionally compress tool outputs into a short assistant line instead:
45
+ txt = _to_text(getattr(m, "content", None))
46
+ if txt:
47
+ clean.append(AIMessage(content=f"[Tool result] {txt}"))
48
+ continue
49
+
50
+ c = _to_text(getattr(m, "content", None))
51
+ if isinstance(m, SystemMessage):
52
+ clean.append(SystemMessage(content=c))
53
+ elif isinstance(m, HumanMessage):
54
+ clean.append(HumanMessage(content=c))
55
+ elif isinstance(m, AIMessage):
56
+ clean.append(AIMessage(content=c))
57
+ else:
58
+ # Unknown BaseMessage; best-effort map by role string
59
+ r = str(role).lower()
60
+ if r == "system":
61
+ clean.append(SystemMessage(content=c))
62
+ elif r in ("human", "user"):
63
+ clean.append(HumanMessage(content=c))
64
+ elif r in ("assistant", "ai", "aimessage"):
65
+ clean.append(AIMessage(content=c))
66
+ # else: ignore silently
67
+ return clean
68
+
69
+ TOOLS = [tool_inspect_dataset, tool_sota_preprocessing, tool_list_steps, tool_describe_step, tool_propose_plan, tool_run_step, tool_list_versions, tool_reset_to_version]
70
+
71
+ SYSTEM_PRIMER = (
72
+ "You are a data-quality assistant.\n"
73
+ "\n"
74
+ "Workflow:\n"
75
+ "1) Call inspect_dataset() to summarize columns/dtypes and GUESS task/label.\n"
76
+ " • If you are NOT SURE about the task (or the label for supervised tasks), ASK the user to confirm and END THE TURN.\n"
77
+ " • Do NOT call sota_preprocessing until the user explicitly confirms the task (and label if supervised).\n"
78
+ " Acceptable confirmations include messages like: "
79
+ " 'task=classification label=HARDSHIP_INDEX', 'Task: regression', or 'Unsupervised'.\n"
80
+ "2) After the user confirms, call sota_preprocessing(task, modality, ...) and PRESENT a brief 'SOTA Evidence' section (3–6 bullets with titles and links from the tool).\n"
81
+ "3) Call list_steps() and map SOTA insights to the available tools. Produce a plan (no execution yet); cite up to 2 SOTA sources per step.\n"
82
+ "4) Ask: 'Which step should we execute first?' Do NOT call run_step until the user explicitly picks.\n"
83
+ "5) After the user picks, call describe_step(name) and list ONLY real parameters from the tool. Ask for missing/optional params and confirm them.\n"
84
+ "6) Execute with run_step(name, params_json). Version controls inside params_json when relevant:\n"
85
+ " • source: 'current' | 'prev' | 'base' | '@-1' | '@-2' | <int>\n"
86
+ " • dry_run: true|false (preview without mutating)\n"
87
+ " • new_version: true|false (create new snapshot vs replace current)\n"
88
+ " Avoid loops: if the same step+params just ran, ask to change parameters or source.\n"
89
+ "7) Summarize results; optionally call list_versions() and offer reset_to_version(spec). If helpful, research again before proposing next steps.\n"
90
+ "\n"
91
+ "Rules:\n"
92
+ "- Return exactly one tool call at a time.\n"
93
+ "- Never call sota_preprocessing before explicit task confirmation.\n"
94
+ "- Never call run_step without an explicit user choice.\n"
95
+ "- When users ask about parameters, use describe_step (or list_steps) and answer ONLY from tool output.\n"
96
+ "- Reject parameters that are not in the tool signature.\n"
97
+ )
98
+
99
+
100
class AgentState(TypedDict):
    """Shared LangGraph state threaded through the agent and tool nodes."""

    messages: List[Any]                   # running conversation transcript
    df_payload: Optional[Dict[str, Any]]  # current dataset snapshot
    results: List[Dict[str, Any]]         # stats from executed steps
    steps_taken: int                      # number of steps run so far
    max_steps: int                        # hard cap before the graph ends
    confirmed_step: Optional[str]         # step explicitly picked by the user
    confirmed_params: Dict[str, Any]      # params confirmed by the user
    last_task: Optional[str]              # task guess from inspection
    plan: Optional[Dict[str, Any]]        # latest proposed plan payload
110
+
111
def make_agent_node(llm):
    """Build the LLM node: sanitize history, invoke the tool-bound model,
    and ALWAYS append exactly one AIMessage to the transcript."""
    bound = llm.bind_tools(TOOLS)

    def _node(state: AgentState) -> AgentState:
        # A small system note keeps the model aware of the live dataset size.
        data = (state.get("df_payload") or {}).get("data", {})
        n_rows = len(data.get("data", []) or [])
        n_cols = len(data.get("columns", []) or [])
        shape_msg = SystemMessage(content=f"Current dataset shape: {n_rows} rows × {n_cols} columns.")

        prompt = [SystemMessage(content=SYSTEM_PRIMER)]
        prompt.extend(_sanitize_messages(state.get("messages", [])))
        prompt.append(shape_msg)

        reply = bound.invoke(prompt)
        # Guard: normalize odd provider returns into an AIMessage object.
        if not isinstance(reply, AIMessage):
            reply = AIMessage(content=_to_text(getattr(reply, "content", reply)))

        state["messages"] = state["messages"] + [reply]
        return state

    return _node
135
+
136
def tools_exec_node():
    """
    Build the tool-execution node.

    Injects df_payload into the runtime context BEFORE any tool runs,
    gates run_step behind an explicit user confirmation, appends only the
    freshly produced ToolMessages, and folds structured tool payloads
    (summary / SOTA / plan / step_result) back into the graph state.
    """
    executor = ToolNode(TOOLS)

    def _node(state: AgentState) -> AgentState:
        # Tools read the dataset from runtime context, so set it first.
        set_df_payload(state.get("df_payload"))

        # No dataset at all: be friendly and stop, replying in the same
        # message class as the most recent entry.
        if state.get("df_payload") is None:
            state["messages"].append(type(state["messages"][-1])(content="I don't have a dataset yet. Please upload one."))
            return state

        # Hard gate: block run_step unless the user confirmed that step.
        last = state["messages"][-1]
        for call in (getattr(last, "tool_calls", None) or []):
            if call.get("name") == "run_step":
                target = (call.get("args") or {}).get("name")
                if target and target != state.get("confirmed_step"):
                    state["messages"].append(type(last)(content="I have a plan ready. Which step should we run first?"))
                    return state

        # Execute the tool(s) requested by the last assistant message.
        out = executor.invoke({"messages": state["messages"]})

        # Append ONLY new ToolMessages; never overwrite the conversation.
        fresh = [m for m in out["messages"] if isinstance(m, ToolMessage)]
        if not fresh:
            # Fallback: if the provider returned the whole list, take the tail.
            if len(out["messages"]) > len(state["messages"]):
                fresh = out["messages"][len(state["messages"]):]
            else:
                fresh = out["messages"]

        state["messages"] = state["messages"] + fresh

        # Parse the most recent tool payload (a dict in .content, when structured).
        payload = fresh[-1].content if fresh else None
        if isinstance(payload, dict):
            kind = payload.get("type")
            if kind == "dataset_summary":
                set_df_summary(payload)
                state["last_task"] = payload.get("task_guess")
            elif kind == "sota":
                set_sota_bundled(payload.get("bundled_results") or [])
            elif kind == "plan":
                state["plan"] = payload
            elif kind == "step_result":
                # A mutating step ran: commit the new dataset version and
                # clear the confirmation so the next run needs a fresh pick.
                state["df_payload"] = payload["df"]
                set_df_payload(state["df_payload"])
                state["results"].append({"name": payload["name"], "stats": payload["stats"]})
                state["steps_taken"] += 1
                state["confirmed_step"] = None
                state["confirmed_params"] = {}

        return state

    return _node
198
+
199
def should_continue(state: AgentState) -> str:
    """Route after the agent node: 'continue' to tools while the newest
    message carries tool calls and the step budget is not exhausted."""
    newest = state["messages"][-1]
    if state.get("steps_taken", 0) >= state.get("max_steps", 8):
        return "end"
    # Continue only if the last assistant message requested tool calls.
    return "continue" if getattr(newest, "tool_calls", None) else "end"
205
+
206
def build_app(llm):
    """Compile the two-node graph: agent ⇄ tools, looping until the agent
    stops emitting tool calls (or the step budget runs out)."""
    graph = StateGraph(AgentState)
    graph.add_node("agent", make_agent_node(llm))
    graph.add_node("tools", tools_exec_node())

    graph.add_edge(START, "agent")
    graph.add_conditional_edges("agent", should_continue, {"continue": "tools", "end": END})
    graph.add_edge("tools", "agent")

    return graph.compile()
agent/runtime_ctx.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # agent/runtime_ctx.py
2
+ from __future__ import annotations
3
+
4
+ from contextvars import ContextVar
5
+ from typing import Any, Dict, List, Optional, Union
6
+
7
+ # ---------------- Versioned dataset store ----------------
8
+ # Each new mutating step can create a new "version".
9
+ # You can address versions as: "current", "base", "prev", "@-1", "@-2", "@3" (0-based).
10
+ _VERSIONS_CV: ContextVar[Optional[List[Dict[str, Any]]]] = ContextVar("VERSIONS", default=None)
11
+ _CUR_INDEX_CV: ContextVar[int] = ContextVar("CUR_INDEX", default=-1)
12
+ _VERS_META_CV: ContextVar[Optional[List[Dict[str, Any]]]] = ContextVar("VERS_META", default=None)
13
+
14
+ # Legacy singletons (fallback across tasks)
15
+ _STORE: Dict[str, Any] = {
16
+ "versions": [], # list of df_payloads
17
+ "version_meta": [], # parallel list of metadata dicts
18
+ "cur_index": -1,
19
+ # kept for backward compat with old getters:
20
+ "df_payload": None, # alias of current
21
+ "base_df_payload": None, # alias of versions[0]
22
+ "sota_bundled": None,
23
+ "df_summary": None,
24
+ }
25
+
26
+ _SOTA_BUNDLED_CV: ContextVar[Optional[list]] = ContextVar("SOTA_BUNDLED", default=None)
27
+ _DF_SUMMARY_CV: ContextVar[Optional[Dict[str, Any]]] = ContextVar("DF_SUMMARY", default=None)
28
+
29
+
30
+ # -------- internal helpers --------
31
+ def _get_versions() -> List[Dict[str, Any]]:
32
+ return _VERSIONS_CV.get() or _STORE["versions"]
33
+
34
+ def _get_meta() -> List[Dict[str, Any]]:
35
+ return _VERS_META_CV.get() or _STORE["version_meta"]
36
+
37
+ def _set_versions(vers: List[Dict[str, Any]], meta: List[Dict[str, Any]], cur: int) -> None:
38
+ _VERSIONS_CV.set(vers)
39
+ _VERS_META_CV.set(meta)
40
+ _CUR_INDEX_CV.set(cur)
41
+ _STORE["versions"] = vers
42
+ _STORE["version_meta"] = meta
43
+ _STORE["cur_index"] = cur
44
+ # keep legacy aliases in sync
45
+ _STORE["df_payload"] = vers[cur] if (0 <= cur < len(vers)) else None
46
+ _STORE["base_df_payload"] = vers[0] if vers else None
47
+
48
+
49
+ # =========================
50
+ # Init / Set / Annotate
51
+ # =========================
52
+ def init_dataset(p: Optional[Dict[str, Any]]) -> None:
53
+ """Initialize version stack with a single BASE version."""
54
+ vers = [] if p is None else [p]
55
+ meta = [] if p is None else [dict(tag="base")]
56
+ cur = -1 if p is None else 0
57
+ _set_versions(vers, meta, cur)
58
+
59
+ def set_df_payload(p: Optional[Dict[str, Any]], *, new_version: bool = True) -> None:
60
+ """
61
+ Set CURRENT dataset.
62
+ - new_version=True: truncate any forward history and append p (like a new commit).
63
+ - new_version=False: replace the current version in place (no new snapshot).
64
+ """
65
+ vers = list(_get_versions())
66
+ meta = list(_get_meta())
67
+ cur = _CUR_INDEX_CV.get() if _CUR_INDEX_CV.get() is not None else _STORE["cur_index"]
68
+
69
+ if cur < 0 or not vers:
70
+ # not initialized yet
71
+ init_dataset(p)
72
+ return
73
+
74
+ if new_version:
75
+ # drop any versions after current (no branching for simplicity)
76
+ vers = vers[:cur + 1]
77
+ meta = meta[:cur + 1]
78
+ vers.append(p)
79
+ meta.append({})
80
+ cur = len(vers) - 1
81
+ else:
82
+ vers[cur] = p
83
+
84
+ _set_versions(vers, meta, cur)
85
+
86
+ def annotate_current(**kv) -> None:
87
+ """Attach metadata to the current version (e.g., step/params/stats)."""
88
+ vers = list(_get_versions())
89
+ meta = list(_get_meta())
90
+ cur = _CUR_INDEX_CV.get() if _CUR_INDEX_CV.get() is not None else _STORE["cur_index"]
91
+ if 0 <= cur < len(meta):
92
+ meta[cur] = {**meta[cur], **kv}
93
+ _set_versions(vers, meta, cur)
94
+
95
+
96
+ # =========================
97
+ # Getters / Navigation
98
+ # =========================
99
+ def _resolve_index(spec: Union[str, int, None]) -> int:
100
+ vers = _get_versions()
101
+ cur = _CUR_INDEX_CV.get() if _CUR_INDEX_CV.get() is not None else _STORE["cur_index"]
102
+ if spec is None or spec == "current":
103
+ return cur
104
+ if spec == "base":
105
+ return 0 if vers else -1
106
+ if spec == "prev":
107
+ return max(-1, cur - 1)
108
+ if isinstance(spec, int):
109
+ idx = spec if spec >= 0 else len(vers) + spec
110
+ return idx
111
+ # strings like "@-1", "@3"
112
+ if isinstance(spec, str) and spec.startswith("@"):
113
+ try:
114
+ n = int(spec[1:])
115
+ except Exception:
116
+ return cur
117
+ idx = n if n >= 0 else len(vers) + n
118
+ return idx
119
+ return cur
120
+
121
+ def get_df_payload(version: Union[str, int, None] = None) -> Optional[Dict[str, Any]]:
122
+ """Return dataset payload for the requested version (default: current)."""
123
+ vers = _get_versions()
124
+ idx = _resolve_index(version)
125
+ if 0 <= idx < len(vers):
126
+ return vers[idx]
127
+ # legacy fallback
128
+ return _STORE["df_payload"]
129
+
130
+ def get_base_df_payload() -> Optional[Dict[str, Any]]:
131
+ return get_df_payload("base")
132
+
133
+ def get_prev_df_payload() -> Optional[Dict[str, Any]]:
134
+ return get_df_payload("prev")
135
+
136
+ def list_versions() -> Dict[str, Any]:
137
+ """Lightweight overview for debugging/UI."""
138
+ vers = _get_versions()
139
+ meta = _get_meta()
140
+ cur = _CUR_INDEX_CV.get() if _CUR_INDEX_CV.get() is not None else _STORE["cur_index"]
141
+ return {
142
+ "count": len(vers),
143
+ "current_index": cur,
144
+ "has_base": bool(vers),
145
+ "meta": meta, # [{tag:..., step:..., params:...}, ...]
146
+ }
147
+
148
+ def reset_current_to(version: Union[str, int]) -> None:
149
+ """Move the current pointer to a prior version (no deletion)."""
150
+ vers = _get_versions()
151
+ meta = _get_meta()
152
+ idx = _resolve_index(version)
153
+ if 0 <= idx < len(vers):
154
+ _set_versions(vers, meta, idx)
155
+
156
+ def reset_current_to_base() -> None:
157
+ reset_current_to("base")
158
+
159
+
160
+ # =========================
161
+ # SOTA / Summary passthrough
162
+ # =========================
163
+ def set_sota_bundled(b: Optional[list]) -> None:
164
+ _SOTA_BUNDLED_CV.set(b)
165
+ _STORE["sota_bundled"] = b
166
+
167
+ def get_sota_bundled() -> Optional[list]:
168
+ return _SOTA_BUNDLED_CV.get() or _STORE["sota_bundled"]
169
+
170
+ def set_df_summary(s: Optional[Dict[str, Any]]) -> None:
171
+ _DF_SUMMARY_CV.set(s)
172
+ _STORE["df_summary"] = s
173
+
174
+ def get_df_summary() -> Optional[Dict[str, Any]]:
175
+ return _DF_SUMMARY_CV.get() or _STORE["df_summary"]
agent/simple_chat.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List, Optional
3
+ from langchain_google_genai import ChatGoogleGenerativeAI
4
+ from langchain_core.messages import HumanMessage
5
+ from dotenv import load_dotenv
6
+
7
+ load_dotenv()
8
+
9
+
10
def simple_chat(
    question: str,
    context: str,
    image_paths: Optional[List[str]] = None
) -> str:
    """
    Simple chat function that answers questions based on context and optional images.

    Args:
        question: User's question
        context: Context information (e.g., dataset summary, analysis results)
        image_paths: Optional list of image file paths to include

    Returns:
        AI response as a string, or an "Error: ..." string on any failure
        (deliberate best-effort contract: this function never raises).
    """
    try:
        # Initialize LLM
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash-exp",
            temperature=0,
            api_key=os.getenv("GOOGLE_API_KEY"),
        )

        # Build the prompt
        prompt = f"""You are a helpful data analysis assistant.

Context:
{context}

User Question: {question}

Please provide a clear, concise answer based on the context provided."""

        # Handle images if provided
        if image_paths:
            # Hoisted out of the loop: one import and one MIME table for
            # all images instead of re-importing/rebuilding per image.
            import base64

            mime_by_ext = {
                '.png': 'image/png',
                '.jpg': 'image/jpeg',
                '.jpeg': 'image/jpeg',
                '.gif': 'image/gif',
                '.webp': 'image/webp'
            }

            content = [{"type": "text", "text": prompt}]
            for img_path in image_paths:
                if os.path.exists(img_path):
                    with open(img_path, "rb") as f:
                        img_data = base64.b64encode(f.read()).decode()

                    # Determine image type from the file extension
                    ext = os.path.splitext(img_path)[1].lower()
                    mime_type = mime_by_ext.get(ext, 'image/png')

                    content.append({
                        "type": "image_url",
                        "image_url": f"data:{mime_type};base64,{img_data}"
                    })

            message = HumanMessage(content=content)
        else:
            message = HumanMessage(content=prompt)

        # Get response
        response = llm.invoke([message])

        # Extract text from response
        if hasattr(response, 'content'):
            return str(response.content)
        return str(response)

    except Exception as e:
        # Deliberate best-effort: surface any failure as a string, never raise.
        return f"Error: {str(e)}"


# Example usage:
if __name__ == "__main__":
    # Simple text-only example
    context = """
    Dataset: Customer Sales Data
    - 1000 rows, 15 columns
    - Label: purchase_made (binary)
    - Task: Classification
    - Missing values: 5% in age column
    """

    question = "What's the main task for this dataset?"
    response = simple_chat(question, context)
    print(response)

    # With images
    question2 = "What do you see in the visualization?"
    response2 = simple_chat(question2, context, image_paths=["/path/to/plot.png"])
    print(response2)
agent/tools.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ from langchain.tools import tool
7
+ from langchain_community.tools.tavily_search import TavilySearchResults
8
+
9
+ from pipeline.deduplication import find_near_duplicates
10
+ from pipeline.featurizer import custom_featurizer
11
+ from pipeline.issues import find_issues
12
+ from pipeline.utils_cool import (
13
+ df_from_payload,
14
+ df_to_payload,
15
+ get_signature_dict,
16
+ guess_task_and_label,
17
+ )
18
+
19
+ from .runtime_ctx import (
20
+ get_df_payload, # now supports version spec (None|'current'|'prev'|'base'|'@-1'|int)
21
+ get_df_summary,
22
+ get_sota_bundled,
23
+ set_df_payload, # commit new dataset version (or replace)
24
+ set_df_summary,
25
+ set_sota_bundled,
26
+ )
27
+ from .runtime_ctx import (
28
+ list_versions as _list_versions_state,
29
+ )
30
+ from .runtime_ctx import (
31
+ reset_current_to as _reset_current_to,
32
+ )
33
+
34
# Registry of runnable steps (names used by the agent/UI).
# Maps each public step name to the function that implements it.
STEP_FUNCS = dict(
    dedup=find_near_duplicates,
    featurize=custom_featurizer,
    find_label_issues=find_issues,
)
40
+
41
+
42
@tool("inspect_dataset", return_direct=True)
def tool_inspect_dataset() -> Dict[str, Any]:
    """
    Summarize the CURRENT dataset (no arguments required).

    Behavior:
      • Reads the dataset from the runtime context (set by the graph).
      • Returns a compact summary of columns, dtypes, shape, and a guessed label/task.

    Returns:
      {
        "type": "dataset_summary",
        "columns": [...],
        "dtypes": {col: dtype, ...},
        "shape": (rows, cols),
        "label_guess": "<name or None>",
        "task_guess": "classification|regression|unsupervised",
        "issues": [ ... ]   # e.g., missing labels, single-class, etc.
      }
    """
    # Pull the current dataset version from the runtime context.
    payload = get_df_payload()  # default: current version
    if payload is None:
        raise RuntimeError("inspect_dataset: no dataset available in runtime context.")

    frame = df_from_payload(payload)
    summary = guess_task_and_label(frame)

    # Persist the summary so downstream tools see a fresh view.
    set_df_summary(summary)

    result: Dict[str, Any] = {"type": "dataset_summary"}
    result.update(summary)
    return result
70
+
71
+
72
@tool("sota_preprocessing", return_direct=True)
def tool_sota_preprocessing(
    task: Optional[str] = None,
    modality: Optional[str] = None,
    domain: Optional[str] = None,
    target: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Search state-of-the-art preprocessing best practices (modality-aware).

    Args:
      task: e.g., "classification", "regression", "segmentation", "NER", "ASR", "forecasting".
            If omitted, inferred from the dataset summary if available.
      modality: one of {"tabular","text","image","audio","video","time_series","graph","multimodal"}.
      domain: optional domain context (e.g., "clinical", "finance").
      target: optional target structure (e.g., "segmentation masks", "bounding boxes").

    Returns:
      {
        "type": "sota",
        "task": ...,
        "modality": ...,
        "domain": ...,
        "target": ...,
        "queries": [...],
        "bundled_results": [{ "query": q, "results": <tavily-results> }, ...],
        "results": <first tavily-results batch>
      }
    """
    # Fall back to the dataset summary's guessed task when none is given.
    df_summary = get_df_summary() or {}
    if not task:
        task = df_summary.get("task_guess") or "classification"

    yr = "2024 2025"  # bias search toward recent material
    m = (modality or "").lower().strip()

    # Modality-specific vocabulary appended to one of the queries below.
    modality_terms = {
        "tabular": ["imputation", "encoding", "scaling", "outliers", "leakage prevention"],
        "text": ["tokenization", "normalization", "subword", "BPE", "SentencePiece", "stopwords", "lemmatization", "augmentation"],
        "image": ["normalization", "resizing", "color space", "augmentation", "RandAugment", "MixUp", "CutMix"],
        "audio": ["resampling", "log-mel spectrogram", "MFCC", "pre-emphasis", "SpecAugment", "denoising"],
        "time_series": ["resampling", "windowing", "detrending", "imputation", "outlier detection", "scaling"],
        "video": ["frame sampling", "temporal augmentation", "clip normalization", "optical flow"],
        "graph": ["feature normalization", "self-loops", "adjacency normalization", "sparsification"],
        "multimodal": ["alignment", "synchronization", "fusion", "tokenization"],
    }
    m_terms = modality_terms.get(m, [])

    # Build candidate queries (general first, then increasingly specific).
    queries: List[str] = []
    queries.append(f"state of the art preprocessing {task} {yr}")
    queries.append(f"best practices data preprocessing {task} {yr}")
    if m:
        queries.append(f"{m} {task} preprocessing best practices {yr}")
    if domain:
        queries.append(f"{domain} {m or ''} {task} preprocessing best practices {yr}".strip())
    if target:
        queries.append(f"{m or ''} {task} {target} preprocessing pipeline {yr}".strip())
    if m_terms:
        queries.append(f"{m} {task} preprocessing {' '.join(m_terms)} {yr}")

    # Deduplicate while preserving order (explicit loop instead of the
    # `seen.add` side-effect-in-comprehension trick, which is easy to misread).
    seen: set = set()
    unique_queries: List[str] = []
    for q in (q.strip() for q in queries):
        if q and q not in seen:
            seen.add(q)
            unique_queries.append(q)
    queries = unique_queries

    # BUG FIX: TavilySearchResults takes `max_results`, not `k` — the original
    # `TavilySearchResults(k=6)` did not configure the result count.
    tavily = TavilySearchResults(max_results=6)
    bundled: List[Dict[str, Any]] = [{"query": q, "results": tavily.invoke({"query": q})} for q in queries]
    flat_first = bundled[0]["results"] if (bundled and "results" in bundled[0]) else []

    # Persist the raw search bundles so propose_plan can ground its plan.
    set_sota_bundled(bundled)

    return {
        "type": "sota",
        "task": task,
        "modality": m or "unknown",
        "domain": domain,
        "target": target,
        "queries": queries,
        "bundled_results": bundled,
        "results": flat_first,
    }
154
+
155
@tool("describe_step", return_direct=True)
def tool_describe_step(name: str) -> Dict[str, Any]:
    """
    Return the exact docstring + parameter schema for a single step by name.
    This prevents the model from inventing params.
    """
    # Validate against the registry before introspecting.
    if name not in STEP_FUNCS:
        raise ValueError(f"Unknown step '{name}'. Available: {list(STEP_FUNCS)}")

    step_fn = STEP_FUNCS[name]
    # get_signature_dict introspects defaults/annotations for the function.
    description = get_signature_dict(step_fn)

    payload: Dict[str, Any] = {"type": "step_description", "name": name}
    payload.update(description)
    return payload
166
+
167
@tool("list_steps", return_direct=True)
def tool_list_steps() -> Dict[str, Any]:
    """
    List available pipeline steps (name, docstring, and signature).

    Returns:
      {
        "type": "steps",
        "steps": [
          {
            "name": "dedup" | "featurize" | "find_label_issues",
            "doc": "<docstring>",
            "params": [{"name": "...", "default": ..., "annotation": "...", "kind": "..."}]
          }, ...
        ]
      }
    """
    # Build one descriptor per registered step, preserving registry order.
    step_descriptions: List[Dict[str, Any]] = []
    for step_name, step_fn in STEP_FUNCS.items():
        entry: Dict[str, Any] = {"name": step_name}
        entry.update(get_signature_dict(step_fn))
        step_descriptions.append(entry)

    return {"type": "steps", "steps": step_descriptions}
188
+
189
+
190
@tool("propose_plan", return_direct=True)
def tool_propose_plan(
    task: Optional[str] = None,
    modality: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Propose an ordered preprocessing plan grounded in SOTA + dataset summary.
    (Planning only — does not execute steps.)
    """
    df_summary = get_df_summary() or {}
    bundled = get_sota_bundled() or []

    if not task:
        task = df_summary.get("task_guess") or "classification"
    label_guess = df_summary.get("label_guess")

    # Keyword vocabulary used to score search results per step.
    KEYWORDS = {
        "dedup": {"duplicate", "near-duplicate", "near duplicate", "dupe", "dedup", "similarity", "knn", "kNN"},
        "featurize": {
            "impute", "imputation", "encoding", "one-hot", "scale", "scaling",
            "normalize", "normalization", "standardize", "tfidf", "tokenization",
            "lemmatization", "augmentation"
        },
        "find_label_issues": {"label noise", "noisy labels", "cleanlab", "confident learning", "label issues", "weak labels"},
    }

    def _score(text: str, keys: set[str]) -> int:
        # Count how many keywords occur in the (lowercased) text.
        t = (text or "").lower()
        # BUG FIX: the text is lowercased but the keys were not, so mixed-case
        # entries like "kNN" could never match. Lowercase the keys too; the set
        # comprehension also dedupes casing variants so a hit counts once.
        return sum(1 for k in {k.lower() for k in keys} if k in t)

    hits = {"dedup": 0, "featurize": 0, "find_label_issues": 0}
    evidence: Dict[str, List[Dict[str, str]]] = {"dedup": [], "featurize": [], "find_label_issues": []}

    # Score each search result against each step's keyword set, keeping up to
    # 5 evidence snippets per step for display.
    for pack in bundled:
        q = pack.get("query", "")
        for item in (pack.get("results") or []):
            title = item.get("title", "")
            content = item.get("content", "")
            url = item.get("url", "")
            for step, keys in KEYWORDS.items():
                s = _score(f"{q} {title} {content}", keys)
                if s > 0:
                    hits[step] += s
                    if len(evidence[step]) < 5:
                        evidence[step].append({"query": q, "title": title, "url": url})

    options: List[Dict[str, Any]] = []

    # Dedup first: either evidence-backed or a reasonable default for most modalities.
    if hits["dedup"] > 0 or modality in {None, "tabular", "text", "image", "time_series"}:
        options.append(
            {
                "reason": "SOTA emphasizes handling near-duplicates early" if hits["dedup"] else "Practical first step to prevent leakage/skew",
                "step": "dedup",
                "params": {"threshold": 0.95, "metric": "cosine"},
                "evidence": evidence["dedup"][:3],
            }
        )

    # Featurization is always proposed.
    options.append(
        {
            "reason": "SOTA emphasizes robust imputation/encoding/scaling" if hits["featurize"] else "Prepare features based on modality",
            "step": "featurize",
            "params": {"nan_strategy": "impute"},
            "evidence": evidence["featurize"][:3],
        }
    )

    # Label-issue detection only makes sense with a label (or strong evidence).
    if (task == "classification" and label_guess) or hits["find_label_issues"] > 0:
        options.append(
            {
                "reason": "SOTA recommends checking noisy labels" if hits["find_label_issues"] else "Check label quality before training",
                "step": "find_label_issues",
                "params": {"label": label_guess or "<CONFIRM>"},
                "evidence": evidence["find_label_issues"][:3],
            }
        )

    if not options:
        options = [{"reason": "Generic best practice", "step": "featurize", "params": {"nan_strategy": "impute"}, "evidence": []}]

    return {
        "type": "plan",
        "task": task,
        "modality": modality,
        "label_guess": label_guess,
        "options": options,
        "keyword_hits": hits,
    }
278
+
279
+
280
@tool("run_step", return_direct=True)
def tool_run_step(name: str, params_json: str = "") -> Dict[str, Any]:
    """
    Execute a single pipeline step on the CURRENT dataset (no df argument).
    Returns ONLY a compact summary; the updated df is stored in runtime context.
    """
    payload = get_df_payload()
    if payload is None:
        raise RuntimeError("run_step: no dataset available in runtime context.")
    if name not in STEP_FUNCS:
        raise ValueError(f"Unknown step '{name}'. Available: {list(STEP_FUNCS)}")

    # Decode optional JSON parameters; must be an object (kwargs dict).
    params = json.loads(params_json) if params_json else {}
    if not isinstance(params, dict):
        raise ValueError("params_json must decode to a JSON object")

    frame = df_from_payload(payload)
    result_frame, stats = STEP_FUNCS[name](df=frame, **params)
    # Steps may return None to signal "unchanged" — keep the input frame then.
    next_frame = frame if result_frame is None else result_frame

    # ✅ update runtime dataset, but DO NOT send it back in the tool message
    set_df_payload(df_to_payload(next_frame))

    # Build a tiny, safe summary for the model.
    shape_before = (len(frame), len(frame.columns))
    shape_after = (len(next_frame), len(next_frame.columns))

    # Only forward a whitelisted subset of step stats to keep the payload small.
    stat_keys = [
        "n_rows_before_dedup", "n_near_dupe_pairs", "n_groups",
        "n_rows_flagged_duplicates", "n_rows_after_dedup",
        "metric", "threshold", "k", "total_time_sec",
    ]
    compact_stats: Dict[str, Any] = {}
    for key in stat_keys:
        if key in stats:
            compact_stats[key] = stats.get(key)

    return {
        "type": "step_result",
        "name": name,
        "params_used": params,
        "shape_before": shape_before,
        "shape_after": shape_after,
        "stats": compact_stats,  # small dict only
        "note": "Dataset updated in runtime context; use list_versions/reset_to_version if needed."
    }
321
+
322
# ---------------------------
# Optional helpers for version control from chat/agent
# ---------------------------
@tool("list_versions", return_direct=True)
def tool_list_versions() -> Dict[str, Any]:
    """
    Return a lightweight view of the version stack:
      { count, current_index, has_base, meta: [{...}, ...] }
    """
    # Delegate to the runtime-context helper and tag the payload type.
    state = _list_versions_state()
    versions: Dict[str, Any] = {"type": "versions"}
    versions.update(state)
    return versions
332
+
333
+
334
@tool("reset_to_version", return_direct=True)
def tool_reset_to_version(spec: str) -> Dict[str, Any]:
    """
    Move CURRENT pointer to a prior version without deleting history.
    spec can be: "base" | "prev" | "@-1" | "@-2" | "3"
    """
    # Accept an absolute index ("3") or a symbolic spec ("base"/"prev"/"@-k").
    try:
        if spec.isdigit():
            _reset_current_to(int(spec))
        else:
            _reset_current_to(spec)
    except Exception as e:
        # FIX: chain the original exception so the underlying cause (bad index,
        # unknown spec, ...) is preserved in the traceback instead of being lost.
        raise RuntimeError(f"reset_to_version: {e}") from e

    # Return the (possibly moved) current dataset plus the version metadata.
    df_payload = get_df_payload()
    return {
        "type": "reset",
        "current": _list_versions_state(),
        "df": df_payload,
    }
app.py ADDED
@@ -0,0 +1,532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+ from typing import Optional, Tuple, List, Dict, Any
5
+ import io
6
+ import base64
7
+ from tqdm.auto import tqdm
8
+ from dataclasses import dataclass
9
+ import gradio as gr
10
+ import json
11
+
12
+ from pipeline.deduplication import find_near_duplicates
13
+ from pipeline.featurizer import custom_featurizer
14
+ from pipeline.issues import find_issues
15
+ from pipeline.pipeline import make_step, run_pipeline
16
+
17
+ from ecg_analyzer import ECGAnalyzer
18
+ from agent.simple_chat import simple_chat
19
+ # ============================================================================
20
+ # ANALYSIS TASK CONFIGURATION
21
+ # ============================================================================
22
+
23
@dataclass
class TaskConfig:
    """Configuration for each analysis task"""
    # Display name of the task (e.g. "Near-Duplicate Detection").
    name: str
    # Data family the task belongs to ("EHR Data" or "ECG Data").
    data_type: str
    # Whether the task exposes extra parameter widgets in the UI.
    requires_params: bool
    # Specs for the Gradio parameter widgets; each dict carries
    # "type", "label" and "elem_id" keys (see TaskRegistry.get_config).
    param_components: List[Dict[str, Any]]
    # Names of the result tabs this task populates
    # (subset of: "original", "processed", "summary", "visualization").
    output_tabs: List[str]
31
+
32
class TaskRegistry:
    """Registry mapping tasks to their configurations"""

    @staticmethod
    def get_config(data_type: str, task_name: str) -> Optional[TaskConfig]:
        """Get configuration for a specific task"""
        # Static lookup table: data type -> task name -> TaskConfig.
        # Returns None for unknown (data_type, task_name) combinations.
        configs = {
            "EHR Data": {
                "Near-Duplicate Detection": TaskConfig(
                    name="Near-Duplicate Detection",
                    data_type="EHR Data",
                    requires_params=True,
                    param_components=[
                        {"type": "dropdown", "label": "Label Column", "elem_id": "ndd_label"}
                    ],
                    output_tabs=["original", "processed", "summary"]
                ),
                "Find Mislabeled Data": TaskConfig(
                    name="Find Mislabeled Data",
                    data_type="EHR Data",
                    requires_params=True,
                    param_components=[
                        {"type": "dropdown", "label": "Label Column", "elem_id": "mislabel_label"}
                    ],
                    output_tabs=["original", "summary"]
                )
            },
            "ECG Data": {
                "ECG Visualization": TaskConfig(
                    name="ECG Visualization",
                    data_type="ECG Data",
                    requires_params=True,
                    param_components=[
                        {"type": "checkboxgroup", "label": "Select Leads", "elem_id": "ecg_leads"},
                        {"type": "checkboxgroup", "label": "Visualization Types", "elem_id": "ecg_viz_types"}
                    ],
                    output_tabs=["visualization", "summary"]
                ),
                "Statistical Summary": TaskConfig(
                    name="Statistical Summary",
                    data_type="ECG Data",
                    requires_params=True,
                    param_components=[
                        {"type": "checkboxgroup", "label": "Select Leads", "elem_id": "ecg_stats_leads"}
                    ],
                    output_tabs=["summary", "visualization"]
                )
            }
        }
        return configs.get(data_type, {}).get(task_name)

    @staticmethod
    def get_tasks_for_data_type(data_type: str) -> List[str]:
        """Get available tasks for a data type"""
        # Returns [] for unknown data types.
        tasks = {
            "EHR Data": ["Near-Duplicate Detection", "Find Mislabeled Data"],
            "ECG Data": ["ECG Visualization", "Statistical Summary"]
        }
        return tasks.get(data_type, [])
91
+
92
+
93
+ # ============================================================================
94
+ # ANALYSIS EXECUTION
95
+ # ============================================================================
96
+
97
class AnalysisExecutor:
    """Executes analysis tasks and returns results"""
    # Each execute_* method returns a (status_message, results_dict) pair.
    # The results dict keys line up with the output tabs declared in
    # TaskRegistry ("original", "processed", "summary", "visualization").
    # Errors never propagate: every method catches Exception and reports
    # it via the status string so the UI stays responsive.

    @staticmethod
    def execute_near_duplicate_detection(df: pd.DataFrame, label: str) -> Tuple[str, Dict[str, Any]]:
        """Execute near-duplicate detection pipeline"""
        try:
            # Label column is required by the featurize/find_issues steps.
            if not label:
                return "⚠ Label column required", {"original": df, "processed": None, "summary": None}

            # One shared tqdm bar is threaded through all pipeline steps.
            bar = tqdm(total=100, leave=False, desc="Pipeline Progress")
            steps = [
                make_step(find_near_duplicates, name="dedup")(progress=bar),
                make_step(custom_featurizer, name="featurize")(
                    label=label, nan_strategy="impute", on_pipeline_error="drop", progress=bar
                ),
                make_step(find_issues, name="find_label_issues")(label=label, progress=bar),
            ]
            results_df, summary_list = run_pipeline(steps, df=df)
            bar.close()

            return "✓ Near-duplicate detection completed", {
                "original": df, "processed": results_df, "summary": summary_list
            }
        except Exception as e:
            return f"✗ Error: {str(e)}", {"original": df, "processed": None, "summary": None}

    @staticmethod
    def execute_find_mislabeled(df: pd.DataFrame, label: str) -> Tuple[str, Dict[str, Any]]:
        """Execute mislabeled data detection"""
        # NOTE(review): this is currently a stub — "suspicious_samples" is
        # hard-coded to 0 and no detection is actually performed.
        try:
            if not label:
                return "⚠ Label column required", {"original": df, "summary": None}

            summary = {
                "task": "Find Mislabeled Data", "label_column": label, "total_samples": len(df),
                "suspicious_samples": 0, "message": "Mislabeled detection analysis completed"
            }
            return "✓ Mislabeled data analysis completed", {"original": df, "summary": summary}
        except Exception as e:
            return f"✗ Error: {str(e)}", {"original": df, "summary": None}

    @staticmethod
    def execute_ecg_visualization(df: pd.DataFrame, leads: List[str] = None, viz_types: List[str] = None) -> Tuple[str, Dict[str, Any]]:
        """Execute ECG visualization using ECGAnalyzer"""
        try:
            # Detect available leads
            available_leads = ECGAnalyzer.detect_leads(df)

            # Use provided leads or default to all available
            if not leads:
                leads = available_leads if available_leads else []

            if not leads:
                return "⚠ No ECG leads found in data", {"visualization": None, "summary": None}

            # Default visualization types
            if not viz_types:
                viz_types = ["Signal Waveform", "Histogram"]

            # Create visualizations (ECGAnalyzer returns HTML markup).
            viz_html = ECGAnalyzer.create_all_visualizations(df, leads, viz_types)

            # Generate statistics
            stats = ECGAnalyzer.generate_statistics(df, leads)

            summary = {
                "task": "ECG Visualization",
                "samples": len(df),
                "leads_analyzed": leads,
                "visualizations": viz_types,
                "statistics": stats
            }

            return "✓ ECG visualization created", {"visualization": viz_html, "summary": summary}
        except Exception as e:
            return f"✗ Error: {str(e)}", {"visualization": None, "summary": None}

    @staticmethod
    def execute_statistical_summary(df: pd.DataFrame, leads: List[str] = None) -> Tuple[str, Dict[str, Any]]:
        """Execute statistical summary using ECGAnalyzer"""
        try:
            # Detect available leads
            available_leads = ECGAnalyzer.detect_leads(df)

            # Use provided leads, else detected leads, else any numeric column
            # (this method is also used for non-ECG data).
            if not leads:
                leads = available_leads if available_leads else list(df.select_dtypes(include=[np.number]).columns)

            if not leads:
                return "⚠ No numeric columns found", {"summary": None, "visualization": None}

            # Generate statistics
            stats = ECGAnalyzer.generate_statistics(df, leads)

            # Create HTML table for statistics; rendered in the
            # "visualization" tab rather than the raw summary dict.
            html_rows = []
            html_rows.append("<table class='preview-table' style='margin: 20px auto; max-width: 900px;'>")
            html_rows.append("<thead><tr><th>Lead</th><th>Mean</th><th>Std</th><th>Min</th><th>Q25</th><th>Median</th><th>Q75</th><th>Max</th></tr></thead>")
            html_rows.append("<tbody>")

            # One table row per lead; assumes each per-lead stats dict carries
            # mean/std/min/q25/median/q75/max keys (produced by ECGAnalyzer).
            for lead, lead_stats in stats.items():
                html_rows.append(f"<tr>")
                html_rows.append(f"<td><strong>{lead}</strong></td>")
                html_rows.append(f"<td>{lead_stats['mean']:.4f}</td>")
                html_rows.append(f"<td>{lead_stats['std']:.4f}</td>")
                html_rows.append(f"<td>{lead_stats['min']:.4f}</td>")
                html_rows.append(f"<td>{lead_stats['q25']:.4f}</td>")
                html_rows.append(f"<td>{lead_stats['median']:.4f}</td>")
                html_rows.append(f"<td>{lead_stats['q75']:.4f}</td>")
                html_rows.append(f"<td>{lead_stats['max']:.4f}</td>")
                html_rows.append(f"</tr>")

            html_rows.append("</tbody></table>")
            summary_html = f"<div style='overflow-x:auto;'><h3 style='text-align:center;'>Statistical Summary</h3>{''.join(html_rows)}</div>"

            summary = {
                "task": "Statistical Summary",
                "rows": len(df),
                "leads_analyzed": leads,
                "statistics": stats
            }

            return "✓ Statistical summary generated", {"summary": summary, "visualization": summary_html}
        except Exception as e:
            return f"✗ Error: {str(e)}", {"summary": None, "visualization": None}
223
+
224
+
225
+ # ============================================================================
226
+ # UI MANAGER - Handles all UI state and updates
227
+ # ============================================================================
228
+
229
class UIManager:
    """Manages UI state and dynamic updates"""
    # Event handlers below return tuples of gr.update() objects whose ORDER
    # and ARITY must match the component lists wired up in create_interface —
    # do not reorder entries without updating the Gradio event bindings.

    def __init__(self):
        # Most recently loaded DataFrame (None until a CSV is uploaded).
        self.current_df = None
        # Currently selected data family; UI defaults to EHR.
        self.current_data_type = "EHR Data"
        # Context dict fed to the chatbot (file, type, df, summary, visualization).
        self.chatbot_context = {}
        # Short-command -> canonical-name mapping for chat-driven control.
        # NOTE(review): command_map is defined but not consulted anywhere in
        # the visible code — presumably reserved for future command parsing.
        self.command_map = {
            "data_type": {"ehr": "EHR Data", "ecg": "ECG Data"},
            "task": {
                "deduplication": "Near-Duplicate Detection", "mislabeled": "Find Mislabeled Data",
                "visualize_ecg": "ECG Visualization", "stats": "Statistical Summary"
            }
        }

    def load_csv(self, file) -> Tuple[str, Optional[pd.DataFrame]]:
        """Load CSV file"""
        # Returns (status_message, DataFrame-or-None); also caches the frame
        # on self.current_df on success.
        if file is None: return "⚠ No file uploaded", None
        try:
            df = pd.read_csv(file.name)
            self.current_df = df
            return f"✓ Loaded {len(df)} rows, {len(df.columns)} columns", df
        except Exception as e:
            return f"✗ Error: {str(e)}", None

    def on_file_upload(self, file, data_type: str):
        """Handle file upload - returns updates for all components"""
        status, df = self.load_csv(file)
        if df is None:
            # Failed load: clear the preview, empty the task list, and hide
            # every parameter group and output tab.
            return (
                status, gr.update(value=None), gr.update(choices=[], value=[]),
                gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
                gr.update(choices=[]), gr.update(choices=[]),
                gr.update(choices=[]), gr.update(choices=[], value=[]),
                gr.update(choices=[]), gr.update(choices=[], value=[]),
                gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
            )

        # Seed the chatbot context with the freshly loaded dataset.
        self.chatbot_context = {"file": file.name, "type": data_type, "df": df}
        available_tasks = TaskRegistry.get_tasks_for_data_type(data_type)
        col_choices = list(df.columns)

        # Detect ECG leads if ECG data
        ecg_leads = ECGAnalyzer.detect_leads(df) if data_type == "ECG Data" else []
        viz_types = ["Signal Waveform", "Histogram", "Scatter Plot", "Rolling Average"]

        # Preview is capped at 200 rows to keep the UI responsive.
        return (
            status, gr.update(value=df.head(200)), gr.update(choices=available_tasks, value=[], interactive=True),
            gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
            gr.update(choices=col_choices, value=None), gr.update(choices=col_choices, value=None),
            gr.update(choices=ecg_leads, value=ecg_leads), gr.update(choices=viz_types, value=["Signal Waveform", "Histogram"]),
            gr.update(choices=ecg_leads, value=ecg_leads), gr.update(choices=viz_types, value=["Signal Waveform"]),
            gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
        )

    def on_data_type_change(self, data_type: str, file):
        """Handle data type change"""
        self.current_data_type = data_type
        if file and self.current_df is not None:
            # A dataset is already loaded: refresh context and lead choices.
            self.chatbot_context["type"] = data_type

            # Update ECG lead choices if switching to ECG data
            ecg_leads = ECGAnalyzer.detect_leads(self.current_df) if data_type == "ECG Data" else []
            viz_types = ["Signal Waveform", "Histogram", "Scatter Plot", "Rolling Average"]

            return (
                gr.update(choices=TaskRegistry.get_tasks_for_data_type(data_type), value=[]),
                gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
                gr.update(choices=ecg_leads, value=ecg_leads), gr.update(choices=viz_types, value=["Signal Waveform", "Histogram"]),
                gr.update(choices=ecg_leads, value=ecg_leads), gr.update(choices=viz_types, value=["Signal Waveform"]),
                f"Data type changed to: {data_type}"
            )

        # No dataset loaded yet: only swap the task list; leave the ECG
        # widgets untouched (bare gr.update() is a no-op).
        return (
            gr.update(choices=TaskRegistry.get_tasks_for_data_type(data_type), value=[]),
            gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
            gr.update(), gr.update(), gr.update(), gr.update(),
            f"Data type changed to: {data_type}"
        )

    def on_tasks_change(self, selected_tasks: List[str]):
        """Handle task selection change - show/hide parameter groups"""
        # One visibility flag per parameter group, in fixed order:
        # near-dup, mislabeled, ECG viz, ECG stats.
        show_ndd = "Near-Duplicate Detection" in selected_tasks
        show_mislabel = "Find Mislabeled Data" in selected_tasks
        show_ecg_viz = "ECG Visualization" in selected_tasks
        # Statistical Summary params only apply to ECG data.
        show_ecg_stats = "Statistical Summary" in selected_tasks and self.current_data_type == "ECG Data"
        return (
            gr.update(visible=show_ndd),
            gr.update(visible=show_mislabel),
            gr.update(visible=show_ecg_viz),
            gr.update(visible=show_ecg_stats)
        )

    def process_analysis(self, file, data_type: str, selected_tasks: List[str],
                         ndd_label: str, mislabel_label: str,
                         ecg_viz_leads: List[str], ecg_viz_types: List[str],
                         ecg_stats_leads: List[str]):
        """Process analysis tasks based on UI inputs."""
        # Re-reads the CSV from disk (rather than self.current_df) so the
        # analysis always runs against the uploaded file's latest contents.
        status, df = self.load_csv(file)
        if df is None:
            return (status, None, None, None, None, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False))

        # Collect the per-task widget values into one params dict.
        params = {
            "ndd_label": ndd_label,
            "mislabel_label": mislabel_label,
            "ecg_viz_leads": ecg_viz_leads,
            "ecg_viz_types": ecg_viz_types,
            "ecg_stats_leads": ecg_stats_leads
        }
        return self._run_analysis(df, data_type, selected_tasks, params)

    def _run_analysis(self, df: pd.DataFrame, data_type: str, selected_tasks: List[str], params: Dict[str, Any]):
        """Centralized analysis executor, callable from UI or chatbot."""
        if not selected_tasks:
            return (
                "⚠ No tasks selected", df.head(200), None, None, None,
                gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
            )

        # Accumulators: which tabs to show, merged results, and per-task status.
        all_tabs = set(); all_results = {"original": df.head(200), "processed": None, "summary": [], "visualization": ""}
        status_messages = []; executor = AnalysisExecutor()

        for task_name in selected_tasks:
            config = TaskRegistry.get_config(data_type, task_name)
            if not config:
                status_messages.append(f"✗ Unknown task: {task_name}"); continue
            all_tabs.update(config.output_tabs)

            # Dispatch each known task to its executor with the matching params.
            if task_name == "Near-Duplicate Detection":
                status_msg, results = executor.execute_near_duplicate_detection(df, params.get("ndd_label"))
            elif task_name == "Find Mislabeled Data":
                status_msg, results = executor.execute_find_mislabeled(df, params.get("mislabel_label"))
            elif task_name == "ECG Visualization":
                status_msg, results = executor.execute_ecg_visualization(
                    df,
                    params.get("ecg_viz_leads"),
                    params.get("ecg_viz_types")
                )
            elif task_name == "Statistical Summary":
                if data_type == "ECG Data":
                    status_msg, results = executor.execute_statistical_summary(df, params.get("ecg_stats_leads"))
                else:
                    status_msg, results = executor.execute_statistical_summary(df)
            else:
                status_msg, results = "✗ Task not implemented", {}

            # Merge this task's outputs: last "processed" wins, visualization
            # HTML is concatenated, summaries are collected per task.
            status_messages.append(f"{task_name}: {status_msg}")
            if results.get("processed") is not None: all_results["processed"] = results["processed"]
            if results.get("visualization"): all_results["visualization"] += results["visualization"]
            if results.get("summary") is not None: all_results["summary"].append({"task": task_name, "data": results["summary"]})

        # Expose the latest results to the chatbot.
        self.chatbot_context["summary"] = all_results["summary"] or None
        self.chatbot_context["visualization"] = all_results["visualization"] or None

        return (
            "\n".join(status_messages), all_results["original"], all_results["processed"],
            all_results["summary"] or None, all_results["visualization"] or None,
            gr.update(visible="original" in all_tabs), gr.update(visible="processed" in all_tabs),
            gr.update(visible="summary" in all_tabs), gr.update(visible="visualization" in all_tabs)
        )

    def chatbot_respond(self, message: str, history: List):
        """Handle chatbot messages, parsing for commands or responding to queries."""
        history = history or []; df = self.chatbot_context.get("df")

        # Only the most recent task summary is forwarded as chat context.
        summary = json.dumps(self.chatbot_context.get("summary")[-1]) if self.chatbot_context.get("summary") else ""

        # visualization = self.chatbot_context.get("visualization")
        # Visualization context is currently disabled (HTML is too large/noisy
        # to feed to the LLM); kept as an empty string placeholder.
        visualization = ''

        # Debug tracing of the chat exchange.
        print("history:", history)
        print("# ============================================================================\n ")
        print("message:", message)
        print("# ============================================================================\n ")
        print("summary:", summary)
        print("# ============================================================================\n ")
        print("visualization:", visualization)
        print("# ============================================================================\n ")

        # No-op updates so the chat handler can share the analysis outputs'
        # event signature without changing them.
        ui_updates = tuple([gr.update()] * 9)  # status, 4 outputs, 4 tabs

        command = message
        context = summary + visualization if summary or visualization else ""
        response = simple_chat(command, context)

        history.append((message, response))
        return (history, "") + ui_updates
416
+
417
+
418
+ # ============================================================================
419
+ # GRADIO INTERFACE
420
+ # ============================================================================
421
+
422
def create_interface():
    """Build the Gradio interface.

    Lays out three columns (task controls, tabbed results, chat assistant)
    and wires the UIManager callbacks to the components. Returns the
    gr.Blocks app, ready for .launch().
    """
    ui_manager = UIManager()
    # CSS pins the app to the viewport height and makes panels flex/scroll.
    custom_css = """
    * { box-sizing: border-box; } html, body { margin: 0; padding: 0; height: 100vh; overflow: hidden; }
    .gradio-container { height: 100vh !important; max-width: 100% !important; padding: 0 !important; }
    #app-container { height: 100vh; display: flex; flex-direction: column; padding: 0.75rem; gap: 0.75rem; }
    #main-row { flex: 1; min-height: 0; display: flex; gap: 0.75rem; }
    #left-panel { display: flex; flex-direction: column; height: 100%; background: #f9fafb; border-radius: 10px; padding: 0.75rem; gap: 0.5rem; }
    #task-section { flex: 1; min-height: 0; overflow-y: auto; display: flex; flex-direction: column; gap: 0.5rem; }
    #middle-panel, #chat-panel { display: flex; flex-direction: column; height: 100%; }
    #tabs-container { flex: 1; min-height: 0; display: flex; flex-direction: column; }
    #tabs-container .tabitem { flex: 1; min-height: 0; overflow: auto; }
    #chat-history { flex: 1; min-height: 0; overflow-y: auto; margin-bottom: 0.5rem; }
    #chat-input-row { flex-shrink: 0; display: flex; gap: 0.5rem; }
    .preview-table { border-collapse: collapse; width: 100%; font-size: 0.875rem; }
    .preview-table th { background-color: #3498db; color: white; padding: 8px; text-align: left; position: sticky; top: 0; }
    """

    with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="Medical Data Analysis Platform") as demo:
        with gr.Column(elem_id="app-container"):
            gr.Markdown("# 🏥 Medical Data Analysis Platform")
            # --- Top bar: file upload + data-type selector -------------------
            with gr.Row():
                file_input = gr.File(label="Upload CSV", file_types=[".csv"], scale=2)
                data_type = gr.Dropdown(choices=["EHR Data", "ECG Data"], value="EHR Data", label="Data Type", scale=1)

            with gr.Row(elem_id="main-row"):
                # --- Left panel: task selection and per-task parameters ------
                with gr.Column(scale=2, elem_id="left-panel"):
                    with gr.Group(elem_id="task-section"):
                        gr.Markdown("#### Analysis Tasks")
                        task_selector = gr.CheckboxGroup(choices=TaskRegistry.get_tasks_for_data_type("EHR Data"), label=None)
                        # Parameter groups start hidden; on_tasks_change toggles them.
                        with gr.Group(visible=False) as ndd_param_group:
                            gr.Markdown("**Near-Duplicate Detection Parameters**")
                            ndd_label_dropdown = gr.Dropdown(choices=[], label="Label Column")
                        with gr.Group(visible=False) as mislabel_param_group:
                            gr.Markdown("**Find Mislabeled Data Parameters**")
                            mislabel_label_dropdown = gr.Dropdown(choices=[], label="Label Column")
                        with gr.Group(visible=False) as ecg_viz_param_group:
                            gr.Markdown("**ECG Visualization Parameters**")
                            ecg_viz_leads = gr.CheckboxGroup(choices=[], label="Select Leads", value=[])
                            ecg_viz_types = gr.CheckboxGroup(
                                choices=["Signal Waveform", "Histogram", "Scatter Plot", "Rolling Average"],
                                label="Visualization Types",
                                value=["Signal Waveform", "Histogram"]
                            )
                        with gr.Group(visible=False) as ecg_stats_param_group:
                            gr.Markdown("**Statistical Summary Parameters**")
                            ecg_stats_leads = gr.CheckboxGroup(choices=[], label="Select Leads", value=[])
                    process_btn = gr.Button("▶ Process", variant="primary")
                    status_output = gr.Textbox(label="Status", interactive=False, lines=2)

                # --- Middle panel: tabbed result views (hidden until used) ---
                with gr.Column(scale=7, elem_id="middle-panel"):
                    with gr.Tabs(elem_id="tabs-container"):
                        with gr.TabItem("Original Data", visible=False) as tab_original:
                            original_df_output = gr.DataFrame(interactive=False)
                        with gr.TabItem("Processed Data", visible=False) as tab_processed:
                            processed_df_output = gr.DataFrame(interactive=False)
                        with gr.TabItem("Summary", visible=False) as tab_summary:
                            summary_output = gr.JSON()
                        with gr.TabItem("Visualization", visible=False) as tab_viz:
                            viz_output = gr.HTML()

                # --- Right panel: chat assistant -----------------------------
                with gr.Column(scale=3, elem_id="chat-panel"):
                    gr.Markdown("### 💬 AI Assistant")
                    chatbot = gr.Chatbot(elem_id="chat-history", height="100%")
                    with gr.Row(elem_id="chat-input-row"):
                        msg_input = gr.Textbox(placeholder="Ask or send a JSON command...", scale=4, container=False)
                        send_btn = gr.Button("Send", scale=1)

            # Shared output list: status, 4 result panes, 4 tab toggles — the
            # order must match the tuples returned by the UIManager callbacks.
            analysis_outputs = [
                status_output, original_df_output, processed_df_output, summary_output, viz_output,
                tab_original, tab_processed, tab_summary, tab_viz
            ]

            # NOTE(review): ecg_viz_types appears twice in this outputs list,
            # so the later of the two matching updates returned by
            # on_file_upload overwrites the earlier one — presumably a
            # separate stats viz-type component was intended. Confirm.
            file_input.change(
                fn=ui_manager.on_file_upload, inputs=[file_input, data_type],
                outputs=[status_output, original_df_output, task_selector,
                        ndd_param_group, mislabel_param_group, ecg_viz_param_group, ecg_stats_param_group,
                        ndd_label_dropdown, mislabel_label_dropdown,
                        ecg_viz_leads, ecg_viz_types, ecg_stats_leads, ecg_viz_types,
                        tab_original, tab_processed, tab_summary, tab_viz]
            )
            # NOTE(review): same duplicated ecg_viz_types output here.
            data_type.change(
                fn=ui_manager.on_data_type_change, inputs=[data_type, file_input],
                outputs=[task_selector, ndd_param_group, mislabel_param_group, ecg_viz_param_group, ecg_stats_param_group,
                        ecg_viz_leads, ecg_viz_types, ecg_stats_leads, ecg_viz_types, status_output]
            )
            task_selector.change(
                fn=ui_manager.on_tasks_change, inputs=[task_selector],
                outputs=[ndd_param_group, mislabel_param_group, ecg_viz_param_group, ecg_stats_param_group]
            )
            process_btn.click(
                fn=ui_manager.process_analysis,
                inputs=[file_input, data_type, task_selector, ndd_label_dropdown, mislabel_label_dropdown,
                        ecg_viz_leads, ecg_viz_types, ecg_stats_leads],
                outputs=analysis_outputs
            )

            # Both the Send button and Enter in the textbox submit the chat.
            chat_submit_args = {"fn": ui_manager.chatbot_respond, "inputs": [msg_input, chatbot], "outputs": [chatbot, msg_input] + analysis_outputs}
            send_btn.click(**chat_submit_args)
            msg_input.submit(**chat_submit_args)

    return demo
525
+
526
+ # ============================================================================
527
+ # LAUNCH
528
+ # ============================================================================
529
+
530
if __name__ == "__main__":
    # Bind to all interfaces on port 7890 without a public share link.
    create_interface().launch(share=False, server_name="0.0.0.0", server_port=7890)
app_studio.py ADDED
@@ -0,0 +1,543 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+ from typing import Optional, Tuple, List, Dict, Any
5
+ import io
6
+ import base64
7
+ from tqdm.auto import tqdm
8
+ from dataclasses import dataclass
9
+ import gradio as gr
10
+ import json
11
+
12
+ from pipeline.deduplication import find_near_duplicates
13
+ from pipeline.featurizer import custom_featurizer
14
+ from pipeline.issues import find_issues
15
+ from pipeline.pipeline import make_step, run_pipeline
16
+
17
+ from ecg_analyzer import ECGAnalyzer
18
+
19
+ # ============================================================================
20
+ # ANALYSIS TASK CONFIGURATION
21
+ # ============================================================================
22
+
23
@dataclass
class TaskConfig:
    """Configuration for each analysis task"""
    name: str                               # Display name, e.g. "ECG Visualization"
    data_type: str                          # Data category the task applies to ("EHR Data" / "ECG Data")
    requires_params: bool                   # Whether the task exposes user-set parameters in the UI
    param_components: List[Dict[str, Any]]  # Declarative specs of the parameter widgets (type/label/elem_id)
    output_tabs: List[str]                  # Result tabs to reveal: "original", "processed", "summary", "visualization"
31
+
32
+ class TaskRegistry:
33
+ """Registry mapping tasks to their configurations"""
34
+
35
+ @staticmethod
36
+ def get_config(data_type: str, task_name: str) -> Optional[TaskConfig]:
37
+ """Get configuration for a specific task"""
38
+ configs = {
39
+ "EHR Data": {
40
+ "Near-Duplicate Detection": TaskConfig(
41
+ name="Near-Duplicate Detection",
42
+ data_type="EHR Data",
43
+ requires_params=True,
44
+ param_components=[
45
+ {"type": "dropdown", "label": "Label Column", "elem_id": "ndd_label"}
46
+ ],
47
+ output_tabs=["original", "processed", "summary"]
48
+ ),
49
+ "Find Mislabeled Data": TaskConfig(
50
+ name="Find Mislabeled Data",
51
+ data_type="EHR Data",
52
+ requires_params=True,
53
+ param_components=[
54
+ {"type": "dropdown", "label": "Label Column", "elem_id": "mislabel_label"}
55
+ ],
56
+ output_tabs=["original", "summary"]
57
+ )
58
+ },
59
+ "ECG Data": {
60
+ "ECG Visualization": TaskConfig(
61
+ name="ECG Visualization",
62
+ data_type="ECG Data",
63
+ requires_params=True,
64
+ param_components=[
65
+ {"type": "checkboxgroup", "label": "Select Leads", "elem_id": "ecg_leads"},
66
+ {"type": "checkboxgroup", "label": "Visualization Types", "elem_id": "ecg_viz_types"}
67
+ ],
68
+ output_tabs=["visualization", "summary"]
69
+ ),
70
+ "Statistical Summary": TaskConfig(
71
+ name="Statistical Summary",
72
+ data_type="ECG Data",
73
+ requires_params=True,
74
+ param_components=[
75
+ {"type": "checkboxgroup", "label": "Select Leads", "elem_id": "ecg_stats_leads"}
76
+ ],
77
+ output_tabs=["summary", "visualization"]
78
+ )
79
+ }
80
+ }
81
+ return configs.get(data_type, {}).get(task_name)
82
+
83
+ @staticmethod
84
+ def get_tasks_for_data_type(data_type: str) -> List[str]:
85
+ """Get available tasks for a data type"""
86
+ tasks = {
87
+ "EHR Data": ["Near-Duplicate Detection", "Find Mislabeled Data"],
88
+ "ECG Data": ["ECG Visualization", "Statistical Summary"]
89
+ }
90
+ return tasks.get(data_type, [])
91
+
92
+
93
+ # ============================================================================
94
+ # ANALYSIS EXECUTION
95
+ # ============================================================================
96
+
97
class AnalysisExecutor:
    """Executes analysis tasks and returns results.

    Every method returns a (status_message, results_dict) pair. The dict
    keys ("original", "processed", "summary", "visualization") feed the
    matching UI tabs. Exceptions are caught and reported via the status
    string rather than raised.
    """

    @staticmethod
    def execute_near_duplicate_detection(df: pd.DataFrame, label: str) -> Tuple[str, Dict[str, Any]]:
        """Execute near-duplicate detection pipeline"""
        try:
            if not label:
                return "⚠ Label column required", {"original": df, "processed": None, "summary": None}

            # One progress bar shared by all pipeline steps.
            bar = tqdm(total=100, leave=False, desc="Pipeline Progress")
            # Step order matters: deduplicate first, then featurize, then
            # run label-issue detection on the featurized frame.
            steps = [
                make_step(find_near_duplicates, name="dedup")(progress=bar),
                make_step(custom_featurizer, name="featurize")(
                    label=label, nan_strategy="impute", on_pipeline_error="drop", progress=bar
                ),
                make_step(find_issues, name="find_label_issues")(label=label, progress=bar),
            ]
            results_df, summary_list = run_pipeline(steps, df=df)
            bar.close()

            return "✓ Near-duplicate detection completed", {
                "original": df, "processed": results_df, "summary": summary_list
            }
        except Exception as e:
            # NOTE(review): the progress bar is not closed on this path —
            # consider a finally block. Confirm tqdm cleanup behavior.
            return f"✗ Error: {str(e)}", {"original": df, "processed": None, "summary": None}

    @staticmethod
    def execute_find_mislabeled(df: pd.DataFrame, label: str) -> Tuple[str, Dict[str, Any]]:
        """Execute mislabeled data detection"""
        try:
            if not label:
                return "⚠ Label column required", {"original": df, "summary": None}

            # Placeholder implementation: no detection is actually run —
            # suspicious_samples is hard-coded to 0.
            summary = {
                "task": "Find Mislabeled Data", "label_column": label, "total_samples": len(df),
                "suspicious_samples": 0, "message": "Mislabeled detection analysis completed"
            }
            return "✓ Mislabeled data analysis completed", {"original": df, "summary": summary}
        except Exception as e:
            return f"✗ Error: {str(e)}", {"original": df, "summary": None}

    @staticmethod
    def execute_ecg_visualization(df: pd.DataFrame, leads: List[str] = None, viz_types: List[str] = None) -> Tuple[str, Dict[str, Any]]:
        """Execute ECG visualization using ECGAnalyzer.

        leads/viz_types default to all detected leads and a basic chart set
        when not supplied by the caller.
        """
        try:
            # Detect available leads
            available_leads = ECGAnalyzer.detect_leads(df)

            # Use provided leads or default to all available
            if not leads:
                leads = available_leads if available_leads else []

            if not leads:
                return "⚠ No ECG leads found in data", {"visualization": None, "summary": None}

            # Default visualization types
            if not viz_types:
                viz_types = ["Signal Waveform", "Histogram"]

            # Create visualizations
            viz_html = ECGAnalyzer.create_all_visualizations(df, leads, viz_types)

            # Generate statistics
            stats = ECGAnalyzer.generate_statistics(df, leads)

            summary = {
                "task": "ECG Visualization",
                "samples": len(df),
                "leads_analyzed": leads,
                "visualizations": viz_types,
                "statistics": stats
            }

            return "✓ ECG visualization created", {"visualization": viz_html, "summary": summary}
        except Exception as e:
            return f"✗ Error: {str(e)}", {"visualization": None, "summary": None}

    @staticmethod
    def execute_statistical_summary(df: pd.DataFrame, leads: List[str] = None) -> Tuple[str, Dict[str, Any]]:
        """Execute statistical summary using ECGAnalyzer.

        Falls back to all numeric columns when no ECG leads are detected,
        so this also works for plain EHR data. The HTML table is returned
        under "visualization"; the raw stats dict under "summary".
        """
        try:
            # Detect available leads
            available_leads = ECGAnalyzer.detect_leads(df)

            # Use provided leads or default to all available
            if not leads:
                leads = available_leads if available_leads else list(df.select_dtypes(include=[np.number]).columns)

            if not leads:
                return "⚠ No numeric columns found", {"summary": None, "visualization": None}

            # Generate statistics
            stats = ECGAnalyzer.generate_statistics(df, leads)

            # Create HTML table for statistics
            html_rows = []
            html_rows.append("<table class='preview-table' style='margin: 20px auto; max-width: 900px;'>")
            html_rows.append("<thead><tr><th>Lead</th><th>Mean</th><th>Std</th><th>Min</th><th>Q25</th><th>Median</th><th>Q75</th><th>Max</th></tr></thead>")
            html_rows.append("<tbody>")

            # One table row per lead; stats dict is assumed to expose the
            # keys used below (mean/std/min/q25/median/q75/max).
            for lead, lead_stats in stats.items():
                html_rows.append(f"<tr>")
                html_rows.append(f"<td><strong>{lead}</strong></td>")
                html_rows.append(f"<td>{lead_stats['mean']:.4f}</td>")
                html_rows.append(f"<td>{lead_stats['std']:.4f}</td>")
                html_rows.append(f"<td>{lead_stats['min']:.4f}</td>")
                html_rows.append(f"<td>{lead_stats['q25']:.4f}</td>")
                html_rows.append(f"<td>{lead_stats['median']:.4f}</td>")
                html_rows.append(f"<td>{lead_stats['q75']:.4f}</td>")
                html_rows.append(f"<td>{lead_stats['max']:.4f}</td>")
                html_rows.append(f"</tr>")

            html_rows.append("</tbody></table>")
            summary_html = f"<div style='overflow-x:auto;'><h3 style='text-align:center;'>Statistical Summary</h3>{''.join(html_rows)}</div>"

            summary = {
                "task": "Statistical Summary",
                "rows": len(df),
                "leads_analyzed": leads,
                "statistics": stats
            }

            return "✓ Statistical summary generated", {"summary": summary, "visualization": summary_html}
        except Exception as e:
            return f"✗ Error: {str(e)}", {"summary": None, "visualization": None}
223
+
224
+
225
+ # ============================================================================
226
+ # UI MANAGER - Handles all UI state and updates
227
+ # ============================================================================
228
+
229
class UIManager:
    """Manages UI state and dynamic updates.

    Callback return tuples are positional and must match the `outputs`
    lists wired in create_interface() exactly, element for element.
    """

    def __init__(self):
        # Most recently loaded DataFrame (set by load_csv).
        self.current_df = None
        # Currently selected data category; mirrors the UI dropdown.
        self.current_data_type = "EHR Data"
        # Context shared with the chatbot: file path, data type, DataFrame.
        self.chatbot_context = {}
        # Maps short chatbot command tokens to canonical UI names.
        self.command_map = {
            "data_type": {"ehr": "EHR Data", "ecg": "ECG Data"},
            "task": {
                "deduplication": "Near-Duplicate Detection", "mislabeled": "Find Mislabeled Data",
                "visualize_ecg": "ECG Visualization", "stats": "Statistical Summary"
            }
        }

    def load_csv(self, file) -> Tuple[str, Optional[pd.DataFrame]]:
        """Load CSV file.

        `file` is a Gradio file object exposing the temp path via .name.
        Returns (status message, DataFrame-or-None) and caches the frame
        on self.current_df on success.
        """
        if file is None: return "⚠ No file uploaded", None
        try:
            df = pd.read_csv(file.name)
            self.current_df = df
            return f"✓ Loaded {len(df)} rows, {len(df.columns)} columns", df
        except Exception as e:
            return f"✗ Error: {str(e)}", None

    def on_file_upload(self, file, data_type: str):
        """Handle file upload - returns updates for all components.

        Returns a 17-tuple: status, original-data preview, task selector,
        4 parameter-group visibilities, 2 label dropdowns, 4 ECG
        lead/viz-type checkbox updates, then 4 tab visibilities.
        """
        status, df = self.load_csv(file)
        if df is None:
            # Reset every wired component when loading fails.
            return (
                status, gr.update(value=None), gr.update(choices=[], value=[]),
                gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
                gr.update(choices=[]), gr.update(choices=[]),
                gr.update(choices=[]), gr.update(choices=[], value=[]),
                gr.update(choices=[]), gr.update(choices=[], value=[]),
                gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
            )

        self.chatbot_context = {"file": file.name, "type": data_type, "df": df}
        available_tasks = TaskRegistry.get_tasks_for_data_type(data_type)
        col_choices = list(df.columns)

        # Detect ECG leads if ECG data
        ecg_leads = ECGAnalyzer.detect_leads(df) if data_type == "ECG Data" else []
        viz_types = ["Signal Waveform", "Histogram", "Scatter Plot", "Rolling Average"]

        # Only the first 200 rows are previewed to keep the UI responsive.
        # NOTE(review): the wiring lists ecg_viz_types twice as an output,
        # so the last viz-type update below overwrites the earlier one —
        # confirm whether a separate stats viz-type component was intended.
        return (
            status, gr.update(value=df.head(200)), gr.update(choices=available_tasks, value=[], interactive=True),
            gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
            gr.update(choices=col_choices, value=None), gr.update(choices=col_choices, value=None),
            gr.update(choices=ecg_leads, value=ecg_leads), gr.update(choices=viz_types, value=["Signal Waveform", "Histogram"]),
            gr.update(choices=ecg_leads, value=ecg_leads), gr.update(choices=viz_types, value=["Signal Waveform"]),
            gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
        )

    def on_data_type_change(self, data_type: str, file):
        """Handle data type change.

        Returns a 10-tuple: task selector, 4 parameter-group visibilities,
        4 ECG checkbox updates, status text. The first branch refreshes
        lead choices when a file is already loaded; otherwise components
        are left untouched via no-op gr.update().
        """
        self.current_data_type = data_type
        if file and self.current_df is not None:
            self.chatbot_context["type"] = data_type

            # Update ECG lead choices if switching to ECG data
            ecg_leads = ECGAnalyzer.detect_leads(self.current_df) if data_type == "ECG Data" else []
            viz_types = ["Signal Waveform", "Histogram", "Scatter Plot", "Rolling Average"]

            return (
                gr.update(choices=TaskRegistry.get_tasks_for_data_type(data_type), value=[]),
                gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
                gr.update(choices=ecg_leads, value=ecg_leads), gr.update(choices=viz_types, value=["Signal Waveform", "Histogram"]),
                gr.update(choices=ecg_leads, value=ecg_leads), gr.update(choices=viz_types, value=["Signal Waveform"]),
                f"Data type changed to: {data_type}"
            )

        return (
            gr.update(choices=TaskRegistry.get_tasks_for_data_type(data_type), value=[]),
            gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
            gr.update(), gr.update(), gr.update(), gr.update(),
            f"Data type changed to: {data_type}"
        )

    def on_tasks_change(self, selected_tasks: List[str]):
        """Handle task selection change - show/hide parameter groups"""
        show_ndd = "Near-Duplicate Detection" in selected_tasks
        show_mislabel = "Find Mislabeled Data" in selected_tasks
        show_ecg_viz = "ECG Visualization" in selected_tasks
        # Stats parameters only make sense for ECG data (lead selection).
        show_ecg_stats = "Statistical Summary" in selected_tasks and self.current_data_type == "ECG Data"
        return (
            gr.update(visible=show_ndd),
            gr.update(visible=show_mislabel),
            gr.update(visible=show_ecg_viz),
            gr.update(visible=show_ecg_stats)
        )

    def process_analysis(self, file, data_type: str, selected_tasks: List[str],
                        ndd_label: str, mislabel_label: str,
                        ecg_viz_leads: List[str], ecg_viz_types: List[str],
                        ecg_stats_leads: List[str]):
        """Process analysis tasks based on UI inputs.

        Re-reads the CSV (rather than trusting cached state), packs the
        widget values into a params dict, and delegates to _run_analysis.
        """
        status, df = self.load_csv(file)
        if df is None:
            # 9-tuple matching analysis_outputs, with all tabs hidden.
            return (status, None, None, None, None, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False))

        params = {
            "ndd_label": ndd_label,
            "mislabel_label": mislabel_label,
            "ecg_viz_leads": ecg_viz_leads,
            "ecg_viz_types": ecg_viz_types,
            "ecg_stats_leads": ecg_stats_leads
        }
        return self._run_analysis(df, data_type, selected_tasks, params)

    def _run_analysis(self, df: pd.DataFrame, data_type: str, selected_tasks: List[str], params: Dict[str, Any]):
        """Centralized analysis executor, callable from UI or chatbot.

        Returns the 9-tuple consumed by analysis_outputs: status text, the
        four result panes, then four tab-visibility updates driven by each
        task's declared output_tabs.
        """
        if not selected_tasks:
            return (
                "⚠ No tasks selected", df.head(200), None, None, None,
                gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
            )

        # Accumulators: tabs to reveal, merged results, per-task status lines.
        all_tabs = set(); all_results = {"original": df.head(200), "processed": None, "summary": [], "visualization": ""}
        status_messages = []; executor = AnalysisExecutor()

        for task_name in selected_tasks:
            config = TaskRegistry.get_config(data_type, task_name)
            if not config:
                status_messages.append(f"✗ Unknown task: {task_name}"); continue
            all_tabs.update(config.output_tabs)

            # Dispatch to the matching executor method.
            if task_name == "Near-Duplicate Detection":
                status_msg, results = executor.execute_near_duplicate_detection(df, params.get("ndd_label"))
            elif task_name == "Find Mislabeled Data":
                status_msg, results = executor.execute_find_mislabeled(df, params.get("mislabel_label"))
            elif task_name == "ECG Visualization":
                status_msg, results = executor.execute_ecg_visualization(
                    df,
                    params.get("ecg_viz_leads"),
                    params.get("ecg_viz_types")
                )
            elif task_name == "Statistical Summary":
                # ECG data restricts the summary to the selected leads.
                if data_type == "ECG Data":
                    status_msg, results = executor.execute_statistical_summary(df, params.get("ecg_stats_leads"))
                else:
                    status_msg, results = executor.execute_statistical_summary(df)
            else:
                status_msg, results = "✗ Task not implemented", {}

            status_messages.append(f"{task_name}: {status_msg}")
            # Merge rules: last processed frame wins, visualization HTML
            # concatenates, summaries accumulate one entry per task.
            if results.get("processed") is not None: all_results["processed"] = results["processed"]
            if results.get("visualization"): all_results["visualization"] += results["visualization"]
            if results.get("summary") is not None: all_results["summary"].append({"task": task_name, "data": results["summary"]})

        return (
            "\n".join(status_messages), all_results["original"], all_results["processed"],
            all_results["summary"] or None, all_results["visualization"] or None,
            gr.update(visible="original" in all_tabs), gr.update(visible="processed" in all_tabs),
            gr.update(visible="summary" in all_tabs), gr.update(visible="visualization" in all_tabs)
        )

    def chatbot_respond(self, message: str, history: List):
        """Handle chatbot messages, parsing for commands or responding to queries.

        JSON messages shaped like {"task": ..., "data_type": ...,
        "parameters": {...}} trigger an analysis run; anything else gets a
        keyword-matched canned reply. Always returns (history, cleared
        textbox) followed by the 9 analysis output updates.
        """
        history = history or []; df = self.chatbot_context.get("df")
        ui_updates = tuple([gr.update()] * 9)  # status, 4 outputs, 4 tabs

        try:
            command = json.loads(message)
            if isinstance(command, dict) and "task" in command:
                if df is None:
                    history.append((message, "Please upload a data file before running a command."))
                    return (history, "") + ui_updates

                # Resolve shorthand tokens; fall back to the current data
                # type when none is specified in the command.
                data_type = self.command_map["data_type"].get(command.get("data_type", "").lower(), self.current_data_type)
                task_name = self.command_map["task"].get(command.get("task", "").lower())

                if not task_name:
                    response = f"Unknown task: '{command['task']}'. Valid: {list(self.command_map['task'].keys())}"
                    history.append((message, response))
                    return (history, "") + ui_updates

                params = command.get("parameters", {})
                # NOTE(review): only the label parameter is forwarded here;
                # ECG lead / viz-type parameters cannot be set via chat yet.
                analysis_params = {"ndd_label": params.get("label"), "mislabel_label": params.get("label")}

                analysis_updates = self._run_analysis(df, data_type, [task_name], analysis_params)
                history.append((message, f"Command executed: Running '{task_name}'."))
                return (history, "") + analysis_updates
        except (json.JSONDecodeError, TypeError):
            pass  # Not a JSON command, proceed with standard logic

        # Plain-text queries: simple keyword matching.
        if "column" in message.lower():
            response = (f"Dataset has {len(df.columns)} columns: {', '.join(map(str, df.columns))}" if df is not None else "Please upload a file first.")
        elif "row" in message.lower():
            response = f"Dataset has {len(df)} rows." if df is not None else "Please upload a file first."
        elif "help" in message.lower():
            response = "Ask about 'columns' or 'rows'. To run a task, send JSON, e.g., `{\"task\": \"stats\"}` or `{\"task\": \"deduplication\", \"parameters\": {\"label\": \"your_column\"}}`"
        else:
            response = "I can help with data queries or run tasks via JSON commands. Try asking 'help'."

        history.append((message, response))
        return (history, "") + ui_updates
427
+
428
+
429
+ # ============================================================================
430
+ # GRADIO INTERFACE
431
+ # ============================================================================
432
+
433
def create_interface():
    """Build the Gradio interface.

    Lays out three columns (task controls, tabbed results, chat assistant)
    and wires the UIManager callbacks to the components. Returns the
    gr.Blocks app, ready for .launch().
    """
    ui_manager = UIManager()
    # CSS pins the app to the viewport height and makes panels flex/scroll.
    custom_css = """
    * { box-sizing: border-box; } html, body { margin: 0; padding: 0; height: 100vh; overflow: hidden; }
    .gradio-container { height: 100vh !important; max-width: 100% !important; padding: 0 !important; }
    #app-container { height: 100vh; display: flex; flex-direction: column; padding: 0.75rem; gap: 0.75rem; }
    #main-row { flex: 1; min-height: 0; display: flex; gap: 0.75rem; }
    #left-panel { display: flex; flex-direction: column; height: 100%; background: #f9fafb; border-radius: 10px; padding: 0.75rem; gap: 0.5rem; }
    #task-section { flex: 1; min-height: 0; overflow-y: auto; display: flex; flex-direction: column; gap: 0.5rem; }
    #middle-panel, #chat-panel { display: flex; flex-direction: column; height: 100%; }
    #tabs-container { flex: 1; min-height: 0; display: flex; flex-direction: column; }
    #tabs-container .tabitem { flex: 1; min-height: 0; overflow: auto; }
    #chat-history { flex: 1; min-height: 0; overflow-y: auto; margin-bottom: 0.5rem; }
    #chat-input-row { flex-shrink: 0; display: flex; gap: 0.5rem; }
    .preview-table { border-collapse: collapse; width: 100%; font-size: 0.875rem; }
    .preview-table th { background-color: #3498db; color: white; padding: 8px; text-align: left; position: sticky; top: 0; }
    """

    with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="Medical Data Analysis Platform") as demo:
        with gr.Column(elem_id="app-container"):
            gr.Markdown("# 🏥 Medical Data Analysis Platform")
            # --- Top bar: file upload + data-type selector -------------------
            with gr.Row():
                file_input = gr.File(label="Upload CSV", file_types=[".csv"], scale=2)
                data_type = gr.Dropdown(choices=["EHR Data", "ECG Data"], value="EHR Data", label="Data Type", scale=1)

            with gr.Row(elem_id="main-row"):
                # --- Left panel: task selection and per-task parameters ------
                with gr.Column(scale=2, elem_id="left-panel"):
                    with gr.Group(elem_id="task-section"):
                        gr.Markdown("#### Analysis Tasks")
                        task_selector = gr.CheckboxGroup(choices=TaskRegistry.get_tasks_for_data_type("EHR Data"), label=None)
                        # Parameter groups start hidden; on_tasks_change toggles them.
                        with gr.Group(visible=False) as ndd_param_group:
                            gr.Markdown("**Near-Duplicate Detection Parameters**")
                            ndd_label_dropdown = gr.Dropdown(choices=[], label="Label Column")
                        with gr.Group(visible=False) as mislabel_param_group:
                            gr.Markdown("**Find Mislabeled Data Parameters**")
                            mislabel_label_dropdown = gr.Dropdown(choices=[], label="Label Column")
                        with gr.Group(visible=False) as ecg_viz_param_group:
                            gr.Markdown("**ECG Visualization Parameters**")
                            ecg_viz_leads = gr.CheckboxGroup(choices=[], label="Select Leads", value=[])
                            ecg_viz_types = gr.CheckboxGroup(
                                choices=["Signal Waveform", "Histogram", "Scatter Plot", "Rolling Average"],
                                label="Visualization Types",
                                value=["Signal Waveform", "Histogram"]
                            )
                        with gr.Group(visible=False) as ecg_stats_param_group:
                            gr.Markdown("**Statistical Summary Parameters**")
                            ecg_stats_leads = gr.CheckboxGroup(choices=[], label="Select Leads", value=[])
                    process_btn = gr.Button("▶ Process", variant="primary")
                    status_output = gr.Textbox(label="Status", interactive=False, lines=2)

                # --- Middle panel: tabbed result views (hidden until used) ---
                with gr.Column(scale=7, elem_id="middle-panel"):
                    with gr.Tabs(elem_id="tabs-container"):
                        with gr.TabItem("Original Data", visible=False) as tab_original:
                            original_df_output = gr.DataFrame(interactive=False)
                        with gr.TabItem("Processed Data", visible=False) as tab_processed:
                            processed_df_output = gr.DataFrame(interactive=False)
                        with gr.TabItem("Summary", visible=False) as tab_summary:
                            summary_output = gr.JSON()
                        with gr.TabItem("Visualization", visible=False) as tab_viz:
                            viz_output = gr.HTML()

                # --- Right panel: chat assistant -----------------------------
                with gr.Column(scale=3, elem_id="chat-panel"):
                    gr.Markdown("### 💬 AI Assistant")
                    chatbot = gr.Chatbot(elem_id="chat-history", height="100%")
                    with gr.Row(elem_id="chat-input-row"):
                        msg_input = gr.Textbox(placeholder="Ask or send a JSON command...", scale=4, container=False)
                        send_btn = gr.Button("Send", scale=1)

            # Shared output list: status, 4 result panes, 4 tab toggles — the
            # order must match the tuples returned by the UIManager callbacks.
            analysis_outputs = [
                status_output, original_df_output, processed_df_output, summary_output, viz_output,
                tab_original, tab_processed, tab_summary, tab_viz
            ]

            # NOTE(review): ecg_viz_types appears twice in this outputs list,
            # so the later of the two matching updates returned by
            # on_file_upload overwrites the earlier one — presumably a
            # separate stats viz-type component was intended. Confirm.
            file_input.change(
                fn=ui_manager.on_file_upload, inputs=[file_input, data_type],
                outputs=[status_output, original_df_output, task_selector,
                        ndd_param_group, mislabel_param_group, ecg_viz_param_group, ecg_stats_param_group,
                        ndd_label_dropdown, mislabel_label_dropdown,
                        ecg_viz_leads, ecg_viz_types, ecg_stats_leads, ecg_viz_types,
                        tab_original, tab_processed, tab_summary, tab_viz]
            )
            # NOTE(review): same duplicated ecg_viz_types output here.
            data_type.change(
                fn=ui_manager.on_data_type_change, inputs=[data_type, file_input],
                outputs=[task_selector, ndd_param_group, mislabel_param_group, ecg_viz_param_group, ecg_stats_param_group,
                        ecg_viz_leads, ecg_viz_types, ecg_stats_leads, ecg_viz_types, status_output]
            )
            task_selector.change(
                fn=ui_manager.on_tasks_change, inputs=[task_selector],
                outputs=[ndd_param_group, mislabel_param_group, ecg_viz_param_group, ecg_stats_param_group]
            )
            process_btn.click(
                fn=ui_manager.process_analysis,
                inputs=[file_input, data_type, task_selector, ndd_label_dropdown, mislabel_label_dropdown,
                        ecg_viz_leads, ecg_viz_types, ecg_stats_leads],
                outputs=analysis_outputs
            )

            # Both the Send button and Enter in the textbox submit the chat.
            chat_submit_args = {"fn": ui_manager.chatbot_respond, "inputs": [msg_input, chatbot], "outputs": [chatbot, msg_input] + analysis_outputs}
            send_btn.click(**chat_submit_args)
            msg_input.submit(**chat_submit_args)

    return demo
536
+
537
+ # ============================================================================
538
+ # LAUNCH
539
+ # ============================================================================
540
+
541
+ if __name__ == "__main__":
542
+ demo = create_interface()
543
+ demo.launch(share=False, server_name="0.0.0.0", server_port=7890)
app_test.py ADDED
@@ -0,0 +1,867 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ADDING NEW ANALYSIS TASKS:
3
+ ==========================
4
+
5
+ 1. Add task configuration in TaskRegistry.get_config():
6
+ "Your Data Type": {
7
+ "Your Task Name": TaskConfig(
8
+ name="Your Task Name",
9
+ data_type="Your Data Type",
10
+ requires_params=True, # Set to True if needs parameters
11
+ param_components=[...], # Define parameter types
12
+ output_tabs=["original", "summary", ...] # Which tabs to show
13
+ )
14
+ }
15
+
16
+ 2. Add execution method in AnalysisExecutor:
17
+ @staticmethod
18
+ def execute_your_task(df, param1, param2):
19
+ # Your analysis logic
20
+ return "✓ Done", {
21
+ "original": df,
22
+ "summary": summary_data,
23
+ ...
24
+ }
25
+
26
+ 3. In create_interface(), add parameter group in the LEFT panel:
27
+ with gr.Group(visible=False) as your_task_param_group:
28
+ gr.Markdown("**Your Task Parameters**")
29
+ your_param1 = gr.Slider(minimum=0, maximum=1, label="Threshold")
30
+ your_param2 = gr.Dropdown(choices=["A", "B"], label="Option")
31
+
32
+ 4. In UIManager.on_tasks_change(), add show/hide handling for your task's parameter group.
33
+ """
34
+
35
+ import pandas as pd
36
+ import matplotlib.pyplot as plt
37
+ import numpy as np
38
+ from typing import Optional, Tuple, List, Dict, Any
39
+ import io
40
+ import base64
41
+ from tqdm.auto import tqdm
42
+ from dataclasses import dataclass
43
+ import gradio as gr
44
+ # Direct imports (your existing modules)
45
+
46
+
47
+ from pipeline.deduplication import find_near_duplicates
48
+ from pipeline.featurizer import custom_featurizer
49
+ from pipeline.issues import find_issues
50
+ from pipeline.pipeline import make_step, run_pipeline
51
+
52
+
53
+
54
+ from pipeline import make_step, run_pipeline
55
+
56
+ # ============================================================================
57
+ # ANALYSIS TASK CONFIGURATION
58
+ # ============================================================================
59
+
60
@dataclass
class TaskConfig:
    """Configuration for each analysis task.

    Instances are produced by TaskRegistry.get_config() and consumed by
    UIManager.process_analysis(), which uses ``output_tabs`` to decide which
    result tabs to reveal after a run.
    """
    name: str  # Display/registry name of the task (also used as the dict key)
    data_type: str  # Data type the task applies to ("EHR Data" or "ECG Data")
    requires_params: bool  # Does it need additional parameters?
    param_components: List[Dict[str, Any]]  # List of parameter UI components (type/label/elem_id descriptors)
    output_tabs: List[str]  # Which tabs to show: "original", "processed", "summary", "visualization"
68
+
69
class TaskRegistry:
    """Registry mapping tasks to their configurations.

    The data-type -> task-name -> TaskConfig table is built lazily on first
    use and cached in ``_config_table`` (the original rebuilt the whole dict
    on every ``get_config`` call).
    """

    # Cached configuration table; populated by _configs() on first access.
    _config_table: Optional[Dict[str, Dict[str, "TaskConfig"]]] = None

    @classmethod
    def _configs(cls) -> Dict[str, Dict[str, "TaskConfig"]]:
        """Build (once) and return the full task-configuration table."""
        if cls._config_table is None:
            cls._config_table = {
                "EHR Data": {
                    "Near-Duplicate Detection": TaskConfig(
                        name="Near-Duplicate Detection",
                        data_type="EHR Data",
                        requires_params=True,
                        param_components=[
                            {"type": "dropdown", "label": "Label Column", "elem_id": "ndd_label"}
                        ],
                        output_tabs=["original", "processed", "summary"]
                    ),
                    "Find Mislabeled Data": TaskConfig(
                        name="Find Mislabeled Data",
                        data_type="EHR Data",
                        requires_params=True,
                        param_components=[
                            {"type": "dropdown", "label": "Label Column", "elem_id": "mislabel_label"}
                        ],
                        output_tabs=["original", "summary"]
                    )
                },
                "ECG Data": {
                    "ECG Visualization": TaskConfig(
                        name="ECG Visualization",
                        data_type="ECG Data",
                        requires_params=False,
                        param_components=[],
                        output_tabs=["visualization", "summary"]
                    ),
                    "Statistical Summary": TaskConfig(
                        name="Statistical Summary",
                        data_type="ECG Data",
                        requires_params=False,
                        param_components=[],
                        output_tabs=["summary"]
                    )
                }
            }
        return cls._config_table

    @staticmethod
    def get_config(data_type: str, task_name: str) -> Optional["TaskConfig"]:
        """Get configuration for a specific task.

        Returns None for unknown data types or task names; callers such as
        UIManager.process_analysis() rely on that (the original annotation
        claimed a non-optional TaskConfig, which was incorrect).
        """
        return TaskRegistry._configs().get(data_type, {}).get(task_name)

    @staticmethod
    def get_tasks_for_data_type(data_type: str) -> List[str]:
        """Get available task names for a data type ([] for unknown types)."""
        tasks = {
            "EHR Data": ["Near-Duplicate Detection", "Find Mislabeled Data"],
            "ECG Data": ["ECG Visualization", "Statistical Summary"]
        }
        return tasks.get(data_type, [])
123
+
124
+
125
+ # ============================================================================
126
+ # ANALYSIS EXECUTION
127
+ # ============================================================================
128
+
129
class AnalysisExecutor:
    """Executes analysis tasks and returns results.

    Every ``execute_*`` method returns a ``(status_message, results)`` pair,
    where ``results`` maps output-tab keys ("original", "processed",
    "summary", "visualization") to the data shown in that tab.  Failures are
    reported through the status string; these methods never raise.
    """

    @staticmethod
    def execute_near_duplicate_detection(df: pd.DataFrame, label: str) -> Tuple[str, Dict[str, Any]]:
        """Run the dedup -> featurize -> label-issue pipeline on an EHR dataframe.

        Args:
            df: Input dataframe.
            label: Name of the label column; required by the pipeline steps.

        Returns:
            Status string plus a dict with "original", "processed" and
            "summary" entries ("processed"/"summary" are None on failure).
        """
        try:
            if not label:
                return "⚠ Label column required", {
                    "original": df,
                    "processed": None,
                    "summary": None
                }

            bar = tqdm(total=100, leave=False, desc="Pipeline Progress")
            try:
                steps = [
                    make_step(find_near_duplicates, name="dedup")(progress=bar),
                    make_step(custom_featurizer, name="featurize")(
                        label=label, nan_strategy="impute", on_pipeline_error="drop", progress=bar
                    ),
                    make_step(find_issues, name="find_label_issues")(label=label, progress=bar),
                ]
                results_df, summary_list = run_pipeline(steps, df=df)
            finally:
                # BUGFIX: close the bar even when the pipeline raises, so a
                # failed run does not leak a live tqdm instance (the original
                # only closed it on the success path).
                bar.close()

            return "✓ Near-duplicate detection completed", {
                "original": df,
                "processed": results_df,
                "summary": summary_list
            }
        except Exception as e:
            return f"✗ Error: {str(e)}", {
                "original": df,
                "processed": None,
                "summary": None
            }

    @staticmethod
    def execute_find_mislabeled(df: pd.DataFrame, label: str) -> Tuple[str, Dict[str, Any]]:
        """Execute mislabeled data detection.

        Currently a placeholder: it validates the label argument and returns a
        fixed summary without inspecting the data.
        """
        try:
            if not label:
                return "⚠ Label column required", {
                    "original": df,
                    "summary": None
                }

            # Placeholder for actual mislabeled detection logic
            summary = {
                "task": "Find Mislabeled Data",
                "label_column": label,
                "total_samples": len(df),
                "suspicious_samples": 0,
                "message": "Mislabeled detection analysis completed"
            }

            return "✓ Mislabeled data analysis completed", {
                "original": df,
                "summary": summary
            }
        except Exception as e:
            return f"✗ Error: {str(e)}", {
                "original": df,
                "summary": None
            }

    @staticmethod
    def execute_ecg_visualization(df: pd.DataFrame) -> Tuple[str, Dict[str, Any]]:
        """Render the dataframe as an inline base64 <img> for a gr.HTML tab.

        The first column is treated as the x-axis and every remaining column
        as one signal channel; a single-column frame is plotted against its
        index.
        """
        try:
            fig, ax = plt.subplots(figsize=(12, 6))
            try:
                if len(df.columns) > 1:
                    # Column 0 is the time axis; each further column is a channel.
                    for col in df.columns[1:]:
                        ax.plot(df.iloc[:, 0], df[col], label=str(col), linewidth=0.8)
                    ax.set_xlabel('Time (ms)')
                    ax.set_ylabel('Amplitude (mV)')
                    ax.set_title('ECG Signal Visualization')
                    ax.legend()
                else:
                    ax.plot(df.iloc[:, 0], linewidth=0.8)
                    ax.set_xlabel('Sample Index')
                    ax.set_ylabel('Amplitude')
                    ax.set_title('ECG Signal')
                ax.grid(True, alpha=0.3)
                plt.tight_layout()

                # Convert plot to base64
                buf = io.BytesIO()
                fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
                buf.seek(0)
                img_base64 = base64.b64encode(buf.read()).decode('utf-8')
            finally:
                # BUGFIX: always release the figure; the original leaked it
                # whenever plotting or saving raised.
                plt.close(fig)

            viz_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{img_base64}" style="max-width:100%;"/></div>'

            summary = {
                "task": "ECG Visualization",
                "samples": len(df),
                "channels": len(df.columns) - 1 if len(df.columns) > 1 else 1
            }

            return "✓ ECG visualization created", {
                "visualization": viz_html,
                "summary": summary
            }
        except Exception as e:
            return f"✗ Error: {str(e)}", {
                "visualization": None,
                "summary": None
            }

    @staticmethod
    def execute_statistical_summary(df: pd.DataFrame) -> Tuple[str, Dict[str, Any]]:
        """Build pandas describe() output as an HTML table plus a summary dict."""
        try:
            stats = df.describe().to_html(classes='preview-table')
            summary_html = f"<h3>Statistical Summary</h3><div style='overflow-x:auto;'>{stats}</div>"

            summary = {
                "task": "Statistical Summary",
                "rows": len(df),
                "columns": len(df.columns),
                "numeric_columns": len(df.select_dtypes(include=[np.number]).columns),
                "html": summary_html
            }

            return "✓ Statistical summary generated", {
                "summary": summary,
                "visualization": summary_html
            }
        except Exception as e:
            return f"✗ Error: {str(e)}", {
                "summary": None,
                "visualization": None
            }
264
+
265
+
266
+ # ============================================================================
267
+ # UI MANAGER - Handles all UI state and updates
268
+ # ============================================================================
269
+
270
class UIManager:
    """Manages UI state and dynamic updates.

    Every ``on_*``/``process_*`` method here is a Gradio event callback and
    returns a positional tuple that must match, element for element, the
    ``outputs`` list wired to it in ``create_interface`` — keep both sides in
    sync when editing.
    """

    def __init__(self):
        self.current_df = None  # last successfully loaded DataFrame (None until upload)
        self.current_data_type = "EHR Data"  # mirrors the data-type dropdown default
        self.chatbot_context = {}  # {"file", "type", "df"} once a file has been loaded

    def load_csv(self, file) -> Tuple[str, Optional[pd.DataFrame]]:
        """Load a CSV upload; returns (status message, DataFrame or None). Never raises."""
        if file is None:
            return "⚠ No file uploaded", None
        try:
            # `file` is a Gradio file wrapper; `.name` is the temp-file path.
            # NOTE(review): on Gradio versions that pass a plain path string,
            # `.name` would fail — confirm against the installed version.
            df = pd.read_csv(file.name)
            self.current_df = df
            return f"✓ Loaded {len(df)} rows, {len(df.columns)} columns", df
        except Exception as e:
            return f"✗ Error: {str(e)}", None

    def on_file_upload(self, file, data_type: str):
        """Handle file upload - returns updates for all components.

        Tuple order: status, original-data table, task checkbox group, the two
        parameter groups, the two label dropdowns, then the four result tabs.
        """
        status, df = self.load_csv(file)

        if df is None:
            # Failed load: clear the preview, empty the task list, hide params and tabs.
            return (
                status,  # status
                gr.update(value=None),  # original_df
                gr.update(choices=[], value=[], interactive=True),  # task_checkboxes
                gr.update(visible=False),  # ndd_param_group
                gr.update(visible=False),  # mislabel_param_group
                gr.update(choices=[]),  # ndd_label_dropdown
                gr.update(choices=[]),  # mislabel_label_dropdown
                gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),  # tabs
            )

        self.chatbot_context = {"file": file.name, "type": data_type, "df": df}
        available_tasks = TaskRegistry.get_tasks_for_data_type(data_type)
        col_choices = list(df.columns)

        # Successful load: show a 200-row preview, offer this data type's tasks,
        # and seed both label dropdowns with the dataframe's columns.
        return (
            status,
            gr.update(value=df.head(200)),
            gr.update(choices=available_tasks, value=[], interactive=True),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(choices=col_choices, value=None),
            gr.update(choices=col_choices, value=None),
            gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
        )

    def on_data_type_change(self, data_type: str, file):
        """Handle data type change: reset task selection, hide parameter groups."""
        self.current_data_type = data_type
        available_tasks = TaskRegistry.get_tasks_for_data_type(data_type)

        if file and self.current_df is not None:
            self.chatbot_context["type"] = data_type

        return (
            gr.update(choices=available_tasks, value=[]),  # Reset task selection
            gr.update(visible=False),  # Hide ndd params
            gr.update(visible=False),  # Hide mislabel params
            f"Data type changed to: {data_type}"
        )

    def on_tasks_change(self, selected_tasks: List[str], data_type: str, df_columns: List[str]):
        """Handle task selection change - show/hide parameter groups for all selected tasks.

        NOTE(review): `data_type` and `df_columns` are accepted but never read;
        the wiring in create_interface passes the ndd dropdown's *value* as
        `df_columns` — confirm intent before relying on either parameter.
        """
        if not selected_tasks:
            # Hide all parameter groups
            return (
                gr.update(visible=False),  # ndd_param_group
                gr.update(visible=False),  # mislabel_param_group
            )

        # Check which tasks need which parameters
        show_ndd_params = "Near-Duplicate Detection" in selected_tasks
        show_mislabel_params = "Find Mislabeled Data" in selected_tasks

        return (
            gr.update(visible=show_ndd_params),
            gr.update(visible=show_mislabel_params),
        )

    def process_analysis(self, file, data_type: str, selected_tasks: List[str], ndd_label: str, mislabel_label: str):
        """Process selected analysis tasks - handles multiple tasks.

        Re-reads the CSV from disk, runs every selected task in turn, merges
        their results, and returns the 9-tuple the Process button expects:
        status, original table, processed table, summary, visualization, then
        the four tab-visibility updates (union of all tasks' output_tabs).
        """
        status, df = self.load_csv(file)

        if df is None:
            return (
                status,
                None, None, None, None,
                gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
            )

        if not selected_tasks:
            return (
                "⚠ No tasks selected",
                df.head(200), None, None, None,
                gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
            )

        # Track which tabs to show (union of all selected tasks)
        all_tabs = set()
        all_results = {
            "original": df.head(200),
            "processed": None,
            "summary": [],       # list of {"task": name, "data": summary} entries
            "visualization": ""  # HTML fragments are concatenated across tasks
        }

        status_messages = []
        executor = AnalysisExecutor()

        # Process each selected task
        for task_name in selected_tasks:
            config = TaskRegistry.get_config(data_type, task_name)

            if not config:
                status_messages.append(f"✗ Unknown task: {task_name}")
                continue

            # Add this task's tabs to the set
            all_tabs.update(config.output_tabs)

            # Execute the task with appropriate parameters
            if task_name == "Near-Duplicate Detection":
                status_msg, results = executor.execute_near_duplicate_detection(df, ndd_label)
                status_messages.append(f"{task_name}: {status_msg}")
                if results.get("processed") is not None:
                    all_results["processed"] = results["processed"]
                if results.get("summary") is not None:
                    all_results["summary"].append({"task": task_name, "data": results["summary"]})

            elif task_name == "Find Mislabeled Data":
                status_msg, results = executor.execute_find_mislabeled(df, mislabel_label)
                status_messages.append(f"{task_name}: {status_msg}")
                if results.get("summary") is not None:
                    all_results["summary"].append({"task": task_name, "data": results["summary"]})

            elif task_name == "ECG Visualization":
                status_msg, results = executor.execute_ecg_visualization(df)
                status_messages.append(f"{task_name}: {status_msg}")
                if results.get("visualization"):
                    all_results["visualization"] += results["visualization"]
                if results.get("summary") is not None:
                    all_results["summary"].append({"task": task_name, "data": results["summary"]})

            elif task_name == "Statistical Summary":
                status_msg, results = executor.execute_statistical_summary(df)
                status_messages.append(f"{task_name}: {status_msg}")
                if results.get("visualization"):
                    all_results["visualization"] += results["visualization"]
                if results.get("summary") is not None:
                    all_results["summary"].append({"task": task_name, "data": results["summary"]})

        # Format final outputs based on all_tabs
        show_original = gr.update(visible="original" in all_tabs)
        show_processed = gr.update(visible="processed" in all_tabs)
        show_summary = gr.update(visible="summary" in all_tabs)
        show_viz = gr.update(visible="visualization" in all_tabs)

        # Combine status messages
        final_status = "\n".join(status_messages)

        return (
            final_status,
            all_results["original"],
            all_results["processed"],
            all_results["summary"] if all_results["summary"] else None,
            all_results["visualization"] if all_results["visualization"] else None,
            show_original,
            show_processed,
            show_summary,
            show_viz
        )

    def _format_results(self, status: str, output_tabs: List[str], results: Dict[str, Any]):
        """Format results based on output tabs configuration.

        NOTE(review): not called from process_analysis in this file —
        presumably retained for single-task formatting (the trailing how-to
        docstring references it); confirm before deleting.
        """
        # Default all outputs to None
        original_out = None
        processed_out = None
        summary_out = None
        viz_out = None

        # Default all tabs hidden
        show_original = gr.update(visible=False)
        show_processed = gr.update(visible=False)
        show_summary = gr.update(visible=False)
        show_viz = gr.update(visible=False)

        # Populate based on output_tabs
        if "original" in output_tabs:
            original_out = results.get("original")
            show_original = gr.update(visible=True)

        if "processed" in output_tabs:
            processed_out = results.get("processed")
            show_processed = gr.update(visible=True)

        if "summary" in output_tabs:
            summary_data = results.get("summary")
            if results.get("summary_html"):  # For statistical summary
                summary_out = results.get("summary_html")
            else:
                summary_out = summary_data
            show_summary = gr.update(visible=True)

        if "visualization" in output_tabs:
            viz_out = results.get("visualization")
            show_viz = gr.update(visible=True)

        return (
            status,
            original_out,
            processed_out,
            summary_out,
            viz_out,
            show_original,
            show_processed,
            show_summary,
            show_viz
        )

    def chatbot_respond(self, message: str, history: List):
        """Chatbot response.

        Keyword-matches on the message ("column"/"row"/"help") against the
        stored chatbot context; appends a (user, assistant) tuple in the
        legacy gr.Chatbot history format and clears the input box.
        """
        if history is None:
            history = []
        if not message or message.strip() == "":
            return history, ""

        df = self.chatbot_context.get("df")

        if "column" in message.lower():
            response = (f"Dataset has {len(df.columns)} columns: {', '.join(map(str, df.columns))}"
                        if df is not None else "Please upload a file first.")
        elif "row" in message.lower():
            response = f"Dataset has {len(df)} rows." if df is not None else "Please upload a file first."
        elif "help" in message.lower():
            response = "Ask about 'columns', 'rows', or your data analysis."
        else:
            response = f"You asked: '{message}'. Analyzing: {self.chatbot_context.get('type', 'No data')}."

        history.append((message, response))
        return history, ""
514
+
515
+
516
+ # ============================================================================
517
+ # GRADIO INTERFACE
518
+ # ============================================================================
519
+
520
def create_interface():
    """Build the Gradio interface.

    Layout: a header row (file upload + data type), then a three-column main
    row — task selection/parameters (left), result tabs (middle), chat
    assistant (right).  All callbacks are methods on a single UIManager
    instance; output tuples and ``outputs`` lists must stay aligned.
    """

    ui_manager = UIManager()

    # Flexbox CSS keeping the whole app inside one viewport (no page scroll);
    # only the task list, tab contents and chat history scroll internally.
    custom_css = """
    * { box-sizing: border-box; }
    html, body { margin: 0; padding: 0; height: 100vh; overflow: hidden; }
    .gradio-container { height: 100vh !important; max-width: 100% !important; padding: 0 !important; }

    #app-container {
        height: 100vh;
        display: flex;
        flex-direction: column;
        padding: 0.75rem;
        gap: 0.75rem;
    }

    #main-row {
        flex: 1;
        min-height: 0;
        display: flex;
        gap: 0.75rem;
    }

    #left-panel {
        display: flex;
        flex-direction: column;
        height: 100%;
        background: #f9fafb;
        border-radius: 10px;
        padding: 0.75rem;
        gap: 0.5rem;
    }

    #task-section {
        flex: 1;
        min-height: 0;
        overflow-y: auto;
        display: flex;
        flex-direction: column;
        gap: 0.5rem;
    }

    #action-section {
        flex-shrink: 0;
    }

    #middle-panel {
        display: flex;
        flex-direction: column;
        height: 100%;
        min-width: 0;
    }

    #tabs-container {
        flex: 1;
        min-height: 0;
        display: flex;
        flex-direction: column;
    }

    #tabs-container .tabs {
        height: 100%;
        display: flex;
        flex-direction: column;
    }

    #tabs-container .tab-nav {
        flex-shrink: 0;
    }

    #tabs-container .tabitem {
        flex: 1;
        min-height: 0;
        overflow: auto;
    }

    #chat-panel {
        display: flex;
        flex-direction: column;
        height: 100%;
        background: #f9fafb;
        border-radius: 10px;
        padding: 0.75rem;
    }

    #chat-header {
        flex-shrink: 0;
        margin-bottom: 0.5rem;
    }

    #chat-history {
        flex: 1;
        min-height: 0;
        overflow-y: auto;
        margin-bottom: 0.5rem;
    }

    #chat-input-row {
        flex-shrink: 0;
        display: flex;
        gap: 0.5rem;
        align-items: flex-end;
    }

    #chat-input-row .textbox {
        flex: 1;
    }

    #chat-input-row button {
        flex-shrink: 0;
    }

    .preview-table {
        border-collapse: collapse;
        width: 100%;
        font-size: 0.875rem;
    }
    .preview-table th {
        background-color: #3498db;
        color: white;
        padding: 8px;
        text-align: left;
        position: sticky;
        top: 0;
    }
    .preview-table td {
        padding: 6px;
        border-bottom: 1px solid #ddd;
    }

    .compact-header { font-size: 0.95rem; margin: 0; }
    """

    with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="Medical Data Analysis Platform") as demo:

        with gr.Column(elem_id="app-container"):
            # Header
            gr.Markdown("# 🏥 Medical Data Analysis Platform", elem_classes=["compact-header"])

            # Top controls
            with gr.Row():
                file_input = gr.File(label="Upload CSV", file_types=[".csv"], scale=2)
                data_type = gr.Dropdown(
                    choices=["EHR Data", "ECG Data"],
                    value="EHR Data",
                    label="Data Type",
                    scale=1
                )

            # Main row with 3 columns
            with gr.Row(elem_id="main-row"):

                # LEFT: Analysis tasks
                with gr.Column(scale=2, elem_id="left-panel"):
                    with gr.Group(elem_id="task-section"):
                        gr.Markdown("#### Analysis Tasks")
                        # Starts with EHR tasks because the data_type dropdown
                        # defaults to "EHR Data"; refreshed on type change.
                        task_selector = gr.CheckboxGroup(
                            choices=TaskRegistry.get_tasks_for_data_type("EHR Data"),
                            label=None,
                            elem_id="task-checkboxes"
                        )

                        # Parameter groups (conditionally visible) - one for each task that needs params
                        with gr.Group(visible=False) as ndd_param_group:
                            gr.Markdown("**Near-Duplicate Detection Parameters**")
                            ndd_label_dropdown = gr.Dropdown(
                                choices=[],
                                label="Label Column",
                                elem_id="ndd-label-dropdown"
                            )

                        with gr.Group(visible=False) as mislabel_param_group:
                            gr.Markdown("**Find Mislabeled Data Parameters**")
                            mislabel_label_dropdown = gr.Dropdown(
                                choices=[],
                                label="Label Column",
                                elem_id="mislabel-label-dropdown"
                            )

                    with gr.Group(elem_id="action-section"):
                        process_btn = gr.Button("▶ Process", variant="primary", size="lg")
                        status_output = gr.Textbox(label="Status", interactive=False, lines=2)

                # MIDDLE: Results display
                with gr.Column(scale=7, elem_id="middle-panel"):
                    # All tabs start hidden; process_analysis reveals the
                    # union of the selected tasks' output_tabs.
                    with gr.Tabs(elem_id="tabs-container") as result_tabs:
                        with gr.TabItem("Original Data", visible=False) as tab_original:
                            original_df_output = gr.Dataframe(interactive=False)

                        with gr.TabItem("Processed Data", visible=False) as tab_processed:
                            processed_df_output = gr.Dataframe(interactive=False)

                        with gr.TabItem("Summary", visible=False) as tab_summary:
                            summary_output = gr.JSON()

                        with gr.TabItem("Visualization", visible=False) as tab_viz:
                            viz_output = gr.HTML()

                # RIGHT: Chat
                with gr.Column(scale=3, elem_id="chat-panel"):
                    gr.Markdown("### 💬 AI Assistant", elem_id="chat-header")
                    chatbot = gr.Chatbot(elem_id="chat-history", height=None, label=None)
                    with gr.Row(elem_id="chat-input-row"):
                        msg_input = gr.Textbox(
                            placeholder="Ask about your data...",
                            label="",
                            scale=4,
                            container=False,
                            lines=1
                        )
                        send_btn = gr.Button("Send", scale=1, size="sm")

        # ============ EVENT HANDLERS ============
        # Each outputs list must match the corresponding callback's return
        # tuple positionally — see the UIManager method docstrings.

        # File upload
        file_input.change(
            fn=ui_manager.on_file_upload,
            inputs=[file_input, data_type],
            outputs=[
                status_output,
                original_df_output,
                task_selector,
                ndd_param_group,
                mislabel_param_group,
                ndd_label_dropdown,
                mislabel_label_dropdown,
                tab_original, tab_processed, tab_summary, tab_viz,
            ]
        )

        # Data type change
        data_type.change(
            fn=ui_manager.on_data_type_change,
            inputs=[data_type, file_input],
            outputs=[task_selector, ndd_param_group, mislabel_param_group, status_output]
        )

        # Task selection change
        # NOTE(review): the third input feeds on_tasks_change's `df_columns`
        # parameter with the ndd dropdown's selected value (a string, not the
        # column list); the parameter is currently unused — confirm intent.
        task_selector.change(
            fn=ui_manager.on_tasks_change,
            inputs=[task_selector, data_type, ndd_label_dropdown],
            outputs=[ndd_param_group, mislabel_param_group]
        )

        # Process button
        process_btn.click(
            fn=ui_manager.process_analysis,
            inputs=[file_input, data_type, task_selector, ndd_label_dropdown, mislabel_label_dropdown],
            outputs=[
                status_output,
                original_df_output,
                processed_df_output,
                summary_output,
                viz_output,
                tab_original,
                tab_processed,
                tab_summary,
                tab_viz
            ]
        )

        # Chat: button click and Enter key share the same handler.
        send_btn.click(
            fn=ui_manager.chatbot_respond,
            inputs=[msg_input, chatbot],
            outputs=[chatbot, msg_input]
        )

        msg_input.submit(
            fn=ui_manager.chatbot_respond,
            inputs=[msg_input, chatbot],
            outputs=[chatbot, msg_input]
        )

    return demo
797
+
798
+
799
+ # ============================================================================
800
+ # LAUNCH
801
+ # ============================================================================
802
+
803
if __name__ == "__main__":
    # Build the Gradio app and serve it on all interfaces, port 7860.
    application = create_interface()
    application.launch(share=False, server_name="0.0.0.0", server_port=7860)
806
+
807
+
808
+ # ============================================================================
809
+ # HOW TO EXTEND
810
+ # ============================================================================
811
+
812
+ """
813
+ ADDING NEW ANALYSIS TASKS:
814
+ ==========================
815
+
816
+ 1. Add task configuration in TaskRegistry:
817
+ - Define name, data_type, requires_params, param_components, output_tabs
818
+
819
+ 2. Add execution method in AnalysisExecutor:
820
+ - Create execute_your_task() method
821
+ - Return (status_message, results_dict)
822
+
823
+ 3. Add condition in UIManager.process_analysis():
824
+ - elif task_name == "Your Task Name":
825
+ - status_msg, results = executor.execute_your_task(df, params)
826
+ - return self._format_results(status_msg, config.output_tabs, results)
827
+
828
+
829
+ EXAMPLE - Adding a new ECG task with parameters:
830
+ ================================================
831
+
832
+ In TaskRegistry.get_config():
833
+ "ECG Data": {
834
+ ...
835
+ "ECG Quality Check": TaskConfig(
836
+ name="ECG Quality Check",
837
+ data_type="ECG Data",
838
+ requires_params=True,
839
+ param_components=[
840
+ {"type": "slider", "label": "Quality Threshold", "elem_id": "quality_thresh"}
841
+ ],
842
+ output_tabs=["original", "summary", "visualization"]
843
+ )
844
+ }
845
+
846
+ In AnalysisExecutor:
847
+ @staticmethod
848
+ def execute_ecg_quality_check(df: pd.DataFrame, threshold: float) -> Tuple[str, Dict[str, Any]]:
849
+ # Your analysis logic here
850
+ return "✓ Quality check completed", {
851
+ "original": df,
852
+ "summary": quality_summary,
853
+ "visualization": viz_html
854
+ }
855
+
856
+ In UIManager.process_analysis():
857
+ elif task_name == "ECG Quality Check":
858
+ status_msg, results = executor.execute_ecg_quality_check(df, float(param_value))
859
+ return self._format_results(status_msg, config.output_tabs, results)
860
+
861
+
862
+ The architecture is now:
863
+ - Simple and clean
864
+ - Easy to extend with new tasks
865
+ - Dynamic UI based on task configuration
866
+ - Fixed chat scrolling issue
867
+ """
backend_client.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # backend_client.py
2
+ from __future__ import annotations
3
+ import requests
4
+ from typing import Optional, List, Tuple
5
+
6
class LangGraphClient:
    """
    Minimal client for a LangGraph app served at http://localhost:8010.
    Edit ``self.endpoints`` if your server exposes different routes or
    payload shapes.
    """

    def __init__(self, base_url: str = "http://localhost:8010"):
        # Drop any trailing slash so endpoint joins stay clean.
        self.base_url = base_url.rstrip("/")
        # Route table — the chat route expects JSON {session_id, message, history?}.
        self.endpoints = {
            "chat": f"{self.base_url}/chat"
        }

    def send_message(
        self,
        message: str,
        session_id: Optional[str] = "default",
        history: Optional[List[Tuple[str, str]]] = None,
        timeout: int = 30
    ) -> str:
        """
        Send a message to the backend and return the assistant's reply text.
        Network/HTTP failures are reported as a "[Backend error] ..." string
        rather than raised.
        """
        body = {
            "session_id": session_id,
            "message": message,
            "history": history or []  # [[user, assistant], ...] if your server uses it
        }
        try:
            response = requests.post(self.endpoints["chat"], json=body, timeout=timeout)
            response.raise_for_status()
            parsed = response.json()
            # Accept a few common response shapes; fall back to empty string.
            answer = parsed.get("reply") or parsed.get("message") or parsed.get("text") or ""
            return str(answer)
        except requests.RequestException as exc:
            return f"[Backend error] {exc}"
+ return f"[Backend error] {e}"
data/Aubrie.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/JS00001_filtered.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/Lisette.csv ADDED
The diff for this file is too large to render. See raw diff
 
ecg_analyzer.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ECG Analysis Module
3
+ Provides modular ECG visualization and analysis functions
4
+ """
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import io
9
+ import base64
10
+ from typing import List, Optional, Tuple, Dict, Any
11
+
12
class ECGAnalyzer:
    """Handles ECG data analysis and visualization.

    Stateless helper namespace: every public method is a ``@staticmethod`` that
    takes a pandas DataFrame of ECG samples (one column per lead, optionally a
    time column) and returns either a base64-encoded PNG plot (via matplotlib)
    or a plain statistics dict.
    """

    # Standard 12-lead ECG leads (column names expected in the input DataFrame)
    STANDARD_LEADS = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']

    @staticmethod
    def detect_leads(df: pd.DataFrame) -> List[str]:
        """Return the standard 12-lead names present in `df`, in standard order."""
        return [lead for lead in ECGAnalyzer.STANDARD_LEADS if lead in df.columns]

    @staticmethod
    def detect_time_column(df: pd.DataFrame) -> Optional[str]:
        """Return the first recognized time-axis column name, or None to use the index."""
        time_candidates = ['time', 'Time', 'TIME', 'timestamp', 'sample']
        for col in time_candidates:
            if col in df.columns:
                return col
        # If no explicit time column, callers fall back to the row index.
        return None

    @staticmethod
    def _x_axis(df: pd.DataFrame, time_col: Optional[str]) -> Tuple[Any, str]:
        """Pick the x-axis data and label: the time column when available, else the index.

        Shared by the waveform and rolling-average plots so both stay consistent.
        """
        if time_col and time_col in df.columns:
            # Label assumes the time column holds milliseconds — TODO confirm
            # against the data source.
            return df[time_col], ('Time (ms)' if 'time' in time_col.lower() else time_col)
        return df.index, 'Sample Index'

    @staticmethod
    def _wrap_image(img_base64: str) -> str:
        """Wrap a base64 PNG in the standard <div><img> HTML block used by reports."""
        return f'<div style="margin-bottom: 30px;"><img src="data:image/png;base64,{img_base64}" style="max-width:100%;"/></div>'

    @staticmethod
    def create_signal_plot(df: pd.DataFrame, leads: List[str], time_col: Optional[str] = None) -> str:
        """Create the ECG signal waveform plot (all requested leads overlaid).

        Returns the figure as a base64-encoded PNG string.
        """
        fig, ax = plt.subplots(figsize=(14, 6))
        x_data, x_label = ECGAnalyzer._x_axis(df, time_col)

        # One distinct tab10 color per lead.
        colors = plt.cm.tab10(np.linspace(0, 1, len(leads)))

        for idx, lead in enumerate(leads):
            if lead in df.columns:
                ax.plot(x_data, df[lead], label=f'Lead {lead}',
                        linewidth=1.2, alpha=0.8, color=colors[idx])

        ax.set_xlabel(x_label, fontsize=11)
        ax.set_ylabel('Amplitude (mV)', fontsize=11)
        ax.set_title('ECG Signal Waveform', fontsize=13, fontweight='bold')
        ax.legend(loc='upper right', fontsize=9, ncol=min(4, len(leads)))
        ax.grid(True, alpha=0.3, linestyle='--')
        plt.tight_layout()

        return ECGAnalyzer._fig_to_base64(fig)

    @staticmethod
    def create_histogram(df: pd.DataFrame, leads: List[str]) -> str:
        """Create a histogram of signal amplitudes for the selected leads."""
        fig, ax = plt.subplots(figsize=(14, 6))

        colors = plt.cm.tab10(np.linspace(0, 1, len(leads)))

        for idx, lead in enumerate(leads):
            if lead in df.columns:
                # NaNs are dropped so they don't distort the bins.
                ax.hist(df[lead].dropna(), bins=50, alpha=0.6,
                        label=f'Lead {lead}', color=colors[idx], edgecolor='black')

        ax.set_xlabel('Amplitude (mV)', fontsize=11)
        ax.set_ylabel('Frequency', fontsize=11)
        ax.set_title('Distribution of Signal Amplitudes', fontsize=13, fontweight='bold')
        ax.legend(loc='upper right', fontsize=9)
        ax.grid(True, alpha=0.3, axis='y')
        plt.tight_layout()

        return ECGAnalyzer._fig_to_base64(fig)

    @staticmethod
    def create_scatter_plot(df: pd.DataFrame, lead_x: str = 'I', lead_y: str = 'II') -> str:
        """Create a scatter plot comparing two leads, annotated with their Pearson correlation.

        If either lead is missing from `df`, a placeholder message is rendered instead.
        """
        fig, ax = plt.subplots(figsize=(8, 8))

        if lead_x in df.columns and lead_y in df.columns:
            ax.scatter(df[lead_x], df[lead_y], alpha=0.5, s=10, c='steelblue')
            ax.set_xlabel(f'Lead {lead_x} Amplitude (mV)', fontsize=11)
            ax.set_ylabel(f'Lead {lead_y} Amplitude (mV)', fontsize=11)
            ax.set_title(f'Lead {lead_x} vs Lead {lead_y}', fontsize=13, fontweight='bold')
            ax.grid(True, alpha=0.3)

            # Add correlation coefficient as an in-axes annotation.
            correlation = df[[lead_x, lead_y]].corr().iloc[0, 1]
            ax.text(0.05, 0.95, f'Correlation: {correlation:.3f}',
                    transform=ax.transAxes, fontsize=10, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
        else:
            ax.text(0.5, 0.5, 'Selected leads not available',
                    ha='center', va='center', fontsize=12)

        plt.tight_layout()
        return ECGAnalyzer._fig_to_base64(fig)

    @staticmethod
    def create_rolling_average(df: pd.DataFrame, leads: List[str],
                               time_col: Optional[str] = None, window: int = 100) -> str:
        """Create a rolling-average (moving-average) plot for the selected leads.

        `min_periods=1` means the early samples are averaged over a shorter
        window instead of being dropped.
        """
        fig, ax = plt.subplots(figsize=(14, 6))
        x_data, x_label = ECGAnalyzer._x_axis(df, time_col)

        colors = plt.cm.tab10(np.linspace(0, 1, len(leads)))

        for idx, lead in enumerate(leads):
            if lead in df.columns:
                rolling_avg = df[lead].rolling(window=window, min_periods=1).mean()
                ax.plot(x_data, rolling_avg, label=f'Lead {lead} (MA-{window})',
                        linewidth=1.5, alpha=0.8, color=colors[idx])

        ax.set_xlabel(x_label, fontsize=11)
        ax.set_ylabel('Amplitude (mV)', fontsize=11)
        ax.set_title(f'Rolling Average (Window={window})', fontsize=13, fontweight='bold')
        ax.legend(loc='upper right', fontsize=9, ncol=min(4, len(leads)))
        ax.grid(True, alpha=0.3, linestyle='--')
        plt.tight_layout()

        return ECGAnalyzer._fig_to_base64(fig)

    @staticmethod
    def create_all_visualizations(df: pd.DataFrame, leads: List[str],
                                  viz_types: List[str]) -> str:
        """Render every requested visualization and concatenate them as one HTML string.

        Parameters
        ----------
        df : DataFrame of ECG samples.
        leads : lead columns to plot.
        viz_types : any of "Signal Waveform", "Histogram", "Scatter Plot",
            "Rolling Average"; unknown entries are ignored.
        """
        html_parts = []
        time_col = ECGAnalyzer.detect_time_column(df)

        for viz_type in viz_types:
            if viz_type == "Signal Waveform":
                html_parts.append(ECGAnalyzer._wrap_image(
                    ECGAnalyzer.create_signal_plot(df, leads, time_col)))

            elif viz_type == "Histogram":
                html_parts.append(ECGAnalyzer._wrap_image(
                    ECGAnalyzer.create_histogram(df, leads)))

            elif viz_type == "Scatter Plot":
                # Use the first two available leads; fall back to the
                # conventional I/II pair (create_scatter_plot shows a
                # placeholder if they are missing).
                lead_x = leads[0] if len(leads) > 0 else 'I'
                lead_y = leads[1] if len(leads) > 1 else 'II'
                html_parts.append(ECGAnalyzer._wrap_image(
                    ECGAnalyzer.create_scatter_plot(df, lead_x, lead_y)))

            elif viz_type == "Rolling Average":
                html_parts.append(ECGAnalyzer._wrap_image(
                    ECGAnalyzer.create_rolling_average(df, leads, time_col)))

        if not html_parts:
            return '<div style="text-align:center; padding:40px;"><p>No visualizations selected</p></div>'

        return '<div style="text-align:center;">' + ''.join(html_parts) + '</div>'

    @staticmethod
    def _fig_to_base64(fig) -> str:
        """Convert a matplotlib figure to a base64 PNG string and close the figure."""
        buf = io.BytesIO()
        fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
        buf.seek(0)
        img_base64 = base64.b64encode(buf.read()).decode('utf-8')
        # Close explicitly so repeated calls don't accumulate open figures.
        plt.close(fig)
        return img_base64

    @staticmethod
    def generate_statistics(df: pd.DataFrame, leads: List[str]) -> Dict[str, Any]:
        """Return per-lead summary statistics (mean/std/min/max/median/quartiles).

        Leads not present in `df` are silently skipped; NaNs are excluded.
        """
        stats: Dict[str, Any] = {}
        for lead in leads:
            if lead in df.columns:
                lead_data = df[lead].dropna()
                stats[lead] = {
                    'mean': float(lead_data.mean()),
                    'std': float(lead_data.std()),
                    'min': float(lead_data.min()),
                    'max': float(lead_data.max()),
                    'median': float(lead_data.median()),
                    'q25': float(lead_data.quantile(0.25)),
                    'q75': float(lead_data.quantile(0.75))
                }
        return stats
ecg_visualization.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import dash
3
+ from dash import html, dcc
4
+ from dash.dependencies import Input, Output
5
+ import pandas as pd
6
+ import plotly.graph_objects as go
7
+ from plotly.subplots import make_subplots
8
# Load the dataset
# NOTE(review): path is relative to the working directory; other modules in
# this commit read from "data/" rather than "dataset/" — confirm which exists.
file_path = 'dataset/JS00001_filtered.csv'  # Update this path as necessary
ecg_data = pd.read_csv(file_path)

# Define leads
# Standard 12-lead ECG channel names; these must match the CSV column headers.
leads = ['I', 'II', 'III', 'aVR', 'aVL', 'aVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']

# Initialize the Dash app
app = dash.Dash(__name__)
# Page layout: title, a lead-selector dropdown, and a single Graph component
# that the update_figure callback below populates.
app.layout = html.Div(children=[
    html.H1(children='ECG Data Dashboard', style={'textAlign': 'center', 'color': '#007BFF'}),
    html.Div([
        html.Label('Select ECG Lead for Analysis:'),
        dcc.Dropdown(
            id='lead-dropdown',
            # 'ALL' is a synthetic option in addition to the 12 real leads.
            options=[{'label': 'All Leads', 'value': 'ALL'}] + [{'label': lead, 'value': lead} for lead in leads],
            value='ALL'  # Default value to show all leads
        )
    ], style={'width': '30%', 'margin': '0 auto', 'padding': '20px'}),
    dcc.Graph(id='ecg-data-visualization'),
], style={'padding': '20px', 'maxWidth': '1200px', 'margin': '0 auto'})
29
+
30
# Define callback to update the figure based on the selected lead
@app.callback(
    Output('ecg-data-visualization', 'figure'),
    [Input('lead-dropdown', 'value')]
)
def update_figure(selected_lead):
    """Rebuild the 4-row analysis figure whenever the lead dropdown changes.

    Parameters
    ----------
    selected_lead : str
        Either 'ALL' or one of the 12 standard lead names from the dropdown.

    Returns
    -------
    plotly Figure with rows: waveform, histogram, I-vs-II scatter, rolling average.
    (The rolling-average row is only populated for a single selected lead.)
    """
    # Initialize the figure with subplots
    fig = make_subplots(rows=4, cols=1,
                        subplot_titles=("ECG Signal Over Time", "Histogram of Signal Amplitudes",
                                        "Scatter Plot: Lead I vs Lead II", "Rolling Average"),
                        vertical_spacing=0.1,
                        specs=[[{"type": "scatter"}], [{"type": "histogram"}], [{"type": "scatter"}], [{"type": "scatter"}]])

    # Conditionally display either all leads or just the selected lead
    if selected_lead == 'ALL':
        # Show all leads (waveform + overlaid histograms only)
        for lead in leads:
            fig.add_trace(go.Scatter(x=ecg_data['time'], y=ecg_data[lead], mode='lines', name=f'Lead {lead}'), row=1, col=1)
            fig.add_trace(go.Histogram(x=ecg_data[lead], name=f'Lead {lead}', opacity=0.75), row=2, col=1)
    else:
        # Show only the selected lead.
        # FIX: compute the rolling average into a local Series instead of the
        # original `ecg_data['RollingAvg'] = ...`, which mutated the shared
        # module-level DataFrame on every callback (state leaking across
        # requests and a stray column left in the data).
        rolling_avg = ecg_data[selected_lead].rolling(window=100).mean()
        fig.add_trace(go.Scatter(x=ecg_data['time'], y=ecg_data[selected_lead], mode='lines', name=f'Lead {selected_lead}'), row=1, col=1)
        fig.add_trace(go.Histogram(x=ecg_data[selected_lead], name=f'Lead {selected_lead}', opacity=0.75), row=2, col=1)
        fig.add_trace(go.Scatter(x=ecg_data['time'], y=rolling_avg, mode='lines', name=f'Rolling Average: Lead {selected_lead}'), row=4, col=1)

    # Common settings for all cases
    fig.update_traces(opacity=0.75, bingroup=1, row=2, col=1)
    fig.update_layout(barmode='overlay')
    fig.add_trace(go.Scatter(x=ecg_data['I'], y=ecg_data['II'], mode='markers', name='Lead I vs Lead II'), row=3, col=1)

    # Update the figure layout
    fig.update_layout(height=1600, title_text="Comprehensive ECG Data Analysis", showlegend=True)
    return fig
64
+
65
+ # Run the app
66
+ if __name__ == '__main__':
67
+ app.run_server(debug=True)
examples/main.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from deduplication import find_near_duplicates
2
+ from featurizer import custom_featurizer
3
+ from issues import find_issues
4
+ from pipeline import make_step, run_pipeline
5
+ import pandas as pd
6
+ from tqdm.auto import tqdm
7
+
8
# Single shared progress bar: each pipeline step advances the same 0-100 bar
# (the steps wrap it in a phase-aware helper internally).
bar = tqdm(total=100, leave=True)

# Pipeline definition: dedup -> featurize -> label-issue detection.
# `make_step(fn, name=...)` appears to return a factory whose call binds the
# step's keyword arguments — confirm against pipeline.make_step.
steps = [
    make_step(find_near_duplicates, name="dedup")(progress=bar),
    make_step(custom_featurizer, name="featurize")(
        label=None,  # optional; only used to drop NaN label rows
        nan_strategy="impute",
        on_pipeline_error="drop",
        progress=bar
    ),
    # "HARDSHIP_INDEX" is assumed to be a column of data/Lisette.csv — verify.
    make_step(find_issues, name="find_label_issues")(label="HARDSHIP_INDEX", progress=bar)
]

# Load the demo dataset and run all steps in order.
df = pd.read_csv("./data/Lisette.csv")
results = run_pipeline(steps, df=df)

bar.close()
print(results)
pipeline/__init__.py ADDED
File without changes
pipeline/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (165 Bytes). View file
 
pipeline/__pycache__/deduplication.cpython-311.pyc ADDED
Binary file (10.1 kB). View file
 
pipeline/__pycache__/featurizer.cpython-311.pyc ADDED
Binary file (19.2 kB). View file
 
pipeline/__pycache__/issues.cpython-311.pyc ADDED
Binary file (7.78 kB). View file
 
pipeline/__pycache__/pipeline.cpython-311.pyc ADDED
Binary file (3.64 kB). View file
 
pipeline/__pycache__/utils.cpython-311.pyc ADDED
Binary file (13.5 kB). View file
 
pipeline/__pycache__/utils_cool.cpython-311.pyc ADDED
Binary file (13.5 kB). View file
 
pipeline/deduplication.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from time import perf_counter
4
+ from typing import Any, Dict, List, Optional, Tuple
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from cleanlab import Datalab
9
+ from cleanlab.datalab.internal.issue_manager.duplicate import NearDuplicateIssueManager
10
+ from scipy.sparse import issparse
11
+ from scipy.special import comb
12
+ from sklearn.base import TransformerMixin
13
+ from sklearn.feature_extraction.text import TfidfVectorizer
14
+ from tqdm.auto import tqdm
15
+
16
+ from .utils_cool import PhaseProgress, _ensure_dense32, choose_k
17
+
18
+
19
# =========================
# Core near-duplicate finder
# =========================
def find_near_duplicates(
    df: pd.DataFrame,
    *,
    # Cleanlab params
    metric: str = "cosine",
    threshold: float = 0.13,
    k: Optional[int] = None,
    # Vectorizer / features
    vectorizer: Optional[TransformerMixin] = None,
    force_dense: bool = False,  # set True if your cleanlab version needs dense
    # Behavior
    verbose: bool = False,
    # Progress: either a tqdm object or a callable phase,p in [0,1]
    progress: Optional[Any] = None,
) -> Tuple[Optional[pd.DataFrame], Dict]:
    """
    Detect near-duplicates using Cleanlab's NearDuplicateIssueManager.

    Parameters
    ----------
    df : DataFrame
        Rows to analyze. If no vectorizer is passed, all columns are joined as strings for TF-IDF.
    metric : {"cosine", "euclidean", "manhattan"}
        Distance metric for kNN graph.
    threshold : float
        Near-duplicate radius is based on threshold × median NN distance (internal to Cleanlab).
    k : int or None
        Neighborhood size. If None, uses sqrt(N) clipped to [5, 50] and ≤ N-1.
    vectorizer : sklearn Transformer
        Any transformer with fit_transform/transform. If None, uses TF-IDF (float32).
    force_dense : bool
        If True, densify features before passing to Cleanlab.
    verbose : bool
        Print timing breakdown.
    progress : tqdm or Callable[[str, float], None]
        Phase-aware progress reporting.

    Returns
    -------
    (output_df, stats)
        output_df : DataFrame after deduplication
        stats : dict of counts/timings/params

    Notes
    -----
    NOTE(review): despite the parameter doc, a plain *callable* passed as
    ``progress`` is currently ignored — only tqdm-like objects (having
    ``set_description``/``update``) are wrapped in PhaseProgress. Confirm
    whether the callable path should be implemented.
    """
    # Progress setup (one bar per call unless user provided one)
    local_bar = None
    pp = None
    if progress is None:
        # Own the bar: create it here and close it at the end of this call.
        local_bar = tqdm(total=100, leave=True)
        pp = PhaseProgress(local_bar, weights={"vectorize": .2, "find_issues": .7, "grouping": .1})
    else:
        if hasattr(progress, "set_description") and hasattr(progress, "update"):
            # treat given tqdm as the bar
            # PhaseProgress tracks the last absolute value on the bar object itself.
            if not hasattr(progress, "_last_val"):
                progress._last_val = 0
            pp = PhaseProgress(progress, weights={"vectorize": .2, "find_issues": .7, "grouping": .1})

    timings: Dict[str, float] = {}
    t0 = perf_counter()
    N = int(len(df))

    # --- Vectorize ---
    # All `pp and pp.X(...)` guards are no-ops when no progress sink is attached.
    pp and pp.start("vectorize", extra={"N": N})
    t_vec0 = perf_counter()
    # Concatenate every column as text so the default TF-IDF sees whole rows.
    text_series = df.astype(str).agg(" ".join, axis=1)

    if vectorizer is None:
        vectorizer = TfidfVectorizer(dtype=np.float32)

    if hasattr(vectorizer, "fit_transform"):
        X = vectorizer.fit_transform(text_series.tolist())
    elif hasattr(vectorizer, "transform"):
        # Pre-fitted transformer: use as-is.
        X = vectorizer.transform(text_series.tolist())
    else:
        raise TypeError("`vectorizer` must implement fit_transform or transform.")

    if force_dense:
        X = _ensure_dense32(X)

    timings["vectorize"] = perf_counter() - t_vec0
    pp and pp.tick_abs("vectorize", 1.0)
    pp and pp.end("vectorize")

    # --- Cleanlab duplicate finder ---
    pp and pp.start("find_issues", extra={"metric": metric})
    t_cl0 = perf_counter()

    if k is None:
        k = choose_k(N)

    # Datalab only needs row identities; the real signal goes in via `features`.
    lab = Datalab(data={"__row__": list(range(N))})
    ndm = NearDuplicateIssueManager(datalab=lab, metric=metric, threshold=threshold, k=k)

    pp and pp.tick_abs("find_issues", 0.0, extra={"k": k})

    try:
        ndm.find_issues(features=X)
    except Exception:
        # Some cleanlab versions reject sparse input — retry once densified.
        if issparse(X):
            X_dense = _ensure_dense32(X)
            ndm.find_issues(features=X_dense)  # retry dense
        else:
            raise

    pp and pp.tick_abs("find_issues", 1.0)
    timings["cleanlab_find_issues"] = perf_counter() - t_cl0
    pp and pp.end("find_issues", extra={"k": k})

    # Groups of mutually near-duplicate row indices reported by cleanlab.
    near_dup_sets: List[List[int]] = getattr(ndm, "near_duplicate_sets", []) or []

    # --- Representatives & output ---
    pp and pp.start("grouping")
    t_out0 = perf_counter()

    # Keep smallest index as representative; skip any empty groups defensively
    reps = [int(np.min(g)) for g in near_dup_sets if np.size(g) > 0]
    in_any = {int(i) for g in near_dup_sets for i in np.asarray(g).ravel()}
    # Keep = one representative per group + every row not in any group.
    keep_set = set(reps) | (set(range(N)) - in_any)

    out_df = None
    if N > 0:
        keep_mask = np.zeros(N, dtype=bool)
        if keep_set:
            keep_mask[list(keep_set)] = True
        out_df = df.iloc[np.where(keep_mask)[0]].copy()
    else:
        out_df = df.copy()

    # Stats
    n_groups = sum(1 for g in near_dup_sets if np.size(g) > 0)
    group_sizes = [int(np.size(g)) for g in near_dup_sets if np.size(g) > 0]
    n_flagged = sum(max(0, s - 1) for s in group_sizes)  # rows we'd drop if keeping 1 per group
    n_pairs = int(sum(comb(s, 2, exact=True) for s in group_sizes))

    timings["groups_and_output"] = perf_counter() - t_out0
    total_time = perf_counter() - t0

    stats = {
        "n_rows_before_dedup": N,
        "n_near_dupe_pairs": n_pairs,
        "n_groups": n_groups,
        "avg_group_size": float(np.mean(group_sizes)) if group_sizes else 0.0,
        "max_group_size": max(group_sizes) if group_sizes else 0,
        "n_rows_flagged_duplicates": n_flagged,
        "n_rows_after_dedup": int(len(out_df)) if out_df is not None else N,
        "metric": metric,
        "threshold": threshold,
        "k": int(k),
        "timings": timings,
        "total_time_sec": total_time,
    }

    if verbose:
        print(f"[timing] TOTAL: {total_time*1000:.1f} ms")
        for k_, v in timings.items():
            print(f" - {k_}: {v*1000:.1f} ms")

    pp and pp.tick_abs("grouping", 1.0, extra={"groups": n_groups, "pairs": n_pairs})
    pp and pp.end("grouping")

    if local_bar is not None:
        # Only close bars we created ourselves; caller-owned bars stay open.
        pp.close()

    # Return the standardized pair expected by your pipeline
    return out_df, stats
pipeline/featurizer.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from sklearn.impute import SimpleImputer
9
+ from sklearn.pipeline import Pipeline
10
+ from sklearn.preprocessing import FunctionTransformer, StandardScaler
11
+
12
+ from .utils_cool import PhaseProgress
13
+
14
+
15
+ def _safe_apply_pipeline(
16
+ group_name: str,
17
+ df_in: pd.DataFrame,
18
+ cols: List[str],
19
+ pipeline: Pipeline,
20
+ *,
21
+ drop_on_error: bool,
22
+ warnings: List[str],
23
+ max_new_cols: int = 2000,
24
+ ) -> pd.DataFrame:
25
+ """
26
+ Apply a user pipeline to df_in[cols] safely.
27
+ - If success and returns DataFrame with same index: replace group columns with returned columns (prefixed).
28
+ - If success and returns ndarray/sparse: if width <= max_new_cols, expand with auto names; else drop group.
29
+ - If failure and drop_on_error: drop group columns and record a warning.
30
+ """
31
+ Xsub = df_in[cols]
32
+ try:
33
+ out = pipeline.fit_transform(Xsub)
34
+ except Exception as e:
35
+ if drop_on_error:
36
+ warnings.append(
37
+ f"[custom_featurizer] dropped '{group_name}' columns ({len(cols)}): {cols[:6]}{'...' if len(cols)>6 else ''} "
38
+ f"reason={type(e).__name__}: {str(e)[:180]}"
39
+ )
40
+ return df_in.drop(columns=cols)
41
+ raise
42
+
43
+ # If pipeline returns a DataFrame -> use its columns (prefixed)
44
+ if isinstance(out, pd.DataFrame):
45
+ out_df = out.copy()
46
+ # ensure index alignment
47
+ out_df.index = df_in.index
48
+ # prefix to avoid collisions
49
+ out_df.columns = [f"{group_name}__{c}" for c in out_df.columns]
50
+ df_out = df_in.drop(columns=cols).join(out_df)
51
+ return df_out
52
+
53
+ # If ndarray / sparse matrix -> expand to columns cautiously
54
+ try:
55
+ import numpy as _np
56
+ import scipy.sparse as _sp
57
+ if _sp.issparse(out):
58
+ out = out.toarray()
59
+ out = _np.asarray(out)
60
+ n_rows, n_cols = out.shape[0], (out.shape[1] if out.ndim == 2 else 1)
61
+ if n_rows != len(df_in):
62
+ raise ValueError(f"Pipeline for '{group_name}' returned {n_rows} rows; expected {len(df_in)}.")
63
+ if n_cols > max_new_cols:
64
+ warnings.append(
65
+ f"[custom_featurizer] '{group_name}' produced {n_cols} columns (> {max_new_cols}); dropping this group to avoid explosion."
66
+ )
67
+ return df_in.drop(columns=cols)
68
+ if out.ndim == 1:
69
+ out = out.reshape(-1, 1)
70
+ n_cols = 1
71
+ new_cols = [f"{group_name}__f{i}" for i in range(n_cols)]
72
+ out_df = pd.DataFrame(out, index=df_in.index, columns=new_cols)
73
+ df_out = df_in.drop(columns=cols).join(out_df)
74
+ return df_out
75
+ except Exception as e:
76
+ if drop_on_error:
77
+ warnings.append(
78
+ f"[custom_featurizer] failed to materialize output for '{group_name}'; dropping group. "
79
+ f"reason={type(e).__name__}: {str(e)[:180]}"
80
+ )
81
+ return df_in.drop(columns=cols)
82
+ raise
83
+
84
+
85
+ def custom_featurizer(
86
+ df: pd.DataFrame,
87
+ *,
88
+ # optional label just for cleaning rows with NaN in label; not returned as y
89
+ label: Optional[str] = None,
90
+
91
+ # user-provided pipelines (override defaults for that group)
92
+ numeric_pipeline: Optional[Pipeline] = None,
93
+ low_card_pipeline: Optional[Pipeline] = None,
94
+ text_pipeline: Optional[Pipeline] = None,
95
+
96
+ # defaults (used ONLY if the corresponding pipeline is NOT provided)
97
+ numeric_scale: bool = True, # scale numeric after impute
98
+ text_lowercase: bool = True, # lower+strip text
99
+ max_ohe_cardinality: int = 50, # threshold to split low-card vs text
100
+
101
+ # NaN handling for DEFAULTS ONLY (ignored if a custom pipeline is provided for that group)
102
+ nan_strategy: str = "impute", # "impute" or "drop"
103
+
104
+ # failure policy for user pipelines
105
+ on_pipeline_error: str = "drop", # "drop" -> drop group; "raise" -> bubble error
106
+
107
+ # control expansion when user pipelines return big matrices
108
+ max_new_cols_per_group: int = 2000,
109
+
110
+ # progress / logging
111
+ progress: Optional[Any] = None, # pass a tqdm here; None -> auto-create
112
+
113
+ # logging
114
+ verbose: bool = False,
115
+ ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
116
+ """
117
+ Featurize a mixed DataFrame while keeping the output as a DataFrame for downstream steps.
118
+
119
+ Overview
120
+ --------
121
+ Columns are split into three groups:
122
+ • numeric (including boolean)
123
+ • low-cardinality categoricals (nunique ≤ `max_ohe_cardinality`)
124
+ • text/high-cardinality (nunique > `max_ohe_cardinality`)
125
+
126
+ You may pass custom sklearn `Pipeline`s per group. If a user pipeline raises and
127
+ `on_pipeline_error="drop"`, that entire group of columns is dropped and a short
128
+ warning is recorded in `stats["warnings"]` (the step does not crash). If `"raise"`,
129
+ the error is propagated.
130
+
131
+ Defaults when pipelines are NOT provided
132
+ ---------------------------------------
133
+ • Numeric (when `numeric_pipeline is None`)
134
+ - NaNs: if `nan_strategy="impute"`, apply `SimpleImputer(strategy="median")`;
135
+ if `"drop"`, rows with NaNs in numeric columns are dropped BEFORE this step.
136
+ - Scaling: if `numeric_scale=True`, apply `StandardScaler(with_mean=False)`.
137
+ - Columns: replaced in place (same column names remain; values become numeric/float).
138
+ • Low-cardinality categoricals (when `low_card_pipeline is None`)
139
+ - NaNs: if `nan_strategy="impute"`, apply `SimpleImputer(strategy="most_frequent")`;
140
+ if `"drop"`, rows with NaNs in these columns are dropped BEFORE this step.
141
+ - Encoding: **no one-hot by default** (values stay as cleaned strings/categories).
142
+ If you want encodings, pass your own `low_card_pipeline` (e.g., OneHotEncoder/CatBoostEncoder).
143
+ - Columns: preserved (same names).
144
+ • Text / high-cardinality (when `text_pipeline is None`)
145
+ - Build a tiny pipeline:
146
+ concat selected text cols → `TfidfVectorizer(dtype=float32, lowercase=text_lowercase)`
147
+ - Output: numeric TF-IDF features. Original text columns are **replaced** by new
148
+ columns named `txt__f0`, `txt__f1`, … (or `txt__<col>` if your pipeline returns a DataFrame).
149
+ - Feature explosion guard: if the produced matrix has more than `max_new_cols_per_group` columns,
150
+ the entire text group is dropped and a warning is recorded.
151
+ - If TF-IDF fails (e.g., empty vocabulary on tiny data), the text group is dropped with a warning.
152
+
153
+ NaN handling
154
+ ------------
155
+ `nan_strategy` applies only to groups using the **default** pipeline:
156
+ - "impute": impute NaNs (as above)
157
+ - "drop" : drop rows containing NaNs in any default-handled feature column **before**
158
+ transforming. For groups with a **custom** pipeline, NaN handling is your pipeline’s responsibility.
159
+
160
+ Parameters
161
+ ----------
162
+ df : pd.DataFrame
163
+ Input table. If `label` is provided, rows with NaN in `label` are dropped first.
164
+ label : Optional[str], default=None
165
+ Name of the target column (only used to drop NaN labels). Not returned as `y`.
166
+ numeric_pipeline : Optional[sklearn.pipeline.Pipeline], default=None
167
+ Custom pipeline for numeric/boolean columns. If provided, `nan_strategy` is ignored for this group.
168
+ By default: median imputation (+ optional scaling) and columns are preserved.
169
+ low_card_pipeline : Optional[sklearn.pipeline.Pipeline], default=None
170
+ Custom pipeline for low-card categorical columns. If provided, `nan_strategy` is ignored for this group.
171
+ By default: most-frequent imputation only; **no encoding**; columns are preserved.
172
+ text_pipeline : Optional[sklearn.pipeline.Pipeline], default=None
173
+ Custom pipeline for text/high-card columns. If provided, it replaces the text columns with whatever
174
+ it outputs (DataFrame → prefixed columns; array/sparse → `txt__f*` columns). If not provided, the
175
+ built-in concat+TF-IDF is used (see defaults above).
176
+ numeric_scale : bool, default=True
177
+ Applies `StandardScaler(with_mean=False)` to numeric columns in the default numeric pipeline.
178
+ text_lowercase : bool, default=True
179
+ Forwarded to the default `TfidfVectorizer(lowercase=...)`.
180
+ max_ohe_cardinality : int, default=50
181
+ Threshold to classify non-numeric columns as low-card (≤ threshold) vs text/high-card (> threshold).
182
+ nan_strategy : {"impute","drop"}, default="impute"
183
+ Strategy for **default** pipelines only (custom pipelines manage their own NaNs).
184
+ on_pipeline_error : {"drop","raise"}, default="drop"
185
+ If a **user** pipeline raises: "drop" → drop that group and record a warning; "raise" → propagate.
186
+ max_new_cols_per_group : int, default=2000
187
+ Upper bound on the number of columns a group is allowed to add (applies to array/sparse outputs).
188
+ If exceeded, the group is dropped and a warning is recorded.
189
+ progress : Optional[tqdm], default=None
190
+ Phase-aware progress bar (clean → split → numeric → low_card → text → finalize).
191
+ If None, a local bar is created and closed automatically.
192
+ verbose : bool, default=False
193
+ When True, prints recorded warnings (in addition to returning them in `stats["warnings"]`).
194
+
195
+ Returns
196
+ -------
197
+ output_df : pd.DataFrame
198
+ Transformed DataFrame ready for the next pipeline step. Numeric/low-card columns are
199
+ updated in place by defaults; text columns are replaced by TF-IDF features when using
200
+ the default text path.
201
+ stats : Dict[str, Any]
202
+ Minimal metadata:
203
+ - "warnings": List[str]
204
+ - "cols": {"numeric": [...], "low_card": [...], "text": [...]}
205
+ - "n_rows_before": int
206
+ - "n_rows_after": int
207
+
208
+ Notes
209
+ -----
210
+ • If a user pipeline returns a DataFrame, columns are prefixed with the group name to avoid collisions.
211
+ If it returns an array/sparse matrix, columns are auto-named `"{group}__f{i}"`.
212
+ • The feature explosion guard applies after transformation; if you often hit it with text, consider passing
213
+ your own `TfidfVectorizer(max_features=...)` in `text_pipeline` to cap the width proactively.
214
+ """
215
+ # progress setup
216
+ local_bar = None
217
+ pp = None
218
+ if progress is None:
219
+ from tqdm.auto import tqdm # local import to keep module lightweight
220
+ local_bar = tqdm(total=100, leave=True)
221
+ pp = PhaseProgress(local_bar, weights={
222
+ "clean": .10, "split": .10, "numeric": .30, "low_card": .25, "text": .20, "finalize": .05
223
+ })
224
+ elif hasattr(progress, "set_description") and hasattr(progress, "update"):
225
+ if not hasattr(progress, "_last_val"):
226
+ progress._last_val = 0
227
+ pp = PhaseProgress(progress, weights={
228
+ "clean": .10, "split": .10, "numeric": .30, "low_card": .25, "text": .20, "finalize": .05
229
+ })
230
+
231
+ warnings: List[str] = []
232
+
233
+ # 1) drop rows with NaN label if present
234
+ n_before = len(df)
235
+ pp and pp.start("clean", extra={"N": n_before})
236
+ if label is not None and label not in df.columns:
237
+ # ensure the bar is closed even on error
238
+ if local_bar is not None:
239
+ pp.close()
240
+ raise KeyError(f"Target column '{label}' not in DataFrame.")
241
+ if label is not None:
242
+ df = df.dropna(subset=[label]).reset_index(drop=True)
243
+ pp and pp.tick_abs("clean", 1.0, extra={"N": len(df)})
244
+ pp and pp.end("clean")
245
+
246
+ # 2) split columns
247
+ pp and pp.start("split")
248
+ Xf = df if label is None else df.drop(columns=[label])
249
+ numeric_cols: List[str] = Xf.select_dtypes(include=[np.number, "boolean"]).columns.tolist()
250
+ non_numeric = [c for c in Xf.columns if c not in numeric_cols]
251
+
252
+ low_card_cols: List[str] = []
253
+ text_cols: List[str] = []
254
+ for c in non_numeric:
255
+ nunq = Xf[c].nunique(dropna=True)
256
+ (low_card_cols if nunq <= max_ohe_cardinality else text_cols).append(c)
257
+ pp and pp.tick_abs("split", 1.0, extra={
258
+ "num": len(numeric_cols), "low": len(low_card_cols), "txt": len(text_cols)
259
+ })
260
+ pp and pp.end("split")
261
+
262
+ # 3) NaN strategy for default groups (custom pipelines assumed to handle NaNs)
263
+ if nan_strategy not in {"impute", "drop"}:
264
+ if local_bar is not None:
265
+ pp.close()
266
+ raise ValueError("nan_strategy must be 'impute' or 'drop'")
267
+ if nan_strategy == "drop":
268
+ cols_to_check = []
269
+ if numeric_cols and numeric_pipeline is None:
270
+ cols_to_check += numeric_cols
271
+ if low_card_cols and low_card_pipeline is None:
272
+ cols_to_check += low_card_cols
273
+ if text_cols and text_pipeline is None:
274
+ cols_to_check += text_cols
275
+ if cols_to_check:
276
+ mask = ~Xf[cols_to_check].isna().any(axis=1)
277
+ df = df.loc[mask].reset_index(drop=True)
278
+ Xf = df if label is None else df.drop(columns=[label])
279
+
280
+ # 4) numeric
281
+ pp and pp.start("numeric", extra={"cols": len(numeric_cols)})
282
+ if numeric_cols:
283
+ if numeric_pipeline is not None:
284
+ df = _safe_apply_pipeline(
285
+ "num", df, numeric_cols, numeric_pipeline,
286
+ drop_on_error=(on_pipeline_error == "drop"),
287
+ warnings=warnings,
288
+ max_new_cols=max_new_cols_per_group,
289
+ )
290
+ else:
291
+ if nan_strategy == "impute":
292
+ imputed = SimpleImputer(strategy="median").fit_transform(df[numeric_cols])
293
+ df.loc[:, numeric_cols] = imputed
294
+ if numeric_scale:
295
+ scaler = StandardScaler(with_mean=False)
296
+ scaled = scaler.fit_transform(df[numeric_cols].astype(float, copy=False))
297
+ df.loc[:, numeric_cols] = np.asarray(scaled)
298
+ pp and pp.tick_abs("numeric", 1.0)
299
+ pp and pp.end("numeric", extra={"cols": len(numeric_cols)})
300
+
301
+ # 5) low-card categoricals
302
+ pp and pp.start("low_card", extra={"cols": len(low_card_cols)})
303
+ if low_card_cols:
304
+ if low_card_pipeline is not None:
305
+ df = _safe_apply_pipeline(
306
+ "cat", df, low_card_cols, low_card_pipeline,
307
+ drop_on_error=(on_pipeline_error == "drop"),
308
+ warnings=warnings,
309
+ max_new_cols=max_new_cols_per_group,
310
+ )
311
+ else:
312
+ if nan_strategy == "impute":
313
+ df.loc[:, low_card_cols] = SimpleImputer(strategy="most_frequent").fit_transform(df[low_card_cols])
314
+ pp and pp.tick_abs("low_card", 1.0)
315
+ pp and pp.end("low_card", extra={"cols": len(low_card_cols)})
316
+
317
+ # 6) text
318
+ pp and pp.start("text", extra={"cols": len(text_cols)})
319
+ if text_cols:
320
+ if text_pipeline is None:
321
+ def _concat_cols(Xframe: pd.DataFrame):
322
+ return Xframe.fillna("").astype(str).agg(" ".join, axis=1).values
323
+
324
+ text_pipeline = Pipeline([
325
+ ("concat", FunctionTransformer(_concat_cols, validate=False)),
326
+ ("tfidf", TfidfVectorizer(
327
+ dtype=np.float32,
328
+ lowercase=bool(text_lowercase),
329
+ )),
330
+ ])
331
+
332
+ df = _safe_apply_pipeline(
333
+ "txt", df, text_cols, text_pipeline,
334
+ drop_on_error=(on_pipeline_error == "drop"),
335
+ warnings=warnings,
336
+ max_new_cols=max_new_cols_per_group,
337
+ )
338
+
339
+ pp and pp.tick_abs("text", 1.0)
340
+ pp and pp.end("text", extra={"cols": len(text_cols)})
341
+
342
+ # 7) finalize
343
+ pp and pp.start("finalize")
344
+ stats: Dict[str, Any] = {
345
+ "warnings": warnings,
346
+ "cols": {"numeric": numeric_cols, "low_card": low_card_cols, "text": text_cols},
347
+ "n_rows_before": n_before,
348
+ "n_rows_after": len(df),
349
+ }
350
+ pp and pp.tick_abs("finalize", 1.0, extra={"warn": len(warnings)})
351
+ pp and pp.end("finalize", extra={"warn": len(warnings)})
352
+
353
+ if verbose and warnings:
354
+ print("\n".join(warnings))
355
+ if local_bar is not None:
356
+ pp.close()
357
+
358
+ return df, stats
pipeline/issues.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from scipy.sparse import issparse
8
+ from sklearn.linear_model import LogisticRegression, SGDRegressor
9
+ from tqdm.auto import tqdm
10
+
11
+ from .utils_cool import PhaseProgress, _ensure_dense32, _infer_task
12
+
13
+
14
def find_issues(
    df: pd.DataFrame,
    *,
    label: str,
    task: Optional[str] = None,       # "classification" | "regression"; if None we infer
    model: Optional[Any] = None,      # sklearn estimator; default chosen by task
    progress: Optional[Any] = None,   # tqdm bar; None -> auto-create
    verbose: bool = False,
) -> Tuple[Optional[pd.DataFrame], Dict]:
    """
    Detect label issues using Cleanlab's CleanLearning and drop the flagged rows.

    Parameters
    ----------
    df : DataFrame
        Input table containing features and a label column.
    label : str
        Name of the label column. Rows with NaN in label are dropped.
    task : {"classification","regression"}, optional
        If not provided, inferred from y.
    model : sklearn estimator, optional
        If not provided, defaults to LogisticRegression or SGDRegressor.
    progress : tqdm, optional
        Phase-aware progress bar. If None, a local bar is created and closed.
    verbose : bool, default False
        Print warnings in addition to returning them in stats.

    Returns
    -------
    (df_out, stats)
        df_out : DataFrame with rows flagged as label issues removed
                 (the NaN-label-cleaned frame if Cleanlab flagged nothing).
        stats : dict with minimal metadata and counts.

    Raises
    ------
    KeyError
        If `label` is not a column of `df`.
    RuntimeError
        If Cleanlab fails on both sparse and densified features.
    """
    # ---- progress setup ----
    local_bar = None
    pp = None
    if progress is None:
        local_bar = tqdm(total=100, leave=True)
        pp = PhaseProgress(local_bar, weights={"clean": .15, "cleanlab": .75, "finalize": .10})
    elif hasattr(progress, "set_description") and hasattr(progress, "update"):
        if not hasattr(progress, "_last_val"):
            progress._last_val = 0
        pp = PhaseProgress(progress, weights={"clean": .15, "cleanlab": .75, "finalize": .10})

    warnings: List[str] = []
    n_before = len(df)

    # ---- clean: ensure label exists, drop NaNs in label ----
    pp and pp.start("clean", extra={"N": n_before})
    if label not in df.columns:
        if local_bar is not None:
            pp.close()  # don't leak the auto-created bar on error
        raise KeyError(f"Label column '{label}' not found in DataFrame.")
    df_in = df.dropna(subset=[label]).reset_index(drop=True)
    # BUGFIX: exclude the label from the feature matrix. Previously X = df_in
    # kept the label as a feature, letting the internal model read the answer
    # off its own input, so Cleanlab would essentially never flag anything.
    X = df_in.drop(columns=[label])
    y_raw = df_in[label].to_numpy()
    pp and pp.tick_abs("clean", 1.0, extra={"N": len(df_in)})
    pp and pp.end("clean")

    # ---- task/model selection ----
    task_applied = _infer_task(y_raw, task)
    if model is None:
        if task_applied == "classification":
            model = LogisticRegression(solver="saga", n_jobs=-1)
        else:
            model = SGDRegressor()

    # y encoding for classification (Cleanlab expects numeric labels)
    if task_applied == "classification":
        classes, y = np.unique(y_raw, return_inverse=True)
    else:
        classes = None
        y = y_raw.astype(np.float64, copy=False)

    # ---- cleanlab: find label issues (auto-dense fallback for sparse edge cases) ----
    pp and pp.start("cleanlab", extra={"task": task_applied})
    used_dense_fallback = False
    _CL = None  # pre-bind so the except-branch can tell "import failed" apart
    try:
        if task_applied == "classification":
            from cleanlab.classification import CleanLearning as _CL
        else:
            from cleanlab.regression.learn import CleanLearning as _CL
        issues = _CL(model).find_label_issues(X, y)
    except Exception as e:
        # Retry dense only when the import succeeded and features are sparse
        # (cleanlab/small-N often prefers dense). BUGFIX: previously `_CL`
        # could be unbound here, turning the real error into a NameError.
        if _CL is not None and issparse(X):
            Xd = _ensure_dense32(X)
            try:
                issues = _CL(model).find_label_issues(Xd, y)
                used_dense_fallback = True
            except Exception as e2:
                if local_bar is not None:
                    pp.close()
                raise RuntimeError(f"Cleanlab failed on sparse and dense features: {e2}") from e
        else:
            if local_bar is not None:
                pp.close()
            raise

    # ---- parse outputs robustly ----
    if isinstance(issues, pd.DataFrame):
        is_issue = issues.get("is_label_issue", None)
        label_quality = issues.get("label_quality", None)
    else:
        is_issue = None
        label_quality = None

    n = len(issues) if hasattr(issues, "__len__") else len(y)
    n_issues = int(np.asarray(is_issue).sum()) if isinstance(is_issue, (pd.Series, np.ndarray)) else 0
    pct = round((n_issues / n) * 100.0, 3) if n else 0.0
    avg_quality = float(np.nanmean(label_quality.to_numpy())) if isinstance(label_quality, pd.Series) else float("nan")

    pp and pp.tick_abs("cleanlab", 1.0, extra={"issues": n_issues})
    pp and pp.end("cleanlab", extra={"issues": n_issues})

    # ---- finalize: drop flagged rows ----
    pp and pp.start("finalize")
    df_out = df_in
    if isinstance(is_issue, (pd.Series, np.ndarray)):
        # BUGFIX: np.asarray works for both Series and ndarray; the old
        # `.values` attribute access crashed on plain ndarrays.
        mask_keep = ~np.asarray(is_issue, dtype=bool)
        df_out = df_in.loc[mask_keep].copy()

    stats: Dict[str, Any] = {
        "n_rows_before_cleanlab": int(len(df_in)),
        "n_label_issues": int(n_issues),
        "pct_label_issues": float(pct),
        "avg_label_quality": float(avg_quality),
        "n_rows_after_cleanlab": int(len(df_out)),
        "task_applied": task_applied,
        "model_name": type(model).__name__,
        "used_dense_fallback": bool(used_dense_fallback),
        "warnings": warnings,
    }
    pp and pp.tick_abs("finalize", 1.0)
    pp and pp.end("finalize")

    if verbose and warnings:
        print("\n".join(warnings))
    if local_bar is not None:
        pp.close()

    return df_out, stats
pipeline/pipeline.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import inspect
2
+ from typing import Callable, Optional, Any, Dict, Tuple, Sequence, List
3
+
4
+ import inspect
5
+ from typing import Optional, Callable, Tuple, Dict, Any
6
+ import pandas as pd
7
+
8
def make_step(func: Callable, *, name: Optional[str] = None, use_original_df: bool = False):
    """
    Turn `func` into a two-stage pipeline step factory.

    `func` is expected to return a `(output_df, stats_dict)` pair.

    If `func` accepts a `df` parameter, the wrapped step feeds it:
      * the previous step's output (default),
      * the original pipeline input when `use_original_df=True`,
      * or a `df` explicitly bound at build time, which overrides both.
    """
    signature = inspect.signature(func)
    step_name = name or func.__name__
    accepts_df = "df" in signature.parameters

    def builder(**params):
        # Fail fast on unknown/invalid keyword arguments.
        signature.bind_partial(**params)

        def _run(prev_df: pd.DataFrame, orig_df: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, Any], str]:
            kwargs = dict(params)

            # A df bound at build time always wins; otherwise pick the source.
            if accepts_df and "df" not in kwargs:
                want_original = use_original_df and orig_df is not None
                kwargs["df"] = orig_df if want_original else prev_df

            out_df, stats = func(**kwargs)
            if not isinstance(stats, dict):
                raise TypeError(f"{step_name}: expected stats to be dict, got {type(stats)}")

            # Report the df the function actually consumed (falls back to prev_df).
            return kwargs.get("df", prev_df), out_df, stats, step_name

        _run.__name__ = step_name
        _run.__doc__ = func.__doc__
        _run.__signature__ = signature
        return _run

    builder.__name__ = f"{step_name}_step"
    builder.__doc__ = f"Step builder for `{func.__name__}`.\n\n" + (func.__doc__ or "")
    builder.__signature__ = signature
    return builder
49
+
50
def run_pipeline(steps: Sequence[Callable], df: pd.DataFrame):
    """
    Execute `steps` sequentially, chaining each output into the next step.

    Every step is invoked as step(prev_df, orig_df) and must return
    (input_df, output_df, stats, name). A None output keeps the previous
    frame. Returns (final_df, logs) where logs is one dict per step.
    """
    original = df
    current = df
    logs: List[Dict[str, Any]] = []
    for run_step in steps:
        _, produced, stats, step_name = run_step(current, original)
        logs.append({"step": step_name, **stats})
        if produced is not None:
            current = produced
    return current, logs
63
+
pipeline/utils_cool.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import inspect
2
+ import math
3
+ import re
4
+ from dataclasses import dataclass
5
+ from time import perf_counter
6
+ from typing import Any, Dict, Optional, Tuple
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ from scipy.sparse import issparse
11
+ from tqdm import tqdm
12
+
13
+
14
@dataclass
class PhaseProgress:
    """Map named, weighted phases onto a single 0..`total` tqdm bar.

    The wrapped bar gains a `_last_val` attribute tracking the absolute
    position already emitted; callers may pre-seed it before construction.
    """
    bar: "tqdm"
    weights: Dict[str, float]
    total: int = 100

    def __post_init__(self):
        # Normalise weights so they need not sum to 1; guard against all-zero.
        self._norm = sum(self.weights.values()) or 1.0
        self._done = 0.0       # fraction of the bar owned by finished phases
        self._phase = None     # currently running phase name
        self._phase_t0 = None  # perf_counter() at phase start
        if not hasattr(self.bar, "_last_val"):
            self.bar._last_val = 0

    def start(self, phase: str, extra: Optional[Dict] = None):
        """Begin `phase`: record its start time and label the bar."""
        self._phase = phase
        self._phase_t0 = perf_counter()
        self.bar.set_description_str(phase)
        if extra:
            self.bar.set_postfix(extra, refresh=False)

    def tick_abs(self, phase: str, p01: float, extra: Optional[Dict] = None):
        """Advance the bar to within-phase progress p01 ∈ [0, 1] (monotone only)."""
        fraction = min(1.0, max(0.0, float(p01)))
        share = self.weights.get(phase, 0.0) / self._norm
        absolute = int(round(self.total * (self._done + share * fraction)))
        step = absolute - self.bar._last_val
        if step > 0:
            self.bar.update(step)
            self.bar._last_val = absolute
        self.bar.set_description_str(f"{phase} {int(100 * fraction)}%")
        if extra:
            self.bar.set_postfix(extra, refresh=False)

    def end(self, phase: str, extra: Optional[Dict] = None):
        """Mark `phase` complete and show its elapsed wall time as postfix."""
        self._done += self.weights.get(phase, 0.0) / self._norm
        started = self._phase_t0 if self._phase_t0 is not None else perf_counter()
        postfix = dict(extra or {})
        postfix["t"] = f"{(perf_counter() - started) * 1000:.0f}ms"
        self.bar.set_postfix(postfix, refresh=False)

    def close(self):
        """Top the bar up to `total`, then close it (close even if update fails)."""
        try:
            remaining = self.total - self.bar._last_val
            if remaining > 0:
                self.bar.update(remaining)
        finally:
            self.bar.close()
+
64
def choose_k(N: int, k_min: int = 5, k_max: int = 50) -> int:
    """Heuristic neighbour count: sqrt(N) clamped to [k_min, k_max], capped at N-1."""
    if N <= 1:
        # Degenerate inputs get a single neighbour.
        return 1
    clamped = min(max(int(math.sqrt(N)), k_min), k_max)
    return min(clamped, N - 1)
71
+
72
+ def _ensure_dense32(X) -> np.ndarray:
73
+ """Convert to contiguous float32 ndarray (densify only if needed)."""
74
+ if issparse(X):
75
+ X = X.toarray()
76
+ return np.asarray(X, dtype=np.float32, order="C")
77
+
78
def decide_task_and_model(
    y: np.ndarray,
    series: pd.Series,
    *,
    is_categorical: bool = False,
    few_class_floor: int = 20,
    few_class_frac: float = 0.05,
):
    """Decide the prediction task for target `y`.

    Returns "classification" when the target is explicitly categorical,
    boolean, non-numeric, or numeric with few distinct values (at most
    max(few_class_floor, ceil(few_class_frac * N))); otherwise "regression".
    """
    n_rows = len(y)

    # dtype checks on the pandas Series view of the target
    bool_target = pd.api.types.is_bool_dtype(series)
    numeric_target = pd.api.types.is_numeric_dtype(series)

    # distinct non-null values
    distinct = len(pd.unique(y[~pd.isnull(y)]))

    # numeric column with only a handful of distinct values -> class labels
    class_cap = max(few_class_floor, int(np.ceil(few_class_frac * max(n_rows, 1))))
    looks_like_classes = numeric_target and distinct <= class_cap

    if is_categorical or bool_target or (not numeric_target) or looks_like_classes:
        return "classification"
    return "regression"
111
+
112
+ def _infer_task(y: np.ndarray, task: Optional[str]) -> str:
113
+ """Decide task if not provided: numeric with many uniques -> regression, else classification."""
114
+ if task in {"classification", "regression"}:
115
+ return task
116
+
117
+ if np.issubdtype(y.dtype, np.number):
118
+ nunq = len(np.unique(y[~pd.isna(y)]))
119
+ is_categorical = nunq <= max(2, int(0.02 * max(1, len(y))))
120
+ else:
121
+ is_categorical = True
122
+ return "classification" if is_categorical else "regression"
123
+
124
+
125
+ # --------- DataFrame payload helpers (for tool IO) ---------
126
+
127
def df_to_payload(df: pd.DataFrame) -> Dict[str, Any]:
    """Serialize a DataFrame into a JSON-friendly 'split'-oriented payload."""
    split_form = df.to_dict(orient="split")
    return {"orient": "split", "data": split_form}
129
+
130
+
131
def df_from_payload(p: Dict[str, Any]) -> pd.DataFrame:
    """Rebuild a DataFrame from a 'split'-oriented payload.

    BUGFIX: the original discarded the stored index, making the
    `df_to_payload` -> `df_from_payload` round trip lossy for any
    non-default index. For the default RangeIndex the result is unchanged.
    """
    d = p["data"]
    # `index` may be absent in hand-built payloads; None yields a RangeIndex.
    return pd.DataFrame(d["data"], index=d.get("index"), columns=d["columns"])
134
+
135
+ # --------- Light heuristics for task/label guess ---------
136
+
137
def guess_task_and_label(df: pd.DataFrame) -> Dict[str, Any]:
    """Guess the ML task and label column for `df`.

    The label is the first column whose lowercased name is one of
    {"label", "target", "y", "class", "outcome"}. Task heuristic:
      * integer/boolean label with few distinct values -> classification
      * float label -> regression
      * any other label dtype (strings/categories) -> classification
      * no label column -> unsupervised

    BUGFIX: non-numeric labels previously fell through to "unsupervised"
    even though a label column had been found; string/categorical targets
    are class labels.
    """
    cols = list(df.columns)
    known_names = {"label", "target", "y", "class", "outcome"}
    label_candidates = [c for c in cols if c.lower() in known_names]
    label = label_candidates[0] if label_candidates else None

    if label is None:
        task = "unsupervised"
    elif pd.api.types.is_integer_dtype(df[label]) or pd.api.types.is_bool_dtype(df[label]):
        nuniq = df[label].nunique(dropna=True)
        # few distinct integer values -> class labels; many -> treat as continuous
        task = "classification" if nuniq <= max(20, int(0.05 * len(df))) else "regression"
    elif pd.api.types.is_float_dtype(df[label]):
        task = "regression"
    else:
        # non-numeric label (strings, categories) -> classification
        task = "classification"

    issues = []
    if label and df[label].isna().any():
        issues.append(f"Missing values in label `{label}`")
    if label and df[label].nunique() == 1:
        issues.append(f"Label `{label}` has a single class")

    return {
        "columns": cols,
        "dtypes": {c: str(df[c].dtype) for c in cols},
        "label_guess": label,
        "task_guess": task,
        "issues": issues,
        "shape": df.shape,
    }
168
+
169
+ # --------- Signature extraction for asking params ---------
170
+
171
+
172
def get_signature_dict(fn) -> Dict[str, Any]:
    """Describe `fn`'s parameters (excluding `df`) and docstring.

    Returns ``{"params": [{"name", "default", "annotation", "kind"}, ...],
    "doc": str}``. Missing defaults/annotations are reported as None.

    FIX: uses the public `inspect.Parameter.empty` sentinel instead of the
    private `inspect._empty` attribute.
    """
    sig = inspect.signature(fn)
    doc = (fn.__doc__ or "").strip()
    empty = inspect.Parameter.empty
    params = []
    for p in sig.parameters.values():
        if p.name == "df":
            continue  # the pipeline injects df itself; don't surface it
        default = None if (p.default is empty) else p.default
        annotation = None if (p.annotation is empty) else str(p.annotation)
        params.append({"name": p.name, "default": default, "annotation": annotation, "kind": str(p.kind)})
    return {"params": params, "doc": doc}
183
+
184
+ # --------- Parse free-text confirmation like "Run dedup threshold=0.93 metric=cosine" ---------
185
# Free-text aliases a user may type for each pipeline step.
STEP_ALIASES = {
    "dedup": {"dedup","de-dup","duplicates","near-dup"},
    "featurize": {"featurize","features","featureize","engineering"},
    "find_label_issues": {"find_label_issues","label issues","cleanlab","label noise"},
}


def parse_user_choice(text: str) -> Tuple[Optional[str], Dict[str, Any]]:
    """Parse a free-text command like "Run dedup threshold=0.93 metric=cosine".

    Returns (step_name_or_None, params). `key=value` pairs are coerced to
    int/float/bool where possible; everything else stays a string.

    BUGFIX: the value regex deliberately captures a leading '-', but the old
    digit check rejected it, so negative numbers stayed raw strings. A single
    leading '-' is now stripped before the numeric test.
    """
    lowered = text.lower()
    chosen = None
    for step, aliases in STEP_ALIASES.items():
        if any(alias in lowered for alias in aliases):
            chosen = step
            break

    params: Dict[str, Any] = {}
    for m in re.finditer(r"(\w+)\s*=\s*([\-\w\.]+)", text):
        key, raw = m.group(1), m.group(2)
        magnitude = raw[1:] if raw.startswith("-") else raw
        if magnitude.replace(".", "", 1).isdigit():
            value: Any = float(raw) if "." in raw else int(raw)
        elif raw.lower() in {"true", "false"}:
            value = (raw.lower() == "true")
        else:
            value = raw
        params[key] = value
    return chosen, params
requirements.txt ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.12.15
4
+ aiosignal==1.4.0
5
+ annotated-types==0.7.0
6
+ anyio==4.11.0
7
+ async-timeout==4.0.3
8
+ attrs==25.3.0
9
+ cachetools==6.2.0
10
+ certifi==2025.8.3
11
+ charset-normalizer==3.4.3
12
+ cleanlab==2.7.1
13
+ click==8.1.8
14
+ contourpy==1.3.0
15
+ cycler==0.12.1
16
+ dataclasses-json==0.6.7
17
+ datasets==4.1.1
18
+ dill==0.4.0
19
+ dotenv==0.9.9
20
+ exceptiongroup==1.3.0
21
+ fastapi==0.118.0
22
+ ffmpy==0.6.1
23
+ filelock==3.19.1
24
+ filetype==1.2.0
25
+ fonttools==4.60.1
26
+ frozenlist==1.7.0
27
+ fsspec==2025.9.0
28
+ google-ai-generativelanguage==0.7.0
29
+ google-api-core==2.25.2
30
+ google-auth==2.41.1
31
+ googleapis-common-protos==1.70.0
32
+ gradio==4.44.1
33
+ gradio_client==1.3.0
34
+ grpcio==1.75.1
35
+ grpcio-status==1.75.1
36
+ h11==0.16.0
37
+ hf-xet==1.1.10
38
+ httpcore==1.0.9
39
+ httpx==0.28.1
40
+ httpx-sse==0.4.1
41
+ huggingface-hub==0.35.3
42
+ idna==3.10
43
+ importlib_resources==6.5.2
44
+ Jinja2==3.1.6
45
+ joblib==1.5.2
46
+ jsonpatch==1.33
47
+ jsonpointer==3.0.0
48
+ kiwisolver==1.4.7
49
+ langchain==0.3.27
50
+ langchain-community==0.3.30
51
+ langchain-core==0.3.78
52
+ langchain-google-genai==2.1.12
53
+ langchain-text-splitters==0.3.11
54
+ langgraph==0.6.8
55
+ langgraph-checkpoint==2.1.1
56
+ langgraph-prebuilt==0.6.4
57
+ langgraph-sdk==0.2.9
58
+ langsmith==0.4.32
59
+ markdown-it-py==3.0.0
60
+ MarkupSafe==2.1.5
61
+ marshmallow==3.26.1
62
+ matplotlib==3.9.4
63
+ mdurl==0.1.2
64
+ multidict==6.6.4
65
+ multiprocess==0.70.16
66
+ mypy_extensions==1.1.0
67
+ narwhals==2.6.0
68
+ numpy==1.26.4
69
+ orjson==3.11.3
70
+ ormsgpack==1.10.0
71
+ packaging==25.0
72
+ pandas==2.3.3
73
+ pandoc==2.4
74
+ pillow==10.4.0
75
+ plotly==6.3.1
76
+ plumbum==1.9.0
77
+ ply==3.11
78
+ propcache==0.4.0
79
+ proto-plus==1.26.1
80
+ protobuf==6.32.1
81
+ pyarrow==21.0.0
82
+ pyasn1==0.6.1
83
+ pyasn1_modules==0.4.2
84
+ pydantic==2.10.6
85
+ pydantic-settings==2.11.0
86
+ pydantic_core==2.27.2
87
+ pydub==0.25.1
88
+ Pygments==2.19.2
89
+ pypandoc==1.15
90
+ pyparsing==3.2.5
91
+ python-dateutil==2.9.0.post0
92
+ python-dotenv==1.1.1
93
+ python-multipart==0.0.20
94
+ pytz==2025.2
95
+ PyYAML==6.0.3
96
+ reportlab==4.4.4
97
+ requests==2.32.5
98
+ requests-toolbelt==1.0.0
99
+ rich==14.1.0
100
+ rsa==4.9.1
101
+ ruff==0.13.3
102
+ scikit-learn==1.6.1
103
+ scipy==1.13.1
104
+ semantic-version==2.10.0
105
+ shellingham==1.5.4
106
+ six==1.17.0
107
+ sniffio==1.3.1
108
+ SQLAlchemy==2.0.43
109
+ starlette==0.48.0
110
+ tenacity==9.1.2
111
+ termcolor==3.1.0
112
+ threadpoolctl==3.6.0
113
+ tomlkit==0.12.0
114
+ tqdm==4.67.1
115
+ typer==0.19.2
116
+ typing-inspect==0.9.0
117
+ typing-inspection==0.4.2
118
+ typing_extensions==4.15.0
119
+ tzdata==2025.2
120
+ urllib3==2.5.0
121
+ uvicorn==0.37.0
122
+ websockets==12.0
123
+ xxhash==3.6.0
124
+ yarl==1.20.1
125
+ zipp==3.23.0
126
+ zstandard==0.25.0