# Source: Hugging Face Space "paoo4511/asd-screening-tool" (Space status when captured: Sleeping).
# File size: 8,456 bytes; commit 9d2afb8.

"""
Longitudinal progress tracking for combined corpora (21 children:
Rollins 5 + Flusberg 6 + Quigley 10, each with multiple sessions).

For each child we track the trajectory of core speech-language features
across sessions and fit a simple linear trend.  This mirrors the clinical
question raised by the advisor:

    "Can AI tell whether the child's speech improves from session to session?"

Outputs:
    reports/figures/longitudinal_trajectories.png      (per-feature, per-child lines)
    reports/figures/longitudinal_composite_score.png   (composite progress score)
    reports/metrics/longitudinal_trends.csv            (slope, r, p per child-feature)
    reports/metrics/longitudinal_progress_summary.csv  (first vs last session delta)
"""

from __future__ import annotations

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Every input and output location is anchored at PROJECT_ROOT, the directory
# one level above the folder that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_DIR = PROJECT_ROOT.joinpath("data")
FIG_DIR = PROJECT_ROOT.joinpath("reports", "figures")
METRIC_DIR = PROJECT_ROOT.joinpath("reports", "metrics")

# Both report folders are created as soon as the module is imported, so the
# later save calls never hit a missing directory; existing folders are kept.
FIG_DIR.mkdir(exist_ok=True, parents=True)
METRIC_DIR.mkdir(exist_ok=True, parents=True)

# One global seaborn theme shared by every figure this module draws.
sns.set_theme(context="talk", style="whitegrid")

# Features we expect to change for the better over therapy, grouped by the
# direction that counts as progress.
_HIGHER_IS_BETTER = ("mlu", "mluw", "ttr", "total_words", "total_utterances")
_LOWER_IS_BETTER = ("unintelligible_ratio", "zero_vocalization_count")

# Maps each tracked feature column to its sign:
# +1 = higher is better  |  -1 = lower is better
FEATURE_DIRECTION = {
    **dict.fromkeys(_HIGHER_IS_BETTER, +1),
    **dict.fromkeys(_LOWER_IS_BETTER, -1),
}


def compute_trends(df: pd.DataFrame) -> pd.DataFrame:
    """Fit a linear regression of feature ~ session_order for each child-feature."""
    rows = []
    for child, g in df.groupby("child"):
        g = g.sort_values("session_order")
        x = g["session_order"].to_numpy(dtype=float)
        for feat, direction in FEATURE_DIRECTION.items():
            y = g[feat].to_numpy(dtype=float)
            if len(x) < 2 or np.all(np.isnan(y)):
                continue
            res = stats.linregress(x, y)
            improving = (res.slope * direction) > 0
            rows.append({
                "child": child,
                "feature": feat,
                "direction": "higher=better" if direction > 0 else "lower=better",
                "slope": round(res.slope, 4),
                "intercept": round(res.intercept, 4),
                "r": round(res.rvalue, 4),
                "p_value": round(res.pvalue, 4),
                "n_sessions": len(x),
                "improving": bool(improving),
            })
    return pd.DataFrame(rows)


def compute_progress_summary(df: pd.DataFrame) -> pd.DataFrame:
    """First vs last session delta per child."""
    rows = []
    for child, g in df.groupby("child"):
        g = g.sort_values("session_order")
        first, last = g.iloc[0], g.iloc[-1]
        row = {
            "child": child,
            "n_sessions": len(g),
            "age_start_mo": first["age_months"],
            "age_end_mo": last["age_months"],
            "duration_mo": (
                round(last["age_months"] - first["age_months"], 1)
                if pd.notna(last["age_months"]) and pd.notna(first["age_months"])
                else np.nan
            ),
        }
        for feat in FEATURE_DIRECTION:
            row[f"{feat}_start"] = round(first[feat], 3)
            row[f"{feat}_end"] = round(last[feat], 3)
            row[f"{feat}_delta"] = round(last[feat] - first[feat], 3)
        rows.append(row)
    return pd.DataFrame(rows)


def plot_trajectories(df: pd.DataFrame) -> None:
    """Draw one panel per tracked feature with a line per child, then save it.

    Each feature in ``FEATURE_DIRECTION`` gets its own subplot on a 3x3 grid,
    plotted against ``session_order``; panels left without a feature are
    hidden.  The figure is written to ``longitudinal_trajectories.png`` inside
    ``FIG_DIR`` and the saved path is printed.  The caller's frame is not
    modified.
    """
    # Label every line as "<child> (<first four letters of corpus>)".
    labelled = df.assign(
        child_label=df["child"] + " (" + df["corpus"].str[:4] + ")"
    )

    feature_names = list(FEATURE_DIRECTION)
    fig, axes = plt.subplots(3, 3, figsize=(18, 14))
    panels = list(axes.flat)

    for panel, name in zip(panels, feature_names):
        sns.lineplot(
            x="session_order",
            y=name,
            hue="child_label",
            data=labelled,
            marker="o",
            ax=panel,
        )
        panel.set_title(name)
        panel.set_xlabel("Session order")
        panel.set_ylabel("")
        panel.legend(fontsize=8, loc="best")

    # Grid cells beyond the last feature stay empty, so hide them.
    for spare in panels[len(feature_names):]:
        spare.set_visible(False)

    fig.suptitle("Longitudinal: per-child trajectories across sessions\n(Rollins + Flusberg + Quigley)", y=1.00)
    fig.tight_layout()

    target = FIG_DIR / "longitudinal_trajectories.png"
    fig.savefig(target, dpi=150, bbox_inches="tight")
    plt.close(fig)
    print(f"  saved  {target.relative_to(PROJECT_ROOT)}")


def compute_composite_score(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with a per-session ``composite_score`` column.

    Every feature in ``FEATURE_DIRECTION`` is z-scored over all rows of *df*
    (population standard deviation, ``ddof=0``) and multiplied by its
    direction, so that a positive value always means "better".  The composite
    score of a session is the mean of those signed z-scores.  A feature whose
    standard deviation is zero or NaN contributes 0 to every session.
    """
    scored = df.copy()
    signed_z = pd.DataFrame(index=scored.index)

    for feature, sign in FEATURE_DIRECTION.items():
        values = scored[feature].astype(float)
        spread = values.std(ddof=0)
        # No usable spread: the feature cannot be standardised, so it is
        # recorded as a neutral 0 for every row.
        if np.isnan(spread) or spread == 0:
            signed_z[feature] = 0.0
            continue
        signed_z[feature] = sign * (values - values.mean()) / spread

    scored["composite_score"] = signed_z.mean(axis=1)
    return scored


def plot_composite(df: pd.DataFrame) -> None:
    """Plot each child's composite score across sessions and save the figure.

    Expects *df* to carry ``child``, ``corpus``, ``session_order`` and
    ``composite_score`` columns.  The figure is written to
    ``longitudinal_composite_score.png`` inside ``FIG_DIR`` and the saved path
    is printed.  The caller's frame is not modified.
    """
    # Label every line as "<child> (<first four letters of corpus>)".
    labelled = df.assign(
        child_label=df["child"] + " (" + df["corpus"].str[:4] + ")"
    )

    fig, ax = plt.subplots(figsize=(12, 7))
    sns.lineplot(
        x="session_order",
        y="composite_score",
        hue="child_label",
        data=labelled,
        marker="o",
        linewidth=2.5,
        ax=ax,
    )
    # Dashed reference line at a composite score of zero.
    ax.axhline(0, color="gray", linestyle="--", alpha=0.5)
    ax.set_title("Longitudinal composite progress score\n(z-scored, direction-adjusted)\n(Rollins + Flusberg + Quigley)")
    ax.set_xlabel("Session order")
    ax.set_ylabel("Composite score (higher = better)")
    fig.tight_layout()

    target = FIG_DIR / "longitudinal_composite_score.png"
    fig.savefig(target, dpi=150, bbox_inches="tight")
    plt.close(fig)
    print(f"  saved  {target.relative_to(PROJECT_ROOT)}")


def _delta_arrow(first: float, last: float) -> str:
    """Return an arrow describing the change from *first* to *last*.

    An unchanged value and an undefined (NaN) value get their own symbols so
    that neither is reported as a decline.
    """
    if pd.isna(first) or pd.isna(last):
        return "?"
    if last > first:
        return "↑"
    if last < first:
        return "↓"
    return "→"


def main() -> None:
    """Run the longitudinal analysis end to end.

    Reads ``longitudinal_features.csv`` from ``DATA_DIR``, writes the trend
    and progress-summary CSVs to ``METRIC_DIR``, saves both figures, and
    prints a human-readable report to stdout.
    """
    csv_path = DATA_DIR / "longitudinal_features.csv"
    df = pd.read_csv(csv_path)
    df = df.sort_values(["child", "session_order"]).reset_index(drop=True)
    print(f"Loaded {len(df)} sessions from {df['corpus'].nunique()} corpora "
          f"for {df['child'].nunique()} children.\n")
    print("Corpus distribution:")
    print(df.groupby("corpus")["child"].nunique())
    print()

    # Corpus of each child, taken from the child's first row.
    child_corpus = df.groupby("child")["corpus"].first().to_dict()

    # Per-child per-feature linear trends
    trends = compute_trends(df)
    # The frame may come back without columns when no trend could be fitted,
    # so the corpus column is only added when there is a child column to map.
    if "child" in trends.columns:
        trends["corpus"] = trends["child"].map(child_corpus)
    out = METRIC_DIR / "longitudinal_trends.csv"
    trends.to_csv(out, index=False)
    print(f"[saved] {out.relative_to(PROJECT_ROOT)}")

    # First-vs-last session summary
    summary = compute_progress_summary(df)
    if "child" in summary.columns:
        summary["corpus"] = summary["child"].map(child_corpus)
    out = METRIC_DIR / "longitudinal_progress_summary.csv"
    summary.to_csv(out, index=False)
    print(f"[saved] {out.relative_to(PROJECT_ROOT)}")

    # Plots
    plot_trajectories(df)
    df_c = compute_composite_score(df)
    plot_composite(df_c)

    # Print human-readable verdict
    print("\n=== Per-child improvement summary ===")
    if trends.empty:
        print("  (no child has enough sessions to fit a trend)")
    else:
        # groupby already returns the children in sorted order.
        improved_flags = trends.groupby("child")["improving"].sum()
        total_feats = len(FEATURE_DIRECTION)
        for child, n_improved in improved_flags.items():
            # A child counts as improving when more than half of all tracked
            # features trend in the right direction.
            verdict = "IMPROVING" if n_improved > total_feats / 2 else "mixed / stalling"
            print(f"  {child:8s}  {int(n_improved)}/{total_feats} features improving  ->  {verdict}")

    print("\n=== Top improving child-feature pairs (by |r|) ===")
    if trends.empty:
        print("  (no child has enough sessions to fit a trend)")
    else:
        sig = trends[trends["improving"]].copy()
        # Strongest correlations first, regardless of sign.
        order = sig["r"].abs().sort_values(ascending=False).index
        print(sig.loc[order].head(10).to_string(index=False))

    print("\n=== Composite score: first vs last session ===")
    for child, g in df_c.groupby("child"):
        g = g.sort_values("session_order")
        first = g["composite_score"].iloc[0]
        last = g["composite_score"].iloc[-1]
        arrow = _delta_arrow(first, last)
        print(f"  {child:8s}  {first:+.2f}  ->  {last:+.2f}   "
              f"({arrow} delta = {last - first:+.2f})")

    print("\n[done] progress tracking complete.")


if __name__ == "__main__":
    main()