# asd-screening-tool/src/progress_tracking.py
# monai86
# deploy: ASD screening dashboard for HuggingFace Spaces
# 9d2afb8
"""
Longitudinal progress tracking for combined corpora (21 children:
Rollins 5 + Flusberg 6 + Quigley 10, each with multiple sessions).
For each child we track the trajectory of core speech-language features
across sessions and fit a simple linear trend. This mirrors the clinical
question raised by the advisor:
"Can AI tell whether the child's speech improves from session to session?"
Outputs:
reports/figures/longitudinal_trajectories.png (per-feature, per-child lines)
reports/figures/longitudinal_composite_score.png (composite progress score)
reports/metrics/longitudinal_trends.csv (slope, r, p per child-feature)
reports/metrics/longitudinal_progress_summary.csv (first vs last session delta)
"""
from __future__ import annotations
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
# Repository root: two directory levels above this file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Input data and report output locations, all anchored at the root.
DATA_DIR = PROJECT_ROOT.joinpath("data")
FIG_DIR = PROJECT_ROOT.joinpath("reports", "figures")
METRIC_DIR = PROJECT_ROOT.joinpath("reports", "metrics")

# Create both output folders (and any missing parents) at import time so
# the save calls further down never hit a missing directory.
FIG_DIR.mkdir(exist_ok=True, parents=True)
METRIC_DIR.mkdir(exist_ok=True, parents=True)

# Global seaborn look used by every figure in this module.
sns.set_theme(context="talk", style="whitegrid")
# Speech-language features we expect to IMPROVE over therapy, mapped to the
# direction that counts as improvement:
#   +1 -> a higher value is better
#   -1 -> a lower value is better
# Insertion order matters: it fixes the column and subplot order downstream.
FEATURE_DIRECTION = dict(
    mlu=+1,
    mluw=+1,
    ttr=+1,
    total_words=+1,
    total_utterances=+1,
    unintelligible_ratio=-1,
    zero_vocalization_count=-1,
)
def compute_trends(df: pd.DataFrame) -> pd.DataFrame:
    """Fit a linear regression of feature ~ session_order for each child-feature.

    Rows are grouped by ``child``; within each group, every feature listed in
    ``FEATURE_DIRECTION`` is regressed on ``session_order`` with
    ``scipy.stats.linregress``. Sessions where either the session order or the
    feature value is missing are left out of that particular fit, so one
    missing measurement no longer turns the whole trend into NaN.

    A child-feature pair is skipped when fewer than two usable sessions remain
    or when all usable sessions share the same ``session_order`` (no line can
    be fitted; ``linregress`` would raise).

    Args:
        df: One row per session. Must contain ``child``, ``session_order`` and
            one column per key of ``FEATURE_DIRECTION``.

    Returns:
        One row per fitted child-feature pair with the columns ``child``,
        ``feature``, ``direction``, ``slope``, ``intercept``, ``r``,
        ``p_value``, ``n_sessions`` (number of sessions actually used in the
        fit) and ``improving`` (True when the slope points in the feature's
        "better" direction). The columns are present even when no pair could
        be fitted.
    """
    columns = [
        "child",
        "feature",
        "direction",
        "slope",
        "intercept",
        "r",
        "p_value",
        "n_sessions",
        "improving",
    ]
    rows = []
    for child, g in df.groupby("child"):
        g = g.sort_values("session_order")
        x = g["session_order"].to_numpy(dtype=float)
        for feat, direction in FEATURE_DIRECTION.items():
            y = g[feat].to_numpy(dtype=float)

            # Pairwise deletion: keep only sessions where both values exist.
            usable = ~(np.isnan(x) | np.isnan(y))
            x_fit, y_fit = x[usable], y[usable]

            # Need two or more points at distinct x positions to define a slope.
            if len(x_fit) < 2 or np.all(x_fit == x_fit[0]):
                continue

            res = stats.linregress(x_fit, y_fit)
            # Multiplying by the direction makes "positive" always mean better.
            improving = (res.slope * direction) > 0
            rows.append({
                "child": child,
                "feature": feat,
                "direction": "higher=better" if direction > 0 else "lower=better",
                "slope": round(res.slope, 4),
                "intercept": round(res.intercept, 4),
                "r": round(res.rvalue, 4),
                "p_value": round(res.pvalue, 4),
                "n_sessions": len(x_fit),
                "improving": bool(improving),
            })
    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)
def compute_progress_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise each child's change between the first and last session.

    Sessions are ordered by ``session_order`` within each ``child`` group.
    The summary row holds the session count, the age at the first and last
    session, the elapsed months between them (NaN when either age is
    missing), and for every feature in ``FEATURE_DIRECTION`` its start value,
    end value and end-minus-start delta, each rounded to three decimals.

    Args:
        df: One row per session with ``child``, ``session_order``,
            ``age_months`` and the feature columns.

    Returns:
        DataFrame with one row per child.
    """
    records = []
    for child_id, sessions in df.groupby("child"):
        ordered = sessions.sort_values("session_order")
        start = ordered.iloc[0]
        end = ordered.iloc[-1]

        age_start = start["age_months"]
        age_end = end["age_months"]
        if pd.notna(age_end) and pd.notna(age_start):
            duration = round(age_end - age_start, 1)
        else:
            duration = np.nan

        record = {
            "child": child_id,
            "n_sessions": len(ordered),
            "age_start_mo": age_start,
            "age_end_mo": age_end,
            "duration_mo": duration,
        }
        for feat in FEATURE_DIRECTION:
            before, after = start[feat], end[feat]
            record.update({
                f"{feat}_start": round(before, 3),
                f"{feat}_end": round(after, 3),
                f"{feat}_delta": round(after - before, 3),
            })
        records.append(record)
    return pd.DataFrame(records)
def plot_trajectories(df: pd.DataFrame) -> None:
    """Draw one line chart per feature with one line per child, and save it.

    The figure is a fixed 3x3 grid; each feature in ``FEATURE_DIRECTION``
    takes one panel (in dictionary order) and leftover panels are hidden.
    Lines are coloured by a "child (corp)" label built from the ``child``
    column and the first four characters of ``corpus``. The result is written
    to ``FIG_DIR / "longitudinal_trajectories.png"``.

    Args:
        df: One row per session with ``child``, ``corpus``, ``session_order``
            and the feature columns. The caller's frame is not modified.
    """
    feature_names = list(FEATURE_DIRECTION)

    # Work on a copy so the label column never leaks back to the caller.
    labelled = df.copy()
    corpus_tag = labelled["corpus"].str[:4]
    labelled["child_label"] = labelled["child"] + " (" + corpus_tag + ")"

    fig, axes = plt.subplots(3, 3, figsize=(18, 14))
    panels = list(axes.flat)

    for panel, feat in zip(panels, feature_names):
        sns.lineplot(
            data=labelled,
            x="session_order",
            y=feat,
            hue="child_label",
            marker="o",
            ax=panel,
        )
        panel.set_title(feat)
        panel.set_xlabel("Session order")
        panel.set_ylabel("")
        panel.legend(fontsize=8, loc="best")

    # Panels beyond the number of features carry no data: hide them.
    for spare in panels[len(feature_names):]:
        spare.set_visible(False)

    fig.suptitle(
        "Longitudinal: per-child trajectories across sessions\n(Rollins + Flusberg + Quigley)",
        y=1.00,
    )
    fig.tight_layout()

    out = FIG_DIR / "longitudinal_trajectories.png"
    fig.savefig(out, dpi=150, bbox_inches="tight")
    plt.close(fig)
    print(f" saved {out.relative_to(PROJECT_ROOT)}")
def compute_composite_score(df: pd.DataFrame) -> pd.DataFrame:
    """Attach a direction-adjusted composite z-score to every session.

    Each feature in ``FEATURE_DIRECTION`` is standardised across all rows of
    *df* using the population standard deviation (``ddof=0``) and multiplied
    by its direction, so a larger value always means "better". A feature
    whose standard deviation is zero or NaN contributes 0.0 for every row.
    The composite is the row-wise mean of these signed z-scores.

    Args:
        df: One row per session containing the feature columns.

    Returns:
        A copy of *df* with an added ``composite_score`` column.
    """
    scored = df.copy()
    signed_z = pd.DataFrame(index=scored.index)

    for name, sign in FEATURE_DIRECTION.items():
        values = scored[name].astype(float)
        center = values.mean()
        spread = values.std(ddof=0)
        # Without spread a z-score is undefined; treat the feature as neutral.
        has_spread = not (np.isnan(spread) or spread == 0)
        signed_z[name] = sign * (values - center) / spread if has_spread else 0.0

    scored["composite_score"] = signed_z.mean(axis=1)
    return scored
def plot_composite(df: pd.DataFrame) -> None:
    """Plot each child's composite score over sessions and save the figure.

    One line per child, coloured by a "child (corp)" label built from the
    ``child`` column and the first four characters of ``corpus``. A dashed
    horizontal line marks a score of zero. The figure is written to
    ``FIG_DIR / "longitudinal_composite_score.png"``.

    Args:
        df: One row per session with ``child``, ``corpus``, ``session_order``
            and ``composite_score``. The caller's frame is not modified.
    """
    # Label on a copy so the extra column stays local to this function.
    labelled = df.copy()
    corpus_tag = labelled["corpus"].str[:4]
    labelled["child_label"] = labelled["child"] + " (" + corpus_tag + ")"

    fig, ax = plt.subplots(figsize=(12, 7))
    sns.lineplot(
        data=labelled,
        x="session_order",
        y="composite_score",
        hue="child_label",
        marker="o",
        linewidth=2.5,
        ax=ax,
    )
    # Reference line at zero on the composite-score axis.
    ax.axhline(0, color="gray", linestyle="--", alpha=0.5)
    ax.set_title(
        "Longitudinal composite progress score\n(z-scored, direction-adjusted)\n(Rollins + Flusberg + Quigley)"
    )
    ax.set_xlabel("Session order")
    ax.set_ylabel("Composite score (higher = better)")
    fig.tight_layout()

    out = FIG_DIR / "longitudinal_composite_score.png"
    fig.savefig(out, dpi=150, bbox_inches="tight")
    plt.close(fig)
    print(f" saved {out.relative_to(PROJECT_ROOT)}")
def main() -> None:
    """Run the whole longitudinal analysis and write every report.

    Steps, in order:
      1. Load ``DATA_DIR / "longitudinal_features.csv"`` and sort it by child
         and session order.
      2. Fit per-child, per-feature linear trends and save them to
         ``METRIC_DIR / "longitudinal_trends.csv"``.
      3. Build the first-vs-last session summary and save it to
         ``METRIC_DIR / "longitudinal_progress_summary.csv"``.
      4. Render the trajectory and composite-score figures.
      5. Print a per-child verdict, the strongest improving trends, and the
         first-vs-last composite score of every child.

    Raises:
        Whatever ``pandas.read_csv`` raises when the input file is missing or
        unreadable; no exception is caught here.
    """
    csv_path = DATA_DIR / "longitudinal_features.csv"
    df = pd.read_csv(csv_path)
    df = df.sort_values(["child", "session_order"]).reset_index(drop=True)
    print(f"Loaded {len(df)} sessions from {df['corpus'].nunique()} corpora "
          f"for {df['child'].nunique()} children.\n")
    print("Corpus distribution:")
    print(df.groupby("corpus")["child"].nunique())
    print()

    # Per-child per-feature linear trends
    trends = compute_trends(df)
    # Tag each child with the corpus of its first row so the CSVs are
    # self-describing.
    child_corpus = df.groupby("child")["corpus"].first().to_dict()
    trends["corpus"] = trends["child"].map(child_corpus)
    out = METRIC_DIR / "longitudinal_trends.csv"
    trends.to_csv(out, index=False)
    print(f"[saved] {out.relative_to(PROJECT_ROOT)}")

    # First-vs-last session summary
    summary = compute_progress_summary(df)
    summary["corpus"] = summary["child"].map(child_corpus)
    out = METRIC_DIR / "longitudinal_progress_summary.csv"
    summary.to_csv(out, index=False)
    print(f"[saved] {out.relative_to(PROJECT_ROOT)}")

    # Plots
    plot_trajectories(df)
    df_c = compute_composite_score(df)
    plot_composite(df_c)

    # Print human-readable verdict: a child counts as improving when more
    # than half of all tracked features have a trend in the "better" direction.
    print("\n=== Per-child improvement summary ===")
    improved_flags = (
        trends.groupby("child")["improving"].sum()
        .reindex(sorted(trends["child"].unique()))
    )
    total_feats = len(FEATURE_DIRECTION)
    for child, n_improved in improved_flags.items():
        verdict = "IMPROVING" if n_improved > total_feats / 2 else "mixed / stalling"
        print(f" {child:8s} {int(n_improved)}/{total_feats} features improving -> {verdict}")

    print("\n=== Top improving child-feature pairs (by |r|) ===")
    sig = trends[trends["improving"]].copy()
    # Order by strength of the correlation, strongest first.
    sig = sig.reindex(sig["r"].abs().sort_values(ascending=False).index)
    print(sig.head(10).to_string(index=False))

    print("\n=== Composite score: first vs last session ===")
    for child, g in df_c.groupby("child"):
        g = g.sort_values("session_order")
        first = g["composite_score"].iloc[0]
        last = g["composite_score"].iloc[-1]
        if last > first:
            arrow = "↑"
        elif last < first:
            arrow = "↓"
        else:
            # Equal scores (e.g. a child with a single session) or a NaN
            # score: there is no decline to report, so do not show one.
            arrow = "→"
        print(f" {child:8s} {first:+.2f} -> {last:+.2f} "
              f"({arrow} delta = {last - first:+.2f})")
    print("\n[done] progress tracking complete.")
if __name__ == "__main__":
main()