#!/usr/bin/env python3 """Plot distribution of student attempts over elapsed time. This script reads FoundationalASSIST `Interactions.csv`, computes elapsed time for each attempt from the student's first attempt, groups attempts into fixed time bins, and plots the resulting column distribution. """ from __future__ import annotations import argparse import math from pathlib import Path import matplotlib.pyplot as plt import pandas as pd from matplotlib.ticker import FuncFormatter, MaxNLocator DEFAULT_INTERACTIONS_PATH = ( Path(__file__).resolve().parent.parent / "Data" / "Interactions.csv" ) DEFAULT_OUTPUT_PLOT = ( Path(__file__).resolve().parent.parent / "Results" / "student_attempt_distribution.png" ) DEFAULT_OUTPUT_COUNTS = ( Path(__file__).resolve().parent.parent / "Results" / "student_attempt_distribution_counts.csv" ) def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description=( "Compute distribution of attempts over elapsed time from " "Interactions.csv and plot binned columns." ) ) parser.add_argument( "--interactions-path", type=Path, default=DEFAULT_INTERACTIONS_PATH, help="Path to Interactions.csv.", ) parser.add_argument( "--output-plot", type=Path, default=DEFAULT_OUTPUT_PLOT, help="Path to save the output figure.", ) parser.add_argument( "--output-counts", type=Path, default=DEFAULT_OUTPUT_COUNTS, help="Path to save binned attempt counts as CSV.", ) parser.add_argument( "--max-rows", type=int, default=None, help="Optional cap on rows after sorting (for quick debugging).", ) parser.add_argument( "--bin-time", type=float, default=10.0, help=( "Fixed bin width in minutes. " "For example, --bin-time 10 creates bins [0,10), [10,20), ..." ), ) parser.add_argument( "--plot-upper-limit-minutes", type=float, default=None, help=( "Optional upper limit for x-axis in minutes. " "If omitted, uses the full range implied by bins." ), ) parser.add_argument( "--student-idx", type=int, default=None, help=( "Optional 0-based index of student to plot. Index is based on " "sorted unique user_id values in the loaded interactions." ), ) parser.add_argument( "--log-y", action="store_true", help="Use log scale on y-axis.", ) return parser.parse_args() def load_interactions(path: Path, max_rows: int | None = None) -> pd.DataFrame: """Load fields required for student attempt timing analysis.""" usecols = ["id", "user_id", "end_time"] df = pd.read_csv(path, usecols=usecols, low_memory=False) df["id"] = pd.to_numeric(df["id"], errors="coerce") df["id"] = df["id"].fillna(-1).astype(int) df["user_id"] = df["user_id"].astype("string") df["end_time"] = pd.to_datetime(df["end_time"], errors="coerce", utc=True) df = df.dropna(subset=["user_id", "end_time"]).copy() df = df.sort_values(["user_id", "end_time", "id"], kind="mergesort") if max_rows is not None: if max_rows <= 0: raise ValueError("--max-rows must be a positive integer.") df = df.head(max_rows).copy() return df def select_student_by_index( df: pd.DataFrame, student_idx: int, ) -> tuple[pd.DataFrame, str, int]: """Select one student's interactions by 0-based index over unique IDs.""" student_ids = df["user_id"].drop_duplicates().tolist() total_students = len(student_ids) if total_students == 0: raise ValueError("No students found in loaded interactions.") if student_idx < 0 or student_idx >= total_students: raise ValueError( f"--student-idx must be in [0, {total_students - 1}], got {student_idx}." ) selected_student_id = str(student_ids[student_idx]) selected_df = df[df["user_id"] == selected_student_id].copy() return selected_df, selected_student_id, total_students def append_student_id_to_output_path(path: Path, student_id: str) -> Path: """Append a safe student-id suffix to output filename.""" safe_id = "".join( ch if ch.isalnum() or ch in ("-", "_") else "_" for ch in student_id ) return path.with_name(f"{path.stem}_{safe_id}{path.suffix}") def compute_attempt_elapsed_minutes(df: pd.DataFrame) -> pd.Series: """Compute elapsed minutes of each attempt from student's first attempt.""" first_times = df.groupby("user_id", sort=False)["end_time"].transform("min") elapsed_minutes = (df["end_time"] - first_times).dt.total_seconds() / 60.0 elapsed_minutes.name = "elapsed_minutes" return elapsed_minutes def build_fixed_width_bin_edges_minutes( valid_elapsed_minutes: pd.Series, bin_time_minutes: float, ) -> list[float]: """Build fixed-width bin edges from min/max elapsed minutes.""" min_elapsed = float(valid_elapsed_minutes.min()) max_elapsed = float(valid_elapsed_minutes.max()) start = bin_time_minutes * math.floor(min_elapsed / bin_time_minutes) end = bin_time_minutes * math.ceil(max_elapsed / bin_time_minutes) if math.isclose(start, 0.0, abs_tol=1e-12): start = 0.0 if math.isclose(end, start, abs_tol=1e-12): end = start + bin_time_minutes n_bins = int(round((end - start) / bin_time_minutes)) edges = [start + i * bin_time_minutes for i in range(n_bins + 1)] if edges[-1] <= max_elapsed: edges.append(edges[-1] + bin_time_minutes) return edges def format_minutes_tick(value: float, _pos: float) -> str: if value < 60: return f"{value:.0f}m" if value < 1440: return f"{value / 60:.0f}h" return f"{value / 1440:.0f}d" def summarize_attempt_distribution( elapsed_minutes: pd.Series, bin_time_minutes: float, ) -> pd.DataFrame: valid = elapsed_minutes.dropna().copy() if valid.empty: raise ValueError("No valid elapsed attempt times found.") edges = build_fixed_width_bin_edges_minutes(valid, bin_time_minutes) binned = pd.cut(valid, bins=edges, right=False, include_lowest=True) counts = binned.value_counts(sort=False) total_attempts = int(counts.sum()) probabilities = (counts / total_attempts).astype(float) bin_left = pd.Series(edges[:-1], dtype=float) bin_right = pd.Series(edges[1:], dtype=float) bin_width = bin_right - bin_left summary = pd.DataFrame( { "bin_left_min": bin_left.to_numpy(), "bin_right_min": bin_right.to_numpy(), "bin_width_min": bin_width.to_numpy(), "attempt_count": counts.to_numpy(), "probability": probabilities.to_numpy(dtype=float), "percentage": probabilities.to_numpy(dtype=float) * 100.0, } ) return summary def plot_distribution( summary_df: pd.DataFrame, output_path: Path, log_y: bool = False, plot_upper_limit_minutes: float | None = None, student_idx: int | None = None, ) -> None: """Create and save student-attempt distribution columns.""" output_path.parent.mkdir(parents=True, exist_ok=True) plt.style.use("seaborn-v0_8-whitegrid") if student_idx is not None: fig, ax = plt.subplots(figsize=(10, 5)) else: fig, ax = plt.subplots(figsize=(20, 5)) left = summary_df["bin_left_min"].to_numpy(dtype=float) width = summary_df["bin_width_min"].to_numpy(dtype=float) counts = summary_df["attempt_count"].to_numpy(dtype=float) bars = ax.bar( left, counts, width=width, align="edge", color="#4C78A8", # edgecolor="white", # linewidth=1.0, ) title = "Distribution of Student Attempts Over Elapsed Time" if student_idx is not None: title = f"{title} (student_idx={student_idx})" ax.set_title(title) ax.set_xlabel("Elapsed Time Since Student's First Attempt") ax.set_ylabel("Number of Attempts") x_min = float(left.min()) x_max = float((left + width).max()) if plot_upper_limit_minutes is not None: x_max = min(x_max, float(plot_upper_limit_minutes)) ax.set_xlim(x_min, x_max) ax.xaxis.set_major_locator(MaxNLocator(nbins=9)) ax.xaxis.set_major_formatter(FuncFormatter(format_minutes_tick)) ax.grid(axis="y", alpha=0.25, linewidth=0.8) ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) if log_y: ax.set_yscale("log") annotate_bars = len(summary_df) <= 40 if annotate_bars: for bar, pct in zip(bars, summary_df["percentage"]): if pct < 1.0: continue h = bar.get_height() if h <= 0: continue ax.annotate( f"{pct:.1f}%", xy=(bar.get_x() + bar.get_width() / 2.0, h), xytext=(0, 3), textcoords="offset points", ha="center", va="bottom", fontsize=8, ) plt.tight_layout() fig.savefig(output_path, dpi=400, bbox_inches="tight") plt.close(fig) def main() -> None: args = parse_args() if not args.interactions_path.exists(): raise FileNotFoundError( f"Interactions file not found: {args.interactions_path}" ) if args.bin_time <= 0: raise ValueError("--bin-time must be a positive number.") if args.plot_upper_limit_minutes is not None and args.plot_upper_limit_minutes <= 0: raise ValueError("--plot-upper-limit-minutes must be a positive number.") df = load_interactions(args.interactions_path, max_rows=args.max_rows) selected_student_id: str | None = None total_students = int(df["user_id"].nunique()) if args.student_idx is not None: df, selected_student_id, total_students = select_student_by_index( df, args.student_idx, ) output_plot_path = args.output_plot output_counts_path = args.output_counts if selected_student_id is not None: output_plot_path = append_student_id_to_output_path( output_plot_path, selected_student_id, ) output_counts_path = append_student_id_to_output_path( output_counts_path, selected_student_id, ) elapsed_minutes = compute_attempt_elapsed_minutes(df) summary = summarize_attempt_distribution(elapsed_minutes, args.bin_time) output_counts_path.parent.mkdir(parents=True, exist_ok=True) summary.to_csv(output_counts_path, index=False) plot_distribution( summary, output_plot_path, log_y=args.log_y, plot_upper_limit_minutes=args.plot_upper_limit_minutes, student_idx=args.student_idx, ) total_attempts = int(summary["attempt_count"].sum()) print("Done.") print(f"Interactions loaded: {len(df):,}") print(f"Students in loaded data: {total_students:,}") if selected_student_id is not None: print(f"Selected student idx: {args.student_idx}") print(f"Selected student id: {selected_student_id}") print(f"Attempts used: {total_attempts:,}") print(f"Bin width (min): {args.bin_time}") print(f"Saved plot: {output_plot_path}") print(f"Saved bin counts: {output_counts_path}") if __name__ == "__main__": main()