"""Plot distribution of student attempts over elapsed time.

This script reads FoundationalASSIST `Interactions.csv`, computes elapsed time
for each attempt from the student's first attempt, groups attempts into fixed
time bins, and plots the resulting distribution as columns (one per bin).
"""
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import math |
| from pathlib import Path |
|
|
| import matplotlib.pyplot as plt |
| import pandas as pd |
| from matplotlib.ticker import FuncFormatter, MaxNLocator |
|
|
|
|
# Default locations are anchored two directory levels above this file so the
# script resolves the same paths regardless of the current working directory.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
_RESULTS_DIR = _PROJECT_ROOT / "Results"

# Input: the interactions table.
DEFAULT_INTERACTIONS_PATH = _PROJECT_ROOT / "Data" / "Interactions.csv"

# Outputs: the rendered figure and the per-bin counts table.
DEFAULT_OUTPUT_PLOT = _RESULTS_DIR / "student_attempt_distribution.png"
DEFAULT_OUTPUT_COUNTS = _RESULTS_DIR / "student_attempt_distribution_counts.csv"
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options for the attempt-distribution script.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse falls back to ``sys.argv[1:]``,
            so existing ``parse_args()`` calls behave exactly as before.
            Passing an explicit list allows programmatic use.

    Returns:
        Namespace with ``interactions_path``, ``output_plot``,
        ``output_counts``, ``max_rows``, ``bin_time``,
        ``plot_upper_limit_minutes``, ``student_idx`` and ``log_y``.
        Only type conversion is done here; no range checks are applied.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Compute distribution of attempts over elapsed time from "
            "Interactions.csv and plot binned columns."
        )
    )

    # Input / output locations.
    parser.add_argument(
        "--interactions-path",
        type=Path,
        default=DEFAULT_INTERACTIONS_PATH,
        help="Path to Interactions.csv.",
    )
    parser.add_argument(
        "--output-plot",
        type=Path,
        default=DEFAULT_OUTPUT_PLOT,
        help="Path to save the output figure.",
    )
    parser.add_argument(
        "--output-counts",
        type=Path,
        default=DEFAULT_OUTPUT_COUNTS,
        help="Path to save binned attempt counts as CSV.",
    )

    # Data selection and binning.
    parser.add_argument(
        "--max-rows",
        type=int,
        default=None,
        help="Optional cap on rows after sorting (for quick debugging).",
    )
    parser.add_argument(
        "--bin-time",
        type=float,
        default=10.0,
        help=(
            "Fixed bin width in minutes. "
            "For example, --bin-time 10 creates bins [0,10), [10,20), ..."
        ),
    )
    parser.add_argument(
        "--student-idx",
        type=int,
        default=None,
        help=(
            "Optional 0-based index of student to plot. Index is based on "
            "sorted unique user_id values in the loaded interactions."
        ),
    )

    # Presentation.
    parser.add_argument(
        "--plot-upper-limit-minutes",
        type=float,
        default=None,
        help=(
            "Optional upper limit for x-axis in minutes. "
            "If omitted, uses the full range implied by bins."
        ),
    )
    parser.add_argument(
        "--log-y",
        action="store_true",
        help="Use log scale on y-axis.",
    )
    return parser.parse_args(argv)
|
|
|
|
def load_interactions(path: Path, max_rows: int | None = None) -> pd.DataFrame:
    """Load fields required for student attempt timing analysis.

    Reads only the ``id``, ``user_id`` and ``end_time`` columns, normalizes
    their types, drops rows lacking a user or a parseable timestamp, and
    returns the rows ordered by ``user_id``, then ``end_time``, then ``id``.

    Args:
        path: CSV file to read.
        max_rows: Optional cap applied after sorting; ``None`` keeps all rows.

    Returns:
        The cleaned, sorted interactions (at most ``max_rows`` rows).

    Raises:
        ValueError: If ``max_rows`` is given and is not positive.
    """
    # Validate up front so a bad cap fails fast instead of after the whole
    # file has been read, parsed and sorted.
    if max_rows is not None and max_rows <= 0:
        raise ValueError("--max-rows must be a positive integer.")

    df = pd.read_csv(
        path,
        usecols=["id", "user_id", "end_time"],
        low_memory=False,
    )

    # Unparseable ids become -1 so they remain usable as an integer sort key.
    df["id"] = pd.to_numeric(df["id"], errors="coerce").fillna(-1).astype(int)
    df["user_id"] = df["user_id"].astype("string")
    # Unparseable timestamps become NaT and are dropped just below.
    df["end_time"] = pd.to_datetime(df["end_time"], errors="coerce", utc=True)

    df = df.dropna(subset=["user_id", "end_time"]).copy()
    # mergesort is stable, so full ties keep their order from the file.
    df = df.sort_values(["user_id", "end_time", "id"], kind="mergesort")

    if max_rows is not None:
        df = df.head(max_rows).copy()

    return df
|
|
|
|
def select_student_by_index(
    df: pd.DataFrame,
    student_idx: int,
) -> tuple[pd.DataFrame, str, int]:
    """Select one student's interactions by 0-based index over unique IDs.

    Unique ``user_id`` values are enumerated in order of first appearance in
    ``df``, so the index follows whatever ordering the caller gave the frame.

    Args:
        df: Interactions containing a ``user_id`` column.
        student_idx: 0-based position within the unique ``user_id`` values.

    Returns:
        A tuple ``(rows, student_id, total_students)``: a copy of the selected
        student's rows, the id rendered as ``str``, and the number of unique
        students in ``df``.

    Raises:
        ValueError: If ``df`` has no students or ``student_idx`` is out of
            range.
    """
    student_ids = df["user_id"].drop_duplicates().tolist()
    total_students = len(student_ids)

    if total_students == 0:
        raise ValueError("No students found in loaded interactions.")
    if not 0 <= student_idx < total_students:
        raise ValueError(
            f"--student-idx must be in [0, {total_students - 1}], got {student_idx}."
        )

    # Filter on the value as stored in the column. Comparing against its
    # str() form would match nothing when user_id is not a string column
    # (e.g. integer ids); only the returned id is stringified.
    chosen_id = student_ids[student_idx]
    selected_df = df[df["user_id"] == chosen_id].copy()
    return selected_df, str(chosen_id), total_students
|
|
|
|
def append_student_id_to_output_path(path: Path, student_id: str) -> Path:
    """Return ``path`` with ``_<student id>`` inserted before its suffix.

    Every character of ``student_id`` that is neither alphanumeric nor one of
    ``-`` / ``_`` is replaced by ``_`` so the result is a safe filename.
    """
    sanitized: list[str] = []
    for char in student_id:
        keep = char.isalnum() or char in ("-", "_")
        sanitized.append(char if keep else "_")
    id_token = "".join(sanitized)

    new_name = f"{path.stem}_{id_token}{path.suffix}"
    return path.with_name(new_name)
|
|
|
|
def compute_attempt_elapsed_minutes(df: pd.DataFrame) -> pd.Series:
    """Return minutes elapsed between each attempt and its student's first one.

    The result is aligned to ``df``'s index and named ``elapsed_minutes``; the
    earliest ``end_time`` per ``user_id`` is the zero point for that student.
    """
    per_student_start = df.groupby("user_id", sort=False)["end_time"].transform(
        "min"
    )
    since_start = df["end_time"] - per_student_start
    minutes = since_start.dt.total_seconds().div(60.0)
    return minutes.rename("elapsed_minutes")
|
|
|
|
def build_fixed_width_bin_edges_minutes(
    valid_elapsed_minutes: pd.Series,
    bin_time_minutes: float,
) -> list[float]:
    """Build fixed-width bin edges from min/max elapsed minutes.

    Edges are multiples of ``bin_time_minutes`` that enclose every value. The
    last edge is always strictly greater than the maximum, so the maximum
    falls inside a left-closed, right-open bin.

    Args:
        valid_elapsed_minutes: Elapsed minutes to cover.
        bin_time_minutes: Width of each bin in minutes.

    Returns:
        Ascending bin edges; ``len(edges) - 1`` bins.

    Raises:
        ValueError: If ``bin_time_minutes`` is not a positive finite number,
            or if the series has no finite minimum/maximum (e.g. it is empty).
    """
    # Reject bad widths explicitly: zero would divide by zero below, and a
    # negative width would produce an empty edge list.
    if not (math.isfinite(bin_time_minutes) and bin_time_minutes > 0):
        raise ValueError("bin_time_minutes must be a positive, finite number.")

    min_elapsed = float(valid_elapsed_minutes.min())
    max_elapsed = float(valid_elapsed_minutes.max())
    if not (math.isfinite(min_elapsed) and math.isfinite(max_elapsed)):
        raise ValueError(
            "valid_elapsed_minutes must be non-empty with a finite minimum "
            "and maximum."
        )

    # Snap the covered range outward to multiples of the bin width.
    start = bin_time_minutes * math.floor(min_elapsed / bin_time_minutes)
    end = bin_time_minutes * math.ceil(max_elapsed / bin_time_minutes)

    if math.isclose(start, 0.0, abs_tol=1e-12):
        start = 0.0  # normalizes -0.0 / tiny float noise to a clean zero
    if math.isclose(end, start, abs_tol=1e-12):
        end = start + bin_time_minutes  # always produce at least one bin

    n_bins = int(round((end - start) / bin_time_minutes))
    edges = [start + i * bin_time_minutes for i in range(n_bins + 1)]

    # Bins are right-open, so a maximum sitting exactly on the last edge
    # needs one more bin to be counted.
    if edges[-1] <= max_elapsed:
        edges.append(edges[-1] + bin_time_minutes)

    return edges
|
|
|
|
def _trim_number(amount: float) -> str:
    """Format ``amount`` with at most one decimal, dropping a trailing ``.0``."""
    return f"{amount:.1f}".rstrip("0").rstrip(".")


def format_minutes_tick(value: float, _pos: float) -> str:
    """Format an axis value given in minutes as a compact m/h/d label.

    Values below 60 are shown in minutes, below 1440 in hours, otherwise in
    days. One decimal place is kept when needed so that distinct ticks do not
    collapse onto the same label (90 and 150 minutes are "1.5h" and "2.5h",
    not both "2h"); whole values render without a decimal.

    Args:
        value: Tick position in minutes.
        _pos: Tick index supplied by the formatter protocol; unused.

    Returns:
        The label text, e.g. ``"30m"``, ``"1.5h"`` or ``"2d"``.
    """
    if value < 60:
        return f"{_trim_number(value)}m"
    if value < 1440:
        return f"{_trim_number(value / 60)}h"
    return f"{_trim_number(value / 1440)}d"
|
|
|
|
def summarize_attempt_distribution(
    elapsed_minutes: pd.Series,
    bin_time_minutes: float,
) -> pd.DataFrame:
    """Tabulate attempts per fixed-width elapsed-time bin.

    NaN entries are ignored. The remaining values are cut into left-closed,
    right-open bins of ``bin_time_minutes`` and counted.

    Returns:
        One row per bin with columns ``bin_left_min``, ``bin_right_min``,
        ``bin_width_min``, ``attempt_count``, ``probability`` (count divided
        by the total count) and ``percentage`` (probability times 100).

    Raises:
        ValueError: If no non-NaN elapsed times remain.
    """
    observed = elapsed_minutes.dropna().copy()
    if observed.empty:
        raise ValueError("No valid elapsed attempt times found.")

    bin_edges = build_fixed_width_bin_edges_minutes(observed, bin_time_minutes)

    # sort=False keeps the counts in bin order rather than frequency order.
    per_bin = pd.cut(
        observed,
        bins=bin_edges,
        right=False,
        include_lowest=True,
    ).value_counts(sort=False)
    n_attempts = int(per_bin.sum())
    share = (per_bin / n_attempts).astype(float)
    share_values = share.to_numpy(dtype=float)

    lower = pd.Series(bin_edges[:-1], dtype=float)
    upper = pd.Series(bin_edges[1:], dtype=float)

    return pd.DataFrame(
        {
            "bin_left_min": lower.to_numpy(),
            "bin_right_min": upper.to_numpy(),
            "bin_width_min": (upper - lower).to_numpy(),
            "attempt_count": per_bin.to_numpy(),
            "probability": share_values,
            "percentage": share_values * 100.0,
        }
    )
|
|
|
|
def plot_distribution(
    summary_df: pd.DataFrame,
    output_path: Path,
    log_y: bool = False,
    plot_upper_limit_minutes: float | None = None,
    student_idx: int | None = None,
) -> None:
    """Create and save student-attempt distribution columns.

    Args:
        summary_df: Per-bin table; the ``bin_left_min``, ``bin_width_min``,
            ``attempt_count`` and ``percentage`` columns are read.
        output_path: Image file to write; parent directories are created.
        log_y: Use a logarithmic y-axis when true.
        plot_upper_limit_minutes: Optional cap on the x-axis upper bound; the
            axis never extends past the right edge of the last bin.
        student_idx: When given, it is shown in the title and a narrower
            figure is used.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    left = summary_df["bin_left_min"].to_numpy(dtype=float)
    width = summary_df["bin_width_min"].to_numpy(dtype=float)
    counts = summary_df["attempt_count"].to_numpy(dtype=float)

    figsize = (10, 5) if student_idx is not None else (20, 5)

    # Apply the style only for this figure; plt.style.use() would change the
    # process-wide rcParams for every later plot.
    with plt.style.context("seaborn-v0_8-whitegrid"):
        fig, ax = plt.subplots(figsize=figsize)
        try:
            bars = ax.bar(
                left,
                counts,
                width=width,
                align="edge",
                color="#4C78A8",
            )

            title = "Distribution of Student Attempts Over Elapsed Time"
            if student_idx is not None:
                title = f"{title} (student_idx={student_idx})"
            ax.set_title(title)
            ax.set_xlabel("Elapsed Time Since Student's First Attempt")
            ax.set_ylabel("Number of Attempts")

            x_min = float(left.min())
            x_max = float((left + width).max())
            if plot_upper_limit_minutes is not None:
                x_max = min(x_max, float(plot_upper_limit_minutes))
            ax.set_xlim(x_min, x_max)

            ax.xaxis.set_major_locator(MaxNLocator(nbins=9))
            ax.xaxis.set_major_formatter(FuncFormatter(format_minutes_tick))
            ax.grid(axis="y", alpha=0.25, linewidth=0.8)
            ax.spines["top"].set_visible(False)
            ax.spines["right"].set_visible(False)

            if log_y:
                ax.set_yscale("log")

            # Percentage labels are only legible with a modest number of bars.
            if len(summary_df) <= 40:
                for bar, pct in zip(bars, summary_df["percentage"]):
                    height = bar.get_height()
                    # Skip negligible or empty bars to avoid clutter.
                    if pct < 1.0 or height <= 0:
                        continue
                    ax.annotate(
                        f"{pct:.1f}%",
                        xy=(bar.get_x() + bar.get_width() / 2.0, height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha="center",
                        va="bottom",
                        fontsize=8,
                    )

            # Lay out this figure explicitly rather than the implicit
            # "current" figure that plt.tight_layout() would target.
            fig.tight_layout()
            fig.savefig(output_path, dpi=400, bbox_inches="tight")
        finally:
            # Release the figure even if drawing or saving fails.
            plt.close(fig)
|
|
|
|
def main() -> None:
    """Run the pipeline: load, optionally filter to one student, bin, save.

    Writes the per-bin counts CSV and the plot, then prints a short summary.

    Raises:
        FileNotFoundError: If the interactions file does not exist.
        ValueError: If ``--bin-time`` or ``--plot-upper-limit-minutes`` is not
            positive (other errors from the called helpers propagate).
    """
    args = parse_args()

    if not args.interactions_path.exists():
        raise FileNotFoundError(
            f"Interactions file not found: {args.interactions_path}"
        )
    if args.bin_time <= 0:
        raise ValueError("--bin-time must be a positive number.")
    if args.plot_upper_limit_minutes is not None and args.plot_upper_limit_minutes <= 0:
        raise ValueError("--plot-upper-limit-minutes must be a positive number.")

    df = load_interactions(args.interactions_path, max_rows=args.max_rows)
    # Record the size before any per-student filtering so the summary reports
    # what was loaded, not what remained after selection.
    interactions_loaded = len(df)
    total_students = int(df["user_id"].nunique())

    selected_student_id: str | None = None
    output_plot_path = args.output_plot
    output_counts_path = args.output_counts
    if args.student_idx is not None:
        df, selected_student_id, total_students = select_student_by_index(
            df,
            args.student_idx,
        )
        # Per-student runs get the id in the filename so they do not
        # overwrite the all-students outputs.
        output_plot_path = append_student_id_to_output_path(
            output_plot_path,
            selected_student_id,
        )
        output_counts_path = append_student_id_to_output_path(
            output_counts_path,
            selected_student_id,
        )

    elapsed_minutes = compute_attempt_elapsed_minutes(df)
    summary = summarize_attempt_distribution(elapsed_minutes, args.bin_time)
    output_counts_path.parent.mkdir(parents=True, exist_ok=True)
    summary.to_csv(output_counts_path, index=False)

    plot_distribution(
        summary,
        output_plot_path,
        log_y=args.log_y,
        plot_upper_limit_minutes=args.plot_upper_limit_minutes,
        student_idx=args.student_idx,
    )

    total_attempts = int(summary["attempt_count"].sum())
    print("Done.")
    print(f"Interactions loaded: {interactions_loaded:,}")
    print(f"Students in loaded data: {total_students:,}")
    if selected_student_id is not None:
        print(f"Selected student idx: {args.student_idx}")
        print(f"Selected student id: {selected_student_id}")
    print(f"Attempts used: {total_attempts:,}")
    print(f"Bin width (min): {args.bin_time}")
    print(f"Saved plot: {output_plot_path}")
    print(f"Saved bin counts: {output_counts_path}")


if __name__ == "__main__":
    main()
|
|