FoundationalASSIST / Code /plot_student_attempt_distribution.py

Upload folder using huggingface_hub

6256eb9 verified 13 days ago

11.6 kB

	#!/usr/bin/env python3
	"""Plot distribution of student attempts over elapsed time.

	This script reads FoundationalASSIST `Interactions.csv`, computes elapsed time
	for each attempt from the student's first attempt, groups attempts into fixed
	time bins, and plots the resulting column distribution.
	"""

	from __future__ import annotations

	import argparse
	import math
	from pathlib import Path

	import matplotlib.pyplot as plt
	import pandas as pd
	from matplotlib.ticker import FuncFormatter, MaxNLocator


	# Default locations are resolved relative to the directory two levels above
	# this script: input under "Data", generated artifacts under "Results".
	DEFAULT_INTERACTIONS_PATH = Path(__file__).resolve().parent.parent.joinpath(
	    "Data", "Interactions.csv"
	)
	DEFAULT_OUTPUT_PLOT = Path(__file__).resolve().parent.parent.joinpath(
	    "Results", "student_attempt_distribution.png"
	)
	DEFAULT_OUTPUT_COUNTS = Path(__file__).resolve().parent.parent.joinpath(
	    "Results", "student_attempt_distribution_counts.csv"
	)


	def parse_args() -> argparse.Namespace:
	    """Parse the script's command-line options.

	    Options cover the input CSV path, the plot and counts output paths, an
	    optional row cap, the bin width in minutes (default 10.0), an optional
	    x-axis upper limit, an optional 0-based student index, and a log-y flag.

	    Returns:
	        The populated ``argparse.Namespace``.
	    """
	    description = (
	        "Compute distribution of attempts over elapsed time from "
	        "Interactions.csv and plot binned columns."
	    )
	    bin_time_help = (
	        "Fixed bin width in minutes. "
	        "For example, --bin-time 10 creates bins [0,10), [10,20), ..."
	    )
	    upper_limit_help = (
	        "Optional upper limit for x-axis in minutes. "
	        "If omitted, uses the full range implied by bins."
	    )
	    student_idx_help = (
	        "Optional 0-based index of student to plot. Index is based on "
	        "sorted unique user_id values in the loaded interactions."
	    )

	    parser = argparse.ArgumentParser(description=description)
	    add_option = parser.add_argument

	    # Input / output locations.
	    add_option(
	        "--interactions-path",
	        type=Path,
	        default=DEFAULT_INTERACTIONS_PATH,
	        help="Path to Interactions.csv.",
	    )
	    add_option(
	        "--output-plot",
	        type=Path,
	        default=DEFAULT_OUTPUT_PLOT,
	        help="Path to save the output figure.",
	    )
	    add_option(
	        "--output-counts",
	        type=Path,
	        default=DEFAULT_OUTPUT_COUNTS,
	        help="Path to save binned attempt counts as CSV.",
	    )

	    # Data selection and binning.
	    add_option(
	        "--max-rows",
	        type=int,
	        default=None,
	        help="Optional cap on rows after sorting (for quick debugging).",
	    )
	    add_option("--bin-time", type=float, default=10.0, help=bin_time_help)
	    add_option(
	        "--plot-upper-limit-minutes",
	        type=float,
	        default=None,
	        help=upper_limit_help,
	    )
	    add_option("--student-idx", type=int, default=None, help=student_idx_help)

	    # Presentation.
	    add_option("--log-y", action="store_true", help="Use log scale on y-axis.")

	    return parser.parse_args()


	def load_interactions(path: Path, max_rows: int | None = None) -> pd.DataFrame:
	    """Load the columns required for student attempt timing analysis.

	    Reads ``id``, ``user_id`` and ``end_time`` from the CSV at *path*, drops
	    rows whose ``user_id`` or ``end_time`` is missing or cannot be parsed,
	    and sorts the remainder by ``(user_id, end_time, id)`` with a stable sort.

	    Args:
	        path: CSV file containing at least the three columns listed above.
	        max_rows: Optional cap on the number of rows kept after sorting.

	    Returns:
	        DataFrame with integer ``id`` (``-1`` where the value was missing or
	        non-numeric), string-typed ``user_id`` and UTC ``end_time``.

	    Raises:
	        ValueError: If *max_rows* is given and is not a positive integer.
	    """
	    # Reject an invalid cap up front rather than after reading and sorting
	    # the whole file.
	    if max_rows is not None and max_rows <= 0:
	        raise ValueError("--max-rows must be a positive integer.")

	    usecols = ["id", "user_id", "end_time"]
	    df = pd.read_csv(path, usecols=usecols, low_memory=False)

	    # Unparseable ids become -1 so the column can stay integer-typed; it is
	    # only used as the final sort key.
	    df["id"] = pd.to_numeric(df["id"], errors="coerce").fillna(-1).astype(int)
	    df["user_id"] = df["user_id"].astype("string")
	    df["end_time"] = pd.to_datetime(df["end_time"], errors="coerce", utc=True)

	    df = df.dropna(subset=["user_id", "end_time"]).copy()
	    df = df.sort_values(["user_id", "end_time", "id"], kind="mergesort")

	    if max_rows is not None:
	        df = df.head(max_rows).copy()

	    return df


	def select_student_by_index(
	    df: pd.DataFrame,
	    student_idx: int,
	) -> tuple[pd.DataFrame, str, int]:
	    """Pick a single student's rows by 0-based position among unique IDs.

	    Unique ``user_id`` values are enumerated in order of first appearance in
	    *df*; *student_idx* indexes into that list.

	    Returns:
	        A tuple of (copy of the student's rows, the student ID as ``str``,
	        number of unique students in *df*).

	    Raises:
	        ValueError: If *df* has no students or *student_idx* is out of range.
	    """
	    unique_ids = df["user_id"].drop_duplicates().tolist()
	    student_count = len(unique_ids)

	    if student_count == 0:
	        raise ValueError("No students found in loaded interactions.")

	    out_of_range = student_idx < 0 or student_idx >= student_count
	    if out_of_range:
	        raise ValueError(
	            f"--student-idx must be in [0, {student_count - 1}], got {student_idx}."
	        )

	    chosen_id = str(unique_ids[student_idx])
	    belongs_to_student = df["user_id"] == chosen_id
	    student_rows = df[belongs_to_student].copy()
	    return student_rows, chosen_id, student_count


	def append_student_id_to_output_path(path: Path, student_id: str) -> Path:
	    """Return *path* with ``_<student id>`` inserted before the file suffix.

	    Every character of *student_id* that is not alphanumeric, ``-`` or ``_``
	    is replaced by ``_`` so the result is safe to use in a filename.
	    """
	    sanitized_chars = []
	    for char in student_id:
	        is_safe = char.isalnum() or char in ("-", "_")
	        sanitized_chars.append(char if is_safe else "_")
	    suffix_id = "".join(sanitized_chars)
	    return path.with_name(f"{path.stem}_{suffix_id}{path.suffix}")


	def compute_attempt_elapsed_minutes(df: pd.DataFrame) -> pd.Series:
	    """Return minutes elapsed between each attempt and its student's first.

	    For every row, the earliest ``end_time`` within the same ``user_id`` is
	    subtracted from the row's ``end_time``. The result is a float Series
	    named ``elapsed_minutes`` aligned with *df*'s index.
	    """
	    per_student = df.groupby("user_id", sort=False)
	    earliest_attempt = per_student["end_time"].transform("min")
	    seconds_since_first = (df["end_time"] - earliest_attempt).dt.total_seconds()
	    return (seconds_since_first / 60.0).rename("elapsed_minutes")


	def build_fixed_width_bin_edges_minutes(
	    valid_elapsed_minutes: pd.Series,
	    bin_time_minutes: float,
	) -> list[float]:
	    """Return equally spaced bin edges that cover all elapsed-minute values.

	    The first edge is the largest multiple of *bin_time_minutes* not above
	    the minimum value; the last edge is the smallest multiple not below the
	    maximum. One more bin is appended when the maximum sits exactly on the
	    last edge, so the maximum still falls inside a bin when bins are treated
	    as left-closed/right-open.
	    """
	    lowest = float(valid_elapsed_minutes.min())
	    highest = float(valid_elapsed_minutes.max())

	    first_edge = bin_time_minutes * math.floor(lowest / bin_time_minutes)
	    last_edge = bin_time_minutes * math.ceil(highest / bin_time_minutes)

	    # Normalise a near-zero start to exactly 0.0.
	    if math.isclose(first_edge, 0.0, abs_tol=1e-12):
	        first_edge = 0.0
	    # Guarantee at least one bin when every value lands on the same edge.
	    if math.isclose(last_edge, first_edge, abs_tol=1e-12):
	        last_edge = first_edge + bin_time_minutes

	    bin_count = int(round((last_edge - first_edge) / bin_time_minutes))
	    edges = []
	    for step in range(bin_count + 1):
	        edges.append(first_edge + step * bin_time_minutes)

	    if edges[-1] <= highest:
	        edges += [edges[-1] + bin_time_minutes]

	    return edges


	def format_minutes_tick(value: float, _pos: float) -> str:
	    """Format a tick position given in minutes as whole minutes, hours or days.

	    Values below 60 are shown in minutes (``m``), values below 1440 in hours
	    (``h``), and everything else in days (``d``); the number is rounded to
	    zero decimals. ``_pos`` is accepted for the tick-formatter call signature
	    and is not used.
	    """
	    if value < 60:
	        amount, unit = value, "m"
	    elif value < 1440:
	        amount, unit = value / 60, "h"
	    else:
	        amount, unit = value / 1440, "d"
	    return f"{amount:.0f}{unit}"


	def summarize_attempt_distribution(
	    elapsed_minutes: pd.Series,
	    bin_time_minutes: float,
	) -> pd.DataFrame:
	    """Bin elapsed-minute values and tabulate counts and shares per bin.

	    Missing values are dropped first. Bins are fixed-width and left-closed
	    (``right=False``).

	    Returns:
	        DataFrame with one row per bin and the columns ``bin_left_min``,
	        ``bin_right_min``, ``bin_width_min``, ``attempt_count``,
	        ``probability`` and ``percentage`` (probability times 100).

	    Raises:
	        ValueError: If no non-missing elapsed values remain.
	    """
	    usable = elapsed_minutes.dropna().copy()
	    if usable.empty:
	        raise ValueError("No valid elapsed attempt times found.")

	    edges = build_fixed_width_bin_edges_minutes(usable, bin_time_minutes)
	    bin_labels = pd.cut(usable, bins=edges, right=False, include_lowest=True)
	    per_bin_counts = bin_labels.value_counts(sort=False)
	    attempts_in_bins = int(per_bin_counts.sum())
	    share = (per_bin_counts / attempts_in_bins).astype(float)

	    lower_edges = pd.Series(edges[:-1], dtype=float)
	    upper_edges = pd.Series(edges[1:], dtype=float)

	    columns = {
	        "bin_left_min": lower_edges.to_numpy(),
	        "bin_right_min": upper_edges.to_numpy(),
	        "bin_width_min": (upper_edges - lower_edges).to_numpy(),
	        "attempt_count": per_bin_counts.to_numpy(),
	        "probability": share.to_numpy(dtype=float),
	        "percentage": share.to_numpy(dtype=float) * 100.0,
	    }
	    return pd.DataFrame(columns)


	def plot_distribution(
	    summary_df: pd.DataFrame,
	    output_path: Path,
	    log_y: bool = False,
	    plot_upper_limit_minutes: float | None = None,
	    student_idx: int | None = None,
	) -> None:
	    """Draw the binned attempt counts as columns and save the figure.

	    Args:
	        summary_df: Per-bin table providing ``bin_left_min``,
	            ``bin_width_min``, ``attempt_count`` and ``percentage``.
	        output_path: Image file to write; parent directories are created.
	        log_y: Use a logarithmic y-axis when true.
	        plot_upper_limit_minutes: Optional cap on the right x-axis limit, in
	            minutes. The axis never extends past the last bin edge.
	        student_idx: When given, it is appended to the title and a narrower
	            figure (10x5 instead of 20x5 inches) is used.
	    """
	    output_path.parent.mkdir(parents=True, exist_ok=True)

	    left = summary_df["bin_left_min"].to_numpy(dtype=float)
	    width = summary_df["bin_width_min"].to_numpy(dtype=float)
	    counts = summary_df["attempt_count"].to_numpy(dtype=float)

	    figsize = (10, 5) if student_idx is not None else (20, 5)

	    # Apply the style only while this figure is built and saved, so global
	    # rcParams are not left modified for the rest of the process.
	    with plt.style.context("seaborn-v0_8-whitegrid"):
	        fig, ax = plt.subplots(figsize=figsize)
	        try:
	            bars = ax.bar(
	                left,
	                counts,
	                width=width,
	                align="edge",
	                color="#4C78A8",
	            )

	            title = "Distribution of Student Attempts Over Elapsed Time"
	            if student_idx is not None:
	                title = f"{title} (student_idx={student_idx})"
	            ax.set_title(title)
	            ax.set_xlabel("Elapsed Time Since Student's First Attempt")
	            ax.set_ylabel("Number of Attempts")

	            x_min = float(left.min())
	            x_max = float((left + width).max())
	            if plot_upper_limit_minutes is not None:
	                x_max = min(x_max, float(plot_upper_limit_minutes))
	            ax.set_xlim(x_min, x_max)

	            ax.xaxis.set_major_locator(MaxNLocator(nbins=9))
	            ax.xaxis.set_major_formatter(FuncFormatter(format_minutes_tick))
	            ax.grid(axis="y", alpha=0.25, linewidth=0.8)
	            ax.spines["top"].set_visible(False)
	            ax.spines["right"].set_visible(False)

	            if log_y:
	                ax.set_yscale("log")

	            # Percentage labels are only drawn when there are few enough
	            # bins (at most 40); bars below 1% or with no height are skipped.
	            if len(summary_df) <= 40:
	                for bar, pct in zip(bars, summary_df["percentage"]):
	                    if pct < 1.0:
	                        continue
	                    height = bar.get_height()
	                    if height <= 0:
	                        continue
	                    ax.annotate(
	                        f"{pct:.1f}%",
	                        xy=(bar.get_x() + bar.get_width() / 2.0, height),
	                        xytext=(0, 3),
	                        textcoords="offset points",
	                        ha="center",
	                        va="bottom",
	                        fontsize=8,
	                    )

	            # Lay out this figure explicitly rather than whichever figure
	            # pyplot currently considers active.
	            fig.tight_layout()
	            fig.savefig(output_path, dpi=400, bbox_inches="tight")
	        finally:
	            # Release the figure even if drawing or saving fails.
	            plt.close(fig)


	def main() -> None:
	    """Run the command-line workflow end to end.

	    Validates the arguments, loads the interactions, optionally narrows them
	    to one student, writes the per-bin counts CSV and the plot, then prints a
	    short summary of what was processed and where the outputs were saved.

	    Raises:
	        FileNotFoundError: If the interactions file does not exist.
	        ValueError: If ``--bin-time`` or ``--plot-upper-limit-minutes`` is
	            not positive.
	    """
	    args = parse_args()

	    if not args.interactions_path.exists():
	        raise FileNotFoundError(
	            f"Interactions file not found: {args.interactions_path}"
	        )
	    if args.bin_time <= 0:
	        raise ValueError("--bin-time must be a positive number.")
	    if args.plot_upper_limit_minutes is not None and args.plot_upper_limit_minutes <= 0:
	        raise ValueError("--plot-upper-limit-minutes must be a positive number.")

	    df = load_interactions(args.interactions_path, max_rows=args.max_rows)
	    # Capture the row count now: df may be narrowed to one student below, and
	    # the summary should report what was actually loaded.
	    interactions_loaded = len(df)

	    selected_student_id: str | None = None
	    total_students = int(df["user_id"].nunique())
	    if args.student_idx is not None:
	        df, selected_student_id, total_students = select_student_by_index(
	            df,
	            args.student_idx,
	        )

	    output_plot_path = args.output_plot
	    output_counts_path = args.output_counts
	    if selected_student_id is not None:
	        # Per-student runs get their own output files so they do not
	        # overwrite the all-students outputs.
	        output_plot_path = append_student_id_to_output_path(
	            output_plot_path,
	            selected_student_id,
	        )
	        output_counts_path = append_student_id_to_output_path(
	            output_counts_path,
	            selected_student_id,
	        )

	    elapsed_minutes = compute_attempt_elapsed_minutes(df)
	    summary = summarize_attempt_distribution(elapsed_minutes, args.bin_time)
	    output_counts_path.parent.mkdir(parents=True, exist_ok=True)
	    summary.to_csv(output_counts_path, index=False)

	    plot_distribution(
	        summary,
	        output_plot_path,
	        log_y=args.log_y,
	        plot_upper_limit_minutes=args.plot_upper_limit_minutes,
	        student_idx=args.student_idx,
	    )

	    total_attempts = int(summary["attempt_count"].sum())
	    print("Done.")
	    print(f"Interactions loaded: {interactions_loaded:,}")
	    print(f"Students in loaded data: {total_students:,}")
	    if selected_student_id is not None:
	        print(f"Selected student idx: {args.student_idx}")
	        print(f"Selected student id: {selected_student_id}")
	    print(f"Attempts used: {total_attempts:,}")
	    print(f"Bin width (min): {args.bin_time}")
	    print(f"Saved plot: {output_plot_path}")
	    print(f"Saved bin counts: {output_counts_path}")


	if __name__ == "__main__":
	    main()