DattaIndranuj / attendance_core.py
IndranujDatta's picture
Upload 6 files
b3271a4 verified
import os
import sqlite3
from datetime import datetime
import numpy as np
import pandas as pd
DB_PATH = os.environ.get("ATTENDANCE_DB_PATH", "attendance.db")
def connect(db_path: str = DB_PATH):
return sqlite3.connect(db_path, check_same_thread=False)
def init_db(conn: sqlite3.Connection):
cur = conn.cursor()
cur.execute("""
CREATE TABLE IF NOT EXISTS attendance (
StudentID TEXT,
StudentName TEXT,
Subject TEXT,
Date TEXT,
Status TEXT CHECK(Status IN ('Present','Absent')),
PRIMARY KEY (StudentID, Subject, Date)
);
""")
conn.commit()
def normalize_status(x: str) -> str:
if x is None:
return "Absent"
x = str(x).strip().lower()
if x in ["p", "present", "1", "yes", "y"]:
return "Present"
return "Absent"
def load_csv_to_df(file_path: str) -> pd.DataFrame:
df = pd.read_csv(file_path)
expected = {"StudentID", "Subject", "Date", "Status"}
missing = expected - set(map(str, df.columns))
if missing:
raise ValueError(f"Missing required columns: {missing}. Required: {sorted(expected)}")
# Optional column for names
if "StudentName" not in df.columns:
df["StudentName"] = df["StudentID"]
# normalize
df["Status"] = df["Status"].apply(normalize_status)
# validate date
pd.to_datetime(df["Date"], errors="raise")
return df[["StudentID", "StudentName", "Subject", "Date", "Status"]]
def ingest_df(conn: sqlite3.Connection, df: pd.DataFrame):
init_db(conn)
df = df.copy()
df["Status"] = df["Status"].apply(normalize_status)
# ensure Date is string YYYY-MM-DD
df["Date"] = pd.to_datetime(df["Date"]).dt.strftime("%Y-%m-%d")
df.to_sql("attendance", conn, if_exists="append", index=False)
# remove duplicates by replacing with most recent
cur = conn.cursor()
cur.execute("""
CREATE TEMP TABLE t AS
SELECT * FROM attendance
GROUP BY StudentID, Subject, Date
""")
cur.execute("DELETE FROM attendance")
cur.execute("INSERT INTO attendance SELECT * FROM t")
cur.execute("DROP TABLE t")
conn.commit()
def get_dataframe(conn: sqlite3.Connection,
start_date: str | None = None,
end_date: str | None = None,
subject: str | None = None,
student_id: str | None = None) -> pd.DataFrame:
init_db(conn)
q = "SELECT StudentID, StudentName, Subject, Date, Status FROM attendance WHERE 1=1"
params = []
if start_date:
q += " AND Date >= ?"
params.append(start_date)
if end_date:
q += " AND Date <= ?"
params.append(end_date)
if subject:
q += " AND Subject = ?"
params.append(subject)
if student_id:
q += " AND StudentID = ?"
params.append(student_id)
q += " ORDER BY Date, Subject, StudentID"
df = pd.read_sql_query(q, conn, params=params)
return df
def compute_percentages_numpy(df: pd.DataFrame) -> pd.DataFrame:
# per student overall percentage using NumPy
if df.empty:
return pd.DataFrame(columns=["StudentID", "StudentName", "TotalDays", "PresentDays", "AttendancePercent"])
# map present to 1, absent to 0
status_arr = (df["Status"].to_numpy() == "Present").astype(int)
# group indices by StudentID
by_student = df.groupby(["StudentID", "StudentName"]).indices
records = []
for (sid, sname), idx in by_student.items():
vals = status_arr[idx]
total = vals.size
present = int(vals.sum())
pct = (present / total) * 100 if total else 0.0
records.append((sid, sname, total, present, pct))
out = pd.DataFrame(records, columns=["StudentID", "StudentName", "TotalDays", "PresentDays", "AttendancePercent"])
out = out.sort_values(["AttendancePercent", "StudentID"], ascending=[True, True]).reset_index(drop=True)
return out
def summary_per_subject(df: pd.DataFrame) -> pd.DataFrame:
if df.empty:
return pd.DataFrame(columns=["Subject", "TotalClasses", "AvgAttendancePercent"])
# per (Student, Subject) attendance percent then average across students
df_bin = df.assign(Present=(df["Status"] == "Present").astype(int))
per_ss = (df_bin.groupby(["StudentID", "StudentName", "Subject"])
.agg(Total=("Present","size"), Present=("Present","sum"))
.reset_index())
per_ss["AttendancePercent"] = per_ss["Present"]/per_ss["Total"]*100
# subject summary
subject_summary = (per_ss.groupby("Subject")
.agg(TotalClasses=("Total","sum"),
AvgAttendancePercent=("AttendancePercent","mean"))
.reset_index()
.sort_values("Subject"))
return subject_summary
def students_below_threshold(df: pd.DataFrame, threshold: float=75.0) -> pd.DataFrame:
overall = compute_percentages_numpy(df)
low = overall[overall["AttendancePercent"] < threshold].reset_index(drop=True)
return low
def export_csv(df: pd.DataFrame, path: str):
df.to_csv(path, index=False)
return path