import pandas as pd   # For working with dataframes
import datasets       # To upload our data to HF
import huggingface_hub  # For HF authentication in notebooks
from openai import OpenAI
import os
import re             # Needed by parse_user_due_utc below
import numpy as np
import math, joblib, warnings
import torch, torch.nn as nn, torch.optim as optim
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr
from datetime import datetime, timezone, timedelta
# load data
ds_dict = datasets.load_dataset("samder03/Project1")
ds = ds_dict["original"]
df = ds.to_pandas() if hasattr(ds, "to_pandas") else ds.copy()
imp_col = "importance (1-10)"
dur_col = "how long it takes (hours)"
hor_col = "when it's due (days)"
# embeddings
embedder = SentenceTransformer("all-MiniLM-L6-v2") # 384-dim
embeddings = embedder.encode(df['task'].astype(str).tolist(), normalize_embeddings=True).astype(np.float32) # (N,384)
df["task_embedding"] = list(embeddings)
def valid_mask(series: pd.Series):
return series.notna() & (series.astype(str).str.strip() != "")
m_imp = valid_mask(df[imp_col])
m_dur = valid_mask(df[dur_col])
m_core = m_imp & m_dur
X_all = np.stack(df.loc[m_core, "task_embedding"].values).astype(np.float32)
y_imp = df.loc[m_core, imp_col].astype(float).values
y_dur = df.loc[m_core, dur_col].astype(float).values
if hor_col in df.columns:
    # blanks become NaN; -1 encodes "no deadline label"
    y_hor_raw = pd.to_numeric(df[hor_col], errors="coerce").values
    y_hor_raw[y_hor_raw < 0] = np.nan
    y_hor = y_hor_raw[m_core.values]
else:
    y_hor = None
rng = 42
idx_all = np.arange(len(X_all))
# first: carve out 20% for test
train_idx, test_idx = train_test_split(idx_all, test_size=0.20, random_state=rng)
# then: from the remaining 80%, carve out 12.5% for val (=> 10% overall)
train_idx, val_idx = train_test_split(train_idx, test_size=0.125, random_state=rng)
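# Net effect of the two splits: 70% train / 10% val / 20% test (0.125 of the remaining 80% = 10%).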
# slice features/labels
X_tr, X_va, X_te = X_all[train_idx], X_all[val_idx], X_all[test_idx]
I_tr, I_va, I_te = y_imp[train_idx], y_imp[val_idx], y_imp[test_idx]
D_tr, D_va, D_te = y_dur[train_idx], y_dur[val_idx], y_dur[test_idx]
# horizon: -1 already converted to NaN upstream
if y_hor is not None:
H_tr_all, H_va_all, H_te_all = y_hor[train_idx], y_hor[val_idx], y_hor[test_idx]
mH_tr = np.isfinite(H_tr_all) # True where horizon label exists
mH_va = np.isfinite(H_va_all)
mH_te = np.isfinite(H_te_all)
else:
H_tr_all = H_va_all = H_te_all = None
mH_tr = mH_va = mH_te = None
# ---------- 5) Scale embeddings (fit on TRAIN only) ----------
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_tr) # fit only on train to avoid leakage
Xs_tr = scaler.transform(X_tr).astype(np.float32)
Xs_va = scaler.transform(X_va).astype(np.float32)
Xs_te = scaler.transform(X_te).astype(np.float32)
# persist for inference
import joblib
joblib.dump(scaler, "mtl_scaler.joblib")
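# The same scaler is reloaded via joblib in reorder_tasks() below, so inference-time
# embeddings get the identical standardization that training saw.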
import numpy as np, random
import torch, torch.nn as nn, torch.optim as optim
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
# 0) Setup: device, seeds, helper
torch.manual_seed(42); np.random.seed(42); random.seed(42)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(42)
torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def tt(a, dtype=torch.float32): return torch.from_numpy(a).to(device).to(dtype)
def safe_any_mask(m):
return (m is not None) and isinstance(m, torch.Tensor) and m.numel() > 0 and m.any().item()
def safe_spearman(a, b):
r = spearmanr(a, b).correlation
return float('nan') if r is None else float(r)
Xt_tr, Xt_va, Xt_te = tt(Xs_tr), tt(Xs_va), tt(Xs_te)
yI_tr, yI_va, yI_te = tt(I_tr), tt(I_va), tt(I_te)
yD_tr, yD_va, yD_te = tt(D_tr), tt(D_va), tt(D_te)
if y_hor is not None:
    H_tr_all_t, H_va_all_t, H_te_all_t = tt(H_tr_all), tt(H_va_all), tt(H_te_all)
    mH_tr_t = torch.from_numpy(mH_tr.astype(bool)).to(device)
    mH_va_t = torch.from_numpy(mH_va.astype(bool)).to(device)
    mH_te_t = torch.from_numpy(mH_te.astype(bool)).to(device)
else:
    # define these as None so the `is not None` checks below don't raise NameError
    H_tr_all_t = H_va_all_t = H_te_all_t = None
    mH_tr_t = mH_va_t = mH_te_t = None
# 3) Model: Multi-Task MLP (shared trunk + 3 heads)
# Slightly wider trunk; textbook uncertainty weighting (0.5 factor)
class MTLNet(nn.Module):
def __init__(self, d_in, d_hid=512):
super().__init__()
self.trunk = nn.Sequential(
nn.Linear(d_in, d_hid), nn.ReLU(), nn.Dropout(0.2),
nn.Linear(d_hid, 256), nn.ReLU(), nn.Dropout(0.1),
)
self.head_imp = nn.Linear(256, 1) # importance (raw)
self.head_dur = nn.Linear(256, 1) # log-hours
self.head_hor = nn.Linear(256, 1) # log-days
# homoscedastic uncertainty (log sigma per task)
self.log_sigma_imp = nn.Parameter(torch.tensor(0.0))
self.log_sigma_dur = nn.Parameter(torch.tensor(0.0))
self.log_sigma_hor = nn.Parameter(torch.tensor(0.0))
self._L1 = nn.SmoothL1Loss()
self._MSE = nn.MSELoss()
def forward(self, x):
h = self.trunk(x)
return (
self.head_imp(h).squeeze(-1),
self.head_dur(h).squeeze(-1),
self.head_hor(h).squeeze(-1),
)
def multitask_loss(self, xb, yI, yD, yH=None, mH=None):
rI, rD, rH = self(xb)
# importance: SmoothL1 on raw scale
l_imp = self._L1(rI, yI)
# duration: MSE on log1p(hours)
l_dur = self._MSE(rD, torch.log1p(yD))
loss = 0.5*torch.exp(-self.log_sigma_imp)*l_imp + self.log_sigma_imp \
+ 0.5*torch.exp(-self.log_sigma_dur)*l_dur + self.log_sigma_dur
l_hor_val = None
if (yH is not None) and safe_any_mask(mH):
l_hor = self._MSE(rH[mH], torch.log1p(yH[mH]))
loss = loss + 0.5*torch.exp(-self.log_sigma_hor)*l_hor + self.log_sigma_hor
l_hor_val = float(l_hor.item())
return loss, (float(l_imp.item()), float(l_dur.item()), l_hor_val)
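# A note on the weighting above (a gloss, not part of the original recipe): following the
# homoscedastic-uncertainty idea of Kendall et al. (2018), each regression task i is weighted
# as L_i/(2*sigma_i^2) + log(sigma_i) with a learned per-task sigma. Here each `log_sigma_*`
# acts as a learned per-task log-variance, so a task contributes 0.5*exp(-log_sigma)*L + log_sigma:
# tasks the network finds noisier are down-weighted automatically instead of via hand-tuned weights.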
net = MTLNet(d_in=Xt_tr.shape[1]).to(device)
# 4) Prediction + Eval helpers
@torch.no_grad()
def predict_heads(Xt):
net.eval()
rI, rD, rH = net(Xt)
I = torch.clamp(rI, 1.0, 10.0) # importance 1..10
Hh = torch.expm1(rD).clamp(0.25, 12.0) # hours
Hd = torch.expm1(rH).clamp(0.0, 30.0) # days
return I, Hh, Hd
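# The clamp ranges above (importance 1-10, 0.25-12 hours, 0-30 days) are fixed post-hoc guards
# chosen as plausible output bounds; they are not constraints the model itself learned.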
def eval_block(Xt, yI_true, yD_true, yH_true=None, mH=None):
I, Hh, Hd = predict_heads(Xt)
I_np, H_np, Hd_np = I.detach().cpu().numpy(), Hh.detach().cpu().numpy(), Hd.detach().cpu().numpy()
yI_np, yD_np = yI_true.detach().cpu().numpy(), yD_true.detach().cpu().numpy()
maeI = mean_absolute_error(yI_np, I_np)
maeD = mean_absolute_error(yD_np, H_np)
rhoI = safe_spearman(yI_np, I_np) if len(I_np) > 1 else float('nan')
rhoD = safe_spearman(yD_np, H_np) if len(H_np) > 1 else float('nan')
out = {"maeI": maeI, "maeD": maeD, "rhoI": rhoI, "rhoD": rhoD}
if (yH_true is not None) and (mH is not None) and mH.any().item():
yH_np, mH_np = yH_true.detach().cpu().numpy(), mH.detach().cpu().numpy().astype(bool)
if mH_np.sum() > 0:
maeH = mean_absolute_error(yH_np[mH_np], Hd_np[mH_np])
rhoH = safe_spearman(yH_np[mH_np], Hd_np[mH_np]) if mH_np.sum() > 1 else float('nan')
out.update({"maeH": maeH, "rhoH": rhoH})
return out
# 5) Train loop with per-batch cosine, AMP (new API), early stop
EPOCHS = 120
BATCH = 64
best_val = float("inf")
patience = 20
bad = 0
opt = optim.AdamW(net.parameters(), lr=2e-4, weight_decay=2e-4)
sched = optim.lr_scheduler.CosineAnnealingWarmRestarts(opt, T_0=60, T_mult=2, eta_min=1e-6)
amp_scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available())  # renamed so it doesn't shadow the fitted StandardScaler
n_tr = Xt_tr.shape[0]
for ep in range(1, EPOCHS + 1):
net.train()
order = torch.randperm(n_tr, device=device)
tot_loss = 0.0
for s in range(0, n_tr, BATCH):
e = min(s + BATCH, n_tr)
idx = order[s:e]
xb, yi, yd = Xt_tr[idx], yI_tr[idx], yD_tr[idx]
yh, mh = (H_tr_all_t[idx], mH_tr_t[idx]) if H_tr_all_t is not None else (None, None)
opt.zero_grad(set_to_none=True)
with torch.autocast('cuda', enabled=torch.cuda.is_available()):
loss, (lI, lD, lH) = net.multitask_loss(xb, yi, yd, yh, mh)
        amp_scaler.scale(loss).backward()
        amp_scaler.unscale_(opt)
        torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
        amp_scaler.step(opt); amp_scaler.update()
# per-iteration cosine step (nice smooth LR curve)
progress = (s + BATCH) / max(n_tr, 1)
sched.step((ep - 1) + progress)
tot_loss += float(loss.item())
# ---- validation ----
stats_va = eval_block(
Xt_va, yI_va, yD_va,
(H_va_all_t if H_va_all_t is not None else None),
(mH_va_t if mH_va_t is not None else None)
)
total_val = stats_va["maeI"] + stats_va["maeD"] + (stats_va.get("maeH", 0.0))
    if ep % 5 == 0:
        lr_now = opt.param_groups[0]["lr"]
        extraH = f" hor={stats_va.get('maeH', float('nan')):.3f}" if "maeH" in stats_va else ""
        print(f"ep {ep:03d} | train loss {tot_loss:.3f} | lr {lr_now:.2e} | "
              f"val MAE imp={stats_va['maeI']:.3f} dur={stats_va['maeD']:.3f}{extraH}")
# ---- early stopping on summed MAE ----
if total_val < best_val - 1e-4:
best_val = total_val
bad = 0
torch.save(net.state_dict(), "mtl_net.pt")
else:
bad += 1
if bad >= patience:
break
# 6) TEST with best checkpoint + final confirmation
net.load_state_dict(torch.load("mtl_net.pt", map_location=device))
stats_te = eval_block(
Xt_te, yI_te, yD_te,
(H_te_all_t if H_te_all_t is not None else None),
(mH_te_t if mH_te_t is not None else None)
)
print("TEST:", {k: round(float(v), 3) for k, v in stats_te.items()})
from datetime import datetime, timedelta, timezone
import numpy as np
import pandas as pd
import torch
import joblib
# --- helpers reused by build_priority_todo() ---
def _parse_due_utc(s):
"""Parse ISO8601 to tz-aware UTC datetime; return None if empty/invalid."""
if not isinstance(s, str) or not s.strip():
return None
try:
dt = datetime.fromisoformat(s.strip())
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
else:
dt = dt.astimezone(timezone.utc)
return dt
except Exception:
return None
def _due_from_model(now_utc, h_days):
if h_days is None: return None
try:
if not np.isfinite(h_days): return None
except Exception:
return None
return now_utc + (timedelta(hours=2) if h_days <= 0 else timedelta(days=float(h_days)))
def _due_from_heuristic(now_utc, I, H_hours):
base = (10.0 - float(I)) / 3.0
size = float(H_hours) / 6.0
days = float(np.clip(base + size, 0.5, 14.0))
return now_utc + timedelta(days=days)
def parse_user_due_utc(s, *, local_tz=timezone.utc, eod_hour=23, eod_min=59):
"""
Parse user string -> (dt_utc, had_time)
- accepts ISO and common US formats
- naive -> local_tz
- date-only -> end-of-day local, then convert to UTC
Returns (None, False) if invalid.
"""
if not isinstance(s, str) or not s.strip():
return None, False
s = s.strip()
# heuristic: did they include a time?
had_time = bool(re.search(r"\d:\d|\dT\d", s))
# 1) try ISO first
try:
dt = datetime.fromisoformat(s)
if dt.tzinfo is None:
dt = dt.replace(tzinfo=local_tz)
return dt.astimezone(timezone.utc), (had_time or dt.time() != datetime.min.time())
except Exception:
pass
# 2) date-only patterns
for pat in ("%m/%d/%Y","%m/%d/%y","%Y-%m-%d","%Y/%m/%d","%m-%d-%Y","%m-%d-%y"):
try:
d = datetime.strptime(s, pat).replace(
hour=eod_hour, minute=eod_min, second=0, microsecond=0, tzinfo=local_tz
)
return d.astimezone(timezone.utc), False
except Exception:
pass
# 3) datetime patterns (naive -> local)
for pat in ("%m/%d/%Y %H:%M","%m/%d/%Y %H:%M:%S",
"%m/%d/%y %H:%M","%m/%d/%y %H:%M:%S",
"%Y-%m-%d %H:%M","%Y-%m-%d %H:%M:%S",
"%Y/%m/%d %H:%M","%Y/%m/%d %H:%M:%S"):
try:
d = datetime.strptime(s, pat).replace(tzinfo=local_tz)
return d.astimezone(timezone.utc), True
except Exception:
pass
return None, False
def _fmt_utc_for_display(dt_utc, had_time):
"""Uniform UTC display: date-only → YYYY-MM-DD; datetime → YYYY-MM-DDTHH:MM:SSZ."""
if had_time:
return dt_utc.strftime("%Y-%m-%dT%H:%M:%SZ")
return dt_utc.date().isoformat()
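# Illustrative behaviour of the two helpers above (hypothetical inputs, local_tz left at UTC):
#   parse_user_due_utc("10/6/25")          -> (2025-10-06 23:59 UTC, had_time=False)  # date-only => end of day
#   parse_user_due_utc("2025-10-06 14:30") -> (2025-10-06 14:30 UTC, had_time=True)
#   _fmt_utc_for_display(dt, had_time=False) then renders just "2025-10-06".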
def priority_score(I, H_hours, due_dt, now=None, wI=0.70, wDeadline=0.25, wDur=0.05):
if now is None:
now = datetime.now(timezone.utc)
hours_left = max((due_dt - now).total_seconds()/3600.0, 0.25)
if hours_left > 24:
deadline_pressure = min(1.0, 0.5*(1.0/(1.0 + hours_left/24.0)))
else:
deadline_pressure = min(1.0, 1.0/(1.0 + hours_left/6.0))
dur_pressure = min(1.0, float(H_hours)/2.0)
p01 = wI*(float(I)/10.0) + wDeadline*deadline_pressure + wDur*dur_pressure
return round(1 + 9*p01, 1)
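# Illustrative reading of the score (hypothetical numbers): importance 8, 2 h of work,
# ~12 h until the deadline gives
#   0.70*(8/10) + 0.25*(1/(1 + 12/6)) + 0.05*min(1, 2/2) ≈ 0.69  ->  1 + 9*0.69 ≈ 7.2
# so importance dominates, with deadline pressure ramping up sharply inside 24 hours.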
from datetime import datetime, timezone
def reorder_tasks(tasks_string, user_due_iso=None):
now = datetime.now(timezone.utc)
tasks = [t.strip() for t in str(tasks_string).splitlines() if t.strip()]
if not tasks:
# Return empty boxes and clear checkboxes
return "", "", "", gr.update(choices=[], value=[])
# split/normalize user dates (string or list)
if isinstance(user_due_iso, str) or user_due_iso is None:
due_lines = [s.strip() for s in str(user_due_iso or "").splitlines()]
due_lines = [d if d else None for d in due_lines]
else:
due_lines = [None if (d is None or str(d).strip()=="") else str(d).strip()
for d in list(user_due_iso)]
if len(due_lines) < len(tasks):
due_lines += [None] * (len(tasks) - len(due_lines))
elif len(due_lines) > len(tasks):
due_lines = due_lines[:len(tasks)]
# embed -> scale
X = embedder.encode(tasks, normalize_embeddings=True).astype(np.float32)
sc = joblib.load("mtl_scaler.joblib")
Xs = sc.transform(X).astype(np.float32)
# predict
net.eval()
with torch.no_grad():
rI, rD, rH = net(torch.from_numpy(Xs).to(device))
I = torch.clamp(rI, 1.0, 10.0).cpu().numpy()
H = torch.expm1(rD).clamp(0.25, 12.0).cpu().numpy() # duration (hrs), 0.25–12
Hd = torch.expm1(rH).clamp(0.0, 30.0).cpu().numpy() # horizon (days), 0–30
assumed_local_tz = timezone.utc
rows = []
for t, due_s, i_imp, h_hrs, h_days in zip(tasks, due_lines, I, H, Hd):
# parse user date (if provided)
due_user_utc, had_time = parse_user_due_utc(due_s, local_tz=assumed_local_tz) if due_s else (None, False)
# choose due_for_scores and display string
if due_user_utc is not None:
due_for_scores = due_user_utc
display_due_str = _fmt_utc_for_display(due_user_utc, had_time)
else:
due_for_scores = _due_from_model(now, float(h_days)) or _due_from_heuristic(now, float(i_imp), float(h_hrs))
display_due_str = _fmt_utc_for_display(due_for_scores, False)
# priority score
P = priority_score(float(i_imp), float(h_hrs), due_for_scores, now=now)
rows.append({
"task": t,
"display_due": display_due_str,
"suggested_due_iso": due_for_scores.isoformat(),
"duration_hrs": float(h_hrs), # keep raw number for sorting/formatting
"priority_1to10": P,
})
out = pd.DataFrame(rows).sort_values(
["priority_1to10", "suggested_due_iso"], ascending=[False, True]
).reset_index(drop=True)
choices = [f"{i+1}. {t}" for i, t in enumerate(out["task"].tolist())]
# Prepare the three text outputs
task_lines = "\n".join(out["task"].tolist())
due_lines_out = "\n".join(out["display_due"].tolist())
duration_lines = "\n".join(f"{d:.1f}" for d in out["duration_hrs"].tolist())
# IMPORTANT: don't wipe user selections each run
checkbox_update = gr.update(choices=choices)
return task_lines, due_lines_out, duration_lines, checkbox_update
import gradio as gr # For building the interface
with gr.Blocks() as demo:
gr.Markdown("# Automated Task Prioritizer")
gr.Markdown("This app will take your to-do list and reorder it based on importance, urgency, and duration.")
with gr.Tab("Task Entry"):
with gr.Row():
sample_tasks = gr.Textbox(label="Task List", lines=10, placeholder="One task per line")
due_dates = gr.Textbox(label="Due Date", lines=10, placeholder="One date per line (optional)")
run_btn = gr.Button("Prioritize")
gr.Examples(
examples=[[
"finish lab report before monday\n"
"email TA about grading\n"
"practice dance combo 20 minutes\n"
"apply to 3 jobs\n"
"review calculus problem set (10 problems)\n"
"call dentist to schedule appointment\n"
"draft 1-page cover letter\n"
"wash dishes\n"
"organize notes for history essay\n"
"watch tv show",
"10/6/25\n"
"10/4/25\n"
"10/4/25\n"
"\n"
"10/8/25\n"
"10/3/25\n"
"10/11/25\n"
"10/3/25\n"
"10/5/25\n"
]],
inputs=[sample_tasks, due_dates],
label="Example",
examples_per_page=1,
cache_examples=False,
)
with gr.Tab("Prioritized List"):
with gr.Row():
priority_task = gr.Textbox(label="Prioritized Task List", lines=10, interactive=False)
date_box = gr.Textbox(label="Due Date", lines=10, interactive=False)
durations_box = gr.Textbox(label="Duration (hrs)", lines=10, interactive=False)
done_boxes = gr.CheckboxGroup(label="Mark completed tasks", interactive=True) # <-- ensure interactive
# Wire up
run_btn.click(
fn=reorder_tasks,
inputs=[sample_tasks, due_dates],
outputs=[priority_task, date_box, durations_box, done_boxes]
)
if __name__ == "__main__":
demo.launch(debug=True)