buckeyeguy committed on
Commit
c19f3d3
·
verified ·
1 Parent(s): a2701e1

Upload data_loader.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data_loader.py +30 -7
data_loader.py CHANGED
@@ -43,19 +43,42 @@ def load_data() -> tuple[pd.DataFrame, pd.DataFrame, dict]:
43
  if "walltime_used" in jobs.columns:
44
  jobs["walltime_hours"] = jobs["walltime_used"] / 3600.0
45
 
46
- # Timeout classification — interactive vs batch
47
  if "launch_method" in jobs.columns and "last_state" in jobs.columns:
48
  import numpy as np
49
 
50
- from config import INTERACTIVE_METHODS
51
 
52
- is_timeout = jobs["last_state"] == "TIMEOUT"
53
  is_interactive = jobs["launch_method"].isin(INTERACTIVE_METHODS)
54
- jobs["timeout_category"] = np.where(
55
- ~is_timeout,
56
- jobs["last_state"],
57
- np.where(is_interactive, "Interactive Timeout", "Batch Timeout"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  )
 
 
 
 
 
 
 
 
 
 
59
 
60
  # Queue wait time
61
  if "submit_time" in jobs.columns and "start_time" in jobs.columns:
 
43
  if "walltime_used" in jobs.columns:
44
  jobs["walltime_hours"] = jobs["walltime_used"] / 3600.0
45
 
46
+ # Behavioral outcome classification
47
  if "launch_method" in jobs.columns and "last_state" in jobs.columns:
48
  import numpy as np
49
 
50
+ from config import INTERACTIVE_METHODS, QUICK_EXIT_SECONDS
51
 
 
52
  is_interactive = jobs["launch_method"].isin(INTERACTIVE_METHODS)
53
+ wt = jobs.get("walltime_used", pd.Series(dtype="float64"))
54
+ state = jobs["last_state"]
55
+
56
+ # Start with batch classification (maps exit state directly)
57
+ outcome = state.map(
58
+ {
59
+ "COMPLETED": "Completed",
60
+ "FAILED": "Failed",
61
+ "TIMEOUT": "Timed Out",
62
+ "OUT_OF_MEMORY": "Out of Memory",
63
+ }
64
+ ).fillna("Cancelled") # All CANCELLED variants + NODE_FAIL → "Cancelled"
65
+
66
+ # Override for interactive jobs
67
+ is_quick = is_interactive & (wt < QUICK_EXIT_SECONDS)
68
+ is_failed_interactive = is_interactive & state.isin({"FAILED", "OUT_OF_MEMORY"})
69
+ is_user_ended = (
70
+ is_interactive & ~is_quick & ~is_failed_interactive & state.str.startswith("CANCELLED")
71
  )
72
+ is_session_expired = (
73
+ is_interactive & ~is_quick & ~is_failed_interactive & (state == "TIMEOUT")
74
+ )
75
+
76
+ outcome = np.where(is_quick, "Quick Exit", outcome)
77
+ outcome = np.where(is_failed_interactive, "Failed", outcome)
78
+ outcome = np.where(is_user_ended, "User Ended", outcome)
79
+ outcome = np.where(is_session_expired, "Session Expired", outcome)
80
+
81
+ jobs["outcome_category"] = outcome
82
 
83
  # Queue wait time
84
  if "submit_time" in jobs.columns and "start_time" in jobs.columns: