buckeyeguy committed on
Commit
73b302f
·
verified ·
1 Parent(s): 4875f79

Upload data_loader.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data_loader.py +65 -0
data_loader.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Load data from HF Dataset with Streamlit caching."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ import pandas as pd
8
+ import streamlit as st
9
+ from huggingface_hub import hf_hub_download
10
+
11
+ DATASET_REPO = "buckeyeguy/osc-usage-data"
12
+
13
+
14
@st.cache_data(ttl=300)
def load_data() -> tuple[pd.DataFrame, pd.DataFrame, dict]:
    """Download Parquet + metadata from the HF Dataset and prepare the jobs table.

    The result is cached by Streamlit for 5 minutes (``ttl=300``).

    Returns:
        A ``(jobs, snapshots, metadata)`` tuple:

        * ``jobs`` -- contents of ``jobs.parquet``. Any of ``submit_time``,
          ``start_time`` and ``end_time`` that are present are converted with
          ``pd.to_datetime``. When ``end_time`` exists, ``end_date``,
          ``end_month``, ``end_dow`` and ``end_hour`` are added; when
          ``walltime_used`` exists, ``walltime_hours`` is added.
        * ``snapshots`` -- contents of ``snapshots.parquet``, unmodified.
        * ``metadata`` -- parsed contents of ``metadata.json``.
    """
    jobs_path = hf_hub_download(
        repo_id=DATASET_REPO, filename="jobs.parquet", repo_type="dataset"
    )
    snapshots_path = hf_hub_download(
        repo_id=DATASET_REPO, filename="snapshots.parquet", repo_type="dataset"
    )
    metadata_path = hf_hub_download(
        repo_id=DATASET_REPO, filename="metadata.json", repo_type="dataset"
    )

    jobs = pd.read_parquet(jobs_path)
    snapshots = pd.read_parquet(snapshots_path)
    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # host's locale encoding.
    with open(metadata_path, encoding="utf-8") as f:
        metadata = json.load(f)

    # Ensure datetime columns
    for col in ["submit_time", "start_time", "end_time"]:
        if col in jobs.columns:
            jobs[col] = pd.to_datetime(jobs[col])

    # Add derived columns
    if "end_time" in jobs.columns:
        end_time = jobs["end_time"]
        jobs["end_date"] = end_time.dt.date
        jobs["end_month"] = end_time.dt.to_period("M").astype(str)
        jobs["end_dow"] = end_time.dt.dayofweek  # 0=Mon
        jobs["end_hour"] = end_time.dt.hour

    if "walltime_used" in jobs.columns:
        # NOTE(review): dividing by 3600 presumes walltime_used is in
        # seconds -- confirm against the dataset producer.
        jobs["walltime_hours"] = jobs["walltime_used"] / 3600.0

    return jobs, snapshots, metadata
46
+
47
+
48
def filter_jobs(
    jobs: pd.DataFrame,
    date_range: tuple | None = None,
    projects: list[str] | None = None,
    users: list[str] | None = None,
    systems: list[str] | None = None,
) -> pd.DataFrame:
    """Apply sidebar filters to jobs DataFrame.

    Every filter is optional; a filter that is ``None`` or empty is skipped.

    Args:
        jobs: Jobs table. It is copied first, so the caller's frame is never
            modified.
        date_range: ``(start, end)`` bounds, both inclusive, compared against
            the ``end_date`` column. A one-element ``(start,)`` range (a
            range selection that is only half made) keeps rows on or after
            ``start`` instead of raising ``IndexError``. Ignored when the
            frame has no ``end_date`` column.
        projects: Values of ``project_code`` to keep.
        users: Values of ``username`` to keep.
        systems: Values of ``system_code`` to keep.

    Returns:
        A new DataFrame holding only the rows that pass every active filter.
    """
    df = jobs.copy()
    if date_range and "end_date" in df.columns:
        start = date_range[0]
        if len(date_range) > 1:
            df = df[df["end_date"].between(start, date_range[1])]
        else:
            # Only the start of the range is known: apply the lower bound
            # alone rather than indexing a missing end value.
            df = df[df["end_date"] >= start]
    if projects:
        df = df[df["project_code"].isin(projects)]
    if users:
        df = df[df["username"].isin(users)]
    if systems:
        df = df[df["system_code"].isin(systems)]
    return df