Spaces:
Running
Running
Upload data_loader.py with huggingface_hub
Browse files- data_loader.py +65 -0
data_loader.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Load data from HF Dataset with Streamlit caching."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import streamlit as st
|
| 9 |
+
from huggingface_hub import hf_hub_download
|
| 10 |
+
|
| 11 |
+
DATASET_REPO = "buckeyeguy/osc-usage-data"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@st.cache_data(ttl=300)
def load_data() -> tuple[pd.DataFrame, pd.DataFrame, dict]:
    """Download Parquet + metadata from HF Dataset. Cached for 5 min.

    Returns:
        (jobs, snapshots, metadata): job records and cluster snapshots as
        DataFrames, plus the dict parsed from ``metadata.json``.
    """
    jobs_path = hf_hub_download(repo_id=DATASET_REPO, filename="jobs.parquet", repo_type="dataset")
    snapshots_path = hf_hub_download(
        repo_id=DATASET_REPO, filename="snapshots.parquet", repo_type="dataset"
    )
    metadata_path = hf_hub_download(
        repo_id=DATASET_REPO, filename="metadata.json", repo_type="dataset"
    )

    jobs = pd.read_parquet(jobs_path)
    snapshots = pd.read_parquet(snapshots_path)
    # JSON is UTF-8 by spec (RFC 8259); pin the encoding so the read does not
    # depend on the platform's locale default (e.g. cp1252 on Windows).
    with open(metadata_path, encoding="utf-8") as f:
        metadata = json.load(f)

    # Ensure datetime columns (Parquet may round-trip them as object/strings).
    for col in ["submit_time", "start_time", "end_time"]:
        if col in jobs.columns:
            jobs[col] = pd.to_datetime(jobs[col])

    # Derived columns for the dashboard's date/period groupings.
    if "end_time" in jobs.columns:
        jobs["end_date"] = jobs["end_time"].dt.date
        jobs["end_month"] = jobs["end_time"].dt.to_period("M").astype(str)
        jobs["end_dow"] = jobs["end_time"].dt.dayofweek  # 0=Mon
        jobs["end_hour"] = jobs["end_time"].dt.hour

    if "walltime_used" in jobs.columns:
        # assumes walltime_used is in seconds — TODO confirm against the
        # upstream export that produces jobs.parquet
        jobs["walltime_hours"] = jobs["walltime_used"] / 3600.0

    return jobs, snapshots, metadata
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def filter_jobs(
    jobs: pd.DataFrame,
    date_range: tuple | None = None,
    projects: list[str] | None = None,
    users: list[str] | None = None,
    systems: list[str] | None = None,
) -> pd.DataFrame:
    """Apply sidebar filters to jobs DataFrame."""
    filtered = jobs.copy()

    # Date window on the derived end_date column; skipped when the column
    # is absent or no range was chosen.
    if date_range and "end_date" in filtered.columns:
        lo, hi = date_range[0], date_range[1]
        filtered = filtered[filtered["end_date"].between(lo, hi)]

    # Membership filters — a None/empty selection means "keep everything".
    for column, selection in (
        ("project_code", projects),
        ("username", users),
        ("system_code", systems),
    ):
        if selection:
            filtered = filtered[filtered[column].isin(selection)]

    return filtered
|