| | import re |
| | import sys |
| | import time |
| | from pathlib import Path |
| | from typing import TYPE_CHECKING |
| |
|
| | import huggingface_hub |
| | import numpy as np |
| | import pandas as pd |
| | from huggingface_hub.constants import HF_HOME |
| |
|
| | if TYPE_CHECKING: |
| | from trackio.commit_scheduler import CommitScheduler |
| | from trackio.dummy_commit_scheduler import DummyCommitScheduler |
| |
|
# Keys that users may not log as metric names because they are used internally
# by trackio's storage schema.
RESERVED_KEYS = ["project", "run", "timestamp", "step", "time", "metrics"]
# Local directory (under the Hugging Face cache home) where trackio data is stored.
TRACKIO_DIR = Path(HF_HOME) / "trackio"

# Directory bundled with this package that contains static assets (e.g. the logo).
TRACKIO_LOGO_DIR = Path(__file__).parent / "assets"
| |
|
| |
|
| | def generate_readable_name(used_names: list[str], space_id: str | None = None) -> str: |
| | """ |
| | Generates a random, readable name like "dainty-sunset-0". |
| | If space_id is provided, generates username-timestamp format instead. |
| | """ |
| | if space_id is not None: |
| | username = huggingface_hub.whoami()["name"] |
| | timestamp = int(time.time()) |
| | return f"{username}-{timestamp}" |
| | adjectives = [ |
| | "dainty", |
| | "brave", |
| | "calm", |
| | "eager", |
| | "fancy", |
| | "gentle", |
| | "happy", |
| | "jolly", |
| | "kind", |
| | "lively", |
| | "merry", |
| | "nice", |
| | "proud", |
| | "quick", |
| | "hugging", |
| | "silly", |
| | "tidy", |
| | "witty", |
| | "zealous", |
| | "bright", |
| | "shy", |
| | "bold", |
| | "clever", |
| | "daring", |
| | "elegant", |
| | "faithful", |
| | "graceful", |
| | "honest", |
| | "inventive", |
| | "jovial", |
| | "keen", |
| | "lucky", |
| | "modest", |
| | "noble", |
| | "optimistic", |
| | "patient", |
| | "quirky", |
| | "resourceful", |
| | "sincere", |
| | "thoughtful", |
| | "upbeat", |
| | "valiant", |
| | "warm", |
| | "youthful", |
| | "zesty", |
| | "adventurous", |
| | "breezy", |
| | "cheerful", |
| | "delightful", |
| | "energetic", |
| | "fearless", |
| | "glad", |
| | "hopeful", |
| | "imaginative", |
| | "joyful", |
| | "kindly", |
| | "luminous", |
| | "mysterious", |
| | "neat", |
| | "outgoing", |
| | "playful", |
| | "radiant", |
| | "spirited", |
| | "tranquil", |
| | "unique", |
| | "vivid", |
| | "wise", |
| | "zany", |
| | "artful", |
| | "bubbly", |
| | "charming", |
| | "dazzling", |
| | "earnest", |
| | "festive", |
| | "gentlemanly", |
| | "hearty", |
| | "intrepid", |
| | "jubilant", |
| | "knightly", |
| | "lively", |
| | "magnetic", |
| | "nimble", |
| | "orderly", |
| | "peaceful", |
| | "quick-witted", |
| | "robust", |
| | "sturdy", |
| | "trusty", |
| | "upstanding", |
| | "vibrant", |
| | "whimsical", |
| | ] |
| | nouns = [ |
| | "sunset", |
| | "forest", |
| | "river", |
| | "mountain", |
| | "breeze", |
| | "meadow", |
| | "ocean", |
| | "valley", |
| | "sky", |
| | "field", |
| | "cloud", |
| | "star", |
| | "rain", |
| | "leaf", |
| | "stone", |
| | "flower", |
| | "bird", |
| | "tree", |
| | "wave", |
| | "trail", |
| | "island", |
| | "desert", |
| | "hill", |
| | "lake", |
| | "pond", |
| | "grove", |
| | "canyon", |
| | "reef", |
| | "bay", |
| | "peak", |
| | "glade", |
| | "marsh", |
| | "cliff", |
| | "dune", |
| | "spring", |
| | "brook", |
| | "cave", |
| | "plain", |
| | "ridge", |
| | "wood", |
| | "blossom", |
| | "petal", |
| | "root", |
| | "branch", |
| | "seed", |
| | "acorn", |
| | "pine", |
| | "willow", |
| | "cedar", |
| | "elm", |
| | "falcon", |
| | "eagle", |
| | "sparrow", |
| | "robin", |
| | "owl", |
| | "finch", |
| | "heron", |
| | "crane", |
| | "duck", |
| | "swan", |
| | "fox", |
| | "wolf", |
| | "bear", |
| | "deer", |
| | "moose", |
| | "otter", |
| | "beaver", |
| | "lynx", |
| | "hare", |
| | "badger", |
| | "butterfly", |
| | "bee", |
| | "ant", |
| | "beetle", |
| | "dragonfly", |
| | "firefly", |
| | "ladybug", |
| | "moth", |
| | "spider", |
| | "worm", |
| | "coral", |
| | "kelp", |
| | "shell", |
| | "pebble", |
| | "face", |
| | "boulder", |
| | "cobble", |
| | "sand", |
| | "wavelet", |
| | "tide", |
| | "current", |
| | "mist", |
| | ] |
| | number = 0 |
| | name = f"{adjectives[0]}-{nouns[0]}-{number}" |
| | while name in used_names: |
| | number += 1 |
| | adjective = adjectives[number % len(adjectives)] |
| | noun = nouns[number % len(nouns)] |
| | name = f"{adjective}-{noun}-{number}" |
| | return name |
| |
|
| |
|
def block_except_in_notebook():
    """Block the calling thread forever unless running interactively.

    In an interactive session (notebook/REPL, detected via `sys.ps1` or the
    interpreter's interactive flag) this returns immediately; otherwise it
    sleeps in a loop until interrupted by Ctrl-C (or an OSError).
    """
    if bool(getattr(sys, "ps1", sys.flags.interactive)):
        return
    try:
        while True:
            time.sleep(0.1)
    except (KeyboardInterrupt, OSError):
        print("Keyboard interruption in main thread... closing dashboard.")
| |
|
| |
|
def simplify_column_names(columns: list[str]) -> dict[str, str]:
    """
    Simplifies column names to first 10 alphanumeric or "/" characters with unique suffixes.

    Args:
        columns: List of original column names

    Returns:
        Dictionary mapping original column names to simplified names
    """
    mapping: dict[str, str] = {}
    taken: set[str] = set()

    for original in columns:
        # Keep only letters, digits, and "/" then truncate to 10 characters.
        stripped = re.sub(r"[^a-zA-Z0-9/]", "", original)
        base = stripped[:10] if stripped else f"col_{len(taken)}"

        # Append _1, _2, ... until the simplified name is unique.
        unique, suffix = base, 1
        while unique in taken:
            unique = f"{base}_{suffix}"
            suffix += 1

        mapping[original] = unique
        taken.add(unique)

    return mapping
| |
|
| |
|
def print_dashboard_instructions(project: str) -> None:
    """
    Prints instructions for viewing the Trackio dashboard.

    Args:
        project: The name of the project to show dashboard for.
    """
    # ANSI escape codes: bold + yellow for the CLI command, then reset.
    highlight = "\033[1m\033[93m"
    reset = "\033[0m"

    print("* View dashboard by running in your terminal:")
    print(f'{highlight}trackio show --project "{project}"{reset}')
    print(f'* or by running in Python: trackio.show(project="{project}")')
| |
|
| |
|
| | def preprocess_space_and_dataset_ids( |
| | space_id: str | None, dataset_id: str | None |
| | ) -> tuple[str | None, str | None]: |
| | if space_id is not None and "/" not in space_id: |
| | username = huggingface_hub.whoami()["name"] |
| | space_id = f"{username}/{space_id}" |
| | if dataset_id is not None and "/" not in dataset_id: |
| | username = huggingface_hub.whoami()["name"] |
| | dataset_id = f"{username}/{dataset_id}" |
| | if space_id is not None and dataset_id is None: |
| | dataset_id = f"{space_id}-dataset" |
| | return space_id, dataset_id |
| |
|
| |
|
def fibo():
    """Generator for Fibonacci backoff: 1, 1, 2, 3, 5, 8, ..."""
    prev, curr = 0, 1
    while True:
        yield curr
        prev, curr = curr, prev + curr
| |
|
| |
|
# Fixed palette of run colors; cycled through when there are more runs than colors.
COLOR_PALETTE = [
    "#3B82F6",
    "#EF4444",
    "#10B981",
    "#F59E0B",
    "#8B5CF6",
    "#EC4899",
    "#06B6D4",
    "#84CC16",
    "#F97316",
    "#6366F1",
]


def get_color_mapping(runs: list[str], smoothing: bool) -> dict[str, str]:
    """Generate color mapping for runs, with transparency for original data when smoothing is enabled."""
    mapping: dict[str, str] = {}
    palette_size = len(COLOR_PALETTE)

    for position, run_name in enumerate(runs):
        hex_color = COLOR_PALETTE[position % palette_size]

        if not smoothing:
            mapping[run_name] = hex_color
        else:
            # Smoothed line gets the solid color; raw data gets ~30% alpha ("4D").
            mapping[f"{run_name}_smoothed"] = hex_color
            mapping[f"{run_name}_original"] = hex_color + "4D"

    return mapping
| |
|
| |
|
def downsample(
    df: pd.DataFrame,
    x: str,
    y: str,
    color: str | None,
    x_lim: tuple[float, float] | None = None,
) -> pd.DataFrame:
    """
    Downsample a DataFrame for plotting while preserving visual extremes.

    Rows are grouped by the optional `color` column (one series per group);
    within each series the x-range is split into bins and only the rows with
    the minimum and maximum `y` per bin are kept. Small series (< 500 rows)
    are kept in full.

    Args:
        df: Input data.
        x: Name of the column used for the horizontal axis / binning.
        y: Name of the column whose per-bin extremes are preserved.
        color: Optional column identifying separate series; each series is
            downsampled independently. Ignored if not present in df.
        x_lim: Optional (min, max) window on x. One point just outside each
            edge is kept so plotted lines extend to the window boundary.

    Returns:
        A downsampled copy of df sorted by x with a fresh integer index.
    """
    if df.empty:
        return df

    # Work on a copy restricted to only the columns the plot needs.
    columns_to_keep = [x, y]
    if color is not None and color in df.columns:
        columns_to_keep.append(color)
    df = df[columns_to_keep].copy()

    n_bins = 100

    if color is not None and color in df.columns:
        groups = df.groupby(color)
    else:
        # No grouping column: treat the whole frame as one series.
        groups = [(None, df)]

    # Original df index labels of the rows to keep.
    downsampled_indices = []

    for _, group_df in groups:
        if group_df.empty:
            continue

        group_df = group_df.sort_values(x)

        if x_lim is not None:
            x_min, x_max = x_lim
            # Keep the nearest point on either side of the window so lines
            # continue past the plot edges, then clip to the window.
            before_point = group_df[group_df[x] < x_min].tail(1)
            after_point = group_df[group_df[x] > x_max].head(1)
            group_df = group_df[(group_df[x] >= x_min) & (group_df[x] <= x_max)]
        else:
            before_point = after_point = None
            x_min = group_df[x].min()
            x_max = group_df[x].max()

        if before_point is not None and not before_point.empty:
            downsampled_indices.extend(before_point.index.tolist())
        if after_point is not None and not after_point.empty:
            downsampled_indices.extend(after_point.index.tolist())

        # Everything may have been clipped out by x_lim.
        if group_df.empty:
            continue

        # Degenerate range (single x value): keep just the y extremes.
        if x_min == x_max:
            min_y_idx = group_df[y].idxmin()
            max_y_idx = group_df[y].idxmax()
            if min_y_idx != max_y_idx:
                downsampled_indices.extend([min_y_idx, max_y_idx])
            else:
                downsampled_indices.append(min_y_idx)
            continue

        # Small series: no downsampling needed.
        if len(group_df) < 500:
            downsampled_indices.extend(group_df.index.tolist())
            continue

        # Assign each row to one of n_bins equal-width x bins.
        bins = np.linspace(x_min, x_max, n_bins + 1)
        group_df["bin"] = pd.cut(
            group_df[x], bins=bins, labels=False, include_lowest=True
        )

        # Per bin, keep the rows with the min and max y values.
        for bin_idx in group_df["bin"].dropna().unique():
            bin_data = group_df[group_df["bin"] == bin_idx]
            if bin_data.empty:
                continue

            min_y_idx = bin_data[y].idxmin()
            max_y_idx = bin_data[y].idxmax()

            downsampled_indices.append(min_y_idx)
            if min_y_idx != max_y_idx:
                downsampled_indices.append(max_y_idx)

    # Deduplicate (a row can be both a bin extreme and an edge point).
    unique_indices = list(set(downsampled_indices))

    downsampled_df = df.loc[unique_indices].copy()
    downsampled_df = downsampled_df.sort_values(x).reset_index(drop=True)
    # "bin" was only ever added to per-group copies; drop defensively.
    downsampled_df = downsampled_df.drop(columns=["bin"], errors="ignore")

    return downsampled_df
| |
|
| |
|
def sort_metrics_by_prefix(metrics: list[str]) -> list[str]:
    """
    Sort metrics by grouping prefixes together for dropdown/list display.
    Metrics without prefixes come first, then grouped by prefix.

    Args:
        metrics: List of metric names

    Returns:
        List of metric names sorted by prefix

    Example:
        Input: ["train/loss", "loss", "train/acc", "val/loss"]
        Output: ["loss", "train/acc", "train/loss", "val/loss"]
    """
    grouped = group_metrics_by_prefix(metrics)

    # Un-prefixed metrics (the "charts" group) lead the list.
    ordered: list[str] = list(grouped.get("charts", []))

    # Remaining groups follow in alphabetical prefix order.
    for prefix in sorted(grouped):
        if prefix == "charts":
            continue
        ordered.extend(grouped[prefix])

    return ordered
| |
|
| |
|
def group_metrics_by_prefix(metrics: list[str]) -> dict[str, list[str]]:
    """
    Group metrics by their prefix. Metrics without prefix go to 'charts' group.

    Each group's metrics are sorted alphabetically, and prefixed groups are
    emitted in alphabetical prefix order (after 'charts', when present).

    Args:
        metrics: List of metric names

    Returns:
        Dictionary with prefix names as keys and lists of metrics as values

    Example:
        Input: ["loss", "accuracy", "train/loss", "train/acc", "val/loss"]
        Output: {
            "charts": ["accuracy", "loss"],
            "train": ["train/acc", "train/loss"],
            "val": ["val/loss"]
        }
    """
    # Un-prefixed metrics, alphabetized, become the "charts" group.
    bare = sorted(m for m in metrics if "/" not in m)

    # Bucket prefixed metrics by the text before the first "/".
    buckets: dict[str, list[str]] = {}
    for metric in metrics:
        if "/" in metric:
            buckets.setdefault(metric.split("/")[0], []).append(metric)

    groups: dict[str, list[str]] = {}
    if bare:
        groups["charts"] = bare
    for prefix in sorted(buckets):
        groups[prefix] = sorted(buckets[prefix])

    return groups
| |
|
| |
|
def group_metrics_with_subprefixes(metrics: list[str]) -> dict:
    """
    Group metrics with simple 2-level nested structure detection.

    Returns a dictionary where each prefix group can have:
    - direct_metrics: list of metrics at this level (e.g., "train/acc")
    - subgroups: dict of subgroup name -> list of metrics (e.g., "loss" -> ["train/loss/norm", "train/loss/unnorm"])

    Example:
        Input: ["loss", "train/acc", "train/loss/normalized", "train/loss/unnormalized", "val/loss"]
        Output: {
            "charts": {
                "direct_metrics": ["loss"],
                "subgroups": {}
            },
            "train": {
                "direct_metrics": ["train/acc"],
                "subgroups": {
                    "loss": ["train/loss/normalized", "train/loss/unnormalized"]
                }
            },
            "val": {
                "direct_metrics": ["val/loss"],
                "subgroups": {}
            }
        }
    """
    grouped: dict = {}

    def _bucket(key: str) -> dict:
        # Lazily create the {direct_metrics, subgroups} entry for a group.
        return grouped.setdefault(key, {"direct_metrics": [], "subgroups": {}})

    for metric in metrics:
        pieces = metric.split("/")
        if len(pieces) == 1:
            # No prefix: belongs to the top-level "charts" group.
            _bucket("charts")["direct_metrics"].append(metric)
        elif len(pieces) == 2:
            _bucket(pieces[0])["direct_metrics"].append(metric)
        else:
            # Three or more segments: nest under prefix -> subprefix.
            _bucket(pieces[0])["subgroups"].setdefault(pieces[1], []).append(metric)

    # Alphabetize every metric list for stable display.
    for entry in grouped.values():
        entry["direct_metrics"].sort()
        for subgroup in entry["subgroups"].values():
            subgroup.sort()

    # Drop an empty "charts" group if one was somehow created without metrics.
    if "charts" in grouped and not grouped["charts"]["direct_metrics"]:
        del grouped["charts"]

    return grouped
| |
|
| |
|
| | def get_sync_status(scheduler: "CommitScheduler | DummyCommitScheduler") -> int | None: |
| | """Get the sync status from the CommitScheduler in an integer number of minutes, or None if not synced yet.""" |
| | if getattr( |
| | scheduler, "last_push_time", None |
| | ): |
| | time_diff = time.time() - scheduler.last_push_time |
| | return int(time_diff / 60) |
| | else: |
| | return None |
| |
|