# Author: TWLab
# Commit e2b220f (verified): Add publication-ready ML project structure with full source code
"""
Utility functions: configuration loading, logging setup, reproducibility.
"""
from __future__ import annotations
import logging
import os
import random
from pathlib import Path
from typing import Any, Dict
import numpy as np
import torch
import yaml
def set_seed(seed: int = 42) -> None:
    """Seed every random number generator used by the project.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), exports ``PYTHONHASHSEED`` and switches
    cuDNN into deterministic mode.

    Args:
        seed: Value handed to each generator. Defaults to 42.
    """
    # Same seed for every library, applied in a fixed order.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Environment variables must be strings.
    os.environ["PYTHONHASHSEED"] = f"{seed}"

    # Trade speed for repeatability: fixed cuDNN kernels, no autotuning.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
def load_yaml_config(path: str | Path) -> Dict[str, Any]:
    """Load a YAML configuration file and validate its top-level shape.

    Args:
        path: Location of the YAML file.

    Returns:
        The parsed configuration mapping. A file that parses to nothing
        (empty, comments only, or a bare ``null``) yields an empty dict.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the top level of the document is not a mapping.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Config not found: {path}")

    # Be explicit about the encoding instead of relying on the platform default.
    with open(path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # safe_load gives None for an empty document; honour the Dict return type.
    if config is None:
        return {}
    if not isinstance(config, dict):
        raise ValueError(
            f"Config must be a YAML mapping at the top level, "
            f"got {type(config).__name__}: {path}"
        )
    return config
def setup_logging(
    level: str = "INFO",
    log_file: str | Path | None = None,
) -> None:
    """Configure root logging for the project.

    Installs a stream handler and, when ``log_file`` is given, a file
    handler as well. Any handlers already attached to the root logger are
    replaced (``force=True``).

    Args:
        level: Name of a standard logging level, case-insensitive
            (e.g. ``"debug"``, ``"INFO"``).
        log_file: Optional path of a log file. Missing parent directories
            are created.

    Raises:
        ValueError: If ``level`` does not name a standard logging level.
    """
    # Resolve the level before touching the filesystem, so a bad level name
    # cannot leave a directory or an open file handle behind.
    numeric_level = getattr(logging, level.upper(), None)
    if isinstance(numeric_level, bool) or not isinstance(numeric_level, int):
        raise ValueError(f"Unknown logging level: {level!r}")

    handlers: list[logging.Handler] = [logging.StreamHandler()]
    if log_file:
        log_file = Path(log_file)
        log_file.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding so non-ASCII messages are written the same way
        # on every platform.
        handlers.append(logging.FileHandler(log_file, encoding="utf-8"))

    logging.basicConfig(
        level=numeric_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=handlers,
        force=True,
    )
def get_device() -> str:
    """Pick the preferred compute device available to PyTorch.

    Returns:
        ``"cuda"`` when CUDA is available, otherwise ``"mps"`` when the MPS
        backend exists and reports itself available, otherwise ``"cpu"``.
    """
    if torch.cuda.is_available():
        return "cuda"

    # The mps attribute is looked up defensively, as it may be absent.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"

    return "cpu"
def format_metrics_table(
    metrics_dict: Dict[str, Dict[str, float]],
    fmt: str = ".4f",
) -> str:
    """Render nested metrics as a markdown table.

    Args:
        metrics_dict: Outer keys become table rows; the keys of each inner
            dict become columns.
        fmt: Float format spec passed to ``to_markdown`` as ``floatfmt``.

    Returns:
        The table as a markdown string.
    """
    # Imported lazily so pandas is only needed when a table is requested.
    import pandas as pd

    # DataFrame(dict) puts the outer keys on columns; transpose to get rows.
    by_column = pd.DataFrame(metrics_dict)
    table = by_column.transpose()
    markdown = table.to_markdown(floatfmt=fmt)
    return markdown