Spaces:

DTanzillo
/

Inspiration-Health-Data

Sleeping

App Files Files Community

Inspiration-Health-Data / scripts /featureEngineering.py

DTanzillo

Upload 5 files

1cfa929 verified 5 months ago

raw

history blame contribute delete

2.31 kB

	import numpy as np
	import pandas as pd

	def parse_timepoint(timepoint: str) -> float:
	    """
	    Convert a timepoint label such as 'L-3', 'L0', 'R+0' or 'R+1' into a numeric
	    flight day on a stretched scale.

	    The 3 real days of flight are stretched to 30 days so that pre-launch and
	    post-return samples are visibly separated; positions between day 0 and
	    day 30 on the final chart therefore do not correspond to real sampling days.

	    Convention:
	        L-0 -> 0     (launch day = Flight Day 0)
	        L-3 -> -3    (3 days before launch)
	        R+0 -> 30    (last day in space, stretched to day 30)
	        R+1 -> 31    (first recovery day)
	        R+N -> N+30  (general rule for post-return days)

	    The input is stringified, stripped and upper-cased before parsing. Any '+'
	    or '-' in the label is discarded rather than interpreted: 'L' labels always
	    map to a non-positive day and 'R' labels to a day >= 30.

	    Returns:
	        An int for a recognised label, or np.nan when the label does not start
	        with 'L' or 'R', or when the text after the prefix is not an integer
	        (e.g. 'LAUNCH').
	    """
	    label = str(timepoint).strip().upper()

	    # Anything that is not a launch- or return-relative label is unknown.
	    if not label.startswith(("L", "R")):
	        return np.nan

	    prefix = label[0]
	    digits = label.replace(prefix, "").replace("+", "").replace("-", "")

	    try:
	        # A bare 'L' or 'R' counts as day zero of its phase.
	        number = int(digits or "0")
	    except ValueError:
	        # Malformed label: report it as missing, like any other unrecognised
	        # label, instead of aborting a whole-column apply().
	        return np.nan

	    if prefix == "L":  # Pre-launch
	        return -number
	    return number + 30  # Return / post-flight


	def add_flight_day(df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Return a copy of ``df`` with a numeric 'flight_day' column added.

	    'flight_day' is computed by applying ``parse_timepoint`` to every value of
	    the 'timepoint' column. A 'Sample Name' column, if present, is removed from
	    the result because it is redundant. The input frame is not modified.

	    Raises:
	        ValueError: if ``df`` has no 'timepoint' column.
	    """
	    # Validate first; nothing below makes sense without the source column.
	    if "timepoint" not in df.columns:
	        raise ValueError("DataFrame must contain a 'timepoint' column")

	    result = df.copy()

	    # Map each timepoint label onto the numeric flight-day scale.
	    result["flight_day"] = result["timepoint"].apply(parse_timepoint)

	    # Only drop 'Sample Name' when it actually exists.
	    has_sample_name = "Sample Name" in result.columns
	    return result.drop(columns=["Sample Name"]) if has_sample_name else result

	def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Return a copy of ``df`` extended with the derived Anion Gap analyte.

	    Anion Gap = Sodium - Chloride - Carbon Dioxide

	    The derived columns are added only when 'sodium_value', 'chloride_value'
	    and 'carbon_dioxide_value' are all present; otherwise the copy is returned
	    unchanged. The input frame is not modified.
	    """
	    result = df.copy()

	    components = ("sodium_value", "chloride_value", "carbon_dioxide_value")
	    if any(name not in result.columns for name in components):
	        # At least one input is missing, so the gap cannot be computed.
	        return result

	    # Cast each input to float before subtracting.
	    sodium, chloride, carbon_dioxide = (
	        result[name].astype(float) for name in components
	    )
	    result["anion_gap_value"] = sodium - chloride - carbon_dioxide

	    # Placeholders; min/max defined manually in stats.ANALYTE_INFO
	    result["anion_gap_range_min"] = np.nan
	    result["anion_gap_range_max"] = np.nan
	    return result