# MuscleCare-Train-AI / train_e2e.py
# Uploaded by: Merry99
# Commit: "prevent hugging face spaces pause" (ece3e89)
"""
End-to-End ๋ชจ๋ธ ํ•™์Šต ์Šคํฌ๋ฆฝํŠธ (TensorFlow)
๋‹จ์ผ ์œˆ๋„์šฐ(์„ผ์„œ ํŠน์ง• + user_emb)๋ฅผ ์ž…๋ ฅ์œผ๋กœ ๋ฐ›์•„ ํ”ผ๋กœ๋„๋ฅผ ์˜ˆ์ธกํ•˜๋Š”
MLP ๊ธฐ๋ฐ˜ ํšŒ๊ท€ ๋ชจ๋ธ์„ ํ•™์Šตํ•˜๊ณ  SavedModel/TFLite ํ˜•์‹์œผ๋กœ ์ €์žฅํ•ฉ๋‹ˆ๋‹ค.
"""
import os
import json
from typing import Dict, Iterable, Optional, Tuple, Union
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, Input
from tensorflow.keras.models import Model
from load_dataset import load_musclecare_dataset
# Sensor-derived feature columns fed to the model, in input order.
# Names suggest per-window statistics of accelerometer/gyroscope signals
# plus the previous fatigue value — TODO confirm against the dataset schema.
FEATURE_COLUMNS = [
    'rms_acc',
    'rms_gyro',
    'mean_freq_acc',
    'mean_freq_gyro',
    'entropy_acc',
    'entropy_gyro',
    'jerk_mean',
    'jerk_std',
    'stability_index',
    'fatigue_prev',
]
# Default number of training epochs for main().
DEFAULT_EPOCHS = 30
# Fallback user-embedding dimensionality when it cannot be inferred from data.
DEFAULT_EMBED_DIM = 12
# Upper bound for the training batch size (capped by the sample count in main()).
DEFAULT_BATCH_SIZE = 64
def parse_user_emb(emb: Union[str, Iterable[float], np.ndarray]) -> np.ndarray:
    """Convert a user embedding to a float32 numpy array.

    Accepts an ndarray, a JSON-encoded string (e.g. ``"[0.1, 0.2]"``), or
    any iterable of numbers. Anything that cannot be turned into a
    non-scalar float array falls back to a zero vector of length
    ``DEFAULT_EMBED_DIM``.

    Args:
        emb: Embedding as ndarray, JSON string, or iterable of floats.

    Returns:
        float32 array; zeros of length DEFAULT_EMBED_DIM on parse failure
        or scalar input.
    """
    arr: Optional[np.ndarray] = None
    if isinstance(emb, np.ndarray):
        arr = emb.astype(np.float32)
    elif isinstance(emb, str):
        try:
            arr = np.array(json.loads(emb), dtype=np.float32)
        # FIX: ValueError added — json.loads may return non-numeric data
        # (e.g. a dict or mixed list), which makes np.array(..., float32)
        # raise ValueError; previously that escaped this function.
        except (json.JSONDecodeError, TypeError, ValueError):
            arr = None
    elif isinstance(emb, Iterable):
        try:
            arr = np.array(list(emb), dtype=np.float32)
        except (TypeError, ValueError):
            # Non-numeric elements: fall through to the zero-vector default.
            arr = None
    if arr is None or arr.ndim == 0:
        # Scalar or unparseable input -> neutral zero embedding.
        arr = np.zeros(DEFAULT_EMBED_DIM, dtype=np.float32)
    return arr
def pad_embedding(embedding: np.ndarray, target_dim: int) -> np.ndarray:
    """Return *embedding* resized to exactly ``target_dim`` entries.

    Shorter embeddings are zero-padded on the right; longer ones are
    truncated. A fresh float32 vector is always returned.
    """
    out = np.zeros(target_dim, dtype=np.float32)
    n = min(embedding.size, target_dim)
    if n:
        out[:n] = embedding[:n]
    return out
def dataset_split_to_dataframe(dataset_split) -> pd.DataFrame:
    """Coerce a HuggingFace Dataset split into a pandas DataFrame.

    Objects exposing ``to_pandas()`` (e.g. datasets.Dataset) are converted
    via that method; anything else goes through the DataFrame constructor.
    """
    if not hasattr(dataset_split, "to_pandas"):
        return pd.DataFrame(dataset_split)
    return dataset_split.to_pandas()
def build_dataframe_from_source(
    dataset_source,
    exclude_sessions: Optional[Iterable[str]] = None
) -> pd.DataFrame:
    """Merge every split of a data source into one DataFrame.

    The source may be a mapping of split name -> dataset (anything with
    ``.items()``) or a single dataset. Rows whose session_id appears in
    *exclude_sessions* are dropped before concatenation.

    Raises:
        KeyError: when filtering is requested but the data has no
            'session_id' column.
    """
    excluded = set(exclude_sessions or [])
    if hasattr(dataset_source, "items"):
        named_splits = dataset_source.items()
    else:
        named_splits = [("all", dataset_source)]
    frames = []
    for split_name, split_dataset in named_splits:
        df_split = dataset_split_to_dataframe(split_dataset)
        if df_split.empty:
            continue
        if excluded:
            if 'session_id' not in df_split.columns:
                raise KeyError("๋ฐ์ดํ„ฐ์…‹์— 'session_id' ์ปฌ๋Ÿผ์ด ์—†์Šต๋‹ˆ๋‹ค.")
            df_split = df_split[~df_split['session_id'].isin(excluded)]
        if df_split.empty:
            continue
        frames.append(df_split)
        print(f" - {split_name}: {len(df_split)}๊ฐœ ์ƒ˜ํ”Œ (ํ•„ํ„ฐ๋ง ํ›„)")
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
def prepare_training_arrays(
    df: pd.DataFrame,
    feature_cols: Iterable[str]
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Build (X, y, scaler_mean, scaler_scale) for single-window training.

    Sensor features are standardized (inf replaced by NaN, NaN filled with
    0 before scaling) and concatenated with the per-row user embeddings.
    Non-ndarray embedding cells fall back to a DEFAULT_EMBED_DIM zero
    vector.

    Raises:
        KeyError: if any feature column, 'fatigue' or 'user_emb' is missing.
    """
    required = set(feature_cols) | {'fatigue', 'user_emb'}
    missing_columns = required - set(df.columns)
    if missing_columns:
        raise KeyError(f"๋ฐ์ดํ„ฐ์…‹์— ๋ˆ„๋ฝ๋œ ์ปฌ๋Ÿผ์ด ์žˆ์Šต๋‹ˆ๋‹ค: {sorted(missing_columns)}")
    raw_features = (
        df[list(feature_cols)]
        .astype(np.float32)
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.0)
    )
    scaler = StandardScaler()
    scaled = scaler.fit_transform(raw_features).astype(np.float32)
    embedding_rows = []
    for emb in df['user_emb']:
        if isinstance(emb, np.ndarray):
            embedding_rows.append(emb.astype(np.float32))
        else:
            embedding_rows.append(np.zeros(DEFAULT_EMBED_DIM, dtype=np.float32))
    emb_matrix = np.stack(embedding_rows)
    X = np.concatenate([scaled, emb_matrix], axis=1).astype(np.float32)
    y = df['fatigue'].astype(np.float32).to_numpy()
    return X, y, scaler.mean_.astype(np.float32), scaler.scale_.astype(np.float32)
def build_dense_regression_model(
    input_dim: int,
    learning_rate: float = 0.001
) -> Model:
    """MLP regressor for the single-window input.

    Two Dense -> BatchNorm -> Dropout stages (128 and 64 units) followed
    by a single linear output head named 'fatigue'; compiled with
    Adam/MSE and an MAE metric.

    Args:
        input_dim: Length of the flat feature vector.
        learning_rate: Adam learning rate.
    """
    feature_input = Input(shape=(input_dim,), name="features")
    hidden = Dense(128, activation='relu')(feature_input)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.3)(hidden)
    hidden = Dense(64, activation='relu')(hidden)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.2)(hidden)
    prediction = Dense(1, activation='linear', name='fatigue')(hidden)
    regressor = Model(inputs=feature_input, outputs=prediction)
    regressor.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae']
    )
    return regressor
def ensure_embeddings(df: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
    """Normalize 'user_emb' to float32 arrays padded to a common dimension.

    Each cell is parsed via parse_user_emb, then padded/truncated to the
    largest non-empty embedding size observed (DEFAULT_EMBED_DIM when no
    non-empty embedding exists). The input frame is not mutated.

    Returns:
        (copy of df with normalized embeddings, embedding dimension used)

    Raises:
        KeyError: if the 'user_emb' column is absent.
    """
    if 'user_emb' not in df.columns:
        raise KeyError("๋ฐ์ดํ„ฐ์…‹์— 'user_emb' ์ปฌ๋Ÿผ์ด ์—†์Šต๋‹ˆ๋‹ค.")
    normalized = df.copy()
    normalized['user_emb'] = normalized['user_emb'].apply(parse_user_emb)
    sizes = []
    for emb in normalized['user_emb']:
        if isinstance(emb, np.ndarray) and emb.size > 0:
            sizes.append(emb.size)
    target_dim = max(sizes) if sizes else DEFAULT_EMBED_DIM
    normalized['user_emb'] = normalized['user_emb'].apply(
        lambda emb: pad_embedding(emb, target_dim)
    )
    return normalized, target_dim
def main(
    data_list: Optional[Iterable[Dict]] = None,
    exclude_sessions: Optional[Iterable[str]] = None,
    epochs: int = DEFAULT_EPOCHS
) -> Optional[Dict[str, str]]:
    """Run the end-to-end training pipeline and export the artifacts.

    Loads data (the remote HuggingFace dataset or an in-memory list),
    normalizes user embeddings, builds and trains the MLP regressor, then
    writes the Keras model, a metadata JSON, and a TFLite conversion
    under ./model.

    Args:
        data_list: Records to train on (None loads the full dataset).
        exclude_sessions: session_id values to drop (deduplication).
        epochs: Number of training epochs.

    Returns:
        Dict with absolute paths for "keras", "tflite" and "metadata",
        or None when there is no trainable data.
    """
    print("=" * 80)
    print("MuscleCare Train AI - TensorFlow Single-Window Training")
    print("=" * 80)
    # Seed Python/NumPy/TF RNGs for reproducible training runs.
    tf.keras.utils.set_random_seed(42)
    # 1) Load the dataset.
    print("1๏ธโƒฃ ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋”ฉ ์ค‘...")
    if data_list is None:
        dataset_source = load_musclecare_dataset()
        df = build_dataframe_from_source(dataset_source, exclude_sessions)
    else:
        df = pd.DataFrame(data_list)
        if exclude_sessions:
            # NOTE(review): unlike build_dataframe_from_source, this path
            # lets pandas raise KeyError when 'session_id' is missing.
            df = df[~df['session_id'].isin(set(exclude_sessions))]
    if df.empty:
        print("โš ๏ธ ํ•™์Šต ๊ฐ€๋Šฅํ•œ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. ํ•™์Šต์„ ์ข…๋ฃŒํ•ฉ๋‹ˆ๋‹ค.")
        print("=" * 80)
        return None
    print(f"โœ… ๋ฐ์ดํ„ฐ ๋กœ๋“œ ์™„๋ฃŒ: {len(df)}๊ฐœ ํ–‰")
    # 2) Normalize user embeddings to one common dimension.
    print("2๏ธโƒฃ ์‚ฌ์šฉ์ž ์ž„๋ฒ ๋”ฉ ์ •๊ทœํ™” ์ค‘...")
    df, emb_dim = ensure_embeddings(df)
    print(f"โœ… ์ž„๋ฒ ๋”ฉ ์ฐจ์›: {emb_dim}")
    # 3) Build the training arrays (scaled features + embeddings).
    print("3๏ธโƒฃ ํ•™์Šต ๋ฐ์ดํ„ฐ ์ƒ์„ฑ ์ค‘...")
    X, y, scaler_mean, scaler_scale = prepare_training_arrays(df, FEATURE_COLUMNS)
    if X.size == 0:
        print("โš ๏ธ ํ•™์Šตํ•  ์ž…๋ ฅ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. ํ•™์Šต์„ ์ข…๋ฃŒํ•ฉ๋‹ˆ๋‹ค.")
        print("=" * 80)
        return None
    num_samples, input_dim = X.shape
    print(f"โœ… ํ•™์Šต ๋ฐ์ดํ„ฐ ์ƒ์„ฑ ์™„๋ฃŒ: {num_samples}๊ฐœ ์ƒ˜ํ”Œ, ์ž…๋ ฅ ์ฐจ์› {input_dim}")
    # 4) Create the model.
    print("4๏ธโƒฃ ๋ชจ๋ธ ์ƒ์„ฑ ์ค‘...")
    model = build_dense_regression_model(input_dim)
    model.summary(print_fn=lambda x: print(" " + x))
    print("โœ… ๋ชจ๋ธ ์ƒ์„ฑ ์™„๋ฃŒ")
    # 5) Train.
    print("5๏ธโƒฃ ๋ชจ๋ธ ํ•™์Šต ์‹œ์ž‘...")
    callbacks = [
        # NOTE(review): both callbacks monitor 'val_loss' by default; when
        # validation_split is 0 (fewer than 20 samples) there is no
        # validation metric for them to act on — confirm this is intended.
        tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(patience=3, factor=0.5),
    ]
    # Hold out 10% for validation only when there are enough samples.
    validation_split = 0.1 if num_samples >= 20 else 0.0
    history = model.fit(
        X,
        y,
        epochs=epochs,
        batch_size=min(DEFAULT_BATCH_SIZE, num_samples),  # cap by dataset size
        shuffle=True,
        validation_split=validation_split,
        callbacks=callbacks,
        verbose=1,
    )
    print("โœ… ๋ชจ๋ธ ํ•™์Šต ์™„๋ฃŒ")
    # 6) Save the model and its metadata (scaler stats + training history).
    print("6๏ธโƒฃ ๋ชจ๋ธ ์ €์žฅ ์ค‘...")
    model_dir = './model'
    os.makedirs(model_dir, exist_ok=True)
    # NOTE(review): file names say 'cnn_gru' although the architecture is
    # an MLP — kept as-is since downstream consumers may rely on them.
    keras_model_path = os.path.join(model_dir, 'cnn_gru_fatigue.keras')
    model.save(keras_model_path)
    # Everything inference needs besides the weights: feature order,
    # embedding size, and the StandardScaler parameters to re-apply.
    metadata = {
        "feature_columns": list(FEATURE_COLUMNS),
        "embedding_dim": emb_dim,
        "input_dim": input_dim,
        "epochs": epochs,
        "num_samples": int(num_samples),
        "scaler": {
            "mean": scaler_mean.tolist(),
            "scale": scaler_scale.tolist(),
        },
        "history": {
            "loss": history.history.get('loss', []),
            "mae": history.history.get('mae', []),
            "val_loss": history.history.get('val_loss', []),
            "val_mae": history.history.get('val_mae', []),
        },
    }
    metadata_path = os.path.join(model_dir, 'cnn_gru_fatigue_metadata.json')
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)
    print(f"โœ… ๋ชจ๋ธ ์ €์žฅ ์™„๋ฃŒ: {keras_model_path}")
    print(f" ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ €์žฅ: {metadata_path}")
    # 7) Convert to TFLite for on-device inference.
    print("7๏ธโƒฃ TFLite ๋ณ€ํ™˜ ์ค‘...")
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    # Allow falling back to full TF ops for anything TFLite builtins lack.
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    # NOTE(review): private TF attribute (leading underscore), typically
    # paired with SELECT_TF_OPS for RNN models; may break across TF versions.
    converter._experimental_lower_tensor_list_ops = False
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_model = converter.convert()
    tflite_model_path = os.path.join(model_dir, 'cnn_gru_fatigue.tflite')
    with open(tflite_model_path, 'wb') as f:
        f.write(tflite_model)
    print(f"โœ… TFLite ๋ชจ๋ธ ์ €์žฅ ์™„๋ฃŒ: {tflite_model_path}")
    print("=" * 80)
    return {
        "keras": os.path.abspath(keras_model_path),
        "tflite": os.path.abspath(tflite_model_path),
        "metadata": os.path.abspath(metadata_path),
    }
if __name__ == "__main__":
main()