Spaces:
Running
Running
"""
End-to-end model training script (TensorFlow).

Trains an MLP-based regression model that takes a single window
(sensor features + user_emb) as input and predicts fatigue, then
saves it in SavedModel/TFLite formats.
"""
| import os | |
| import json | |
| from typing import Dict, Iterable, Optional, Tuple, Union | |
| import numpy as np | |
| import pandas as pd | |
| import tensorflow as tf | |
| from sklearn.preprocessing import StandardScaler | |
| from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, Input | |
| from tensorflow.keras.models import Model | |
| from load_dataset import load_musclecare_dataset | |
# Per-window sensor features fed to the model. Order matters: it must match
# the scaler mean/scale vectors stored in the metadata JSON.
FEATURE_COLUMNS = [
    'rms_acc',
    'rms_gyro',
    'mean_freq_acc',
    'mean_freq_gyro',
    'entropy_acc',
    'entropy_gyro',
    'jerk_mean',
    'jerk_std',
    'stability_index',
    'fatigue_prev',
]
# Training defaults.
DEFAULT_EPOCHS = 30
# Fallback user-embedding width when an embedding is missing or unparseable.
DEFAULT_EMBED_DIM = 12
DEFAULT_BATCH_SIZE = 64


def parse_user_emb(emb: Union[str, Iterable[float], np.ndarray]) -> np.ndarray:
    """Convert a user embedding to a float32 numpy array.

    Accepts an ndarray, a JSON-encoded string, or any iterable of numbers.
    Anything that cannot be parsed into an array — invalid JSON, a scalar,
    or a ragged/non-numeric payload — falls back to a zero vector of
    DEFAULT_EMBED_DIM so one bad row cannot crash preprocessing.
    """
    arr: Optional[np.ndarray] = None
    if isinstance(emb, np.ndarray):
        arr = emb.astype(np.float32)
    elif isinstance(emb, str):
        try:
            arr = np.array(json.loads(emb), dtype=np.float32)
        except (json.JSONDecodeError, TypeError, ValueError):
            # ValueError added: json.loads can succeed on a ragged or
            # non-numeric payload (e.g. "[[1, 2], [3]]") that np.array
            # cannot coerce to float32 — previously this crashed uncaught.
            arr = None
    elif isinstance(emb, Iterable):
        try:
            arr = np.array(list(emb), dtype=np.float32)
        except (TypeError, ValueError):
            # Non-numeric iterables also fall back to the zero vector.
            arr = None
    if arr is None or arr.ndim == 0:
        arr = np.zeros(DEFAULT_EMBED_DIM, dtype=np.float32)
    return arr
def pad_embedding(embedding: np.ndarray, target_dim: int) -> np.ndarray:
    """Return *embedding* truncated or zero-padded to exactly *target_dim* entries (float32)."""
    trimmed = embedding[:target_dim].astype(np.float32)
    if trimmed.size == target_dim:
        return trimmed
    tail = np.zeros(target_dim - trimmed.size, dtype=np.float32)
    return np.concatenate([trimmed, tail])
def dataset_split_to_dataframe(dataset_split) -> pd.DataFrame:
    """Convert a HuggingFace Dataset split (or any tabular source) to a pandas DataFrame."""
    has_converter = hasattr(dataset_split, "to_pandas")
    return dataset_split.to_pandas() if has_converter else pd.DataFrame(dataset_split)


def build_dataframe_from_source(
    dataset_source,
    exclude_sessions: Optional[Iterable[str]] = None
) -> pd.DataFrame:
    """Merge a dataset source (split mapping or single split) into one DataFrame.

    Rows whose session_id appears in *exclude_sessions* are dropped; raises
    KeyError when filtering is requested but the column is absent.
    """
    excluded = set(exclude_sessions or [])
    if hasattr(dataset_source, "items"):
        splits = dataset_source.items()
    else:
        splits = [("all", dataset_source)]
    collected = []
    for split_name, split in splits:
        frame = dataset_split_to_dataframe(split)
        if frame.empty:
            continue
        if excluded:
            if 'session_id' not in frame.columns:
                raise KeyError("๋ฐ์ดํฐ์ ์ 'session_id' ์ปฌ๋ผ์ด ์์ต๋๋ค.")
            frame = frame[~frame['session_id'].isin(excluded)]
        if frame.empty:
            continue
        collected.append(frame)
        print(f" - {split_name}: {len(frame)}๊ฐ ์ํ (ํํฐ๋ง ํ)")
    return pd.concat(collected, ignore_index=True) if collected else pd.DataFrame()
def prepare_training_arrays(
    df: pd.DataFrame,
    feature_cols: Iterable[str]
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Build single-window training arrays.

    Returns (X, y, scaler_mean, scaler_scale), where X concatenates the
    standardized sensor features with the per-row user embeddings.
    Raises KeyError when any required column is missing.
    """
    cols = list(feature_cols)
    missing_columns = (set(cols) | {'fatigue', 'user_emb'}) - set(df.columns)
    if missing_columns:
        raise KeyError(f"๋ฐ์ดํฐ์ ์ ๋๋ฝ๋ ์ปฌ๋ผ์ด ์์ต๋๋ค: {sorted(missing_columns)}")
    # Coerce to float32 and neutralize inf/NaN before standardization.
    cleaned = (
        df[cols]
        .astype(np.float32)
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.0)
    )
    scaler = StandardScaler()
    scaled = scaler.fit_transform(cleaned).astype(np.float32)
    # Rows whose embedding is not an ndarray get a zero vector.
    embedding_rows = []
    for emb in df['user_emb']:
        if isinstance(emb, np.ndarray):
            embedding_rows.append(emb.astype(np.float32))
        else:
            embedding_rows.append(np.zeros(DEFAULT_EMBED_DIM, dtype=np.float32))
    embedding_matrix = np.stack(embedding_rows)
    X = np.concatenate([scaled, embedding_matrix], axis=1).astype(np.float32)
    y = df['fatigue'].astype(np.float32).to_numpy()
    return X, y, scaler.mean_.astype(np.float32), scaler.scale_.astype(np.float32)
def build_dense_regression_model(
    input_dim: int,
    learning_rate: float = 0.001
) -> Model:
    """Build and compile the MLP regressor for single-window inputs.

    Two Dense->BatchNorm->Dropout stages (128/0.3, then 64/0.2) followed by
    a linear output head; compiled with Adam, MSE loss, and MAE metric.
    """
    inputs = Input(shape=(input_dim,), name="features")
    hidden = inputs
    for units, drop_rate in ((128, 0.3), (64, 0.2)):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(drop_rate)(hidden)
    outputs = Dense(1, activation='linear', name='fatigue')(hidden)
    model = Model(inputs=inputs, outputs=outputs)
    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=adam, loss='mse', metrics=['mae'])
    return model
def ensure_embeddings(df: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
    """Normalize the user_emb column to numpy arrays padded to one shared dimension.

    Returns a copy of *df* plus the chosen target dimension (the largest
    observed embedding size, or DEFAULT_EMBED_DIM when none are non-empty).
    Raises KeyError when the column is absent.
    """
    if 'user_emb' not in df.columns:
        raise KeyError("๋ฐ์ดํฐ์ ์ 'user_emb' ์ปฌ๋ผ์ด ์์ต๋๋ค.")
    normalized = df.copy()
    normalized['user_emb'] = normalized['user_emb'].apply(parse_user_emb)
    observed_dims = [
        emb.size
        for emb in normalized['user_emb']
        if isinstance(emb, np.ndarray) and emb.size > 0
    ]
    target_dim = max(observed_dims) if observed_dims else DEFAULT_EMBED_DIM
    normalized['user_emb'] = normalized['user_emb'].apply(
        lambda emb: pad_embedding(emb, target_dim)
    )
    return normalized, target_dim
def main(
    data_list: Optional[Iterable[Dict]] = None,
    exclude_sessions: Optional[Iterable[str]] = None,
    epochs: int = DEFAULT_EPOCHS
) -> Optional[Dict[str, str]]:
    """
    Main training entry point.

    Loads the dataset, normalizes user embeddings, trains the MLP
    regressor, and writes Keras/TFLite artifacts plus a metadata JSON.

    Args:
        data_list: Rows to train on (None loads the full dataset via
            load_musclecare_dataset).
        exclude_sessions: session_id values to exclude (duplicate prevention).
        epochs: Number of training epochs.

    Returns:
        Dict with absolute paths for "keras", "tflite", and "metadata"
        artifacts, or None when no trainable data remains.
    """
    print("=" * 80)
    print("MuscleCare Train AI - TensorFlow Single-Window Training")
    print("=" * 80)
    # Fixed seed so weight init and shuffling are reproducible.
    tf.keras.utils.set_random_seed(42)
    # 1. Load the dataset.
    print("1๏ธโฃ ๋ฐ์ดํฐ์ ๋ก๋ฉ ์ค...")
    if data_list is None:
        dataset_source = load_musclecare_dataset()
        df = build_dataframe_from_source(dataset_source, exclude_sessions)
    else:
        df = pd.DataFrame(data_list)
        if exclude_sessions:
            # NOTE(review): assumes each row in data_list carries 'session_id'
            # — a KeyError is raised here otherwise; confirm with callers.
            df = df[~df['session_id'].isin(set(exclude_sessions))]
    if df.empty:
        print("โ ๏ธ ํ์ต ๊ฐ๋ฅํ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค. ํ์ต์ ์ข ๋ฃํฉ๋๋ค.")
        print("=" * 80)
        return None
    print(f"โ ๋ฐ์ดํฐ ๋ก๋ ์๋ฃ: {len(df)}๊ฐ ํ")
    # 2. Normalize user embeddings to one shared dimension.
    print("2๏ธโฃ ์ฌ์ฉ์ ์๋ฒ ๋ฉ ์ ๊ทํ ์ค...")
    df, emb_dim = ensure_embeddings(df)
    print(f"โ ์๋ฒ ๋ฉ ์ฐจ์: {emb_dim}")
    # 3. Build training arrays (standardized features + embeddings).
    print("3๏ธโฃ ํ์ต ๋ฐ์ดํฐ ์์ฑ ์ค...")
    X, y, scaler_mean, scaler_scale = prepare_training_arrays(df, FEATURE_COLUMNS)
    if X.size == 0:
        print("โ ๏ธ ํ์ตํ ์ ๋ ฅ ๋ฐ์ดํฐ๊ฐ ์์ต๋๋ค. ํ์ต์ ์ข ๋ฃํฉ๋๋ค.")
        print("=" * 80)
        return None
    num_samples, input_dim = X.shape
    print(f"โ ํ์ต ๋ฐ์ดํฐ ์์ฑ ์๋ฃ: {num_samples}๊ฐ ์ํ, ์ ๋ ฅ ์ฐจ์ {input_dim}")
    # 4. Build the model.
    print("4๏ธโฃ ๋ชจ๋ธ ์์ฑ ์ค...")
    model = build_dense_regression_model(input_dim)
    model.summary(print_fn=lambda x: print(" " + x))
    print("โ ๋ชจ๋ธ ์์ฑ ์๋ฃ")
    # 5. Train with early stopping and LR decay on plateau.
    print("5๏ธโฃ ๋ชจ๋ธ ํ์ต ์์...")
    callbacks = [
        tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(patience=3, factor=0.5),
    ]
    # Skip the validation split on tiny datasets (< 20 samples).
    validation_split = 0.1 if num_samples >= 20 else 0.0
    history = model.fit(
        X,
        y,
        epochs=epochs,
        batch_size=min(DEFAULT_BATCH_SIZE, num_samples),
        shuffle=True,
        validation_split=validation_split,
        callbacks=callbacks,
        verbose=1,
    )
    print("โ ๋ชจ๋ธ ํ์ต ์๋ฃ")
    # 6. Save the model plus metadata (scaler stats + training history).
    print("6๏ธโฃ ๋ชจ๋ธ ์ ์ฅ ์ค...")
    model_dir = './model'
    os.makedirs(model_dir, exist_ok=True)
    # NOTE(review): filename says cnn_gru but the model is an MLP — presumably
    # kept for artifact-name compatibility; confirm before renaming.
    keras_model_path = os.path.join(model_dir, 'cnn_gru_fatigue.keras')
    model.save(keras_model_path)
    metadata = {
        "feature_columns": list(FEATURE_COLUMNS),
        "embedding_dim": emb_dim,
        "input_dim": input_dim,
        "epochs": epochs,
        "num_samples": int(num_samples),
        # Scaler stats allow inference-time inputs to be standardized identically.
        "scaler": {
            "mean": scaler_mean.tolist(),
            "scale": scaler_scale.tolist(),
        },
        "history": {
            "loss": history.history.get('loss', []),
            "mae": history.history.get('mae', []),
            "val_loss": history.history.get('val_loss', []),
            "val_mae": history.history.get('val_mae', []),
        },
    }
    metadata_path = os.path.join(model_dir, 'cnn_gru_fatigue_metadata.json')
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)
    print(f"โ ๋ชจ๋ธ ์ ์ฅ ์๋ฃ: {keras_model_path}")
    print(f" ๋ฉํ๋ฐ์ดํฐ ์ ์ฅ: {metadata_path}")
    # 7. Convert to TFLite for on-device inference.
    print("7๏ธโฃ TFLite ๋ณํ ์ค...")
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    # NOTE(review): private converter attribute — may break across TF versions.
    converter._experimental_lower_tensor_list_ops = False
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_model = converter.convert()
    tflite_model_path = os.path.join(model_dir, 'cnn_gru_fatigue.tflite')
    with open(tflite_model_path, 'wb') as f:
        f.write(tflite_model)
    print(f"โ TFLite ๋ชจ๋ธ ์ ์ฅ ์๋ฃ: {tflite_model_path}")
    print("=" * 80)
    return {
        "keras": os.path.abspath(keras_model_path),
        "tflite": os.path.abspath(tflite_model_path),
        "metadata": os.path.abspath(metadata_path),
    }
if __name__ == "__main__":
    # Run a full training pass with default settings when executed as a script.
    main()