|
|
import os |
|
|
import sys |
|
|
import itertools |
|
|
import numpy as np |
|
|
import tensorflow as tf |
|
|
from sklearn.model_selection import train_test_split |
|
|
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
|
|
|
|
from src.model import MalConv |
|
|
from src.utils import preprocess_dataset |
|
|
|
|
|
def hyperparameter_search(csv_path,
                          param_grid=None,
                          max_length=2**20,
                          epochs=5,
                          validation_split=0.2,
                          batch_size=16):
    """Grid-search hyperparameters for the MalConv model.

    Trains one model per combination in ``param_grid`` and tracks the
    combination with the highest validation accuracy. Failing combinations
    are reported and skipped so the whole grid still completes.

    Args:
        csv_path: Path to the training-data CSV file.
        param_grid: Mapping of hyperparameter name -> list of candidate
            values. ``None`` selects a small built-in default grid.
        max_length: Maximum input length (bytes) fed to the model.
        epochs: Number of training epochs per combination.
        validation_split: Fraction of the data held out for validation.
        batch_size: Mini-batch size used during training. Defaults to 16,
            which preserves the previously hard-coded behavior.

    Returns:
        Tuple ``(best_params, results)``: ``best_params`` is the
        best-scoring parameter dict (``None`` if every combination failed)
        and ``results`` is a list of per-combination result dicts sorted
        by descending validation accuracy.
    """
    if param_grid is None:
        param_grid = {
            'embedding_size': [8, 16],
            'num_filters': [64, 128],
            'fc_size': [64, 128],
            'learning_rate': [0.001, 0.0001]
        }

    print("๋ฐ์ดํฐ ๋ก๋ฉ ์ค...")
    X, y = preprocess_dataset(csv_path, max_length)
    # Fixed random_state keeps the split identical across runs; stratify
    # preserves the class balance in both splits.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=validation_split, random_state=42, stratify=y
    )

    # Cartesian product of all candidate values = full grid.
    param_names = list(param_grid.keys())
    param_values = list(param_grid.values())
    param_combinations = list(itertools.product(*param_values))

    best_score = 0
    best_params = None
    results = []

    print(f"์ด {len(param_combinations)}๊ฐ์ ์กฐํฉ์ ํ์คํธํฉ๋๋ค.")

    for i, params in enumerate(param_combinations):
        param_dict = dict(zip(param_names, params))
        print(f"\n[{i+1}/{len(param_combinations)}] ํ์คํธ ์ค: {param_dict}")

        try:
            # Drop state left over from the previous combination so graphs
            # and layer names do not accumulate across grid iterations.
            tf.keras.backend.clear_session()

            model = MalConv(
                max_input_length=max_length,
                embedding_size=param_dict['embedding_size'],
                num_filters=param_dict['num_filters'],
                fc_size=param_dict['fc_size']
            )

            model.compile(
                optimizer=tf.keras.optimizers.Adam(
                    learning_rate=param_dict['learning_rate']
                ),
                loss='binary_crossentropy',
                metrics=['accuracy']
            )

            # Build the model (materialize its weights) with a dummy
            # forward pass before training.
            dummy_input = np.zeros((1, max_length), dtype=np.uint8)
            _ = model(dummy_input)

            model.fit(
                X_train, y_train,
                batch_size=batch_size,
                epochs=epochs,
                validation_data=(X_val, y_val),
                verbose=0
            )

            val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)

            results.append({
                'params': param_dict,
                'val_accuracy': val_acc,
                'val_loss': val_loss
            })

            print(f"๊ฒ์ฆ ์ ํ๋: {val_acc:.4f}")

            if val_acc > best_score:
                best_score = val_acc
                best_params = param_dict
                print(f"์๋ก์ด ์ต๊ณ  ์ฑ๋ฅ! ์ ํ๋: {best_score:.4f}")

        except Exception as e:
            # Best-effort search: report the failing combination and move on
            # rather than aborting the whole grid.
            print(f"์๋ฌ ๋ฐ์: {e}")
            continue

    print("\n" + "="*50)
    print("ํ์ดํผํ๋ผ๋ฏธํฐ ํ๋ ์๋ฃ")
    print("="*50)
    print(f"์ต๊ณ  ์ฑ๋ฅ: {best_score:.4f}")
    print(f"์ต์  ํ์ดํผํ๋ผ๋ฏธํฐ: {best_params}")

    # Sort best-first and report the top five combinations.
    results.sort(key=lambda x: x['val_accuracy'], reverse=True)

    print("\n์์ 5๊ฐ ๊ฒฐ๊ณผ:")
    for i, result in enumerate(results[:5]):
        print(f"{i+1}. ์ ํ๋: {result['val_accuracy']:.4f}, "
              f"ํ๋ผ๋ฏธํฐ: {result['params']}")

    return best_params, results
|
|
|
|
|
def main():
    """Run the hyperparameter search on the bundled sample dataset."""
    csv_path = "Input/sample_data.csv"

    param_grid = {
        'embedding_size': [8, 16],
        'num_filters': [64, 128],
        'fc_size': [64, 128],
        'learning_rate': [0.001, 0.0001]
    }

    # Few epochs on purpose: the goal here is ranking combinations,
    # not producing a final model.
    best_params, results = hyperparameter_search(
        csv_path=csv_path,
        param_grid=param_grid,
        epochs=3
    )

    # Plain string (no placeholders), so no f-prefix needed.
    print("\n์ต์  ํ์ดํผํ๋ผ๋ฏธํฐ๋ก ๋ชจ๋ธ์ ๋ค์ ํ๋ จํ์ธ์:")
    print(f"python src/train.py {csv_path} --epochs 10")
|
|
|
|
|
# Script entry point: run the search only when executed directly,
# not when imported as a module.
if __name__ == "__main__":


    main()
|
|
|