Upload 5 files
Browse files- evonet_optimizer.py +500 -0
- v2.py +643 -0
- v3.py +784 -0
- v4.py +1327 -0
- v5.py +1330 -0
evonet_optimizer.py
ADDED
|
@@ -0,0 +1,500 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import subprocess
|
| 3 |
+
import sys
|
| 4 |
+
import argparse
|
| 5 |
+
import random
|
| 6 |
+
import logging
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
import json
|
| 9 |
+
from typing import List, Tuple, Dict, Any
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
import tensorflow as tf
|
| 13 |
+
from tensorflow.keras.models import Sequential, load_model, clone_model
|
| 14 |
+
from tensorflow.keras.layers import Dense, Input
|
| 15 |
+
from tensorflow.keras.optimizers import Adam
|
| 16 |
+
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
|
| 17 |
+
import matplotlib.pyplot as plt
|
| 18 |
+
from scipy.stats import kendalltau
|
| 19 |
+
|
| 20 |
+
# --- Constants ---
# Default values for the CLI options declared in parse_arguments();
# see the help strings there for full descriptions.
DEFAULT_SEQ_LENGTH = 10
DEFAULT_POP_SIZE = 50
DEFAULT_GENERATIONS = 50
DEFAULT_MUTATION_RATE = 0.4  # Probability of applying any mutation to an individual
DEFAULT_WEIGHT_MUT_RATE = 0.8  # If mutation occurs, probability of weight perturbation
DEFAULT_ACTIVATION_MUT_RATE = 0.2  # If mutation occurs, probability of activation change
DEFAULT_MUTATION_STRENGTH = 0.1  # Magnitude (std dev) of Gaussian weight perturbation
DEFAULT_TOURNAMENT_SIZE = 5  # Contestants per tournament in selection
DEFAULT_ELITISM_COUNT = 2  # Keep top N individuals directly
DEFAULT_EPOCHS_FINAL_TRAIN = 100  # Max epochs for final gradient training of the best model
DEFAULT_BATCH_SIZE = 64  # Batch size for predictions and training
|
| 32 |
+
|
| 33 |
+
# --- Logging Setup ---
|
| 34 |
+
def setup_logging(log_dir: str, log_level=logging.INFO) -> None:
    """Configure root logging to both a file and stdout.

    Args:
        log_dir: Existing directory in which 'evolution.log' is created.
        log_level: Threshold level passed to logging.basicConfig.
    """
    log_filename = os.path.join(log_dir, 'evolution.log')
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_filename),
            logging.StreamHandler(sys.stdout)  # Also print to console
        ],
        # BUG FIX: basicConfig is a no-op once the root logger already has
        # handlers, so a second pipeline run in the same process would keep
        # logging into the previous run's file. force=True (Python 3.8+)
        # removes existing handlers before installing the new ones.
        force=True,
    )
|
| 45 |
+
|
| 46 |
+
# --- GPU Check ---
|
| 47 |
+
def check_gpu() -> bool:
    """Report whether a GPU is available, enabling memory growth on each one.

    Returns:
        True when at least one GPU was found and memory growth was enabled,
        False when no GPU exists or configuring it failed.
    """
    devices = tf.config.list_physical_devices('GPU')
    if not devices:
        logging.warning("GPU not found. Using CPU.")
        return False
    try:
        # Memory growth must be configured identically, and before the GPUs
        # are initialized by any other TensorFlow call.
        for device in devices:
            tf.config.experimental.set_memory_growth(device, True)
        logical = tf.config.list_logical_devices('GPU')
        logging.info(f"{len(devices)} Physical GPUs, {len(logical)} Logical GPUs found.")
        logging.info(f"Using GPU: {devices[0].name}")
        return True
    except RuntimeError as e:
        # Raised when the GPUs were already initialized.
        logging.error(f"Error setting memory growth: {e}")
        return False
|
| 66 |
+
|
| 67 |
+
# --- Data Generation ---
|
| 68 |
+
def generate_data(num_samples: int, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Create random sequences in [0, 100) and their sorted versions as targets."""
    logging.info(f"Generating {num_samples} samples with sequence length {seq_length}...")
    inputs = 100 * np.random.rand(num_samples, seq_length)
    targets = np.sort(inputs, axis=1)
    logging.info("Data generation complete.")
    return inputs, targets
|
| 75 |
+
|
| 76 |
+
# --- Neuroevolution Core ---
|
| 77 |
+
def create_individual(seq_length: int) -> Sequential:
    """Build one randomly-architected, compiled feed-forward network.

    The architecture has 1-4 hidden Dense layers of 8-64 units each, with
    activations drawn from {relu, tanh, sigmoid}, and a linear output layer
    of `seq_length` units (regression target is the sorted sequence).
    """
    # Note: the random calls below deliberately happen in a fixed order so
    # seeded runs remain reproducible.
    model = Sequential(name=f"model_random_{random.randint(1000, 9999)}")
    depth = random.randint(1, 4)  # Reduced max layers for simplicity
    widths = [random.randint(8, 64) for _ in range(depth)]
    acts = [random.choice(['relu', 'tanh', 'sigmoid']) for _ in range(depth)]

    model.add(Input(shape=(seq_length,)))
    for width, act in zip(widths, acts):
        model.add(Dense(width, activation=act))
    # Output must match the sequence length; linear for regression.
    model.add(Dense(seq_length, activation='linear'))

    # Compile immediately so weight manipulation works everywhere downstream.
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model
|
| 98 |
+
|
| 99 |
+
@tf.function  # Potentially speeds up prediction by tracing into a graph
def get_predictions(model: Sequential, X: np.ndarray, batch_size: int) -> tf.Tensor:
    """Gets model predictions using tf.function.

    NOTE(review): `batch_size` is accepted but never used — the whole of `X`
    is pushed through the model in a single call; confirm this is intended
    for large inputs.
    """
    return model(X, training=False)  # Use __call__ (not .predict) inside tf.function
|
| 103 |
+
|
| 104 |
+
def calculate_fitness(individual: Sequential, X: np.ndarray, y: np.ndarray, batch_size: int) -> float:
    """Score an individual as inverse MSE on (X, y).

    Returns a minimal fitness of 1e-8 when the score is non-finite or any
    error occurs, so a broken individual never wins selection.
    """
    try:
        # TensorFlow expects float32 inputs.
        inputs = tf.cast(X, tf.float32)
        targets = tf.cast(y, tf.float32)

        # Run the graph-traced prediction helper.
        outputs = get_predictions(individual, inputs, batch_size)

        # MSE via TF ops for potential GPU acceleration.
        mse_value = tf.reduce_mean(tf.square(targets - outputs)).numpy()

        # Inverse MSE; epsilon guards against division by zero.
        fitness_score = 1.0 / (mse_value + 1e-8)

        if np.isfinite(fitness_score):
            return float(fitness_score)

        logging.warning(f"Non-finite fitness detected ({fitness_score}) for model {individual.name}. Assigning low fitness.")
        return 1e-8

    except Exception as e:
        logging.error(f"Error during fitness calculation for model {individual.name}: {e}", exc_info=True)
        return 1e-8
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def mutate_individual(individual: Sequential, weight_mut_rate: float, act_mut_rate: float, mut_strength: float) -> Sequential:
    """Return a mutated, compiled copy of `individual` (the input is untouched).

    Two independent operators may fire:
      1. Weight perturbation (probability `weight_mut_rate`): Gaussian noise
         with std `mut_strength` is added to every Dense layer's parameters.
      2. Activation change (probability `act_mut_rate`): one randomly chosen
         hidden Dense layer gets a different activation from
         {relu, tanh, sigmoid}. The output layer is never touched.
    """
    mutated_model = clone_model(individual)
    mutated_model.set_weights(individual.get_weights())  # Crucial: copy weights

    mutated = False

    # 1. Weight Mutation
    if random.random() < weight_mut_rate:
        mutated = True
        for layer in mutated_model.layers:
            if isinstance(layer, Dense):
                perturbed = []
                for wb in layer.get_weights():
                    noise = np.random.normal(0, mut_strength, wb.shape)
                    perturbed.append(wb + noise)
                if perturbed:  # Ensure the layer actually had weights
                    layer.set_weights(perturbed)

    # 2. Activation Mutation (applied independently)
    if random.random() < act_mut_rate:
        dense_layers = [layer for layer in mutated_model.layers if isinstance(layer, Dense)]
        if len(dense_layers) > 1:  # Need at least one hidden layer besides the output
            mutated = True
            layer_to_mutate = random.choice(dense_layers[:-1])  # Exclude output layer
            current_activation = layer_to_mutate.get_config().get('activation', 'linear')
            possible_activations = ['relu', 'tanh', 'sigmoid']
            if current_activation in possible_activations:
                possible_activations.remove(current_activation)
            new_activation = random.choice(possible_activations)

            # Rebuild from config: safer than mutating the live layer in place.
            config = mutated_model.get_config()
            for layer_config in config['layers']:
                if layer_config['config']['name'] == layer_to_mutate.name:
                    layer_config['config']['activation'] = new_activation
                    break  # Found the layer

            try:
                rebuilt = Sequential.from_config(config)
                # BUG FIX: from_config re-initializes all weights, which used
                # to throw away the individual's (possibly weight-mutated)
                # parameters. An activation swap changes no layer shapes, so
                # the weights transfer directly.
                rebuilt.set_weights(mutated_model.get_weights())
                rebuilt.compile(optimizer=Adam(learning_rate=0.001), loss='mse')  # Re-compile
                mutated_model = rebuilt
            except Exception as e:
                logging.error(f"Error rebuilding model after activation mutation for {mutated_model.name}: {e}")
                # Mutation is reverted implicitly if rebuilding fails.

    # Re-compile so the optimizer state of the returned model is fresh.
    if mutated:
        mutated_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        mutated_model._name = f"mutated_{individual.name}"  # Rename

    return mutated_model
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def tournament_selection(population: List[Sequential], fitness_scores: List[float], k: int) -> Sequential:
    """Draw k distinct random contestants and return the fittest of them."""
    contestants = random.sample(range(len(population)), k)
    contest_scores = [fitness_scores[idx] for idx in contestants]
    winner = contestants[np.argmax(contest_scores)]
    return population[winner]
|
| 201 |
+
|
| 202 |
+
def evolve_population(population: List[Sequential], X: np.ndarray, y: np.ndarray, generations: int,
                      mutation_rate: float, weight_mut_rate: float, act_mut_rate: float, mut_strength: float,
                      tournament_size: int, elitism_count: int, batch_size: int) -> Tuple[Sequential, List[float], List[float]]:
    """Runs the evolutionary process.

    Each generation: evaluate fitness (1/MSE), carry the top `elitism_count`
    individuals over unchanged, then fill the rest of the next generation with
    tournament-selected parents that are mutated with probability
    `mutation_rate`. There is no crossover operator.

    Returns:
        (best model found across all generations, per-generation best fitness,
        per-generation average fitness).
    """
    best_fitness_history = []
    avg_fitness_history = []
    best_model_overall = None
    best_fitness_overall = -1.0

    for gen in range(generations):
        # 1. Evaluate Fitness
        fitness_scores = [calculate_fitness(ind, X, y, batch_size) for ind in population]

        # Track overall best
        current_best_idx = np.argmax(fitness_scores)
        current_best_fitness = fitness_scores[current_best_idx]
        if current_best_fitness > best_fitness_overall:
            best_fitness_overall = current_best_fitness
            # Keep a detached copy (structure + weights) of the best model,
            # since the population objects are replaced every generation.
            best_model_overall = clone_model(population[current_best_idx])
            best_model_overall.set_weights(population[current_best_idx].get_weights())
            best_model_overall.compile(optimizer=Adam(), loss='mse')  # Re-compile just in case
            logging.info(f"Generation {gen+1}: New overall best fitness: {best_fitness_overall:.4f}")

        avg_fitness = np.mean(fitness_scores)
        best_fitness_history.append(current_best_fitness)
        avg_fitness_history.append(avg_fitness)

        logging.info(f"Generation {gen+1}/{generations} - Best Fitness: {current_best_fitness:.4f}, Avg Fitness: {avg_fitness:.4f}")

        new_population = []

        # 2. Elitism: Carry over the best individuals
        if elitism_count > 0:
            # argsort is ascending, so the last `elitism_count` are the fittest.
            elite_indices = np.argsort(fitness_scores)[-elitism_count:]
            for idx in elite_indices:
                # Clone elite models to avoid modifications affecting originals if selected again
                elite_clone = clone_model(population[idx])
                elite_clone.set_weights(population[idx].get_weights())
                elite_clone.compile(optimizer=Adam(), loss='mse')  # Ensure compiled
                new_population.append(elite_clone)

        # 3. Selection & Reproduction for the rest of the population
        while len(new_population) < len(population):
            # Select parent(s) using tournament selection
            parent = tournament_selection(population, fitness_scores, tournament_size)

            # Create child through mutation (crossover could be added here)
            child = parent  # Start with the parent
            if random.random() < mutation_rate:
                # Clone parent before mutation to avoid modifying the original selected parent
                parent_clone = clone_model(parent)
                parent_clone.set_weights(parent.get_weights())
                parent_clone.compile(optimizer=Adam(), loss='mse')  # Ensure compiled
                child = mutate_individual(parent_clone, weight_mut_rate, act_mut_rate, mut_strength)
            else:
                # If no mutation, still clone the parent so the new population
                # holds distinct objects.
                child = clone_model(parent)
                child.set_weights(parent.get_weights())
                child.compile(optimizer=Adam(), loss='mse')  # Ensure compiled

            new_population.append(child)

        population = new_population[:len(population)]  # Ensure population size is maintained

    if best_model_overall is None:  # Handle case where no improvement was ever found
        best_idx = np.argmax([calculate_fitness(ind, X, y, batch_size) for ind in population])
        best_model_overall = population[best_idx]

    return best_model_overall, best_fitness_history, avg_fitness_history
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
# --- Plotting ---
|
| 278 |
+
def plot_fitness_history(history_best: List[float], history_avg: List[float], output_dir: str) -> None:
    """Render best/average fitness curves and save them as fitness_history.png."""
    curves = [
        (history_best, "Best Fitness per Generation", 'o', '-'),
        (history_avg, "Average Fitness per Generation", 'x', '--'),
    ]
    plt.figure(figsize=(12, 6))
    for data, label, mark, line in curves:
        plt.plot(data, label=label, marker=mark, linestyle=line)
    plt.xlabel("Generation")
    plt.ylabel("Fitness Score (1 / MSE)")
    plt.title("Evolutionary Process Fitness History")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    destination = os.path.join(output_dir, "fitness_history.png")
    plt.savefig(destination)
    plt.close()
    logging.info(f"Fitness history plot saved to {destination}")
|
| 293 |
+
|
| 294 |
+
# --- Evaluation ---
|
| 295 |
+
def evaluate_model(model: Sequential, X_test: np.ndarray, y_test: np.ndarray, batch_size: int) -> Dict[str, float]:
    """Compute test-set MSE plus a sampled average Kendall's tau for `model`."""
    logging.info("Evaluating final model on test data...")
    y_pred = model.predict(X_test, batch_size=batch_size, verbose=0)
    test_mse = np.mean(np.square(y_test - y_pred))
    logging.info(f"Final Test MSE: {test_mse:.6f}")

    # Kendall's tau over a random subset — computing it for every row can be
    # slow on large test sets.
    sample_size = min(100, X_test.shape[0])
    chosen = np.random.choice(X_test.shape[0], sample_size, replace=False)
    taus = []
    for idx in chosen:
        tau, _ = kendalltau(y_test[idx], y_pred[idx])
        if not np.isnan(tau):  # Constant predictions yield NaN; skip those rows.
            taus.append(tau)
    avg_kendall_tau = np.mean(taus) if taus else 0.0
    logging.info(f"Average Kendall's Tau (on {sample_size} samples): {avg_kendall_tau:.4f}")

    return {
        "test_mse": float(test_mse),
        "avg_kendall_tau": float(avg_kendall_tau)
    }
|
| 317 |
+
|
| 318 |
+
# --- Main Pipeline ---
|
| 319 |
+
def run_pipeline(args: argparse.Namespace):
    """Execute the complete neuroevolution pipeline.

    Steps: create a timestamped output directory, configure logging, persist
    the configuration, seed all RNGs, generate train/test data, evolve a
    population, retrain the best individual with gradient descent, evaluate
    it, and save the model, plots and a JSON results summary.

    Args:
        args: Parsed command-line options (see parse_arguments()).
    """
    # Create unique output directory for this run
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = os.path.join(args.output_base_dir, f"evorun_{timestamp}")
    os.makedirs(output_dir, exist_ok=True)

    # Setup logging for this run
    setup_logging(output_dir)
    logging.info(f"Starting EvoNet Pipeline Run: {timestamp}")
    logging.info(f"Output directory: {output_dir}")

    # Log arguments/configuration
    logging.info("Configuration:")
    args_dict = vars(args)
    for k, v in args_dict.items():
        logging.info(f"  {k}: {v}")
    # Save config to file
    config_path = os.path.join(output_dir, "config.json")
    with open(config_path, 'w') as f:
        json.dump(args_dict, f, indent=4)
    logging.info(f"Configuration saved to {config_path}")

    # Set random seeds for reproducibility (seed is always set by parse_arguments)
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    logging.info(f"Using random seed: {args.seed}")

    # Check GPU
    check_gpu()

    # Generate Data
    X_train, y_train = generate_data(args.train_samples, args.seq_length)
    X_test, y_test = generate_data(args.test_samples, args.seq_length)

    # Initialize Population
    logging.info(f"Initializing population of {args.pop_size} individuals...")
    population = [create_individual(args.seq_length) for _ in range(args.pop_size)]
    logging.info("Population initialized.")

    # Run Evolution
    logging.info(f"Starting evolution for {args.generations} generations...")
    best_model_unevolved, best_fitness_hist, avg_fitness_hist = evolve_population(
        population, X_train, y_train, args.generations,
        args.mutation_rate, args.weight_mut_rate, args.activation_mut_rate, args.mutation_strength,
        args.tournament_size, args.elitism_count, args.batch_size
    )
    logging.info("Evolution complete.")

    # Save fitness history data
    history_path = os.path.join(output_dir, "fitness_history.csv")
    history_data = np.array([best_fitness_hist, avg_fitness_hist]).T
    np.savetxt(history_path, history_data, delimiter=',', header='BestFitness,AvgFitness', comments='')
    logging.info(f"Fitness history data saved to {history_path}")

    # Plot fitness history
    plot_fitness_history(best_fitness_hist, avg_fitness_hist, output_dir)

    # Final Training of the Best Model
    logging.info("Starting final training of the best evolved model...")
    # Clone so the evolved reference is never modified by training.
    final_model = clone_model(best_model_unevolved)
    final_model.set_weights(best_model_unevolved.get_weights())
    # Use a fresh optimizer instance for final training
    final_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

    # Callbacks for efficient training
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6, verbose=1)

    # Use a portion of training data for validation during final training
    history = final_model.fit(
        X_train, y_train,
        epochs=args.epochs_final_train,
        batch_size=args.batch_size,
        validation_split=0.2,  # Use 20% of training data for validation
        callbacks=[early_stopping, reduce_lr],
        verbose=2  # Show one line per epoch
    )
    logging.info("Final training complete.")

    # Evaluate the TRAINED final model
    final_metrics = evaluate_model(final_model, X_test, y_test, args.batch_size)

    # Save the TRAINED final model
    model_path = os.path.join(output_dir, "best_evolved_model_trained.keras")  # Use .keras format
    final_model.save(model_path)
    logging.info(f"Final trained model saved to {model_path}")

    # Save final results
    results = {
        "config": args_dict,
        "final_evaluation": final_metrics,
        "evolution_summary": {
            # BUG FIX: previously this reported the FINAL generation's best
            # (best_fitness_hist[-1]), which is not the overall best when
            # elitism_count == 0. Report the true maximum instead.
            "best_fitness_overall": max(best_fitness_hist) if best_fitness_hist else None,
            "avg_fitness_final_gen": avg_fitness_hist[-1] if avg_fitness_hist else None,
        },
        "training_history": history.history  # Include loss/val_loss history from final training
    }
    results_path = os.path.join(output_dir, "final_results.json")
    # Convert numpy types in history to native Python types for JSON serialization
    for key in results['training_history']:
        results['training_history'][key] = [float(v) for v in results['training_history'][key]]

    with open(results_path, 'w') as f:
        json.dump(results, f, indent=4)
    logging.info(f"Final results saved to {results_path}")
    logging.info("Pipeline finished successfully!")
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
# --- Argument Parser ---
|
| 433 |
+
def parse_arguments() -> argparse.Namespace:
    """Build the CLI parser, parse argv, and backfill a random seed if absent."""
    parser = argparse.ArgumentParser(description="EvoNet: Neuroevolution for Sorting Task")
    add = parser.add_argument  # Shorthand for the many option declarations below.

    # --- Directory ---
    add('--output_base_dir', type=str, default=os.path.join(os.getcwd(), "evonet_runs"),
        help='Base directory to store run results.')

    # --- Data ---
    add('--seq_length', type=int, default=DEFAULT_SEQ_LENGTH,
        help='Length of the sequences to sort.')
    add('--train_samples', type=int, default=5000, help='Number of training samples.')
    add('--test_samples', type=int, default=1000, help='Number of test samples.')

    # --- Evolution Parameters ---
    add('--pop_size', type=int, default=DEFAULT_POP_SIZE, help='Population size.')
    add('--generations', type=int, default=DEFAULT_GENERATIONS, help='Number of generations.')
    add('--mutation_rate', type=float, default=DEFAULT_MUTATION_RATE,
        help='Overall probability of mutating an individual.')
    add('--weight_mut_rate', type=float, default=DEFAULT_WEIGHT_MUT_RATE,
        help='Probability of weight perturbation if mutation occurs.')
    add('--activation_mut_rate', type=float, default=DEFAULT_ACTIVATION_MUT_RATE,
        help='Probability of activation change if mutation occurs.')
    add('--mutation_strength', type=float, default=DEFAULT_MUTATION_STRENGTH,
        help='Standard deviation of Gaussian noise for weight mutation.')
    add('--tournament_size', type=int, default=DEFAULT_TOURNAMENT_SIZE,
        help='Number of individuals participating in tournament selection.')
    add('--elitism_count', type=int, default=DEFAULT_ELITISM_COUNT,
        help='Number of best individuals to carry over directly.')

    # --- Training & Evaluation ---
    add('--batch_size', type=int, default=DEFAULT_BATCH_SIZE, help='Batch size for predictions and training.')
    add('--epochs_final_train', type=int, default=DEFAULT_EPOCHS_FINAL_TRAIN,
        help='Max epochs for final training of the best model.')

    # --- Reproducibility ---
    add('--seed', type=int, default=None, help='Random seed for reproducibility (default: random).')

    args = parser.parse_args()

    # Generate a seed when the user did not supply one, so the run can still
    # be reproduced from the saved configuration.
    if args.seed is None:
        args.seed = random.randint(0, 2**32 - 1)

    return args
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
# --- Main Execution ---
|
| 480 |
+
if __name__ == "__main__":
    # Parse CLI options and make sure the base output directory exists.
    cli_args = parse_arguments()
    os.makedirs(cli_args.output_base_dir, exist_ok=True)

    # Run the pipeline; any uncaught error is reported and exits non-zero.
    try:
        run_pipeline(cli_args)
    except Exception as e:
        # Logging may not be configured yet if the failure happened early,
        # so always print to stderr first.
        print(f"FATAL ERROR in pipeline execution: {e}", file=sys.stderr)
        if logging.getLogger().hasHandlers():
            logging.critical("FATAL ERROR in pipeline execution:", exc_info=True)
        else:
            import traceback
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(1)  # Exit with error code
|
v2.py
ADDED
|
@@ -0,0 +1,643 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==============================================================================
|
| 2 |
+
# EvoNet Optimizer 2 - Revize Edilmiş ve İyileştirilmiş Kod
|
| 3 |
+
# Açıklama: Bu kod, sıralama görevini öğrenmek için rastgele topolojilere
|
| 4 |
+
# sahip sinir ağlarını evrimleştiren bir neuroevolution süreci uygular.
|
| 5 |
+
# Daha sağlam hata kontrolü, yapılandırma, loglama ve iyileştirilmiş
|
| 6 |
+
# evrimsel operatörler içerir.
|
| 7 |
+
# ==============================================================================
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import subprocess
|
| 11 |
+
import sys
|
| 12 |
+
import argparse
|
| 13 |
+
import random
|
| 14 |
+
import logging
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
import json
|
| 17 |
+
from typing import List, Tuple, Dict, Any
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
import tensorflow as tf
|
| 21 |
+
from tensorflow.keras.models import Sequential, load_model, clone_model
|
| 22 |
+
from tensorflow.keras.layers import Dense, Input
|
| 23 |
+
from tensorflow.keras.optimizers import Adam
|
| 24 |
+
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
|
| 25 |
+
import matplotlib.pyplot as plt
|
| 26 |
+
from scipy.stats import kendalltau
|
| 27 |
+
|
| 28 |
+
# --- Sabitler ve Varsayılan Değerler ---
|
| 29 |
+
DEFAULT_SEQ_LENGTH = 10
|
| 30 |
+
DEFAULT_POP_SIZE = 50
|
| 31 |
+
DEFAULT_GENERATIONS = 50
|
| 32 |
+
DEFAULT_MUTATION_RATE = 0.4 # Bireye mutasyon uygulama olasılığı
|
| 33 |
+
DEFAULT_WEIGHT_MUT_RATE = 0.8 # Mutasyon olursa, ağırlık bozulması olasılığı
|
| 34 |
+
DEFAULT_ACTIVATION_MUT_RATE = 0.2 # Mutasyon olursa, aktivasyon değişimi olasılığı
|
| 35 |
+
DEFAULT_MUTATION_STRENGTH = 0.1 # Ağırlık bozulmasının büyüklüğü (std dev)
|
| 36 |
+
DEFAULT_TOURNAMENT_SIZE = 5 # Turnuva seçilimindeki birey sayısı
|
| 37 |
+
DEFAULT_ELITISM_COUNT = 2 # Sonraki nesle doğrudan aktarılacak en iyi birey sayısı
|
| 38 |
+
DEFAULT_EPOCHS_FINAL_TRAIN = 100 # En iyi modelin son eğitimindeki max epoch
|
| 39 |
+
DEFAULT_BATCH_SIZE = 64 # Tahmin ve eğitim için batch boyutu
|
| 40 |
+
DEFAULT_OUTPUT_BASE_DIR = os.path.join(os.getcwd(), "evonet_runs_revised") # Ana çıktı klasörü
|
| 41 |
+
|
| 42 |
+
# --- Loglama Ayarları ---
|
| 43 |
+
def setup_logging(log_dir: str, log_level=logging.INFO) -> None:
    """Configure root logging to write to a run log file and to stdout.

    Any handlers already attached to the root logger are removed first so
    that repeated invocations (e.g. in notebook sessions) do not duplicate
    log output.

    Args:
        log_dir: Directory in which 'evolution_run.log' is created.
        log_level: Minimum level for emitted records (default: INFO).
    """
    log_path = os.path.join(log_dir, 'evolution_run.log')

    # Strip pre-existing handlers before reconfiguring the root logger.
    for stale_handler in list(logging.root.handlers):
        logging.root.removeHandler(stale_handler)

    # 'w' mode: each run overwrites the previous log file.
    file_handler = logging.FileHandler(log_path, mode='w')
    console_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)-8s - %(message)s',
        handlers=[file_handler, console_handler],
    )
    logging.info("Logging setup complete.")
|
| 59 |
+
|
| 60 |
+
# --- GPU Kontrolü ---
|
| 61 |
+
def check_gpu() -> bool:
    """Return True when at least one GPU is usable; enable memory growth on each.

    Memory growth makes TensorFlow allocate GPU memory on demand instead of
    grabbing the whole device up front. Falls back to False (CPU) when no GPU
    exists or when configuring one fails.
    """
    physical = tf.config.list_physical_devices('GPU')
    if not physical:
        logging.warning("GPU not found. Using CPU.")
        return False
    try:
        for device in physical:
            tf.config.experimental.set_memory_growth(device, True)
        logical = tf.config.list_logical_devices('GPU')
        logging.info(f"{len(physical)} Physical GPUs, {len(logical)} Logical GPUs found.")
        if logical:
            device_name = tf.config.experimental.get_device_details(physical[0])['device_name']
            logging.info(f"Using GPU: {device_name}")
        return True
    except RuntimeError as e:
        # set_memory_growth raises RuntimeError if called after device init.
        logging.error(f"Error setting memory growth for GPU: {e}", exc_info=True)
        return False
|
| 79 |
+
|
| 80 |
+
# --- Veri Üretimi ---
|
| 81 |
+
def generate_data(num_samples: int, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Produce random sequences in [0, 100) paired with their sorted versions.

    Args:
        num_samples: Number of rows to generate.
        seq_length: Length of each sequence (columns).

    Returns:
        (X, y): float32 arrays of shape (num_samples, seq_length), where y is
        X sorted ascending along axis 1 — the supervised target for the
        sorting task.

    Raises:
        Exception: re-raised after logging, so the caller can abort the run.
    """
    logging.info(f"Generating {num_samples} samples with sequence length {seq_length}...")
    try:
        inputs = np.random.rand(num_samples, seq_length).astype(np.float32) * 100
        targets = np.sort(inputs, axis=1).astype(np.float32)
        logging.info("Data generation successful.")
        return inputs, targets
    except Exception as e:
        logging.error(f"Error during data generation: {e}", exc_info=True)
        raise  # propagate: data generation failure is fatal for the pipeline
|
| 92 |
+
|
| 93 |
+
# --- Neuroevolution Çekirdeği ---
|
| 94 |
+
def create_individual(seq_length: int, input_shape: Tuple) -> Sequential:
    """Build and compile a Keras Sequential model with a random architecture.

    The network gets 1-4 hidden Dense layers, each with 8-64 units and a
    randomly chosen activation ('relu'/'tanh'/'sigmoid'), followed by a
    linear output layer of width seq_length.

    Args:
        seq_length: Size of the output layer (one unit per sequence element).
        input_shape: Shape tuple for the Input layer (excluding batch dim).

    Returns:
        A compiled Sequential model (Adam lr=0.001, MSE loss) ready for
        weight manipulation or training.

    Raises:
        Exception: re-raised after logging if model construction fails.
    """
    try:
        model = Sequential(name=f"model_random_{random.randint(10000, 99999)}")
        depth = random.randint(1, 4)
        widths = [random.randint(8, 64) for _ in range(depth)]
        acts = [random.choice(['relu', 'tanh', 'sigmoid']) for _ in range(depth)]

        model.add(Input(shape=input_shape))
        for units, act in zip(widths, acts):
            model.add(Dense(units, activation=act))
        model.add(Dense(seq_length, activation='linear'))  # linear head for regression

        # Compile so weights can be read/written and the model trained later.
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        return model
    except Exception as e:
        logging.error(f"Error creating individual model: {e}", exc_info=True)
        raise
|
| 116 |
+
|
| 117 |
+
def get_predictions(model: Sequential, X: tf.Tensor) -> tf.Tensor:
    """Return the model's outputs for X in inference mode (training=False).

    Note: deliberately NOT decorated with @tf.function. The model itself is a
    Python-object argument, and a *different* model instance is passed for
    every individual in the population — tf.function would trace and retain a
    new concrete graph per model, steadily leaking memory over a run (see the
    tf.function retracing guidance in the TensorFlow docs). Calling the model
    directly is correct and avoids the leak.
    """
    return model(X, training=False)
|
| 121 |
+
|
| 122 |
+
def calculate_fitness(individual: Sequential, X: np.ndarray, y: np.ndarray, batch_size: int) -> float:
    """Score an individual as inverse MSE on (X, y); higher is better.

    Args:
        individual: Compiled Keras model to evaluate.
        X: Inputs (ndarray or tf.Tensor; cast to float32 tensor if needed).
        y: Sorted targets (same handling as X).
        batch_size: Accepted for interface compatibility; the forward pass
            here runs on the full tensor in a single call.

    Returns:
        1 / (MSE + 1e-8) as a plain float, or the floor value 1e-8 when the
        score is non-finite/negative or any error occurs.
    """
    if not isinstance(X, tf.Tensor):
        X = tf.cast(X, tf.float32)
    if not isinstance(y, tf.Tensor):
        y = tf.cast(y, tf.float32)

    try:
        predictions = get_predictions(individual, X)
        mse_val = tf.reduce_mean(tf.square(y - predictions)).numpy()

        # Inverse MSE; epsilon guards against division by zero on a perfect fit.
        fitness_score = 1.0 / (mse_val + 1e-8)

        if not np.isfinite(fitness_score) or fitness_score < 0:
            logging.warning(f"Non-finite or negative fitness detected ({fitness_score:.4g}) for model {individual.name}. Assigning minimal fitness.")
            return 1e-8

        return float(fitness_score)

    except tf.errors.InvalidArgumentError as e:
        logging.error(f"TensorFlow InvalidArgumentError during fitness calculation for model {individual.name} (potential shape mismatch?): {e}")
        return 1e-8
    except Exception as e:
        logging.error(f"Unhandled error during fitness calculation for model {individual.name}: {e}", exc_info=True)
        return 1e-8
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def mutate_individual(individual: Sequential, weight_mut_rate: float, act_mut_rate: float, mut_strength: float) -> Sequential:
    """Return a mutated clone of `individual`; the original is never modified.

    Two independent mutation operators are applied to the clone:
      1. Weight perturbation (probability `weight_mut_rate`): Gaussian noise
         with std-dev `mut_strength` added to every Dense kernel and bias.
      2. Activation change (probability `act_mut_rate`): currently a
         placeholder. Safely swapping a layer's activation requires rebuilding
         the model, so only the intended change is logged; no layer is
         replaced. (The previous revision built a `Dense.from_config` layer
         here that was never inserted into the model — that dead code has
         been removed.) The `mutated` flag is still set so the clone is
         recompiled and renamed exactly as before.

    Args:
        individual: Source model to clone and mutate.
        weight_mut_rate: Probability of applying the weight perturbation.
        act_mut_rate: Probability of attempting an activation change.
        mut_strength: Std-dev of the Gaussian weight noise.

    Returns:
        The mutated (or merely cloned) model; on any error, the original
        `individual` is returned unchanged.
    """
    try:
        # Work on a clone so the parent survives intact in the population.
        mutated_model = clone_model(individual)
        mutated_model.set_weights(individual.get_weights())

        mutated = False

        # 1. Weight mutation: jitter every Dense layer's kernel and bias.
        if random.random() < weight_mut_rate:
            mutated = True
            for layer in mutated_model.layers:
                if isinstance(layer, Dense) and layer.get_weights():
                    perturbed = []
                    for wb in layer.get_weights():
                        noise = np.random.normal(0, mut_strength, wb.shape).astype(np.float32)
                        perturbed.append(wb + noise)
                    layer.set_weights(perturbed)

        # 2. Activation mutation (placeholder): choose a hidden layer and a
        #    different activation, but only log the intent — see docstring.
        if random.random() < act_mut_rate:
            dense_layers = [layer for layer in mutated_model.layers if isinstance(layer, Dense)]
            if len(dense_layers) > 1:  # need at least one hidden Dense layer
                layer_to_mutate = random.choice(dense_layers[:-1])  # never the output layer
                current_activation_name = tf.keras.activations.serialize(layer_to_mutate.activation)
                possible_activations = ['relu', 'tanh', 'sigmoid']
                if current_activation_name in possible_activations:
                    possible_activations.remove(current_activation_name)
                if possible_activations:
                    new_activation = random.choice(possible_activations)
                    logging.debug(f"Attempting activation change on layer {layer_to_mutate.name} to {new_activation} (Implementation needs robust handling).")
                    mutated = True  # mark so the clone is recompiled/renamed below

        # Recompile after mutation (optimizer state resets) and tag the name
        # so mutated offspring are identifiable in logs.
        if mutated:
            mutated_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
            mutated_model._name = f"mutated_{individual.name}_{random.randint(1000,9999)}"

        return mutated_model
    except Exception as e:
        logging.error(f"Error during mutation of model {individual.name}: {e}", exc_info=True)
        return individual  # fail safe: hand back the untouched parent
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def tournament_selection(population: List[Sequential], fitness_scores: List[float], k: int) -> Sequential:
    """Sample k individuals uniformly and return the fittest of the group.

    Args:
        population: Candidate models (must be non-empty).
        fitness_scores: Fitness values aligned index-for-index with population.
        k: Tournament size; clamped to len(population) with a warning if larger.

    Returns:
        The winning model; on an unexpected sampling error, a uniformly
        random individual is returned instead.

    Raises:
        ValueError: if the population is empty.
    """
    if not population:
        raise ValueError("Population cannot be empty for selection.")
    if len(population) < k:
        logging.warning(f"Tournament size {k} is larger than population size {len(population)}. Using population size.")
        k = len(population)
    try:
        contender_indices = random.sample(range(len(population)), k)
        contender_scores = [fitness_scores[i] for i in contender_indices]
        winner_idx = contender_indices[np.argmax(contender_scores)]
        return population[winner_idx]
    except Exception as e:
        logging.error(f"Error during tournament selection: {e}", exc_info=True)
        # Fallback: degrade gracefully to uniform random selection.
        return random.choice(population)
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def evolve_population(population: List[Sequential], X: np.ndarray, y: np.ndarray, generations: int,
                      mutation_rate: float, weight_mut_rate: float, act_mut_rate: float, mut_strength: float,
                      tournament_size: int, elitism_count: int, batch_size: int) -> Tuple[Sequential, List[float], List[float]]:
    """Run the evolutionary loop and return the best model plus fitness history.

    Each generation: (1) score every individual with calculate_fitness,
    (2) track the best-so-far model via a defensive clone, (3) build the next
    population from elites plus tournament-selected, possibly-mutated
    offspring.

    Args:
        population: Initial list of compiled Keras models.
        X, y: Training inputs and sorted targets used for fitness scoring.
        generations: Number of generations to evolve.
        mutation_rate: Probability an offspring is mutated at all.
        weight_mut_rate: Weight-perturbation probability inside a mutation.
        act_mut_rate: Activation-change probability inside a mutation.
        mut_strength: Std-dev of weight-perturbation noise.
        tournament_size: Individuals per selection tournament.
        elitism_count: Number of top individuals copied straight over.
        batch_size: Forwarded to calculate_fitness.

    Returns:
        (best_model, best_fitness_history, avg_fitness_history); best_model
        may be None only if the final population ended up empty.
    """
    best_fitness_history = []
    avg_fitness_history = []
    best_model_overall = None
    best_fitness_overall = -np.inf  # start at negative infinity so any score improves it

    # Convert the data to TensorFlow tensors once, outside the generation loop.
    X_tf = tf.cast(X, tf.float32)
    y_tf = tf.cast(y, tf.float32)

    for gen in range(generations):
        generation_start_time = datetime.now()
        # 1. Fitness evaluation
        try:
            # Score the whole population against the training data.
            fitness_scores = [calculate_fitness(ind, X_tf, y_tf, batch_size) for ind in population]
        except Exception as e:
            logging.critical(f"Error calculating fitness for population in Generation {gen+1}: {e}", exc_info=True)
            # Critical failure: salvage what we have. Return the best model so
            # far if one exists; otherwise re-raise to abort the run.
            if best_model_overall: return best_model_overall, best_fitness_history, avg_fitness_history
            else: raise  # no good model yet, so propagate the error

        # 2. Statistics and best-so-far tracking
        current_best_idx = np.argmax(fitness_scores)
        current_best_fitness = fitness_scores[current_best_idx]
        avg_fitness = np.mean(fitness_scores)
        best_fitness_history.append(current_best_fitness)
        avg_fitness_history.append(avg_fitness)

        if current_best_fitness > best_fitness_overall:
            best_fitness_overall = current_best_fitness
            try:
                # Defensively copy the new champion (architecture + weights) so
                # later mutation/replacement of the population can't touch it.
                best_model_overall = clone_model(population[current_best_idx])
                best_model_overall.set_weights(population[current_best_idx].get_weights())
                best_model_overall.compile(optimizer=Adam(), loss='mse')  # recompile the clone
                logging.info(f"Generation {gen+1}: *** New overall best fitness found: {best_fitness_overall:.6f} ***")
            except Exception as e:
                logging.error(f"Could not clone or set weights for the new best model: {e}", exc_info=True)
                # Cloning failed; keep going, but the tracked model may be stale.
                best_fitness_overall = current_best_fitness  # still record the improved fitness

        generation_time = (datetime.now() - generation_start_time).total_seconds()
        logging.info(f"Generation {gen+1}/{generations} | Best Fitness: {current_best_fitness:.6f} | Avg Fitness: {avg_fitness:.6f} | Time: {generation_time:.2f}s")

        # 3. Build the next population
        new_population = []

        # 3a. Elitism: copy the top individuals over unchanged (as fresh clones).
        if elitism_count > 0 and len(population) >= elitism_count:
            try:
                elite_indices = np.argsort(fitness_scores)[-elitism_count:]
                for idx in elite_indices:
                    elite_clone = clone_model(population[idx])
                    elite_clone.set_weights(population[idx].get_weights())
                    elite_clone.compile(optimizer=Adam(), loss='mse')
                    new_population.append(elite_clone)
            except Exception as e:
                logging.error(f"Error during elitism: {e}", exc_info=True)


        # 3b. Selection and reproduction for the remaining slots.
        num_to_generate = len(population) - len(new_population)
        offspring_population = []
        while len(offspring_population) < num_to_generate:
            try:
                # Pick a parent by tournament selection.
                parent = tournament_selection(population, fitness_scores, tournament_size)

                # Produce a child: mutate with probability mutation_rate,
                # otherwise copy the parent verbatim.
                if random.random() < mutation_rate:
                    child = mutate_individual(parent, weight_mut_rate, act_mut_rate, mut_strength)
                else:
                    # Clone even without mutation so no two slots share an object.
                    child = clone_model(parent)
                    child.set_weights(parent.get_weights())
                    child.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
                    child._name = f"cloned_{parent.name}_{random.randint(1000,9999)}"  # tag the copy

                offspring_population.append(child)
            except Exception as e:
                logging.error(f"Error during selection/reproduction cycle: {e}", exc_info=True)
                # Keep the loop alive; top up with a random individual so the
                # population size never shrinks due to reproduction errors.
                if len(offspring_population) < num_to_generate:
                    logging.warning("Adding random individual due to reproduction error.")
                    offspring_population.append(create_individual(y.shape[1], X.shape[1:]))


        new_population.extend(offspring_population)
        population = new_population  # replace the generation

    # Loop finished: make sure we have a best model to return.
    if best_model_overall is None and population:  # no tracked improvement (or cloning always failed)
        logging.warning("No overall best model tracked (or cloning failed). Returning best from final population.")
        final_fitness_scores = [calculate_fitness(ind, X_tf, y_tf, batch_size) for ind in population]
        best_idx_final = np.argmax(final_fitness_scores)
        best_model_overall = population[best_idx_final]
    elif not population:
        logging.error("Evolution finished with an empty population!")
        return None, best_fitness_history, avg_fitness_history


    logging.info(f"Evolution finished. Best fitness achieved: {best_fitness_overall:.6f}")
    return best_model_overall, best_fitness_history, avg_fitness_history
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
# --- Grafik Çizimi ---
|
| 344 |
+
def plot_fitness_history(history_best: List[float], history_avg: List[float], output_dir: str) -> None:
    """Plot best and average fitness per generation and save the PNG.

    Args:
        history_best: Best fitness value recorded for each generation.
        history_avg: Mean population fitness for each generation.
        output_dir: Directory where 'fitness_history.png' is written.

    Does nothing (beyond a warning) when either history is empty; plotting
    errors are logged rather than raised.
    """
    if not history_best or not history_avg:
        logging.warning("Fitness history is empty, cannot plot.")
        return
    try:
        plt.figure(figsize=(12, 7))
        series_specs = (
            (history_best, dict(label="Best Fitness per Generation", marker='o', linestyle='-', linewidth=2)),
            (history_avg, dict(label="Average Fitness per Generation", marker='x', linestyle='--', alpha=0.7)),
        )
        for series, style in series_specs:
            plt.plot(series, **style)
        plt.xlabel("Generation")
        plt.ylabel("Fitness Score (1 / MSE)")
        plt.title("Evolutionary Process Fitness History")
        plt.legend()
        plt.grid(True, which='both', linestyle='--', linewidth=0.5)
        plt.tight_layout()
        plot_path = os.path.join(output_dir, "fitness_history.png")
        plt.savefig(plot_path)
        plt.close()  # free the figure's memory
        logging.info(f"Fitness history plot saved to {plot_path}")
    except Exception as e:
        logging.error(f"Error plotting fitness history: {e}", exc_info=True)
|
| 365 |
+
|
| 366 |
+
# --- Değerlendirme ---
|
| 367 |
+
def evaluate_model(model: Sequential, X_test: np.ndarray, y_test: np.ndarray, batch_size: int) -> Dict[str, float]:
    """Evaluate the final model: test MSE plus mean Kendall's tau on a sample.

    Kendall's tau is computed on up to 500 randomly chosen test rows as a
    rank-correlation measure of how well the predicted ordering matches the
    true sorted order.

    Args:
        model: Trained Keras model (None short-circuits to failure metrics).
        X_test, y_test: Held-out inputs and sorted targets.
        batch_size: Batch size for model.predict.

    Returns:
        {"test_mse": float, "avg_kendall_tau": float}; on a None model or any
        evaluation error, {"test_mse": inf, "avg_kendall_tau": 0.0}.
    """
    if model is None:
        logging.error("Cannot evaluate a None model.")
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
    logging.info("Evaluating final model on test data...")
    try:
        y_pred = model.predict(X_test, batch_size=batch_size, verbose=0)
        test_mse = np.mean(np.square(y_test - y_pred))
        logging.info(f"Final Test MSE: {test_mse:.6f}")

        # Rank correlation on a bounded random sample of test rows.
        sample_size = min(500, X_test.shape[0])
        taus = []
        sampled_rows = np.random.choice(X_test.shape[0], sample_size, replace=False)
        for i in sampled_rows:
            try:
                tau, _ = kendalltau(y_test[i], y_pred[i])
            except ValueError as ve:  # e.g. constant predictions
                logging.debug(f"Kendall tau ValueError for sample {i}: {ve}")
                continue
            if not np.isnan(tau):
                taus.append(tau)

        avg_kendall_tau = np.mean(taus) if taus else 0.0
        logging.info(f"Average Kendall's Tau (on {sample_size} samples): {avg_kendall_tau:.4f}")

        return {
            "test_mse": float(test_mse),
            "avg_kendall_tau": float(avg_kendall_tau),
        }
    except Exception as e:
        logging.error(f"Error during final model evaluation: {e}", exc_info=True)
        # Worst-case metrics so downstream reporting still works.
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
|
| 399 |
+
|
| 400 |
+
# --- Ana İş Akışı ---
|
| 401 |
+
def run_pipeline(args: argparse.Namespace):
    """Run the full neuroevolution workflow end to end.

    Steps: create a unique run directory, configure logging, persist the
    configuration, seed RNGs, check for a GPU, generate train/test data,
    initialize and evolve the population, train/evaluate/save the best model,
    and write a final_results.json summary.

    Bug fix vs. the previous revision: the results summary referenced
    `best_fitness_overall`, a local variable of evolve_population that never
    exists in this scope, guaranteeing a NameError at save time. The value is
    now derived from `best_fitness_hist`.

    Args:
        args: Parsed command-line arguments (see parse_arguments).
    """

    # Create a unique, timestamped output directory for this run.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"evorun_{timestamp}_gen{args.generations}_pop{args.pop_size}"
    output_dir = os.path.join(args.output_base_dir, run_name)
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        # Logging isn't configured yet, so report to stderr and bail out.
        print(f"FATAL: Could not create output directory: {output_dir}. Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Configure logging (file + console) inside the run directory.
    setup_logging(output_dir)
    logging.info(f"========== Starting EvoNet Pipeline Run: {run_name} ==========")
    logging.info(f"Output directory: {output_dir}")

    # Log and persist the configuration for reproducibility.
    logging.info("--- Configuration ---")
    args_dict = vars(args)
    for k, v in args_dict.items():
        logging.info(f"  {k:<20}: {v}")
    logging.info("---------------------")
    config_path = os.path.join(output_dir, "config.json")
    try:
        with open(config_path, 'w') as f:
            json.dump(args_dict, f, indent=4, sort_keys=True)
        logging.info(f"Configuration saved to {config_path}")
    except Exception as e:
        logging.error(f"Failed to save configuration: {e}", exc_info=True)


    # Seed all RNGs for reproducibility.
    try:
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.random.set_seed(args.seed)
        logging.info(f"Using random seed: {args.seed}")
        # Deterministic ops (TensorFlow >= 2.8) would further improve
        # reproducibility at a performance cost:
        # tf.config.experimental.enable_op_determinism()
    except Exception as e:
        logging.warning(f"Could not set all random seeds: {e}")


    # GPU availability check (also enables memory growth).
    is_gpu_available = check_gpu()

    # Data generation; failure here is fatal.
    try:
        X_train, y_train = generate_data(args.train_samples, args.seq_length)
        X_test, y_test = generate_data(args.test_samples, args.seq_length)
        input_shape = X_train.shape[1:]  # input shape for model construction
    except Exception:
        logging.critical("Failed to generate data. Exiting.")
        sys.exit(1)


    # Population initialization; failure here is fatal.
    logging.info(f"--- Initializing Population (Size: {args.pop_size}) ---")
    try:
        population = [create_individual(args.seq_length, input_shape) for _ in range(args.pop_size)]
        logging.info("Population initialized successfully.")
    except Exception:
        logging.critical("Failed to initialize population. Exiting.")
        sys.exit(1)

    # Evolution.
    logging.info(f"--- Starting Evolution ({args.generations} Generations) ---")
    try:
        best_model_unevolved, best_fitness_hist, avg_fitness_hist = evolve_population(
            population, X_train, y_train, args.generations,
            args.mutation_rate, args.weight_mut_rate, args.activation_mut_rate, args.mutation_strength,
            args.tournament_size, args.elitism_count, args.batch_size
        )
    except Exception as e:
        logging.critical(f"Fatal error during evolution process: {e}", exc_info=True)
        sys.exit(1)
    logging.info("--- Evolution Complete ---")

    # Save and plot the fitness history.
    if best_fitness_hist and avg_fitness_hist:
        history_path = os.path.join(output_dir, "fitness_history.csv")
        try:
            history_data = np.array([np.arange(1, len(best_fitness_hist) + 1), best_fitness_hist, avg_fitness_hist]).T
            np.savetxt(history_path, history_data, delimiter=',', header='Generation,BestFitness,AvgFitness', comments='', fmt=['%d', '%.8f', '%.8f'])
            logging.info(f"Fitness history data saved to {history_path}")
        except Exception as e:
            logging.error(f"Could not save fitness history data: {e}", exc_info=True)
        plot_fitness_history(best_fitness_hist, avg_fitness_hist, output_dir)
    else:
        logging.warning("Fitness history is empty, skipping saving/plotting.")


    # Final training of the best evolved model.
    if best_model_unevolved is None:
        logging.error("Evolution did not yield a best model. Skipping final training and evaluation.")
        final_metrics = {"test_mse": np.inf, "avg_kendall_tau": 0.0}
        final_model_path = None
        training_summary = {}
    else:
        logging.info("--- Starting Final Training of Best Evolved Model ---")
        try:
            # Clone and recompile the champion before training, for safety.
            final_model = clone_model(best_model_unevolved)
            final_model.set_weights(best_model_unevolved.get_weights())
            # A different learning rate could be tried for the final training.
            final_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
            logging.info("Model Summary of Best Evolved (Untrained):")
            final_model.summary(print_fn=logging.info)


            # Callbacks: early stopping with best-weight restore, LR schedule.
            early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1)
            reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=7, min_lr=1e-7, verbose=1)

            history = final_model.fit(
                X_train, y_train,
                epochs=args.epochs_final_train,
                batch_size=args.batch_size,
                validation_split=0.2,  # hold out 20% of training data for validation
                callbacks=[early_stopping, reduce_lr],
                verbose=2  # one log line per epoch
            )
            logging.info("Final training complete.")
            training_summary = {
                "epochs_run": len(history.history['loss']),
                "final_train_loss": history.history['loss'][-1],
                "final_val_loss": history.history['val_loss'][-1]
            }

            # Evaluate the trained model on the held-out test set.
            final_metrics = evaluate_model(final_model, X_test, y_test, args.batch_size)

            # Persist the trained model.
            final_model_path = os.path.join(output_dir, "best_evolved_model_trained.keras")
            final_model.save(final_model_path)
            logging.info(f"Final trained model saved to {final_model_path}")

        except Exception as e:
            logging.error(f"Error during final training or evaluation: {e}", exc_info=True)
            final_metrics = {"test_mse": np.inf, "avg_kendall_tau": 0.0}
            final_model_path = None
            training_summary = {"error": str(e)}


    # Assemble and save the final results summary.
    logging.info("--- Saving Final Results ---")
    final_results = {
        "run_info": {
            "run_name": run_name,
            "timestamp": timestamp,
            "output_directory": output_dir,
            "gpu_used": is_gpu_available,
        },
        "config": args_dict,
        "evolution_summary": {
            "generations_run": len(best_fitness_hist) if best_fitness_hist else 0,
            # FIX: derive the best fitness from the recorded history;
            # `best_fitness_overall` is local to evolve_population and was a
            # NameError here.
            "best_fitness_achieved": max(best_fitness_hist) if best_fitness_hist else None,
            "best_fitness_final_gen": best_fitness_hist[-1] if best_fitness_hist else None,
            "avg_fitness_final_gen": avg_fitness_hist[-1] if avg_fitness_hist else None,
        },
        "final_training_summary": training_summary,
        "final_evaluation_on_test": final_metrics,
        "saved_model_path": final_model_path
    }
    results_path = os.path.join(output_dir, "final_results.json")
    try:
        # Convert NumPy scalar/array types into JSON-serializable equivalents.
        def convert_numpy_types(obj):
            if isinstance(obj, np.integer): return int(obj)
            elif isinstance(obj, np.floating): return float(obj)
            elif isinstance(obj, np.ndarray): return obj.tolist()
            return obj
        with open(results_path, 'w') as f:
            json.dump(final_results, f, indent=4, default=convert_numpy_types)
        logging.info(f"Final results summary saved to {results_path}")
    except Exception as e:
        logging.error(f"Failed to save final results JSON: {e}", exc_info=True)

    logging.info(f"========== Pipeline Run {run_name} Finished ==========")
|
| 582 |
+
|
| 583 |
+
|
| 584 |
+
# --- Argument parser ---
def parse_arguments() -> argparse.Namespace:
    """Parse command-line options for the EvoNet sorting-task run.

    Returns an argparse.Namespace. When no ``--seed`` is supplied, a
    random one is generated, stored on the namespace, and printed
    (logging is not configured yet at this point).
    """
    parser = argparse.ArgumentParser(description="EvoNet Revised: Neuroevolution for Sorting Task")
    add = parser.add_argument

    # Directories
    add('--output_base_dir', type=str, default=DEFAULT_OUTPUT_BASE_DIR,
        help='Base directory to store run results.')

    # Data settings
    add('--seq_length', type=int, default=DEFAULT_SEQ_LENGTH, help='Length of sequences.')
    add('--train_samples', type=int, default=5000, help='Number of training samples.')
    add('--test_samples', type=int, default=1000, help='Number of test samples.')

    # Evolution parameters
    add('--pop_size', type=int, default=DEFAULT_POP_SIZE, help='Population size.')
    add('--generations', type=int, default=DEFAULT_GENERATIONS, help='Number of generations.')
    add('--mutation_rate', type=float, default=DEFAULT_MUTATION_RATE, help='Overall mutation probability.')
    add('--weight_mut_rate', type=float, default=DEFAULT_WEIGHT_MUT_RATE, help='Weight mutation probability (if mutation occurs).')
    add('--activation_mut_rate', type=float, default=DEFAULT_ACTIVATION_MUT_RATE, help='Activation mutation probability (if mutation occurs).')
    add('--mutation_strength', type=float, default=DEFAULT_MUTATION_STRENGTH, help='Std dev for weight mutation noise.')
    add('--tournament_size', type=int, default=DEFAULT_TOURNAMENT_SIZE, help='Number of individuals in tournament selection.')
    add('--elitism_count', type=int, default=DEFAULT_ELITISM_COUNT, help='Number of elite individuals to carry over.')

    # Training and evaluation
    add('--batch_size', type=int, default=DEFAULT_BATCH_SIZE, help='Batch size for predictions and final training.')
    add('--epochs_final_train', type=int, default=DEFAULT_EPOCHS_FINAL_TRAIN, help='Max epochs for final training.')

    # Reproducibility
    add('--seed', type=int, default=None, help='Random seed (default: random).')

    args = parser.parse_args()

    # Choose a default seed when none was provided.
    if args.seed is None:
        args.seed = random.randint(0, 2**32 - 1)
        print(f"Generated random seed: {args.seed}")

    return args
|
| 622 |
+
|
| 623 |
+
|
| 624 |
+
# --- Main execution block ---
if __name__ == "__main__":
    # Parse command-line arguments
    cli_args = parse_arguments()

    # Run the main workflow
    try:
        run_pipeline(cli_args)
    except SystemExit:  # catch sys.exit() calls and exit normally
        pass
    except Exception as e:
        # Try to print the error even if logging has not been set up
        print(f"\nFATAL UNHANDLED ERROR in main execution block: {e}", file=sys.stderr)
        # Also write it to the log if logging was configured
        if logging.getLogger().hasHandlers():
            logging.critical("FATAL UNHANDLED ERROR in main execution block:", exc_info=True)
        else:
            import traceback
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(1)  # exit with an error code
|
v3.py
ADDED
|
@@ -0,0 +1,784 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==============================================================================
|
| 2 |
+
# EvoNet Optimizer - v3 - Daha İleri İyileştirmeler
|
| 3 |
+
# Açıklama: Çaprazlama, Kontrol Noktası eklenmiş, Adaptif Mutasyon ve
|
| 4 |
+
# Gelişmiş Fitness için kavramsal öneriler içeren versiyon.
|
| 5 |
+
# ==============================================================================
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import subprocess
|
| 9 |
+
import sys
|
| 10 |
+
import argparse
|
| 11 |
+
import random
|
| 12 |
+
import logging
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
import json
|
| 15 |
+
import pickle # Checkpointing için
|
| 16 |
+
import time # Checkpointing için
|
| 17 |
+
from typing import List, Tuple, Dict, Any, Optional
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
import tensorflow as tf
|
| 21 |
+
from tensorflow.keras.models import Sequential, load_model, clone_model
|
| 22 |
+
from tensorflow.keras.layers import Dense, Input
|
| 23 |
+
from tensorflow.keras.optimizers import Adam
|
| 24 |
+
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
|
| 25 |
+
import matplotlib.pyplot as plt
|
| 26 |
+
from scipy.stats import kendalltau
|
| 27 |
+
|
| 28 |
+
# --- Constants and default values ---
DEFAULT_SEQ_LENGTH = 10
DEFAULT_POP_SIZE = 50
DEFAULT_GENERATIONS = 50
DEFAULT_CROSSOVER_RATE = 0.6  # probability of applying crossover
DEFAULT_MUTATION_RATE = 0.4  # probability of applying mutation (when crossover does not happen)
DEFAULT_WEIGHT_MUT_RATE = 0.8
DEFAULT_ACTIVATION_MUT_RATE = 0.2  # activation mutation is still experimental
DEFAULT_MUTATION_STRENGTH = 0.1
DEFAULT_TOURNAMENT_SIZE = 5
DEFAULT_ELITISM_COUNT = 2
DEFAULT_EPOCHS_FINAL_TRAIN = 100
DEFAULT_BATCH_SIZE = 64
DEFAULT_OUTPUT_BASE_DIR = os.path.join(os.getcwd(), "evonet_runs_v3")
DEFAULT_CHECKPOINT_INTERVAL = 10  # take a checkpoint every N generations (0 = disabled)
|
| 43 |
+
|
| 44 |
+
# --- Logging setup ---
# (setup_logging is the same as in the previous version)
def setup_logging(log_dir: str, log_level=logging.INFO) -> None:
    """Configure root logging to both a run log file and stdout.

    Pre-existing root handlers are removed first so repeated calls
    (e.g. when resuming a run) do not duplicate output. The file handler
    appends, preserving history across resumed runs.
    """
    log_filename = os.path.join(log_dir, 'evolution_run.log')

    # Drop any handlers already attached to the root logger.
    root = logging.root
    for existing in list(root.handlers):
        root.removeHandler(existing)

    file_handler = logging.FileHandler(log_filename, mode='a')  # 'a': append for resuming
    console_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)-8s - %(message)s',
        handlers=[file_handler, console_handler],
    )
    logging.info("Logging setup complete.")
|
| 58 |
+
|
| 59 |
+
# --- GPU check ---
# (check_gpu is the same as in the previous version)
def check_gpu() -> bool:
    """Detect physical GPUs, enable memory growth on each, and log the result.

    Returns True when at least one GPU is usable; False when none exist or
    enabling memory growth raises a RuntimeError.
    """
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # Memory growth must be set before GPUs are initialized.
            for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.list_logical_devices('GPU')
            logging.info(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs found.")
            if logical_gpus: logging.info(f"Using GPU: {tf.config.experimental.get_device_details(gpus[0])['device_name']}")
            return True
        except RuntimeError as e:
            logging.error(f"Error setting memory growth for GPU: {e}", exc_info=True)
            return False
    else:
        logging.warning("GPU not found. Using CPU.")
        return False
|
| 76 |
+
|
| 77 |
+
# --- Data generation ---
# (generate_data is the same as in the previous version)
def generate_data(num_samples: int, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Create a random sorting dataset.

    The features are uniform random float32 values scaled to [0, 100);
    the targets are the same rows sorted ascending — the mapping the
    network must learn.
    """
    logging.info(f"Generating {num_samples} samples with sequence length {seq_length}...")
    try:
        features = np.random.rand(num_samples, seq_length).astype(np.float32) * 100
        targets = np.sort(features, axis=1).astype(np.float32)
        logging.info("Data generation successful.")
        return features, targets
    except Exception as e:
        logging.error(f"Error during data generation: {e}", exc_info=True)
        raise
|
| 89 |
+
|
| 90 |
+
# --- Neuroevolution core ---

def create_individual(seq_length: int, input_shape: Tuple) -> Sequential:
    """Create and compile a Keras Sequential model with a random architecture."""
    # (Largely the same as the previous version; the name was revised.)
    try:
        model = Sequential(name=f"model_rnd_{random.randint(10000, 99999)}")
        # 1-4 hidden Dense layers, each with a random width (8-64) and activation.
        num_hidden_layers = random.randint(1, 4)
        neurons_per_layer = [random.randint(8, 64) for _ in range(num_hidden_layers)]
        activations = [random.choice(['relu', 'tanh', 'sigmoid']) for _ in range(num_hidden_layers)]
        model.add(Input(shape=input_shape))
        for i in range(num_hidden_layers):
            model.add(Dense(neurons_per_layer[i], activation=activations[i]))
        # Linear output head: one regression value per sequence position.
        model.add(Dense(seq_length, activation='linear'))
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        return model
    except Exception as e:
        logging.error(f"Error creating individual model: {e}", exc_info=True)
        raise
|
| 109 |
+
|
| 110 |
+
@tf.function
def get_predictions(model: Sequential, X: tf.Tensor) -> tf.Tensor:
    """Run a forward pass in inference mode inside a compiled tf.function."""
    return model(X, training=False)
|
| 114 |
+
|
| 115 |
+
def calculate_fitness(individual: Sequential, X: np.ndarray, y: np.ndarray, batch_size: int, fitness_params: Optional[Dict] = None) -> float:
    """Compute an individual's fitness (inverse MSE on the full dataset).

    Returns -1e7 as a sentinel when the score is non-finite, extremely
    low, or an exception occurs, so bad individuals lose every tournament.

    NOTE(review): `batch_size` and `fitness_params` are currently unused —
    the whole dataset goes through the model in one forward pass; confirm
    memory headroom for large datasets.
    """
    # --- CONCEPTUAL: advanced fitness function ---
    # Only MSE is used here. For a more advanced fitness:
    # 1. Compute additional metrics (e.g. Kendall Tau).
    # 2. Compute model complexity (e.g. parameter count).
    # 3. Combine these values with a weighted formula.
    # fitness_params = fitness_params or {}
    # w_mse = fitness_params.get('w_mse', 1.0)
    # w_tau = fitness_params.get('w_tau', 0.1)
    # w_comp = fitness_params.get('w_comp', 0.0001)
    # --------------------------------------------
    if not isinstance(X, tf.Tensor): X = tf.cast(X, tf.float32)
    if not isinstance(y, tf.Tensor): y = tf.cast(y, tf.float32)
    try:
        y_pred_tf = get_predictions(individual, X)
        mse = tf.reduce_mean(tf.square(y - y_pred_tf))
        mse_val = mse.numpy()
        fitness_score = 1.0 / (mse_val + 1e-8)  # base fitness: inverse MSE

        # --- CONCEPTUAL: advanced fitness computation ---
        # if w_tau > 0 or w_comp > 0:
        #     # Compute Kendall Tau (can be costly; sampling may be needed)
        #     tau_val = calculate_avg_kendall_tau(y.numpy(), y_pred_tf.numpy(), sample_size=100)  # example helper
        #     # Compute complexity
        #     complexity = individual.count_params()
        #     # Combined fitness
        #     fitness_score = w_mse * fitness_score + w_tau * tau_val - w_comp * complexity
        # --------------------------------------------

        if not np.isfinite(fitness_score) or fitness_score < -1e6:  # guard for fitness schemes that can go negative
            logging.warning(f"Non-finite or very low fitness ({fitness_score:.4g}) for model {individual.name}. Assigning minimal fitness.")
            return -1e7  # lower bound, since an advanced fitness could be negative
        return float(fitness_score)
    except Exception as e:
        logging.error(f"Error during fitness calculation for model {individual.name}: {e}", exc_info=True)
        return -1e7
|
| 152 |
+
|
| 153 |
+
# (Activation mutation is still experimental; the main focus is weight mutation)
def mutate_individual(individual: Sequential, weight_mut_rate: float, mut_strength: float) -> Sequential:
    """Return a (possibly) weight-perturbed clone of the given individual.

    With probability `weight_mut_rate`, Gaussian noise with std
    `mut_strength` is added to every Dense layer's weights and biases.
    On any error, the ORIGINAL model is returned unchanged.
    """
    try:
        mutated_model = clone_model(individual)
        mutated_model.set_weights(individual.get_weights())
        mutated = False
        if random.random() < weight_mut_rate:  # weight-mutation probability (could be combined with the external overall rate)
            mutated = True
            for layer in mutated_model.layers:
                if isinstance(layer, Dense) and layer.get_weights():
                    weights_biases = layer.get_weights()
                    new_weights_biases = [wb + np.random.normal(0, mut_strength, wb.shape).astype(np.float32) for wb in weights_biases]
                    layer.set_weights(new_weights_biases)

        if mutated:
            # Recompile and rename only when something actually changed.
            mutated_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
            mutated_model._name = f"mutated_{individual.name}_{random.randint(1000,9999)}"
        return mutated_model
    except Exception as e:
        logging.error(f"Error during mutation of model {individual.name}: {e}", exc_info=True)
        return individual
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def check_architecture_compatibility(model1: Sequential, model2: Sequential) -> bool:
    """Return True when the two models can be crossed over naively.

    Compatibility means the same number of layers with matching layer
    types, position by position. Finer checks (e.g. unit counts) are
    deliberately omitted to keep this simple.
    """
    layers_a, layers_b = model1.layers, model2.layers
    if len(layers_a) != len(layers_b):
        return False
    return all(type(a) == type(b) for a, b in zip(layers_a, layers_b))
|
| 186 |
+
|
| 187 |
+
def crossover_individuals(parent1: Sequential, parent2: Sequential) -> Tuple[Optional[Sequential], Optional[Sequential]]:
    """Create two children from two parents by simple per-weight mixing.

    Returns (None, None) when the architectures are incompatible or any
    error occurs, so the caller can fall back to mutation/cloning.
    """
    # Check architecture compatibility (simple version)
    if not check_architecture_compatibility(parent1, parent2):
        logging.debug("Skipping crossover due to incompatible architectures.")
        return None, None  # do not cross incompatible parents

    try:
        # Start the children as clones of the parents
        child1 = clone_model(parent1)
        child2 = clone_model(parent2)
        child1.set_weights(parent1.get_weights())  # assign initial weights
        child2.set_weights(parent2.get_weights())

        p1_weights = parent1.get_weights()
        p2_weights = parent2.get_weights()
        child1_new_weights = []
        child2_new_weights = []

        # Cross the weights layer by layer
        for i in range(len(p1_weights)):  # loop over weight matrices / bias vectors
            w1 = p1_weights[i]
            w2 = p2_weights[i]
            # Simple averaging or random selection (example: random selection)
            mask = np.random.rand(*w1.shape) < 0.5
            cw1 = np.where(mask, w1, w2)
            cw2 = np.where(mask, w2, w1)  # with the inverse mask
            # Or a simple average: cw1 = (w1 + w2) / 2.0; cw2 = cw1
            child1_new_weights.append(cw1.astype(np.float32))
            child2_new_weights.append(cw2.astype(np.float32))


        child1.set_weights(child1_new_weights)
        child2.set_weights(child2_new_weights)

        # Compile the children
        child1.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        child2.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        child1._name = f"xover_{parent1.name[:10]}_{parent2.name[:10]}_c1_{random.randint(1000,9999)}"
        child2._name = f"xover_{parent1.name[:10]}_{parent2.name[:10]}_c2_{random.randint(1000,9999)}"
        #logging.debug(f"Crossover performed between {parent1.name} and {parent2.name}")
        return child1, child2

    except Exception as e:
        logging.error(f"Error during crossover between {parent1.name} and {parent2.name}: {e}", exc_info=True)
        return None, None  # produce no children on error
|
| 233 |
+
|
| 234 |
+
# (tournament_selection is the same as in the previous version)
def tournament_selection(population: List[Sequential], fitness_scores: List[float], k: int) -> Sequential:
    """Pick one parent via k-way tournament: sample k individuals, return the fittest.

    Falls back to a uniformly random individual if the tournament itself
    raises; raises ValueError only for an empty population.
    """
    if not population:
        raise ValueError("Population cannot be empty.")
    k = min(k, len(population))  # clamp tournament size to the population
    try:
        contender_indices = random.sample(range(len(population)), k)
        contender_scores = [fitness_scores[i] for i in contender_indices]
        winner_index = contender_indices[np.argmax(contender_scores)]
        return population[winner_index]
    except Exception as e:
        logging.error(f"Error during tournament selection: {e}", exc_info=True)
        return random.choice(population)
|
| 247 |
+
|
| 248 |
+
# --- Checkpointing ---
def save_checkpoint(output_dir: str, generation: int, population: List[Sequential], rnd_state: Tuple, np_rnd_state: Tuple, tf_rnd_state: Any):
    """Persist the evolution state to <output_dir>/checkpoints/evo_gen_<N>.pkl.

    Each model is stored as its Keras config plus weight arrays inside one
    pickle, together with the Python/NumPy RNG states and a timestamp.
    Models that fail to serialize are dropped. Failures are logged, never
    raised, so a checkpoint problem cannot kill a run.
    """
    checkpoint_dir = os.path.join(output_dir, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_file = os.path.join(checkpoint_dir, f"evo_gen_{generation}.pkl")
    logging.info(f"Saving checkpoint for generation {generation} to {checkpoint_file}...")
    try:
        # Collect each model's weights and config so it can be rebuilt on load
        population_state = []
        for model in population:
            try:
                # Option considered: save each model to disk first (possibly more robust, but slow)
                # model_path = os.path.join(checkpoint_dir, f"model_gen{generation}_{model.name}.keras")
                # model.save(model_path)
                # population_state.append({"config": model.get_config(), "saved_path": model_path})

                # Option used: embed weights and config inside the pickle (riskier)
                population_state.append({
                    "name": model.name,
                    "config": model.get_config(),
                    "weights": model.get_weights()
                })
            except Exception as e:
                logging.error(f"Could not serialize model {model.name} for checkpoint: {e}")
                population_state.append(None)  # placeholder on failure

        state = {
            "generation": generation,
            "population_state": [p for p in population_state if p is not None],  # drop the failed entries
            "random_state": rnd_state,
            "numpy_random_state": np_rnd_state,
            "tensorflow_random_state": tf_rnd_state,  # pickling TensorFlow RNG state can be problematic
            "timestamp": datetime.now().isoformat()
        }
        with open(checkpoint_file, 'wb') as f:
            pickle.dump(state, f)
        logging.info(f"Checkpoint saved successfully for generation {generation}.")
    except Exception as e:
        logging.error(f"Failed to save checkpoint for generation {generation}: {e}", exc_info=True)
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def load_checkpoint(checkpoint_path: str) -> Optional[Dict]:
    """Load a saved evolution state.

    Rebuilds each population model from its pickled config/weights and
    recompiles it. Returns None when the file is missing, unreadable, or
    no model at all could be restored; models that fail individually are
    skipped. The restored models are placed under state["population"].

    NOTE(review): uses pickle.load — only feed it checkpoints this
    program created; pickle is unsafe on untrusted input.
    """
    if not os.path.exists(checkpoint_path):
        logging.error(f"Checkpoint file not found: {checkpoint_path}")
        return None
    logging.info(f"Loading checkpoint from {checkpoint_path}...")
    try:
        with open(checkpoint_path, 'rb') as f:
            state = pickle.load(f)

        population = []
        for model_state in state["population_state"]:
            try:
                # If the model was saved as a separate file:
                # model = load_model(model_state["saved_path"])
                # population.append(model)

                # If it was embedded inside the pickle:
                model = Sequential.from_config(model_state["config"])
                model.set_weights(model_state["weights"])
                # The model MUST be recompiled!
                model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
                model._name = model_state.get("name", f"model_loaded_{random.randint(1000,9999)}")  # restore the name
                population.append(model)
            except Exception as e:
                logging.error(f"Failed to load model state from checkpoint for model {model_state.get('name', 'UNKNOWN')}: {e}")

        # Keep only the models that loaded successfully
        state["population"] = population
        if not population:
            logging.error("Failed to load any model from the checkpoint population state.")
            return None  # the checkpoint is invalid if no model could be loaded

        logging.info(f"Checkpoint loaded successfully. Resuming from generation {state['generation'] + 1}.")
        return state
    except Exception as e:
        logging.error(f"Failed to load checkpoint from {checkpoint_path}: {e}", exc_info=True)
        return None
|
| 328 |
+
|
| 329 |
+
def find_latest_checkpoint(output_dir: str) -> Optional[str]:
    """Return the path of the newest checkpoint file under output_dir, or None.

    Checkpoints are files named ``evo_gen_<N>.pkl`` inside the
    ``checkpoints`` subdirectory; the highest generation number wins.
    Files whose name cannot be parsed are skipped with a warning.
    """
    checkpoint_dir = os.path.join(output_dir, "checkpoints")
    if not os.path.isdir(checkpoint_dir):
        return None

    candidates = [name for name in os.listdir(checkpoint_dir)
                  if name.startswith("evo_gen_") and name.endswith(".pkl")]
    if not candidates:
        return None

    # Extract the generation number from each filename; keep the highest.
    best_generation = -1
    best_path = None
    for name in candidates:
        try:
            generation = int(name.split('_')[2].split('.')[0])
        except (IndexError, ValueError):
            logging.warning(f"Could not parse generation number from checkpoint file: {name}")
            continue
        if generation > best_generation:
            best_generation = generation
            best_path = os.path.join(checkpoint_dir, name)
    return best_path
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
# --- Main evolution loop (with checkpointing and crossover) ---
def evolve_population_v3(population: List[Sequential], X: np.ndarray, y: np.ndarray, start_generation: int, total_generations: int,
                         crossover_rate: float, mutation_rate: float, weight_mut_rate: float, mut_strength: float,
                         tournament_size: int, elitism_count: int, batch_size: int,
                         output_dir: str, checkpoint_interval: int) -> Tuple[Optional[Sequential], List[float], List[float]]:
    """Run the evolutionary process (includes checkpointing and crossover).

    Each generation: evaluate fitness, track the best-so-far model (as a
    clone), carry over `elitism_count` elites, then fill the rest of the
    population via tournament selection followed by crossover (probability
    `crossover_rate`), mutation (probability `mutation_rate`) or plain
    cloning. A checkpoint is written every `checkpoint_interval`
    generations (0 disables it).

    Returns (best_model_overall, best_fitness_history, avg_fitness_history);
    the model is None if the population ended up empty.
    """
    best_fitness_history = []
    avg_fitness_history = []
    best_model_overall = None
    best_fitness_overall = -np.inf

    X_tf = tf.cast(X, tf.float32)
    y_tf = tf.cast(y, tf.float32)

    # --- CONCEPTUAL: adaptive mutation rate ---
    # current_mutation_rate = mutation_rate  # initial value
    # stagnation_counter = 0
    # --------------------------------------------

    for gen in range(start_generation, total_generations):
        generation_start_time = datetime.now()
        # 1. Fitness evaluation
        try:
            fitness_scores = [calculate_fitness(ind, X_tf, y_tf, batch_size) for ind in population]
        except Exception as e:
            logging.critical(f"Error calculating fitness for population in Generation {gen+1}: {e}", exc_info=True)
            # Salvage: return the best model seen so far if we have one.
            if best_model_overall: return best_model_overall, best_fitness_history, avg_fitness_history
            else: raise

        # 2. Statistics and best-so-far tracking
        current_best_idx = np.argmax(fitness_scores)
        current_best_fitness = fitness_scores[current_best_idx]
        avg_fitness = np.mean(fitness_scores)
        best_fitness_history.append(current_best_fitness)
        avg_fitness_history.append(avg_fitness)

        new_best_found = False
        if current_best_fitness > best_fitness_overall:
            best_fitness_overall = current_best_fitness
            new_best_found = True
            try:
                # Keep an independent clone so later evolution cannot mutate it.
                best_model_overall = clone_model(population[current_best_idx])
                best_model_overall.set_weights(population[current_best_idx].get_weights())
                best_model_overall.compile(optimizer=Adam(), loss='mse')
                logging.info(f"Generation {gen+1}: *** New overall best fitness found: {best_fitness_overall:.6f} ***")
            except Exception as e:
                logging.error(f"Could not clone new best model: {e}", exc_info=True)
                best_fitness_overall = current_best_fitness  # only update the fitness value

        generation_time = (datetime.now() - generation_start_time).total_seconds()
        logging.info(f"Generation {gen+1}/{total_generations} | Best Fitness: {current_best_fitness:.6f} | Avg Fitness: {avg_fitness:.6f} | Time: {generation_time:.2f}s")

        # --- CONCEPTUAL: adaptive mutation rate update ---
        # if new_best_found:
        #     stagnation_counter = 0
        #     # current_mutation_rate = max(min_mutation_rate, current_mutation_rate * 0.98)  # decrease
        # else:
        #     stagnation_counter += 1
        #     if stagnation_counter > stagnation_limit:
        #         # current_mutation_rate = min(max_mutation_rate, current_mutation_rate * 1.1)  # increase
        #         stagnation_counter = 0  # reset the counter
        # logging.debug(f"Current mutation rate: {current_mutation_rate:.4f}")
        # --------------------------------------------

        # 3. Build the new population
        new_population = []

        # 3a. Elitism
        if elitism_count > 0 and len(population) >= elitism_count:
            try:
                elite_indices = np.argsort(fitness_scores)[-elitism_count:]
                for idx in elite_indices:
                    elite_clone = clone_model(population[idx])
                    elite_clone.set_weights(population[idx].get_weights())
                    elite_clone.compile(optimizer=Adam(), loss='mse')
                    new_population.append(elite_clone)
            except Exception as e:
                logging.error(f"Error during elitism: {e}", exc_info=True)


        # 3b. Selection, crossover and mutation
        num_to_generate = len(population) - len(new_population)
        generated_count = 0
        while generated_count < num_to_generate:
            try:
                # Select two parents
                parent1 = tournament_selection(population, fitness_scores, tournament_size)
                parent2 = tournament_selection(population, fitness_scores, tournament_size)

                child1, child2 = None, None  # initialize the children

                # Apply crossover (with a given probability)
                if random.random() < crossover_rate and parent1 is not parent2:
                    child1, child2 = crossover_individuals(parent1, parent2)

                # If crossover was skipped or failed, continue with mutation
                if child1 is None:  # no first child was produced
                    # Mutate one of the parents
                    parent_to_mutate = parent1  # or parent2, or a random one
                    if random.random() < mutation_rate:  # overall mutation-rate check
                        child1 = mutate_individual(parent_to_mutate, weight_mut_rate, mut_strength)
                    else:  # no mutation either: clone the parent
                        child1 = clone_model(parent_to_mutate); child1.set_weights(parent_to_mutate.get_weights())
                        child1.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
                        child1._name = f"cloned_{parent_to_mutate.name}_{random.randint(1000,9999)}"

                    # Add to the new population
                    if child1:
                        new_population.append(child1)
                        generated_count += 1
                        if generated_count >= num_to_generate: break  # enough individuals produced

                else:  # crossover succeeded (child1 and child2 exist)
                    # Optionally the children could also be mutated after crossover
                    # if random.random() < post_crossover_mutation_rate: child1 = mutate(...)
                    # if random.random() < post_crossover_mutation_rate: child2 = mutate(...)

                    new_population.append(child1)
                    generated_count += 1
                    if generated_count >= num_to_generate: break

                    if child2:  # add the second child too, if present
                        new_population.append(child2)
                        generated_count += 1
                        if generated_count >= num_to_generate: break

            except Exception as e:
                logging.error(f"Error during selection/reproduction cycle: {e}", exc_info=True)
                if generated_count < num_to_generate:  # fill with a random individual on error
                    logging.warning("Adding random individual due to reproduction error.")
                    new_population.append(create_individual(y.shape[1], X.shape[1:]))
                    generated_count += 1

        population = new_population[:len(population)]  # guarantee the population size

        # 4. Checkpointing
        if checkpoint_interval > 0 and (gen + 1) % checkpoint_interval == 0:
            try:
                # Capture the random-number-generator states
                rnd_state = random.getstate()
                np_rnd_state = np.random.get_state()
                # tf_rnd_state = tf.random.get_global_generator().state  # saving TF state can be tricky
                tf_rnd_state = None  # not saved for now
                save_checkpoint(output_dir, gen + 1, population, rnd_state, np_rnd_state, tf_rnd_state)
            except Exception as e:
                logging.error(f"Failed to execute checkpoint saving for generation {gen+1}: {e}", exc_info=True)


    # End of loop
    if best_model_overall is None and population:
        logging.warning("No overall best model tracked. Returning best from final population.")
        final_fitness_scores = [calculate_fitness(ind, X_tf, y_tf, batch_size) for ind in population]
        best_idx_final = np.argmax(final_fitness_scores)
        best_model_overall = population[best_idx_final]
    elif not population:
        logging.error("Evolution finished with an empty population!")
        return None, best_fitness_history, avg_fitness_history

    logging.info(f"Evolution finished. Best fitness achieved: {best_fitness_overall:.6f}")
    return best_model_overall, best_fitness_history, avg_fitness_history
|
| 512 |
+
|
| 513 |
+
# --- Grafik Çizimi (Öncekiyle aynı) ---
|
| 514 |
+
def plot_fitness_history(history_best: List[float], history_avg: List[float], output_dir: str) -> None:
    """Render the per-generation best/average fitness curves to a PNG.

    Silently returns (with a warning) when either history list is empty;
    any plotting error is logged rather than raised.
    """
    if not (history_best and history_avg):
        logging.warning("Fitness history is empty, cannot plot.")
        return
    try:
        plt.figure(figsize=(12, 7))
        plt.plot(history_best, label="Best Fitness", marker='o', linestyle='-', linewidth=2)
        plt.plot(history_avg, label="Average Fitness", marker='x', linestyle='--', alpha=0.7)
        plt.xlabel("Generation")
        plt.ylabel("Fitness Score")
        plt.title("Evolutionary Fitness History")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plot_path = os.path.join(output_dir, "fitness_history.png")
        plt.savefig(plot_path)
        plt.close()
        logging.info(f"Fitness history plot saved to {plot_path}")
    except Exception as e:
        logging.error(f"Error plotting fitness history: {e}", exc_info=True)
|
| 525 |
+
|
| 526 |
+
# --- Değerlendirme (Öncekiyle aynı) ---
|
| 527 |
+
def evaluate_model(model: Sequential, X_test: np.ndarray, y_test: np.ndarray, batch_size: int) -> Dict[str, float]:
    """Evaluate the final model on the held-out test set.

    Computes the MSE over the full test set plus the average Kendall's tau
    rank correlation over a random subsample of at most 500 rows (tau is
    the natural metric here since the target is the sorted input).

    Args:
        model: Trained Keras model, or None (yields sentinel metrics).
        X_test: Test inputs.
        y_test: Ground-truth targets.
        batch_size: Batch size passed to model.predict.

    Returns:
        Dict with keys "test_mse" and "avg_kendall_tau"; on any evaluation
        error returns np.inf / 0.0 sentinels instead of raising.
    """
    if model is None:
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
    logging.info("Evaluating final model on test data...")
    try:
        y_pred = model.predict(X_test, batch_size=batch_size, verbose=0)
        test_mse = np.mean(np.square(y_test - y_pred))
        logging.info(f"Final Test MSE: {test_mse:.6f}")
        sample_size = min(500, X_test.shape[0])
        taus = []
        indices = np.random.choice(X_test.shape[0], sample_size, replace=False)
        for i in indices:
            # BUGFIX: the original used a one-line `try: ...;` immediately
            # followed by an indented `if` and `except`, which is a Python
            # SyntaxError (a single-line try clause cannot own an indented
            # suite). Rewritten as a proper try/except block.
            try:
                tau, _ = kendalltau(y_test[i], y_pred[i])
                if not np.isnan(tau):
                    taus.append(tau)
            except ValueError:
                pass  # constant prediction/target rows make tau undefined
        avg_kendall_tau = np.mean(taus) if taus else 0.0
        logging.info(f"Average Kendall's Tau (on {sample_size} samples): {avg_kendall_tau:.4f}")
        return {"test_mse": float(test_mse), "avg_kendall_tau": float(avg_kendall_tau)}
    except Exception as e:
        logging.error(f"Error during final model evaluation: {e}", exc_info=True)
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
|
| 545 |
+
|
| 546 |
+
# --- Ana İş Akışı (Checkpoint Yükleme ile) ---
|
| 547 |
+
def run_pipeline_v3(args: argparse.Namespace):
    """Main workflow with checkpointing and crossover support.

    Creates (or resumes) a run directory, restores the population and RNG
    states from the latest checkpoint when resuming, runs the evolutionary
    loop, then trains/evaluates the best evolved model and persists a JSON
    results summary.

    Args:
        args: Parsed CLI namespace (see parse_arguments_v3).
    """
    # --- Run name and output directory ---
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"evorun_{timestamp}_gen{args.generations}_pop{args.pop_size}"
    # When a resume path is given, reuse that directory.
    output_dir = args.resume_from if args.resume_from else os.path.join(args.output_base_dir, run_name)
    resume_run = bool(args.resume_from)
    if resume_run:
        run_name = os.path.basename(output_dir)  # keep the original run's name
        logging.info(f"Attempting to resume run from: {output_dir}")
    else:
        try:
            os.makedirs(output_dir, exist_ok=True)
        except OSError as e:
            print(f"FATAL: Could not create output directory: {output_dir}. Error: {e}", file=sys.stderr)
            sys.exit(1)

    # Log file is opened in append mode, so resumed runs keep one log.
    setup_logging(output_dir)
    logging.info(f"========== Starting/Resuming EvoNet Pipeline Run: {run_name} ==========")
    logging.info(f"Output directory: {output_dir}")

    # --- Checkpoint loading ---
    start_generation = 0
    population = []
    initial_state_loaded = False
    latest_checkpoint_path = find_latest_checkpoint(output_dir) if resume_run else None

    if latest_checkpoint_path:
        loaded_state = load_checkpoint(latest_checkpoint_path)
        if loaded_state:
            start_generation = loaded_state['generation']  # continue from the saved generation
            population = loaded_state['population']
            # Restore RNG states so the resumed run stays reproducible.
            try:
                random.setstate(loaded_state['random_state'])
                np.random.set_state(loaded_state['numpy_random_state'])
                # NOTE: TensorFlow's global RNG state is not restored (hard to serialize).
                logging.info(f"Random states restored from checkpoint.")
            except Exception as e:
                logging.warning(f"Could not fully restore random states from checkpoint: {e}")
            initial_state_loaded = True
            logging.info(f"Resuming from Generation {start_generation + 1} with {len(population)} individuals.")
        else:
            logging.error("Failed to load checkpoint. Starting from scratch.")
            resume_run = False  # fall back to a fresh start
    elif resume_run:
        logging.warning(f"Resume requested but no valid checkpoint found in {output_dir}. Starting from scratch.")
        resume_run = False  # no checkpoint available

    # --- Fresh-start vs resumed-run setup ---
    if not initial_state_loaded:
        # Log and persist the configuration (fresh runs only).
        logging.info("--- Configuration ---")
        args_dict = vars(args)
        for k, v in args_dict.items():
            logging.info(f"  {k:<20}: {v}")
        logging.info("---------------------")
        config_path = os.path.join(output_dir, "config.json")
        try:
            with open(config_path, 'w') as f:
                json.dump(args_dict, f, indent=4, sort_keys=True)
            logging.info(f"Configuration saved to {config_path}")
        except Exception as e:
            logging.error(f"Failed to save configuration: {e}", exc_info=True)

        # Seed all RNGs for reproducibility.
        try:
            random.seed(args.seed); np.random.seed(args.seed); tf.random.set_seed(args.seed)
            logging.info(f"Using random seed: {args.seed}")
        except Exception as e:
            logging.warning(f"Could not set all random seeds: {e}")

        # GPU availability check
        is_gpu_available = check_gpu()

        # Data generation
        try:
            X_train, y_train = generate_data(args.train_samples, args.seq_length)
            X_test, y_test = generate_data(args.test_samples, args.seq_length)
            input_shape = X_train.shape[1:]
        except Exception:
            logging.critical("Failed to generate data. Exiting.")
            sys.exit(1)

        # Population initialization
        logging.info(f"--- Initializing Population (Size: {args.pop_size}) ---")
        try:
            population = [create_individual(args.seq_length, input_shape) for _ in range(args.pop_size)]
            logging.info("Population initialized successfully.")
        except Exception:
            logging.critical("Failed to initialize population. Exiting.")
            sys.exit(1)
    else:
        # Resumed run: data is regenerated because checkpoints do not store
        # it (including it would bloat checkpoint files).
        logging.info("Reloading data for resumed run...")
        is_gpu_available = check_gpu()  # re-check GPU status
        try:
            X_train, y_train = generate_data(args.train_samples, args.seq_length)
            X_test, y_test = generate_data(args.test_samples, args.seq_length)
        except Exception:
            logging.critical("Failed to reload data for resumed run. Exiting.")
            sys.exit(1)
        # Re-read the stored config so the results summary reflects the
        # original run's settings.
        config_path = os.path.join(output_dir, "config.json")
        try:
            with open(config_path, 'r') as f:
                args_dict = json.load(f)
            logging.info("--- Loaded Configuration (from resumed run) ---")
            for k, v in args_dict.items():
                logging.info(f"  {k:<20}: {v}")
            logging.info("-----------------------------------------------")
        except Exception as e:
            logging.warning(f"Could not reload config.json: {e}")
            args_dict = vars(args)  # fall back to the CLI arguments

    # --- Evolution ---
    logging.info(f"--- Starting/Resuming Evolution ({args.generations} Total Generations) ---")
    if start_generation >= args.generations:
        logging.warning(f"Loaded checkpoint generation ({start_generation}) is already >= total generations ({args.generations}). Skipping evolution.")
        # TODO: load the true best model and fitness history from the checkpoint.
        best_model_unevolved = population[0] if population else None
        best_fitness_hist, avg_fitness_hist = [], []
    else:
        try:
            best_model_unevolved, best_fitness_hist, avg_fitness_hist = evolve_population_v3(
                population, X_train, y_train, start_generation, args.generations,
                args.crossover_rate, args.mutation_rate, args.weight_mut_rate, args.mutation_strength,
                args.tournament_size, args.elitism_count, args.batch_size,
                output_dir, args.checkpoint_interval
            )
        except Exception as e:
            logging.critical(f"Fatal error during evolution process: {e}", exc_info=True)
            sys.exit(1)
    logging.info("--- Evolution Complete ---")

    # --- Fitness history persistence/plotting (this session only) ---
    if best_fitness_hist or avg_fitness_hist:
        # TODO: merge with history loaded from earlier checkpointed sessions.
        plot_fitness_history(best_fitness_hist, avg_fitness_hist, output_dir)
        history_path = os.path.join(output_dir, "fitness_history_run.csv")
        try:
            history_data = np.array([
                np.arange(start_generation + 1, start_generation + len(best_fitness_hist) + 1),
                best_fitness_hist,
                avg_fitness_hist,
            ]).T
            np.savetxt(history_path, history_data, delimiter=',', header='Generation,BestFitness,AvgFitness', comments='', fmt=['%d', '%.8f', '%.8f'])
            logging.info(f"Fitness history (this run) saved to {history_path}")
        except Exception as e:
            logging.error(f"Could not save fitness history data: {e}")
    else:
        logging.warning("Fitness history is empty, skipping saving/plotting.")

    # --- Final training + evaluation of the best evolved model ---
    if best_model_unevolved is None:
        logging.error("Evolution did not yield a best model. Skipping final training and evaluation.")
        final_metrics = {"test_mse": np.inf, "avg_kendall_tau": 0.0}
        final_model_path = None
        training_summary = {}
    else:
        logging.info("--- Starting Final Training of Best Evolved Model ---")
        try:
            # Clone so the evolved individual itself is left untouched.
            final_model = clone_model(best_model_unevolved)
            final_model.set_weights(best_model_unevolved.get_weights())
            final_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
            logging.info("Model Summary of Best Evolved (Untrained):")
            final_model.summary(print_fn=logging.info)
            early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1)
            reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=7, min_lr=1e-7, verbose=1)
            history = final_model.fit(X_train, y_train, epochs=args.epochs_final_train, batch_size=args.batch_size, validation_split=0.2, callbacks=[early_stopping, reduce_lr], verbose=2)
            logging.info("Final training complete.")
            training_summary = {"epochs_run": len(history.history['loss']), "final_train_loss": history.history['loss'][-1], "final_val_loss": history.history['val_loss'][-1]}
            final_metrics = evaluate_model(final_model, X_test, y_test, args.batch_size)
            final_model_path = os.path.join(output_dir, "best_evolved_model_trained.keras")
            final_model.save(final_model_path)
            logging.info(f"Final trained model saved to {final_model_path}")
        except Exception as e:
            logging.error(f"Error during final training or evaluation: {e}", exc_info=True)
            final_metrics = {"test_mse": np.inf, "avg_kendall_tau": 0.0}
            final_model_path = None
            training_summary = {"error": str(e)}

    # --- Results summary ---
    logging.info("--- Saving Final Results ---")
    # BUGFIX: the previous version referenced `best_fitness_overall`, a name
    # local to evolve_population_v3, which raised NameError here. The overall
    # best is now derived from this session's fitness history instead.
    best_fitness_achieved = max(best_fitness_hist) if best_fitness_hist else None
    final_results = {
        "run_info": {"run_name": run_name, "timestamp": timestamp, "output_directory": output_dir, "gpu_used": is_gpu_available, "resumed": resume_run},
        "config": args_dict,
        "evolution_summary": {  # TODO: merge with history loaded from checkpoints
            "generations_run_this_session": len(best_fitness_hist) if best_fitness_hist else 0,
            "best_fitness_achieved_overall": best_fitness_achieved,
            "best_fitness_final_gen": best_fitness_hist[-1] if best_fitness_hist else None,
            "avg_fitness_final_gen": avg_fitness_hist[-1] if avg_fitness_hist else None,
        },
        "final_training_summary": training_summary,
        "final_evaluation_on_test": final_metrics,
        "saved_model_path": final_model_path,
    }
    results_path = os.path.join(output_dir, "final_results.json")
    try:
        def convert_numpy_types(obj):
            # json.dump fallback: make NumPy scalars/arrays serializable.
            if isinstance(obj, np.integer):
                return int(obj)
            elif isinstance(obj, np.floating):
                return float(obj)
            elif isinstance(obj, np.ndarray):
                return obj.tolist()
            return obj
        with open(results_path, 'w') as f:
            json.dump(final_results, f, indent=4, default=convert_numpy_types)
        logging.info(f"Final results summary saved to {results_path}")
    except Exception as e:
        logging.error(f"Failed to save final results JSON: {e}", exc_info=True)

    logging.info(f"========== Pipeline Run {run_name} Finished ==========")
|
| 733 |
+
|
| 734 |
+
|
| 735 |
+
# --- Argüman Ayrıştırıcı (Yeni Argümanlar Eklendi) ---
|
| 736 |
+
def parse_arguments_v3() -> argparse.Namespace:
    """Build and parse the command-line arguments for EvoNet v3.

    When --seed is omitted, a random 32-bit seed is generated and stored on
    the returned namespace so the run remains reproducible.
    """
    ap = argparse.ArgumentParser(description="EvoNet v3: Neuroevolution with Crossover & Checkpointing")

    # Directories and run control
    ap.add_argument('--output_base_dir', type=str, default=DEFAULT_OUTPUT_BASE_DIR, help='Base directory for new runs.')
    ap.add_argument('--resume_from', type=str, default=None, help='Path to a previous run directory to resume from.')
    ap.add_argument('--checkpoint_interval', type=int, default=DEFAULT_CHECKPOINT_INTERVAL, help='Save checkpoint every N generations (0 to disable).')

    # Data settings
    ap.add_argument('--seq_length', type=int, default=DEFAULT_SEQ_LENGTH, help='Length of sequences.')
    ap.add_argument('--train_samples', type=int, default=5000, help='Number of training samples.')
    ap.add_argument('--test_samples', type=int, default=1000, help='Number of test samples.')

    # Evolution parameters
    ap.add_argument('--pop_size', type=int, default=DEFAULT_POP_SIZE, help='Population size.')
    ap.add_argument('--generations', type=int, default=DEFAULT_GENERATIONS, help='Total number of generations.')
    ap.add_argument('--crossover_rate', type=float, default=DEFAULT_CROSSOVER_RATE, help='Probability of applying crossover.')
    ap.add_argument('--mutation_rate', type=float, default=DEFAULT_MUTATION_RATE, help='Probability of applying mutation (if crossover is not applied).')
    ap.add_argument('--weight_mut_rate', type=float, default=DEFAULT_WEIGHT_MUT_RATE, help='Weight mutation probability within mutation.')
    ap.add_argument('--mutation_strength', type=float, default=DEFAULT_MUTATION_STRENGTH, help='Std dev for weight mutation noise.')
    ap.add_argument('--tournament_size', type=int, default=DEFAULT_TOURNAMENT_SIZE, help='Tournament selection size.')
    ap.add_argument('--elitism_count', type=int, default=DEFAULT_ELITISM_COUNT, help='Number of elite individuals.')

    # Training and evaluation
    ap.add_argument('--batch_size', type=int, default=DEFAULT_BATCH_SIZE, help='Batch size.')
    ap.add_argument('--epochs_final_train', type=int, default=DEFAULT_EPOCHS_FINAL_TRAIN, help='Max epochs for final training.')

    # Reproducibility
    ap.add_argument('--seed', type=int, default=None, help='Random seed (default: random).')

    ns = ap.parse_args()
    if ns.seed is None:
        ns.seed = random.randint(0, 2**32 - 1)
        print(f"Generated random seed: {ns.seed}")
    # NOTE: crossover_rate + mutation_rate may exceed 1.0; only one operator
    # is chosen per offspring, so this is tolerated without a warning.
    return ns
|
| 772 |
+
|
| 773 |
+
|
| 774 |
+
# --- Ana Çalıştırma Bloğu ---
|
| 775 |
+
if __name__ == "__main__":
    # Script entry point: parse the CLI, run the pipeline, and make sure any
    # unhandled error is reported both to stderr and (if configured) the log.
    cli_args = parse_arguments_v3()
    try:
        run_pipeline_v3(cli_args)
    except SystemExit:
        pass  # deliberate exits (sys.exit) are not errors
    except Exception as exc:
        print(f"\nFATAL UNHANDLED ERROR in main execution block: {exc}", file=sys.stderr)
        if logging.getLogger().hasHandlers():
            logging.critical("FATAL UNHANDLED ERROR:", exc_info=True)
        else:
            # Logging was never configured; fall back to a raw traceback.
            import traceback
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(1)
|
v4.py
ADDED
|
@@ -0,0 +1,1327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==============================================================================
|
| 2 |
+
# EvoNet Optimizer - v4 - PyTorch Tabanlı Geliştirilmiş Sürüm
|
| 3 |
+
# Açıklama: TensorFlow'dan PyTorch'a geçiş yapılmış, modern PyTorch
|
| 4 |
+
# pratikleri kullanılmış, esneklik artırılmış, kod kalitesi
|
| 5 |
+
# iyileştirilmiş ve PyTorch ekosistemine uygun hale getirilmiştir.
|
| 6 |
+
# Çaprazlama, Kontrol Noktası, Adaptif Mutasyon (kavramsal) ve
|
| 7 |
+
# Gelişmiş Fitness (kavramsal) özellikleri korunmuştur.
|
| 8 |
+
# ==============================================================================
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import subprocess
|
| 12 |
+
import sys
|
| 13 |
+
import argparse
|
| 14 |
+
import random
|
| 15 |
+
import logging
|
| 16 |
+
from datetime import datetime
|
| 17 |
+
import json
|
| 18 |
+
import copy # Model klonlama ve durum dikteleri için
|
| 19 |
+
import time
|
| 20 |
+
from typing import List, Tuple, Dict, Any, Optional, Union
|
| 21 |
+
|
| 22 |
+
import numpy as np
|
| 23 |
+
import torch
|
| 24 |
+
import torch.nn as nn
|
| 25 |
+
import torch.optim as optim
|
| 26 |
+
from torch.utils.data import TensorDataset, DataLoader
|
| 27 |
+
import matplotlib.pyplot as plt
|
| 28 |
+
from scipy.stats import kendalltau # Hala numpy/scipy kullanıyoruz
|
| 29 |
+
|
| 30 |
+
# --- Constants and default values ---
DEFAULT_SEQ_LENGTH = 10
DEFAULT_POP_SIZE = 50
DEFAULT_GENERATIONS = 50
DEFAULT_CROSSOVER_RATE = 0.6
DEFAULT_MUTATION_RATE = 0.4  # probability of mutation when crossover is not applied
DEFAULT_WEIGHT_MUT_RATE = 0.8  # probability of weight mutation (within a mutation event)
# Activation mutation needs different handling in PyTorch; focus is on weights for now.
DEFAULT_MUTATION_STRENGTH = 0.1
DEFAULT_TOURNAMENT_SIZE = 5
DEFAULT_ELITISM_COUNT = 2
DEFAULT_EPOCHS_FINAL_TRAIN = 100
DEFAULT_BATCH_SIZE = 64
DEFAULT_OUTPUT_BASE_DIR = os.path.join(os.getcwd(), "evonet_runs_v4_pytorch")
DEFAULT_CHECKPOINT_INTERVAL = 10  # checkpoint every N generations (0 = disabled)
DEFAULT_DEVICE = "auto"  # "auto", "cpu", "cuda"
|
| 46 |
+
|
| 47 |
+
# --- Loglama Ayarları ---
|
| 48 |
+
# (setup_logging fonksiyonu öncekiyle aynı, tekrar eklemiyorum)
|
| 49 |
+
def setup_logging(log_dir: str, log_level=logging.INFO) -> None:
    """Configure root logging to write to a run log file and to stdout.

    All handlers already attached to the root logger are closed and removed
    first, so repeated calls (e.g. resumed runs) do not duplicate output.
    The file handler appends, keeping one continuous log per run directory.
    """
    log_filename = os.path.join(log_dir, 'evolution_run_pytorch.log')
    root = logging.root
    # Drop every pre-existing handler; close before removal to release files.
    while root.handlers:
        stale = root.handlers[0]
        stale.close()
        root.removeHandler(stale)
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)-8s [%(filename)s:%(lineno)d] - %(message)s',
        handlers=[
            logging.FileHandler(log_filename, mode='a'),
            logging.StreamHandler(sys.stdout),
        ],
    )
    banner = "=" * 50
    logging.info(banner)
    logging.info("PyTorch EvoNet v4 Logging Başlatıldı.")
    logging.info(banner)
|
| 66 |
+
|
| 67 |
+
# --- Cihaz (GPU/CPU) Ayarları ---
|
| 68 |
+
def setup_device(requested_device: str) -> torch.device:
    """Resolve the PyTorch device to use from the requested device string.

    "auto" picks CUDA when available, otherwise CPU; "cuda" falls back to
    CPU (with a warning) when no GPU is present; anything else maps to CPU.
    """
    cuda_ok = torch.cuda.is_available()
    if requested_device == "auto":
        if cuda_ok:
            logging.info(f"CUDA (GPU) kullanılabilir: {torch.cuda.get_device_name(0)}")
            chosen = "cuda"
        else:
            logging.info("CUDA (GPU) bulunamadı. CPU kullanılacak.")
            chosen = "cpu"
    elif requested_device == "cuda":
        if cuda_ok:
            logging.info(f"CUDA (GPU) manuel olarak seçildi: {torch.cuda.get_device_name(0)}")
            chosen = "cuda"
        else:
            logging.warning("CUDA (GPU) istendi ancak bulunamadı! CPU kullanılacak.")
            chosen = "cpu"
    else:
        # Explicit "cpu" or any unrecognized value.
        logging.info("CPU manuel olarak seçildi veya geçersiz cihaz belirtildi.")
        chosen = "cpu"
    return torch.device(chosen)
|
| 89 |
+
|
| 90 |
+
# --- Veri Üretimi ---
|
| 91 |
+
# (generate_data fonksiyonu öncekiyle aynı, NumPy tabanlı)
|
| 92 |
+
def generate_data(num_samples: int, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Build the sorting dataset: random float32 rows and their sorted copies.

    Args:
        num_samples: Number of rows to generate.
        seq_length: Length of each row.

    Returns:
        Tuple (X, y) of float32 arrays with shape (num_samples, seq_length),
        where each row of y is the ascending sort of the matching row of X.

    Raises:
        Exception: re-raised after logging if generation fails.
    """
    logging.info(f"Generating {num_samples} samples with sequence length {seq_length}...")
    try:
        # float32 is generally preferable for PyTorch tensors downstream.
        features = np.random.rand(num_samples, seq_length).astype(np.float32) * 100
        targets = np.sort(features, axis=1).astype(np.float32)
        logging.info("Data generation successful.")
        return features, targets
    except Exception as e:
        logging.error(f"Error during data generation: {e}", exc_info=True)
        raise
|
| 103 |
+
|
| 104 |
+
# --- PyTorch neural-network model ---
class NeuralNetwork(nn.Module):
    """A simple MLP whose hidden layers and activations are set at construction."""

    def __init__(self, input_size: int, output_size: int, hidden_dims: List[int], activations: List[str]):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_dims = hidden_dims
        # Keep the raw activation names so the architecture can be checkpointed.
        self.activations_str = activations

        activation_map = {'relu': nn.ReLU, 'tanh': nn.Tanh, 'sigmoid': nn.Sigmoid}
        layers: List[nn.Module] = []
        prev_dim = input_size
        for hidden_dim, act_name in zip(hidden_dims, activations):
            layers.append(nn.Linear(prev_dim, hidden_dim))
            act_cls = activation_map.get(act_name.lower())
            if act_cls is None:
                logging.warning(f"Bilinmeyen aktivasyon '{act_name}', ReLU kullanılıyor.")
                act_cls = nn.ReLU  # fall back to the default activation
            layers.append(act_cls())
            prev_dim = hidden_dim

        # The output layer stays linear (no activation).
        layers.append(nn.Linear(prev_dim, output_size))

        self.network = nn.Sequential(*layers)
        self.architecture_id = self._generate_architecture_id()
        # Human-readable identifier, mainly useful in logs.
        self.model_name = f"model_{self.architecture_id}_rnd{random.randint(10000, 99999)}"

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.network(x)

    def get_architecture(self) -> Dict[str, Any]:
        """Return the constructor arguments (used for checkpointing/cloning)."""
        return {
            "input_size": self.input_size,
            "output_size": self.output_size,
            "hidden_dims": self.hidden_dims,
            "activations": self.activations_str
        }

    def _generate_architecture_id(self) -> str:
        """Build a short identifier string from the architecture."""
        hidden_part = '_'.join(map(str, self.hidden_dims))
        act_part = ''.join(a[0].upper() for a in self.activations_str)  # e.g. relu/tanh -> "RT"
        return f"I{self.input_size}_H{hidden_part}_A{act_part}_O{self.output_size}"

    def __eq__(self, other):
        # Equality is architecture-based, not weight-based.
        if not isinstance(other, NeuralNetwork):
            return NotImplemented
        return self.get_architecture() == other.get_architecture()

    def __hash__(self):
        # Hash the architecture as an immutable tuple, consistent with __eq__.
        return hash((
            self.input_size,
            self.output_size,
            tuple(self.hidden_dims),
            tuple(self.activations_str)
        ))
|
| 173 |
+
|
| 174 |
+
# --- Neuroevolution core (PyTorch) ---

def create_individual_pytorch(input_size: int, output_size: int) -> "NeuralNetwork":
    """Build a NeuralNetwork with a randomly sampled architecture.

    Depth is 1-4 hidden layers, widths 16-128, activations drawn from
    relu/tanh/sigmoid. Weights come randomly initialised by PyTorch,
    so no extra compile/init step is needed.
    """
    try:
        depth = random.randint(1, 4)
        widths = [random.randint(16, 128) for _ in range(depth)]  # fairly wide range
        acts = [random.choice(['relu', 'tanh', 'sigmoid']) for _ in range(depth)]

        model = NeuralNetwork(input_size, output_size, widths, acts)
        logging.debug(f"Created individual: {model.model_name}")
        return model
    except Exception as exc:
        logging.error(f"Error creating PyTorch individual model: {exc}", exc_info=True)
        raise
|
| 192 |
+
# Helper for duplicating a PyTorch model
def clone_pytorch_model(model: "NeuralNetwork", device: torch.device) -> "NeuralNetwork":
    """Clone a model: same architecture, deep-copied weights, moved to *device*."""
    try:
        # Fresh instance with an identical layout, then copy the weights in.
        replica = NeuralNetwork(**model.get_architecture())
        replica.load_state_dict(copy.deepcopy(model.state_dict()))
        replica.to(device)
        replica.model_name = f"cloned_{model.model_name}_{random.randint(1000,9999)}"
        logging.debug(f"Cloned model {model.model_name} to {replica.model_name}")
        return replica
    except Exception as exc:
        logging.error(f"Error cloning PyTorch model {model.model_name}: {exc}", exc_info=True)
        raise
+
|
| 210 |
+
def calculate_fitness_pytorch(
    individual: "NeuralNetwork",
    X: torch.Tensor,
    y: torch.Tensor,
    device: torch.device,
    fitness_params: Optional[Dict] = None
) -> float:
    """Score *individual* on (X, y); higher is better.

    Fitness is 1 / (MSE + 1e-9). Non-finite or absurdly low scores, and any
    runtime error, yield -1e9 so the individual is effectively culled.
    *fitness_params* is reserved for a future multi-objective fitness
    (e.g. Kendall-tau reward plus a complexity penalty).
    """
    individual.eval()              # evaluation mode: dropout etc. disabled
    individual.to(device)
    X, y = X.to(device), y.to(device)

    try:
        with torch.no_grad():      # pure inference — no gradients needed
            predictions = individual(X)

        mse_val = torch.mean((predictions - y) ** 2).item()

        # A NaN/Inf loss means the network blew up; give it minimal fitness.
        if not np.isfinite(mse_val):
            logging.warning(f"Non-finite MSE ({mse_val}) for model {individual.model_name}. Assigning minimal fitness.")
            return -1e9

        # Invert the error so that higher fitness == lower MSE;
        # the epsilon guards against division by zero.
        fitness_score = 1.0 / (mse_val + 1e-9)

        # Final sanity check before handing the score back.
        if not np.isfinite(fitness_score) or fitness_score < -1e8:
            logging.warning(f"Non-finite or very low final fitness ({fitness_score:.4g}) for model {individual.model_name}. Assigning minimal fitness.")
            return -1e9

        return float(fitness_score)

    except Exception as exc:
        logging.error(f"Error during fitness calculation for model {individual.model_name}: {exc}", exc_info=True)
        return -1e9
|
| 274 |
+
|
| 275 |
+
def mutate_individual_pytorch(
    individual: "NeuralNetwork",
    weight_mut_rate: float,    # per-tensor probability of being perturbed
    mutation_strength: float,  # std-dev scale of the Gaussian noise
    device: torch.device
) -> "NeuralNetwork":
    """Return a mutated clone of *individual* (Gaussian weight perturbation).

    Each trainable parameter tensor is perturbed with probability
    *weight_mut_rate* by adding `randn_like(param) * mutation_strength`.
    The original model is never modified; on any failure a plain clone of
    the original is returned so the evolutionary loop can continue.
    """
    try:
        # Clone first so the original individual stays intact.
        mutated_model = clone_pytorch_model(individual, device)
        mutated_model.model_name = f"mutated_{individual.model_name}_{random.randint(1000,9999)}"

        mutated = False
        # BUG FIX: the previous implementation iterated `state_dict()` and
        # gated on `param.requires_grad`. `state_dict()` returns *detached*
        # tensors (requires_grad is always False), so no weight was ever
        # mutated. Iterate the live parameters instead and perturb in place
        # under no_grad.
        with torch.no_grad():
            for _name, param in mutated_model.named_parameters():
                if param.requires_grad and random.random() < weight_mut_rate:
                    mutated = True
                    # Add Gaussian noise; randn_like already lives on param's device.
                    param.add_(torch.randn_like(param) * mutation_strength)

        if mutated:
            logging.debug(f"Mutated model {individual.model_name} -> {mutated_model.model_name}")
        else:
            # No tensor cleared the probability gate; still return the fresh
            # clone so callers always receive a distinct object.
            logging.debug(f"Mutation applied to {individual.model_name}, but no weights changed based on rate.")
        return mutated_model

    except Exception as e:
        logging.error(f"Error during PyTorch mutation of model {individual.model_name}: {e}", exc_info=True)
        # Returning the original could alias objects across the population;
        # a clean clone is the safer recovery.
        return clone_pytorch_model(individual, device)
|
| 318 |
+
|
| 319 |
+
def check_architecture_compatibility_pytorch(model1: "NeuralNetwork", model2: "NeuralNetwork") -> bool:
    """Return True when both models share an identical architecture.

    This is the precondition for the simple weight-level crossover.
    """
    arch_a = model1.get_architecture()
    arch_b = model2.get_architecture()
    return arch_a == arch_b
| 324 |
+
|
| 325 |
+
def crossover_individuals_pytorch(
    parent1: "NeuralNetwork",
    parent2: "NeuralNetwork",
    device: torch.device
) -> Tuple[Optional["NeuralNetwork"], Optional["NeuralNetwork"]]:
    """Produce two children via uniform weight crossover of two parents.

    Requires identical architectures; returns (None, None) when the parents
    are incompatible or any step fails. Each weight element goes to child 1
    from parent 1 (and to child 2 from parent 2) with probability 0.5,
    and from the other parent otherwise (complementary masks).
    """
    # Incompatible layouts cannot be mixed element-wise — bail out early.
    if not check_architecture_compatibility_pytorch(parent1, parent2):
        logging.debug(f"Skipping crossover between {parent1.model_name} and {parent2.model_name} due to incompatible architectures.")
        return None, None

    try:
        # Fresh child models with the shared architecture, on the target device.
        shared_arch = parent1.get_architecture()
        child1 = NeuralNetwork(**shared_arch).to(device)
        child2 = NeuralNetwork(**shared_arch).to(device)
        child1.model_name = f"xover_{parent1.architecture_id}_c1_{random.randint(1000,9999)}"
        child2.model_name = f"xover_{parent1.architecture_id}_c2_{random.randint(1000,9999)}"

        p1_state = parent1.state_dict()
        p2_state = parent2.state_dict()
        c1_state = child1.state_dict()  # start from the children's (random) state
        c2_state = child2.state_dict()

        for name in p1_state:
            w1, w2 = p1_state[name], p2_state[name]
            # Uniform crossover: a fresh Boolean mask per tensor; child 2
            # receives the complementary selection of child 1.
            mask = torch.rand_like(w1) < 0.5
            c1_state[name] = torch.where(mask, w1, w2)
            c2_state[name] = torch.where(mask, w2, w1)

        child1.load_state_dict(c1_state)
        child2.load_state_dict(c2_state)

        logging.debug(f"Crossover performed between {parent1.model_name} and {parent2.model_name}")
        return child1, child2

    except Exception as e:
        logging.error(f"Error during PyTorch crossover between {parent1.model_name} and {parent2.model_name}: {e}", exc_info=True)
        return None, None
+
|
| 378 |
+
# (Same selection logic as the earlier version; it simply hands back a
# NeuralNetwork object instead of a plain model container.)
def tournament_selection(
    population: List["NeuralNetwork"],
    fitness_scores: List[float],
    k: int
) -> "NeuralNetwork":
    """Pick one individual via k-way tournament selection (best fitness wins)."""
    if not population:
        raise ValueError("Population cannot be empty for tournament selection.")
    if len(population) < k:
        logging.warning(f"Tournament size ({k}) is larger than population size ({len(population)}). Using population size.")
        k = len(population)
    if k <= 0:
        logging.warning(f"Tournament size ({k}) must be positive. Using 1.")
        k = 1

    try:
        # Draw k distinct entrants, then keep the one with the highest fitness.
        entrant_indices = random.sample(range(len(population)), k)
        best_idx = max(entrant_indices, key=lambda i: fitness_scores[i])
        return population[best_idx]
    except Exception as exc:
        logging.error(f"Error during tournament selection: {exc}", exc_info=True)
        # Degrade gracefully: hand back a random individual instead of crashing.
        return random.choice(population)
| 408 |
+
|
| 409 |
+
# --- Checkpointing (PyTorch) ---
def save_checkpoint_pytorch(output_dir: str, generation: int, population: List["NeuralNetwork"], rnd_state: Any, np_rnd_state: Any, torch_rnd_state: Any):
    """Persist the evolutionary state (models + RNG states) for *generation*.

    Models that fail to serialize are skipped; a torch.save failure is
    logged rather than raised so the evolution loop keeps running.
    """
    checkpoint_dir = os.path.join(output_dir, "checkpoints_pytorch")
    os.makedirs(checkpoint_dir, exist_ok=True)
    # ".pt" is the conventional PyTorch checkpoint suffix.
    checkpoint_file = os.path.join(checkpoint_dir, f"evo_gen_{generation}.pt")
    logging.info(f"Saving checkpoint for generation {generation} to {checkpoint_file}...")

    population_state = []
    for model in population:
        try:
            # Each model is stored as (name, architecture, weights) so it can
            # be rebuilt without pickling the module object itself.
            entry = {
                "name": model.model_name,
                "architecture": model.get_architecture(),
                "state_dict": model.state_dict()
            }
            population_state.append(entry)
        except Exception as e:
            logging.error(f"Could not serialize model {model.model_name} for checkpoint: {e}")

    state = {
        "generation": generation,
        "population_state": population_state,  # only the successfully serialized models
        "random_state": rnd_state,
        "numpy_random_state": np_rnd_state,
        "torch_random_state": torch_rnd_state,  # PyTorch RNG state
        "timestamp": datetime.now().isoformat()
    }

    try:
        torch.save(state, checkpoint_file)
        logging.info(f"Checkpoint saved successfully for generation {generation}.")
    except Exception as e:
        logging.error(f"Failed to save checkpoint using torch.save for generation {generation}: {e}", exc_info=True)
| 445 |
+
|
| 446 |
+
def load_checkpoint_pytorch(checkpoint_path: str, device: torch.device) -> Optional[Dict]:
    """Load a saved evolutionary state.

    Rebuilds every model from its stored architecture + weights and attaches
    the list under checkpoint["population"]. Returns None when the file is
    missing, unreadable, or contains no loadable model.
    """
    if not os.path.exists(checkpoint_path):
        logging.error(f"Checkpoint file not found: {checkpoint_path}")
        return None
    logging.info(f"Loading checkpoint from {checkpoint_path}...")

    try:
        # Loading onto the CPU first is the safest route; models are moved
        # to the requested device afterwards.
        checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))

        population = []
        for model_state in checkpoint["population_state"]:
            try:
                # Rebuild from architecture, restore weights, relocate,
                # restore the saved name, and start in eval mode.
                rebuilt = NeuralNetwork(**model_state["architecture"])
                rebuilt.load_state_dict(model_state["state_dict"])
                rebuilt.to(device)
                rebuilt.model_name = model_state.get("name", f"loaded_model_{random.randint(1000,9999)}")
                rebuilt.eval()
                population.append(rebuilt)
            except Exception as e:
                logging.error(f"Failed to load model state from checkpoint for model {model_state.get('name', 'UNKNOWN')}: {e}", exc_info=True)

        if not population:
            # A checkpoint with zero loadable models is treated as invalid.
            logging.error("Failed to load any model from the checkpoint population state.")
            return None

        checkpoint["population"] = population

        logging.info(f"Checkpoint loaded successfully. Resuming from generation {checkpoint['generation'] + 1}.")
        return checkpoint
    except Exception as e:
        logging.error(f"Failed to load checkpoint from {checkpoint_path}: {e}", exc_info=True)
        return None
|
| 487 |
+
def find_latest_checkpoint_pytorch(output_dir: str) -> Optional[str]:
    """Return the path of the newest PyTorch checkpoint (.pt) under *output_dir*.

    Checkpoints are expected as "checkpoints_pytorch/evo_gen_<N>.pt";
    files whose generation number cannot be parsed are skipped with a
    warning. Returns None when no valid checkpoint exists.
    """
    checkpoint_dir = os.path.join(output_dir, "checkpoints_pytorch")
    if not os.path.isdir(checkpoint_dir):
        return None
    candidates = [f for f in os.listdir(checkpoint_dir) if f.startswith("evo_gen_") and f.endswith(".pt")]
    if not candidates:
        return None

    best_gen, best_path = -1, None
    for fname in candidates:
        try:
            gen_num = int(fname.split('_')[2].split('.')[0])
        except (IndexError, ValueError):
            logging.warning(f"Could not parse generation number from checkpoint file: {fname}")
            continue
        if gen_num > best_gen:
            best_gen = gen_num
            best_path = os.path.join(checkpoint_dir, fname)
    return best_path
| 509 |
+
|
| 510 |
+
# --- Main evolution loop (PyTorch) ---
def evolve_population_pytorch(
    population: List[NeuralNetwork],
    X: np.ndarray, y: np.ndarray,  # data still arrives as NumPy arrays
    start_generation: int, total_generations: int,
    crossover_rate: float, mutation_rate: float, weight_mut_rate: float, mut_strength: float,
    tournament_size: int, elitism_count: int, batch_size: int,  # batch_size is currently unused by the fitness step
    output_dir: str, checkpoint_interval: int, device: torch.device
) -> Tuple[Optional[NeuralNetwork], List[float], List[float]]:
    """Run the PyTorch-based evolutionary process.

    Per generation: evaluate fitness, track the best individual, carry over
    elites, fill the rest of the population via tournament selection +
    crossover/mutation/cloning, and periodically checkpoint.

    Returns (best_model_overall, best_fitness_history, avg_fitness_history);
    the model may be None if no finite-fitness individual was ever found.
    """

    best_fitness_history = []
    avg_fitness_history = []
    best_model_overall: Optional[NeuralNetwork] = None
    best_fitness_overall = -np.inf

    # Convert the data to tensors on the target device once, up front.
    # (A DataLoader would be the alternative for very large datasets.)
    try:
        X_torch = torch.from_numpy(X).float().to(device)
        y_torch = torch.from_numpy(y).float().to(device)
    except Exception as e:
        logging.critical(f"Failed to convert data to PyTorch tensors or move to device: {e}", exc_info=True)
        raise

    # NOTE(design sketch, not active): adaptive mutation strength —
    # increase mut_strength after `stagnation_limit` generations without
    # improvement, decrease it again when improvement resumes.

    pop_size = len(population)

    for gen in range(start_generation, total_generations):
        generation_start_time = time.time()

        # 1) Fitness evaluation (could be parallelised, e.g. concurrent.futures,
        #    if this step ever dominates the runtime).
        try:
            fitness_scores = [calculate_fitness_pytorch(ind, X_torch, y_torch, device) for ind in population]
        except Exception as e:
            logging.critical(f"Error calculating fitness for population in Generation {gen+1}: {e}", exc_info=True)
            # Best-effort recovery: hand back the best model seen so far.
            if best_model_overall:
                return best_model_overall, best_fitness_history, avg_fitness_history
            else:
                raise  # nothing to salvage

        # 2) Statistics and best-so-far tracking.
        current_best_idx = np.argmax(fitness_scores)
        current_best_fitness = fitness_scores[current_best_idx]
        # Average over finite scores only, so NaN/Inf sentinels don't poison it.
        finite_scores = [s for s in fitness_scores if np.isfinite(s)]
        avg_fitness = np.mean(finite_scores) if finite_scores else -np.inf

        best_fitness_history.append(current_best_fitness)
        avg_fitness_history.append(avg_fitness)

        new_best_found = False
        if current_best_fitness > best_fitness_overall and np.isfinite(current_best_fitness):
            best_fitness_overall = current_best_fitness
            new_best_found = True
            try:
                # Clone so later population churn cannot alter the champion.
                best_model_overall = clone_pytorch_model(population[current_best_idx], device)
                logging.info(f"Generation {gen+1}: *** New overall best fitness found: {best_fitness_overall:.6f} (Model: {best_model_overall.model_name}) ***")
            except Exception as e:
                logging.error(f"Could not clone new best model {population[current_best_idx].model_name}: {e}", exc_info=True)
                # Keep the fitness value but drop the reference rather than
                # keep an aliased (mutable) population member.
                best_model_overall = None
        # NOTE(design sketch): the stagnation counter / adaptive mutation
        # update would hook in here, keyed off `new_best_found`.

        generation_time = time.time() - generation_start_time
        logging.info(f"Generation {gen+1}/{total_generations} | Best Fitness: {current_best_fitness:.6f} | Avg Fitness: {avg_fitness:.6f} | Time: {generation_time:.2f}s")

        # 3) Build the next population.
        new_population = []

        # 3a) Elitism: copy the top `elitism_count` individuals unchanged.
        if elitism_count > 0 and len(population) >= elitism_count:
            try:
                elite_indices = np.argsort(fitness_scores)[-elitism_count:]
                for idx in elite_indices:
                    # Clones, so elites can't be mutated through shared references.
                    elite_clone = clone_pytorch_model(population[idx], device)
                    elite_clone.model_name = f"elite_{population[idx].model_name}"
                    new_population.append(elite_clone)
                logging.debug(f"Added {len(new_population)} elites to the next generation.")
            except Exception as e:
                logging.error(f"Error during elitism: {e}", exc_info=True)

        # 3b) Fill the remainder via selection + crossover/mutation.
        num_to_generate = pop_size - len(new_population)
        generated_count = 0
        reproduction_attempts = 0  # guards against an infinite loop
        max_reproduction_attempts = num_to_generate * 5  # generous cap

        while generated_count < num_to_generate and reproduction_attempts < max_reproduction_attempts:
            reproduction_attempts += 1
            try:
                # Pick two parents by tournament.
                parent1 = tournament_selection(population, fitness_scores, tournament_size)
                parent2 = tournament_selection(population, fitness_scores, tournament_size)

                child1, child2 = None, None

                # Crossover with probability `crossover_rate`, only for distinct parents.
                if random.random() < crossover_rate and parent1 is not parent2:
                    child1, child2 = crossover_individuals_pytorch(parent1, parent2, device)

                # No crossover (or it failed/was skipped): mutate or clone instead.
                if child1 is None:
                    if random.random() < mutation_rate:
                        parent_to_mutate = parent1
                        child1 = mutate_individual_pytorch(parent_to_mutate, weight_mut_rate, mut_strength, device)
                    else:
                        # Neither crossover nor mutation — pass the parent through as a clone.
                        child1 = clone_pytorch_model(parent1, device)
                        child1.model_name = f"direct_clone_{parent1.model_name}_{random.randint(1000,9999)}"

                # Add whichever children were produced, stopping at the quota.
                if child1:
                    new_population.append(child1)
                    generated_count += 1
                    if generated_count >= num_to_generate: break

                if child2:  # crossover produced a second child
                    new_population.append(child2)
                    generated_count += 1
                    if generated_count >= num_to_generate: break

            except Exception as e:
                logging.error(f"Error during selection/reproduction cycle (attempt {reproduction_attempts}): {e}", exc_info=True)
                # Keep looping; the attempt cap bounds how long we retry.

        # Top up with random individuals if reproduction fell short.
        if generated_count < num_to_generate:
            logging.warning(f"Reproduction cycle finished early or hit attempt limit. Adding {num_to_generate - generated_count} random individuals.")
            input_size = population[0].input_size  # taken from the first individual
            output_size = population[0].output_size
            for _ in range(num_to_generate - generated_count):
                try:
                    random_ind = create_individual_pytorch(input_size, output_size).to(device)
                    new_population.append(random_ind)
                except Exception as e:
                    logging.error(f"Failed to create random individual to fill population: {e}")
                    # The population may end up short in this case.

        population = new_population[:pop_size]  # enforce the population size

        # 4) Periodic checkpointing (every `checkpoint_interval` generations).
        if checkpoint_interval > 0 and (gen + 1) % checkpoint_interval == 0:
            try:
                rnd_state = random.getstate()
                np_rnd_state = np.random.get_state()
                torch_rnd_state = torch.get_rng_state()  # CPU RNG; per-device CUDA RNG states could also be saved
                save_checkpoint_pytorch(output_dir, gen + 1, population, rnd_state, np_rnd_state, torch_rnd_state)
            except Exception as e:
                logging.error(f"Failed to execute checkpoint saving for generation {gen+1}: {e}", exc_info=True)

        # End-of-generation cleanup (matters mostly for GPU memory).
        if device.type == 'cuda':
            torch.cuda.empty_cache()

    # Post-loop: make sure we return *some* best model if one exists.
    if best_model_overall is None:
        logging.warning("Evolution finished, but no single best model was tracked (possibly due to errors or all fitness being non-finite).")
        # Fall back to the best of the final population.
        if population:
            final_fitness_scores = [calculate_fitness_pytorch(ind, X_torch, y_torch, device) for ind in population]
            valid_scores = [(s, i) for i, s in enumerate(final_fitness_scores) if np.isfinite(s)]
            if valid_scores:
                best_idx_final = max(valid_scores, key=lambda item: item[0])[1]
                best_model_overall = clone_pytorch_model(population[best_idx_final], device)  # clone, not alias
                best_fitness_overall = final_fitness_scores[best_idx_final]
                logging.info(f"Selected best model from final population: {best_model_overall.model_name} with fitness {best_fitness_overall:.6f}")
            else:
                logging.error("Evolution finished. No valid finite fitness scores in the final population.")
                return None, best_fitness_history, avg_fitness_history
        else:
            logging.error("Evolution finished with an empty population!")
            return None, best_fitness_history, avg_fitness_history
    else:
        logging.info(f"Evolution finished. Best fitness achieved: {best_fitness_overall:.6f} by model {best_model_overall.model_name}")

    return best_model_overall, best_fitness_history, avg_fitness_history
|
| 719 |
+
# --- Grafik Çizimi (Öncekiyle aynı, Matplotlib kullanıyor) ---
|
| 720 |
+
def plot_fitness_history(history_best: List[float], history_avg: List[float], output_dir: str, filename: str = "fitness_history_pytorch.png") -> None:
    """Plot per-generation best/average fitness and save the figure as a PNG.

    Non-finite values (NaN/Inf) are filtered out before plotting so they do not
    distort the curves. Plotting is best-effort: any failure is logged and
    swallowed rather than propagated.

    Args:
        history_best: best fitness value per generation.
        history_avg: population-average fitness per generation.
        output_dir: directory the plot file is written into.
        filename: output file name inside ``output_dir``.
    """
    if not (history_best and history_avg):
        logging.warning("Fitness history is empty, cannot plot.")
        return
    try:
        plt.figure(figsize=(12, 7))
        generation_axis = np.arange(1, len(history_best) + 1)
        # Keep only the indices whose values are finite; matplotlib handles
        # NaN/Inf poorly (gaps, warnings, broken autoscaling).
        finite_best = [idx for idx, val in enumerate(history_best) if np.isfinite(val)]
        finite_avg = [idx for idx, val in enumerate(history_avg) if np.isfinite(val)]

        if finite_best:
            best_values = np.array(history_best)[finite_best]
            plt.plot(generation_axis[finite_best], best_values, label="Best Fitness", marker='o', linestyle='-', linewidth=2)
        if finite_avg:
            avg_values = np.array(history_avg)[finite_avg]
            plt.plot(generation_axis[finite_avg], avg_values, label="Average Fitness", marker='x', linestyle='--', alpha=0.7)

        plt.xlabel("Generation")
        plt.ylabel("Fitness Score")
        plt.title("Evolutionary Fitness History (PyTorch)")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()

        plot_path = os.path.join(output_dir, filename)
        plt.savefig(plot_path)
        plt.close()  # release figure memory
        logging.info(f"Fitness history plot saved to {plot_path}")
    except Exception as e:
        logging.error(f"Error plotting fitness history: {e}", exc_info=True)
|
| 748 |
+
|
| 749 |
+
|
| 750 |
+
# --- Değerlendirme (PyTorch) ---
|
| 751 |
+
def evaluate_model_pytorch(
    model: "NeuralNetwork",
    X_test: np.ndarray, y_test: np.ndarray,
    batch_size: int, device: torch.device
) -> Dict[str, float]:
    """Evaluate a trained model on the test set with PyTorch.

    Computes the exact test MSE over all output elements and, as a ranking
    metric, the mean Kendall's tau between predicted and target rows on a
    random sample of at most 500 test rows.

    Args:
        model: trained network to evaluate (set to eval mode and moved to
            ``device``). ``None`` is tolerated and yields the failure result.
        X_test: test inputs (converted to float32 tensors).
        y_test: test targets (converted to float32 tensors).
        batch_size: evaluation batch size (no shuffling, order preserved).
        device: torch device to run inference on.

    Returns:
        ``{"test_mse": float, "avg_kendall_tau": float}``; on any failure
        ``test_mse`` is ``inf`` and ``avg_kendall_tau`` is ``0.0``.
    """
    if model is None:
        logging.error("Cannot evaluate a None model.")
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}

    logging.info("Evaluating final model on test data using PyTorch...")
    model.eval()  # evaluation mode (disables dropout/batchnorm updates)
    model.to(device)

    # Wrap the NumPy arrays in a DataLoader for batched inference.
    try:
        test_dataset = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())
        test_loader = DataLoader(test_dataset, batch_size=batch_size)  # shuffle=False matters: keeps row order
    except Exception as e:
        logging.error(f"Failed to create PyTorch DataLoader for test data: {e}", exc_info=True)
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}

    all_preds = []
    all_targets = []
    total_sq_error = 0.0
    total_elements = 0

    try:
        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                # Fix: accumulate the *sum* of squared errors plus the element
                # count instead of averaging per-batch means — the old scheme
                # gave a smaller final batch the same weight as a full one,
                # biasing the reported MSE.
                total_sq_error += torch.sum((outputs - targets) ** 2).item()
                total_elements += targets.numel()
                # Collect predictions and targets on CPU for Kendall's tau.
                all_preds.append(outputs.cpu().numpy())
                all_targets.append(targets.cpu().numpy())

        avg_mse = total_sq_error / total_elements if total_elements > 0 else np.inf
        logging.info(f"Final Test MSE: {avg_mse:.6f}")

        # Kendall's tau on a random subset of rows (per-row rank correlation).
        all_preds_np = np.concatenate(all_preds, axis=0)
        all_targets_np = np.concatenate(all_targets, axis=0)

        sample_size = min(500, all_targets_np.shape[0])
        taus = []
        if sample_size > 0:
            indices = np.random.choice(all_targets_np.shape[0], sample_size, replace=False)
            for i in indices:
                try:
                    tau, _ = kendalltau(all_targets_np[i], all_preds_np[i])
                    if not np.isnan(tau):
                        taus.append(tau)
                except ValueError:  # e.g. constant predictions
                    pass
        avg_kendall_tau = np.mean(taus) if taus else 0.0
        logging.info(f"Average Kendall's Tau (on {sample_size} samples): {avg_kendall_tau:.4f}")

        return {"test_mse": float(avg_mse), "avg_kendall_tau": float(avg_kendall_tau)}

    except Exception as e:
        logging.error(f"Error during final PyTorch model evaluation: {e}", exc_info=True)
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
|
| 816 |
+
|
| 817 |
+
|
| 818 |
+
# --- Son Eğitim (PyTorch) ---
|
| 819 |
+
def train_final_model_pytorch(
    model: "NeuralNetwork",
    X_train: np.ndarray, y_train: np.ndarray,
    epochs: int, batch_size: int, learning_rate: float,
    device: torch.device, output_dir: str
) -> Tuple["NeuralNetwork", Dict[str, Any]]:
    """Train the best evolved model with Adam, LR scheduling and early stopping.

    The data is split 80/20 into train/validation. ``ReduceLROnPlateau`` lowers
    the learning rate when validation loss plateaus; training stops early after
    15 epochs without validation improvement, and the weights from the best
    validation epoch are restored before returning.

    Args:
        model: evolved network to fine-tune (moved to ``device`` in place).
            Assumes the model exposes a ``model_name`` attribute for logging.
        X_train: training inputs (converted to float32 tensors).
        y_train: training targets (converted to float32 tensors).
        epochs: maximum number of training epochs (0 is allowed: no training).
        batch_size: mini-batch size for both loaders.
        learning_rate: initial Adam learning rate.
        device: torch device to train on.
        output_dir: reserved for optional artifacts (currently unused).

    Returns:
        ``(model, summary)`` where summary holds ``epochs_run``,
        ``final_train_loss``, ``best_val_loss`` and ``final_lr`` — or an
        ``"error"`` key when training failed.
    """
    logging.info(f"--- Starting Final Training of Best Evolved Model ({model.model_name}) ---")
    model.to(device)

    # Build train/validation DataLoaders from the NumPy arrays.
    try:
        train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
        val_split = 0.2
        num_train = len(train_dataset)
        split_idx = int(np.floor(val_split * num_train))
        indices = list(range(num_train))
        np.random.shuffle(indices)  # random train/val split
        train_indices, val_indices = indices[split_idx:], indices[:split_idx]

        train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
        val_sampler = torch.utils.data.SubsetRandomSampler(val_indices)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
        val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=val_sampler)
        logging.info(f"Created DataLoaders. Train samples: {len(train_indices)}, Val samples: {len(val_indices)}")
    except Exception as e:
        logging.error(f"Failed to create DataLoaders for final training: {e}", exc_info=True)
        return model, {"error": "DataLoader creation failed"}

    # Optimizer and loss function.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    # Fix: dropped the deprecated `verbose=True` argument — it was removed in
    # recent PyTorch releases and raises TypeError there; the per-epoch log
    # line below already reports the current LR.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=7, min_lr=1e-7)

    # Early-stopping state.
    early_stopping_patience = 15
    best_val_loss = np.inf
    epochs_no_improve = 0
    best_model_state = None  # state_dict snapshot of the best validation epoch

    training_history = {'train_loss': [], 'val_loss': [], 'lr': []}
    epochs_run = 0
    # Fix: pre-initialize so the summary below does not raise NameError when
    # epochs == 0 (the loop body never runs in that case).
    avg_train_loss = 0.0

    try:
        for epoch in range(epochs):
            epochs_run += 1
            model.train()  # training mode
            running_train_loss = 0.0
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                running_train_loss += loss.item()

            avg_train_loss = running_train_loss / len(train_loader) if len(train_loader) > 0 else 0.0
            training_history['train_loss'].append(avg_train_loss)
            training_history['lr'].append(optimizer.param_groups[0]['lr'])  # record current LR

            # ---- Validation ----
            model.eval()
            running_val_loss = 0.0
            with torch.no_grad():
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)
                    running_val_loss += loss.item()

            avg_val_loss = running_val_loss / len(val_loader) if len(val_loader) > 0 else np.inf
            training_history['val_loss'].append(avg_val_loss)

            logging.info(f"Epoch [{epoch+1}/{epochs}] Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f} | LR: {optimizer.param_groups[0]['lr']:.2e}")

            # Learning-rate scheduling on the validation loss.
            scheduler.step(avg_val_loss)

            # Early-stopping bookkeeping.
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                epochs_no_improve = 0
                # Deep-copy the state so later epochs cannot mutate it.
                best_model_state = copy.deepcopy(model.state_dict())
                logging.debug(f"New best validation loss: {best_val_loss:.6f}. Saving model state.")
            else:
                epochs_no_improve += 1

            if epochs_no_improve >= early_stopping_patience:
                logging.info(f"Early stopping triggered after {epoch+1} epochs due to no improvement in validation loss for {early_stopping_patience} epochs.")
                break

        # Fix: explicit None check — a parameter-less model yields an empty
        # (falsy) state_dict, which the old truthiness test would skip.
        if best_model_state is not None:
            logging.info(f"Restoring model to best validation performance (Val Loss: {best_val_loss:.6f}).")
            model.load_state_dict(best_model_state)
        else:
            logging.warning("No best model state was saved during training (possibly validation loss never improved).")

        logging.info("Final training complete.")
        training_summary = {
            "epochs_run": epochs_run,
            "final_train_loss": avg_train_loss,  # loss of the last epoch run
            "best_val_loss": best_val_loss,      # best validation loss seen
            "final_lr": optimizer.param_groups[0]['lr']
        }
        # Training-curve plotting is intentionally left optional:
        # plot_training_history(training_history, output_dir)

        return model, training_summary

    except Exception as e:
        logging.error(f"Error during final PyTorch model training: {e}", exc_info=True)
        return model, {"error": str(e)}
|
| 941 |
+
|
| 942 |
+
|
| 943 |
+
# --- Ana İş Akışı (PyTorch) ---
|
| 944 |
+
def run_pipeline_pytorch(args: argparse.Namespace) -> None:
    """Main PyTorch pipeline with checkpoint/resume support.

    Orchestrates the full run: device setup, output directory and logging,
    optional checkpoint restore (population + RNG states), data generation,
    population initialization, the evolution loop, fitness-history plotting,
    final training of the best evolved model, test-set evaluation, and saving
    of all artifacts (config, model, results JSON).

    Exits the process (sys.exit(1)) on unrecoverable setup failures.
    """

    # Set up the compute device (presumably resolves 'auto' to cuda/cpu —
    # setup_device is defined elsewhere in this module).
    device = setup_device(args.device)

    # Run name and output directory: resuming reuses the old directory,
    # otherwise a timestamped directory is created under the base dir.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"evorun_pt_{timestamp}_gen{args.generations}_pop{args.pop_size}"
    output_dir = args.resume_from if args.resume_from else os.path.join(args.output_base_dir, run_name)
    resume_run = bool(args.resume_from)

    if resume_run:
        run_name = os.path.basename(output_dir)
        logging.info(f"Attempting to resume PyTorch run from: {output_dir}")
        # A resumed run must already have its output directory.
        if not os.path.isdir(output_dir):
            logging.error(f"Resume directory not found: {output_dir}. Exiting.")
            sys.exit(1)
    else:
        try:
            os.makedirs(output_dir, exist_ok=True)
        except OSError as e:
            # Logging is not configured yet, so report to stderr directly.
            print(f"FATAL: Could not create output directory: {output_dir}. Error: {e}", file=sys.stderr)
            sys.exit(1)

    # Configure logging (append mode keeps logs across resumed sessions).
    setup_logging(output_dir)
    logging.info(f"========== Starting/Resuming EvoNet v4 PyTorch Pipeline: {run_name} ==========")
    logging.info(f"Output directory: {output_dir}")
    logging.info(f"Using device: {device}")

    # --- Checkpoint loading ---
    start_generation = 0
    population = []
    initial_state_loaded = False
    loaded_history_best = []  # fitness history restored from a checkpoint (see TODO below)
    loaded_history_avg = []

    latest_checkpoint_path = find_latest_checkpoint_pytorch(output_dir) if resume_run else None

    if latest_checkpoint_path:
        loaded_state = load_checkpoint_pytorch(latest_checkpoint_path, device)
        if loaded_state:
            start_generation = loaded_state['generation']
            population = loaded_state['population']  # models are expected to already be on the right device
            # Restore RNG states so the resumed run continues deterministically.
            try:
                random.setstate(loaded_state['random_state'])
                np.random.set_state(loaded_state['numpy_random_state'])
                torch.set_rng_state(loaded_state['torch_random_state'].cpu())  # state must be a CPU tensor
                if device.type == 'cuda' and 'torch_cuda_random_state' in loaded_state:
                    # TODO: also save/restore the CUDA RNG state if needed
                    # torch.cuda.set_rng_state_all(loaded_state['torch_cuda_random_state'])
                    pass
                logging.info(f"Random states restored from checkpoint (Generation {start_generation}).")
            except Exception as e:
                logging.warning(f"Could not fully restore random states from checkpoint: {e}")

            # TODO: persist and restore the fitness history in the checkpoint too
            # loaded_history_best = loaded_state.get('best_fitness_history', [])
            # loaded_history_avg = loaded_state.get('avg_fitness_history', [])

            initial_state_loaded = True
            logging.info(f"Resuming from Generation {start_generation + 1} with {len(population)} individuals.")
        else:
            logging.error("Failed to load checkpoint. Starting from scratch.")
            resume_run = False
    elif resume_run:
        logging.warning(f"Resume requested but no valid PyTorch checkpoint (.pt) found in {output_dir}. Starting from scratch.")
        resume_run = False

    # --- Fresh-start vs. resume configuration handling ---
    # Save the CLI arguments as JSON on a fresh start (or if the config file
    # is missing); on resume, log the previously saved configuration instead.
    config_path = os.path.join(output_dir, "config_pytorch.json")
    args_dict = vars(args)
    if not initial_state_loaded or not os.path.exists(config_path):
        logging.info("--- Configuration ---")
        for k, v in args_dict.items(): logging.info(f"  {k:<25}: {v}")
        logging.info("---------------------")
        try:
            args_to_save = args_dict.copy()
            # The device object is not JSON-serializable; store its string form.
            args_to_save['device'] = str(device)
            with open(config_path, 'w') as f: json.dump(args_to_save, f, indent=4, sort_keys=True)
            logging.info(f"Configuration saved to {config_path}")
        except Exception as e: logging.error(f"Failed to save configuration: {e}", exc_info=True)
    else:  # resuming and a config exists: log the stored one
        try:
            with open(config_path, 'r') as f: loaded_args_dict = json.load(f)
            logging.info("--- Loaded Configuration (from resumed run) ---")
            for k, v in loaded_args_dict.items(): logging.info(f"  {k:<25}: {v}")
            logging.info("-----------------------------------------------")
            # Optional: compare loaded arguments against the current ones
            # for k, v in args_dict.items():
            #     if k in loaded_args_dict and loaded_args_dict[k] != v:
            #         logging.warning(f"Argument mismatch: '{k}' loaded as {loaded_args_dict[k]}, current is {v}")
        except Exception as e: logging.warning(f"Could not reload config.json: {e}")

    # Seed RNGs only on a fresh start — a restored checkpoint's RNG states
    # would otherwise be overwritten.
    if not initial_state_loaded:
        try:
            seed = args.seed
            random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
            if device.type == 'cuda': torch.cuda.manual_seed_all(seed)  # also seed all GPUs
            # Optionally force deterministic algorithms (may reduce performance):
            # torch.backends.cudnn.deterministic = True
            # torch.backends.cudnn.benchmark = False
            logging.info(f"Using random seed: {seed}")
        except Exception as e: logging.warning(f"Could not set all random seeds: {e}")

    # Data generation (always regenerated — checkpoints do not store the data;
    # a save/load mechanism would be preferable for large datasets).
    try:
        logging.info("Generating/Reloading data...")
        X_train, y_train = generate_data(args.train_samples, args.seq_length)
        X_test, y_test = generate_data(args.test_samples, args.seq_length)
        input_shape = X_train.shape[1]  # feature count only — assumes 2-D arrays; TODO confirm against generate_data
        output_shape = y_train.shape[1]
    except Exception:
        logging.critical("Failed to generate/reload data. Exiting.")
        sys.exit(1)

    # Population initialization (fresh start only; resume keeps the loaded one).
    if not initial_state_loaded:
        logging.info(f"--- Initializing Population (Size: {args.pop_size}) ---")
        try:
            population = [create_individual_pytorch(input_shape, output_shape).to(device) for _ in range(args.pop_size)]
            logging.info("Population initialized successfully.")
        except Exception:
            logging.critical("Failed to initialize population. Exiting.")
            sys.exit(1)

    # --- Evolution ---
    logging.info(f"--- Starting/Resuming PyTorch Evolution ({args.generations} Total Generations) ---")
    best_model_evolved: Optional[NeuralNetwork] = None
    best_fitness_hist = loaded_history_best  # start from any restored history
    avg_fitness_hist = loaded_history_avg

    if start_generation >= args.generations:
        logging.warning(f"Loaded checkpoint generation ({start_generation}) is already >= total generations ({args.generations}). Skipping evolution.")
        # The checkpoint does not store the best model explicitly, so fall
        # back to re-scoring the loaded population (may not match the true
        # historical best — see TODO).
        if population:
            # TODO: saving the best model in the checkpoint would be better.
            # Workaround: pick the best from the loaded population.
            try:
                logging.info("Selecting best model from loaded population as evolution is skipped...")
                fitness_scores_loaded = [calculate_fitness_pytorch(ind, torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float(), device) for ind in population]
                valid_scores_loaded = [(s, i) for i, s in enumerate(fitness_scores_loaded) if np.isfinite(s)]
                if valid_scores_loaded:
                    best_idx_loaded = max(valid_scores_loaded, key=lambda item: item[0])[1]
                    best_model_evolved = clone_pytorch_model(population[best_idx_loaded], device)  # clone to decouple from population
                    logging.info(f"Using model {best_model_evolved.model_name} from loaded population as best evolved model.")
                else:
                    logging.warning("Could not determine best model from loaded population (no finite fitness).")
                    best_model_evolved = None
            except Exception as e:
                logging.error(f"Error selecting best model from loaded population: {e}")
                best_model_evolved = None
        else:
            best_model_evolved = None  # population could not be loaded
        # The history would also need to be restored (see TODO above).
        best_fitness_hist, avg_fitness_hist = [], []
    else:
        try:
            best_model_evolved, gen_best_hist, gen_avg_hist = evolve_population_pytorch(
                population, X_train, y_train, start_generation, args.generations,
                args.crossover_rate, args.mutation_rate, args.weight_mut_rate, args.mutation_strength,
                args.tournament_size, args.elitism_count, args.batch_size,  # NOTE: batch_size is not used directly by evolution
                output_dir, args.checkpoint_interval, device
            )
            # Merge this session's history with any restored history.
            best_fitness_hist.extend(gen_best_hist)
            avg_fitness_hist.extend(gen_avg_hist)

        except Exception as e:
            logging.critical(f"Fatal error during PyTorch evolution process: {e}", exc_info=True)
            sys.exit(1)
    logging.info("--- PyTorch Evolution Complete ---")

    # Persist and plot the fitness history.
    if best_fitness_hist or avg_fitness_hist:
        plot_fitness_history(best_fitness_hist, avg_fitness_hist, output_dir)
        history_path = os.path.join(output_dir, "fitness_history_pytorch.csv")
        try:
            # Save the history as CSV: generation, best, average.
            history_data = np.array([
                np.arange(1, len(best_fitness_hist) + 1),  # generation numbers (1-based)
                best_fitness_hist,
                avg_fitness_hist
            ]).T
            np.savetxt(history_path, history_data, delimiter=',', header='Generation,BestFitness,AvgFitness', comments='', fmt=['%d', '%.8f', '%.8f'])
            logging.info(f"Full fitness history saved to {history_path}")
        except Exception as e:
            logging.error(f"Could not save fitness history data: {e}")
    else:
        logging.warning("Fitness history is empty after evolution, skipping saving/plotting.")

    # Final training, evaluation and result saving for the best model.
    final_model_path = None
    training_summary = {}
    final_metrics = {"test_mse": np.inf, "avg_kendall_tau": 0.0}
    best_model_architecture = {}

    if best_model_evolved is None:
        logging.error("Evolution did not yield a best model. Skipping final training and evaluation.")
    else:
        best_model_architecture = best_model_evolved.get_architecture()
        logging.info(f"Best evolved model architecture: {best_model_architecture}")
        # Log a lightweight model summary (trainable parameter count).
        try:
            num_params = sum(p.numel() for p in best_model_evolved.parameters() if p.requires_grad)
            logging.info(f"Best Evolved Model ({best_model_evolved.model_name}) - Trainable Parameters: {num_params}")
            # For a richer summary a library such as torchinfo could be used:
            # from torchinfo import summary
            # summary(best_model_evolved, input_size=(args.batch_size, input_shape))  # input_size is an example
        except Exception as e:
            logging.warning(f"Could not log model summary details: {e}")

        # Final training on a clone, so the raw evolved weights are preserved.
        try:
            model_to_train = clone_pytorch_model(best_model_evolved, device)
            final_model, training_summary = train_final_model_pytorch(
                model_to_train, X_train, y_train,
                args.epochs_final_train, args.batch_size, args.learning_rate,  # requires a learning_rate CLI argument
                device, output_dir
            )
        except Exception as e:
            logging.error(f"Error during final training setup or execution: {e}", exc_info=True)
            final_model = None  # training failed
            training_summary = {"error": str(e)}

        # Evaluation and model export.
        if final_model:
            final_metrics = evaluate_model_pytorch(final_model, X_test, y_test, args.batch_size, device)
            final_model_path = os.path.join(output_dir, "best_evolved_model_trained_pytorch.pt")
            try:
                # Saving the state_dict (plus architecture) is generally
                # preferable to pickling the whole module.
                torch.save({
                    'architecture': final_model.get_architecture(),
                    'model_state_dict': final_model.state_dict(),
                    # 'optimizer_state_dict': optimizer.state_dict(),  # optimizer state from training, if ever needed
                    'training_summary': training_summary,
                    'evaluation_metrics': final_metrics
                }, final_model_path)
                logging.info(f"Final trained model state and architecture saved to {final_model_path}")
            except Exception as e:
                logging.error(f"Failed to save final trained model: {e}", exc_info=True)
                final_model_path = None  # could not be saved
        else:
            logging.error("Final model training failed or did not produce a model. Skipping evaluation and saving.")

    logging.info("--- Saving Final Results ---")
    final_results = {
        "run_info": {
            "run_name": run_name,
            "timestamp": timestamp,
            "output_directory": output_dir,
            "framework": "PyTorch",
            "device_used": str(device),
            "resumed_run": resume_run,
            "last_checkpoint_loaded": latest_checkpoint_path
        },
        "config": args_dict,  # original CLI arguments
        "evolution_summary": {
            "start_generation": start_generation,
            "end_generation": start_generation + len(best_fitness_hist) - (1 if loaded_history_best else 0),  # last generation executed
            "generations_run_this_session": len(best_fitness_hist) - len(loaded_history_best),
            "best_fitness_achieved_overall": max(best_fitness_hist) if best_fitness_hist and any(np.isfinite(f) for f in best_fitness_hist) else None,
            "best_fitness_final_gen": best_fitness_hist[-1] if best_fitness_hist and np.isfinite(best_fitness_hist[-1]) else None,
            "avg_fitness_final_gen": avg_fitness_hist[-1] if avg_fitness_hist and np.isfinite(avg_fitness_hist[-1]) else None,
            "best_model_architecture": best_model_architecture
        },
        "final_training_summary": training_summary,
        "final_evaluation_on_test": final_metrics,
        "saved_trained_model_path": final_model_path
    }
    results_path = os.path.join(output_dir, "final_results_pytorch.json")
    try:
        # Convert NumPy/torch values into JSON-serializable Python types.
        def convert_types(obj):
            if isinstance(obj, np.integer): return int(obj)
            elif isinstance(obj, np.floating): return float(obj)
            elif isinstance(obj, np.ndarray): return obj.tolist()
            elif isinstance(obj, torch.Tensor): return obj.tolist()  # tensors become nested lists
            elif isinstance(obj, torch.device): return str(obj)  # device becomes a string
            elif isinstance(obj, type): return obj.__name__  # classes saved by name
            return obj
        with open(results_path, 'w') as f:
            json.dump(final_results, f, indent=4, default=convert_types, sort_keys=True)
        logging.info(f"Final results summary saved to {results_path}")
    except Exception as e:
        logging.error(f"Failed to save final results JSON: {e}", exc_info=True)

    logging.info(f"========== PyTorch Pipeline Run {run_name} Finished ==========")
|
| 1253 |
+
|
| 1254 |
+
|
| 1255 |
+
# --- Argüman Ayrıştırıcı (PyTorch için Eklemeler) ---
|
| 1256 |
+
def parse_arguments_v4() -> argparse.Namespace:
    """Parse CLI arguments for the v4 pipeline and sanity-check evolution parameters.

    Returns:
        argparse.Namespace with all run settings; `seed` is filled with a random
        value when not supplied, and elitism/tournament sizes are clamped to
        valid ranges relative to the population size.
    """
    parser = argparse.ArgumentParser(description="EvoNet v4: Neuroevolution with PyTorch, Crossover & Checkpointing")

    # --- Directories and run control ---
    parser.add_argument('--output_base_dir', type=str, default=DEFAULT_OUTPUT_BASE_DIR, help='Base directory for new runs.')
    parser.add_argument('--resume_from', type=str, default=None, help='Path to a previous run directory to resume from (PyTorch checkpoints).')
    parser.add_argument('--checkpoint_interval', type=int, default=DEFAULT_CHECKPOINT_INTERVAL, help='Save checkpoint every N generations (0 to disable).')
    parser.add_argument('--device', type=str, default=DEFAULT_DEVICE, choices=['auto', 'cpu', 'cuda'], help='Device to use (cpu, cuda, or auto-detect).')

    # --- Data settings ---
    parser.add_argument('--seq_length', type=int, default=DEFAULT_SEQ_LENGTH, help='Length of sequences.')
    parser.add_argument('--train_samples', type=int, default=5000, help='Number of training samples.')
    parser.add_argument('--test_samples', type=int, default=1000, help='Number of test samples.')

    # --- Evolution parameters ---
    parser.add_argument('--pop_size', type=int, default=DEFAULT_POP_SIZE, help='Population size.')
    parser.add_argument('--generations', type=int, default=DEFAULT_GENERATIONS, help='Total number of generations.')
    parser.add_argument('--crossover_rate', type=float, default=DEFAULT_CROSSOVER_RATE, help='Probability of applying crossover.')
    parser.add_argument('--mutation_rate', type=float, default=DEFAULT_MUTATION_RATE, help='Probability of applying mutation (if crossover is not applied).')
    parser.add_argument('--weight_mut_rate', type=float, default=DEFAULT_WEIGHT_MUT_RATE, help='Probability for each weight/bias to be mutated if mutation occurs.')
    parser.add_argument('--mutation_strength', type=float, default=DEFAULT_MUTATION_STRENGTH, help='Std dev for weight mutation noise (Gaussian).')
    parser.add_argument('--tournament_size', type=int, default=DEFAULT_TOURNAMENT_SIZE, help='Tournament selection size.')
    parser.add_argument('--elitism_count', type=int, default=DEFAULT_ELITISM_COUNT, help='Number of elite individuals to carry over.')

    # --- Training and evaluation ---
    parser.add_argument('--batch_size', type=int, default=DEFAULT_BATCH_SIZE, help='Batch size for final training and evaluation.')
    parser.add_argument('--epochs_final_train', type=int, default=DEFAULT_EPOCHS_FINAL_TRAIN, help='Max epochs for final training of the best model.')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate for Adam optimizer during final training.')

    # --- Reproducibility ---
    parser.add_argument('--seed', type=int, default=None, help='Random seed for Python, NumPy, and PyTorch (default: random).')

    args = parser.parse_args()
    if args.seed is None:
        # No seed given: draw one so the run is still reproducible after the fact.
        args.seed = random.randint(0, 2**32 - 1)
        print(f"Generated random seed: {args.seed}")

    # Basic consistency checks (warn and clamp rather than abort)
    if args.elitism_count >= args.pop_size:
        print(f"Warning: Elitism count ({args.elitism_count}) >= Population size ({args.pop_size}). Setting elitism to PopSize - 1.")
        args.elitism_count = max(0, args.pop_size - 1)
    if args.tournament_size <= 0:
        print(f"Warning: Tournament size ({args.tournament_size}) must be > 0. Setting to 1.")
        args.tournament_size = 1
    if args.tournament_size > args.pop_size:
        print(f"Warning: Tournament size ({args.tournament_size}) > Population size ({args.pop_size}). Setting to PopSize.")
        args.tournament_size = args.pop_size

    return args
|
| 1305 |
+
|
| 1306 |
+
|
| 1307 |
+
# --- Main execution block ---
if __name__ == "__main__":
    cli_args = parse_arguments_v4()
    try:
        run_pipeline_pytorch(cli_args)
    except SystemExit:
        logging.info("SystemExit caught, exiting gracefully.")
        pass  # raised by argparse or by deliberate exits; nothing to clean up
    except KeyboardInterrupt:
        print("\nKeyboardInterrupt detected. Exiting...")
        logging.warning("KeyboardInterrupt detected. Attempting graceful shutdown.")
        sys.exit(130)  # conventional exit code for Ctrl+C
    except Exception as e:
        # If logging is already set up, record the fatal error there
        if logging.getLogger().hasHandlers():
            logging.critical("FATAL UNHANDLED ERROR in main execution block:", exc_info=True)
        else:  # logging not started yet -> fall back to stderr
            import traceback
            print(f"\nFATAL UNHANDLED ERROR in main execution block: {e}", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(1)  # failure exit code
|
v5.py
ADDED
|
@@ -0,0 +1,1330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==============================================================================
|
| 2 |
+
# EvoNet Optimizer - v5 - Adaptif & Paralel PyTorch Sürümü
|
| 3 |
+
# Açıklama: v4 üzerine inşa edilmiştir. Adaptif mutasyon gücü, fitness'ta
|
| 4 |
+
# karmaşıklık cezası, paralel fitness hesaplama (CPU),
|
| 5 |
+
# opsiyonel Weights & Biases entegrasyonu ve genel iyileştirmeler içerir.
|
| 6 |
+
# ==============================================================================
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
# os.environ["WANDB_SILENT"] = "true" # W&B loglarını azaltmak için (isteğe bağlı)
|
| 10 |
+
import sys
|
| 11 |
+
import argparse
|
| 12 |
+
import random
|
| 13 |
+
import logging
|
| 14 |
+
from datetime import datetime
|
| 15 |
+
import json
|
| 16 |
+
import copy
|
| 17 |
+
import time
|
| 18 |
+
from typing import List, Tuple, Dict, Any, Optional, Union
|
| 19 |
+
import concurrent.futures # Paralel fitness hesaplama için
|
| 20 |
+
|
| 21 |
+
import numpy as np
|
| 22 |
+
import torch
|
| 23 |
+
import torch.nn as nn
|
| 24 |
+
import torch.optim as optim
|
| 25 |
+
from torch.utils.data import TensorDataset, DataLoader
|
| 26 |
+
import matplotlib.pyplot as plt
|
| 27 |
+
from scipy.stats import kendalltau
|
| 28 |
+
|
| 29 |
+
# Opsiyonel W&B importu
|
| 30 |
+
try:
|
| 31 |
+
import wandb
|
| 32 |
+
_WANDB_AVAILABLE = True
|
| 33 |
+
except ImportError:
|
| 34 |
+
_WANDB_AVAILABLE = False
|
| 35 |
+
print("Warning: wandb library not found. Experiment tracking with W&B is disabled.")
|
| 36 |
+
print("Install with: pip install wandb")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# --- Constants and default values ---
DEFAULT_SEQ_LENGTH = 10
DEFAULT_POP_SIZE = 50
DEFAULT_GENERATIONS = 50
DEFAULT_CROSSOVER_RATE = 0.6
DEFAULT_MUTATION_RATE = 0.4
DEFAULT_WEIGHT_MUT_RATE = 0.8
DEFAULT_MUTATION_STRENGTH = 0.1  # initial mutation strength (std-dev of noise)
DEFAULT_TOURNAMENT_SIZE = 5
DEFAULT_ELITISM_COUNT = 2
DEFAULT_EPOCHS_FINAL_TRAIN = 100
DEFAULT_BATCH_SIZE = 64
DEFAULT_OUTPUT_BASE_DIR = os.path.join(os.getcwd(), "evonet_runs_v5_pytorch")
DEFAULT_CHECKPOINT_INTERVAL = 10
DEFAULT_DEVICE = "auto"
DEFAULT_NUM_WORKERS = 0  # worker count for parallel fitness (0 = off / main thread)

# Adaptive mutation parameters
DEFAULT_ADAPT_MUTATION = True
DEFAULT_STAGNATION_LIMIT = 10  # generations without improvement before adapting
DEFAULT_MUT_STRENGTH_DECAY = 0.98  # shrink factor applied on improvement
DEFAULT_MUT_STRENGTH_INCREASE = 1.1  # growth factor applied on stagnation
DEFAULT_MIN_MUT_STRENGTH = 0.005
DEFAULT_MAX_MUT_STRENGTH = 0.5

# Advanced fitness parameters
DEFAULT_COMPLEXITY_PENALTY = 0.00001  # penalty weight per trainable parameter
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# --- Loglama Ayarları ---
|
| 69 |
+
# (setup_logging fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 70 |
+
def setup_logging(log_dir: str, log_level=logging.INFO) -> None:
    """Configure root logging to write both to the run's log file and stdout.

    Any previously installed root handlers are closed and removed first so
    repeated calls (e.g. on resume) do not duplicate output.
    """
    log_path = os.path.join(log_dir, 'evolution_run_pytorch_v5.log')

    # Tear down whatever handler set a previous configuration left behind.
    for stale_handler in list(logging.root.handlers):
        stale_handler.close()
        logging.root.removeHandler(stale_handler)

    file_handler = logging.FileHandler(log_path, mode='a')
    console_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)-8s [%(filename)s:%(lineno)d] - %(message)s',
        handlers=[file_handler, console_handler],
    )

    banner = "=" * 50
    logging.info(banner)
    logging.info("PyTorch EvoNet v5 Logging Başlatıldı.")
    logging.info(banner)
|
| 86 |
+
|
| 87 |
+
# --- Cihaz (GPU/CPU) Ayarları ---
|
| 88 |
+
# (setup_device fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 89 |
+
def setup_device(requested_device: str) -> torch.device:
    """Resolve a device request ('auto' / 'cuda' / 'cpu') to a torch.device.

    'auto' prefers CUDA when available; an explicit 'cuda' request falls back
    to CPU (with a warning) when no GPU is present; anything else means CPU.
    """
    cuda_ok = torch.cuda.is_available()

    if requested_device == "auto":
        if cuda_ok:
            logging.info(f"CUDA (GPU) kullanılabilir: {torch.cuda.get_device_name(0)}")
            chosen = "cuda"
        else:
            logging.info("CUDA (GPU) bulunamadı. CPU kullanılacak.")
            chosen = "cpu"
    elif requested_device == "cuda":
        if cuda_ok:
            logging.info(f"CUDA (GPU) manuel olarak seçildi: {torch.cuda.get_device_name(0)}")
            chosen = "cuda"
        else:
            logging.warning("CUDA (GPU) istendi ancak bulunamadı! CPU kullanılacak.")
            chosen = "cpu"
    else:
        # Explicit CPU request — or an unrecognized value — both land on CPU.
        chosen = "cpu"
        logging.info("CPU manuel olarak seçildi veya geçersiz cihaz belirtildi.")

    return torch.device(chosen)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# --- Veri Üretimi ---
|
| 113 |
+
# (generate_data fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 114 |
+
def generate_data(num_samples: int, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Create a random sorting task.

    Inputs are uniform float32 values in [0, 100); each target row is the
    corresponding input row sorted ascending.

    Returns:
        (X, y): two float32 arrays of shape (num_samples, seq_length).
    """
    logging.info(f"Generating {num_samples} samples with sequence length {seq_length}...")
    try:
        inputs = np.random.rand(num_samples, seq_length).astype(np.float32) * 100
        targets = np.sort(inputs, axis=1).astype(np.float32)
    except Exception as e:
        logging.error(f"Error during data generation: {e}", exc_info=True)
        raise
    logging.info("Data generation successful.")
    return inputs, targets
|
| 124 |
+
|
| 125 |
+
# --- PyTorch Sinir Ağı Modeli ---
|
| 126 |
+
# (NeuralNetwork sınıfı öncekiyle büyük ölçüde aynı, v4'teki gibi)
|
| 127 |
+
# Küçük iyileştirme: get_num_params metodu eklendi.
|
| 128 |
+
class NeuralNetwork(nn.Module):
    """A simple, dynamically configured PyTorch MLP.

    Built from an input size, an output size, a list of hidden widths, and a
    matching list of activation names ('relu' / 'tanh' / 'sigmoid'). Equality
    and hashing are defined on the architecture only — weights are ignored.
    """

    def __init__(self, input_size: int, output_size: int, hidden_dims: List[int], activations: List[str]):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_dims = hidden_dims
        self.activations_str = activations

        # Activation name -> module class; unknown names fall back to ReLU.
        act_table = {'relu': nn.ReLU, 'tanh': nn.Tanh, 'sigmoid': nn.Sigmoid}

        modules: List[nn.Module] = []
        prev_dim = input_size
        for width, act_name in zip(hidden_dims, activations):
            modules.append(nn.Linear(prev_dim, width))
            act_cls = act_table.get(act_name.lower())
            if act_cls is None:
                logging.warning(f"Bilinmeyen aktivasyon '{act_name}', ReLU kullanılıyor.")
                act_cls = nn.ReLU
            modules.append(act_cls())
            prev_dim = width
        modules.append(nn.Linear(prev_dim, output_size))

        self.network = nn.Sequential(*modules)
        self.architecture_id = self._generate_architecture_id()
        # Random suffix distinguishes individuals sharing one architecture.
        self.model_name = f"model_{self.architecture_id}_rnd{random.randint(10000, 99999)}"

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the MLP on a batch of inputs."""
        return self.network(x)

    def get_architecture(self) -> Dict[str, Any]:
        """Return the constructor kwargs that fully describe this architecture."""
        return {
            "input_size": self.input_size,
            "output_size": self.output_size,
            "hidden_dims": self.hidden_dims,
            "activations": self.activations_str,
        }

    def _generate_architecture_id(self) -> str:
        """Build a compact, human-readable architecture fingerprint."""
        dims = '_'.join(str(d) for d in self.hidden_dims)
        act_initials = ''.join(a[0].upper() for a in self.activations_str)
        return f"I{self.input_size}_H{dims}_A{act_initials}_O{self.output_size}"

    def get_num_params(self, trainable_only: bool = True) -> int:
        """Count model parameters, optionally restricted to trainable ones."""
        if trainable_only:
            return sum(p.numel() for p in self.parameters() if p.requires_grad)
        return sum(p.numel() for p in self.parameters())

    def __eq__(self, other):
        # Architecture equality only; weight values play no part.
        if not isinstance(other, NeuralNetwork):
            return NotImplemented
        return self.get_architecture() == other.get_architecture()

    def __hash__(self):
        # Must agree with __eq__: hash the (hashable) architecture tuple.
        return hash((self.input_size, self.output_size,
                     tuple(self.hidden_dims), tuple(self.activations_str)))
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# --- Neuroevolution Çekirdeği (PyTorch v5) ---
|
| 184 |
+
|
| 185 |
+
# (create_individual_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 186 |
+
def create_individual_pytorch(input_size: int, output_size: int) -> NeuralNetwork:
    """Build one NeuralNetwork with a randomly sampled hidden architecture.

    Samples 1–4 hidden layers, widths in [16, 128], and one of
    relu/tanh/sigmoid per layer.
    """
    try:
        depth = random.randint(1, 4)
        widths = [random.randint(16, 128) for _ in range(depth)]
        acts = [random.choice(['relu', 'tanh', 'sigmoid']) for _ in range(depth)]
        individual = NeuralNetwork(input_size, output_size, widths, acts)
        logging.debug(f"Created individual: {individual.model_name} with {individual.get_num_params()} params")
        return individual
    except Exception as e:
        logging.error(f"Error creating PyTorch individual model: {e}", exc_info=True)
        raise
|
| 198 |
+
|
| 199 |
+
# (clone_pytorch_model fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 200 |
+
def clone_pytorch_model(model: NeuralNetwork, device: torch.device) -> NeuralNetwork:
    """Deep-copy a model (architecture and weights) onto the given device.

    The clone receives a fresh 'cloned_...' model name so lineages stay
    traceable in the logs.
    """
    try:
        replica = NeuralNetwork(**model.get_architecture())
        replica.load_state_dict(copy.deepcopy(model.state_dict()))
        replica.to(device)
        replica.model_name = f"cloned_{model.model_name}_{random.randint(1000,9999)}"
        logging.debug(f"Cloned model {model.model_name} to {replica.model_name}")
        return replica
    except Exception as e:
        logging.error(f"Error cloning PyTorch model {model.model_name}: {e}", exc_info=True)
        raise
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
# Bu fonksiyon paralel işçiler tarafından çağrılacak
|
| 216 |
+
# Doğrudan model objesi yerine state_dict ve mimari alıyor
|
| 217 |
+
def _calculate_fitness_worker(
    model_arch: Dict[str, Any],
    model_state_dict: Dict[str, torch.Tensor],
    X_np: np.ndarray,
    y_np: np.ndarray,
    device_str: str,
    fitness_params: Dict
) -> float:
    """Score one model on (X_np, y_np); designed to run in a parallel worker.

    Receives only picklable pieces (architecture dict + state dict + NumPy
    data + device name), rebuilds the model locally, and returns a plain
    float. -inf signals a failed or non-finite evaluation and is handled by
    the parent process.
    """
    try:
        # Rebuild the model from its serialized form on the requested device.
        device = torch.device(device_str)
        net = NeuralNetwork(**model_arch)
        net.load_state_dict(model_state_dict)
        net.to(device)
        net.eval()

        # Move the evaluation data onto the same device.
        features = torch.from_numpy(X_np).float().to(device)
        targets = torch.from_numpy(y_np).float().to(device)

        penalty_weight = fitness_params.get('complexity_penalty', 0.0)

        with torch.no_grad():
            predictions = net(features)
            mse = torch.mean((predictions - targets) ** 2).item()

        if not np.isfinite(mse):
            # Workers stay silent; the parent interprets -inf as a failure.
            return -np.inf

        # Base fitness: inverse MSE (epsilon guards against division by zero).
        score = 1.0 / (mse + 1e-9)

        # Optional penalty proportional to the trainable parameter count.
        if penalty_weight > 0:
            score -= penalty_weight * net.get_num_params(trainable_only=True)

        # NOTE: further fitness terms (e.g. a Kendall-tau ranking bonus keyed
        # by fitness_params['w_tau']) could be mixed into `score` here.

        if not np.isfinite(score):
            return -np.inf

        return float(score)

    except Exception as e:
        # The worker has no handle on the parent's log file, so report on
        # stderr and fall back to the sentinel value.
        print(f"[Worker Error] Failed to calculate fitness: {e}", file=sys.stderr)
        return -np.inf
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
# (mutate_individual_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 283 |
+
# Sadece mutasyon gücünü parametre olarak alıyor
|
| 284 |
+
def mutate_individual_pytorch(
    individual: NeuralNetwork,
    weight_mut_rate: float,
    current_mutation_strength: float,  # adaptive strength supplied by the caller
    device: torch.device
) -> NeuralNetwork:
    """Return a clone of `individual` with Gaussian weight noise applied.

    Each floating-point tensor in the clone's state dict is perturbed with
    probability `weight_mut_rate`; the noise std-dev is
    `current_mutation_strength` (driven by the adaptive-mutation controller).

    Args:
        individual: Parent model (left untouched).
        weight_mut_rate: Per-tensor probability of being mutated.
        current_mutation_strength: Std-dev of the Gaussian noise.
        device: Device the mutated clone should live on.

    Returns:
        A new NeuralNetwork; on internal error, a plain unmutated clone.
    """
    try:
        mutated_model = clone_pytorch_model(individual, device)
        mutated_model.model_name = f"mutated_{individual.model_name}_{random.randint(1000,9999)}"
        mutated = False
        state_dict = mutated_model.state_dict()
        new_state_dict = copy.deepcopy(state_dict)

        for name, param in new_state_dict.items():
            # BUGFIX: state_dict() returns *detached* tensors (keep_vars=False
            # by default), so their requires_grad flag is always False and the
            # old `param.requires_grad` guard disabled mutation entirely.
            # Gate on floating-point dtype instead so weights/biases mutate
            # while any integer buffers are left alone.
            if torch.is_floating_point(param) and random.random() < weight_mut_rate:
                mutated = True
                noise = torch.randn_like(param) * current_mutation_strength  # adaptive strength
                new_state_dict[name] = param + noise.to(param.device)

        if mutated:
            mutated_model.load_state_dict(new_state_dict)
            logging.debug(f"Mutated model {individual.model_name} -> {mutated_model.model_name} with strength {current_mutation_strength:.4f}")
            return mutated_model
        else:
            logging.debug(f"Mutation applied to {individual.model_name}, but no weights changed based on rate.")
            return mutated_model  # return the (unchanged) clone

    except Exception as e:
        logging.error(f"Error during PyTorch mutation of model {individual.model_name}: {e}", exc_info=True)
        return clone_pytorch_model(individual, device)
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
# (check_architecture_compatibility_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 318 |
+
def check_architecture_compatibility_pytorch(model1: NeuralNetwork, model2: NeuralNetwork) -> bool:
    """True when both models describe an identical architecture (weights ignored)."""
    arch_a = model1.get_architecture()
    arch_b = model2.get_architecture()
    return arch_a == arch_b
|
| 320 |
+
|
| 321 |
+
# (crossover_individuals_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 322 |
+
def crossover_individuals_pytorch(
    parent1: NeuralNetwork,
    parent2: NeuralNetwork,
    device: torch.device
) -> Tuple[Optional[NeuralNetwork], Optional[NeuralNetwork]]:
    """Produce two children by uniform weight crossover of compatible parents.

    A shared random mask picks each weight entry from one parent or the other;
    the two children receive complementary picks. Returns (None, None) when
    the parents' architectures differ or crossover fails.
    """
    if not check_architecture_compatibility_pytorch(parent1, parent2):
        logging.debug(f"Skipping crossover between {parent1.model_name} and {parent2.model_name} due to incompatible architectures.")
        return None, None
    try:
        arch = parent1.get_architecture()
        offspring_a = NeuralNetwork(**arch).to(device)
        offspring_b = NeuralNetwork(**arch).to(device)
        offspring_a.model_name = f"xover_{parent1.architecture_id}_c1_{random.randint(1000,9999)}"
        offspring_b.model_name = f"xover_{parent1.architecture_id}_c2_{random.randint(1000,9999)}"

        state_p1 = parent1.state_dict()
        state_p2 = parent2.state_dict()
        state_a = offspring_a.state_dict()
        state_b = offspring_b.state_dict()

        for key in state_p1:
            w1, w2 = state_p1[key], state_p2[key]
            # Entry-wise coin flip: True -> take from parent1 for child A.
            pick_first = torch.rand_like(w1) < 0.5
            state_a[key] = torch.where(pick_first, w1, w2)
            state_b[key] = torch.where(pick_first, w2, w1)

        offspring_a.load_state_dict(state_a)
        offspring_b.load_state_dict(state_b)
        logging.debug(f"Crossover performed between {parent1.model_name} and {parent2.model_name}")
        return offspring_a, offspring_b
    except Exception as e:
        logging.error(f"Error during PyTorch crossover between {parent1.model_name} and {parent2.model_name}: {e}", exc_info=True)
        return None, None
|
| 351 |
+
|
| 352 |
+
# (tournament_selection fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 353 |
+
def tournament_selection(
    population: List[NeuralNetwork],
    fitness_scores: List[float],
    k: int
) -> NeuralNetwork:
    """Select one individual via a k-way tournament over finitely-scored members.

    Only individuals with finite fitness compete; when none exist, or the
    sampling itself fails, a uniformly random member is returned instead.

    Raises:
        ValueError: if `population` is empty.
    """
    if not population:
        raise ValueError("Population cannot be empty")

    finite_idx = [i for i, s in enumerate(fitness_scores) if np.isfinite(s)]
    if not finite_idx:
        logging.warning("No individuals with finite fitness scores found for tournament selection. Returning random individual.")
        return random.choice(population)

    # Clamp tournament size to the number of eligible contenders.
    if len(finite_idx) < k:
        k = len(finite_idx)
    if k <= 0:
        k = 1

    try:
        contenders = random.sample(finite_idx, k)
        best = max(contenders, key=lambda i: fitness_scores[i])
        return population[best]
    except Exception as e:
        logging.error(f"Error during tournament selection: {e}", exc_info=True)
        return random.choice(population)  # random fallback on failure
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
# --- Checkpointing (PyTorch v5) ---
|
| 379 |
+
# (save_checkpoint_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 380 |
+
# İsteğe bağlı: Adaptif durum veya W&B run ID'si eklenebilir.
|
| 381 |
+
def save_checkpoint_pytorch(output_dir: str, generation: int, population: List[NeuralNetwork],
                            rnd_state: Any, np_rnd_state: Any, torch_rnd_state: Any,
                            wandb_run_id: Optional[str] = None):
    """Persist the evolution state (PyTorch v5) for a given generation.

    Serializes each individual's name, architecture and weights together with
    the RNG states (``random`` / NumPy / torch) and an optional W&B run id,
    then writes everything via ``torch.save`` to
    ``<output_dir>/checkpoints_pytorch_v5/evo_gen_<generation>.pt``.
    Individual models that fail to serialize are logged and skipped.
    """
    ckpt_dir = os.path.join(output_dir, "checkpoints_pytorch_v5")
    os.makedirs(ckpt_dir, exist_ok=True)
    ckpt_path = os.path.join(ckpt_dir, f"evo_gen_{generation}.pt")
    logging.info(f"Saving checkpoint for generation {generation} to {ckpt_path}...")

    serialized_population = []
    for individual in population:
        # A model that cannot be serialized is dropped (with a log), not fatal.
        try:
            serialized_population.append({
                "name": individual.model_name,
                "architecture": individual.get_architecture(),
                "state_dict": individual.state_dict()
            })
        except Exception as e:
            logging.error(f"Could not serialize model {individual.model_name} for checkpoint: {e}")

    payload = {
        "version": "v5",  # version tag checked on load for compatibility
        "generation": generation,
        "population_state": serialized_population,
        "random_state": rnd_state,
        "numpy_random_state": np_rnd_state,
        "torch_random_state": torch_rnd_state,
        "wandb_run_id": wandb_run_id,  # lets a resumed run reattach to W&B
        "timestamp": datetime.now().isoformat()
        # NOTE(review): adaptive-mutation state (strength, stagnation counter)
        # is not checkpointed and resets on resume.
    }
    try:
        torch.save(payload, ckpt_path)
        logging.info(f"Checkpoint saved successfully for generation {generation}.")
    except Exception as e:
        logging.error(f"Failed to save checkpoint using torch.save for generation {generation}: {e}", exc_info=True)
|
| 417 |
+
|
| 418 |
+
# (load_checkpoint_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 419 |
+
# Sadece W&B run ID'sini okur
|
| 420 |
+
def load_checkpoint_pytorch(checkpoint_path: str, device: torch.device) -> Optional[Dict]:
    """Load a saved PyTorch v5 evolution state from disk.

    Rebuilds every stored model (architecture + weights), moves it to
    ``device`` and switches it to eval mode. Returns the checkpoint dict with
    an added ``"population"`` list of rebuilt models, or None when the file
    is missing, unreadable, or no model could be restored.
    """
    if not os.path.exists(checkpoint_path):
        logging.error(f"Checkpoint file not found: {checkpoint_path}")
        return None
    logging.info(f"Loading checkpoint from {checkpoint_path}...")
    try:
        # Deserialize onto CPU first; models are moved to `device` afterwards.
        checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        if checkpoint.get("version") != "v5":
            logging.warning(f"Loading checkpoint from a different version ({checkpoint.get('version', 'Unknown')}). Compatibility not guaranteed.")

        restored = []
        for model_state in checkpoint["population_state"]:
            try:
                net = NeuralNetwork(**model_state["architecture"])
                net.load_state_dict(model_state["state_dict"])
                net.to(device)
                net.model_name = model_state.get("name", f"loaded_model_{random.randint(1000,9999)}")
                net.eval()
                restored.append(net)
            except Exception as e:
                logging.error(f"Failed to load model state from checkpoint for model {model_state.get('name', 'UNKNOWN')}: {e}", exc_info=True)

        if not restored:
            logging.error("Failed to load any model from the checkpoint population state.")
            return None

        checkpoint["population"] = restored
        logging.info(f"Checkpoint loaded successfully. Resuming from generation {checkpoint['generation'] + 1}.")
        # Normalize the W&B run id key so callers can rely on its presence.
        checkpoint["wandb_run_id"] = checkpoint.get("wandb_run_id")
        return checkpoint
    except Exception as e:
        logging.error(f"Failed to load checkpoint from {checkpoint_path}: {e}", exc_info=True)
        return None
|
| 456 |
+
|
| 457 |
+
# (find_latest_checkpoint_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 458 |
+
# Sadece klasör adını v5'e göre güncelleyebiliriz
|
| 459 |
+
def find_latest_checkpoint_pytorch(output_dir: str) -> Optional[str]:
    """Return the path of the newest PyTorch v5 checkpoint in ``output_dir``.

    Scans ``<output_dir>/checkpoints_pytorch_v5`` for files named
    ``evo_gen_<N>.pt`` and returns the one with the highest generation
    number N, or None when the directory or any such file is missing.
    Files whose generation number cannot be parsed are logged and skipped.
    """
    ckpt_dir = os.path.join(output_dir, "checkpoints_pytorch_v5")  # v5 folder
    if not os.path.isdir(ckpt_dir):
        return None

    candidates = [name for name in os.listdir(ckpt_dir)
                  if name.startswith("evo_gen_") and name.endswith(".pt")]
    if not candidates:
        return None

    newest_gen = -1
    newest_path = None
    for name in candidates:
        # File names look like "evo_gen_<N>.pt"; skip anything unparsable.
        try:
            gen = int(name.split('_')[2].split('.')[0])
        except (IndexError, ValueError):
            logging.warning(f"Could not parse generation number from checkpoint file: {name}")
            continue
        if gen > newest_gen:
            newest_gen = gen
            newest_path = os.path.join(ckpt_dir, name)
    return newest_path
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
# --- Ana Evrim Döngüsü (PyTorch v5 - Adaptif, Paralel) ---
|
| 480 |
+
def evolve_population_pytorch_v5(
    population: List[NeuralNetwork],
    X_train_np: np.ndarray, y_train_np: np.ndarray,  # training data as NumPy arrays
    start_generation: int, total_generations: int,
    crossover_rate: float, mutation_rate: float, weight_mut_rate: float,
    args: argparse.Namespace,  # full CLI namespace (adaptive / parallel settings)
    output_dir: str, device: torch.device,
    wandb_run: Optional[Any]  # active W&B run object, or None
) -> Tuple[Optional[NeuralNetwork], List[float], List[float]]:
    """Run the PyTorch v5 evolutionary loop (adaptive mutation, parallel fitness).

    Per generation: evaluate fitness (in a process pool when
    ``args.num_workers > 0``, otherwise sequentially on CPU), track the best
    individual, optionally log to W&B, adapt the mutation strength, then
    build the next population via elitism + tournament selection with
    crossover / mutation / cloning, and checkpoint periodically.

    Returns (best_model, best_fitness_history, avg_fitness_history).

    BUGFIX vs previous revision: fitness jobs were submitted to the process
    pool TWICE per generation (once into a throwaway ``futures`` list whose
    results were discarded, then again into ``futures_map``), doubling the
    evaluation work. Jobs are now submitted exactly once, keyed by original
    index so out-of-order completions land in the right slot.
    """
    best_fitness_history = []
    avg_fitness_history = []
    best_model_overall: Optional[NeuralNetwork] = None
    best_fitness_overall = -np.inf

    # Adaptive-mutation state.
    current_mutation_strength = args.mutation_strength
    stagnation_counter = 0

    pop_size = len(population)
    fitness_params = {'complexity_penalty': args.complexity_penalty}  # passed to fitness workers

    # Process pool for parallel fitness evaluation (workers > 0).
    # NOTE(review): the default multiprocessing start method is used here;
    # "spawn" may be safer when CUDA is involved — confirm on target platform.
    executor = concurrent.futures.ProcessPoolExecutor(max_workers=args.num_workers) if args.num_workers > 0 else None
    if executor:
        logging.info(f"Using ProcessPoolExecutor with {args.num_workers} workers for fitness evaluation.")

    try:  # try...finally guarantees the executor is shut down
        for gen in range(start_generation, total_generations):
            generation_start_time = time.time()

            # 1. Fitness evaluation (parallel or sequential).
            fitness_scores = [-np.inf] * pop_size
            population_states = [(ind.get_architecture(), ind.state_dict()) for ind in population]

            try:
                if executor and args.num_workers > 0:
                    # Submit each individual ONCE and remember its original
                    # index; as_completed yields results out of order.
                    futures_map = {executor.submit(_calculate_fitness_worker,
                                                   arch, state, X_train_np, y_train_np,
                                                   str(device), fitness_params): index
                                   for index, (arch, state) in enumerate(population_states)}
                    for future in concurrent.futures.as_completed(futures_map):
                        original_index = futures_map[future]
                        try:
                            fitness_scores[original_index] = future.result()
                        except Exception as exc:
                            logging.error(f'Individual {original_index} generated an exception: {exc}')
                            fitness_scores[original_index] = -np.inf  # minimal fitness on failure
                else:
                    # Sequential path (num_workers == 0): evaluate on CPU so
                    # the GPU is not occupied by fitness scoring.
                    logging.debug("Calculating fitness sequentially...")
                    temp_device = torch.device("cpu")
                    for i, (arch, state) in enumerate(population_states):
                        try:
                            model_instance = NeuralNetwork(**arch)
                            model_instance.load_state_dict(state)
                            model_instance.to(temp_device)
                            fitness_scores[i] = calculate_fitness_pytorch(
                                model_instance, X_train_np, y_train_np,
                                temp_device, fitness_params)
                        except Exception as e:
                            logging.error(f"Error calculating fitness for individual {i} sequentially: {e}")
                            fitness_scores[i] = -np.inf
            except Exception as e:
                logging.critical(f"Error during fitness evaluation distribution/collection in Gen {gen+1}: {e}", exc_info=True)
                raise  # unrecoverable for this run

            # 2. Statistics and best-model tracking.
            valid_indices = [i for i, score in enumerate(fitness_scores) if np.isfinite(score)]
            if not valid_indices:
                logging.error(f"Generation {gen+1}: No individuals with finite fitness scores found! Cannot proceed.")
                raise RuntimeError(f"Evolution stopped at generation {gen+1} due to lack of valid individuals.")

            current_best_idx_local = np.argmax([fitness_scores[i] for i in valid_indices])
            current_best_idx_global = valid_indices[current_best_idx_local]
            current_best_fitness = fitness_scores[current_best_idx_global]

            finite_scores = [fitness_scores[i] for i in valid_indices]
            avg_fitness = np.mean(finite_scores)

            best_fitness_history.append(current_best_fitness)
            avg_fitness_history.append(avg_fitness)

            new_best_found = False
            if current_best_fitness > best_fitness_overall:
                best_fitness_overall = current_best_fitness
                new_best_found = True
                try:
                    # Keep a detached clone so later generations can't mutate it.
                    best_model_overall = clone_pytorch_model(population[current_best_idx_global], device)
                    logging.info(f"Generation {gen+1}: *** New overall best fitness: {best_fitness_overall:.6f} (Model: {best_model_overall.model_name}) ***")
                except Exception as e:
                    logging.error(f"Could not clone new best model {population[current_best_idx_global].model_name}: {e}", exc_info=True)
                    best_model_overall = None

            generation_time = time.time() - generation_start_time
            logging.info(f"Generation {gen+1}/{total_generations} | Best Fitness: {current_best_fitness:.6f} | Avg Fitness: {avg_fitness:.6f} | Mut Strength: {current_mutation_strength:.4f} | Time: {generation_time:.2f}s")

            # W&B logging (best effort; failures never stop evolution).
            if wandb_run:
                try:
                    wandb_run.log({
                        "generation": gen + 1,
                        "best_fitness": current_best_fitness,
                        "average_fitness": avg_fitness,
                        "mutation_strength": current_mutation_strength,
                        "generation_time_sec": generation_time,
                        "num_valid_individuals": len(valid_indices),
                    }, step=gen + 1)  # use the generation number as the W&B step
                except Exception as e:
                    logging.warning(f"Failed to log metrics to W&B: {e}")

            # Adaptive mutation strength: decay on improvement, boost after
            # `stagnation_limit` generations without a new best.
            if args.adapt_mutation:
                if new_best_found:
                    stagnation_counter = 0
                    current_mutation_strength = max(args.min_mut_strength, current_mutation_strength * args.mut_strength_decay)
                    logging.debug(f"Improvement found. Decreasing mutation strength to {current_mutation_strength:.4f}")
                else:
                    stagnation_counter += 1
                    logging.debug(f"No improvement. Stagnation counter: {stagnation_counter}")
                    if stagnation_counter >= args.stagnation_limit:
                        current_mutation_strength = min(args.max_mut_strength, current_mutation_strength * args.mut_strength_increase)
                        logging.info(f"Stagnation detected ({stagnation_counter} gens). Increasing mutation strength to {current_mutation_strength:.4f}")
                        stagnation_counter = 0  # reset after the boost

            # 3. Build the next population (elitism, crossover, mutation).
            new_population = []

            # 3a. Elitism: carry clones of the top-k valid individuals.
            if args.elitism_count > 0 and len(population) >= args.elitism_count:
                try:
                    sorted_valid_indices = sorted(valid_indices, key=lambda i: fitness_scores[i], reverse=True)
                    for idx in sorted_valid_indices[:args.elitism_count]:
                        elite_clone = clone_pytorch_model(population[idx], device)
                        elite_clone.model_name = f"elite_{population[idx].model_name}"
                        new_population.append(elite_clone)
                    logging.debug(f"Added {len(new_population)} elites to the next generation.")
                except Exception as e:
                    logging.error(f"Error during elitism: {e}", exc_info=True)

            # 3b. Fill the remainder via selection + crossover / mutation / cloning.
            num_to_generate = pop_size - len(new_population)
            generated_count = 0
            reproduction_attempts = 0
            max_reproduction_attempts = num_to_generate * 5  # generous retry budget

            while generated_count < num_to_generate and reproduction_attempts < max_reproduction_attempts:
                reproduction_attempts += 1
                try:
                    parent1 = tournament_selection(population, fitness_scores, args.tournament_size)
                    parent2 = tournament_selection(population, fitness_scores, args.tournament_size)
                    child1, child2 = None, None

                    if random.random() < crossover_rate and parent1 is not parent2:
                        child1, child2 = crossover_individuals_pytorch(parent1, parent2, device)

                    if child1 is None:  # no crossover performed, or it failed
                        if random.random() < mutation_rate:
                            child1 = mutate_individual_pytorch(parent1, weight_mut_rate, current_mutation_strength, device)
                        else:  # plain clone
                            child1 = clone_pytorch_model(parent1, device)
                            child1.model_name = f"direct_clone_{parent1.model_name}_{random.randint(1000,9999)}"

                    if child1:
                        new_population.append(child1)
                        generated_count += 1
                        if generated_count >= num_to_generate:
                            break
                    if child2:
                        new_population.append(child2)
                        generated_count += 1
                        if generated_count >= num_to_generate:
                            break
                except Exception as e:
                    logging.error(f"Error during selection/reproduction cycle (attempt {reproduction_attempts}): {e}", exc_info=True)

            # Top up with fresh random individuals if reproduction fell short.
            if generated_count < num_to_generate:
                logging.warning(f"Reproduction cycle failed to generate enough individuals. Adding {num_to_generate - generated_count} random individuals.")
                if population:
                    input_s = population[0].input_size
                    output_s = population[0].output_size
                    for _ in range(num_to_generate - generated_count):
                        try:
                            random_ind = create_individual_pytorch(input_s, output_s).to(device)
                            new_population.append(random_ind)
                        except Exception as e:
                            logging.error(f"Failed to create random individual to fill population: {e}")
                else:
                    logging.error("Cannot create random individuals as initial population is unavailable.")

            population = new_population[:pop_size]  # enforce the population size

            # 4. Periodic checkpointing (RNG states + W&B run id included).
            if args.checkpoint_interval > 0 and (gen + 1) % args.checkpoint_interval == 0:
                try:
                    rnd_state = random.getstate()
                    np_rnd_state = np.random.get_state()
                    torch_rnd_state = torch.get_rng_state().cpu()  # store the CPU RNG state
                    wandb_id = wandb_run.id if wandb_run else None
                    save_checkpoint_pytorch(output_dir, gen + 1, population, rnd_state, np_rnd_state, torch_rnd_state, wandb_id)
                except Exception as e:
                    logging.error(f"Failed to execute checkpoint saving for generation {gen+1}: {e}", exc_info=True)
    finally:  # always release the worker pool
        if executor:
            logging.info("Shutting down ProcessPoolExecutor...")
            executor.shutdown(wait=True)  # wait for in-flight jobs
            logging.info("Executor shut down.")

    # Post-evolution: make sure a concrete best model is returned.
    if best_model_overall is None and population:
        logging.warning("Evolution finished, but no single best model was tracked. Selecting best from final population.")
        # Re-evaluate the final population sequentially (the executor is closed).
        final_population_states = [(ind.get_architecture(), ind.state_dict()) for ind in population]
        final_fitness_scores = [-np.inf] * len(population)
        temp_device = torch.device("cpu")
        for i, (arch, state) in enumerate(final_population_states):
            try:
                model_instance = NeuralNetwork(**arch)
                model_instance.load_state_dict(state)
                model_instance.to(temp_device)
                final_fitness_scores[i] = calculate_fitness_pytorch(model_instance, X_train_np, y_train_np, temp_device, fitness_params)
            except Exception:
                final_fitness_scores[i] = -np.inf

        final_valid_indices = [i for i, score in enumerate(final_fitness_scores) if np.isfinite(score)]
        if final_valid_indices:
            best_idx_final = max(final_valid_indices, key=lambda i: final_fitness_scores[i])
            best_model_overall = clone_pytorch_model(population[best_idx_final], device)
            best_fitness_overall = final_fitness_scores[best_idx_final]
            logging.info(f"Selected best model from final population: {best_model_overall.model_name} with fitness {best_fitness_overall:.6f}")
        else:
            logging.error("Evolution finished. No valid finite fitness scores in the final population.")
            return None, best_fitness_history, avg_fitness_history
    elif not population:
        logging.error("Evolution finished with an empty population!")
        return None, best_fitness_history, avg_fitness_history
    else:  # best_model_overall was tracked during the loop
        logging.info(f"Evolution finished. Best fitness achieved: {best_fitness_overall:.6f} by model {best_model_overall.model_name}")

    return best_model_overall, best_fitness_history, avg_fitness_history
|
| 767 |
+
|
| 768 |
+
|
| 769 |
+
# --- Fitness Hesaplama (Seri - Ana Süreç veya Worker=0 için) ---
|
| 770 |
+
# Paralel worker'dan farklı olarak modeli doğrudan alır.
|
| 771 |
+
def calculate_fitness_pytorch(
    individual: NeuralNetwork,
    X_np: np.ndarray, y_np: np.ndarray,  # data is received as NumPy arrays
    device: torch.device,
    fitness_params: Dict
) -> float:
    """Compute an individual's fitness score (serial path, main process).

    Fitness is the inverse MSE on the supplied data (``1 / (mse + 1e-9)``),
    optionally reduced by a penalty proportional to the trainable parameter
    count (``fitness_params['complexity_penalty']``). Returns ``-inf`` on any
    failure or non-finite intermediate value.
    """
    individual.eval()
    individual.to(device)

    # Convert inputs to float tensors on the target device.
    try:
        inputs = torch.from_numpy(X_np).float().to(device)
        targets = torch.from_numpy(y_np).float().to(device)
    except Exception as e:
        logging.error(f"Error converting data to tensor or moving to device in calculate_fitness_pytorch: {e}")
        return -np.inf

    penalty_weight = fitness_params.get('complexity_penalty', 0.0)

    try:
        with torch.no_grad():
            predictions = individual(inputs)
            mse_val = torch.mean((predictions - targets) ** 2).item()

        if not np.isfinite(mse_val):
            logging.warning(f"Non-finite MSE ({mse_val}) for model {individual.model_name} (Serial Calc). Assigning minimal fitness.")
            return -np.inf

        # Inverse MSE; epsilon guards against division by zero.
        fitness_score = 1.0 / (mse_val + 1e-9)

        if penalty_weight > 0:
            trainable = individual.get_num_params(trainable_only=True)
            fitness_score -= penalty_weight * trainable

        if not np.isfinite(fitness_score):
            logging.warning(f"Non-finite final fitness ({fitness_score:.4g}) for model {individual.model_name} (Serial Calc). Assigning minimal fitness.")
            return -np.inf

        return float(fitness_score)
    except Exception as e:
        logging.error(f"Error during serial fitness calculation for model {individual.model_name}: {e}", exc_info=True)
        return -np.inf
|
| 815 |
+
|
| 816 |
+
|
| 817 |
+
# --- Grafik Çizimi ---
|
| 818 |
+
# (plot_fitness_history fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 819 |
+
def plot_fitness_history(history_best: List[float], history_avg: List[float], output_dir: str, filename: str = "fitness_history_pytorch_v5.png") -> None:
    """Plot per-generation best/average fitness curves and save them as a PNG.

    Non-finite entries are skipped. Does nothing (beyond a warning) when
    either history list is empty; plotting errors are logged, not raised.
    """
    if not history_best or not history_avg:
        logging.warning("Fitness history empty, cannot plot.")
        return
    try:
        plt.figure(figsize=(12, 7))
        generations = np.arange(1, len(history_best) + 1)
        # Only plot finite values; non-finite fitness would break the axes.
        finite_best = [i for i, v in enumerate(history_best) if np.isfinite(v)]
        finite_avg = [i for i, v in enumerate(history_avg) if np.isfinite(v)]
        if finite_best:
            plt.plot(generations[finite_best], np.array(history_best)[finite_best],
                     label="Best Fitness", marker='o', linestyle='-', linewidth=2)
        if finite_avg:
            plt.plot(generations[finite_avg], np.array(history_avg)[finite_avg],
                     label="Average Fitness", marker='x', linestyle='--', alpha=0.7)
        plt.xlabel("Generation")
        plt.ylabel("Fitness Score")
        plt.title("Evolutionary Fitness History (PyTorch v5)")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plot_path = os.path.join(output_dir, filename)
        plt.savefig(plot_path)
        plt.close()
        logging.info(f"Fitness history plot saved to {plot_path}")
    except Exception as e:
        logging.error(f"Error plotting fitness history: {e}", exc_info=True)
|
| 832 |
+
|
| 833 |
+
|
| 834 |
+
# --- Değerlendirme (PyTorch v5) ---
|
| 835 |
+
# (evaluate_model_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 836 |
+
# Sadece loglamayı güncelleyebiliriz.
|
| 837 |
+
def evaluate_model_pytorch(
    model: NeuralNetwork,
    X_test_np: np.ndarray, y_test_np: np.ndarray,
    batch_size: int, device: torch.device
) -> Dict[str, float]:
    """Evaluate the final model on the held-out test data (PyTorch v5).

    Computes the batch-averaged test MSE plus the mean Kendall's tau over a
    random sample of up to 500 test rows (per-row rank agreement between
    prediction and target). Returns ``{"test_mse", "avg_kendall_tau"}``;
    any failure yields ``inf`` / ``0.0`` respectively.
    """
    if model is None:
        logging.error("Cannot evaluate a None model.")
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
    logging.info(f"Evaluating final model {model.model_name} on test data (PyTorch v5)...")
    model.eval()
    model.to(device)
    try:
        test_dataset = TensorDataset(torch.from_numpy(X_test_np).float(), torch.from_numpy(y_test_np).float())
        test_loader = DataLoader(test_dataset, batch_size=batch_size)
    except Exception as e:
        logging.error(f"Failed to create PyTorch DataLoader for test data: {e}", exc_info=True)
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}

    pred_batches, target_batches = [], []
    mse_sum = 0.0
    batch_count = 0
    try:
        with torch.no_grad():
            for batch_inputs, batch_targets in test_loader:
                batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)
                batch_outputs = model(batch_inputs)
                mse_sum += torch.mean((batch_outputs - batch_targets) ** 2).item()
                batch_count += 1
                pred_batches.append(batch_outputs.cpu().numpy())
                target_batches.append(batch_targets.cpu().numpy())

        avg_mse = mse_sum / batch_count if batch_count > 0 else np.inf
        logging.info(f"Final Test MSE: {avg_mse:.6f}")

        all_preds_np = np.concatenate(pred_batches, axis=0)
        all_targets_np = np.concatenate(target_batches, axis=0)
        # Kendall's tau on a random subset of rows (capped at 500 for speed).
        sample_size = min(500, all_targets_np.shape[0])
        taus = []
        if sample_size > 0:
            chosen = np.random.choice(all_targets_np.shape[0], sample_size, replace=False)
            for row in chosen:
                try:
                    tau, _ = kendalltau(all_targets_np[row], all_preds_np[row])
                    if not np.isnan(tau):
                        taus.append(tau)
                except ValueError:
                    pass  # degenerate rows (e.g. constant) are skipped
        avg_kendall_tau = np.mean(taus) if taus else 0.0
        logging.info(f"Average Kendall's Tau (on {sample_size} samples): {avg_kendall_tau:.4f}")
        return {"test_mse": float(avg_mse), "avg_kendall_tau": float(avg_kendall_tau)}
    except Exception as e:
        logging.error(f"Error during final model evaluation: {e}", exc_info=True)
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
|
| 883 |
+
|
| 884 |
+
|
| 885 |
+
# --- Son Eğitim (PyTorch v5) ---
|
| 886 |
+
# (train_final_model_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
|
| 887 |
+
# Sadece loglamayı güncelleyebiliriz.
|
| 888 |
+
def train_final_model_pytorch(
    model: NeuralNetwork,
    X_train_np: np.ndarray, y_train_np: np.ndarray,
    epochs: int, batch_size: int, learning_rate: float,
    device: torch.device, output_dir: str,
    wandb_run: Optional[Any]  # active W&B run object, or None
) -> Tuple[NeuralNetwork, Dict[str, Any]]:
    """Fine-tune the best evolved model with Adam + MSE (PyTorch v5).

    Splits the training data 80/20 into train/validation, trains with
    ReduceLROnPlateau scheduling and early stopping (patience 15), restores
    the weights from the best validation epoch, and returns
    ``(model, summary_dict)``. On setup/training failure the original model
    is returned with an ``{"error": ...}`` summary.
    """
    logging.info(f"--- Starting Final Training of Best Evolved Model ({model.model_name}) ---")
    model.to(device)
    try:
        train_dataset = TensorDataset(torch.from_numpy(X_train_np).float(), torch.from_numpy(y_train_np).float())
        val_split = 0.2
        num_train = len(train_dataset)
        split_idx = int(np.floor(val_split * num_train))
        indices = list(range(num_train))
        np.random.shuffle(indices)
        train_indices, val_indices = indices[split_idx:], indices[:split_idx]
        train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
        val_sampler = torch.utils.data.SequentialSampler(val_indices)  # deterministic order for validation
        loader_workers = min(4, os.cpu_count() or 1)  # DataLoader worker processes
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, num_workers=loader_workers)
        val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=val_sampler, num_workers=loader_workers)
        logging.info(f"Created DataLoaders. Train samples: {len(train_indices)}, Val samples: {len(val_indices)}")
    except Exception as e:
        logging.error(f"Failed to create DataLoaders for final training: {e}", exc_info=True)
        return model, {"error": "DataLoader creation failed"}

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    # `verbose` kwarg omitted: it was False anyway and is deprecated/removed
    # in newer torch releases.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=7, min_lr=1e-7)

    early_stopping_patience = 15
    best_val_loss = np.inf
    epochs_no_improve = 0
    best_model_state = None
    training_history = {'train_loss': [], 'val_loss': [], 'lr': []}
    epochs_run = 0
    # Pre-initialize so the summary below is well-defined even when epochs == 0.
    avg_train_loss = 0.0

    try:
        for epoch in range(epochs):
            epochs_run += 1
            model.train()
            running_train_loss = 0.0
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                running_train_loss += loss.item()
            avg_train_loss = running_train_loss / len(train_loader) if len(train_loader) > 0 else 0.0
            training_history['train_loss'].append(avg_train_loss)
            training_history['lr'].append(optimizer.param_groups[0]['lr'])

            # Validation pass (no gradients).
            model.eval()
            running_val_loss = 0.0
            with torch.no_grad():
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    running_val_loss += criterion(model(inputs), targets).item()
            avg_val_loss = running_val_loss / len(val_loader) if len(val_loader) > 0 else np.inf
            training_history['val_loss'].append(avg_val_loss)
            logging.info(f"Epoch [{epoch+1}/{epochs}] Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f} | LR: {optimizer.param_groups[0]['lr']:.2e}")

            # W&B logging (final training phase, best effort).
            if wandb_run:
                try:
                    # BUGFIX: the previous code passed `step=start_generation +
                    # epochs_run`, but `start_generation` is not defined in this
                    # function, so every call raised NameError and was silently
                    # swallowed by the handler below — no final-training metrics
                    # ever reached W&B. Log without an explicit step and let
                    # W&B continue its internal step counter.
                    wandb_run.log({
                        "final_train_epoch": epoch + 1,
                        "final_train_loss": avg_train_loss,
                        "final_val_loss": avg_val_loss,
                        "final_learning_rate": optimizer.param_groups[0]['lr']
                    })
                except Exception as e:
                    logging.warning(f"Failed to log final training metrics to W&B: {e}")

            # LR scheduling + early stopping bookkeeping.
            scheduler.step(avg_val_loss)
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                epochs_no_improve = 0
                best_model_state = copy.deepcopy(model.state_dict())
                logging.debug(f"New best val loss: {best_val_loss:.6f}")
            else:
                epochs_no_improve += 1
            if epochs_no_improve >= early_stopping_patience:
                logging.info(f"Early stopping triggered after {epoch+1} epochs.")
                break

        if best_model_state:
            logging.info(f"Restoring model to best validation performance.")
            model.load_state_dict(best_model_state)
        else:
            logging.warning("No best model state saved during training.")

        logging.info("Final training complete.")
        training_summary = {"epochs_run": epochs_run, "final_train_loss": avg_train_loss,
                            "best_val_loss": best_val_loss, "final_lr": optimizer.param_groups[0]['lr']}
        return model, training_summary

    except Exception as e:
        logging.error(f"Error during final PyTorch model training: {e}", exc_info=True)
        return model, {"error": str(e)}
|
| 971 |
+
|
| 972 |
+
|
| 973 |
+
# --- Main Pipeline (PyTorch v5) ---
def run_pipeline_pytorch_v5(args: argparse.Namespace):
    """Checkpointed, adaptive, parallel PyTorch-v5 main pipeline.

    Orchestrates the full run: device/output setup, checkpoint resume,
    optional Weights & Biases tracking, data generation, population
    initialization, evolution, final training, evaluation, and result
    serialization.

    Args:
        args: Parsed CLI namespace from parse_arguments_v5().
    """
    wandb_run = None   # W&B run handle (None when tracking is disabled or init failed)
    output_dir = None  # defined up front so error paths can reference it

    try:  # outer try so W&B can be finished cleanly on any exit path
        device = setup_device(args.device)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        run_name = f"evorun_pt_v5_{timestamp}_gen{args.generations}_pop{args.pop_size}"
        output_dir = args.resume_from if args.resume_from else os.path.join(args.output_base_dir, run_name)
        resume_run = bool(args.resume_from)
        resumed_wandb_id = None

        if resume_run:
            # Reuse the resumed directory's name as the run name.
            run_name = os.path.basename(output_dir)
            logging.info(f"Attempting to resume PyTorch v5 run from: {output_dir}")
            if not os.path.isdir(output_dir):
                logging.error(f"Resume directory not found: {output_dir}. Exiting.")
                sys.exit(1)
        else:
            try:
                os.makedirs(output_dir, exist_ok=True)
            except OSError as e:
                print(f"FATAL: Could not create output dir: {output_dir}. Error: {e}", file=sys.stderr)
                sys.exit(1)

        setup_logging(output_dir)
        logging.info(f"========== Starting/Resuming EvoNet v5 PyTorch Pipeline: {run_name} ==========")
        logging.info(f"Output directory: {output_dir}")
        logging.info(f"Using device: {device}")

        # --- Checkpoint loading ---
        start_generation = 0
        population = []
        initial_state_loaded = False
        loaded_history_best = []
        loaded_history_avg = []
        latest_checkpoint_path = find_latest_checkpoint_pytorch(output_dir) if resume_run else None

        if latest_checkpoint_path:
            loaded_state = load_checkpoint_pytorch(latest_checkpoint_path, device)
            if loaded_state:
                start_generation = loaded_state['generation']
                population = loaded_state['population']
                resumed_wandb_id = loaded_state.get("wandb_run_id")  # previous W&B run id, if any
                try:  # restore RNG states (best effort; a partial failure is non-fatal)
                    random.setstate(loaded_state['random_state'])
                    np.random.set_state(loaded_state['numpy_random_state'])
                    torch.set_rng_state(loaded_state['torch_random_state'].cpu())
                    logging.info(f"Random states restored from checkpoint (Generation {start_generation}).")
                except Exception as e:
                    logging.warning(f"Could not fully restore random states: {e}")
                initial_state_loaded = True
                logging.info(f"Resuming from Generation {start_generation + 1} with {len(population)} individuals.")
                if resumed_wandb_id:
                    logging.info(f"Found previous W&B run ID in checkpoint: {resumed_wandb_id}")
            else:
                logging.error("Failed to load checkpoint. Starting from scratch.")
                resume_run = False
        elif resume_run:
            logging.warning(f"Resume requested but no valid v5 checkpoint found. Starting from scratch.")
            resume_run = False

        # --- W&B initialization (only when requested and the library is importable) ---
        if args.use_wandb and _WANDB_AVAILABLE:
            try:
                wandb_kwargs = {
                    "project": args.wandb_project,
                    "entity": args.wandb_entity,
                    "name": run_name,
                    "config": vars(args),     # record CLI configuration
                    "dir": output_dir,        # keep W&B files inside the run directory
                    "resume": "allow",        # allow resuming an earlier run
                    "id": resumed_wandb_id,   # reuse the previous run id when present
                }
                # Drop entity entirely when unset so W&B falls back to the default.
                if not wandb_kwargs["entity"]:
                    del wandb_kwargs["entity"]

                wandb_run = wandb.init(**wandb_kwargs)
                logging.info(f"Weights & Biases initialized. Run ID: {wandb_run.id if wandb_run else 'N/A'}")
                # Log the run URL when this is a new run or the id changed.
                if wandb_run and (not resume_run or wandb_run.id != resumed_wandb_id):
                    logging.info(f"Logging to W&B run: {wandb_run.get_url()}" if wandb_run else "W&B run URL not available.")
            except Exception as e:
                logging.error(f"Failed to initialize Weights & Biases: {e}", exc_info=True)
                wandb_run = None  # continue without tracking

        # --- Configuration save / reload logging ---
        config_path = os.path.join(output_dir, "config_pytorch_v5.json")
        args_dict = vars(args)
        if not initial_state_loaded or not os.path.exists(config_path):
            logging.info("--- Configuration ---")
            for k, v in args_dict.items():
                logging.info(f" {k:<25}: {v}")
            logging.info("---------------------")
            try:
                args_to_save = args_dict.copy()
                args_to_save['device'] = str(device)  # Namespace holds a string already; keep serializable
                with open(config_path, 'w') as f:
                    json.dump(args_to_save, f, indent=4, sort_keys=True)
                logging.info(f"Configuration saved to {config_path}")
            except Exception as e:
                logging.error(f"Failed to save configuration: {e}", exc_info=True)
        else:  # resumed run: just echo the stored configuration
            try:
                with open(config_path, 'r') as f:
                    loaded_args_dict = json.load(f)
                logging.info("--- Loaded Configuration (from resumed run) ---")
                for k, v in loaded_args_dict.items():
                    logging.info(f" {k:<25}: {v}")
                logging.info("-----------------------------------------------")
            except Exception as e:
                logging.warning(f"Could not reload config.json: {e}")

        # --- Random seeding (fresh runs only; resumed runs restored RNG state above) ---
        if not initial_state_loaded:
            try:
                seed = args.seed
                random.seed(seed)
                np.random.seed(seed)
                torch.manual_seed(seed)
                if device.type == 'cuda':
                    torch.cuda.manual_seed_all(seed)
                logging.info(f"Using random seed: {seed}")
            except Exception as e:
                logging.warning(f"Could not set all random seeds: {e}")

        # --- Data generation (always regenerated; seeding makes it reproducible) ---
        try:
            logging.info("Generating/Reloading data...")
            X_train_np, y_train_np = generate_data(args.train_samples, args.seq_length)
            X_test_np, y_test_np = generate_data(args.test_samples, args.seq_length)
            input_shape = X_train_np.shape[1]
            output_shape = y_train_np.shape[1]
        except Exception:
            logging.critical("Failed to generate/reload data. Exiting.")
            sys.exit(1)

        # --- Population initialization (fresh runs only) ---
        if not initial_state_loaded:
            logging.info(f"--- Initializing Population (Size: {args.pop_size}) ---")
            try:
                population = [create_individual_pytorch(input_shape, output_shape).to(device)
                              for _ in range(args.pop_size)]
                logging.info("Population initialized successfully.")
            except Exception:
                logging.critical("Failed to initialize population. Exiting.")
                sys.exit(1)

        # --- Evolution ---
        logging.info(f"--- Starting/Resuming PyTorch v5 Evolution ({args.generations} Total Generations) ---")
        best_model_evolved: Optional[NeuralNetwork] = None
        # BUGFIX: copy the loaded histories instead of aliasing them. The original
        # bound best_fitness_hist to the same list object as loaded_history_best,
        # so extend() below grew both and "generations_run_this_session" was always 0.
        best_fitness_hist = list(loaded_history_best)
        avg_fitness_hist = list(loaded_history_avg)

        if start_generation >= args.generations:
            logging.warning(f"Loaded checkpoint gen ({start_generation}) >= total gens ({args.generations}). Skipping evolution.")
            if population:
                # Pick the best individual from the loaded population by recomputing fitness.
                try:
                    logging.info("Selecting best model from loaded population as evolution is skipped...")
                    temp_device = torch.device("cpu")
                    fitness_scores_loaded = [
                        calculate_fitness_pytorch(ind, X_train_np, y_train_np, temp_device,
                                                  {'complexity_penalty': args.complexity_penalty})
                        for ind in population
                    ]
                    valid_scores_loaded = [(s, i) for i, s in enumerate(fitness_scores_loaded) if np.isfinite(s)]
                    if valid_scores_loaded:
                        best_idx_loaded = max(valid_scores_loaded, key=lambda item: item[0])[1]
                        best_model_evolved = clone_pytorch_model(population[best_idx_loaded], device)
                        logging.info(f"Using model {best_model_evolved.model_name} from loaded population.")
                    else:
                        logging.warning("Could not determine best model from loaded population.")
                        best_model_evolved = None
                except Exception as e:
                    logging.error(f"Error selecting best model from loaded population: {e}")
                    best_model_evolved = None
            else:
                best_model_evolved = None
        else:
            try:
                best_model_evolved, gen_best_hist, gen_avg_hist = evolve_population_pytorch_v5(
                    population, X_train_np, y_train_np, start_generation, args.generations,
                    args.crossover_rate, args.mutation_rate, args.weight_mut_rate,
                    args,  # pass the full namespace (adaptive-mutation / parallelism knobs)
                    output_dir, device, wandb_run
                )
                best_fitness_hist.extend(gen_best_hist)
                avg_fitness_hist.extend(gen_avg_hist)
            except Exception as e:
                logging.critical(f"Fatal error during PyTorch v5 evolution process: {e}", exc_info=True)
                raise  # re-raise so the outer handler marks the run as failed

        logging.info("--- PyTorch v5 Evolution Complete ---")

        # --- Fitness history: plot, CSV, optional W&B table ---
        if best_fitness_hist or avg_fitness_hist:
            plot_fitness_history(best_fitness_hist, avg_fitness_hist, output_dir)
            history_path = os.path.join(output_dir, "fitness_history_pytorch_v5.csv")
            try:
                history_data = np.array([np.arange(1, len(best_fitness_hist) + 1),
                                         best_fitness_hist, avg_fitness_hist]).T
                np.savetxt(history_path, history_data, delimiter=',',
                           header='Generation,BestFitness,AvgFitness', comments='',
                           fmt=['%d', '%.8f', '%.8f'])
                logging.info(f"Full fitness history saved to {history_path}")
                if wandb_run:
                    try:
                        table = wandb.Table(data=history_data, columns=["Generation", "BestFitness", "AvgFitness"])
                        wandb_run.log({"fitness_history_table": table})
                    except Exception as e:
                        logging.warning(f"Failed to log fitness history table to W&B: {e}")
            except Exception as e:
                logging.error(f"Could not save fitness history data: {e}")
        else:
            logging.warning("Fitness history empty, skipping saving/plotting.")

        # --- Final training, evaluation, and model persistence ---
        final_model_path = None
        training_summary = {}
        final_metrics = {"test_mse": np.inf, "avg_kendall_tau": 0.0}
        best_model_architecture = {}
        if best_model_evolved is None:
            logging.error("Evolution did not yield a best model. Skipping final training and evaluation.")
        else:
            best_model_architecture = best_model_evolved.get_architecture()
            logging.info(f"Best evolved model architecture: {best_model_architecture}")
            try:
                num_params = best_model_evolved.get_num_params()
                logging.info(f"Best Evolved Model ({best_model_evolved.model_name}) - Params: {num_params}")
                if wandb_run:
                    wandb_run.summary["best_evolved_params"] = num_params
            except Exception as e:
                logging.warning(f"Could not log model summary details: {e}")

            # Final gradient-based training on a clone of the evolved winner.
            try:
                model_to_train = clone_pytorch_model(best_model_evolved, device)
                final_model, training_summary = train_final_model_pytorch(
                    model_to_train, X_train_np, y_train_np,
                    args.epochs_final_train, args.batch_size, args.learning_rate,
                    device, output_dir, wandb_run
                )
            except Exception as e:
                logging.error(f"Error during final training: {e}", exc_info=True)
                final_model = None
                training_summary = {"error": str(e)}

            if final_model:
                final_metrics = evaluate_model_pytorch(final_model, X_test_np, y_test_np, args.batch_size, device)
                if wandb_run:
                    wandb_run.summary.update(final_metrics)

                final_model_path = os.path.join(output_dir, "best_evolved_model_trained_pytorch_v5.pt")
                try:
                    torch.save({'architecture': final_model.get_architecture(),
                                'model_state_dict': final_model.state_dict(),
                                'training_summary': training_summary,
                                'evaluation_metrics': final_metrics}, final_model_path)
                    logging.info(f"Final trained model state and architecture saved to {final_model_path}")
                    # Optionally register the model file as a W&B artifact.
                    if wandb_run:
                        try:
                            artifact = wandb.Artifact(f'final_model_{run_name}', type='model')
                            artifact.add_file(final_model_path)
                            wandb_run.log_artifact(artifact)
                            logging.info(f"Saved final model as W&B artifact.")
                        except Exception as e:
                            logging.warning(f"Failed to save model as W&B artifact: {e}")
                except Exception as e:
                    logging.error(f"Failed to save final trained model: {e}", exc_info=True)
                    final_model_path = None
            else:
                logging.error("Final model training failed. Skipping evaluation and saving.")

        # --- Final results JSON ---
        logging.info("--- Saving Final Results (v5) ---")
        final_results = {
            "run_info": {"run_name": run_name, "timestamp": timestamp, "output_directory": output_dir,
                         "framework": "PyTorch", "version": "v5", "device_used": str(device),
                         "resumed_run": resume_run, "last_checkpoint": latest_checkpoint_path,
                         "wandb_url": wandb_run.get_url() if wandb_run else None},
            "config": args_dict,
            "evolution_summary": {
                "start_generation": start_generation,
                "end_generation": start_generation + len(best_fitness_hist),
                "generations_run_this_session": len(best_fitness_hist) - len(loaded_history_best),
                "best_fitness_overall": max(best_fitness_hist) if best_fitness_hist and any(np.isfinite(f) for f in best_fitness_hist) else None,
                "best_fitness_final_gen": best_fitness_hist[-1] if best_fitness_hist and np.isfinite(best_fitness_hist[-1]) else None,
                "avg_fitness_final_gen": avg_fitness_hist[-1] if avg_fitness_hist and np.isfinite(avg_fitness_hist[-1]) else None,
                "best_model_architecture": best_model_architecture,
                "best_model_params": best_model_evolved.get_num_params() if best_model_evolved else None
            },
            "final_training_summary": training_summary,
            "final_evaluation_on_test": final_metrics,
            "saved_trained_model_path": final_model_path
        }
        results_path = os.path.join(output_dir, "final_results_pytorch_v5.json")
        try:
            def convert_types(obj):
                """JSON fallback serializer for numpy/torch/argparse objects.

                BUGFIX: the original checked np.int_/np.float_, which were removed
                in NumPy 2.0; np.integer/np.floating already cover those instances.
                """
                if isinstance(obj, np.integer):
                    return int(obj)
                elif isinstance(obj, np.floating):
                    return float(obj)
                elif isinstance(obj, np.ndarray):
                    return obj.tolist()
                elif isinstance(obj, torch.Tensor):
                    return obj.tolist()
                elif isinstance(obj, torch.device):
                    return str(obj)
                elif isinstance(obj, type):
                    return obj.__name__
                elif isinstance(obj, argparse.Namespace):
                    return vars(obj)
                return obj

            with open(results_path, 'w') as f:
                json.dump(final_results, f, indent=4, default=convert_types, sort_keys=True)
            logging.info(f"Final results summary saved to {results_path}")
        except Exception as e:
            logging.error(f"Failed to save final results JSON: {e}", exc_info=True)

    except (Exception, KeyboardInterrupt) as e:
        # On error or interrupt: log, mark the W&B run accordingly, then exit.
        if isinstance(e, KeyboardInterrupt):
            logging.warning("KeyboardInterrupt detected. Exiting.")
        else:
            logging.critical("Unhandled exception in pipeline:", exc_info=True)
        if wandb_run:
            exit_code = 1 if not isinstance(e, KeyboardInterrupt) else 130
            try:
                wandb.finish(exit_code=exit_code, quiet=True)
                logging.info(f"W&B run marked as {'failed' if exit_code==1 else 'killed'}.")
            except Exception as wb_e:
                logging.error(f"Error finishing W&B run: {wb_e}")
        # 130 is the conventional exit code for SIGINT.
        if isinstance(e, KeyboardInterrupt):
            sys.exit(130)
        else:
            sys.exit(1)

    finally:
        # Finish the W&B run normally only when no exception is propagating.
        if wandb_run and not sys.exc_info()[0]:
            try:
                wandb.finish()
                logging.info("W&B run finished successfully.")
            except Exception as e:
                logging.error(f"Error finishing W&B run: {e}")

        logging.info(f"========== PyTorch v5 Pipeline Run {run_name} Finished ==========")
|
| 1262 |
+
|
| 1263 |
+
|
| 1264 |
+
# --- Argument Parser (v5) ---
def parse_arguments_v5() -> argparse.Namespace:
    """Build the v5 CLI, parse sys.argv, and apply post-parse fixups.

    Returns:
        The parsed argparse.Namespace (seed always populated, num_workers >= 0).
    """
    parser = argparse.ArgumentParser(description="EvoNet v5: Adaptive & Parallel Neuroevolution with PyTorch")

    # Directories and run control
    parser.add_argument('--output_base_dir', type=str, default=DEFAULT_OUTPUT_BASE_DIR)
    parser.add_argument('--resume_from', type=str, default=None, help='Path to previous run dir to resume.')
    parser.add_argument('--checkpoint_interval', type=int, default=DEFAULT_CHECKPOINT_INTERVAL, help='Checkpoint frequency (gens). 0=disable.')
    parser.add_argument('--device', type=str, default=DEFAULT_DEVICE, choices=['auto', 'cpu', 'cuda'])
    parser.add_argument('--seed', type=int, default=None, help='Random seed (default: random).')

    # Synthetic data sizing
    parser.add_argument('--seq_length', type=int, default=DEFAULT_SEQ_LENGTH)
    parser.add_argument('--train_samples', type=int, default=5000)
    parser.add_argument('--test_samples', type=int, default=1000)

    # Core evolutionary-algorithm knobs
    evolution = parser.add_argument_group('Evolution Parameters')
    evolution.add_argument('--pop_size', type=int, default=DEFAULT_POP_SIZE)
    evolution.add_argument('--generations', type=int, default=DEFAULT_GENERATIONS)
    evolution.add_argument('--crossover_rate', type=float, default=DEFAULT_CROSSOVER_RATE)
    evolution.add_argument('--mutation_rate', type=float, default=DEFAULT_MUTATION_RATE, help='Prob. of mutation if crossover is not applied.')
    evolution.add_argument('--weight_mut_rate', type=float, default=DEFAULT_WEIGHT_MUT_RATE, help='Prob. for each weight to mutate if mutation occurs.')
    evolution.add_argument('--tournament_size', type=int, default=DEFAULT_TOURNAMENT_SIZE)
    evolution.add_argument('--elitism_count', type=int, default=DEFAULT_ELITISM_COUNT)
    evolution.add_argument('--complexity_penalty', type=float, default=DEFAULT_COMPLEXITY_PENALTY, help='Penalty weight per parameter in fitness.')

    # Adaptive mutation-strength schedule
    adaptive = parser.add_argument_group('Adaptive Mutation')
    adaptive.add_argument('--adapt_mutation', action=argparse.BooleanOptionalAction, default=DEFAULT_ADAPT_MUTATION, help='Enable adaptive mutation strength.')
    adaptive.add_argument('--mutation_strength', type=float, default=DEFAULT_MUTATION_STRENGTH, help='Initial mutation strength (std dev).')
    adaptive.add_argument('--stagnation_limit', type=int, default=DEFAULT_STAGNATION_LIMIT, help='Generations without improvement to trigger adaptation.')
    adaptive.add_argument('--mut_strength_decay', type=float, default=DEFAULT_MUT_STRENGTH_DECAY, help='Factor to decrease strength on improvement.')
    adaptive.add_argument('--mut_strength_increase', type=float, default=DEFAULT_MUT_STRENGTH_INCREASE, help='Factor to increase strength on stagnation.')
    adaptive.add_argument('--min_mut_strength', type=float, default=DEFAULT_MIN_MUT_STRENGTH)
    adaptive.add_argument('--max_mut_strength', type=float, default=DEFAULT_MAX_MUT_STRENGTH)

    # Parallel fitness evaluation
    parallelism = parser.add_argument_group('Parallelism')
    parallelism.add_argument('--num_workers', type=int, default=DEFAULT_NUM_WORKERS, help='Number of CPU workers for parallel fitness evaluation (0=disable/serial).')

    # Final gradient-based training of the evolved winner
    training = parser.add_argument_group('Final Training & Evaluation')
    training.add_argument('--batch_size', type=int, default=DEFAULT_BATCH_SIZE)
    training.add_argument('--epochs_final_train', type=int, default=DEFAULT_EPOCHS_FINAL_TRAIN)
    training.add_argument('--learning_rate', type=float, default=0.001, help='LR for final training.')

    # Experiment tracking
    tracking = parser.add_argument_group('Experiment Tracking (Weights & Biases)')
    tracking.add_argument('--use_wandb', action=argparse.BooleanOptionalAction, default=False, help='Enable W&B logging.')
    tracking.add_argument('--wandb_project', type=str, default="EvoNet-v5", help='W&B project name.')
    tracking.add_argument('--wandb_entity', type=str, default=None, help='W&B entity (username or team). Uses default if None.')

    args = parser.parse_args()

    # Post-parse fixups: draw a concrete seed when none was given, and
    # clamp a negative worker count to serial evaluation.
    if args.seed is None:
        args.seed = random.randint(0, 2**32 - 1)
        print(f"Generated random seed: {args.seed}")
    if args.num_workers < 0:
        print(f"Warning: num_workers ({args.num_workers}) cannot be negative. Setting to 0.")
        args.num_workers = 0
    # The v4 sanity checks (elitism, tournament size) could also be applied here.

    return args
|
| 1323 |
+
|
| 1324 |
+
# --- Entry Point ---
if __name__ == "__main__":
    # NOTE: keeping the launch code under the __main__ guard is generally
    # required for concurrent.futures (ProcessPoolExecutor) / multiprocessing
    # to spawn workers correctly.
    parsed_args = parse_arguments_v5()
    run_pipeline_pytorch_v5(parsed_args)