NextGenC committed on
Commit
6b009f6
·
verified ·
1 Parent(s): cbd90c0

Upload 5 files

Browse files
Files changed (5) hide show
  1. evonet_optimizer.py +500 -0
  2. v2.py +643 -0
  3. v3.py +784 -0
  4. v4.py +1327 -0
  5. v5.py +1330 -0
evonet_optimizer.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import sys
4
+ import argparse
5
+ import random
6
+ import logging
7
+ from datetime import datetime
8
+ import json
9
+ from typing import List, Tuple, Dict, Any
10
+
11
+ import numpy as np
12
+ import tensorflow as tf
13
+ from tensorflow.keras.models import Sequential, load_model, clone_model
14
+ from tensorflow.keras.layers import Dense, Input
15
+ from tensorflow.keras.optimizers import Adam
16
+ from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
17
+ import matplotlib.pyplot as plt
18
+ from scipy.stats import kendalltau
19
+
20
# --- Constants ---
# Default hyperparameters for the neuroevolution run; every value here is
# overridable via the CLI flags declared in parse_arguments().
DEFAULT_SEQ_LENGTH = 10
DEFAULT_POP_SIZE = 50
DEFAULT_GENERATIONS = 50
DEFAULT_MUTATION_RATE = 0.4  # Probability of applying any mutation to an individual
DEFAULT_WEIGHT_MUT_RATE = 0.8  # If mutation occurs, probability of weight perturbation
DEFAULT_ACTIVATION_MUT_RATE = 0.2  # If mutation occurs, probability of activation change
DEFAULT_MUTATION_STRENGTH = 0.1  # Magnitude (std dev) of weight perturbation
DEFAULT_TOURNAMENT_SIZE = 5
DEFAULT_ELITISM_COUNT = 2  # Keep top N individuals directly
DEFAULT_EPOCHS_FINAL_TRAIN = 100
DEFAULT_BATCH_SIZE = 64
33
# --- Logging Setup ---
def setup_logging(log_dir: str, log_level=logging.INFO) -> None:
    """Configures logging to file and console.

    Args:
        log_dir: Directory in which the 'evolution.log' file is created.
        log_level: Minimum level emitted (default: logging.INFO).
    """
    log_filename = os.path.join(log_dir, 'evolution.log')
    # FIX: clear any handlers left over from a previous call in the same
    # process (e.g. notebooks / repeated runs) — otherwise basicConfig()
    # silently does nothing and logs go to the old destination.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_filename),
            logging.StreamHandler(sys.stdout)  # Also print to console
        ]
    )
45
+
46
# --- GPU Check ---
def check_gpu() -> bool:
    """Report whether a GPU is usable, enabling memory growth on each device.

    Returns True when at least one physical GPU is visible and memory growth
    was configured successfully; False otherwise (CPU fallback).
    """
    physical_gpus = tf.config.list_physical_devices('GPU')
    if not physical_gpus:
        logging.warning("GPU not found. Using CPU.")
        return False
    try:
        # Memory growth must be configured identically on every GPU and
        # before any GPU has been initialized.
        for device in physical_gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        logging.info(f"{len(physical_gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs found.")
        logging.info(f"Using GPU: {physical_gpus[0].name}")
        return True
    except RuntimeError as e:
        logging.error(f"Error setting memory growth: {e}")
        return False
66
+
67
# --- Data Generation ---
def generate_data(num_samples: int, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Generates random sequences and their sorted versions.

    Args:
        num_samples: Number of rows (sequences) to generate.
        seq_length: Length of each sequence.

    Returns:
        (X, y): X is uniform noise in [0, 100), y = sort(X) row-wise.
        Both are float32, matching what the TensorFlow models consume
        (avoids a float64 -> float32 cast on every fitness evaluation).
    """
    logging.info(f"Generating {num_samples} samples with sequence length {seq_length}...")
    X = (np.random.rand(num_samples, seq_length) * 100).astype(np.float32)
    y = np.sort(X, axis=1)
    logging.info("Data generation complete.")
    return X, y
75
+
76
# --- Neuroevolution Core ---
def create_individual(seq_length: int) -> Sequential:
    """Builds and compiles a Sequential model with a randomly drawn topology.

    The network has 1-4 hidden Dense layers of 8-64 units each with a random
    activation, plus a linear output layer of `seq_length` units (regression).
    """
    model = Sequential(name=f"model_random_{random.randint(1000, 9999)}")
    # Draw the full topology first (same RNG order as before: depth, then
    # widths, then activations).
    n_hidden = random.randint(1, 4)
    layer_widths = [random.randint(8, 64) for _ in range(n_hidden)]
    layer_acts = [random.choice(['relu', 'tanh', 'sigmoid']) for _ in range(n_hidden)]

    # Input layer
    model.add(Input(shape=(seq_length,)))

    # Hidden layers
    for width, act in zip(layer_widths, layer_acts):
        model.add(Dense(width, activation=act))

    # Output layer - linear, one unit per sequence position
    model.add(Dense(seq_length, activation='linear'))

    # Compile immediately so weights can be read/mutated and the model trained.
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model
98
+
99
@tf.function  # Compiles the forward pass to a TF graph, potentially speeding up prediction
def get_predictions(model: Sequential, X: np.ndarray, batch_size: int) -> tf.Tensor:
    """Gets model predictions in inference mode via tf.function.

    NOTE(review): `batch_size` is accepted but unused here — the whole batch
    goes through a single forward pass; confirm that is intended for large X.
    """
    return model(X, training=False)  # Use __call__ inside tf.function
103
+
104
def calculate_fitness(individual: Sequential, X: np.ndarray, y: np.ndarray, batch_size: int) -> float:
    """Scores an individual as 1 / (MSE + eps) on (X, y); larger is fitter.

    Any numerical problem (non-finite fitness) or exception is logged and
    mapped to a minimal fitness of 1e-8 so evolution can continue.
    """
    try:
        # TensorFlow expects float32 inputs.
        inputs = tf.cast(X, tf.float32)
        targets = tf.cast(y, tf.float32)

        # Graph-compiled forward pass (see get_predictions).
        outputs = get_predictions(individual, inputs, batch_size)

        # MSE via TF ops so it can run on the GPU; pull back a numpy scalar.
        mse_value = tf.reduce_mean(tf.square(targets - outputs)).numpy()

        # Inverse MSE; the epsilon guards against division by zero.
        score = 1.0 / (mse_value + 1e-8)

        if not np.isfinite(score):
            logging.warning(f"Non-finite fitness detected ({score}) for model {individual.name}. Assigning low fitness.")
            return 1e-8

        return float(score)

    except Exception as e:
        logging.error(f"Error during fitness calculation for model {individual.name}: {e}", exc_info=True)
        return 1e-8
131
+
132
+
133
def mutate_individual(individual: Sequential, weight_mut_rate: float, act_mut_rate: float, mut_strength: float) -> Sequential:
    """Returns a mutated clone of `individual`; the original is never modified.

    Two independent operators:
      1. Weight perturbation (prob. `weight_mut_rate`): Gaussian noise with
         std `mut_strength` added to every Dense layer's weights and biases.
      2. Activation change (prob. `act_mut_rate`): one random hidden Dense
         layer gets a different activation from {relu, tanh, sigmoid}.
    """
    mutated_model = clone_model(individual)
    mutated_model.set_weights(individual.get_weights())  # Crucial: copy weights

    mutated = False

    # 1. Weight Mutation
    if random.random() < weight_mut_rate:
        mutated = True
        for layer in mutated_model.layers:
            if isinstance(layer, Dense):
                perturbed = [wb + np.random.normal(0, mut_strength, wb.shape)
                             for wb in layer.get_weights()]
                if perturbed:  # Ensure the layer actually had weights
                    layer.set_weights(perturbed)

    # 2. Activation Mutation (applied independently)
    if random.random() < act_mut_rate:
        dense_layers = [layer for layer in mutated_model.layers if isinstance(layer, Dense)]
        if len(dense_layers) > 1:  # Need at least one hidden layer besides the output
            mutated = True
            layer_to_mutate = random.choice(dense_layers[:-1])  # Never the output layer
            current_activation = layer_to_mutate.get_config().get('activation', 'linear')
            possible_activations = ['relu', 'tanh', 'sigmoid']
            if current_activation in possible_activations:
                possible_activations.remove(current_activation)
            new_activation = random.choice(possible_activations)

            # Rebuild from config: safer than mutating a layer's activation in place.
            config = mutated_model.get_config()
            for layer_config in config['layers']:
                if layer_config['config']['name'] == layer_to_mutate.name:
                    layer_config['config']['activation'] = new_activation
                    break  # Found the layer

            try:
                rebuilt = Sequential.from_config(config)
                # BUG FIX: Sequential.from_config() re-initializes weights
                # randomly, which silently discarded the (possibly already
                # weight-mutated) weights. Carry them over explicitly; shapes
                # are unchanged because only an activation string was edited.
                rebuilt.set_weights(mutated_model.get_weights())
                rebuilt.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
                mutated_model = rebuilt
            except Exception as e:
                logging.error(f"Error rebuilding model after activation mutation for {mutated_model.name}: {e}")
                # Keep the pre-rebuild model if reconstruction fails.

    # Fresh optimizer state and a distinguishing name for any mutated child.
    if mutated:
        mutated_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        mutated_model._name = f"mutated_{individual.name}"

    return mutated_model
192
+
193
+
194
def tournament_selection(population: List[Sequential], fitness_scores: List[float], k: int) -> Sequential:
    """Selects the best individual from a randomly chosen tournament group.

    Args:
        population: Candidate individuals.
        fitness_scores: Fitness value per individual (parallel to population).
        k: Tournament size; clamped to len(population) so a small population
           cannot crash random.sample().

    Returns:
        The sampled individual with the highest fitness.
    """
    # FIX: random.sample raises ValueError when k exceeds the population size.
    k = min(k, len(population))
    tournament_indices = random.sample(range(len(population)), k)
    tournament_fitness = [fitness_scores[i] for i in tournament_indices]
    winner_index_in_tournament = np.argmax(tournament_fitness)
    winner_original_index = tournament_indices[winner_index_in_tournament]
    return population[winner_original_index]
201
+
202
def evolve_population(population: List[Sequential], X: np.ndarray, y: np.ndarray, generations: int,
                      mutation_rate: float, weight_mut_rate: float, act_mut_rate: float, mut_strength: float,
                      tournament_size: int, elitism_count: int, batch_size: int) -> Tuple[Sequential, List[float], List[float]]:
    """Runs the evolutionary process.

    Each generation: evaluate every individual's fitness, carry the
    `elitism_count` fittest over as clones, then fill the rest of the next
    generation with tournament winners that are mutated with probability
    `mutation_rate`.

    Returns:
        (best model seen across all generations,
         per-generation best-fitness history,
         per-generation average-fitness history)
    """
    best_fitness_history = []
    avg_fitness_history = []
    best_model_overall = None
    best_fitness_overall = -1.0  # Sentinel below any real fitness (fitness > 0)

    for gen in range(generations):
        # 1. Evaluate Fitness
        fitness_scores = [calculate_fitness(ind, X, y, batch_size) for ind in population]

        # Track overall best
        current_best_idx = np.argmax(fitness_scores)
        current_best_fitness = fitness_scores[current_best_idx]
        if current_best_fitness > best_fitness_overall:
            best_fitness_overall = current_best_fitness
            # Keep a copy of the best model structure and weights
            best_model_overall = clone_model(population[current_best_idx])
            best_model_overall.set_weights(population[current_best_idx].get_weights())
            best_model_overall.compile(optimizer=Adam(), loss='mse')  # Re-compile just in case
            logging.info(f"Generation {gen+1}: New overall best fitness: {best_fitness_overall:.4f}")

        avg_fitness = np.mean(fitness_scores)
        best_fitness_history.append(current_best_fitness)
        avg_fitness_history.append(avg_fitness)

        logging.info(f"Generation {gen+1}/{generations} - Best Fitness: {current_best_fitness:.4f}, Avg Fitness: {avg_fitness:.4f}")

        new_population = []

        # 2. Elitism: Carry over the best individuals
        if elitism_count > 0:
            elite_indices = np.argsort(fitness_scores)[-elitism_count:]
            for idx in elite_indices:
                # Clone elite models to avoid modifications affecting originals if selected again
                elite_clone = clone_model(population[idx])
                elite_clone.set_weights(population[idx].get_weights())
                elite_clone.compile(optimizer=Adam(), loss='mse')  # Ensure compiled
                new_population.append(elite_clone)

        # 3. Selection & Reproduction for the rest of the population
        while len(new_population) < len(population):
            # Select parent(s) using tournament selection
            parent = tournament_selection(population, fitness_scores, tournament_size)

            # Create child through mutation (crossover could be added here)
            child = parent  # Start with the parent
            if random.random() < mutation_rate:
                # Clone parent before mutation to avoid modifying the original selected parent
                parent_clone = clone_model(parent)
                parent_clone.set_weights(parent.get_weights())
                parent_clone.compile(optimizer=Adam(), loss='mse')  # Ensure compiled
                child = mutate_individual(parent_clone, weight_mut_rate, act_mut_rate, mut_strength)
            else:
                # If no mutation, still clone the parent to ensure new population has distinct objects
                child = clone_model(parent)
                child.set_weights(parent.get_weights())
                child.compile(optimizer=Adam(), loss='mse')  # Ensure compiled

            new_population.append(child)

        population = new_population[:len(population)]  # Ensure population size is maintained

    if best_model_overall is None:  # Handle case where no improvement was ever found
        best_idx = np.argmax([calculate_fitness(ind, X, y, batch_size) for ind in population])
        best_model_overall = population[best_idx]

    return best_model_overall, best_fitness_history, avg_fitness_history
275
+
276
+
277
# --- Plotting ---
def plot_fitness_history(history_best: List[float], history_avg: List[float], output_dir: str) -> None:
    """Plots best/average fitness per generation and saves the figure as
    'fitness_history.png' inside `output_dir`."""
    plt.figure(figsize=(12, 6))
    # Both series on one axis; same styling as before, expressed as data.
    series = [
        (history_best, "Best Fitness per Generation", 'o', '-'),
        (history_avg, "Average Fitness per Generation", 'x', '--'),
    ]
    for values, label, mark, line in series:
        plt.plot(values, label=label, marker=mark, linestyle=line)
    plt.xlabel("Generation")
    plt.ylabel("Fitness Score (1 / MSE)")
    plt.title("Evolutionary Process Fitness History")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plot_path = os.path.join(output_dir, "fitness_history.png")
    plt.savefig(plot_path)
    plt.close()
    logging.info(f"Fitness history plot saved to {plot_path}")
293
+
294
# --- Evaluation ---
def evaluate_model(model: Sequential, X_test: np.ndarray, y_test: np.ndarray, batch_size: int) -> Dict[str, float]:
    """Evaluates the final model on the test set.

    Reports the full-set MSE plus the mean Kendall's tau rank correlation on
    a random subsample (how well predicted orderings match true orderings).
    """
    logging.info("Evaluating final model on test data...")
    predictions = model.predict(X_test, batch_size=batch_size, verbose=0)
    test_mse = np.mean(np.square(y_test - predictions))
    logging.info(f"Final Test MSE: {test_mse:.6f}")

    # Kendall's tau on a subsample only — it is comparatively slow per row
    # and would dominate runtime on a large test set.
    sample_size = min(100, X_test.shape[0])
    sampled_rows = np.random.choice(X_test.shape[0], sample_size, replace=False)
    taus = []
    for row in sampled_rows:
        tau, _ = kendalltau(y_test[row], predictions[row])
        if not np.isnan(tau):  # tau is NaN when a prediction row is constant
            taus.append(tau)
    avg_kendall_tau = np.mean(taus) if taus else 0.0
    logging.info(f"Average Kendall's Tau (on {sample_size} samples): {avg_kendall_tau:.4f}")

    return {
        "test_mse": float(test_mse),
        "avg_kendall_tau": float(avg_kendall_tau)
    }
317
+
318
# --- Main Pipeline ---
def run_pipeline(args: argparse.Namespace) -> None:
    """Executes the complete neuroevolution pipeline.

    Steps: create a timestamped run directory, configure logging, persist the
    configuration, seed all RNGs, generate train/test data, evolve a
    population, plot/save the fitness history, fine-tune the best evolved
    model with gradient descent, evaluate it, and save model + results.
    """

    # Create unique output directory for this run
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = os.path.join(args.output_base_dir, f"evorun_{timestamp}")
    os.makedirs(output_dir, exist_ok=True)

    # Setup logging for this run
    setup_logging(output_dir)
    logging.info(f"Starting EvoNet Pipeline Run: {timestamp}")
    logging.info(f"Output directory: {output_dir}")

    # Log arguments/configuration
    logging.info("Configuration:")
    args_dict = vars(args)
    for k, v in args_dict.items():
        logging.info(f" {k}: {v}")
    # Save config to file so the run is reproducible from disk
    config_path = os.path.join(output_dir, "config.json")
    with open(config_path, 'w') as f:
        json.dump(args_dict, f, indent=4)
    logging.info(f"Configuration saved to {config_path}")

    # Set random seeds for reproducibility (Python, NumPy and TensorFlow)
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    logging.info(f"Using random seed: {args.seed}")

    # Check GPU (return value unused; called for its logging/config side effects)
    check_gpu()

    # Generate Data
    X_train, y_train = generate_data(args.train_samples, args.seq_length)
    X_test, y_test = generate_data(args.test_samples, args.seq_length)

    # Initialize Population
    logging.info(f"Initializing population of {args.pop_size} individuals...")
    population = [create_individual(args.seq_length) for _ in range(args.pop_size)]
    logging.info("Population initialized.")

    # Run Evolution
    logging.info(f"Starting evolution for {args.generations} generations...")
    best_model_unevolved, best_fitness_hist, avg_fitness_hist = evolve_population(
        population, X_train, y_train, args.generations,
        args.mutation_rate, args.weight_mut_rate, args.activation_mut_rate, args.mutation_strength,
        args.tournament_size, args.elitism_count, args.batch_size
    )
    logging.info("Evolution complete.")

    # Save fitness history data (two columns: best and average per generation)
    history_path = os.path.join(output_dir, "fitness_history.csv")
    history_data = np.array([best_fitness_hist, avg_fitness_hist]).T
    np.savetxt(history_path, history_data, delimiter=',', header='BestFitness,AvgFitness', comments='')
    logging.info(f"Fitness history data saved to {history_path}")

    # Plot fitness history
    plot_fitness_history(best_fitness_hist, avg_fitness_hist, output_dir)

    # Final Training of the Best Model
    logging.info("Starting final training of the best evolved model...")
    # Clone the best model again to ensure we don't modify the original reference unintentionally
    final_model = clone_model(best_model_unevolved)
    final_model.set_weights(best_model_unevolved.get_weights())
    # Use a fresh optimizer instance for final training
    final_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

    # Callbacks for efficient training: stop early on plateau, decay LR first
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6, verbose=1)

    # Use a portion of training data for validation during final training
    history = final_model.fit(
        X_train, y_train,
        epochs=args.epochs_final_train,
        batch_size=args.batch_size,
        validation_split=0.2,  # Use 20% of training data for validation
        callbacks=[early_stopping, reduce_lr],
        verbose=2  # Show one line per epoch
    )
    logging.info("Final training complete.")

    # Evaluate the TRAINED final model
    final_metrics = evaluate_model(final_model, X_test, y_test, args.batch_size)

    # Save the TRAINED final model
    model_path = os.path.join(output_dir, "best_evolved_model_trained.keras")  # Use .keras format
    final_model.save(model_path)
    logging.info(f"Final trained model saved to {model_path}")

    # Save final results
    results = {
        "config": args_dict,
        "final_evaluation": final_metrics,
        "evolution_summary": {
            "best_fitness_overall": best_fitness_hist[-1] if best_fitness_hist else None,
            "avg_fitness_final_gen": avg_fitness_hist[-1] if avg_fitness_hist else None,
        },
        "training_history": history.history  # Include loss/val_loss history from final training
    }
    results_path = os.path.join(output_dir, "final_results.json")
    # Convert numpy types in history to native Python types for JSON serialization
    for key in results['training_history']:
        results['training_history'][key] = [float(v) for v in results['training_history'][key]]

    with open(results_path, 'w') as f:
        json.dump(results, f, indent=4)
    logging.info(f"Final results saved to {results_path}")
    logging.info("Pipeline finished successfully!")
430
+
431
+
432
# --- Argument Parser ---
def parse_arguments() -> argparse.Namespace:
    """Builds the CLI parser, parses sys.argv and returns the namespace.

    When --seed is not supplied, a random one is generated here so the run is
    still reproducible from the value recorded in its config.json.
    """
    parser = argparse.ArgumentParser(description="EvoNet: Neuroevolution for Sorting Task")

    # --- Directory ---
    parser.add_argument('--output_base_dir', type=str, default=os.path.join(os.getcwd(), "evonet_runs"),
                        help='Base directory to store run results.')

    # --- Data ---
    parser.add_argument('--seq_length', type=int, default=DEFAULT_SEQ_LENGTH,
                        help='Length of the sequences to sort.')
    parser.add_argument('--train_samples', type=int, default=5000, help='Number of training samples.')
    parser.add_argument('--test_samples', type=int, default=1000, help='Number of test samples.')

    # --- Evolution Parameters ---
    parser.add_argument('--pop_size', type=int, default=DEFAULT_POP_SIZE, help='Population size.')
    parser.add_argument('--generations', type=int, default=DEFAULT_GENERATIONS, help='Number of generations.')
    parser.add_argument('--mutation_rate', type=float, default=DEFAULT_MUTATION_RATE,
                        help='Overall probability of mutating an individual.')
    parser.add_argument('--weight_mut_rate', type=float, default=DEFAULT_WEIGHT_MUT_RATE,
                        help='Probability of weight perturbation if mutation occurs.')
    parser.add_argument('--activation_mut_rate', type=float, default=DEFAULT_ACTIVATION_MUT_RATE,
                        help='Probability of activation change if mutation occurs.')
    parser.add_argument('--mutation_strength', type=float, default=DEFAULT_MUTATION_STRENGTH,
                        help='Standard deviation of Gaussian noise for weight mutation.')
    parser.add_argument('--tournament_size', type=int, default=DEFAULT_TOURNAMENT_SIZE,
                        help='Number of individuals participating in tournament selection.')
    parser.add_argument('--elitism_count', type=int, default=DEFAULT_ELITISM_COUNT,
                        help='Number of best individuals to carry over directly.')

    # --- Training & Evaluation ---
    parser.add_argument('--batch_size', type=int, default=DEFAULT_BATCH_SIZE, help='Batch size for predictions and training.')
    parser.add_argument('--epochs_final_train', type=int, default=DEFAULT_EPOCHS_FINAL_TRAIN,
                        help='Max epochs for final training of the best model.')

    # --- Reproducibility ---
    parser.add_argument('--seed', type=int, default=None, help='Random seed for reproducibility (default: random).')

    args = parser.parse_args()

    # If seed is not provided, generate one (logged and persisted later)
    if args.seed is None:
        args.seed = random.randint(0, 2**32 - 1)

    return args
477
+
478
+
479
# --- Main Execution ---
if __name__ == "__main__":
    # Parse CLI arguments first; everything else depends on them.
    cli_args = parse_arguments()

    # The base output directory must exist before the run directory is created.
    os.makedirs(cli_args.output_base_dir, exist_ok=True)

    try:
        run_pipeline(cli_args)
    except Exception as e:
        # Logging may not be configured yet if the failure happened early,
        # so always print a fallback message to stderr first.
        print(f"FATAL ERROR in pipeline execution: {e}", file=sys.stderr)
        if logging.getLogger().hasHandlers():
            logging.critical("FATAL ERROR in pipeline execution:", exc_info=True)
        else:
            import traceback
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(1)  # Non-zero exit signals failure to the caller
v2.py ADDED
@@ -0,0 +1,643 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ==============================================================================
# EvoNet Optimizer 2 - Revised and Improved Code
# Description: This code implements a neuroevolution process that evolves
# neural networks with random topologies to learn a sorting task. It includes
# more robust error handling, configuration, logging, and improved
# evolutionary operators.
# ==============================================================================
8
+
9
+ import os
10
+ import subprocess
11
+ import sys
12
+ import argparse
13
+ import random
14
+ import logging
15
+ from datetime import datetime
16
+ import json
17
+ from typing import List, Tuple, Dict, Any
18
+
19
+ import numpy as np
20
+ import tensorflow as tf
21
+ from tensorflow.keras.models import Sequential, load_model, clone_model
22
+ from tensorflow.keras.layers import Dense, Input
23
+ from tensorflow.keras.optimizers import Adam
24
+ from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
25
+ import matplotlib.pyplot as plt
26
+ from scipy.stats import kendalltau
27
+
28
# --- Constants and Default Values ---
DEFAULT_SEQ_LENGTH = 10
DEFAULT_POP_SIZE = 50
DEFAULT_GENERATIONS = 50
DEFAULT_MUTATION_RATE = 0.4  # Probability of applying a mutation to an individual
DEFAULT_WEIGHT_MUT_RATE = 0.8  # If mutating, probability of weight perturbation
DEFAULT_ACTIVATION_MUT_RATE = 0.2  # If mutating, probability of an activation change
DEFAULT_MUTATION_STRENGTH = 0.1  # Magnitude of the weight perturbation (std dev)
DEFAULT_TOURNAMENT_SIZE = 5  # Number of individuals in tournament selection
DEFAULT_ELITISM_COUNT = 2  # Number of best individuals carried directly to the next generation
DEFAULT_EPOCHS_FINAL_TRAIN = 100  # Max epochs for the best model's final training
DEFAULT_BATCH_SIZE = 64  # Batch size for prediction and training
DEFAULT_OUTPUT_BASE_DIR = os.path.join(os.getcwd(), "evonet_runs_revised")  # Base output folder
41
+
42
# --- Logging Setup ---
def setup_logging(log_dir: str, log_level=logging.INFO) -> None:
    """Configures logging to both a file and the console.

    Args:
        log_dir: Directory in which 'evolution_run.log' is created.
        log_level: Minimum level emitted (default: logging.INFO).
    """
    log_filename = os.path.join(log_dir, 'evolution_run.log')
    # Clear previous handlers (important when re-running in environments like
    # Jupyter, where root-logger handlers persist between runs).
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    # Install the new handlers.
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)-8s - %(message)s',
        handlers=[
            logging.FileHandler(log_filename, mode='w'),  # 'w' mode overwrites the log on each run
            logging.StreamHandler(sys.stdout)
        ]
    )
    logging.info("Logging setup complete.")
59
+
60
# --- GPU Check ---
def check_gpu() -> bool:
    """Checks for GPU availability and enables memory growth on each device.

    Returns:
        True if at least one GPU is present and configured; False otherwise.
    """
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # Memory growth must be set before any GPU has been initialized.
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.list_logical_devices('GPU')
            logging.info(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs found.")
            if logical_gpus:
                logging.info(f"Using GPU: {tf.config.experimental.get_device_details(gpus[0])['device_name']}")
            return True
        except RuntimeError as e:
            logging.error(f"Error setting memory growth for GPU: {e}", exc_info=True)
            return False
    else:
        logging.warning("GPU not found. Using CPU.")
        return False
79
+
80
# --- Data Generation ---
def generate_data(num_samples: int, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Generates random sequences and their sorted counterparts (both float32)."""
    logging.info(f"Generating {num_samples} samples with sequence length {seq_length}...")
    try:
        X = np.random.rand(num_samples, seq_length).astype(np.float32) * 100
        y = np.sort(X, axis=1).astype(np.float32)
        logging.info("Data generation successful.")
        return X, y
    except Exception as e:
        logging.error(f"Error during data generation: {e}", exc_info=True)
        raise  # Propagating the error upwards matters here
92
+
93
# --- Neuroevolution Core ---
def create_individual(seq_length: int, input_shape: Tuple) -> Sequential:
    """Creates and compiles a Keras Sequential model with a random architecture.

    Args:
        seq_length: Number of output units (one per sequence position).
        input_shape: Shape of the input tensor (excluding the batch dimension).
    """
    try:
        model = Sequential(name=f"model_random_{random.randint(10000, 99999)}")
        num_hidden_layers = random.randint(1, 4)
        neurons_per_layer = [random.randint(8, 64) for _ in range(num_hidden_layers)]
        activations = [random.choice(['relu', 'tanh', 'sigmoid']) for _ in range(num_hidden_layers)]

        model.add(Input(shape=input_shape))  # Input layer

        for i in range(num_hidden_layers):  # Hidden layers
            model.add(Dense(neurons_per_layer[i], activation=activations[i]))

        model.add(Dense(seq_length, activation='linear'))  # Output layer

        # Compile the model for weight manipulation and potential training
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        #logging.debug(f"Created individual: {model.name} with {len(model.layers)} layers.")
        return model
    except Exception as e:
        logging.error(f"Error creating individual model: {e}", exc_info=True)
        raise
116
+
117
@tf.function  # Compiled as a TensorFlow graph for a potential speedup
def get_predictions(model: Sequential, X: tf.Tensor) -> tf.Tensor:
    """Obtains model predictions via tf.function (inference mode)."""
    return model(X, training=False)
121
+
122
def calculate_fitness(individual: Sequential, X: np.ndarray, y: np.ndarray, batch_size: int) -> float:
    """Computes an individual's fitness (1/MSE); all errors map to minimal fitness."""
    if not isinstance(X, tf.Tensor): X = tf.cast(X, tf.float32)
    if not isinstance(y, tf.Tensor): y = tf.cast(y, tf.float32)

    try:
        y_pred_tf = get_predictions(individual, X)  # Batching happens inside the call
        mse = tf.reduce_mean(tf.square(y - y_pred_tf))
        mse_val = mse.numpy()

        # Fitness: inverse MSE (epsilon avoids division by zero)
        fitness_score = 1.0 / (mse_val + 1e-8)

        if not np.isfinite(fitness_score) or fitness_score < 0:
            logging.warning(f"Non-finite or negative fitness detected ({fitness_score:.4g}) for model {individual.name}. Assigning minimal fitness.")
            return 1e-8  # Assign a very low fitness

        #logging.debug(f"Fitness for {individual.name}: {fitness_score:.4f} (MSE: {mse_val:.4f})")
        return float(fitness_score)

    except tf.errors.InvalidArgumentError as e:
        logging.error(f"TensorFlow InvalidArgumentError during fitness calculation for model {individual.name} (potential shape mismatch?): {e}")
        return 1e-8
    except Exception as e:
        logging.error(f"Unhandled error during fitness calculation for model {individual.name}: {e}", exc_info=True)
        return 1e-8  # Return minimal fitness on error
148
+
149
+
150
def mutate_individual(individual: Sequential, weight_mut_rate: float, act_mut_rate: float, mut_strength: float) -> Sequential:
    """Apply mutations (weight perturbation, activation swap) to a cloned copy of an individual.

    Args:
        individual: Parent model; never modified in place.
        weight_mut_rate: Probability of perturbing all Dense weights with Gaussian noise.
        act_mut_rate: Probability of attempting an activation change on a hidden layer.
        mut_strength: Std deviation of the Gaussian noise added to weights/biases.

    Returns:
        A freshly compiled mutated clone, or the original individual on failure.
    """
    try:
        # Clone the model for mutation so the original is left untouched.
        mutated_model = clone_model(individual)
        mutated_model.set_weights(individual.get_weights())

        mutated = False
        # 1. Weight mutation: add zero-mean Gaussian noise to every Dense layer.
        if random.random() < weight_mut_rate:
            mutated = True
            for layer in mutated_model.layers:
                if isinstance(layer, Dense) and layer.get_weights():  # only Dense layers that carry weights
                    weights_biases = layer.get_weights()
                    new_weights_biases = []
                    for wb in weights_biases:
                        noise = np.random.normal(0, mut_strength, wb.shape).astype(np.float32)
                        new_weights_biases.append(wb + noise)
                    layer.set_weights(new_weights_biases)

        # 2. Activation mutation (independent probability; still experimental).
        if random.random() < act_mut_rate:
            dense_layers = [layer for layer in mutated_model.layers if isinstance(layer, Dense)]
            if len(dense_layers) > 1:  # at least one hidden layer exists
                layer_to_mutate = random.choice(dense_layers[:-1])  # exclude the output layer
                current_activation_name = tf.keras.activations.serialize(layer_to_mutate.activation)
                possible_activations = ['relu', 'tanh', 'sigmoid']
                if current_activation_name in possible_activations:
                    possible_activations.remove(current_activation_name)
                if possible_activations:  # another activation is available to switch to
                    new_activation = random.choice(possible_activations)
                    # Updating the layer config is the safer route to change an activation.
                    layer_config = layer_to_mutate.get_config()
                    layer_config['activation'] = new_activation
                    # Build a new layer from the updated config and transfer weights.
                    try:
                        new_layer = Dense.from_config(layer_config)
                        # NOTE(review): `new_layer` is created but never inserted into
                        # the model, so the activation change is effectively a no-op
                        # that is only logged. Rebuilding the whole model would be the
                        # robust fix — confirm whether this is intentional.
                        logging.debug(f"Attempting activation change on layer {layer_to_mutate.name} to {new_activation} (Implementation needs robust handling).")
                        # In a real implementation, rebuilding the model would be better.
                        # For now only weight mutation is effective.
                        mutated = True  # mark that an activation-mutation attempt was made
                    except Exception as e:
                        logging.warning(f"Could not directly modify/rebuild layer for activation change: {e}")


        # Recompile if mutated (this resets the optimizer state).
        if mutated:
            mutated_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
            mutated_model._name = f"mutated_{individual.name}_{random.randint(1000,9999)}"  # refresh the name

        return mutated_model
    except Exception as e:
        logging.error(f"Error during mutation of model {individual.name}: {e}", exc_info=True)
        return individual  # on failure, fall back to the unmodified parent
209
+
210
+
211
def tournament_selection(population: List[Sequential], fitness_scores: List[float], k: int) -> Sequential:
    """Sample k individuals at random and return the one with the highest fitness.

    Falls back to a uniformly random pick if the tournament itself fails.
    """
    if not population:
        raise ValueError("Population cannot be empty for selection.")
    pop_size = len(population)
    if pop_size < k:
        logging.warning(f"Tournament size {k} is larger than population size {pop_size}. Using population size.")
        k = pop_size
    try:
        contenders = random.sample(range(pop_size), k)
        # The winner is the sampled index with the best fitness score
        # (ties resolved in favor of the first sampled contender).
        winner_idx = max(contenders, key=lambda idx: fitness_scores[idx])
        return population[winner_idx]
    except Exception as e:
        logging.error(f"Error during tournament selection: {e}", exc_info=True)
        # Degrade gracefully: pick any individual rather than abort evolution.
        return random.choice(population)
229
+
230
+
231
def evolve_population(population: List[Sequential], X: np.ndarray, y: np.ndarray, generations: int,
                      mutation_rate: float, weight_mut_rate: float, act_mut_rate: float, mut_strength: float,
                      tournament_size: int, elitism_count: int, batch_size: int) -> Tuple[Sequential, List[float], List[float]]:
    """Run the evolutionary loop and return the best model plus fitness histories.

    Each generation: evaluate fitness, track the best-so-far model (as a clone),
    carry over `elitism_count` elites unchanged, then fill the remaining slots
    via tournament selection with probabilistic mutation.

    Returns:
        (best_model, best_fitness_per_generation, avg_fitness_per_generation);
        best_model is None only if the final population ends up empty.
    """
    best_fitness_history = []
    avg_fitness_history = []
    best_model_overall = None
    best_fitness_overall = -np.inf  # start from negative infinity

    # Convert the data to TensorFlow tensors once, outside the generation loop.
    X_tf = tf.cast(X, tf.float32)
    y_tf = tf.cast(y, tf.float32)

    for gen in range(generations):
        generation_start_time = datetime.now()
        # 1. Fitness evaluation for the whole population.
        try:
            fitness_scores = [calculate_fitness(ind, X_tf, y_tf, batch_size) for ind in population]
        except Exception as e:
            logging.critical(f"Error calculating fitness for population in Generation {gen+1}: {e}", exc_info=True)
            # Critical failure: return the best model found so far, or re-raise
            # if no usable model exists yet.
            if best_model_overall: return best_model_overall, best_fitness_history, avg_fitness_history
            else: raise  # no good model yet: propagate the error

        # 2. Statistics and best-so-far tracking.
        current_best_idx = np.argmax(fitness_scores)
        current_best_fitness = fitness_scores[current_best_idx]
        avg_fitness = np.mean(fitness_scores)
        best_fitness_history.append(current_best_fitness)
        avg_fitness_history.append(avg_fitness)

        if current_best_fitness > best_fitness_overall:
            best_fitness_overall = current_best_fitness
            try:
                # Safely copy the architecture and weights of the new best model.
                best_model_overall = clone_model(population[current_best_idx])
                best_model_overall.set_weights(population[current_best_idx].get_weights())
                best_model_overall.compile(optimizer=Adam(), loss='mse')  # recompile the clone
                logging.info(f"Generation {gen+1}: *** New overall best fitness found: {best_fitness_overall:.6f} ***")
            except Exception as e:
                logging.error(f"Could not clone or set weights for the new best model: {e}", exc_info=True)
                # Cloning failed: continue, but the tracked model may be stale.
                best_fitness_overall = current_best_fitness  # still record the fitness

        generation_time = (datetime.now() - generation_start_time).total_seconds()
        logging.info(f"Generation {gen+1}/{generations} | Best Fitness: {current_best_fitness:.6f} | Avg Fitness: {avg_fitness:.6f} | Time: {generation_time:.2f}s")

        # 3. Build the next generation.
        new_population = []

        # 3a. Elitism: copy the top-k individuals through unchanged.
        if elitism_count > 0 and len(population) >= elitism_count:
            try:
                elite_indices = np.argsort(fitness_scores)[-elitism_count:]
                for idx in elite_indices:
                    elite_clone = clone_model(population[idx])
                    elite_clone.set_weights(population[idx].get_weights())
                    elite_clone.compile(optimizer=Adam(), loss='mse')
                    new_population.append(elite_clone)
            except Exception as e:
                logging.error(f"Error during elitism: {e}", exc_info=True)


        # 3b. Selection and reproduction for the remaining slots.
        num_to_generate = len(population) - len(new_population)
        offspring_population = []
        while len(offspring_population) < num_to_generate:
            try:
                # Pick a parent via tournament selection.
                parent = tournament_selection(population, fitness_scores, tournament_size)

                # Produce a child, mutated with probability `mutation_rate`.
                if random.random() < mutation_rate:
                    child = mutate_individual(parent, weight_mut_rate, act_mut_rate, mut_strength)
                else:
                    # No mutation: still clone, so parent and child are distinct objects.
                    child = clone_model(parent)
                    child.set_weights(parent.get_weights())
                    child.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
                    child._name = f"cloned_{parent.name}_{random.randint(1000,9999)}"  # refresh the name

                offspring_population.append(child)
            except Exception as e:
                logging.error(f"Error during selection/reproduction cycle: {e}", exc_info=True)
                # On failure, top up with a random individual so the loop terminates.
                if len(offspring_population) < num_to_generate:
                    logging.warning("Adding random individual due to reproduction error.")
                    offspring_population.append(create_individual(y.shape[1], X.shape[1:]))


        new_population.extend(offspring_population)
        population = new_population  # replace the old generation

    # Loop finished: make sure some best model is returned.
    if best_model_overall is None and population:  # no improvement tracked, or cloning always failed
        logging.warning("No overall best model tracked (or cloning failed). Returning best from final population.")
        final_fitness_scores = [calculate_fitness(ind, X_tf, y_tf, batch_size) for ind in population]
        best_idx_final = np.argmax(final_fitness_scores)
        best_model_overall = population[best_idx_final]
    elif not population:
        logging.error("Evolution finished with an empty population!")
        return None, best_fitness_history, avg_fitness_history


    logging.info(f"Evolution finished. Best fitness achieved: {best_fitness_overall:.6f}")
    return best_model_overall, best_fitness_history, avg_fitness_history
341
+
342
+
343
# --- Plotting ---
def plot_fitness_history(history_best: List[float], history_avg: List[float], output_dir: str) -> None:
    """Plot best/average fitness per generation and save the figure as a PNG.

    Args:
        history_best: Best fitness value recorded for each generation.
        history_avg: Mean population fitness recorded for each generation.
        output_dir: Directory into which 'fitness_history.png' is written.
    """
    if not history_best or not history_avg:
        logging.warning("Fitness history is empty, cannot plot.")
        return
    try:
        plt.figure(figsize=(12, 7))
        plt.plot(history_best, label="Best Fitness per Generation", marker='o', linestyle='-', linewidth=2)
        plt.plot(history_avg, label="Average Fitness per Generation", marker='x', linestyle='--', alpha=0.7)
        plt.xlabel("Generation")
        plt.ylabel("Fitness Score (1 / MSE)")
        plt.title("Evolutionary Process Fitness History")
        plt.legend()
        plt.grid(True, which='both', linestyle='--', linewidth=0.5)
        plt.tight_layout()
        plot_path = os.path.join(output_dir, "fitness_history.png")
        plt.savefig(plot_path)
        plt.close()  # release the figure from memory
        logging.info(f"Fitness history plot saved to {plot_path}")
    except Exception as e:
        logging.error(f"Error plotting fitness history: {e}", exc_info=True)
365
+
366
# --- Evaluation ---
def evaluate_model(model: Sequential, X_test: np.ndarray, y_test: np.ndarray, batch_size: int) -> Dict[str, float]:
    """Evaluate the final model on held-out test data.

    Returns a dict with the test MSE and the average Kendall's tau rank
    correlation, the latter computed over a random sample of test rows.
    """
    if model is None:
        logging.error("Cannot evaluate a None model.")
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
    logging.info("Evaluating final model on test data...")
    try:
        predictions = model.predict(X_test, batch_size=batch_size, verbose=0)
        test_mse = np.mean(np.square(y_test - predictions))
        logging.info(f"Final Test MSE: {test_mse:.6f}")

        # Kendall's tau on a random subset (the full test set would be slow).
        sample_size = min(500, X_test.shape[0])
        sampled_rows = np.random.choice(X_test.shape[0], sample_size, replace=False)
        tau_values = []
        for i in sampled_rows:
            try:
                tau, _ = kendalltau(y_test[i], predictions[i])
                if not np.isnan(tau):
                    tau_values.append(tau)
            except ValueError as ve:  # e.g. when predictions are constant
                logging.debug(f"Kendall tau ValueError for sample {i}: {ve}")

        avg_kendall_tau = np.mean(tau_values) if tau_values else 0.0
        logging.info(f"Average Kendall's Tau (on {sample_size} samples): {avg_kendall_tau:.4f}")

        return {
            "test_mse": float(test_mse),
            "avg_kendall_tau": float(avg_kendall_tau),
        }
    except Exception as e:
        logging.error(f"Error during final model evaluation: {e}", exc_info=True)
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}  # pessimistic values on failure
399
+
400
# --- Main workflow ---
def run_pipeline(args: argparse.Namespace):
    """Run the full neuroevolution workflow end to end.

    Steps: create a timestamped output directory, configure logging, persist the
    config, seed RNGs, generate data, initialize and evolve the population, save
    fitness history, train/evaluate/save the best model, and write a results JSON.

    Args:
        args: Parsed CLI arguments (see parse_arguments).
    """

    # Create a unique output directory for this run.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"evorun_{timestamp}_gen{args.generations}_pop{args.pop_size}"
    output_dir = os.path.join(args.output_base_dir, run_name)
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        print(f"FATAL: Could not create output directory: {output_dir}. Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Configure logging into the run directory.
    setup_logging(output_dir)
    logging.info(f"========== Starting EvoNet Pipeline Run: {run_name} ==========")
    logging.info(f"Output directory: {output_dir}")

    # Log and persist the configuration.
    logging.info("--- Configuration ---")
    args_dict = vars(args)
    for k, v in args_dict.items():
        logging.info(f"  {k:<20}: {v}")
    logging.info("---------------------")
    config_path = os.path.join(output_dir, "config.json")
    try:
        with open(config_path, 'w') as f:
            json.dump(args_dict, f, indent=4, sort_keys=True)
        logging.info(f"Configuration saved to {config_path}")
    except Exception as e:
        logging.error(f"Failed to save configuration: {e}", exc_info=True)


    # Seed all RNGs for reproducibility.
    try:
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.random.set_seed(args.seed)
        logging.info(f"Using random seed: {args.seed}")
        # Deterministic ops (TensorFlow >= 2.8): optional — can reduce performance
        # but improves reproducibility.
        # tf.config.experimental.enable_op_determinism()
    except Exception as e:
        logging.warning(f"Could not set all random seeds: {e}")


    # GPU availability check.
    is_gpu_available = check_gpu()

    # Data generation.
    try:
        X_train, y_train = generate_data(args.train_samples, args.seq_length)
        X_test, y_test = generate_data(args.test_samples, args.seq_length)
        input_shape = X_train.shape[1:]  # input shape for model construction
    except Exception:
        logging.critical("Failed to generate data. Exiting.")
        sys.exit(1)


    # Population initialization.
    logging.info(f"--- Initializing Population (Size: {args.pop_size}) ---")
    try:
        population = [create_individual(args.seq_length, input_shape) for _ in range(args.pop_size)]
        logging.info("Population initialized successfully.")
    except Exception:
        logging.critical("Failed to initialize population. Exiting.")
        sys.exit(1)

    # Evolution.
    logging.info(f"--- Starting Evolution ({args.generations} Generations) ---")
    try:
        best_model_unevolved, best_fitness_hist, avg_fitness_hist = evolve_population(
            population, X_train, y_train, args.generations,
            args.mutation_rate, args.weight_mut_rate, args.activation_mut_rate, args.mutation_strength,
            args.tournament_size, args.elitism_count, args.batch_size
        )
    except Exception as e:
        logging.critical(f"Fatal error during evolution process: {e}", exc_info=True)
        sys.exit(1)
    logging.info("--- Evolution Complete ---")

    # BUGFIX: the original referenced `best_fitness_overall` below, but that name
    # is local to evolve_population and undefined here, raising NameError while
    # building the results summary. Derive it from the recorded history instead.
    best_fitness_overall = max(best_fitness_hist) if best_fitness_hist else None

    # Persist and plot the fitness history.
    if best_fitness_hist and avg_fitness_hist:
        history_path = os.path.join(output_dir, "fitness_history.csv")
        try:
            history_data = np.array([np.arange(1, len(best_fitness_hist) + 1), best_fitness_hist, avg_fitness_hist]).T
            np.savetxt(history_path, history_data, delimiter=',', header='Generation,BestFitness,AvgFitness', comments='', fmt=['%d', '%.8f', '%.8f'])
            logging.info(f"Fitness history data saved to {history_path}")
        except Exception as e:
            logging.error(f"Could not save fitness history data: {e}", exc_info=True)
        plot_fitness_history(best_fitness_hist, avg_fitness_hist, output_dir)
    else:
        logging.warning("Fitness history is empty, skipping saving/plotting.")


    # Final training of the best evolved model.
    if best_model_unevolved is None:
        logging.error("Evolution did not yield a best model. Skipping final training and evaluation.")
        final_metrics = {"test_mse": np.inf, "avg_kendall_tau": 0.0}
        final_model_path = None
        training_summary = {}
    else:
        logging.info("--- Starting Final Training of Best Evolved Model ---")
        try:
            # Re-clone and recompile the best model (defensive copy).
            final_model = clone_model(best_model_unevolved)
            final_model.set_weights(best_model_unevolved.get_weights())
            # A different learning rate could be tried for the final training.
            final_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
            logging.info("Model Summary of Best Evolved (Untrained):")
            final_model.summary(print_fn=logging.info)


            # Callbacks: stop early on plateau and decay the learning rate.
            early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1)
            reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=7, min_lr=1e-7, verbose=1)

            history = final_model.fit(
                X_train, y_train,
                epochs=args.epochs_final_train,
                batch_size=args.batch_size,
                validation_split=0.2,  # hold out 20% of training data for validation
                callbacks=[early_stopping, reduce_lr],
                verbose=2  # one log line per epoch
            )
            logging.info("Final training complete.")
            training_summary = {
                "epochs_run": len(history.history['loss']),
                "final_train_loss": history.history['loss'][-1],
                "final_val_loss": history.history['val_loss'][-1]
            }

            # Evaluate the trained model on the test split.
            final_metrics = evaluate_model(final_model, X_test, y_test, args.batch_size)

            # Persist the trained model.
            final_model_path = os.path.join(output_dir, "best_evolved_model_trained.keras")
            final_model.save(final_model_path)
            logging.info(f"Final trained model saved to {final_model_path}")

        except Exception as e:
            logging.error(f"Error during final training or evaluation: {e}", exc_info=True)
            final_metrics = {"test_mse": np.inf, "avg_kendall_tau": 0.0}
            final_model_path = None
            training_summary = {"error": str(e)}


    # Persist the final results summary.
    logging.info("--- Saving Final Results ---")
    final_results = {
        "run_info": {
            "run_name": run_name,
            "timestamp": timestamp,
            "output_directory": output_dir,
            "gpu_used": is_gpu_available,
        },
        "config": args_dict,
        "evolution_summary": {
            "generations_run": len(best_fitness_hist) if best_fitness_hist else 0,
            "best_fitness_achieved": best_fitness_overall,
            "best_fitness_final_gen": best_fitness_hist[-1] if best_fitness_hist else None,
            "avg_fitness_final_gen": avg_fitness_hist[-1] if avg_fitness_hist else None,
        },
        "final_training_summary": training_summary,
        "final_evaluation_on_test": final_metrics,
        "saved_model_path": final_model_path
    }
    results_path = os.path.join(output_dir, "final_results.json")
    try:
        # Convert NumPy scalar/array types so json.dump can serialize them.
        def convert_numpy_types(obj):
            if isinstance(obj, np.integer): return int(obj)
            elif isinstance(obj, np.floating): return float(obj)
            elif isinstance(obj, np.ndarray): return obj.tolist()
            return obj
        with open(results_path, 'w') as f:
            json.dump(final_results, f, indent=4, default=convert_numpy_types)  # custom default handler
        logging.info(f"Final results summary saved to {results_path}")
    except Exception as e:
        logging.error(f"Failed to save final results JSON: {e}", exc_info=True)

    logging.info(f"========== Pipeline Run {run_name} Finished ==========")
583
+
584
# --- Argument parser ---
def parse_arguments() -> argparse.Namespace:
    """Define and parse the CLI; generates a random seed when none is supplied."""
    parser = argparse.ArgumentParser(description="EvoNet Revised: Neuroevolution for Sorting Task")

    # --- Directories ---
    parser.add_argument('--output_base_dir', type=str, default=DEFAULT_OUTPUT_BASE_DIR,
                        help='Base directory to store run results.')

    # --- Data settings ---
    parser.add_argument('--seq_length', type=int, default=DEFAULT_SEQ_LENGTH, help='Length of sequences.')
    parser.add_argument('--train_samples', type=int, default=5000, help='Number of training samples.')
    parser.add_argument('--test_samples', type=int, default=1000, help='Number of test samples.')

    # --- Evolution parameters ---
    parser.add_argument('--pop_size', type=int, default=DEFAULT_POP_SIZE, help='Population size.')
    parser.add_argument('--generations', type=int, default=DEFAULT_GENERATIONS, help='Number of generations.')
    parser.add_argument('--mutation_rate', type=float, default=DEFAULT_MUTATION_RATE, help='Overall mutation probability.')
    parser.add_argument('--weight_mut_rate', type=float, default=DEFAULT_WEIGHT_MUT_RATE, help='Weight mutation probability (if mutation occurs).')
    parser.add_argument('--activation_mut_rate', type=float, default=DEFAULT_ACTIVATION_MUT_RATE, help='Activation mutation probability (if mutation occurs).')
    parser.add_argument('--mutation_strength', type=float, default=DEFAULT_MUTATION_STRENGTH, help='Std dev for weight mutation noise.')
    parser.add_argument('--tournament_size', type=int, default=DEFAULT_TOURNAMENT_SIZE, help='Number of individuals in tournament selection.')
    parser.add_argument('--elitism_count', type=int, default=DEFAULT_ELITISM_COUNT, help='Number of elite individuals to carry over.')

    # --- Training and evaluation ---
    parser.add_argument('--batch_size', type=int, default=DEFAULT_BATCH_SIZE, help='Batch size for predictions and final training.')
    parser.add_argument('--epochs_final_train', type=int, default=DEFAULT_EPOCHS_FINAL_TRAIN, help='Max epochs for final training.')

    # --- Reproducibility ---
    parser.add_argument('--seed', type=int, default=None, help='Random seed (default: random).')

    args = parser.parse_args()

    # Generate a default seed when none was given on the command line.
    if args.seed is None:
        args.seed = random.randint(0, 2**32 - 1)
        print(f"Generated random seed: {args.seed}")  # print before logging is configured

    return args
622
+
623
+
624
# --- Main execution block ---
if __name__ == "__main__":
    # Parse command-line arguments.
    cli_args = parse_arguments()

    # Run the main workflow.
    try:
        run_pipeline(cli_args)
    except SystemExit:  # catch sys.exit() calls and exit normally
        # NOTE(review): swallowing SystemExit here turns a sys.exit(1) raised
        # inside run_pipeline into a successful (code 0) exit — confirm that
        # discarding non-zero exit codes is intentional.
        pass
    except Exception as e:
        # Print the error even if logging was never configured.
        print(f"\nFATAL UNHANDLED ERROR in main execution block: {e}", file=sys.stderr)
        # Also write to the log when logging is set up.
        if logging.getLogger().hasHandlers():
            logging.critical("FATAL UNHANDLED ERROR in main execution block:", exc_info=True)
        else:
            import traceback
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(1)  # exit with an error code
v3.py ADDED
@@ -0,0 +1,784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # EvoNet Optimizer - v3 - Daha İleri İyileştirmeler
3
+ # Açıklama: Çaprazlama, Kontrol Noktası eklenmiş, Adaptif Mutasyon ve
4
+ # Gelişmiş Fitness için kavramsal öneriler içeren versiyon.
5
+ # ==============================================================================
6
+
7
+ import os
8
+ import subprocess
9
+ import sys
10
+ import argparse
11
+ import random
12
+ import logging
13
+ from datetime import datetime
14
+ import json
15
+ import pickle # Checkpointing için
16
+ import time # Checkpointing için
17
+ from typing import List, Tuple, Dict, Any, Optional
18
+
19
+ import numpy as np
20
+ import tensorflow as tf
21
+ from tensorflow.keras.models import Sequential, load_model, clone_model
22
+ from tensorflow.keras.layers import Dense, Input
23
+ from tensorflow.keras.optimizers import Adam
24
+ from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
25
+ import matplotlib.pyplot as plt
26
+ from scipy.stats import kendalltau
27
+
28
# --- Constants and default values ---
DEFAULT_SEQ_LENGTH = 10
DEFAULT_POP_SIZE = 50
DEFAULT_GENERATIONS = 50
DEFAULT_CROSSOVER_RATE = 0.6  # probability of applying crossover
DEFAULT_MUTATION_RATE = 0.4  # probability of applying mutation (when no crossover happens)
DEFAULT_WEIGHT_MUT_RATE = 0.8
DEFAULT_ACTIVATION_MUT_RATE = 0.2  # activation mutation is still experimental
DEFAULT_MUTATION_STRENGTH = 0.1
DEFAULT_TOURNAMENT_SIZE = 5
DEFAULT_ELITISM_COUNT = 2
DEFAULT_EPOCHS_FINAL_TRAIN = 100
DEFAULT_BATCH_SIZE = 64
DEFAULT_OUTPUT_BASE_DIR = os.path.join(os.getcwd(), "evonet_runs_v3")
DEFAULT_CHECKPOINT_INTERVAL = 10  # checkpoint every N generations (0 = disabled)
43
+
44
# --- Logging setup ---
def setup_logging(log_dir: str, log_level=logging.INFO) -> None:
    """Reset the root logger and send records to both a run log file and stdout.

    The file handler opens in append mode so a resumed run keeps its history.
    """
    log_filename = os.path.join(log_dir, 'evolution_run.log')

    # Drop any handlers left over from a previous configuration.
    for stale_handler in logging.root.handlers[:]:
        logging.root.removeHandler(stale_handler)

    file_handler = logging.FileHandler(log_filename, mode='a')  # append supports resuming
    console_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)-8s - %(message)s',
        handlers=[file_handler, console_handler],
    )
    logging.info("Logging setup complete.")
58
+
59
# --- GPU check ---
def check_gpu() -> bool:
    """Enable memory growth on visible GPUs; return True when a GPU is usable."""
    physical = tf.config.list_physical_devices('GPU')
    if not physical:
        logging.warning("GPU not found. Using CPU.")
        return False
    try:
        for device in physical:
            tf.config.experimental.set_memory_growth(device, True)
        logical = tf.config.list_logical_devices('GPU')
        logging.info(f"{len(physical)} Physical GPUs, {len(logical)} Logical GPUs found.")
        if logical:
            details = tf.config.experimental.get_device_details(physical[0])
            logging.info(f"Using GPU: {details['device_name']}")
        return True
    except RuntimeError as e:
        # Memory growth must be set before GPUs are initialized; this can fail.
        logging.error(f"Error setting memory growth for GPU: {e}", exc_info=True)
        return False
76
+
77
# --- Data generation ---
def generate_data(num_samples: int, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Create (inputs, targets) where each target row is its input row sorted ascending.

    Inputs are uniform float32 values in [0, 100).
    """
    logging.info(f"Generating {num_samples} samples with sequence length {seq_length}...")
    try:
        inputs = np.random.rand(num_samples, seq_length).astype(np.float32) * 100
        targets = np.sort(inputs, axis=1).astype(np.float32)
        logging.info("Data generation successful.")
        return inputs, targets
    except Exception as e:
        logging.error(f"Error during data generation: {e}", exc_info=True)
        raise
89
+
90
# --- Neuroevolution core ---

def create_individual(seq_length: int, input_shape: Tuple) -> Sequential:
    """Build and compile a Keras Sequential model with a random architecture.

    Same sampling as earlier versions (1-4 hidden Dense layers, 8-64 units,
    random relu/tanh/sigmoid activations), with the revised 'model_rnd_' naming.
    """
    try:
        model = Sequential(name=f"model_rnd_{random.randint(10000, 99999)}")

        # Sample the architecture: depth first, then per-layer width/activation.
        depth = random.randint(1, 4)
        widths = [random.randint(8, 64) for _ in range(depth)]
        layer_acts = [random.choice(['relu', 'tanh', 'sigmoid']) for _ in range(depth)]

        model.add(Input(shape=input_shape))
        for units, act in zip(widths, layer_acts):
            model.add(Dense(units, activation=act))
        model.add(Dense(seq_length, activation='linear'))

        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        return model
    except Exception as e:
        logging.error(f"Error creating individual model: {e}", exc_info=True)
        raise
109
+
110
@tf.function  # compile as a TensorFlow graph for a potential speed-up
def get_predictions(model: Sequential, X: tf.Tensor) -> tf.Tensor:
    """Run a forward pass in inference mode and return the model's predictions.

    NOTE(review): tf.function retraces per distinct `model` instance — with a
    large evolving population this may cost trace time/memory; confirm.
    """
    return model(X, training=False)
114
+
115
def calculate_fitness(individual: Sequential, X: np.ndarray, y: np.ndarray, batch_size: int, fitness_params: Optional[Dict] = None) -> float:
    """Compute an individual's fitness; contains a sketch for a richer fitness.

    Currently the score is plain inverse MSE. `batch_size` and `fitness_params`
    are accepted but unused — they are reserved for the conceptual weighted
    fitness outlined in the comments below.

    Returns:
        Fitness score (higher is better); -1e7 on failure or invalid values.
    """
    # --- CONCEPTUAL: advanced fitness function ---
    # Only MSE is used here. A richer fitness could:
    # 1. Compute extra metrics (e.g. Kendall's tau).
    # 2. Measure model complexity (e.g. parameter count).
    # 3. Combine these values with a weighted formula:
    # fitness_params = fitness_params or {}
    # w_mse = fitness_params.get('w_mse', 1.0)
    # w_tau = fitness_params.get('w_tau', 0.1)
    # w_comp = fitness_params.get('w_comp', 0.0001)
    # --------------------------------------------
    if not isinstance(X, tf.Tensor): X = tf.cast(X, tf.float32)
    if not isinstance(y, tf.Tensor): y = tf.cast(y, tf.float32)
    try:
        y_pred_tf = get_predictions(individual, X)
        mse = tf.reduce_mean(tf.square(y - y_pred_tf))
        mse_val = mse.numpy()
        fitness_score = 1.0 / (mse_val + 1e-8)  # base fitness: inverse MSE

        # --- CONCEPTUAL: combined fitness computation ---
        # if w_tau > 0 or w_comp > 0:
        #     # Compute Kendall tau (can be costly; sampling may be needed)
        #     tau_val = calculate_avg_kendall_tau(y.numpy(), y_pred_tf.numpy(), sample_size=100)  # example helper
        #     # Compute complexity
        #     complexity = individual.count_params()
        #     # Combined fitness
        #     fitness_score = w_mse * fitness_score + w_tau * tau_val - w_comp * complexity
        # --------------------------------------------

        if not np.isfinite(fitness_score) or fitness_score < -1e6:  # the combined fitness may go negative
            logging.warning(f"Non-finite or very low fitness ({fitness_score:.4g}) for model {individual.name}. Assigning minimal fitness.")
            return -1e7  # lower floor, since an advanced fitness can be negative
        return float(fitness_score)
    except Exception as e:
        logging.error(f"Error during fitness calculation for model {individual.name}: {e}", exc_info=True)
        return -1e7
152
+
153
+ # (Aktivasyon mutasyonu hala deneysel, ana odak ağırlık mutasyonunda)
154
def mutate_individual(individual: Sequential, weight_mut_rate: float, mut_strength: float) -> Sequential:
    """Return a (possibly) weight-mutated copy of ``individual``.

    With probability ``weight_mut_rate`` every Dense layer's weights and
    biases receive additive zero-mean Gaussian noise.

    Args:
        individual: Source model; never modified in place.
        weight_mut_rate: Probability that this copy is perturbed at all.
        mut_strength: Std-dev of the Gaussian perturbation.

    Returns:
        A compiled clone (mutated or verbatim). On any error the ORIGINAL
        model object is returned unchanged as a safe fallback.
    """
    try:
        mutated_model = clone_model(individual)
        mutated_model.set_weights(individual.get_weights())
        mutated = False
        if random.random() < weight_mut_rate:
            mutated = True
            for layer in mutated_model.layers:
                # Only perturb layers that actually hold weights.
                if isinstance(layer, Dense) and layer.get_weights():
                    perturbed = [
                        wb + np.random.normal(0, mut_strength, wb.shape).astype(np.float32)
                        for wb in layer.get_weights()
                    ]
                    layer.set_weights(perturbed)
        # BUGFIX: clone_model() returns an UNCOMPILED model, so compile the
        # copy unconditionally. Previously only the mutated branch compiled,
        # leaving un-mutated clones unusable for fit()/evaluate().
        mutated_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        if mutated:
            mutated_model._name = f"mutated_{individual.name}_{random.randint(1000,9999)}"
        return mutated_model
    except Exception as e:
        logging.error(f"Error during mutation of model {individual.name}: {e}", exc_info=True)
        return individual
175
+
176
+
177
def check_architecture_compatibility(model1: Sequential, model2: Sequential) -> bool:
    """Cheap structural test for crossover: same layer count and same layer types.

    Deliberately ignores per-layer widths; a finer-grained check (neuron
    counts etc.) could be added later.
    """
    if len(model1.layers) != len(model2.layers):
        return False
    return all(type(a) == type(b) for a, b in zip(model1.layers, model2.layers))
186
+
187
def crossover_individuals(parent1: Sequential, parent2: Sequential) -> "Tuple[Optional[Sequential], Optional[Sequential]]":
    """Produce two children by uniform crossover of the parents' weights.

    Each weight element is taken from parent1 or parent2 with probability
    0.5; the two children receive complementary masks. Parents must pass
    check_architecture_compatibility(); otherwise, or on any error,
    ``(None, None)`` is returned and the caller falls back to mutation/cloning.

    BUGFIX: the return annotation is a string on purpose — ``Optional`` is
    not imported at module level, and an evaluated annotation would raise
    NameError the moment this module is imported.
    """
    if not check_architecture_compatibility(parent1, parent2):
        logging.debug("Skipping crossover due to incompatible architectures.")
        return None, None

    try:
        child1 = clone_model(parent1)
        child2 = clone_model(parent2)

        p1_weights = parent1.get_weights()
        p2_weights = parent2.get_weights()
        child1_new_weights = []
        child2_new_weights = []

        # Uniform crossover over every weight matrix / bias vector: a
        # per-element coin flip decides which parent contributes; the second
        # child receives the complementary choice.
        for w1, w2 in zip(p1_weights, p2_weights):
            mask = np.random.rand(*w1.shape) < 0.5
            child1_new_weights.append(np.where(mask, w1, w2).astype(np.float32))
            child2_new_weights.append(np.where(mask, w2, w1).astype(np.float32))

        child1.set_weights(child1_new_weights)
        child2.set_weights(child2_new_weights)

        # clone_model() yields uncompiled models, so compile both children.
        child1.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        child2.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        child1._name = f"xover_{parent1.name[:10]}_{parent2.name[:10]}_c1_{random.randint(1000,9999)}"
        child2._name = f"xover_{parent1.name[:10]}_{parent2.name[:10]}_c2_{random.randint(1000,9999)}"
        return child1, child2

    except Exception as e:
        logging.error(f"Error during crossover between {parent1.name} and {parent2.name}: {e}", exc_info=True)
        return None, None
233
+
234
+ # (tournament_selection fonksiyonu öncekiyle aynı)
235
def tournament_selection(population: List[Sequential], fitness_scores: List[float], k: int) -> Sequential:
    """Pick the fittest of ``k`` randomly sampled individuals (sampled without replacement).

    ``k`` is clamped to the population size. Falls back to a uniformly random
    individual if anything goes wrong during the tournament.
    """
    if not population:
        raise ValueError("Population cannot be empty.")
    k = min(k, len(population))
    try:
        contenders = random.sample(range(len(population)), k)
        # First contender with the maximal fitness wins (same tie-break as argmax).
        winner_idx = max(contenders, key=lambda idx: fitness_scores[idx])
        return population[winner_idx]
    except Exception as e:
        logging.error(f"Error during tournament selection: {e}", exc_info=True)
        return random.choice(population)
247
+
248
+ # --- Checkpointing ---
249
def save_checkpoint(output_dir: str, generation: int, population: "List[Sequential]", rnd_state: Tuple, np_rnd_state: Tuple, tf_rnd_state: Any):
    """Persist the evolution state to ``<output_dir>/checkpoints/evo_gen_<N>.pkl``.

    Each model is captured as (name, config, weights); embedding weights in
    the pickle is faster than saving every model to disk, at the cost of a
    larger file and less robustness across Keras versions. Models that fail
    to serialize are skipped with an error log; the checkpoint is still
    written with the remaining population.

    Args:
        output_dir: Run directory; a ``checkpoints`` subfolder is created.
        generation: Generation number embedded in the file name.
        population: Models to serialize.
        rnd_state / np_rnd_state / tf_rnd_state: RNG states to restore on resume.
          NOTE(review): pickling TensorFlow generator state can be problematic,
          callers currently pass None for it.
    """
    import pickle  # BUGFIX: pickle was used but never imported at module level (NameError).

    checkpoint_dir = os.path.join(output_dir, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_file = os.path.join(checkpoint_dir, f"evo_gen_{generation}.pkl")
    logging.info(f"Saving checkpoint for generation {generation} to {checkpoint_file}...")
    try:
        population_state = []
        for model in population:
            try:
                population_state.append({
                    "name": model.name,
                    "config": model.get_config(),
                    "weights": model.get_weights()
                })
            except Exception as e:
                logging.error(f"Could not serialize model {model.name} for checkpoint: {e}")
                population_state.append(None)

        state = {
            "generation": generation,
            "population_state": [p for p in population_state if p is not None],  # drop failed entries
            "random_state": rnd_state,
            "numpy_random_state": np_rnd_state,
            "tensorflow_random_state": tf_rnd_state,
            "timestamp": datetime.now().isoformat()
        }
        with open(checkpoint_file, 'wb') as f:
            pickle.dump(state, f)
        logging.info(f"Checkpoint saved successfully for generation {generation}.")
    except Exception as e:
        logging.error(f"Failed to save checkpoint for generation {generation}: {e}", exc_info=True)
288
+
289
+
290
def load_checkpoint(checkpoint_path: str) -> "Optional[Dict]":
    """Restore evolution state written by save_checkpoint().

    Rebuilds every model from its (config, weights) pair and RE-COMPILES it
    (compile state is not serialized in the checkpoint).

    BUGFIX: the return annotation is a string — ``Optional`` is not imported
    at module level and an evaluated annotation would raise NameError at
    import time.

    SECURITY: pickle.load executes arbitrary code from the file; only load
    checkpoints produced by this pipeline itself.

    Returns:
        The state dict with a ``population`` key added, or None when the file
        is missing, unreadable, or yields no loadable model.
    """
    import pickle  # BUGFIX: pickle was used but never imported at module level (NameError).

    if not os.path.exists(checkpoint_path):
        logging.error(f"Checkpoint file not found: {checkpoint_path}")
        return None
    logging.info(f"Loading checkpoint from {checkpoint_path}...")
    try:
        with open(checkpoint_path, 'rb') as f:
            state = pickle.load(f)

        population = []
        for model_state in state["population_state"]:
            try:
                model = Sequential.from_config(model_state["config"])
                model.set_weights(model_state["weights"])
                # Re-compilation is REQUIRED: compile state is not checkpointed.
                model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
                model._name = model_state.get("name", f"model_loaded_{random.randint(1000,9999)}")
                population.append(model)
            except Exception as e:
                logging.error(f"Failed to load model state from checkpoint for model {model_state.get('name', 'UNKNOWN')}: {e}")

        # Keep only the successfully restored models.
        state["population"] = population
        if not population:
            logging.error("Failed to load any model from the checkpoint population state.")
            return None

        logging.info(f"Checkpoint loaded successfully. Resuming from generation {state['generation'] + 1}.")
        return state
    except Exception as e:
        logging.error(f"Failed to load checkpoint from {checkpoint_path}: {e}", exc_info=True)
        return None
328
+
329
def find_latest_checkpoint(output_dir: str) -> "Optional[str]":
    """Return the path of the highest-generation ``evo_gen_<N>.pkl`` checkpoint.

    Looks under ``<output_dir>/checkpoints``; files whose generation number
    cannot be parsed are skipped with a warning.

    BUGFIX: the return annotation is a string — ``Optional`` is not imported
    at module level and an evaluated annotation would raise NameError at
    import time.

    Returns:
        Full path of the newest checkpoint, or None when the directory or any
        matching file is absent.
    """
    checkpoint_dir = os.path.join(output_dir, "checkpoints")
    if not os.path.isdir(checkpoint_dir):
        return None
    checkpoints = [f for f in os.listdir(checkpoint_dir) if f.startswith("evo_gen_") and f.endswith(".pkl")]
    if not checkpoints:
        return None
    latest_gen = -1
    latest_file = None
    for cp in checkpoints:
        try:
            # "evo_gen_<N>.pkl" -> N
            gen_num = int(cp.split('_')[2].split('.')[0])
        except (IndexError, ValueError):
            logging.warning(f"Could not parse generation number from checkpoint file: {cp}")
            continue
        if gen_num > latest_gen:
            latest_gen = gen_num
            latest_file = os.path.join(checkpoint_dir, cp)
    return latest_file
350
+
351
+
352
+ # --- Ana Evrim Döngüsü (Checkpoint ve Crossover ile) ---
353
def evolve_population_v3(population: List[Sequential], X: np.ndarray, y: np.ndarray, start_generation: int, total_generations: int,
                         crossover_rate: float, mutation_rate: float, weight_mut_rate: float, mut_strength: float,
                         tournament_size: int, elitism_count: int, batch_size: int,
                         output_dir: str, checkpoint_interval: int) -> "Tuple[Optional[Sequential], List[float], List[float]]":
    """Run the evolutionary loop (with checkpointing and crossover).

    Per generation: evaluate fitness, track the overall best, carry elites
    over, then fill the population via tournament selection followed by
    crossover (probability ``crossover_rate``), mutation (probability
    ``mutation_rate``), or plain cloning. Checkpoints are written every
    ``checkpoint_interval`` generations (0 disables them).

    BUGFIX: the return annotation is a string — ``Optional`` is not imported
    at module level; evaluating it would raise NameError at import time.

    Returns:
        (best_model_overall_or_None, best_fitness_history, avg_fitness_history)
    """
    best_fitness_history = []
    avg_fitness_history = []
    best_model_overall = None
    best_fitness_overall = -np.inf

    X_tf = tf.cast(X, tf.float32)
    y_tf = tf.cast(y, tf.float32)

    for gen in range(start_generation, total_generations):
        generation_start_time = datetime.now()
        # 1. Fitness evaluation
        try:
            fitness_scores = [calculate_fitness(ind, X_tf, y_tf, batch_size) for ind in population]
        except Exception as e:
            logging.critical(f"Error calculating fitness for population in Generation {gen+1}: {e}", exc_info=True)
            # Salvage the best model found so far rather than losing the run.
            if best_model_overall: return best_model_overall, best_fitness_history, avg_fitness_history
            else: raise

        # 2. Statistics and best-so-far tracking
        current_best_idx = np.argmax(fitness_scores)
        current_best_fitness = fitness_scores[current_best_idx]
        avg_fitness = np.mean(fitness_scores)
        best_fitness_history.append(current_best_fitness)
        avg_fitness_history.append(avg_fitness)

        new_best_found = False
        if current_best_fitness > best_fitness_overall:
            best_fitness_overall = current_best_fitness
            new_best_found = True
            try:
                # Keep an independent, compiled copy of the new champion.
                best_model_overall = clone_model(population[current_best_idx])
                best_model_overall.set_weights(population[current_best_idx].get_weights())
                best_model_overall.compile(optimizer=Adam(), loss='mse')
                logging.info(f"Generation {gen+1}: *** New overall best fitness found: {best_fitness_overall:.6f} ***")
            except Exception as e:
                logging.error(f"Could not clone new best model: {e}", exc_info=True)
                best_fitness_overall = current_best_fitness  # at least track the score

        generation_time = (datetime.now() - generation_start_time).total_seconds()
        logging.info(f"Generation {gen+1}/{total_generations} | Best Fitness: {current_best_fitness:.6f} | Avg Fitness: {avg_fitness:.6f} | Time: {generation_time:.2f}s")

        # 3. Build the next population
        new_population = []

        # 3a. Elitism: copy the top-k individuals over unchanged.
        if elitism_count > 0 and len(population) >= elitism_count:
            try:
                elite_indices = np.argsort(fitness_scores)[-elitism_count:]
                for idx in elite_indices:
                    elite_clone = clone_model(population[idx])
                    elite_clone.set_weights(population[idx].get_weights())
                    elite_clone.compile(optimizer=Adam(), loss='mse')
                    new_population.append(elite_clone)
            except Exception as e:
                logging.error(f"Error during elitism: {e}", exc_info=True)

        # 3b. Selection, crossover and mutation until the population is refilled.
        num_to_generate = len(population) - len(new_population)
        generated_count = 0
        while generated_count < num_to_generate:
            try:
                parent1 = tournament_selection(population, fitness_scores, tournament_size)
                parent2 = tournament_selection(population, fitness_scores, tournament_size)

                child1, child2 = None, None

                # Crossover with probability crossover_rate (distinct parents only).
                if random.random() < crossover_rate and parent1 is not parent2:
                    child1, child2 = crossover_individuals(parent1, parent2)

                # If crossover was skipped or failed, fall back to mutation/cloning.
                if child1 is None:
                    parent_to_mutate = parent1
                    if random.random() < mutation_rate:
                        child1 = mutate_individual(parent_to_mutate, weight_mut_rate, mut_strength)
                    else:
                        child1 = clone_model(parent_to_mutate); child1.set_weights(parent_to_mutate.get_weights())
                        child1.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
                        child1._name = f"cloned_{parent_to_mutate.name}_{random.randint(1000,9999)}"

                    if child1:
                        new_population.append(child1)
                        generated_count += 1
                        if generated_count >= num_to_generate: break

                else:
                    # Crossover succeeded: add both children (mutation after
                    # crossover could be added here as a further option).
                    new_population.append(child1)
                    generated_count += 1
                    if generated_count >= num_to_generate: break

                    if child2:
                        new_population.append(child2)
                        generated_count += 1
                        if generated_count >= num_to_generate: break

            except Exception as e:
                logging.error(f"Error during selection/reproduction cycle: {e}", exc_info=True)
                if generated_count < num_to_generate:
                    # Fill the gap with a fresh random individual.
                    logging.warning("Adding random individual due to reproduction error.")
                    new_population.append(create_individual(y.shape[1], X.shape[1:]))
                    generated_count += 1

        population = new_population[:len(population)]  # guarantee constant population size

        # 4. Checkpointing
        if checkpoint_interval > 0 and (gen + 1) % checkpoint_interval == 0:
            try:
                rnd_state = random.getstate()
                np_rnd_state = np.random.get_state()
                # NOTE(review): TF RNG state is not checkpointed (serialization is unreliable).
                tf_rnd_state = None
                save_checkpoint(output_dir, gen + 1, population, rnd_state, np_rnd_state, tf_rnd_state)
            except Exception as e:
                logging.error(f"Failed to execute checkpoint saving for generation {gen+1}: {e}", exc_info=True)

    # End of loop: make sure we return SOME model if the population survived.
    if best_model_overall is None and population:
        logging.warning("No overall best model tracked. Returning best from final population.")
        final_fitness_scores = [calculate_fitness(ind, X_tf, y_tf, batch_size) for ind in population]
        best_idx_final = np.argmax(final_fitness_scores)
        best_model_overall = population[best_idx_final]
    elif not population:
        logging.error("Evolution finished with an empty population!")
        return None, best_fitness_history, avg_fitness_history

    logging.info(f"Evolution finished. Best fitness achieved: {best_fitness_overall:.6f}")
    return best_model_overall, best_fitness_history, avg_fitness_history
512
+
513
+ # --- Grafik Çizimi (Öncekiyle aynı) ---
514
def plot_fitness_history(history_best: List[float], history_avg: List[float], output_dir: str) -> None:
    """Plot best/average fitness per generation and save it to fitness_history.png.

    Does nothing (beyond a warning) when either history list is empty; any
    plotting error is logged rather than raised.
    """
    if not history_best or not history_avg:
        logging.warning("Fitness history is empty, cannot plot.")
        return
    try:
        plt.figure(figsize=(12, 7))
        plt.plot(history_best, label="Best Fitness", marker='o', linestyle='-', linewidth=2)
        plt.plot(history_avg, label="Average Fitness", marker='x', linestyle='--', alpha=0.7)
        plt.xlabel("Generation")
        plt.ylabel("Fitness Score")
        plt.title("Evolutionary Fitness History")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plot_path = os.path.join(output_dir, "fitness_history.png")
        plt.savefig(plot_path)
        plt.close()
        logging.info(f"Fitness history plot saved to {plot_path}")
    except Exception as e:
        logging.error(f"Error plotting fitness history: {e}", exc_info=True)
525
+
526
+ # --- Değerlendirme (Öncekiyle aynı) ---
527
def evaluate_model(model: "Sequential", X_test: np.ndarray, y_test: np.ndarray, batch_size: int) -> Dict[str, float]:
    """Evaluate the final model on held-out data.

    Computes the test MSE plus the average Kendall's tau rank correlation
    between predicted and true sequences over a random sample of up to 500
    test rows.

    Returns:
        {"test_mse": float, "avg_kendall_tau": float}; on a None model or any
        evaluation error, {"test_mse": inf, "avg_kendall_tau": 0.0}.
    """
    if model is None:
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
    logging.info("Evaluating final model on test data...")
    try:
        y_pred = model.predict(X_test, batch_size=batch_size, verbose=0)
        test_mse = np.mean(np.square(y_test - y_pred))
        logging.info(f"Final Test MSE: {test_mse:.6f}")
        sample_size = min(500, X_test.shape[0])
        taus = []
        indices = np.random.choice(X_test.shape[0], sample_size, replace=False)
        for i in indices:
            # BUGFIX: the previous inline `try: ...` followed by an `if` line
            # and an `except` clause was a SyntaxError; a try suite cannot be
            # written on the header line ahead of its except.
            try:
                tau, _ = kendalltau(y_test[i], y_pred[i])
                if not np.isnan(tau):
                    taus.append(tau)
            except ValueError:
                pass  # constant predictions make tau undefined
        avg_kendall_tau = np.mean(taus) if taus else 0.0
        logging.info(f"Average Kendall's Tau (on {sample_size} samples): {avg_kendall_tau:.4f}")
        return {"test_mse": float(test_mse), "avg_kendall_tau": float(avg_kendall_tau)}
    except Exception as e:
        logging.error(f"Error during final model evaluation: {e}", exc_info=True)
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
545
+
546
+ # --- Ana İş Akışı (Checkpoint Yükleme ile) ---
547
def run_pipeline_v3(args: argparse.Namespace):
    """Main workflow with checkpointing and crossover.

    Creates (or resumes) a run directory, evolves a population, trains the
    best evolved model with backprop, evaluates it on held-out data, and
    writes all artifacts (config, fitness history, model, results JSON).
    """

    # Run name and output directory.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"evorun_{timestamp}_gen{args.generations}_pop{args.pop_size}"
    # If a resume path was given, reuse that directory.
    output_dir = args.resume_from if args.resume_from else os.path.join(args.output_base_dir, run_name)
    resume_run = bool(args.resume_from)
    if resume_run:
        run_name = os.path.basename(output_dir)  # reuse the existing folder name
        logging.info(f"Attempting to resume run from: {output_dir}")
    else:
        try:
            os.makedirs(output_dir, exist_ok=True)
        except OSError as e:
            print(f"FATAL: Could not create output directory: {output_dir}. Error: {e}", file=sys.stderr)
            sys.exit(1)

    # Logging is configured in append mode, suitable for resumed runs.
    setup_logging(output_dir)
    logging.info(f"========== Starting/Resuming EvoNet Pipeline Run: {run_name} ==========")
    logging.info(f"Output directory: {output_dir}")

    # --- Checkpoint loading ---
    start_generation = 0
    population = []
    initial_state_loaded = False
    latest_checkpoint_path = find_latest_checkpoint(output_dir) if resume_run else None

    if latest_checkpoint_path:
        loaded_state = load_checkpoint(latest_checkpoint_path)
        if loaded_state:
            start_generation = loaded_state['generation']  # continue from the saved generation
            population = loaded_state['population']
            # Restore RNG states so the run continues deterministically.
            try:
                random.setstate(loaded_state['random_state'])
                np.random.set_state(loaded_state['numpy_random_state'])
                # NOTE(review): TF RNG state restoration is skipped (serialization is unreliable).
                logging.info(f"Random states restored from checkpoint.")
            except Exception as e:
                logging.warning(f"Could not fully restore random states from checkpoint: {e}")
            initial_state_loaded = True
            logging.info(f"Resuming from Generation {start_generation + 1} with {len(population)} individuals.")
        else:
            logging.error("Failed to load checkpoint. Starting from scratch.")
            resume_run = False
    elif resume_run:
        logging.warning(f"Resume requested but no valid checkpoint found in {output_dir}. Starting from scratch.")
        resume_run = False

    # --- Fresh start vs resumed-run setup ---
    if not initial_state_loaded:
        # Log and persist configuration (fresh runs only).
        logging.info("--- Configuration ---")
        args_dict = vars(args)
        for k, v in args_dict.items():
            logging.info(f" {k:<20}: {v}")
        logging.info("---------------------")
        config_path = os.path.join(output_dir, "config.json")
        try:
            with open(config_path, 'w') as f:
                json.dump(args_dict, f, indent=4, sort_keys=True)
            logging.info(f"Configuration saved to {config_path}")
        except Exception as e:
            logging.error(f"Failed to save configuration: {e}", exc_info=True)

        # Seed all RNGs for reproducibility.
        try:
            random.seed(args.seed)
            np.random.seed(args.seed)
            tf.random.set_seed(args.seed)
            logging.info(f"Using random seed: {args.seed}")
        except Exception as e:
            logging.warning(f"Could not set all random seeds: {e}")

        is_gpu_available = check_gpu()

        # Data generation.
        try:
            X_train, y_train = generate_data(args.train_samples, args.seq_length)
            X_test, y_test = generate_data(args.test_samples, args.seq_length)
            input_shape = X_train.shape[1:]
        except Exception:
            logging.critical("Failed to generate data. Exiting.")
            sys.exit(1)

        # Population initialization.
        logging.info(f"--- Initializing Population (Size: {args.pop_size}) ---")
        try:
            population = [create_individual(args.seq_length, input_shape) for _ in range(args.pop_size)]
            logging.info("Population initialized successfully.")
        except Exception:
            logging.critical("Failed to initialize population. Exiting.")
            sys.exit(1)
    else:
        # Resumed run: the dataset is not stored in checkpoints, so
        # regenerate it (checkpointing it would make the files large).
        logging.info("Reloading data for resumed run...")
        is_gpu_available = check_gpu()
        try:
            X_train, y_train = generate_data(args.train_samples, args.seq_length)
            X_test, y_test = generate_data(args.test_samples, args.seq_length)
        except Exception:
            logging.critical("Failed to reload data for resumed run. Exiting.")
            sys.exit(1)
        # Re-read the stored config for logging and result reporting.
        config_path = os.path.join(output_dir, "config.json")
        try:
            with open(config_path, 'r') as f:
                args_dict = json.load(f)
            logging.info("--- Loaded Configuration (from resumed run) ---")
            for k, v in args_dict.items():
                logging.info(f" {k:<20}: {v}")
            logging.info("-----------------------------------------------")
        except Exception as e:
            logging.warning(f"Could not reload config.json: {e}")
            args_dict = vars(args)  # fall back to the CLI arguments

    # --- Evolution ---
    logging.info(f"--- Starting/Resuming Evolution ({args.generations} Total Generations) ---")
    if start_generation >= args.generations:
        logging.warning(f"Loaded checkpoint generation ({start_generation}) is already >= total generations ({args.generations}). Skipping evolution.")
        # TODO: persist/restore the true best model and fitness history in checkpoints.
        best_model_unevolved = population[0] if population else None
        best_fitness_hist, avg_fitness_hist = [], []
    else:
        try:
            best_model_unevolved, best_fitness_hist, avg_fitness_hist = evolve_population_v3(
                population, X_train, y_train, start_generation, args.generations,
                args.crossover_rate, args.mutation_rate, args.weight_mut_rate, args.mutation_strength,
                args.tournament_size, args.elitism_count, args.batch_size,
                output_dir, args.checkpoint_interval
            )
        except Exception as e:
            logging.critical(f"Fatal error during evolution process: {e}", exc_info=True)
            sys.exit(1)
    logging.info("--- Evolution Complete ---")

    # Save / plot the fitness history of THIS session.
    if best_fitness_hist or avg_fitness_hist:
        # TODO: merge with history loaded from the checkpoint, if any.
        plot_fitness_history(best_fitness_hist, avg_fitness_hist, output_dir)
        history_path = os.path.join(output_dir, "fitness_history_run.csv")
        try:
            history_data = np.array([np.arange(start_generation + 1, start_generation + len(best_fitness_hist) + 1), best_fitness_hist, avg_fitness_hist]).T
            np.savetxt(history_path, history_data, delimiter=',', header='Generation,BestFitness,AvgFitness', comments='', fmt=['%d', '%.8f', '%.8f'])
            logging.info(f"Fitness history (this run) saved to {history_path}")
        except Exception as e:
            logging.error(f"Could not save fitness history data: {e}")
    else:
        logging.warning("Fitness history is empty, skipping saving/plotting.")

    # Final backprop training of the best evolved model, then evaluation.
    if best_model_unevolved is None:
        logging.error("Evolution did not yield a best model. Skipping final training and evaluation.")
        final_metrics = {"test_mse": np.inf, "avg_kendall_tau": 0.0}
        final_model_path = None
        training_summary = {}
    else:
        logging.info("--- Starting Final Training of Best Evolved Model ---")
        try:
            final_model = clone_model(best_model_unevolved)
            final_model.set_weights(best_model_unevolved.get_weights())
            final_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
            logging.info("Model Summary of Best Evolved (Untrained):")
            final_model.summary(print_fn=logging.info)
            early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1)
            reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=7, min_lr=1e-7, verbose=1)
            history = final_model.fit(X_train, y_train, epochs=args.epochs_final_train, batch_size=args.batch_size, validation_split=0.2, callbacks=[early_stopping, reduce_lr], verbose=2)
            logging.info("Final training complete.")
            training_summary = {"epochs_run": len(history.history['loss']), "final_train_loss": history.history['loss'][-1], "final_val_loss": history.history['val_loss'][-1]}
            final_metrics = evaluate_model(final_model, X_test, y_test, args.batch_size)
            final_model_path = os.path.join(output_dir, "best_evolved_model_trained.keras")
            final_model.save(final_model_path)
            logging.info(f"Final trained model saved to {final_model_path}")
        except Exception as e:
            logging.error(f"Error during final training or evaluation: {e}", exc_info=True)
            final_metrics = {"test_mse": np.inf, "avg_kendall_tau": 0.0}
            final_model_path = None
            training_summary = {"error": str(e)}

    logging.info("--- Saving Final Results ---")
    # BUGFIX: the original referenced `best_fitness_overall`, a local variable
    # of evolve_population_v3 that does not exist in this scope (NameError at
    # runtime). Derive the overall best from this session's history instead.
    best_fitness_achieved = max(best_fitness_hist) if best_fitness_hist else None
    final_results = {
        "run_info": {"run_name": run_name, "timestamp": timestamp, "output_directory": output_dir, "gpu_used": is_gpu_available, "resumed": resume_run},
        "config": args_dict,
        "evolution_summary": {  # TODO: merge with history loaded from the checkpoint
            "generations_run_this_session": len(best_fitness_hist) if best_fitness_hist else 0,
            "best_fitness_achieved_overall": best_fitness_achieved,
            "best_fitness_final_gen": best_fitness_hist[-1] if best_fitness_hist else None,
            "avg_fitness_final_gen": avg_fitness_hist[-1] if avg_fitness_hist else None, },
        "final_training_summary": training_summary,
        "final_evaluation_on_test": final_metrics,
        "saved_model_path": final_model_path }
    results_path = os.path.join(output_dir, "final_results.json")
    try:
        def convert_numpy_types(obj):
            # json cannot serialize numpy scalars/arrays natively.
            if isinstance(obj, np.integer): return int(obj)
            elif isinstance(obj, np.floating): return float(obj)
            elif isinstance(obj, np.ndarray): return obj.tolist()
            return obj
        with open(results_path, 'w') as f:
            json.dump(final_results, f, indent=4, default=convert_numpy_types)
        logging.info(f"Final results summary saved to {results_path}")
    except Exception as e:
        logging.error(f"Failed to save final results JSON: {e}", exc_info=True)

    logging.info(f"========== Pipeline Run {run_name} Finished ==========")
733
+
734
+
735
+ # --- Argüman Ayrıştırıcı (Yeni Argümanlar Eklendi) ---
736
def parse_arguments_v3() -> argparse.Namespace:
    """Define and parse the CLI for EvoNet v3; draws a random seed when none is given."""
    parser = argparse.ArgumentParser(description="EvoNet v3: Neuroevolution with Crossover & Checkpointing")

    # Directories and run control
    parser.add_argument('--output_base_dir', type=str, default=DEFAULT_OUTPUT_BASE_DIR, help='Base directory for new runs.')
    parser.add_argument('--resume_from', type=str, default=None, help='Path to a previous run directory to resume from.')
    parser.add_argument('--checkpoint_interval', type=int, default=DEFAULT_CHECKPOINT_INTERVAL, help='Save checkpoint every N generations (0 to disable).')

    # Data settings
    parser.add_argument('--seq_length', type=int, default=DEFAULT_SEQ_LENGTH, help='Length of sequences.')
    parser.add_argument('--train_samples', type=int, default=5000, help='Number of training samples.')
    parser.add_argument('--test_samples', type=int, default=1000, help='Number of test samples.')

    # Evolution hyper-parameters
    parser.add_argument('--pop_size', type=int, default=DEFAULT_POP_SIZE, help='Population size.')
    parser.add_argument('--generations', type=int, default=DEFAULT_GENERATIONS, help='Total number of generations.')
    parser.add_argument('--crossover_rate', type=float, default=DEFAULT_CROSSOVER_RATE, help='Probability of applying crossover.')
    parser.add_argument('--mutation_rate', type=float, default=DEFAULT_MUTATION_RATE, help='Probability of applying mutation (if crossover is not applied).')
    parser.add_argument('--weight_mut_rate', type=float, default=DEFAULT_WEIGHT_MUT_RATE, help='Weight mutation probability within mutation.')
    parser.add_argument('--mutation_strength', type=float, default=DEFAULT_MUTATION_STRENGTH, help='Std dev for weight mutation noise.')
    parser.add_argument('--tournament_size', type=int, default=DEFAULT_TOURNAMENT_SIZE, help='Tournament selection size.')
    parser.add_argument('--elitism_count', type=int, default=DEFAULT_ELITISM_COUNT, help='Number of elite individuals.')

    # Training and evaluation
    parser.add_argument('--batch_size', type=int, default=DEFAULT_BATCH_SIZE, help='Batch size.')
    parser.add_argument('--epochs_final_train', type=int, default=DEFAULT_EPOCHS_FINAL_TRAIN, help='Max epochs for final training.')

    # Reproducibility
    parser.add_argument('--seed', type=int, default=None, help='Random seed (default: random).')

    args = parser.parse_args()
    if args.seed is None:
        # No seed given: draw one so the run is still reproducible afterwards.
        args.seed = random.randint(0, 2**32 - 1)
        print(f"Generated random seed: {args.seed}")
    return args
772
+
773
+
774
+ # --- Ana Çalıştırma Bloğu ---
775
if __name__ == "__main__":
    # Entry point: parse CLI arguments, run the pipeline, and report any
    # unhandled failure before exiting with a non-zero status.
    cli_args = parse_arguments_v3()
    try:
        run_pipeline_v3(cli_args)
    except SystemExit:
        pass  # deliberate sys.exit() inside the pipeline is not an error here
    except Exception as e:
        print(f"\nFATAL UNHANDLED ERROR in main execution block: {e}", file=sys.stderr)
        if logging.getLogger().hasHandlers():
            logging.critical("FATAL UNHANDLED ERROR:", exc_info=True)
        else:
            import traceback
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(1)
v4.py ADDED
@@ -0,0 +1,1327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # EvoNet Optimizer - v4 - PyTorch Tabanlı Geliştirilmiş Sürüm
3
+ # Açıklama: TensorFlow'dan PyTorch'a geçiş yapılmış, modern PyTorch
4
+ # pratikleri kullanılmış, esneklik artırılmış, kod kalitesi
5
+ # iyileştirilmiş ve PyTorch ekosistemine uygun hale getirilmiştir.
6
+ # Çaprazlama, Kontrol Noktası, Adaptif Mutasyon (kavramsal) ve
7
+ # Gelişmiş Fitness (kavramsal) özellikleri korunmuştur.
8
+ # ==============================================================================
9
+
10
+ import os
11
+ import subprocess
12
+ import sys
13
+ import argparse
14
+ import random
15
+ import logging
16
+ from datetime import datetime
17
+ import json
18
+ import copy # Model klonlama ve durum dikteleri için
19
+ import time
20
+ from typing import List, Tuple, Dict, Any, Optional, Union
21
+
22
+ import numpy as np
23
+ import torch
24
+ import torch.nn as nn
25
+ import torch.optim as optim
26
+ from torch.utils.data import TensorDataset, DataLoader
27
+ import matplotlib.pyplot as plt
28
+ from scipy.stats import kendalltau # Hala numpy/scipy kullanıyoruz
29
+
30
+ # --- Sabitler ve Varsayılan Değerler ---
31
+ DEFAULT_SEQ_LENGTH = 10
32
+ DEFAULT_POP_SIZE = 50
33
+ DEFAULT_GENERATIONS = 50
34
+ DEFAULT_CROSSOVER_RATE = 0.6
35
+ DEFAULT_MUTATION_RATE = 0.4 # Eğer çaprazlama olmazsa mutasyon olasılığı
36
+ DEFAULT_WEIGHT_MUT_RATE = 0.8 # Ağırlık mutasyonu olasılığı (mutasyon içinde)
37
+ # Aktivasyon mutasyonu PyTorch'ta daha farklı ele alınmalı, şimdilik odak ağırlıkta.
38
+ DEFAULT_MUTATION_STRENGTH = 0.1
39
+ DEFAULT_TOURNAMENT_SIZE = 5
40
+ DEFAULT_ELITISM_COUNT = 2
41
+ DEFAULT_EPOCHS_FINAL_TRAIN = 100
42
+ DEFAULT_BATCH_SIZE = 64
43
+ DEFAULT_OUTPUT_BASE_DIR = os.path.join(os.getcwd(), "evonet_runs_v4_pytorch")
44
+ DEFAULT_CHECKPOINT_INTERVAL = 10 # Nesil başına checkpoint (0 = kapalı)
45
+ DEFAULT_DEVICE = "auto" # "auto", "cpu", "cuda"
46
+
47
+ # --- Loglama Ayarları ---
48
+ # (setup_logging fonksiyonu öncekiyle aynı, tekrar eklemiyorum)
49
def setup_logging(log_dir: str, log_level=logging.INFO) -> None:
    """Configure root logging to write to a file in `log_dir` and to stdout.

    Any pre-existing root handlers are closed and removed first so that
    repeated calls (e.g. re-runs in the same interpreter) do not duplicate
    log output.
    """
    log_filename = os.path.join(log_dir, 'evolution_run_pytorch.log')
    # Drop stale handlers before reconfiguring (important on re-runs).
    for old_handler in list(logging.root.handlers):
        old_handler.close()
        logging.root.removeHandler(old_handler)
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)-8s [%(filename)s:%(lineno)d] - %(message)s',
        handlers=[
            logging.FileHandler(log_filename, mode='a'),  # append across runs
            logging.StreamHandler(sys.stdout),
        ],
    )
    banner = "=" * 50
    logging.info(banner)
    logging.info("PyTorch EvoNet v4 Logging Başlatıldı.")
    logging.info(banner)
66
+
67
+ # --- Cihaz (GPU/CPU) Ayarları ---
68
def setup_device(requested_device: str) -> torch.device:
    """Resolve `requested_device` ("auto" / "cpu" / "cuda") to a torch.device.

    "auto" prefers CUDA when available; requesting "cuda" without a GPU
    falls back to CPU with a warning; anything else yields CPU.
    """
    cuda_available = torch.cuda.is_available()
    if requested_device == "auto":
        if cuda_available:
            device_name = "cuda"
            logging.info(f"CUDA (GPU) kullanılabilir: {torch.cuda.get_device_name(0)}")
        else:
            device_name = "cpu"
            logging.info("CUDA (GPU) bulunamadı. CPU kullanılacak.")
    elif requested_device == "cuda":
        if cuda_available:
            device_name = "cuda"
            logging.info(f"CUDA (GPU) manuel olarak seçildi: {torch.cuda.get_device_name(0)}")
        else:
            logging.warning("CUDA (GPU) istendi ancak bulunamadı! CPU kullanılacak.")
            device_name = "cpu"
    else:
        # Explicit "cpu" or any unrecognised string falls back to CPU.
        device_name = "cpu"
        logging.info("CPU manuel olarak seçildi veya geçersiz cihaz belirtildi.")
    return torch.device(device_name)
89
+
90
+ # --- Veri Üretimi ---
91
+ # (generate_data fonksiyonu öncekiyle aynı, NumPy tabanlı)
92
def generate_data(num_samples: int, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Create a random sort-task dataset.

    Returns (X, y) where X holds uniform random float32 values in [0, 100)
    and y is X sorted row-wise; both have shape (num_samples, seq_length).
    Raises (after logging) if array construction fails.
    """
    logging.info(f"Generating {num_samples} samples with sequence length {seq_length}...")
    try:
        # float32 keeps the data cheap to convert to PyTorch tensors later.
        inputs = np.random.rand(num_samples, seq_length).astype(np.float32) * 100
        targets = np.sort(inputs, axis=1).astype(np.float32)
        logging.info("Data generation successful.")
        return inputs, targets
    except Exception as e:
        logging.error(f"Error during data generation: {e}", exc_info=True)
        raise
103
+
104
+ # --- PyTorch Sinir Ağı Modeli ---
105
class NeuralNetwork(nn.Module):
    """Simple MLP whose depth, layer widths and activations are configurable.

    Architecture metadata (sizes, activation names) is kept alongside the
    layers so a model can be rebuilt from a checkpoint via get_architecture().
    Equality and hashing are architectural, not weight-level.
    """

    def __init__(self, input_size: int, output_size: int, hidden_dims: List[int], activations: List[str]):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_dims = hidden_dims
        # Raw activation names are preserved for checkpointing / rebuilds.
        self.activations_str = activations

        modules: List[nn.Module] = []
        prev_dim = input_size
        for idx, width in enumerate(hidden_dims):
            modules.append(nn.Linear(prev_dim, width))
            modules.append(self._make_activation(activations[idx]))
            prev_dim = width
        # Output head stays linear (regression target).
        modules.append(nn.Linear(prev_dim, output_size))
        self.network = nn.Sequential(*modules)

        self.architecture_id = self._generate_architecture_id()
        # Human-readable identifier, used only in logs.
        self.model_name = f"model_{self.architecture_id}_rnd{random.randint(10000, 99999)}"

    @staticmethod
    def _make_activation(name: str) -> nn.Module:
        """Map an activation name to its layer; unknown names fall back to ReLU."""
        lowered = name.lower()
        if lowered == 'relu':
            return nn.ReLU()
        if lowered == 'tanh':
            return nn.Tanh()
        if lowered == 'sigmoid':
            return nn.Sigmoid()
        logging.warning(f"Bilinmeyen aktivasyon '{name}', ReLU kullanılıyor.")
        return nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the stacked layers to `x`."""
        return self.network(x)

    def get_architecture(self) -> Dict[str, Any]:
        """Return the constructor kwargs needed to rebuild this model."""
        return {
            "input_size": self.input_size,
            "output_size": self.output_size,
            "hidden_dims": self.hidden_dims,
            "activations": self.activations_str,
        }

    def _generate_architecture_id(self) -> str:
        """Compact architecture fingerprint, e.g. 'I10_H32_16_ART_O10'."""
        widths = '_'.join(map(str, self.hidden_dims))
        act_initials = ''.join(a[0].upper() for a in self.activations_str)
        return f"I{self.input_size}_H{widths}_A{act_initials}_O{self.output_size}"

    def __eq__(self, other):
        # Two models are "equal" when their architectures match exactly.
        if not isinstance(other, NeuralNetwork):
            return NotImplemented
        return self.get_architecture() == other.get_architecture()

    def __hash__(self):
        # Hash the immutable view of the architecture, consistent with __eq__.
        return hash((
            self.input_size,
            self.output_size,
            tuple(self.hidden_dims),
            tuple(self.activations_str),
        ))
172
+
173
+
174
+ # --- Neuroevolution Çekirdeği (PyTorch) ---
175
+
176
def create_individual_pytorch(input_size: int, output_size: int) -> "NeuralNetwork":
    """Build a NeuralNetwork with a randomly sampled architecture.

    Depth is 1-4 hidden layers, widths 16-128, activations drawn from
    {relu, tanh, sigmoid}. Weights keep PyTorch's default initialisation,
    so no explicit compile/init step is needed.
    """
    try:
        depth = random.randint(1, 4)
        widths = [random.randint(16, 128) for _ in range(depth)]
        acts = [random.choice(['relu', 'tanh', 'sigmoid']) for _ in range(depth)]
        model = NeuralNetwork(input_size, output_size, widths, acts)
        logging.debug(f"Created individual: {model.model_name}")
        return model
    except Exception as e:
        logging.error(f"Error creating PyTorch individual model: {e}", exc_info=True)
        raise
191
+
192
+ # PyTorch için model kopyalama işlevi
193
def clone_pytorch_model(model: "NeuralNetwork", device: torch.device) -> "NeuralNetwork":
    """Deep-copy a model: same architecture and weights, placed on `device`.

    The clone gets a fresh 'cloned_*' model_name; errors are logged and
    re-raised.
    """
    try:
        # Rebuild from architecture metadata, then copy the weights over.
        cloned_model = NeuralNetwork(**model.get_architecture())
        cloned_model.load_state_dict(copy.deepcopy(model.state_dict()))
        cloned_model.to(device)
        cloned_model.model_name = f"cloned_{model.model_name}_{random.randint(1000,9999)}"
        logging.debug(f"Cloned model {model.model_name} to {cloned_model.model_name}")
        return cloned_model
    except Exception as e:
        logging.error(f"Error cloning PyTorch model {model.model_name}: {e}", exc_info=True)
        raise
209
+
210
def calculate_fitness_pytorch(
    individual: "NeuralNetwork",
    X: torch.Tensor,
    y: torch.Tensor,
    device: torch.device,
    fitness_params: Optional[Dict] = None
) -> float:
    """Score an individual on (X, y); higher is better.

    Fitness is the inverse MSE of the model's predictions on X. Any
    numerical problem (non-finite loss) or runtime error yields the
    sentinel -1e9 so selection effectively eliminates the individual.
    `fitness_params` is reserved for future multi-objective scoring
    (e.g. Kendall-tau / complexity penalties) and is currently unused.
    """
    individual.eval()  # inference mode: disables dropout etc.
    individual.to(device)
    X, y = X.to(device), y.to(device)

    try:
        with torch.no_grad():  # pure inference, no gradients needed
            predictions = individual(X)
            mse_val = torch.mean((predictions - y) ** 2).item()

        if not np.isfinite(mse_val):
            logging.warning(f"Non-finite MSE ({mse_val}) for model {individual.model_name}. Assigning minimal fitness.")
            return -1e9

        # Inverse MSE; the epsilon guards against division by zero.
        fitness_score = 1.0 / (mse_val + 1e-9)

        if not np.isfinite(fitness_score) or fitness_score < -1e8:
            logging.warning(f"Non-finite or very low final fitness ({fitness_score:.4g}) for model {individual.model_name}. Assigning minimal fitness.")
            return -1e9

        return float(fitness_score)

    except Exception as e:
        logging.error(f"Error during fitness calculation for model {individual.model_name}: {e}", exc_info=True)
        return -1e9
273
+
274
+
275
def mutate_individual_pytorch(
    individual: "NeuralNetwork",
    weight_mut_rate: float,  # per-tensor probability of being perturbed
    mutation_strength: float,
    device: torch.device
) -> "NeuralNetwork":
    """Return a mutated clone of `individual`; the original is never touched.

    Each floating-point tensor in the clone's state dict is, with
    probability `weight_mut_rate`, perturbed by Gaussian noise with standard
    deviation `mutation_strength`. On any error a clean clone is returned so
    the evolutionary loop can continue.

    Bug fix: entries returned by `state_dict()` are detached tensors, so
    `param.requires_grad` is always False there — the previous check meant
    no weight was EVER mutated. Tensors are now selected by floating-point
    dtype instead.
    """
    try:
        # Clone first so the parent stays intact.
        mutated_model = clone_pytorch_model(individual, device)
        mutated_model.model_name = f"mutated_{individual.model_name}_{random.randint(1000,9999)}"

        mutated = False
        state_dict = mutated_model.state_dict()
        new_state_dict = copy.deepcopy(state_dict)  # work on a private copy

        for name, param in new_state_dict.items():
            # Perturb only float tensors (weights/biases); skip any integer
            # buffers. state_dict tensors are detached, so dtype is the
            # reliable selector here, not requires_grad.
            if param.is_floating_point() and random.random() < weight_mut_rate:
                mutated = True
                noise = torch.randn_like(param) * mutation_strength
                new_state_dict[name] = param + noise.to(param.device)

        if mutated:
            mutated_model.load_state_dict(new_state_dict)
            logging.debug(f"Mutated model {individual.model_name} -> {mutated_model.model_name}")
            return mutated_model
        else:
            # No tensor was selected by the rate; still return a distinct
            # (renamed) clone so callers always get a new object.
            logging.debug(f"Mutation applied to {individual.model_name}, but no weights changed based on rate.")
            return mutated_model

    except Exception as e:
        logging.error(f"Error during PyTorch mutation of model {individual.model_name}: {e}", exc_info=True)
        # Fail safe: hand back an unmodified clone rather than crashing.
        return clone_pytorch_model(individual, device)
317
+
318
+
319
def check_architecture_compatibility_pytorch(model1: "NeuralNetwork", model2: "NeuralNetwork") -> bool:
    """Return True when both models share an identical architecture.

    This is the precondition for the simple weight-level crossover used here.
    """
    return model1.get_architecture() == model2.get_architecture()
323
+
324
+
325
def crossover_individuals_pytorch(
    parent1: "NeuralNetwork",
    parent2: "NeuralNetwork",
    device: torch.device
) -> Tuple[Optional["NeuralNetwork"], Optional["NeuralNetwork"]]:
    """Produce two children via uniform weight crossover of two parents.

    Requires identical parent architectures. Returns (None, None) when the
    parents are incompatible or any error occurs during recombination.
    """
    # Crossover is only defined between structurally identical networks.
    if not check_architecture_compatibility_pytorch(parent1, parent2):
        logging.debug(f"Skipping crossover between {parent1.model_name} and {parent2.model_name} due to incompatible architectures.")
        return None, None

    try:
        # Fresh child instances sharing the parents' (common) architecture.
        arch = parent1.get_architecture()
        child1 = NeuralNetwork(**arch).to(device)
        child2 = NeuralNetwork(**arch).to(device)
        child1.model_name = f"xover_{parent1.architecture_id}_c1_{random.randint(1000,9999)}"
        child2.model_name = f"xover_{parent1.architecture_id}_c2_{random.randint(1000,9999)}"

        p1_state = parent1.state_dict()
        p2_state = parent2.state_dict()
        c1_state = child1.state_dict()
        c2_state = child2.state_dict()

        for name in p1_state:
            param1 = p1_state[name]
            param2 = p2_state[name]
            # Uniform crossover: each weight is drawn from one parent at
            # random; the second child receives the complementary choice.
            mask = torch.rand_like(param1) < 0.5
            c1_state[name] = torch.where(mask, param1, param2)
            c2_state[name] = torch.where(mask, param2, param1)

        child1.load_state_dict(c1_state)
        child2.load_state_dict(c2_state)

        logging.debug(f"Crossover performed between {parent1.model_name} and {parent2.model_name}")
        return child1, child2

    except Exception as e:
        logging.error(f"Error during PyTorch crossover between {parent1.model_name} and {parent2.model_name}: {e}", exc_info=True)
        return None, None
377
+
378
+ # (tournament_selection fonksiyonu öncekiyle aynı mantıkta çalışır, sadece model yerine
379
+ # NeuralNetwork objesini döndürür)
380
def tournament_selection(
    population: List["NeuralNetwork"],
    fitness_scores: List[float],
    k: int
) -> "NeuralNetwork":
    """Return the fittest of `k` individuals sampled at random (no replacement).

    `k` is clamped into [1, len(population)]; an empty population raises
    ValueError. On unexpected errors a random individual is returned.
    """
    if not population:
        raise ValueError("Population cannot be empty for tournament selection.")
    if len(population) < k:
        logging.warning(f"Tournament size ({k}) is larger than population size ({len(population)}). Using population size.")
        k = len(population)
    if k <= 0:
        logging.warning(f"Tournament size ({k}) must be positive. Using 1.")
        k = 1

    try:
        # Sample k distinct indices, then keep the one with the best score.
        contender_indices = random.sample(range(len(population)), k)
        winner_idx = max(contender_indices, key=lambda i: fitness_scores[i])
        return population[winner_idx]
    except Exception as e:
        logging.error(f"Error during tournament selection: {e}", exc_info=True)
        # Degrade gracefully instead of aborting the generation.
        return random.choice(population)
407
+
408
+
409
+ # --- Checkpointing (PyTorch) ---
410
def save_checkpoint_pytorch(output_dir: str, generation: int, population: List["NeuralNetwork"], rnd_state: Any, np_rnd_state: Any, torch_rnd_state: Any):
    """Persist the full evolution state for `generation` as a .pt file.

    Stores each model's name, architecture and state_dict plus the Python,
    NumPy and PyTorch RNG states. Models that fail to serialise are skipped
    with an error log rather than aborting the whole checkpoint.
    """
    checkpoint_dir = os.path.join(output_dir, "checkpoints_pytorch")
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_file = os.path.join(checkpoint_dir, f"evo_gen_{generation}.pt")
    logging.info(f"Saving checkpoint for generation {generation} to {checkpoint_file}...")

    population_state = []
    for model in population:
        try:
            entry = {
                "name": model.model_name,
                "architecture": model.get_architecture(),
                "state_dict": model.state_dict()
            }
        except Exception as e:
            logging.error(f"Could not serialize model {model.model_name} for checkpoint: {e}")
            continue  # skip this model, keep the rest
        population_state.append(entry)

    state = {
        "generation": generation,
        "population_state": population_state,  # only successfully serialised models
        "random_state": rnd_state,
        "numpy_random_state": np_rnd_state,
        "torch_random_state": torch_rnd_state,
        "timestamp": datetime.now().isoformat()
    }

    try:
        torch.save(state, checkpoint_file)
    except Exception as e:
        logging.error(f"Failed to save checkpoint using torch.save for generation {generation}: {e}", exc_info=True)
    else:
        logging.info(f"Checkpoint saved successfully for generation {generation}.")
444
+
445
+
446
def load_checkpoint_pytorch(checkpoint_path: str, device: torch.device) -> Optional[Dict]:
    """Load a saved evolution state and rebuild its model population.

    Returns the checkpoint dict with an extra "population" key holding the
    reconstructed NeuralNetwork instances (moved to `device`, in eval mode),
    or None when the file is missing, unreadable, or no model could be
    rebuilt from it.
    """
    if not os.path.exists(checkpoint_path):
        logging.error(f"Checkpoint file not found: {checkpoint_path}")
        return None
    logging.info(f"Loading checkpoint from {checkpoint_path}...")

    try:
        # Load onto CPU first, then move models to the target device.
        # weights_only=False is required: the checkpoint stores arbitrary
        # Python objects (RNG states, architecture dicts), and since
        # PyTorch 2.6 torch.load defaults to weights_only=True, which would
        # reject these files. Only load checkpoints from trusted sources.
        # NOTE(review): the weights_only kwarg needs torch >= 1.13 — confirm.
        checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'), weights_only=False)

        population = []
        for model_state in checkpoint["population_state"]:
            try:
                # 1. Rebuild the model from its stored architecture.
                arch = model_state["architecture"]
                model = NeuralNetwork(**arch)
                # 2. Restore the saved weights.
                model.load_state_dict(model_state["state_dict"])
                # 3. Move to the requested device.
                model.to(device)
                # 4. Restore the name (fall back to a generated one).
                model.model_name = model_state.get("name", f"loaded_model_{random.randint(1000,9999)}")
                model.eval()  # start in evaluation mode
                population.append(model)
            except Exception as e:
                logging.error(f"Failed to load model state from checkpoint for model {model_state.get('name', 'UNKNOWN')}: {e}", exc_info=True)

        if not population:
            logging.error("Failed to load any model from the checkpoint population state.")
            return None  # checkpoint is useless without any model

        checkpoint["population"] = population

        logging.info(f"Checkpoint loaded successfully. Resuming from generation {checkpoint['generation'] + 1}.")
        return checkpoint
    except Exception as e:
        logging.error(f"Failed to load checkpoint from {checkpoint_path}: {e}", exc_info=True)
        return None
486
+
487
def find_latest_checkpoint_pytorch(output_dir: str) -> Optional[str]:
    """Return the path of the highest-generation checkpoint (.pt), or None.

    Looks in `output_dir`/checkpoints_pytorch for files named
    'evo_gen_<N>.pt'; unparsable names are skipped with a warning.
    """
    checkpoint_dir = os.path.join(output_dir, "checkpoints_pytorch")
    if not os.path.isdir(checkpoint_dir):
        return None

    latest_gen = -1
    latest_file = None
    for cp in os.listdir(checkpoint_dir):
        if not (cp.startswith("evo_gen_") and cp.endswith(".pt")):
            continue
        try:
            # 'evo_gen_12.pt' -> 12
            gen_num = int(cp.split('_')[2].split('.')[0])
        except (IndexError, ValueError):
            logging.warning(f"Could not parse generation number from checkpoint file: {cp}")
            continue
        if gen_num > latest_gen:
            latest_gen = gen_num
            latest_file = os.path.join(checkpoint_dir, cp)
    return latest_file
508
+
509
+
510
+ # --- Ana Evrim Döngüsü (PyTorch) ---
511
+ def evolve_population_pytorch(
512
+ population: List[NeuralNetwork],
513
+ X: np.ndarray, y: np.ndarray, # Veri hala NumPy olarak geliyor
514
+ start_generation: int, total_generations: int,
515
+ crossover_rate: float, mutation_rate: float, weight_mut_rate: float, mut_strength: float,
516
+ tournament_size: int, elitism_count: int, batch_size: int, # batch_size fitness'ta kullanılmıyor şu an
517
+ output_dir: str, checkpoint_interval: int, device: torch.device
518
+ ) -> Tuple[Optional[NeuralNetwork], List[float], List[float]]:
519
+ """ PyTorch tabanlı evrimsel süreci çalıştırır. """
520
+
521
+ best_fitness_history = []
522
+ avg_fitness_history = []
523
+ best_model_overall: Optional[NeuralNetwork] = None
524
+ best_fitness_overall = -np.inf
525
+
526
+ # Veriyi PyTorch tensörlerine dönüştür ve cihaza gönder (bir kere)
527
+ # Büyük veri setleri için DataLoader düşünülebilir, ancak burada basit tutuyoruz
528
+ try:
529
+ X_torch = torch.from_numpy(X).float().to(device)
530
+ y_torch = torch.from_numpy(y).float().to(device)
531
+ except Exception as e:
532
+ logging.critical(f"Failed to convert data to PyTorch tensors or move to device: {e}", exc_info=True)
533
+ raise
534
+
535
+ # --- KAVRAMSAL: Uyarlanabilir Mutasyon Oranı (Adaptif Parametreler) ---
536
+ # current_mutation_strength = mut_strength
537
+ # stagnation_counter = 0
538
+ # stagnation_limit = 10 # Örneğin, 10 nesil iyileşme olmazsa...
539
+ # min_mut_strength = 0.01
540
+ # max_mut_strength = 0.5
541
+ # --------------------------------------------
542
+
543
+ pop_size = len(population)
544
+
545
+ for gen in range(start_generation, total_generations):
546
+ generation_start_time = time.time()
547
+
548
+ # 1. Fitness Değerlendirme
549
+ try:
550
+ # Paralelleştirme potansiyeli (eğer fitness hesaplama çok uzun sürüyorsa)
551
+ # Örnek: concurrent.futures kullanarak
552
+ fitness_scores = [calculate_fitness_pytorch(ind, X_torch, y_torch, device) for ind in population]
553
+ except Exception as e:
554
+ logging.critical(f"Error calculating fitness for population in Generation {gen+1}: {e}", exc_info=True)
555
+ # Hata durumunda en iyi modeli döndürmeye çalış
556
+ if best_model_overall:
557
+ return best_model_overall, best_fitness_history, avg_fitness_history
558
+ else:
559
+ raise # Eğer hiç en iyi model yoksa, hata ver
560
+
561
+ # 2. İstatistikler ve En İyiyi Takip
562
+ current_best_idx = np.argmax(fitness_scores)
563
+ current_best_fitness = fitness_scores[current_best_idx]
564
+ # NaN veya Inf değerlerini filtreleyerek ortalama hesapla
565
+ finite_scores = [s for s in fitness_scores if np.isfinite(s)]
566
+ avg_fitness = np.mean(finite_scores) if finite_scores else -np.inf
567
+
568
+ best_fitness_history.append(current_best_fitness)
569
+ avg_fitness_history.append(avg_fitness)
570
+
571
+ new_best_found = False
572
+ if current_best_fitness > best_fitness_overall and np.isfinite(current_best_fitness):
573
+ best_fitness_overall = current_best_fitness
574
+ new_best_found = True
575
+ try:
576
+ # En iyi modeli klonla (orijinal popülasyondaki değişmesin)
577
+ best_model_overall = clone_pytorch_model(population[current_best_idx], device)
578
+ logging.info(f"Generation {gen+1}: *** New overall best fitness found: {best_fitness_overall:.6f} (Model: {best_model_overall.model_name}) ***")
579
+ except Exception as e:
580
+ logging.error(f"Could not clone new best model {population[current_best_idx].model_name}: {e}", exc_info=True)
581
+ # Klonlama başarısız olursa, en azından fitness'ı takip et
582
+ best_model_overall = None # Klonlanamadığı için referansı tutma
583
+ # else: # En iyi bulunamadıysa veya aynıysa
584
+ # --- KAVRAMSAL: Adaptif Mutasyon Güncelleme ---
585
+ # stagnation_counter += 1
586
+ # logging.debug(f"Stagnation counter: {stagnation_counter}")
587
+ # if stagnation_counter >= stagnation_limit:
588
+ # current_mutation_strength = min(max_mut_strength, current_mutation_strength * 1.2) # Mutasyon gücünü artır
589
+ # logging.info(f"Stagnation detected. Increasing mutation strength to {current_mutation_strength:.4f}")
590
+ # stagnation_counter = 0 # Sayacı sıfırla
591
+
592
+ # if new_best_found:
593
+ # stagnation_counter = 0
594
+ # current_mutation_strength = max(min_mut_strength, current_mutation_strength * 0.95) # İyileşme varsa azalt
595
+ # logging.debug(f"Improvement found. Decreasing mutation strength to {current_mutation_strength:.4f}")
596
+
597
+ generation_time = time.time() - generation_start_time
598
+ logging.info(f"Generation {gen+1}/{total_generations} | Best Fitness: {current_best_fitness:.6f} | Avg Fitness: {avg_fitness:.6f} | Time: {generation_time:.2f}s")
599
+
600
+ # 3. Yeni Popülasyon Oluşturma
601
+ new_population = []
602
+
603
+ # 3a. Elitizm
604
+ if elitism_count > 0 and len(population) >= elitism_count:
605
+ try:
606
+ # Fitness skorlarına göre sırala ve en iyileri al (indeksleri)
607
+ elite_indices = np.argsort(fitness_scores)[-elitism_count:]
608
+ for idx in elite_indices:
609
+ # Elitleri klonlayarak yeni popülasyona ekle
610
+ elite_clone = clone_pytorch_model(population[idx], device)
611
+ elite_clone.model_name = f"elite_{population[idx].model_name}" # İsimlendirme
612
+ new_population.append(elite_clone)
613
+ logging.debug(f"Added {len(new_population)} elites to the next generation.")
614
+ except Exception as e:
615
+ logging.error(f"Error during elitism: {e}", exc_info=True)
616
+
617
+ # 3b. Seçilim, Çaprazlama ve Mutasyon ile kalanları doldur
618
+ num_to_generate = pop_size - len(new_population)
619
+ generated_count = 0
620
+ reproduction_attempts = 0 # Sonsuz döngüyü önlemek için
621
+ max_reproduction_attempts = num_to_generate * 5 # Cömert bir sınır
622
+
623
+ while generated_count < num_to_generate and reproduction_attempts < max_reproduction_attempts:
624
+ reproduction_attempts += 1
625
+ try:
626
+ # İki ebeveyn seç
627
+ parent1 = tournament_selection(population, fitness_scores, tournament_size)
628
+ parent2 = tournament_selection(population, fitness_scores, tournament_size)
629
+
630
+ child1, child2 = None, None
631
+
632
+ # Çaprazlama uygula (belirli bir olasılıkla ve farklı ebeveynlerse)
633
+ if random.random() < crossover_rate and parent1 is not parent2:
634
+ # logging.debug(f"Attempting crossover between {parent1.model_name} and {parent2.model_name}")
635
+ child1, child2 = crossover_individuals_pytorch(parent1, parent2, device)
636
+
637
+ # Eğer çaprazlama yapılmadıysa/başarısız olduysa veya tek çocuk üretildiyse
638
+ if child1 is None:
639
+ # Mutasyon uygula (belirli bir olasılıkla)
640
+ if random.random() < mutation_rate:
641
+ parent_to_mutate = parent1 # Veya parent2, veya rastgele biri
642
+ child1 = mutate_individual_pytorch(parent_to_mutate, weight_mut_rate, mut_strength, device) # Adaptif: current_mutation_strength
643
+ else:
644
+ # Ne çaprazlama ne mutasyon -> ebeveyni klonla
645
+ child1 = clone_pytorch_model(parent1, device)
646
+ child1.model_name = f"direct_clone_{parent1.model_name}_{random.randint(1000,9999)}"
647
+
648
+ # Çocukları yeni popülasyona ekle (eğer üretildilerse)
649
+ if child1:
650
+ new_population.append(child1)
651
+ generated_count += 1
652
+ if generated_count >= num_to_generate: break
653
+
654
+ if child2: # Eğer çaprazlama iki çocuk ürettiyse
655
+ # İkinci çocuğa da mutasyon uygulama seçeneği eklenebilir
656
+ # if random.random() < post_crossover_mutation_rate: child2 = mutate(...)
657
+ new_population.append(child2)
658
+ generated_count += 1
659
+ if generated_count >= num_to_generate: break
660
+
661
+ except Exception as e:
662
+ logging.error(f"Error during selection/reproduction cycle (attempt {reproduction_attempts}): {e}", exc_info=True)
663
+ # Hata durumunda döngüye devam etmeye çalış, ancak sınırı aşarsa durur.
664
+ # Güvenlik önlemi olarak rastgele birey eklenebilir ama hatayı maskeleyebilir.
665
+
666
+ # Eğer döngü sınırı aşıldıysa popülasyonu tamamla
667
+ if generated_count < num_to_generate:
668
+ logging.warning(f"Reproduction cycle finished early or hit attempt limit. Adding {num_to_generate - generated_count} random individuals.")
669
+ input_size = population[0].input_size # İlk bireyden al
670
+ output_size = population[0].output_size
671
+ for _ in range(num_to_generate - generated_count):
672
+ try:
673
+ random_ind = create_individual_pytorch(input_size, output_size).to(device)
674
+ new_population.append(random_ind)
675
+ except Exception as e:
676
+ logging.error(f"Failed to create random individual to fill population: {e}")
677
+ # Bu durumda popülasyon eksik kalabilir
678
+
679
+ population = new_population[:pop_size] # Popülasyon boyutunu garantile
680
+
681
+ # 4. Checkpoint Alma
682
+ if checkpoint_interval > 0 and (gen + 1) % checkpoint_interval == 0:
683
+ try:
684
+ rnd_state = random.getstate()
685
+ np_rnd_state = np.random.get_state()
686
+ torch_rnd_state = torch.get_rng_state() # PyTorch RNG durumu
687
+ # Cihaz RNG durumları da kaydedilebilir: torch.cuda.get_rng_state_all()
688
+ save_checkpoint_pytorch(output_dir, gen + 1, population, rnd_state, np_rnd_state, torch_rnd_state)
689
+ except Exception as e:
690
+ logging.error(f"Failed to execute checkpoint saving for generation {gen+1}: {e}", exc_info=True)
691
+
692
+ # Döngü sonu temizliği (GPU belleği için önemli olabilir)
693
+ if device.type == 'cuda':
694
+ torch.cuda.empty_cache()
695
+
696
+ # Evrim Döngüsü Sonu
697
+ if best_model_overall is None:
698
+ logging.warning("Evolution finished, but no single best model was tracked (possibly due to errors or all fitness being non-finite).")
699
+ # Son popülasyondan en iyiyi bulmaya çalış
700
+ if population:
701
+ final_fitness_scores = [calculate_fitness_pytorch(ind, X_torch, y_torch, device) for ind in population]
702
+ valid_scores = [(s, i) for i, s in enumerate(final_fitness_scores) if np.isfinite(s)]
703
+ if valid_scores:
704
+ best_idx_final = max(valid_scores, key=lambda item: item[0])[1]
705
+ best_model_overall = clone_pytorch_model(population[best_idx_final], device) # Klonla
706
+ best_fitness_overall = final_fitness_scores[best_idx_final]
707
+ logging.info(f"Selected best model from final population: {best_model_overall.model_name} with fitness {best_fitness_overall:.6f}")
708
+ else:
709
+ logging.error("Evolution finished. No valid finite fitness scores in the final population.")
710
+ return None, best_fitness_history, avg_fitness_history
711
+ else:
712
+ logging.error("Evolution finished with an empty population!")
713
+ return None, best_fitness_history, avg_fitness_history
714
+ else:
715
+ logging.info(f"Evolution finished. Best fitness achieved: {best_fitness_overall:.6f} by model {best_model_overall.model_name}")
716
+
717
+ return best_model_overall, best_fitness_history, avg_fitness_history
718
+
719
+ # --- Plotting (same as before, uses Matplotlib) ---
720
def plot_fitness_history(history_best: List[float], history_avg: List[float], output_dir: str, filename: str = "fitness_history_pytorch.png") -> None:
    """Plot best/average fitness per generation and save the figure as a PNG.

    Non-finite entries (NaN/Inf) are filtered out before plotting so the
    curves only show valid generations. Errors are logged, never raised.
    """
    if not history_best or not history_avg:
        logging.warning("Fitness history is empty, cannot plot.")
        return
    try:
        plt.figure(figsize=(12, 7))
        # Generation axis is 1-based; keep only indices with finite values.
        generations = np.arange(1, len(history_best) + 1)
        finite_best = [idx for idx, val in enumerate(history_best) if np.isfinite(val)]
        finite_avg = [idx for idx, val in enumerate(history_avg) if np.isfinite(val)]

        if finite_best:
            plt.plot(generations[finite_best], np.array(history_best)[finite_best],
                     label="Best Fitness", marker='o', linestyle='-', linewidth=2)
        if finite_avg:
            plt.plot(generations[finite_avg], np.array(history_avg)[finite_avg],
                     label="Average Fitness", marker='x', linestyle='--', alpha=0.7)

        plt.xlabel("Generation")
        plt.ylabel("Fitness Score")
        plt.title("Evolutionary Fitness History (PyTorch)")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()

        plot_path = os.path.join(output_dir, filename)
        plt.savefig(plot_path)
        plt.close()  # release figure memory
        logging.info(f"Fitness history plot saved to {plot_path}")
    except Exception as e:
        logging.error(f"Error plotting fitness history: {e}", exc_info=True)
748
+
749
+
750
+ # --- Evaluation (PyTorch) ---
751
def evaluate_model_pytorch(
    model: NeuralNetwork,
    X_test: np.ndarray, y_test: np.ndarray,
    batch_size: int, device: torch.device
) -> Dict[str, float]:
    """Evaluate the final model on the test set with PyTorch.

    Computes the mean per-batch MSE and the average Kendall's tau rank
    correlation over at most 500 randomly chosen test rows. On any failure
    the error is logged and a sentinel result (inf MSE, zero tau) returned.
    """
    if model is None:
        logging.error("Cannot evaluate a None model.")
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}

    logging.info("Evaluating final model on test data using PyTorch...")
    model.eval()  # inference mode
    model.to(device)

    # Wrap the NumPy test arrays in a DataLoader (no shuffling for evaluation).
    try:
        dataset = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())
        loader = DataLoader(dataset, batch_size=batch_size)
    except Exception as e:
        logging.error(f"Failed to create PyTorch DataLoader for test data: {e}", exc_info=True)
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}

    prediction_chunks = []
    target_chunks = []
    mse_sum = 0.0
    batch_count = 0

    try:
        with torch.no_grad():
            for batch_x, batch_y in loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                preds = model(batch_x)
                mse_sum += torch.mean((preds - batch_y) ** 2).item()
                batch_count += 1
                # Collect predictions/targets on CPU for the Kendall tau pass.
                prediction_chunks.append(preds.cpu().numpy())
                target_chunks.append(batch_y.cpu().numpy())

        mean_mse = mse_sum / batch_count if batch_count > 0 else np.inf
        logging.info(f"Final Test MSE: {mean_mse:.6f}")

        # Kendall's tau on a random subsample of rows.
        preds_np = np.concatenate(prediction_chunks, axis=0)
        targets_np = np.concatenate(target_chunks, axis=0)

        sample_size = min(500, targets_np.shape[0])
        taus = []
        if sample_size > 0:
            chosen_rows = np.random.choice(targets_np.shape[0], sample_size, replace=False)
            for row in chosen_rows:
                try:
                    tau, _ = kendalltau(targets_np[row], preds_np[row])
                    if not np.isnan(tau):
                        taus.append(tau)
                except ValueError:  # e.g. constant predictions
                    pass
        avg_kendall_tau = np.mean(taus) if taus else 0.0
        logging.info(f"Average Kendall's Tau (on {sample_size} samples): {avg_kendall_tau:.4f}")

        return {"test_mse": float(mean_mse), "avg_kendall_tau": float(avg_kendall_tau)}

    except Exception as e:
        logging.error(f"Error during final PyTorch model evaluation: {e}", exc_info=True)
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
816
+
817
+
818
+ # --- Final Training (PyTorch) ---
819
def train_final_model_pytorch(
    model: NeuralNetwork,
    X_train: np.ndarray, y_train: np.ndarray,
    epochs: int, batch_size: int, learning_rate: float,
    device: torch.device, output_dir: str
) -> Tuple[NeuralNetwork, Dict[str, Any]]:
    """Train the best evolved model with gradient descent (Adam + MSE).

    Splits X_train/y_train 80/20 into train/validation, trains for up to
    `epochs` epochs with ReduceLROnPlateau LR scheduling and early stopping
    (patience 15 on validation loss), then restores the best-validation
    weights before returning.

    Returns:
        (model, summary) — the (possibly restored) model and a dict with
        epochs run, final train loss, best validation loss and final LR;
        on failure the summary instead contains an "error" key.

    NOTE(review): if epochs == 0, `avg_train_loss` is never assigned and the
    summary construction would raise NameError (caught by the outer except).
    """
    logging.info(f"--- Starting Final Training of Best Evolved Model ({model.model_name}) ---")
    model.to(device)  # move model parameters to the target device

    # Build train/validation DataLoaders from the NumPy arrays.
    try:
        train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
        # 80/20 train/validation split over shuffled indices.
        val_split = 0.2
        num_train = len(train_dataset)
        split_idx = int(np.floor(val_split * num_train))
        indices = list(range(num_train))
        np.random.shuffle(indices)  # shuffle before splitting
        train_indices, val_indices = indices[split_idx:], indices[:split_idx]

        train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
        val_sampler = torch.utils.data.SubsetRandomSampler(val_indices)  # random order; a SequentialSampler would also work

        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
        val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=val_sampler)
        logging.info(f"Created DataLoaders. Train samples: {len(train_indices)}, Val samples: {len(val_indices)}")
    except Exception as e:
        logging.error(f"Failed to create DataLoaders for final training: {e}", exc_info=True)
        return model, {"error": "DataLoader creation failed"}

    # Optimizer and loss function.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    # Reduce LR when validation loss plateaus.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=7, verbose=True, min_lr=1e-7)

    # Early-stopping bookkeeping.
    early_stopping_patience = 15
    best_val_loss = np.inf
    epochs_no_improve = 0
    best_model_state = None  # deep-copied state_dict of the best-so-far model

    training_history = {'train_loss': [], 'val_loss': [], 'lr': []}
    epochs_run = 0

    try:
        for epoch in range(epochs):
            epochs_run += 1
            model.train()  # training mode (enables dropout/batchnorm updates)
            running_train_loss = 0.0
            for i, (inputs, targets) in enumerate(train_loader):
                inputs, targets = inputs.to(device), targets.to(device)

                optimizer.zero_grad()            # reset gradients
                outputs = model(inputs)          # forward pass
                loss = criterion(outputs, targets)
                loss.backward()                  # backpropagation
                optimizer.step()                 # weight update

                running_train_loss += loss.item()

            avg_train_loss = running_train_loss / len(train_loader) if len(train_loader) > 0 else 0.0
            training_history['train_loss'].append(avg_train_loss)
            training_history['lr'].append(optimizer.param_groups[0]['lr'])  # record current LR

            # ---- Validation ----
            model.eval()  # evaluation mode
            running_val_loss = 0.0
            with torch.no_grad():
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)
                    running_val_loss += loss.item()

            avg_val_loss = running_val_loss / len(val_loader) if len(val_loader) > 0 else np.inf
            training_history['val_loss'].append(avg_val_loss)

            logging.info(f"Epoch [{epoch+1}/{epochs}] Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f} | LR: {optimizer.param_groups[0]['lr']:.2e}")

            # LR scheduling keyed on validation loss.
            scheduler.step(avg_val_loss)

            # Early-stopping check: snapshot weights on improvement.
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                epochs_no_improve = 0
                # Deep copy so later training does not mutate the snapshot.
                best_model_state = copy.deepcopy(model.state_dict())
                logging.debug(f"New best validation loss: {best_val_loss:.6f}. Saving model state.")
            else:
                epochs_no_improve += 1

            if epochs_no_improve >= early_stopping_patience:
                logging.info(f"Early stopping triggered after {epoch+1} epochs due to no improvement in validation loss for {early_stopping_patience} epochs.")
                break

        # Restore the best-validation weights, if any epoch improved.
        if best_model_state:
            logging.info(f"Restoring model to best validation performance (Val Loss: {best_val_loss:.6f}).")
            model.load_state_dict(best_model_state)
        else:
            logging.warning("No best model state was saved during training (possibly validation loss never improved).")


        logging.info("Final training complete.")
        training_summary = {
            "epochs_run": epochs_run,
            "final_train_loss": avg_train_loss,  # loss of the last epoch run
            "best_val_loss": best_val_loss,      # best validation loss achieved
            "final_lr": optimizer.param_groups[0]['lr']
        }
        # Optionally plot the training curves:
        # plot_training_history(training_history, output_dir)

        return model, training_summary

    except Exception as e:
        logging.error(f"Error during final PyTorch model training: {e}", exc_info=True)
        return model, {"error": str(e)}
941
+
942
+
943
+ # --- Main Pipeline (PyTorch) ---
944
def run_pipeline_pytorch(args: argparse.Namespace):
    """Main checkpoint-aware, PyTorch-based pipeline.

    End-to-end flow: device setup -> output dir / resume handling -> logging
    -> optional checkpoint restore -> config save/reload -> seeding -> data
    generation -> population init -> evolution -> history plotting -> final
    training -> evaluation -> JSON results dump. Exits the process on fatal
    errors (sys.exit).
    """

    # Select compute device (cpu/cuda/auto).
    device = setup_device(args.device)

    # Run name and output directory; resuming reuses the previous directory.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"evorun_pt_{timestamp}_gen{args.generations}_pop{args.pop_size}"
    output_dir = args.resume_from if args.resume_from else os.path.join(args.output_base_dir, run_name)
    resume_run = bool(args.resume_from)

    if resume_run:
        run_name = os.path.basename(output_dir)
        logging.info(f"Attempting to resume PyTorch run from: {output_dir}")
        # A resumed run's output directory must already exist.
        if not os.path.isdir(output_dir):
            logging.error(f"Resume directory not found: {output_dir}. Exiting.")
            sys.exit(1)
    else:
        try:
            os.makedirs(output_dir, exist_ok=True)
        except OSError as e:
            print(f"FATAL: Could not create output directory: {output_dir}. Error: {e}", file=sys.stderr)
            sys.exit(1)

    # Configure logging (append mode, suitable for resumed runs).
    setup_logging(output_dir)
    logging.info(f"========== Starting/Resuming EvoNet v4 PyTorch Pipeline: {run_name} ==========")
    logging.info(f"Output directory: {output_dir}")
    logging.info(f"Using device: {device}")

    # --- Checkpoint loading ---
    start_generation = 0
    population = []
    initial_state_loaded = False
    loaded_history_best = []  # fitness history carried over from a checkpoint
    loaded_history_avg = []

    latest_checkpoint_path = find_latest_checkpoint_pytorch(output_dir) if resume_run else None

    if latest_checkpoint_path:
        loaded_state = load_checkpoint_pytorch(latest_checkpoint_path, device)
        if loaded_state:
            start_generation = loaded_state['generation']
            population = loaded_state['population']  # models should already be on the right device
            # Restore RNG states for reproducible continuation.
            try:
                random.setstate(loaded_state['random_state'])
                np.random.set_state(loaded_state['numpy_random_state'])
                torch.set_rng_state(loaded_state['torch_random_state'].cpu())  # state tensor must live on CPU
                if device.type == 'cuda' and 'torch_cuda_random_state' in loaded_state:
                    # TODO: also save/restore the CUDA RNG state if needed
                    # torch.cuda.set_rng_state_all(loaded_state['torch_cuda_random_state'])
                    pass
                logging.info(f"Random states restored from checkpoint (Generation {start_generation}).")
            except Exception as e:
                logging.warning(f"Could not fully restore random states from checkpoint: {e}")

            # TODO: also persist the fitness history in the checkpoint
            # loaded_history_best = loaded_state.get('best_fitness_history', [])
            # loaded_history_avg = loaded_state.get('avg_fitness_history', [])

            initial_state_loaded = True
            logging.info(f"Resuming from Generation {start_generation + 1} with {len(population)} individuals.")
        else:
            logging.error("Failed to load checkpoint. Starting from scratch.")
            resume_run = False
    elif resume_run:
        logging.warning(f"Resume requested but no valid PyTorch checkpoint (.pt) found in {output_dir}. Starting from scratch.")
        resume_run = False


    # --- Fresh-start vs resume configuration handling ---
    # Save the arguments only when starting fresh (or no config exists yet).
    config_path = os.path.join(output_dir, "config_pytorch.json")
    args_dict = vars(args)
    if not initial_state_loaded or not os.path.exists(config_path):
        logging.info("--- Configuration ---")
        for k, v in args_dict.items(): logging.info(f" {k:<25}: {v}")
        logging.info("---------------------")
        try:
            # Persist the arguments as JSON.
            args_to_save = args_dict.copy()
            # Device objects are not JSON-serializable; store as string.
            args_to_save['device'] = str(device)
            with open(config_path, 'w') as f: json.dump(args_to_save, f, indent=4, sort_keys=True)
            logging.info(f"Configuration saved to {config_path}")
        except Exception as e: logging.error(f"Failed to save configuration: {e}", exc_info=True)
    else: # Resuming and a config exists: just log it.
        try:
            with open(config_path, 'r') as f: loaded_args_dict = json.load(f)
            logging.info("--- Loaded Configuration (from resumed run) ---")
            for k, v in loaded_args_dict.items(): logging.info(f" {k:<25}: {v}")
            logging.info("-----------------------------------------------")
            # Optional: compare the loaded arguments against the current ones
            # for k, v in args_dict.items():
            #     if k in loaded_args_dict and loaded_args_dict[k] != v:
            #         logging.warning(f"Argument mismatch: '{k}' loaded as {loaded_args_dict[k]}, current is {v}")
        except Exception as e: logging.warning(f"Could not reload config.json: {e}")


    # Seed RNGs only when starting fresh — checkpoint-restored states would
    # otherwise be overwritten.
    if not initial_state_loaded:
        try:
            seed = args.seed
            random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
            if device.type == 'cuda': torch.cuda.manual_seed_all(seed)  # also seed all GPUs
            # Optionally force deterministic algorithms (may hurt performance):
            # torch.backends.cudnn.deterministic = True
            # torch.backends.cudnn.benchmark = False
            logging.info(f"Using random seed: {seed}")
        except Exception as e: logging.warning(f"Could not set all random seeds: {e}")


    # Data generation (always regenerated — checkpoints do not store data).
    # NOTE(review): for large datasets a save/load mechanism would be better.
    try:
        logging.info("Generating/Reloading data...")
        X_train, y_train = generate_data(args.train_samples, args.seq_length)
        X_test, y_test = generate_data(args.test_samples, args.seq_length)
        input_shape = X_train.shape[1]  # feature count only
        output_shape = y_train.shape[1]
    except Exception:
        logging.critical("Failed to generate/reload data. Exiting.")
        sys.exit(1)


    # Population initialization (only when starting fresh).
    if not initial_state_loaded:
        logging.info(f"--- Initializing Population (Size: {args.pop_size}) ---")
        try:
            population = [create_individual_pytorch(input_shape, output_shape).to(device) for _ in range(args.pop_size)]
            logging.info("Population initialized successfully.")
        except Exception:
            logging.critical("Failed to initialize population. Exiting.")
            sys.exit(1)


    # --- Evolution ---
    logging.info(f"--- Starting/Resuming PyTorch Evolution ({args.generations} Total Generations) ---")
    best_model_evolved: Optional[NeuralNetwork] = None
    best_fitness_hist = loaded_history_best  # continue from any loaded history
    avg_fitness_hist = loaded_history_avg

    if start_generation >= args.generations:
        logging.warning(f"Loaded checkpoint generation ({start_generation}) is already >= total generations ({args.generations}). Skipping evolution.")
        # Evolution is skipped; pick the best model from the loaded population.
        # NOTE(review): storing the best model in the checkpoint itself would be
        # more reliable than re-scoring the population here.
        if population:
            try:
                logging.info("Selecting best model from loaded population as evolution is skipped...")
                fitness_scores_loaded = [calculate_fitness_pytorch(ind, torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float(), device) for ind in population]
                valid_scores_loaded = [(s, i) for i, s in enumerate(fitness_scores_loaded) if np.isfinite(s)]
                if valid_scores_loaded:
                    best_idx_loaded = max(valid_scores_loaded, key=lambda item: item[0])[1]
                    best_model_evolved = clone_pytorch_model(population[best_idx_loaded], device)  # clone to decouple from population
                    logging.info(f"Using model {best_model_evolved.model_name} from loaded population as best evolved model.")
                else:
                    logging.warning("Could not determine best model from loaded population (no finite fitness).")
                    best_model_evolved = None
            except Exception as e:
                logging.error(f"Error selecting best model from loaded population: {e}")
                best_model_evolved = None
        else:
            best_model_evolved = None  # population failed to load
        # History would also need to be loaded (see TODO above).
        best_fitness_hist, avg_fitness_hist = [], []
    else:
        try:
            best_model_evolved, gen_best_hist, gen_avg_hist = evolve_population_pytorch(
                population, X_train, y_train, start_generation, args.generations,
                args.crossover_rate, args.mutation_rate, args.weight_mut_rate, args.mutation_strength,
                args.tournament_size, args.elitism_count, args.batch_size,  # batch_size unused directly by evolution
                output_dir, args.checkpoint_interval, device
            )
            # Append this session's history to any loaded history.
            best_fitness_hist.extend(gen_best_hist)
            avg_fitness_hist.extend(gen_avg_hist)

        except Exception as e:
            logging.critical(f"Fatal error during PyTorch evolution process: {e}", exc_info=True)
            sys.exit(1)
    logging.info("--- PyTorch Evolution Complete ---")

    # Persist and plot the fitness history.
    if best_fitness_hist or avg_fitness_hist:
        plot_fitness_history(best_fitness_hist, avg_fitness_hist, output_dir)
        history_path = os.path.join(output_dir, "fitness_history_pytorch.csv")
        try:
            # Save the history as CSV: generation, best, average.
            history_data = np.array([
                np.arange(1, len(best_fitness_hist) + 1),  # 1-based generation numbers
                best_fitness_hist,
                avg_fitness_hist
            ]).T
            np.savetxt(history_path, history_data, delimiter=',', header='Generation,BestFitness,AvgFitness', comments='', fmt=['%d', '%.8f', '%.8f'])
            logging.info(f"Full fitness history saved to {history_path}")
        except Exception as e:
            logging.error(f"Could not save fitness history data: {e}")
    else:
        logging.warning("Fitness history is empty after evolution, skipping saving/plotting.")


    # Final training of the best model, evaluation, and result persistence.
    final_model_path = None
    training_summary = {}
    final_metrics = {"test_mse": np.inf, "avg_kendall_tau": 0.0}
    best_model_architecture = {}

    if best_model_evolved is None:
        logging.error("Evolution did not yield a best model. Skipping final training and evaluation.")
    else:
        best_model_architecture = best_model_evolved.get_architecture()
        logging.info(f"Best evolved model architecture: {best_model_architecture}")
        # Log a model summary (parameter count etc.).
        try:
            num_params = sum(p.numel() for p in best_model_evolved.parameters() if p.requires_grad)
            logging.info(f"Best Evolved Model ({best_model_evolved.model_name}) - Trainable Parameters: {num_params}")
            # For a richer summary, libraries like torchinfo could be used:
            # from torchinfo import summary
            # summary(best_model_evolved, input_size=(args.batch_size, input_shape))
        except Exception as e:
            logging.warning(f"Could not log model summary details: {e}")


        # Final training — train a clone so the evolved original is preserved.
        try:
            model_to_train = clone_pytorch_model(best_model_evolved, device)
            final_model, training_summary = train_final_model_pytorch(
                model_to_train, X_train, y_train,
                args.epochs_final_train, args.batch_size, args.learning_rate,
                device, output_dir
            )
        except Exception as e:
            logging.error(f"Error during final training setup or execution: {e}", exc_info=True)
            final_model = None  # training failed
            training_summary = {"error": str(e)}

        # Evaluation and model persistence.
        if final_model:
            final_metrics = evaluate_model_pytorch(final_model, X_test, y_test, args.batch_size, device)
            final_model_path = os.path.join(output_dir, "best_evolved_model_trained_pytorch.pt")
            try:
                # Saving just the state_dict (plus metadata) is generally preferred.
                torch.save({
                    'architecture': final_model.get_architecture(),
                    'model_state_dict': final_model.state_dict(),
                    'training_summary': training_summary,
                    'evaluation_metrics': final_metrics
                }, final_model_path)
                logging.info(f"Final trained model state and architecture saved to {final_model_path}")
            except Exception as e:
                logging.error(f"Failed to save final trained model: {e}", exc_info=True)
                final_model_path = None  # save failed
        else:
            logging.error("Final model training failed or did not produce a model. Skipping evaluation and saving.")


    logging.info("--- Saving Final Results ---")
    final_results = {
        "run_info": {
            "run_name": run_name,
            "timestamp": timestamp,
            "output_directory": output_dir,
            "framework": "PyTorch",
            "device_used": str(device),
            "resumed_run": resume_run,
            "last_checkpoint_loaded": latest_checkpoint_path
        },
        "config": args_dict,  # the initial arguments
        "evolution_summary": {
            "start_generation": start_generation,
            "end_generation": start_generation + len(best_fitness_hist) - (1 if loaded_history_best else 0),  # last generation actually run
            "generations_run_this_session": len(best_fitness_hist) - len(loaded_history_best),
            "best_fitness_achieved_overall": max(best_fitness_hist) if best_fitness_hist and any(np.isfinite(f) for f in best_fitness_hist) else None,
            "best_fitness_final_gen": best_fitness_hist[-1] if best_fitness_hist and np.isfinite(best_fitness_hist[-1]) else None,
            "avg_fitness_final_gen": avg_fitness_hist[-1] if avg_fitness_hist and np.isfinite(avg_fitness_hist[-1]) else None,
            "best_model_architecture": best_model_architecture
        },
        "final_training_summary": training_summary,
        "final_evaluation_on_test": final_metrics,
        "saved_trained_model_path": final_model_path
    }
    results_path = os.path.join(output_dir, "final_results_pytorch.json")
    try:
        # Convert NumPy/torch values into JSON-serializable types.
        def convert_types(obj):
            if isinstance(obj, np.integer): return int(obj)
            elif isinstance(obj, np.floating): return float(obj)
            elif isinstance(obj, np.ndarray): return obj.tolist()
            elif isinstance(obj, torch.Tensor): return obj.tolist()  # tensors become lists
            elif isinstance(obj, torch.device): return str(obj)      # devices become strings
            elif isinstance(obj, type): return obj.__name__          # types saved by name
            return obj
        with open(results_path, 'w') as f:
            json.dump(final_results, f, indent=4, default=convert_types, sort_keys=True)
        logging.info(f"Final results summary saved to {results_path}")
    except Exception as e:
        logging.error(f"Failed to save final results JSON: {e}", exc_info=True)

    logging.info(f"========== PyTorch Pipeline Run {run_name} Finished ==========")
1253
+
1254
+
1255
+ # --- Argument Parser (additions for PyTorch) ---
1256
def parse_arguments_v4() -> argparse.Namespace:
    """Build the EvoNet v4 CLI, parse sys.argv, and return sanity-checked args.

    A missing --seed is replaced with a random one; elitism and tournament
    sizes are clamped to valid ranges relative to the population size.
    """
    p = argparse.ArgumentParser(description="EvoNet v4: Neuroevolution with PyTorch, Crossover & Checkpointing")

    # Directories and run control
    p.add_argument('--output_base_dir', type=str, default=DEFAULT_OUTPUT_BASE_DIR, help='Base directory for new runs.')
    p.add_argument('--resume_from', type=str, default=None, help='Path to a previous run directory to resume from (PyTorch checkpoints).')
    p.add_argument('--checkpoint_interval', type=int, default=DEFAULT_CHECKPOINT_INTERVAL, help='Save checkpoint every N generations (0 to disable).')
    p.add_argument('--device', type=str, default=DEFAULT_DEVICE, choices=['auto', 'cpu', 'cuda'], help='Device to use (cpu, cuda, or auto-detect).')

    # Data settings
    p.add_argument('--seq_length', type=int, default=DEFAULT_SEQ_LENGTH, help='Length of sequences.')
    p.add_argument('--train_samples', type=int, default=5000, help='Number of training samples.')
    p.add_argument('--test_samples', type=int, default=1000, help='Number of test samples.')

    # Evolution parameters
    p.add_argument('--pop_size', type=int, default=DEFAULT_POP_SIZE, help='Population size.')
    p.add_argument('--generations', type=int, default=DEFAULT_GENERATIONS, help='Total number of generations.')
    p.add_argument('--crossover_rate', type=float, default=DEFAULT_CROSSOVER_RATE, help='Probability of applying crossover.')
    p.add_argument('--mutation_rate', type=float, default=DEFAULT_MUTATION_RATE, help='Probability of applying mutation (if crossover is not applied).')
    p.add_argument('--weight_mut_rate', type=float, default=DEFAULT_WEIGHT_MUT_RATE, help='Probability for each weight/bias to be mutated if mutation occurs.')
    p.add_argument('--mutation_strength', type=float, default=DEFAULT_MUTATION_STRENGTH, help='Std dev for weight mutation noise (Gaussian).')
    p.add_argument('--tournament_size', type=int, default=DEFAULT_TOURNAMENT_SIZE, help='Tournament selection size.')
    p.add_argument('--elitism_count', type=int, default=DEFAULT_ELITISM_COUNT, help='Number of elite individuals to carry over.')

    # Training and evaluation
    p.add_argument('--batch_size', type=int, default=DEFAULT_BATCH_SIZE, help='Batch size for final training and evaluation.')
    p.add_argument('--epochs_final_train', type=int, default=DEFAULT_EPOCHS_FINAL_TRAIN, help='Max epochs for final training of the best model.')
    p.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate for Adam optimizer during final training.')

    # Reproducibility
    p.add_argument('--seed', type=int, default=None, help='Random seed for Python, NumPy, and PyTorch (default: random).')

    args = p.parse_args()

    # Draw a seed when none was supplied so the run is still reproducible.
    if args.seed is None:
        args.seed = random.randint(0, 2**32 - 1)
        print(f"Generated random seed: {args.seed}")

    # Clamp obviously invalid combinations rather than failing.
    if args.elitism_count >= args.pop_size:
        print(f"Warning: Elitism count ({args.elitism_count}) >= Population size ({args.pop_size}). Setting elitism to PopSize - 1.")
        args.elitism_count = max(0, args.pop_size - 1)
    if args.tournament_size <= 0:
        print(f"Warning: Tournament size ({args.tournament_size}) must be > 0. Setting to 1.")
        args.tournament_size = 1
    if args.tournament_size > args.pop_size:
        print(f"Warning: Tournament size ({args.tournament_size}) > Population size ({args.pop_size}). Setting to PopSize.")
        args.tournament_size = args.pop_size

    return args
1305
+
1306
+
1307
+ # --- Main Entry Point ---
1308
if __name__ == "__main__":
    parsed_args = parse_arguments_v4()
    try:
        run_pipeline_pytorch(parsed_args)
    except SystemExit:
        # Deliberate exits (argparse, sys.exit) pass through quietly.
        logging.info("SystemExit caught, exiting gracefully.")
    except KeyboardInterrupt:
        print("\nKeyboardInterrupt detected. Exiting...")
        logging.warning("KeyboardInterrupt detected. Attempting graceful shutdown.")
        sys.exit(130)  # conventional exit code for Ctrl+C
    except Exception as err:
        # Log critically if logging is up; otherwise fall back to stderr.
        if logging.getLogger().hasHandlers():
            logging.critical("FATAL UNHANDLED ERROR in main execution block:", exc_info=True)
        else:
            import traceback
            print(f"\nFATAL UNHANDLED ERROR in main execution block: {err}", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(1)  # failure exit code
v5.py ADDED
@@ -0,0 +1,1330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # EvoNet Optimizer - v5 - Adaptif & Paralel PyTorch Sürümü
3
+ # Açıklama: v4 üzerine inşa edilmiştir. Adaptif mutasyon gücü, fitness'ta
4
+ # karmaşıklık cezası, paralel fitness hesaplama (CPU),
5
+ # opsiyonel Weights & Biases entegrasyonu ve genel iyileştirmeler içerir.
6
+ # ==============================================================================
7
+
8
+ import os
9
+ # os.environ["WANDB_SILENT"] = "true" # W&B loglarını azaltmak için (isteğe bağlı)
10
+ import sys
11
+ import argparse
12
+ import random
13
+ import logging
14
+ from datetime import datetime
15
+ import json
16
+ import copy
17
+ import time
18
+ from typing import List, Tuple, Dict, Any, Optional, Union
19
+ import concurrent.futures # Paralel fitness hesaplama için
20
+
21
+ import numpy as np
22
+ import torch
23
+ import torch.nn as nn
24
+ import torch.optim as optim
25
+ from torch.utils.data import TensorDataset, DataLoader
26
+ import matplotlib.pyplot as plt
27
+ from scipy.stats import kendalltau
28
+
29
+ # Opsiyonel W&B importu
30
+ try:
31
+ import wandb
32
+ _WANDB_AVAILABLE = True
33
+ except ImportError:
34
+ _WANDB_AVAILABLE = False
35
+ print("Warning: wandb library not found. Experiment tracking with W&B is disabled.")
36
+ print("Install with: pip install wandb")
37
+
38
+
39
+ # --- Sabitler ve Varsayılan Değerler ---
40
+ DEFAULT_SEQ_LENGTH = 10
41
+ DEFAULT_POP_SIZE = 50
42
+ DEFAULT_GENERATIONS = 50
43
+ DEFAULT_CROSSOVER_RATE = 0.6
44
+ DEFAULT_MUTATION_RATE = 0.4
45
+ DEFAULT_WEIGHT_MUT_RATE = 0.8
46
+ DEFAULT_MUTATION_STRENGTH = 0.1 # Başlangıç mutasyon gücü
47
+ DEFAULT_TOURNAMENT_SIZE = 5
48
+ DEFAULT_ELITISM_COUNT = 2
49
+ DEFAULT_EPOCHS_FINAL_TRAIN = 100
50
+ DEFAULT_BATCH_SIZE = 64
51
+ DEFAULT_OUTPUT_BASE_DIR = os.path.join(os.getcwd(), "evonet_runs_v5_pytorch")
52
+ DEFAULT_CHECKPOINT_INTERVAL = 10
53
+ DEFAULT_DEVICE = "auto"
54
+ DEFAULT_NUM_WORKERS = 0 # Paralel fitness için worker sayısı (0 = Kapalı/Ana thread)
55
+
56
+ # Adaptif Mutasyon Parametreleri
57
+ DEFAULT_ADAPT_MUTATION = True
58
+ DEFAULT_STAGNATION_LIMIT = 10 # İyileşme olmazsa adaptasyon için nesil sayısı
59
+ DEFAULT_MUT_STRENGTH_DECAY = 0.98 # İyileşme olduğunda azaltma faktörü
60
+ DEFAULT_MUT_STRENGTH_INCREASE = 1.1 # Tıkanma olduğunda artırma faktörü
61
+ DEFAULT_MIN_MUT_STRENGTH = 0.005
62
+ DEFAULT_MAX_MUT_STRENGTH = 0.5
63
+
64
+ # Gelişmiş Fitness Parametreleri
65
+ DEFAULT_COMPLEXITY_PENALTY = 0.00001 # Parametre başına ceza ağırlığı
66
+
67
+
68
+ # --- Loglama Ayarları ---
69
+ # (setup_logging fonksiyonu öncekiyle aynı, v4'teki gibi)
70
def setup_logging(log_dir: str, log_level=logging.INFO) -> None:
    """Configure root logging to write to both a run log file and stdout.

    Handlers installed by a previous run are closed and removed first, so
    calling this again (e.g. after a resume) does not duplicate output.
    """
    log_path = os.path.join(log_dir, 'evolution_run_pytorch_v5.log')
    # Tear down any stale handlers before reconfiguring the root logger.
    while logging.root.handlers:
        stale = logging.root.handlers[0]
        stale.close()
        logging.root.removeHandler(stale)
    fmt = '%(asctime)s - %(levelname)-8s [%(filename)s:%(lineno)d] - %(message)s'
    logging.basicConfig(
        level=log_level,
        format=fmt,
        handlers=[
            logging.FileHandler(log_path, mode='a'),
            logging.StreamHandler(sys.stdout),
        ],
    )
    banner = "=" * 50
    logging.info(banner)
    logging.info("PyTorch EvoNet v5 Logging Başlatıldı.")
    logging.info(banner)
86
+
87
+ # --- Cihaz (GPU/CPU) Ayarları ---
88
+ # (setup_device fonksiyonu öncekiyle aynı, v4'teki gibi)
89
def setup_device(requested_device: str) -> torch.device:
    """Resolve a device request ('auto' | 'cuda' | 'cpu') to a torch.device.

    'auto' picks CUDA when available; an explicit 'cuda' request falls back
    to CPU with a warning when no GPU is present; anything else means CPU.
    """
    cuda_available = torch.cuda.is_available()
    if requested_device == "auto":
        if cuda_available:
            logging.info(f"CUDA (GPU) kullanılabilir: {torch.cuda.get_device_name(0)}")
            chosen = "cuda"
        else:
            logging.info("CUDA (GPU) bulunamadı. CPU kullanılacak.")
            chosen = "cpu"
    elif requested_device == "cuda":
        if cuda_available:
            logging.info(f"CUDA (GPU) manuel olarak seçildi: {torch.cuda.get_device_name(0)}")
            chosen = "cuda"
        else:
            logging.warning("CUDA (GPU) istendi ancak bulunamadı! CPU kullanılacak.")
            chosen = "cpu"
    else:
        # 'cpu' or any unrecognized value falls back to CPU.
        logging.info("CPU manuel olarak seçildi veya geçersiz cihaz belirtildi.")
        chosen = "cpu"
    return torch.device(chosen)
110
+
111
+
112
+ # --- Veri Üretimi ---
113
+ # (generate_data fonksiyonu öncekiyle aynı, v4'teki gibi)
114
def generate_data(num_samples: int, seq_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Build a synthetic sorting dataset.

    X holds uniform random float32 values in [0, 100); y is each row of X
    sorted ascending, i.e. the target of the sorting task.
    """
    logging.info(f"Generating {num_samples} samples with sequence length {seq_length}...")
    try:
        inputs = np.random.rand(num_samples, seq_length).astype(np.float32) * 100
        targets = np.sort(inputs, axis=1).astype(np.float32)
    except Exception as e:
        logging.error(f"Error during data generation: {e}", exc_info=True)
        raise
    logging.info("Data generation successful.")
    return inputs, targets
124
+
125
+ # --- PyTorch Sinir Ağı Modeli ---
126
+ # (NeuralNetwork sınıfı öncekiyle büyük ölçüde aynı, v4'teki gibi)
127
+ # Küçük iyileştirme: get_num_params metodu eklendi.
128
class NeuralNetwork(nn.Module):
    """A dynamically-configured multilayer perceptron.

    Hidden widths and per-layer activation names are supplied at construction
    time so evolved architectures can be rebuilt from a plain kwargs dict
    (see get_architecture).
    """

    def __init__(self, input_size: int, output_size: int, hidden_dims: List[int], activations: List[str]):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_dims = hidden_dims
        self.activations_str = activations

        act_classes = {'relu': nn.ReLU, 'tanh': nn.Tanh, 'sigmoid': nn.Sigmoid}
        modules: List[nn.Module] = []
        prev_dim = input_size
        for i, width in enumerate(hidden_dims):
            modules.append(nn.Linear(prev_dim, width))
            act_cls = act_classes.get(activations[i].lower())
            if act_cls is None:
                # Unknown activation names degrade gracefully to ReLU.
                logging.warning(f"Bilinmeyen aktivasyon '{activations[i]}', ReLU kullanılıyor.")
                act_cls = nn.ReLU
            modules.append(act_cls())
            prev_dim = width
        modules.append(nn.Linear(prev_dim, output_size))

        self.network = nn.Sequential(*modules)
        self.architecture_id = self._generate_architecture_id()
        self.model_name = f"model_{self.architecture_id}_rnd{random.randint(10000, 99999)}"

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the MLP on a batch of inputs."""
        return self.network(x)

    def get_architecture(self) -> Dict[str, Any]:
        """Return a kwargs dict sufficient to rebuild an identical architecture."""
        return {
            "input_size": self.input_size,
            "output_size": self.output_size,
            "hidden_dims": self.hidden_dims,
            "activations": self.activations_str,
        }

    def _generate_architecture_id(self) -> str:
        """Build a compact human-readable identifier for this topology."""
        hidden_part = '_'.join(str(d) for d in self.hidden_dims)
        act_part = ''.join(a[0].upper() for a in self.activations_str)
        return f"I{self.input_size}_H{hidden_part}_A{act_part}_O{self.output_size}"

    def get_num_params(self, trainable_only: bool = True) -> int:
        """Count model parameters, optionally restricted to trainable ones."""
        if trainable_only:
            return sum(p.numel() for p in self.parameters() if p.requires_grad)
        return sum(p.numel() for p in self.parameters())

    def __eq__(self, other):
        # Two networks are "equal" when their architecture specs match;
        # weights are deliberately ignored.
        if not isinstance(other, NeuralNetwork):
            return NotImplemented
        return self.get_architecture() == other.get_architecture()

    def __hash__(self):
        spec = (self.input_size, self.output_size, tuple(self.hidden_dims), tuple(self.activations_str))
        return hash(spec)
181
+
182
+
183
+ # --- Neuroevolution Çekirdeği (PyTorch v5) ---
184
+
185
+ # (create_individual_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
186
def create_individual_pytorch(input_size: int, output_size: int) -> NeuralNetwork:
    """Sample a random MLP individual: 1-4 hidden layers, widths in [16, 128],
    each layer's activation drawn from {relu, tanh, sigmoid}."""
    try:
        depth = random.randint(1, 4)
        widths = [random.randint(16, 128) for _ in range(depth)]
        acts = [random.choice(['relu', 'tanh', 'sigmoid']) for _ in range(depth)]
        model = NeuralNetwork(input_size, output_size, widths, acts)
        logging.debug(f"Created individual: {model.model_name} with {model.get_num_params()} params")
        return model
    except Exception as e:
        logging.error(f"Error creating PyTorch individual model: {e}", exc_info=True)
        raise
198
+
199
+ # (clone_pytorch_model fonksiyonu öncekiyle aynı, v4'teki gibi)
200
def clone_pytorch_model(model: NeuralNetwork, device: torch.device) -> NeuralNetwork:
    """Deep-copy a model (architecture and weights) onto the given device.

    The clone receives a fresh name derived from the source model's name.
    Re-raises any failure after logging it.
    """
    try:
        replica = NeuralNetwork(**model.get_architecture())
        replica.load_state_dict(copy.deepcopy(model.state_dict()))
        replica.to(device)
        replica.model_name = f"cloned_{model.model_name}_{random.randint(1000,9999)}"
        logging.debug(f"Cloned model {model.model_name} to {replica.model_name}")
        return replica
    except Exception as e:
        logging.error(f"Error cloning PyTorch model {model.model_name}: {e}", exc_info=True)
        raise
213
+
214
+
215
+ # Bu fonksiyon paralel işçiler tarafından çağrılacak
216
+ # Doğrudan model objesi yerine state_dict ve mimari alıyor
217
def _calculate_fitness_worker(
    model_arch: Dict[str, Any],
    model_state_dict: Dict[str, torch.Tensor],
    X_np: np.ndarray,
    y_np: np.ndarray,
    device_str: str,
    fitness_params: Dict
) -> float:
    """Fitness worker for (optionally parallel) evaluation.

    Takes only picklable inputs (architecture dict, state_dict, NumPy data,
    device name), rebuilds the model, and scores it as 1 / (MSE + eps) minus
    an optional per-parameter complexity penalty. Returns -inf for any
    failure or non-finite score so the parent process can discard the
    individual.
    """
    try:
        # Rebuild the model from its serialized form on the requested device.
        device = torch.device(device_str)
        net = NeuralNetwork(**model_arch)
        net.load_state_dict(model_state_dict)
        net.to(device)
        net.eval()

        inputs = torch.from_numpy(X_np).float().to(device)
        targets = torch.from_numpy(y_np).float().to(device)

        with torch.no_grad():
            preds = net(inputs)
            mse = torch.mean((preds - targets) ** 2).item()

        if not np.isfinite(mse):
            # Unusable individual; the parent handles the -inf sentinel.
            return -np.inf

        # Base fitness: inverse MSE (epsilon avoids division by zero).
        score = 1.0 / (mse + 1e-9)

        # Optional complexity penalty discourages oversized networks.
        penalty_weight = fitness_params.get('complexity_penalty', 0.0)
        if penalty_weight > 0:
            score -= penalty_weight * net.get_num_params(trainable_only=True)

        if not np.isfinite(score):
            return -np.inf
        return float(score)

    except Exception as e:
        # Worker processes do not share the parent's log handlers; stderr is
        # the only reliable channel here.
        print(f"[Worker Error] Failed to calculate fitness: {e}", file=sys.stderr)
        return -np.inf
280
+
281
+
282
+ # (mutate_individual_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
283
+ # Sadece mutasyon gücünü parametre olarak alıyor
284
def mutate_individual_pytorch(
    individual: NeuralNetwork,
    weight_mut_rate: float,
    current_mutation_strength: float,  # adaptive strength supplied by the evolution loop
    device: torch.device
) -> NeuralNetwork:
    """Apply Gaussian weight mutation with an adaptive strength to a clone.

    Each floating-point tensor in the clone's state_dict is, with probability
    ``weight_mut_rate``, perturbed by noise drawn from
    N(0, current_mutation_strength).

    Bug fix: tensors returned by ``Module.state_dict()`` are detached, so
    their ``requires_grad`` flag is always False. The previous guard
    ``param.requires_grad and ...`` therefore never fired and no weight was
    ever mutated. Tensors are now selected by floating-point dtype, which
    also correctly skips integer buffers.

    Returns the mutated clone; on any error an unmutated clone is returned
    so evolution can continue.
    """
    try:
        mutated_model = clone_pytorch_model(individual, device)
        mutated_model.model_name = f"mutated_{individual.model_name}_{random.randint(1000,9999)}"
        mutated = False
        new_state_dict = copy.deepcopy(mutated_model.state_dict())

        for name, param in new_state_dict.items():
            # state_dict() tensors are detached: test dtype, not requires_grad.
            if param.is_floating_point() and random.random() < weight_mut_rate:
                mutated = True
                noise = torch.randn_like(param) * current_mutation_strength
                new_state_dict[name] = param + noise

        if mutated:
            mutated_model.load_state_dict(new_state_dict)
            logging.debug(f"Mutated model {individual.model_name} -> {mutated_model.model_name} with strength {current_mutation_strength:.4f}")
        else:
            logging.debug(f"Mutation applied to {individual.model_name}, but no weights changed based on rate.")
        # Either way the caller receives an independent clone.
        return mutated_model

    except Exception as e:
        logging.error(f"Error during PyTorch mutation of model {individual.model_name}: {e}", exc_info=True)
        return clone_pytorch_model(individual, device)
315
+
316
+
317
+ # (check_architecture_compatibility_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
318
def check_architecture_compatibility_pytorch(model1: NeuralNetwork, model2: NeuralNetwork) -> bool:
    """True iff both models describe exactly the same architecture."""
    arch_a = model1.get_architecture()
    arch_b = model2.get_architecture()
    return arch_a == arch_b
320
+
321
+ # (crossover_individuals_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
322
def crossover_individuals_pytorch(
    parent1: NeuralNetwork,
    parent2: NeuralNetwork,
    device: torch.device
) -> Tuple[Optional[NeuralNetwork], Optional[NeuralNetwork]]:
    """Uniform weight crossover between two architecture-compatible parents.

    Every weight element is assigned to a child by an independent coin flip;
    the two children receive complementary halves. Returns (None, None) when
    the architectures differ or when crossover fails.
    """
    if not check_architecture_compatibility_pytorch(parent1, parent2):
        logging.debug(f"Skipping crossover between {parent1.model_name} and {parent2.model_name} due to incompatible architectures.")
        return None, None
    try:
        arch = parent1.get_architecture()
        offspring_a = NeuralNetwork(**arch).to(device)
        offspring_b = NeuralNetwork(**arch).to(device)
        offspring_a.model_name = f"xover_{parent1.architecture_id}_c1_{random.randint(1000,9999)}"
        offspring_b.model_name = f"xover_{parent1.architecture_id}_c2_{random.randint(1000,9999)}"

        src1, src2 = parent1.state_dict(), parent2.state_dict()
        state_a, state_b = offspring_a.state_dict(), offspring_b.state_dict()
        for key in src1:
            w1, w2 = src1[key], src2[key]
            # Elementwise coin flip decides which parent each child inherits from.
            take_first = torch.rand_like(w1) < 0.5
            state_a[key] = torch.where(take_first, w1, w2)
            state_b[key] = torch.where(take_first, w2, w1)
        offspring_a.load_state_dict(state_a)
        offspring_b.load_state_dict(state_b)
        logging.debug(f"Crossover performed between {parent1.model_name} and {parent2.model_name}")
        return offspring_a, offspring_b
    except Exception as e:
        logging.error(f"Error during PyTorch crossover between {parent1.model_name} and {parent2.model_name}: {e}", exc_info=True)
        return None, None
351
+
352
+ # (tournament_selection fonksiyonu öncekiyle aynı, v4'teki gibi)
353
def tournament_selection(
    population: List[NeuralNetwork],
    fitness_scores: List[float],
    k: int
) -> NeuralNetwork:
    """Pick one individual via k-way tournament among finitely-scored members.

    Falls back to a uniformly random pick when no finite fitness exists or
    when sampling fails for any reason.
    """
    if not population:
        raise ValueError("Population cannot be empty")
    finite_idx = [i for i, s in enumerate(fitness_scores) if np.isfinite(s)]
    if not finite_idx:
        logging.warning("No individuals with finite fitness scores found for tournament selection. Returning random individual.")
        return random.choice(population)
    # Clamp the tournament size into [1, number of valid candidates].
    if len(finite_idx) < k:
        k = len(finite_idx)
    if k <= 0:
        k = 1
    try:
        entrants = random.sample(finite_idx, k)
        best_idx = max(entrants, key=lambda i: fitness_scores[i])
        return population[best_idx]
    except Exception as e:
        logging.error(f"Error during tournament selection: {e}", exc_info=True)
        return random.choice(population)
376
+
377
+
378
+ # --- Checkpointing (PyTorch v5) ---
379
+ # (save_checkpoint_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
380
+ # İsteğe bağlı: Adaptif durum veya W&B run ID'si eklenebilir.
381
def save_checkpoint_pytorch(output_dir: str, generation: int, population: List[NeuralNetwork],
                            rnd_state: Any, np_rnd_state: Any, torch_rnd_state: Any,
                            wandb_run_id: Optional[str] = None):
    """Persist the full evolution state for one generation via torch.save.

    Saved state includes every model's architecture + weights, the three RNG
    states, and the optional W&B run id. A model that fails to serialize is
    skipped with an error log so one bad individual cannot abort the whole
    checkpoint.
    """
    checkpoint_dir = os.path.join(output_dir, "checkpoints_pytorch_v5")
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_file = os.path.join(checkpoint_dir, f"evo_gen_{generation}.pt")
    logging.info(f"Saving checkpoint for generation {generation} to {checkpoint_file}...")

    serialized_population = []
    for model in population:
        try:
            entry = {
                "name": model.model_name,
                "architecture": model.get_architecture(),
                "state_dict": model.state_dict(),
            }
        except Exception as e:
            logging.error(f"Could not serialize model {model.model_name} for checkpoint: {e}")
            continue
        serialized_population.append(entry)

    state = {
        "version": "v5",  # version tag for compatibility checks on load
        "generation": generation,
        "population_state": serialized_population,
        "random_state": rnd_state,
        "numpy_random_state": np_rnd_state,
        "torch_random_state": torch_rnd_state,
        "wandb_run_id": wandb_run_id,
        "timestamp": datetime.now().isoformat(),
    }
    try:
        torch.save(state, checkpoint_file)
        logging.info(f"Checkpoint saved successfully for generation {generation}.")
    except Exception as e:
        logging.error(f"Failed to save checkpoint using torch.save for generation {generation}: {e}", exc_info=True)
417
+
418
+ # (load_checkpoint_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
419
+ # Sadece W&B run ID'sini okur
420
def load_checkpoint_pytorch(checkpoint_path: str, device: torch.device) -> Optional[Dict]:
    """Load a saved v5 evolution state and rebuild its population on `device`.

    Returns the checkpoint dict augmented with a "population" key holding
    live NeuralNetwork instances (set to eval mode), or None when nothing
    usable could be loaded. Individual models that fail to restore are
    skipped with an error log.

    NOTE(review): torch.load unpickles arbitrary objects — only load
    checkpoints from trusted sources.
    """
    if not os.path.exists(checkpoint_path):
        logging.error(f"Checkpoint file not found: {checkpoint_path}")
        return None
    logging.info(f"Loading checkpoint from {checkpoint_path}...")
    try:
        # Load to CPU first; models are moved to the target device below.
        checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        if checkpoint.get("version") != "v5":
            logging.warning(f"Loading checkpoint from a different version ({checkpoint.get('version', 'Unknown')}). Compatibility not guaranteed.")

        revived: List[NeuralNetwork] = []
        for entry in checkpoint["population_state"]:
            try:
                net = NeuralNetwork(**entry["architecture"])
                net.load_state_dict(entry["state_dict"])
                net.to(device)
                net.model_name = entry.get("name", f"loaded_model_{random.randint(1000,9999)}")
                net.eval()
                revived.append(net)
            except Exception as e:
                logging.error(f"Failed to load model state from checkpoint for model {entry.get('name', 'UNKNOWN')}: {e}", exc_info=True)

        if not revived:
            logging.error("Failed to load any model from the checkpoint population state.")
            return None

        checkpoint["population"] = revived
        logging.info(f"Checkpoint loaded successfully. Resuming from generation {checkpoint['generation'] + 1}.")
        # Normalize the key so callers can rely on its presence.
        checkpoint["wandb_run_id"] = checkpoint.get("wandb_run_id")
        return checkpoint
    except Exception as e:
        logging.error(f"Failed to load checkpoint from {checkpoint_path}: {e}", exc_info=True)
        return None
456
+
457
+ # (find_latest_checkpoint_pytorch fonksiyonu öncekiyle aynı, v4'teki gibi)
458
+ # Sadece klasör adını v5'e göre güncelleyebiliriz
459
def find_latest_checkpoint_pytorch(output_dir: str) -> Optional[str]:
    """Return the path of the newest 'evo_gen_<N>.pt' checkpoint, or None.

    Looks inside <output_dir>/checkpoints_pytorch_v5; files whose generation
    number cannot be parsed are skipped with a warning.
    """
    checkpoint_dir = os.path.join(output_dir, "checkpoints_pytorch_v5")
    if not os.path.isdir(checkpoint_dir):
        return None
    best_gen = -1
    best_path = None
    for fname in os.listdir(checkpoint_dir):
        if not (fname.startswith("evo_gen_") and fname.endswith(".pt")):
            continue
        try:
            gen = int(fname.split('_')[2].split('.')[0])
        except (IndexError, ValueError):
            logging.warning(f"Could not parse generation number from checkpoint file: {fname}")
            continue
        if gen > best_gen:
            best_gen = gen
            best_path = os.path.join(checkpoint_dir, fname)
    return best_path
477
+
478
+
479
+ # --- Ana Evrim Döngüsü (PyTorch v5 - Adaptif, Paralel) ---
480
+ def evolve_population_pytorch_v5(
481
+ population: List[NeuralNetwork],
482
+ X_train_np: np.ndarray, y_train_np: np.ndarray, # Veriyi NumPy olarak al
483
+ start_generation: int, total_generations: int,
484
+ crossover_rate: float, mutation_rate: float, weight_mut_rate: float,
485
+ args: argparse.Namespace, # Tüm argümanları alalım
486
+ output_dir: str, device: torch.device,
487
+ wandb_run: Optional[Any] # W&B run objesi
488
+ ) -> Tuple[Optional[NeuralNetwork], List[float], List[float]]:
489
+ """ PyTorch v5 tabanlı evrimsel süreci çalıştırır (Adaptif, Paralel). """
490
+
491
+ best_fitness_history = []
492
+ avg_fitness_history = []
493
+ best_model_overall: Optional[NeuralNetwork] = None
494
+ best_fitness_overall = -np.inf
495
+
496
+ # Adaptif Mutasyon için başlangıç değerleri
497
+ current_mutation_strength = args.mutation_strength
498
+ stagnation_counter = 0
499
+
500
+ pop_size = len(population)
501
+ fitness_params = {'complexity_penalty': args.complexity_penalty} # Fitness worker için parametreler
502
+
503
+ # Paralel işleyici havuzu (eğer worker > 0 ise)
504
+ # 'fork' yerine 'spawn' kullanmak daha güvenli olabilir (özellikle CUDA ile)
505
+ # Ancak 'spawn' daha fazla overhead yaratabilir. Duruma göre seçilebilir.
506
+ # context = torch.multiprocessing.get_context("spawn") if args.num_workers > 0 else None
507
+ # executor = concurrent.futures.ProcessPoolExecutor(max_workers=args.num_workers, mp_context=context) if args.num_workers > 0 else None
508
+ executor = concurrent.futures.ProcessPoolExecutor(max_workers=args.num_workers) if args.num_workers > 0 else None
509
+ if executor:
510
+ logging.info(f"Using ProcessPoolExecutor with {args.num_workers} workers for fitness evaluation.")
511
+
512
+ try: # Executor'ı düzgün kapatmak için try...finally
513
+ for gen in range(start_generation, total_generations):
514
+ generation_start_time = time.time()
515
+
516
+ # 1. Fitness Değerlendirme (Paralel veya Seri)
517
+ fitness_scores = [-np.inf] * pop_size # Başlangıç değeri
518
+ population_states = [(ind.get_architecture(), ind.state_dict()) for ind in population]
519
+
520
+ try:
521
+ if executor and args.num_workers > 0:
522
+ futures = [executor.submit(_calculate_fitness_worker,
523
+ arch, state, X_train_np, y_train_np,
524
+ str(device), fitness_params)
525
+ for arch, state in population_states]
526
+ # concurrent.futures.wait(futures) # Beklemeye gerek yok, as_completed daha iyi
527
+ results = []
528
+ # Sonuçları geldikçe işle (sırasız gelebilir)
529
+ for i, future in enumerate(concurrent.futures.as_completed(futures)):
530
+ try:
531
+ result = future.result()
532
+ results.append(result)
533
+ # print(f"DEBUG: Worker {i} finished with fitness {result}") # DEBUG
534
+ except Exception as exc:
535
+ logging.error(f"Fitness calculation job {i} generated an exception: {exc}")
536
+ results.append(-np.inf) # Hata durumunda minimum fitness
537
+ # Sonuçları doğru sıraya koymak GEREKLİ DEĞİL çünkü seçilim/elitizm zaten skorlara göre çalışır
538
+ # Ancak loglama/takip için orijinal sıra önemliyse, future'ları dict ile takip edip sıraya dizmek gerekir.
539
+ # Basitlik için, sonuç listesinin popülasyonla aynı sırada olduğunu varsayalım (as_completed sırayı bozar!)
540
+ # DÜZELTME: Sonuçları sıraya dizmek ŞART. Future'ları indeksle takip et.
541
+ results_map = {}
542
+ futures_map = {executor.submit(_calculate_fitness_worker,
543
+ pop[0], pop[1], X_train_np, y_train_np,
544
+ str(device), fitness_params): index
545
+ for index, pop in enumerate(population_states)}
546
+
547
+ for future in concurrent.futures.as_completed(futures_map):
548
+ original_index = futures_map[future]
549
+ try:
550
+ result = future.result()
551
+ fitness_scores[original_index] = result
552
+ except Exception as exc:
553
+ logging.error(f'Individual {original_index} generated an exception: {exc}')
554
+ fitness_scores[original_index] = -np.inf # Hata durumunda
555
+
556
+ else: # Seri hesaplama (num_workers=0)
557
+ logging.debug("Calculating fitness sequentially...")
558
+ temp_device = torch.device("cpu") # Seri hesaplamayı CPU'da yapmak GPU'yu meşgul etmez
559
+ # Ana süreçte modeli CPU'ya taşı, hesapla, sonucu al
560
+ for i, (arch, state) in enumerate(population_states):
561
+ # Modeli her seferinde yeniden oluşturmak yerine klonlamak daha verimli olabilir mi?
562
+ # Ancak _calculate_fitness_worker mantığına uymak için yeniden oluşturalım.
563
+ try:
564
+ model_instance = NeuralNetwork(**arch)
565
+ model_instance.load_state_dict(state)
566
+ model_instance.to(temp_device)
567
+ fitness_scores[i] = calculate_fitness_pytorch( # Bu fonksiyon artık sadece seri için
568
+ model_instance, X_train_np, y_train_np,
569
+ temp_device, fitness_params)
570
+ except Exception as e:
571
+ logging.error(f"Error calculating fitness for individual {i} sequentially: {e}")
572
+ fitness_scores[i] = -np.inf
573
+
574
+
575
+ except Exception as e:
576
+ logging.critical(f"Error during fitness evaluation distribution/collection in Gen {gen+1}: {e}", exc_info=True)
577
+ raise # Bu kritik bir hata, devam etmek zor
578
+
579
+ # Fitness hesaplama sonrası GPU belleğini temizle (paralel workerlar ayrı process olduğu için burada etkisi olmaz ama seri için kalabilir)
580
+ # if device.type == 'cuda': torch.cuda.empty_cache()
581
+
582
+ # 2. İstatistikler ve En İyiyi Takip
583
+ valid_indices = [i for i, score in enumerate(fitness_scores) if np.isfinite(score)]
584
+ if not valid_indices:
585
+ logging.error(f"Generation {gen+1}: No individuals with finite fitness scores found! Cannot proceed.")
586
+ # Burada ne yapmalı? Popülasyonu sıfırlamak mı, durmak mı? Şimdilik duralım.
587
+ raise RuntimeError(f"Evolution stopped at generation {gen+1} due to lack of valid individuals.")
588
+
589
+ current_best_idx_local = np.argmax([fitness_scores[i] for i in valid_indices])
590
+ current_best_idx_global = valid_indices[current_best_idx_local]
591
+ current_best_fitness = fitness_scores[current_best_idx_global]
592
+
593
+ finite_scores = [fitness_scores[i] for i in valid_indices]
594
+ avg_fitness = np.mean(finite_scores)
595
+
596
+ best_fitness_history.append(current_best_fitness)
597
+ avg_fitness_history.append(avg_fitness)
598
+
599
+ new_best_found = False
600
+ if current_best_fitness > best_fitness_overall:
601
+ best_fitness_overall = current_best_fitness
602
+ new_best_found = True
603
+ try:
604
+ best_model_overall = clone_pytorch_model(population[current_best_idx_global], device)
605
+ logging.info(f"Generation {gen+1}: *** New overall best fitness: {best_fitness_overall:.6f} (Model: {best_model_overall.model_name}) ***")
606
+ except Exception as e:
607
+ logging.error(f"Could not clone new best model {population[current_best_idx_global].model_name}: {e}", exc_info=True)
608
+ best_model_overall = None
609
+ # else: # En iyi bulunamadıysa veya aynıysa
610
+ # pass # Stagnation sayacı aşağıda artacak
611
+
612
+ generation_time = time.time() - generation_start_time
613
+ logging.info(f"Generation {gen+1}/{total_generations} | Best Fitness: {current_best_fitness:.6f} | Avg Fitness: {avg_fitness:.6f} | Mut Strength: {current_mutation_strength:.4f} | Time: {generation_time:.2f}s")
614
+
615
+ # W&B Loglama (eğer aktifse)
616
+ if wandb_run:
617
+ try:
618
+ wandb_run.log({
619
+ "generation": gen + 1,
620
+ "best_fitness": current_best_fitness,
621
+ "average_fitness": avg_fitness,
622
+ "mutation_strength": current_mutation_strength,
623
+ "generation_time_sec": generation_time,
624
+ "num_valid_individuals": len(valid_indices),
625
+ # "best_model_params": best_model_overall.get_num_params() if best_model_overall else None # En iyinin parametre sayısı
626
+ }, step=gen + 1) # Adım olarak nesil numarasını kullan
627
+ except Exception as e:
628
+ logging.warning(f"Failed to log metrics to W&B: {e}")
629
+
630
+
631
+ # Adaptif Mutasyon Gücü Güncelleme
632
+ if args.adapt_mutation:
633
+ if new_best_found:
634
+ stagnation_counter = 0
635
+ current_mutation_strength = max(args.min_mut_strength, current_mutation_strength * args.mut_strength_decay)
636
+ logging.debug(f"Improvement found. Decreasing mutation strength to {current_mutation_strength:.4f}")
637
+ else:
638
+ stagnation_counter += 1
639
+ logging.debug(f"No improvement. Stagnation counter: {stagnation_counter}")
640
+ if stagnation_counter >= args.stagnation_limit:
641
+ current_mutation_strength = min(args.max_mut_strength, current_mutation_strength * args.mut_strength_increase)
642
+ logging.info(f"Stagnation detected ({stagnation_counter} gens). Increasing mutation strength to {current_mutation_strength:.4f}")
643
+ stagnation_counter = 0 # Sayacı sıfırla
644
+
645
+ # 3. Yeni Popülasyon Oluşturma (Elitizm, Çaprazlama, Mutasyon)
646
+ new_population = []
647
+
648
+ # 3a. Elitizm
649
+ if args.elitism_count > 0 and len(population) >= args.elitism_count:
650
+ try:
651
+ # Sadece geçerli fitness'a sahip elitleri seç
652
+ sorted_valid_indices = sorted(valid_indices, key=lambda i: fitness_scores[i], reverse=True)
653
+ elite_indices = sorted_valid_indices[:args.elitism_count]
654
+ for idx in elite_indices:
655
+ elite_clone = clone_pytorch_model(population[idx], device)
656
+ elite_clone.model_name = f"elite_{population[idx].model_name}"
657
+ new_population.append(elite_clone)
658
+ logging.debug(f"Added {len(new_population)} elites to the next generation.")
659
+ except Exception as e:
660
+ logging.error(f"Error during elitism: {e}", exc_info=True)
661
+
662
+ # 3b. Kalanları Üretme
663
+ num_to_generate = pop_size - len(new_population)
664
+ generated_count = 0
665
+ reproduction_attempts = 0
666
+ max_reproduction_attempts = num_to_generate * 5 # Daha cömert sınır
667
+
668
+ while generated_count < num_to_generate and reproduction_attempts < max_reproduction_attempts:
669
+ reproduction_attempts += 1
670
+ try:
671
+ parent1 = tournament_selection(population, fitness_scores, args.tournament_size)
672
+ parent2 = tournament_selection(population, fitness_scores, args.tournament_size)
673
+ child1, child2 = None, None
674
+
675
+ if random.random() < crossover_rate and parent1 is not parent2:
676
+ child1, child2 = crossover_individuals_pytorch(parent1, parent2, device)
677
+
678
+ if child1 is None: # Çaprazlama olmadıysa veya başarısızsa
679
+ if random.random() < mutation_rate:
680
+ parent_to_mutate = parent1
681
+ child1 = mutate_individual_pytorch(parent_to_mutate, weight_mut_rate, current_mutation_strength, device)
682
+ else: # Klonlama
683
+ child1 = clone_pytorch_model(parent1, device)
684
+ child1.model_name = f"direct_clone_{parent1.model_name}_{random.randint(1000,9999)}"
685
+
686
+ if child1:
687
+ new_population.append(child1); generated_count += 1
688
+ if generated_count >= num_to_generate: break
689
+ if child2:
690
+ new_population.append(child2); generated_count += 1
691
+ if generated_count >= num_to_generate: break
692
+
693
+ except Exception as e:
694
+ logging.error(f"Error during selection/reproduction cycle (attempt {reproduction_attempts}): {e}", exc_info=True)
695
+
696
+ if generated_count < num_to_generate:
697
+ logging.warning(f"Reproduction cycle failed to generate enough individuals. Adding {num_to_generate - generated_count} random individuals.")
698
+ # Rastgele bireyleri eklemeden önce popülasyonun boş olmadığından emin ol
699
+ if population:
700
+ input_s = population[0].input_size
701
+ output_s = population[0].output_size
702
+ for _ in range(num_to_generate - generated_count):
703
+ try:
704
+ random_ind = create_individual_pytorch(input_s, output_s).to(device)
705
+ new_population.append(random_ind)
706
+ except Exception as e:
707
+ logging.error(f"Failed to create random individual to fill population: {e}")
708
+ else: # İlk popülasyon da boşsa veya hata oluştuysa
709
+ logging.error("Cannot create random individuals as initial population is unavailable.")
710
+
711
+
712
+ population = new_population[:pop_size] # Boyutu garantile
713
+
714
+ # 4. Checkpoint Alma
715
+ if args.checkpoint_interval > 0 and (gen + 1) % args.checkpoint_interval == 0:
716
+ try:
717
+ rnd_state = random.getstate()
718
+ np_rnd_state = np.random.get_state()
719
+ torch_rnd_state = torch.get_rng_state().cpu() # CPU state'i kaydet
720
+ wandb_id = wandb_run.id if wandb_run else None
721
+ save_checkpoint_pytorch(output_dir, gen + 1, population, rnd_state, np_rnd_state, torch_rnd_state, wandb_id)
722
+ except Exception as e:
723
+ logging.error(f"Failed to execute checkpoint saving for generation {gen+1}: {e}", exc_info=True)
724
+
725
+ # Bellek temizliği (çok büyük ağlarda işe yarayabilir)
726
+ # import gc; gc.collect()
727
+ # if device.type == 'cuda': torch.cuda.empty_cache()
728
+
729
+ finally: # Executor'ı her zaman kapat
730
+ if executor:
731
+ logging.info("Shutting down ProcessPoolExecutor...")
732
+ executor.shutdown(wait=True) # İşlerin bitmesini bekle
733
+ logging.info("Executor shut down.")
734
+
735
+
736
+ # Evrim Sonu
737
+ if best_model_overall is None and population:
738
+ logging.warning("Evolution finished, but no single best model was tracked. Selecting best from final population.")
739
+ # Son popülasyondan en iyiyi seçmek için fitness'ları tekrar hesapla (veya son skorları kullan?)
740
+ # En güvenlisi tekrar hesaplamak:
741
+ final_population_states = [(ind.get_architecture(), ind.state_dict()) for ind in population]
742
+ final_fitness_scores = [-np.inf] * len(population)
743
+ # Seri hesaplama yapalım (executor kapalı)
744
+ temp_device = torch.device("cpu")
745
+ for i, (arch, state) in enumerate(final_population_states):
746
+ try:
747
+ model_instance = NeuralNetwork(**arch); model_instance.load_state_dict(state); model_instance.to(temp_device)
748
+ final_fitness_scores[i] = calculate_fitness_pytorch(model_instance, X_train_np, y_train_np, temp_device, fitness_params)
749
+ except Exception: final_fitness_scores[i] = -np.inf
750
+
751
+ final_valid_indices = [i for i, score in enumerate(final_fitness_scores) if np.isfinite(score)]
752
+ if final_valid_indices:
753
+ best_idx_final = max(final_valid_indices, key=lambda i: final_fitness_scores[i])
754
+ best_model_overall = clone_pytorch_model(population[best_idx_final], device)
755
+ best_fitness_overall = final_fitness_scores[best_idx_final]
756
+ logging.info(f"Selected best model from final population: {best_model_overall.model_name} with fitness {best_fitness_overall:.6f}")
757
+ else:
758
+ logging.error("Evolution finished. No valid finite fitness scores in the final population.")
759
+ return None, best_fitness_history, avg_fitness_history
760
+ elif not population:
761
+ logging.error("Evolution finished with an empty population!")
762
+ return None, best_fitness_history, avg_fitness_history
763
+ else: # best_model_overall zaten bulundu
764
+ logging.info(f"Evolution finished. Best fitness achieved: {best_fitness_overall:.6f} by model {best_model_overall.model_name}")
765
+
766
+ return best_model_overall, best_fitness_history, avg_fitness_history
767
+
768
+
769
+ # --- Fitness Calculation (serial - for the main process or workers=0) ---
770
+ # Unlike the parallel worker, this takes the model object directly.
771
def calculate_fitness_pytorch(
    individual: NeuralNetwork,
    X_np: np.ndarray, y_np: np.ndarray,  # training data arrives as NumPy arrays
    device: torch.device,
    fitness_params: Dict
) -> float:
    """Compute the fitness of one individual (serial path).

    Fitness is the inverse of the training MSE, ``1 / (mse + 1e-9)``,
    optionally reduced by a complexity penalty proportional to the number of
    trainable parameters. Any failure or non-finite intermediate value yields
    ``-inf`` so selection effectively discards the individual.
    """
    individual.eval()
    individual.to(device)

    # Move the data onto the evaluation device; a failure here disqualifies
    # the individual rather than aborting the whole generation.
    try:
        inputs = torch.from_numpy(X_np).float().to(device)
        targets = torch.from_numpy(y_np).float().to(device)
    except Exception as e:
        logging.error(f"Error converting data to tensor or moving to device in calculate_fitness_pytorch: {e}")
        return -np.inf

    penalty_weight = fitness_params.get('complexity_penalty', 0.0)

    try:
        with torch.no_grad():
            predictions = individual(inputs)
            mse_val = torch.mean((predictions - targets) ** 2).item()

        if not np.isfinite(mse_val):
            logging.warning(f"Non-finite MSE ({mse_val}) for model {individual.model_name} (Serial Calc). Assigning minimal fitness.")
            return -np.inf

        # Inverse-MSE fitness; the epsilon avoids division by zero on a
        # perfect fit.
        fitness_score = 1.0 / (mse_val + 1e-9)

        if penalty_weight > 0:
            fitness_score -= penalty_weight * individual.get_num_params(trainable_only=True)

        if not np.isfinite(fitness_score):
            logging.warning(f"Non-finite final fitness ({fitness_score:.4g}) for model {individual.model_name} (Serial Calc). Assigning minimal fitness.")
            return -np.inf

        return float(fitness_score)

    except Exception as e:
        logging.error(f"Error during serial fitness calculation for model {individual.model_name}: {e}", exc_info=True)
        return -np.inf
815
+
816
+
817
+ # --- Plotting ---
818
+ # (plot_fitness_history is unchanged from the previous version, as in v4)
819
def plot_fitness_history(history_best: List[float], history_avg: List[float], output_dir: str, filename: str = "fitness_history_pytorch_v5.png") -> None:
    """Plot best/average fitness per generation and save the figure as a PNG.

    Non-finite entries are filtered out of each series before plotting.
    If either history is empty, a warning is logged and nothing is drawn;
    any plotting/saving error is logged rather than raised.
    """
    if not history_best or not history_avg:
        logging.warning("Fitness history empty, cannot plot.")
        return
    try:
        plt.figure(figsize=(12, 7))
        generations = np.arange(1, len(history_best) + 1)
        # Indices of the finite entries — only these are plotted.
        best_idx = [i for i, v in enumerate(history_best) if np.isfinite(v)]
        avg_idx = [i for i, v in enumerate(history_avg) if np.isfinite(v)]
        if best_idx:
            plt.plot(generations[best_idx], np.array(history_best)[best_idx],
                     label="Best Fitness", marker='o', linestyle='-', linewidth=2)
        if avg_idx:
            plt.plot(generations[avg_idx], np.array(history_avg)[avg_idx],
                     label="Average Fitness", marker='x', linestyle='--', alpha=0.7)
        plt.xlabel("Generation")
        plt.ylabel("Fitness Score")
        plt.title("Evolutionary Fitness History (PyTorch v5)")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plot_path = os.path.join(output_dir, filename)
        plt.savefig(plot_path)
        plt.close()
        logging.info(f"Fitness history plot saved to {plot_path}")
    except Exception as e:
        logging.error(f"Error plotting fitness history: {e}", exc_info=True)
832
+
833
+
834
+ # --- Evaluation (PyTorch v5) ---
835
+ # (evaluate_model_pytorch is unchanged from the previous version, as in v4)
836
+ # Only the logging may be updated here.
837
def evaluate_model_pytorch(
    model: "NeuralNetwork",
    X_test_np: np.ndarray, y_test_np: np.ndarray,
    batch_size: int, device: torch.device
) -> Dict[str, float]:
    """Evaluate the final model on the held-out test data (PyTorch v5).

    Returns a dict with:
      * ``test_mse``        -- mean squared error over all test samples.
                               BUGFIX: the original averaged per-batch MSEs
                               over the number of batches, which over-weights
                               a smaller final batch; this version weights by
                               sample count for a correct mean.
      * ``avg_kendall_tau`` -- Kendall's tau between each predicted and target
                               row, averaged over up to 500 random samples
                               (0.0 if no tau could be computed).

    Any failure yields an "infinitely bad" result instead of raising.
    """
    if model is None:
        logging.error("Cannot evaluate a None model.")
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
    logging.info(f"Evaluating final model {model.model_name} on test data (PyTorch v5)...")
    model.eval()
    model.to(device)
    try:
        test_dataset = TensorDataset(torch.from_numpy(X_test_np).float(), torch.from_numpy(y_test_np).float())
        test_loader = DataLoader(test_dataset, batch_size=batch_size)
    except Exception as e:
        logging.error(f"Failed to create PyTorch DataLoader for test data: {e}", exc_info=True)
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}

    all_preds, all_targets = [], []
    total_sq_error, total_samples = 0.0, 0
    try:
        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                # Accumulate batch_mse * batch_size so the final division by
                # the total sample count gives a sample-weighted mean even
                # when the last batch is smaller than the others.
                batch_n = inputs.size(0)
                total_sq_error += torch.mean((outputs - targets) ** 2).item() * batch_n
                total_samples += batch_n
                all_preds.append(outputs.cpu().numpy())
                all_targets.append(targets.cpu().numpy())

        avg_mse = total_sq_error / total_samples if total_samples > 0 else np.inf
        logging.info(f"Final Test MSE: {avg_mse:.6f}")
        all_preds_np = np.concatenate(all_preds, axis=0)
        all_targets_np = np.concatenate(all_targets, axis=0)
        # Kendall's tau is O(n log n) per row, so only a random subsample of
        # rows is scored.
        sample_size = min(500, all_targets_np.shape[0])
        taus = []
        if sample_size > 0:
            indices = np.random.choice(all_targets_np.shape[0], sample_size, replace=False)
            for i in indices:
                try:
                    tau, _ = kendalltau(all_targets_np[i], all_preds_np[i])
                    if not np.isnan(tau):
                        taus.append(tau)
                except ValueError:
                    # kendalltau can reject degenerate rows; just skip them.
                    pass
        avg_kendall_tau = np.mean(taus) if taus else 0.0
        logging.info(f"Average Kendall's Tau (on {sample_size} samples): {avg_kendall_tau:.4f}")
        return {"test_mse": float(avg_mse), "avg_kendall_tau": float(avg_kendall_tau)}
    except Exception as e:
        logging.error(f"Error during final model evaluation: {e}", exc_info=True)
        return {"test_mse": np.inf, "avg_kendall_tau": 0.0}
883
+
884
+
885
+ # --- Final Training (PyTorch v5) ---
886
+ # (train_final_model_pytorch is unchanged from the previous version, as in v4)
887
+ # Only the logging may be updated here.
888
def train_final_model_pytorch(
    model: "NeuralNetwork",
    X_train_np: np.ndarray, y_train_np: np.ndarray,
    epochs: int, batch_size: int, learning_rate: float,
    device: torch.device, output_dir: str,
    wandb_run: Optional[Any]  # active W&B run object, or None to skip remote logging
) -> Tuple["NeuralNetwork", Dict[str, Any]]:
    """Fine-tune the best evolved model with standard gradient descent.

    Splits the training data 80/20 into train/validation, trains with Adam +
    MSE loss, reduces the LR on validation plateaus (ReduceLROnPlateau) and
    early-stops after 15 epochs without improvement, restoring the weights
    with the best validation loss.

    Returns:
        (model, summary) where summary holds epochs_run / losses / final LR,
        or an {"error": ...} dict if setup or training failed.
    """
    logging.info(f"--- Starting Final Training of Best Evolved Model ({model.model_name}) ---")
    model.to(device)
    try:
        train_dataset = TensorDataset(torch.from_numpy(X_train_np).float(), torch.from_numpy(y_train_np).float())
        val_split = 0.2
        num_train = len(train_dataset)
        split_idx = int(np.floor(val_split * num_train))
        indices = list(range(num_train))
        np.random.shuffle(indices)
        train_indices, val_indices = indices[split_idx:], indices[:split_idx]
        train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
        # BUGFIX: the original used SequentialSampler(val_indices), which
        # yields positions 0..len(val_indices)-1 of the *full* dataset --
        # i.e. validation ran on (mostly) training samples. A Subset makes
        # validation really use the held-out indices, in order.
        val_subset = torch.utils.data.Subset(train_dataset, val_indices)
        # num_workers=0: the data is already in-memory tensors, so worker
        # processes would only add inter-process serialization overhead.
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, num_workers=0)
        val_loader = DataLoader(val_subset, batch_size=batch_size, num_workers=0)
        logging.info(f"Created DataLoaders. Train samples: {len(train_indices)}, Val samples: {len(val_indices)}")
    except Exception as e:
        logging.error(f"Failed to create DataLoaders for final training: {e}", exc_info=True)
        return model, {"error": "DataLoader creation failed"}

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    # NOTE: the deprecated 'verbose' kwarg is dropped; default behavior is identical.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=7, min_lr=1e-7)

    early_stopping_patience = 15
    best_val_loss = np.inf
    epochs_no_improve = 0
    best_model_state = None
    training_history = {'train_loss': [], 'val_loss': [], 'lr': []}
    epochs_run = 0
    avg_train_loss = np.nan  # defined even if epochs == 0 (used in the summary)

    try:
        for epoch in range(epochs):
            epochs_run += 1
            model.train()
            running_train_loss = 0.0
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                running_train_loss += loss.item()
            avg_train_loss = running_train_loss / len(train_loader) if len(train_loader) > 0 else 0.0
            training_history['train_loss'].append(avg_train_loss)
            training_history['lr'].append(optimizer.param_groups[0]['lr'])

            model.eval()
            running_val_loss = 0.0
            with torch.no_grad():
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    running_val_loss += criterion(model(inputs), targets).item()
            avg_val_loss = running_val_loss / len(val_loader) if len(val_loader) > 0 else np.inf
            training_history['val_loss'].append(avg_val_loss)
            logging.info(f"Epoch [{epoch+1}/{epochs}] Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f} | LR: {optimizer.param_groups[0]['lr']:.2e}")

            # W&B logging for the final-training phase.
            if wandb_run:
                try:
                    # BUGFIX: the original passed step=start_generation+epochs_run,
                    # but start_generation is not defined in this function, so the
                    # log call raised NameError (silently swallowed) every epoch.
                    # Let W&B auto-increment the step; the epoch is in the payload.
                    wandb_run.log({
                        "final_train_epoch": epoch + 1,
                        "final_train_loss": avg_train_loss,
                        "final_val_loss": avg_val_loss,
                        "final_learning_rate": optimizer.param_groups[0]['lr']
                    })
                except Exception as e:
                    logging.warning(f"Failed to log final training metrics to W&B: {e}")

            scheduler.step(avg_val_loss)
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                epochs_no_improve = 0
                # Snapshot weights so the best-validation model can be restored.
                best_model_state = copy.deepcopy(model.state_dict())
                logging.debug(f"New best val loss: {best_val_loss:.6f}")
            else:
                epochs_no_improve += 1
            if epochs_no_improve >= early_stopping_patience:
                logging.info(f"Early stopping triggered after {epoch+1} epochs.")
                break

        if best_model_state:
            logging.info(f"Restoring model to best validation performance.")
            model.load_state_dict(best_model_state)
        else:
            logging.warning("No best model state saved during training.")

        logging.info("Final training complete.")
        training_summary = {"epochs_run": epochs_run, "final_train_loss": avg_train_loss,
                            "best_val_loss": best_val_loss, "final_lr": optimizer.param_groups[0]['lr']}
        return model, training_summary

    except Exception as e:
        logging.error(f"Error during final PyTorch model training: {e}", exc_info=True)
        return model, {"error": str(e)}
971
+
972
+
973
+ # --- Ana İş Akışı (PyTorch v5) ---
974
+ def run_pipeline_pytorch_v5(args: argparse.Namespace):
975
+ """ Checkpoint, Adaptif, Paralel PyTorch v5 tabanlı ana iş akışı. """
976
+
977
+ wandb_run = None # W&B run objesi
978
+ output_dir = None # Hata durumunda tanımlı olması için
979
+
980
+ try: # Ana try bloğu, W&B finish için
981
+ device = setup_device(args.device)
982
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
983
+ run_name = f"evorun_pt_v5_{timestamp}_gen{args.generations}_pop{args.pop_size}"
984
+ output_dir = args.resume_from if args.resume_from else os.path.join(args.output_base_dir, run_name)
985
+ resume_run = bool(args.resume_from)
986
+ resumed_wandb_id = None
987
+
988
+ if resume_run:
989
+ run_name = os.path.basename(output_dir)
990
+ logging.info(f"Attempting to resume PyTorch v5 run from: {output_dir}")
991
+ if not os.path.isdir(output_dir): logging.error(f"Resume directory not found: {output_dir}. Exiting."); sys.exit(1)
992
+ else:
993
+ try: os.makedirs(output_dir, exist_ok=True)
994
+ except OSError as e: print(f"FATAL: Could not create output dir: {output_dir}. Error: {e}", file=sys.stderr); sys.exit(1)
995
+
996
+ setup_logging(output_dir)
997
+ logging.info(f"========== Starting/Resuming EvoNet v5 PyTorch Pipeline: {run_name} ==========")
998
+ logging.info(f"Output directory: {output_dir}")
999
+ logging.info(f"Using device: {device}")
1000
+
1001
+ # Checkpoint Yükleme
1002
+ start_generation = 0; population = []; initial_state_loaded = False; loaded_history_best = []; loaded_history_avg = []
1003
+ latest_checkpoint_path = find_latest_checkpoint_pytorch(output_dir) if resume_run else None
1004
+
1005
+ if latest_checkpoint_path:
1006
+ loaded_state = load_checkpoint_pytorch(latest_checkpoint_path, device)
1007
+ if loaded_state:
1008
+ start_generation = loaded_state['generation']
1009
+ population = loaded_state['population']
1010
+ resumed_wandb_id = loaded_state.get("wandb_run_id") # W&B ID'sini al
1011
+ try: # Random state yükleme
1012
+ random.setstate(loaded_state['random_state']); np.random.set_state(loaded_state['numpy_random_state'])
1013
+ torch.set_rng_state(loaded_state['torch_random_state'].cpu())
1014
+ logging.info(f"Random states restored from checkpoint (Generation {start_generation}).")
1015
+ except Exception as e: logging.warning(f"Could not fully restore random states: {e}")
1016
+ initial_state_loaded = True
1017
+ logging.info(f"Resuming from Generation {start_generation + 1} with {len(population)} individuals.")
1018
+ if resumed_wandb_id: logging.info(f"Found previous W&B run ID in checkpoint: {resumed_wandb_id}")
1019
+ else: logging.error("Failed to load checkpoint. Starting from scratch."); resume_run = False
1020
+ elif resume_run: logging.warning(f"Resume requested but no valid v5 checkpoint found. Starting from scratch."); resume_run = False
1021
+
1022
+
1023
+ # W&B Başlatma (eğer argüman verildiyse ve kütüphane varsa)
1024
+ if args.use_wandb and _WANDB_AVAILABLE:
1025
+ try:
1026
+ wandb_kwargs = {
1027
+ "project": args.wandb_project,
1028
+ "entity": args.wandb_entity,
1029
+ "name": run_name,
1030
+ "config": vars(args), # Argümanları kaydet
1031
+ "dir": output_dir, # Logları çıktı klasörüne yazdır
1032
+ "resume": "allow", # Devam etmeye izin ver
1033
+ "id": resumed_wandb_id # Eğer varsa önceki ID'yi kullan
1034
+ }
1035
+ # Entity boşsa argümandan çıkar
1036
+ if not wandb_kwargs["entity"]: del wandb_kwargs["entity"]
1037
+
1038
+ wandb_run = wandb.init(**wandb_kwargs)
1039
+ logging.info(f"Weights & Biases initialized. Run ID: {wandb_run.id if wandb_run else 'N/A'}")
1040
+ # Eğer yeni bir run başladıysa (resume edilmediyse) veya ID değiştiyse W&B ID'sini logla
1041
+ if wandb_run and (not resume_run or wandb_run.id != resumed_wandb_id):
1042
+ logging.info(f"Logging to W&B run: {wandb_run.get_url()}" if wandb_run else "W&B run URL not available.")
1043
+
1044
+ except Exception as e:
1045
+ logging.error(f"Failed to initialize Weights & Biases: {e}", exc_info=True)
1046
+ wandb_run = None # Başarısız olursa devam et ama loglama yapma
1047
+
1048
+
1049
+ # Config Kaydetme/Loglama (v4'teki gibi)
1050
+ config_path = os.path.join(output_dir, "config_pytorch_v5.json")
1051
+ args_dict = vars(args)
1052
+ if not initial_state_loaded or not os.path.exists(config_path):
1053
+ logging.info("--- Configuration ---")
1054
+ for k, v in args_dict.items(): logging.info(f" {k:<25}: {v}")
1055
+ logging.info("---------------------")
1056
+ try:
1057
+ args_to_save = args_dict.copy(); args_to_save['device'] = str(device)
1058
+ with open(config_path, 'w') as f: json.dump(args_to_save, f, indent=4, sort_keys=True)
1059
+ logging.info(f"Configuration saved to {config_path}")
1060
+ except Exception as e: logging.error(f"Failed to save configuration: {e}", exc_info=True)
1061
+ else: # Devam ediliyorsa logla
1062
+ try:
1063
+ with open(config_path, 'r') as f: loaded_args_dict = json.load(f)
1064
+ logging.info("--- Loaded Configuration (from resumed run) ---")
1065
+ for k, v in loaded_args_dict.items(): logging.info(f" {k:<25}: {v}")
1066
+ logging.info("-----------------------------------------------")
1067
+ except Exception as e: logging.warning(f"Could not reload config.json: {e}")
1068
+
1069
+
1070
+ # Random Tohum Ayarlama (sadece sıfırdan başlarken)
1071
+ if not initial_state_loaded:
1072
+ try:
1073
+ seed = args.seed
1074
+ random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
1075
+ if device.type == 'cuda': torch.cuda.manual_seed_all(seed)
1076
+ logging.info(f"Using random seed: {seed}")
1077
+ except Exception as e: logging.warning(f"Could not set all random seeds: {e}")
1078
+
1079
+
1080
+ # Veri Üretimi (her zaman)
1081
+ try:
1082
+ logging.info("Generating/Reloading data...")
1083
+ X_train_np, y_train_np = generate_data(args.train_samples, args.seq_length)
1084
+ X_test_np, y_test_np = generate_data(args.test_samples, args.seq_length)
1085
+ input_shape = X_train_np.shape[1]
1086
+ output_shape = y_train_np.shape[1]
1087
+ except Exception: logging.critical("Failed to generate/reload data. Exiting."); sys.exit(1)
1088
+
1089
+
1090
+ # Popülasyon Başlatma (sadece sıfırdan başlarken)
1091
+ if not initial_state_loaded:
1092
+ logging.info(f"--- Initializing Population (Size: {args.pop_size}) ---")
1093
+ try:
1094
+ population = [create_individual_pytorch(input_shape, output_shape).to(device) for _ in range(args.pop_size)]
1095
+ logging.info("Population initialized successfully.")
1096
+ except Exception: logging.critical("Failed to initialize population. Exiting."); sys.exit(1)
1097
+
1098
+
1099
+ # Evrim Süreci
1100
+ logging.info(f"--- Starting/Resuming PyTorch v5 Evolution ({args.generations} Total Generations) ---")
1101
+ best_model_evolved: Optional[NeuralNetwork] = None
1102
+ best_fitness_hist = loaded_history_best
1103
+ avg_fitness_hist = loaded_history_avg
1104
+
1105
+ if start_generation >= args.generations:
1106
+ logging.warning(f"Loaded checkpoint gen ({start_generation}) >= total gens ({args.generations}). Skipping evolution.")
1107
+ # Checkpoint'ten en iyiyi al (v4'teki gibi TODO: daha iyi yöntem)
1108
+ if population:
1109
+ # Son popülasyondan en iyiyi seç (fitness hesaplayarak)
1110
+ try:
1111
+ logging.info("Selecting best model from loaded population as evolution is skipped...")
1112
+ temp_device = torch.device("cpu")
1113
+ fitness_scores_loaded = [calculate_fitness_pytorch(ind, X_train_np, y_train_np, temp_device, {'complexity_penalty': args.complexity_penalty}) for ind in population]
1114
+ valid_scores_loaded = [(s, i) for i, s in enumerate(fitness_scores_loaded) if np.isfinite(s)]
1115
+ if valid_scores_loaded:
1116
+ best_idx_loaded = max(valid_scores_loaded, key=lambda item: item[0])[1]
1117
+ best_model_evolved = clone_pytorch_model(population[best_idx_loaded], device)
1118
+ logging.info(f"Using model {best_model_evolved.model_name} from loaded population.")
1119
+ else: logging.warning("Could not determine best model from loaded population."); best_model_evolved = None
1120
+ except Exception as e: logging.error(f"Error selecting best model from loaded population: {e}"); best_model_evolved = None
1121
+ else: best_model_evolved = None
1122
+ else:
1123
+ try:
1124
+ best_model_evolved, gen_best_hist, gen_avg_hist = evolve_population_pytorch_v5(
1125
+ population, X_train_np, y_train_np, start_generation, args.generations,
1126
+ args.crossover_rate, args.mutation_rate, args.weight_mut_rate,
1127
+ args, # Tüm argümanları geçir
1128
+ output_dir, device, wandb_run
1129
+ )
1130
+ best_fitness_hist.extend(gen_best_hist)
1131
+ avg_fitness_hist.extend(gen_avg_hist)
1132
+ except Exception as e:
1133
+ logging.critical(f"Fatal error during PyTorch v5 evolution process: {e}", exc_info=True)
1134
+ raise # Hatayı yukarı fırlat
1135
+
1136
+ logging.info("--- PyTorch v5 Evolution Complete ---")
1137
+
1138
+ # Fitness Geçmişi Kaydet/Çizdir (v4'teki gibi)
1139
+ if best_fitness_hist or avg_fitness_hist:
1140
+ plot_fitness_history(best_fitness_hist, avg_fitness_hist, output_dir)
1141
+ history_path = os.path.join(output_dir, "fitness_history_pytorch_v5.csv")
1142
+ try:
1143
+ history_data = np.array([np.arange(1, len(best_fitness_hist) + 1), best_fitness_hist, avg_fitness_hist]).T
1144
+ np.savetxt(history_path, history_data, delimiter=',', header='Generation,BestFitness,AvgFitness', comments='', fmt=['%d', '%.8f', '%.8f'])
1145
+ logging.info(f"Full fitness history saved to {history_path}")
1146
+ # W&B'ye tablo olarak logla (opsiyonel)
1147
+ if wandb_run:
1148
+ try:
1149
+ table = wandb.Table(data=history_data, columns=["Generation", "BestFitness", "AvgFitness"])
1150
+ wandb_run.log({"fitness_history_table": table})
1151
+ except Exception as e: logging.warning(f"Failed to log fitness history table to W&B: {e}")
1152
+
1153
+ except Exception as e: logging.error(f"Could not save fitness history data: {e}")
1154
+ else: logging.warning("Fitness history empty, skipping saving/plotting.")
1155
+
1156
+ # En İyi Modeli Eğit/Değerlendir/Kaydet
1157
+ final_model_path = None; training_summary = {}; final_metrics = {"test_mse": np.inf, "avg_kendall_tau": 0.0}; best_model_architecture = {}
1158
+ if best_model_evolved is None:
1159
+ logging.error("Evolution did not yield a best model. Skipping final training and evaluation.")
1160
+ else:
1161
+ best_model_architecture = best_model_evolved.get_architecture()
1162
+ logging.info(f"Best evolved model architecture: {best_model_architecture}")
1163
+ try:
1164
+ num_params = best_model_evolved.get_num_params(); logging.info(f"Best Evolved Model ({best_model_evolved.model_name}) - Params: {num_params}")
1165
+ if wandb_run: wandb_run.summary["best_evolved_params"] = num_params # W&B özete ekle
1166
+ except Exception as e: logging.warning(f"Could not log model summary details: {e}")
1167
+
1168
+ # Son Eğitim
1169
+ try:
1170
+ model_to_train = clone_pytorch_model(best_model_evolved, device)
1171
+ final_model, training_summary = train_final_model_pytorch(
1172
+ model_to_train, X_train_np, y_train_np,
1173
+ args.epochs_final_train, args.batch_size, args.learning_rate,
1174
+ device, output_dir, wandb_run
1175
+ )
1176
+ except Exception as e: logging.error(f"Error during final training: {e}", exc_info=True); final_model = None; training_summary = {"error": str(e)}
1177
+
1178
+ # Değerlendirme ve Kaydetme
1179
+ if final_model:
1180
+ final_metrics = evaluate_model_pytorch(final_model, X_test_np, y_test_np, args.batch_size, device)
1181
+ if wandb_run: wandb_run.summary.update(final_metrics) # W&B özete ekle
1182
+
1183
+ final_model_path = os.path.join(output_dir, "best_evolved_model_trained_pytorch_v5.pt")
1184
+ try:
1185
+ torch.save({'architecture': final_model.get_architecture(), 'model_state_dict': final_model.state_dict(),
1186
+ 'training_summary': training_summary, 'evaluation_metrics': final_metrics}, final_model_path)
1187
+ logging.info(f"Final trained model state and architecture saved to {final_model_path}")
1188
+ # W&B'ye artifact olarak kaydet (opsiyonel)
1189
+ if wandb_run:
1190
+ try:
1191
+ artifact = wandb.Artifact(f'final_model_{run_name}', type='model')
1192
+ artifact.add_file(final_model_path)
1193
+ wandb_run.log_artifact(artifact)
1194
+ logging.info(f"Saved final model as W&B artifact.")
1195
+ except Exception as e: logging.warning(f"Failed to save model as W&B artifact: {e}")
1196
+ except Exception as e: logging.error(f"Failed to save final trained model: {e}", exc_info=True); final_model_path = None
1197
+ else: logging.error("Final model training failed. Skipping evaluation and saving.")
1198
+
1199
+ # Sonuçları Kaydet
1200
+ logging.info("--- Saving Final Results (v5) ---")
1201
+ final_results = {
1202
+ "run_info": {"run_name": run_name, "timestamp": timestamp, "output_directory": output_dir, "framework": "PyTorch",
1203
+ "version": "v5", "device_used": str(device), "resumed_run": resume_run, "last_checkpoint": latest_checkpoint_path,
1204
+ "wandb_url": wandb_run.get_url() if wandb_run else None},
1205
+ "config": args_dict,
1206
+ "evolution_summary": {
1207
+ "start_generation": start_generation, "end_generation": start_generation + len(best_fitness_hist),
1208
+ "generations_run_this_session": len(best_fitness_hist) - len(loaded_history_best),
1209
+ "best_fitness_overall": max(best_fitness_hist) if best_fitness_hist and any(np.isfinite(f) for f in best_fitness_hist) else None,
1210
+ "best_fitness_final_gen": best_fitness_hist[-1] if best_fitness_hist and np.isfinite(best_fitness_hist[-1]) else None,
1211
+ "avg_fitness_final_gen": avg_fitness_hist[-1] if avg_fitness_hist and np.isfinite(avg_fitness_hist[-1]) else None,
1212
+ "best_model_architecture": best_model_architecture,
1213
+ "best_model_params": best_model_evolved.get_num_params() if best_model_evolved else None
1214
+ },
1215
+ "final_training_summary": training_summary,
1216
+ "final_evaluation_on_test": final_metrics,
1217
+ "saved_trained_model_path": final_model_path
1218
+ }
1219
+ results_path = os.path.join(output_dir, "final_results_pytorch_v5.json")
1220
+ try:
1221
+ def convert_types(obj): # JSON için tür dönüştürücü
1222
+ if isinstance(obj, (np.integer, np.int_)): return int(obj)
1223
+ elif isinstance(obj, (np.floating, np.float_)): return float(obj)
1224
+ elif isinstance(obj, np.ndarray): return obj.tolist()
1225
+ elif isinstance(obj, torch.Tensor): return obj.tolist()
1226
+ elif isinstance(obj, torch.device): return str(obj)
1227
+ elif isinstance(obj, type): return obj.__name__
1228
+ elif isinstance(obj, argparse.Namespace): return vars(obj) # Argümanları dict yap
1229
+ return obj
1230
+ with open(results_path, 'w') as f: json.dump(final_results, f, indent=4, default=convert_types, sort_keys=True)
1231
+ logging.info(f"Final results summary saved to {results_path}")
1232
+ except Exception as e: logging.error(f"Failed to save final results JSON: {e}", exc_info=True)
1233
+
1234
+ except (Exception, KeyboardInterrupt) as e:
1235
+ # Hata veya kesinti durumunda loglama ve W&B bitirme
1236
+ if isinstance(e, KeyboardInterrupt):
1237
+ logging.warning("KeyboardInterrupt detected. Exiting.")
1238
+ else:
1239
+ logging.critical("Unhandled exception in pipeline:", exc_info=True)
1240
+ # W&B run'ı "crashed" veya "failed" olarak işaretle
1241
+ if wandb_run:
1242
+ exit_code = 1 if not isinstance(e, KeyboardInterrupt) else 130
1243
+ try:
1244
+ wandb.finish(exit_code=exit_code, quiet=True)
1245
+ logging.info(f"W&B run marked as {'failed' if exit_code==1 else 'killed'}.")
1246
+ except Exception as wb_e:
1247
+ logging.error(f"Error finishing W&B run: {wb_e}")
1248
+ # Hatayı tekrar fırlat veya çık
1249
+ if isinstance(e, KeyboardInterrupt): sys.exit(130)
1250
+ else: sys.exit(1)
1251
+
1252
+ finally:
1253
+ # W&B run'ı normal şekilde bitir (eğer hata olmadıysa)
1254
+ if wandb_run and not sys.exc_info()[0]: # Sadece hata yoksa bitir
1255
+ try:
1256
+ wandb.finish()
1257
+ logging.info("W&B run finished successfully.")
1258
+ except Exception as e:
1259
+ logging.error(f"Error finishing W&B run: {e}")
1260
+
1261
+ logging.info(f"========== PyTorch v5 Pipeline Run {run_name} Finished ==========")
1262
+
1263
+
1264
+ # --- Argüman Ayrıştırıcı (v5) ---
1265
# --- Argument parser (v5) ---
def parse_arguments_v5() -> argparse.Namespace:
    """Build and parse the command-line interface for the EvoNet v5 pipeline.

    Returns:
        argparse.Namespace holding every run setting. ``seed`` is filled in
        with a freshly generated random value when the user omits it, and a
        negative ``num_workers`` is clamped to 0 (serial evaluation).
    """
    p = argparse.ArgumentParser(
        description="EvoNet v5: Adaptive & Parallel Neuroevolution with PyTorch"
    )

    # --- Directories and run control ---
    p.add_argument('--output_base_dir', type=str, default=DEFAULT_OUTPUT_BASE_DIR)
    p.add_argument('--resume_from', type=str, default=None,
                   help='Path to previous run dir to resume.')
    p.add_argument('--checkpoint_interval', type=int, default=DEFAULT_CHECKPOINT_INTERVAL,
                   help='Checkpoint frequency (gens). 0=disable.')
    p.add_argument('--device', type=str, default=DEFAULT_DEVICE,
                   choices=['auto', 'cpu', 'cuda'])
    p.add_argument('--seed', type=int, default=None,
                   help='Random seed (default: random).')

    # --- Data generation ---
    p.add_argument('--seq_length', type=int, default=DEFAULT_SEQ_LENGTH)
    p.add_argument('--train_samples', type=int, default=5000)
    p.add_argument('--test_samples', type=int, default=1000)

    # --- Evolutionary algorithm hyperparameters ---
    evo = p.add_argument_group('Evolution Parameters')
    evo.add_argument('--pop_size', type=int, default=DEFAULT_POP_SIZE)
    evo.add_argument('--generations', type=int, default=DEFAULT_GENERATIONS)
    evo.add_argument('--crossover_rate', type=float, default=DEFAULT_CROSSOVER_RATE)
    evo.add_argument('--mutation_rate', type=float, default=DEFAULT_MUTATION_RATE,
                     help='Prob. of mutation if crossover is not applied.')
    evo.add_argument('--weight_mut_rate', type=float, default=DEFAULT_WEIGHT_MUT_RATE,
                     help='Prob. for each weight to mutate if mutation occurs.')
    evo.add_argument('--tournament_size', type=int, default=DEFAULT_TOURNAMENT_SIZE)
    evo.add_argument('--elitism_count', type=int, default=DEFAULT_ELITISM_COUNT)
    evo.add_argument('--complexity_penalty', type=float, default=DEFAULT_COMPLEXITY_PENALTY,
                     help='Penalty weight per parameter in fitness.')

    # --- Adaptive mutation schedule ---
    adapt = p.add_argument_group('Adaptive Mutation')
    adapt.add_argument('--adapt_mutation', action=argparse.BooleanOptionalAction,
                       default=DEFAULT_ADAPT_MUTATION,
                       help='Enable adaptive mutation strength.')
    adapt.add_argument('--mutation_strength', type=float, default=DEFAULT_MUTATION_STRENGTH,
                       help='Initial mutation strength (std dev).')
    adapt.add_argument('--stagnation_limit', type=int, default=DEFAULT_STAGNATION_LIMIT,
                       help='Generations without improvement to trigger adaptation.')
    adapt.add_argument('--mut_strength_decay', type=float, default=DEFAULT_MUT_STRENGTH_DECAY,
                       help='Factor to decrease strength on improvement.')
    adapt.add_argument('--mut_strength_increase', type=float, default=DEFAULT_MUT_STRENGTH_INCREASE,
                       help='Factor to increase strength on stagnation.')
    adapt.add_argument('--min_mut_strength', type=float, default=DEFAULT_MIN_MUT_STRENGTH)
    adapt.add_argument('--max_mut_strength', type=float, default=DEFAULT_MAX_MUT_STRENGTH)

    # --- Parallel fitness evaluation ---
    par = p.add_argument_group('Parallelism')
    par.add_argument('--num_workers', type=int, default=DEFAULT_NUM_WORKERS,
                     help='Number of CPU workers for parallel fitness evaluation (0=disable/serial).')

    # --- Final training & evaluation ---
    train = p.add_argument_group('Final Training & Evaluation')
    train.add_argument('--batch_size', type=int, default=DEFAULT_BATCH_SIZE)
    train.add_argument('--epochs_final_train', type=int, default=DEFAULT_EPOCHS_FINAL_TRAIN)
    train.add_argument('--learning_rate', type=float, default=0.001,
                       help='LR for final training.')

    # --- Experiment tracking (Weights & Biases) ---
    wb = p.add_argument_group('Experiment Tracking (Weights & Biases)')
    wb.add_argument('--use_wandb', action=argparse.BooleanOptionalAction, default=False,
                    help='Enable W&B logging.')
    wb.add_argument('--wandb_project', type=str, default="EvoNet-v5",
                    help='W&B project name.')
    # Usually a username or a team name.
    wb.add_argument('--wandb_entity', type=str, default=None,
                    help='W&B entity (username or team). Uses default if None.')

    args = p.parse_args()

    # Fill in a random seed when the user did not provide one, so the run
    # is still reproducible from the logged value.
    if args.seed is None:
        args.seed = random.randint(0, 2**32 - 1)
        print(f"Generated random seed: {args.seed}")

    # Negative worker counts are meaningless; fall back to serial evaluation.
    if args.num_workers < 0:
        print(f"Warning: num_workers ({args.num_workers}) cannot be negative. Setting to 0.")
        args.num_workers = 0
    # Other v4 sanity checks (elitism, tournament size) could also go here.

    return args
1323
+
1324
# --- Main entry point ---
if __name__ == "__main__":
    # Important: for concurrent.futures (especially ProcessPoolExecutor) and
    # multiprocessing to work correctly, the driving code generally has to
    # live under the `if __name__ == "__main__":` guard.
    run_pipeline_pytorch_v5(parse_arguments_v5())