Hokeno committed on
Commit
c3333b8
·
verified ·
1 Parent(s): 43fcec3

Upload main.ipynb

Browse files
Files changed (1) hide show
  1. main.ipynb +848 -0
main.ipynb ADDED
@@ -0,0 +1,848 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "2cec985b",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "Data split into 70 training lines and 18 validation lines from ai_ethics_data.txt.\n"
14
+ ]
15
+ },
16
+ {
17
+ "name": "stderr",
18
+ "output_type": "stream",
19
+ "text": [
20
+ "[I 2025-07-16 15:07:19,311] A new study created in memory with name: gpt2_finetuning_ai_ethics\n"
21
+ ]
22
+ },
23
+ {
24
+ "name": "stdout",
25
+ "output_type": "stream",
26
+ "text": [
27
+ "\n",
28
+ "--- Starting Optuna hyperparameter optimization ---\n",
29
+ "A new study created in memory with name: gpt2_finetuning_ai_ethics\n",
30
+ "A new study created in memory with name: gpt2_finetuning_ai_ethics\n",
31
+ "A new study created in memory with name: gpt2_finetuning_ai_ethics\n",
32
+ "A new study created in memory with name: gpt2_finetuning_ai_ethics\n",
33
+ "\n",
34
+ "--- Starting training for trial 0 ---\n"
35
+ ]
36
+ },
37
+ {
38
+ "data": {
39
+ "text/html": [
40
+ "\n",
41
+ " <div>\n",
42
+ " \n",
43
+ " <progress value='42' max='42' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
44
+ " [42/42 01:01, Epoch 6/6]\n",
45
+ " </div>\n",
46
+ " <table border=\"1\" class=\"dataframe\">\n",
47
+ " <thead>\n",
48
+ " <tr style=\"text-align: left;\">\n",
49
+ " <th>Epoch</th>\n",
50
+ " <th>Training Loss</th>\n",
51
+ " <th>Validation Loss</th>\n",
52
+ " </tr>\n",
53
+ " </thead>\n",
54
+ " <tbody>\n",
55
+ " <tr>\n",
56
+ " <td>1</td>\n",
57
+ " <td>No log</td>\n",
58
+ " <td>2.896929</td>\n",
59
+ " </tr>\n",
60
+ " <tr>\n",
61
+ " <td>2</td>\n",
62
+ " <td>No log</td>\n",
63
+ " <td>2.772568</td>\n",
64
+ " </tr>\n",
65
+ " <tr>\n",
66
+ " <td>3</td>\n",
67
+ " <td>No log</td>\n",
68
+ " <td>2.693300</td>\n",
69
+ " </tr>\n",
70
+ " <tr>\n",
71
+ " <td>4</td>\n",
72
+ " <td>No log</td>\n",
73
+ " <td>2.668611</td>\n",
74
+ " </tr>\n",
75
+ " <tr>\n",
76
+ " <td>5</td>\n",
77
+ " <td>No log</td>\n",
78
+ " <td>2.660016</td>\n",
79
+ " </tr>\n",
80
+ " <tr>\n",
81
+ " <td>6</td>\n",
82
+ " <td>No log</td>\n",
83
+ " <td>2.654525</td>\n",
84
+ " </tr>\n",
85
+ " </tbody>\n",
86
+ "</table><p>"
87
+ ],
88
+ "text/plain": [
89
+ "<IPython.core.display.HTML object>"
90
+ ]
91
+ },
92
+ "metadata": {},
93
+ "output_type": "display_data"
94
+ },
95
+ {
96
+ "name": "stderr",
97
+ "output_type": "stream",
98
+ "text": [
99
+ "There were missing keys in the checkpoint model loaded: ['lm_head.weight'].\n"
100
+ ]
101
+ },
102
+ {
103
+ "data": {
104
+ "text/html": [
105
+ "\n",
106
+ " <div>\n",
107
+ " \n",
108
+ " <progress value='1' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
109
+ " [1/1 : < :]\n",
110
+ " </div>\n",
111
+ " "
112
+ ],
113
+ "text/plain": [
114
+ "<IPython.core.display.HTML object>"
115
+ ]
116
+ },
117
+ "metadata": {},
118
+ "output_type": "display_data"
119
+ },
120
+ {
121
+ "name": "stderr",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "[I 2025-07-16 15:08:28,746] Trial 0 finished with value: 14.218224606871981 and parameters: {'learning_rate': 4.257432830042159e-05, 'num_train_epochs': 6, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 1, 'weight_decay': 0.006021467020511296}. Best is trial 0 with value: 14.218224606871981.\n"
125
+ ]
126
+ },
127
+ {
128
+ "name": "stdout",
129
+ "output_type": "stream",
130
+ "text": [
131
+ "Trial 0 finished. Perplexity: 14.22\n",
132
+ "Trial 0 finished with value: 14.218224606871981 and parameters: {'learning_rate': 4.257432830042159e-05, 'num_train_epochs': 6, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 1, 'weight_decay': 0.006021467020511296}. Best is trial 0 with value: 14.218224606871981.\n",
133
+ "Trial 0 finished with value: 14.218224606871981 and parameters: {'learning_rate': 4.257432830042159e-05, 'num_train_epochs': 6, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 1, 'weight_decay': 0.006021467020511296}. Best is trial 0 with value: 14.218224606871981.\n",
134
+ "Trial 0 finished with value: 14.218224606871981 and parameters: {'learning_rate': 4.257432830042159e-05, 'num_train_epochs': 6, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 1, 'weight_decay': 0.006021467020511296}. Best is trial 0 with value: 14.218224606871981.\n",
135
+ "Trial 0 finished with value: 14.218224606871981 and parameters: {'learning_rate': 4.257432830042159e-05, 'num_train_epochs': 6, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 1, 'weight_decay': 0.006021467020511296}. Best is trial 0 with value: 14.218224606871981.\n",
136
+ "\n",
137
+ "--- Starting training for trial 1 ---\n"
138
+ ]
139
+ },
140
+ {
141
+ "data": {
142
+ "text/html": [
143
+ "\n",
144
+ " <div>\n",
145
+ " \n",
146
+ " <progress value='6' max='6' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
147
+ " [6/6 00:31, Epoch 3/3]\n",
148
+ " </div>\n",
149
+ " <table border=\"1\" class=\"dataframe\">\n",
150
+ " <thead>\n",
151
+ " <tr style=\"text-align: left;\">\n",
152
+ " <th>Epoch</th>\n",
153
+ " <th>Training Loss</th>\n",
154
+ " <th>Validation Loss</th>\n",
155
+ " </tr>\n",
156
+ " </thead>\n",
157
+ " <tbody>\n",
158
+ " <tr>\n",
159
+ " <td>1</td>\n",
160
+ " <td>No log</td>\n",
161
+ " <td>3.138857</td>\n",
162
+ " </tr>\n",
163
+ " <tr>\n",
164
+ " <td>2</td>\n",
165
+ " <td>No log</td>\n",
166
+ " <td>3.090950</td>\n",
167
+ " </tr>\n",
168
+ " <tr>\n",
169
+ " <td>3</td>\n",
170
+ " <td>No log</td>\n",
171
+ " <td>3.071213</td>\n",
172
+ " </tr>\n",
173
+ " </tbody>\n",
174
+ "</table><p>"
175
+ ],
176
+ "text/plain": [
177
+ "<IPython.core.display.HTML object>"
178
+ ]
179
+ },
180
+ "metadata": {},
181
+ "output_type": "display_data"
182
+ },
183
+ {
184
+ "name": "stderr",
185
+ "output_type": "stream",
186
+ "text": [
187
+ "There were missing keys in the checkpoint model loaded: ['lm_head.weight'].\n"
188
+ ]
189
+ },
190
+ {
191
+ "data": {
192
+ "text/html": [
193
+ "\n",
194
+ " <div>\n",
195
+ " \n",
196
+ " <progress value='1' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
197
+ " [1/1 : < :]\n",
198
+ " </div>\n",
199
+ " "
200
+ ],
201
+ "text/plain": [
202
+ "<IPython.core.display.HTML object>"
203
+ ]
204
+ },
205
+ "metadata": {},
206
+ "output_type": "display_data"
207
+ },
208
+ {
209
+ "name": "stderr",
210
+ "output_type": "stream",
211
+ "text": [
212
+ "[I 2025-07-16 15:09:11,356] Trial 1 finished with value: 21.568049007800685 and parameters: {'learning_rate': 1.984906349341712e-05, 'num_train_epochs': 3, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 4, 'weight_decay': 0.08258462935496225}. Best is trial 0 with value: 14.218224606871981.\n"
213
+ ]
214
+ },
215
+ {
216
+ "name": "stdout",
217
+ "output_type": "stream",
218
+ "text": [
219
+ "Trial 1 finished. Perplexity: 21.57\n",
220
+ "Trial 1 finished with value: 21.568049007800685 and parameters: {'learning_rate': 1.984906349341712e-05, 'num_train_epochs': 3, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 4, 'weight_decay': 0.08258462935496225}. Best is trial 0 with value: 14.218224606871981.\n",
221
+ "Trial 1 finished with value: 21.568049007800685 and parameters: {'learning_rate': 1.984906349341712e-05, 'num_train_epochs': 3, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 4, 'weight_decay': 0.08258462935496225}. Best is trial 0 with value: 14.218224606871981.\n",
222
+ "Trial 1 finished with value: 21.568049007800685 and parameters: {'learning_rate': 1.984906349341712e-05, 'num_train_epochs': 3, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 4, 'weight_decay': 0.08258462935496225}. Best is trial 0 with value: 14.218224606871981.\n",
223
+ "Trial 1 finished with value: 21.568049007800685 and parameters: {'learning_rate': 1.984906349341712e-05, 'num_train_epochs': 3, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 4, 'weight_decay': 0.08258462935496225}. Best is trial 0 with value: 14.218224606871981.\n",
224
+ "\n",
225
+ "--- Starting training for trial 2 ---\n"
226
+ ]
227
+ },
228
+ {
229
+ "data": {
230
+ "text/html": [
231
+ "\n",
232
+ " <div>\n",
233
+ " \n",
234
+ " <progress value='65' max='65' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
235
+ " [65/65 01:33, Epoch 5/5]\n",
236
+ " </div>\n",
237
+ " <table border=\"1\" class=\"dataframe\">\n",
238
+ " <thead>\n",
239
+ " <tr style=\"text-align: left;\">\n",
240
+ " <th>Epoch</th>\n",
241
+ " <th>Training Loss</th>\n",
242
+ " <th>Validation Loss</th>\n",
243
+ " </tr>\n",
244
+ " </thead>\n",
245
+ " <tbody>\n",
246
+ " <tr>\n",
247
+ " <td>1</td>\n",
248
+ " <td>No log</td>\n",
249
+ " <td>2.830866</td>\n",
250
+ " </tr>\n",
251
+ " <tr>\n",
252
+ " <td>2</td>\n",
253
+ " <td>No log</td>\n",
254
+ " <td>2.704903</td>\n",
255
+ " </tr>\n",
256
+ " <tr>\n",
257
+ " <td>3</td>\n",
258
+ " <td>No log</td>\n",
259
+ " <td>2.650187</td>\n",
260
+ " </tr>\n",
261
+ " <tr>\n",
262
+ " <td>4</td>\n",
263
+ " <td>2.572700</td>\n",
264
+ " <td>2.637062</td>\n",
265
+ " </tr>\n",
266
+ " <tr>\n",
267
+ " <td>5</td>\n",
268
+ " <td>2.572700</td>\n",
269
+ " <td>2.630425</td>\n",
270
+ " </tr>\n",
271
+ " </tbody>\n",
272
+ "</table><p>"
273
+ ],
274
+ "text/plain": [
275
+ "<IPython.core.display.HTML object>"
276
+ ]
277
+ },
278
+ "metadata": {},
279
+ "output_type": "display_data"
280
+ },
281
+ {
282
+ "name": "stderr",
283
+ "output_type": "stream",
284
+ "text": [
285
+ "There were missing keys in the checkpoint model loaded: ['lm_head.weight'].\n"
286
+ ]
287
+ },
288
+ {
289
+ "data": {
290
+ "text/html": [
291
+ "\n",
292
+ " <div>\n",
293
+ " \n",
294
+ " <progress value='1' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
295
+ " [1/1 : < :]\n",
296
+ " </div>\n",
297
+ " "
298
+ ],
299
+ "text/plain": [
300
+ "<IPython.core.display.HTML object>"
301
+ ]
302
+ },
303
+ "metadata": {},
304
+ "output_type": "display_data"
305
+ },
306
+ {
307
+ "name": "stderr",
308
+ "output_type": "stream",
309
+ "text": [
310
+ "[I 2025-07-16 15:10:53,203] Trial 2 finished with value: 13.87967048839188 and parameters: {'learning_rate': 3.934727515529697e-05, 'num_train_epochs': 5, 'per_device_train_batch_batch_size': 2, 'gradient_accumulation_steps': 1, 'weight_decay': 0.013016196478728849}. Best is trial 2 with value: 13.87967048839188.\n"
311
+ ]
312
+ },
313
+ {
314
+ "name": "stdout",
315
+ "output_type": "stream",
316
+ "text": [
317
+ "Trial 2 finished. Perplexity: 13.88\n",
318
+ "Trial 2 finished with value: 13.87967048839188 and parameters: {'learning_rate': 3.934727515529697e-05, 'num_train_epochs': 5, 'per_device_train_batch_batch_size': 2, 'gradient_accumulation_steps': 1, 'weight_decay': 0.013016196478728849}. Best is trial 2 with value: 13.87967048839188.\n",
319
+ "Trial 2 finished with value: 13.87967048839188 and parameters: {'learning_rate': 3.934727515529697e-05, 'num_train_epochs': 5, 'per_device_train_batch_batch_size': 2, 'gradient_accumulation_steps': 1, 'weight_decay': 0.013016196478728849}. Best is trial 2 with value: 13.87967048839188.\n",
320
+ "Trial 2 finished with value: 13.87967048839188 and parameters: {'learning_rate': 3.934727515529697e-05, 'num_train_epochs': 5, 'per_device_train_batch_batch_size': 2, 'gradient_accumulation_steps': 1, 'weight_decay': 0.013016196478728849}. Best is trial 2 with value: 13.87967048839188.\n",
321
+ "Trial 2 finished with value: 13.87967048839188 and parameters: {'learning_rate': 3.934727515529697e-05, 'num_train_epochs': 5, 'per_device_train_batch_batch_size': 2, 'gradient_accumulation_steps': 1, 'weight_decay': 0.013016196478728849}. Best is trial 2 with value: 13.87967048839188.\n",
322
+ "\n",
323
+ "--- Starting training for trial 3 ---\n"
324
+ ]
325
+ },
326
+ {
327
+ "data": {
328
+ "text/html": [
329
+ "\n",
330
+ " <div>\n",
331
+ " \n",
332
+ " <progress value='130' max='130' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
333
+ " [130/130 02:31, Epoch 10/10]\n",
334
+ " </div>\n",
335
+ " <table border=\"1\" class=\"dataframe\">\n",
336
+ " <thead>\n",
337
+ " <tr style=\"text-align: left;\">\n",
338
+ " <th>Epoch</th>\n",
339
+ " <th>Training Loss</th>\n",
340
+ " <th>Validation Loss</th>\n",
341
+ " </tr>\n",
342
+ " </thead>\n",
343
+ " <tbody>\n",
344
+ " <tr>\n",
345
+ " <td>1</td>\n",
346
+ " <td>No log</td>\n",
347
+ " <td>3.009625</td>\n",
348
+ " </tr>\n",
349
+ " <tr>\n",
350
+ " <td>2</td>\n",
351
+ " <td>No log</td>\n",
352
+ " <td>2.892823</td>\n",
353
+ " </tr>\n",
354
+ " <tr>\n",
355
+ " <td>3</td>\n",
356
+ " <td>No log</td>\n",
357
+ " <td>2.799477</td>\n",
358
+ " </tr>\n",
359
+ " <tr>\n",
360
+ " <td>4</td>\n",
361
+ " <td>2.906200</td>\n",
362
+ " <td>2.756954</td>\n",
363
+ " </tr>\n",
364
+ " <tr>\n",
365
+ " <td>5</td>\n",
366
+ " <td>2.906200</td>\n",
367
+ " <td>2.726986</td>\n",
368
+ " </tr>\n",
369
+ " <tr>\n",
370
+ " <td>6</td>\n",
371
+ " <td>2.906200</td>\n",
372
+ " <td>2.708734</td>\n",
373
+ " </tr>\n",
374
+ " <tr>\n",
375
+ " <td>7</td>\n",
376
+ " <td>2.906200</td>\n",
377
+ " <td>2.693299</td>\n",
378
+ " </tr>\n",
379
+ " <tr>\n",
380
+ " <td>8</td>\n",
381
+ " <td>2.366800</td>\n",
382
+ " <td>2.687046</td>\n",
383
+ " </tr>\n",
384
+ " <tr>\n",
385
+ " <td>9</td>\n",
386
+ " <td>2.366800</td>\n",
387
+ " <td>2.682608</td>\n",
388
+ " </tr>\n",
389
+ " <tr>\n",
390
+ " <td>10</td>\n",
391
+ " <td>2.366800</td>\n",
392
+ " <td>2.680843</td>\n",
393
+ " </tr>\n",
394
+ " </tbody>\n",
395
+ "</table><p>"
396
+ ],
397
+ "text/plain": [
398
+ "<IPython.core.display.HTML object>"
399
+ ]
400
+ },
401
+ "metadata": {},
402
+ "output_type": "display_data"
403
+ },
404
+ {
405
+ "name": "stderr",
406
+ "output_type": "stream",
407
+ "text": [
408
+ "There were missing keys in the checkpoint model loaded: ['lm_head.weight'].\n"
409
+ ]
410
+ },
411
+ {
412
+ "data": {
413
+ "text/html": [
414
+ "\n",
415
+ " <div>\n",
416
+ " \n",
417
+ " <progress value='1' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
418
+ " [1/1 : < :]\n",
419
+ " </div>\n",
420
+ " "
421
+ ],
422
+ "text/plain": [
423
+ "<IPython.core.display.HTML object>"
424
+ ]
425
+ },
426
+ "metadata": {},
427
+ "output_type": "display_data"
428
+ },
429
+ {
430
+ "name": "stderr",
431
+ "output_type": "stream",
432
+ "text": [
433
+ "[I 2025-07-16 15:13:34,833] Trial 3 finished with value: 14.597388429401072 and parameters: {'learning_rate': 1.4797090243167628e-05, 'num_train_epochs': 10, 'per_device_train_batch_batch_size': 2, 'gradient_accumulation_steps': 1, 'weight_decay': 0.042400374321387935}. Best is trial 2 with value: 13.87967048839188.\n"
434
+ ]
435
+ },
436
+ {
437
+ "name": "stdout",
438
+ "output_type": "stream",
439
+ "text": [
440
+ "Trial 3 finished. Perplexity: 14.60\n",
441
+ "Trial 3 finished with value: 14.597388429401072 and parameters: {'learning_rate': 1.4797090243167628e-05, 'num_train_epochs': 10, 'per_device_train_batch_batch_size': 2, 'gradient_accumulation_steps': 1, 'weight_decay': 0.042400374321387935}. Best is trial 2 with value: 13.87967048839188.\n",
442
+ "Trial 3 finished with value: 14.597388429401072 and parameters: {'learning_rate': 1.4797090243167628e-05, 'num_train_epochs': 10, 'per_device_train_batch_batch_size': 2, 'gradient_accumulation_steps': 1, 'weight_decay': 0.042400374321387935}. Best is trial 2 with value: 13.87967048839188.\n",
443
+ "Trial 3 finished with value: 14.597388429401072 and parameters: {'learning_rate': 1.4797090243167628e-05, 'num_train_epochs': 10, 'per_device_train_batch_batch_size': 2, 'gradient_accumulation_steps': 1, 'weight_decay': 0.042400374321387935}. Best is trial 2 with value: 13.87967048839188.\n",
444
+ "Trial 3 finished with value: 14.597388429401072 and parameters: {'learning_rate': 1.4797090243167628e-05, 'num_train_epochs': 10, 'per_device_train_batch_batch_size': 2, 'gradient_accumulation_steps': 1, 'weight_decay': 0.042400374321387935}. Best is trial 2 with value: 13.87967048839188.\n",
445
+ "\n",
446
+ "--- Starting training for trial 4 ---\n"
447
+ ]
448
+ },
449
+ {
450
+ "data": {
451
+ "text/html": [
452
+ "\n",
453
+ " <div>\n",
454
+ " \n",
455
+ " <progress value='16' max='16' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
456
+ " [16/16 00:39, Epoch 4/4]\n",
457
+ " </div>\n",
458
+ " <table border=\"1\" class=\"dataframe\">\n",
459
+ " <thead>\n",
460
+ " <tr style=\"text-align: left;\">\n",
461
+ " <th>Epoch</th>\n",
462
+ " <th>Training Loss</th>\n",
463
+ " <th>Validation Loss</th>\n",
464
+ " </tr>\n",
465
+ " </thead>\n",
466
+ " <tbody>\n",
467
+ " <tr>\n",
468
+ " <td>1</td>\n",
469
+ " <td>No log</td>\n",
470
+ " <td>3.007116</td>\n",
471
+ " </tr>\n",
472
+ " <tr>\n",
473
+ " <td>2</td>\n",
474
+ " <td>No log</td>\n",
475
+ " <td>2.916104</td>\n",
476
+ " </tr>\n",
477
+ " <tr>\n",
478
+ " <td>3</td>\n",
479
+ " <td>No log</td>\n",
480
+ " <td>2.852073</td>\n",
481
+ " </tr>\n",
482
+ " <tr>\n",
483
+ " <td>4</td>\n",
484
+ " <td>No log</td>\n",
485
+ " <td>2.832791</td>\n",
486
+ " </tr>\n",
487
+ " </tbody>\n",
488
+ "</table><p>"
489
+ ],
490
+ "text/plain": [
491
+ "<IPython.core.display.HTML object>"
492
+ ]
493
+ },
494
+ "metadata": {},
495
+ "output_type": "display_data"
496
+ },
497
+ {
498
+ "name": "stderr",
499
+ "output_type": "stream",
500
+ "text": [
501
+ "There were missing keys in the checkpoint model loaded: ['lm_head.weight'].\n"
502
+ ]
503
+ },
504
+ {
505
+ "data": {
506
+ "text/html": [
507
+ "\n",
508
+ " <div>\n",
509
+ " \n",
510
+ " <progress value='1' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
511
+ " [1/1 : < :]\n",
512
+ " </div>\n",
513
+ " "
514
+ ],
515
+ "text/plain": [
516
+ "<IPython.core.display.HTML object>"
517
+ ]
518
+ },
519
+ "metadata": {},
520
+ "output_type": "display_data"
521
+ },
522
+ {
523
+ "name": "stderr",
524
+ "output_type": "stream",
525
+ "text": [
526
+ "[I 2025-07-16 15:14:24,143] Trial 4 finished with value: 16.992815093757653 and parameters: {'learning_rate': 3.624712025325302e-05, 'num_train_epochs': 4, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 2, 'weight_decay': 0.021517733040790188}. Best is trial 2 with value: 13.87967048839188.\n"
527
+ ]
528
+ },
529
+ {
530
+ "name": "stdout",
531
+ "output_type": "stream",
532
+ "text": [
533
+ "Trial 4 finished. Perplexity: 16.99\n",
534
+ "Trial 4 finished with value: 16.992815093757653 and parameters: {'learning_rate': 3.624712025325302e-05, 'num_train_epochs': 4, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 2, 'weight_decay': 0.021517733040790188}. Best is trial 2 with value: 13.87967048839188.\n",
535
+ "Trial 4 finished with value: 16.992815093757653 and parameters: {'learning_rate': 3.624712025325302e-05, 'num_train_epochs': 4, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 2, 'weight_decay': 0.021517733040790188}. Best is trial 2 with value: 13.87967048839188.\n",
536
+ "Trial 4 finished with value: 16.992815093757653 and parameters: {'learning_rate': 3.624712025325302e-05, 'num_train_epochs': 4, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 2, 'weight_decay': 0.021517733040790188}. Best is trial 2 with value: 13.87967048839188.\n",
537
+ "Trial 4 finished with value: 16.992815093757653 and parameters: {'learning_rate': 3.624712025325302e-05, 'num_train_epochs': 4, 'per_device_train_batch_batch_size': 4, 'gradient_accumulation_steps': 2, 'weight_decay': 0.021517733040790188}. Best is trial 2 with value: 13.87967048839188.\n",
538
+ "\n",
539
+ "--- Hyperparameter optimization complete ---\n",
540
+ "Best trial:\n",
541
+ " Value (Perplexity): 13.88\n",
542
+ " Params: \n",
543
+ " learning_rate: 3.934727515529697e-05\n",
544
+ " num_train_epochs: 5\n",
545
+ " per_device_train_batch_batch_size: 2\n",
546
+ " gradient_accumulation_steps: 1\n",
547
+ " weight_decay: 0.013016196478728849\n",
548
+ "\n",
549
+ "--- Retraining final model with best hyperparameters ---\n"
550
+ ]
551
+ },
552
+ {
553
+ "name": "stderr",
554
+ "output_type": "stream",
555
+ "text": [
556
+ "d:\\Anaconda\\Lib\\site-packages\\transformers\\data\\datasets\\language_modeling.py:53: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n",
557
+ " warnings.warn(\n"
558
+ ]
559
+ },
560
+ {
561
+ "data": {
562
+ "text/html": [
563
+ "\n",
564
+ " <div>\n",
565
+ " \n",
566
+ " <progress value='80' max='80' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
567
+ " [80/80 01:34, Epoch 5/5]\n",
568
+ " </div>\n",
569
+ " <table border=\"1\" class=\"dataframe\">\n",
570
+ " <thead>\n",
571
+ " <tr style=\"text-align: left;\">\n",
572
+ " <th>Step</th>\n",
573
+ " <th>Training Loss</th>\n",
574
+ " </tr>\n",
575
+ " </thead>\n",
576
+ " <tbody>\n",
577
+ " </tbody>\n",
578
+ "</table><p>"
579
+ ],
580
+ "text/plain": [
581
+ "<IPython.core.display.HTML object>"
582
+ ]
583
+ },
584
+ "metadata": {},
585
+ "output_type": "display_data"
586
+ },
587
+ {
588
+ "name": "stdout",
589
+ "output_type": "stream",
590
+ "text": [
591
+ "Final model fine-tuning complete and saved.\n",
592
+ "\n",
593
+ "--- Generating text with the fine-tuned model ---\n"
594
+ ]
595
+ },
596
+ {
597
+ "name": "stderr",
598
+ "output_type": "stream",
599
+ "text": [
600
+ "The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n"
601
+ ]
602
+ },
603
+ {
604
+ "name": "stdout",
605
+ "output_type": "stream",
606
+ "text": [
607
+ "Prompt: The ethical implications of AI are\n",
608
+ "Generated Text:\n",
609
+ "The ethical implications of AI are complex, and the ethical consequences of autonomous systems are often complex.\n",
610
+ "Ethical Implications of Artificial Intelligence\n",
611
+ "AI ethics are important for several reasons:\n",
612
+ "1. It enables individuals to make informed decisions about their own well-being and their well being. This enables them to better understand and address societal challenges. 2. AI technologies can help individuals better manage their personal and social lives. 3. They can be used to address social and economic inequalities. 4. Ethical\n",
613
+ "\n",
614
+ "Cleaned up temporary data split files.\n"
615
+ ]
616
+ }
617
+ ],
618
+ "source": [
619
+ "import torch\n",
620
+ "from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments\n",
621
+ "import math\n",
622
+ "from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction\n",
623
+ "import os\n",
624
+ "import random\n",
625
+ "import optuna\n",
626
+ "import logging\n",
627
+ "import sys\n",
628
+ "\n",
629
+ "optuna.logging.get_logger(\"optuna\").addHandler(logging.StreamHandler(sys.stdout))\n",
630
+ "\n",
631
+ "file_path = \"ai_ethics_data.txt\"\n",
632
+ "\n",
633
+ "with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
634
+ " full_text_content = f.read()\n",
635
+ "\n",
636
+ "train_file_path = \"ai_ethics_train.txt\"\n",
637
+ "val_file_path = \"ai_ethics_val.txt\"\n",
638
+ "\n",
639
+ "lines = full_text_content.strip().split('\\n')\n",
640
+ "lines = [line for line in lines if line.strip()]\n",
641
+ "random.shuffle(lines)\n",
642
+ "\n",
643
+ "train_size = int(len(lines) * 0.8)\n",
644
+ "train_lines = lines[:train_size]\n",
645
+ "val_lines = lines[train_size:]\n",
646
+ "\n",
647
+ "with open(train_file_path, \"w\", encoding=\"utf-8\") as f:\n",
648
+ " f.write(\"\\n\".join(train_lines))\n",
649
+ "\n",
650
+ "with open(val_file_path, \"w\", encoding=\"utf-8\") as f:\n",
651
+ " f.write(\"\\n\".join(val_lines))\n",
652
+ "\n",
653
+ "print(f\"Data split into {len(train_lines)} training lines and {len(val_lines)} validation lines from {file_path}.\")\n",
654
+ "\n",
655
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
656
+ "if tokenizer.pad_token is None:\n",
657
+ " tokenizer.add_special_tokens({'pad_token': '[PAD]'})\n",
658
+ "\n",
659
+ "# 1. Fine-tuning the GPT-2 Model\n",
660
+ "\n",
661
+ "def model_init():\n",
662
+ " model = GPT2LMHeadModel.from_pretrained(\"gpt2\")\n",
663
+ " model.resize_token_embeddings(len(tokenizer))\n",
664
+ " model.config.pad_token_id = tokenizer.pad_token_id\n",
665
+ " return model\n",
666
+ "\n",
667
+ "train_dataset = TextDataset(\n",
668
+ " tokenizer=tokenizer,\n",
669
+ " file_path=train_file_path,\n",
670
+ " block_size=128 \n",
671
+ ")\n",
672
+ "val_dataset = TextDataset(\n",
673
+ " tokenizer=tokenizer,\n",
674
+ " file_path=val_file_path,\n",
675
+ " block_size=128\n",
676
+ ")\n",
677
+ "\n",
678
+ "# Data collator for language modeling\n",
679
+ "data_collator = DataCollatorForLanguageModeling(\n",
680
+ " tokenizer=tokenizer, mlm=False\n",
681
+ ")\n",
682
+ "\n",
683
+ "# 2. Hyperparameter Tuning with Optuna\n",
684
+ "\n",
685
+ "def objective(trial):\n",
686
+ " learning_rate = trial.suggest_float(\"learning_rate\", 1e-5, 5e-5, log=True)\n",
687
+ " num_train_epochs = trial.suggest_int(\"num_train_epochs\", 3, 10) # Reduced max epochs for faster tuning\n",
688
+ " per_device_train_batch_size = trial.suggest_categorical(\"per_device_train_batch_batch_size\", [2, 4])\n",
689
+ " gradient_accumulation_steps = trial.suggest_categorical(\"gradient_accumulation_steps\", [1, 2, 4])\n",
690
+ " weight_decay = trial.suggest_float(\"weight_decay\", 0.0, 0.1)\n",
691
+ "\n",
692
+ " output_dir = f\"./gpt2-finetuned-ai-ethics-trial-{trial.number}\"\n",
693
+ "\n",
694
+ " training_args = TrainingArguments(\n",
695
+ " output_dir=output_dir,\n",
696
+ " overwrite_output_dir=True,\n",
697
+ " num_train_epochs=num_train_epochs,\n",
698
+ " per_device_train_batch_size=per_device_train_batch_size,\n",
699
+ " gradient_accumulation_steps=gradient_accumulation_steps,\n",
700
+ " save_steps=500, \n",
701
+ " save_total_limit=1, \n",
702
+ " prediction_loss_only=True,\n",
703
+ " logging_dir='./logs',\n",
704
+ " logging_steps=50,\n",
705
+ " learning_rate=learning_rate,\n",
706
+ " weight_decay=weight_decay,\n",
707
+ "\n",
708
+ " eval_strategy=\"epoch\",\n",
709
+ " save_strategy=\"epoch\", \n",
710
+ " load_best_model_at_end=True, \n",
711
+ " metric_for_best_model=\"eval_loss\", \n",
712
+ " report_to=\"none\",\n",
713
+ " )\n",
714
+ "\n",
715
+ " trainer = Trainer(\n",
716
+ " model_init=model_init, \n",
717
+ " args=training_args,\n",
718
+ " data_collator=data_collator,\n",
719
+ " train_dataset=train_dataset,\n",
720
+ " eval_dataset=val_dataset, \n",
721
+ " )\n",
722
+ "\n",
723
+ " print(f\"\\n--- Starting training for trial {trial.number} ---\")\n",
724
+ " trainer.train()\n",
725
+ "\n",
726
+ " results = trainer.evaluate(eval_dataset=val_dataset)\n",
727
+ " eval_loss = results['eval_loss']\n",
728
+ " perplexity = math.exp(eval_loss) if eval_loss < float('inf') else float('inf')\n",
729
+ "\n",
730
+ " print(f\"Trial {trial.number} finished. Perplexity: {perplexity:.2f}\")\n",
731
+ " return perplexity\n",
732
+ "\n",
733
+ "print(\"\\n--- Starting Optuna hyperparameter optimization ---\")\n",
734
+ "study = optuna.create_study(direction=\"minimize\", study_name=\"gpt2_finetuning_ai_ethics\")\n",
735
+ "study.optimize(objective, n_trials=5)\n",
736
+ "\n",
737
+ "print(\"\\n--- Hyperparameter optimization complete ---\")\n",
738
+ "print(\"Best trial:\")\n",
739
+ "trial = study.best_trial\n",
740
+ "print(f\" Value (Perplexity): {trial.value:.2f}\")\n",
741
+ "print(\" Params: \")\n",
742
+ "for key, value in trial.params.items():\n",
743
+ " print(f\" {key}: {value}\")\n",
744
+ "\n",
745
+ "# 3. Retrain with Best Hyperparameters (Final Model)\n",
746
+ "\n",
747
+ "print(\"\\n--- Retraining final model with best hyperparameters ---\")\n",
748
+ "\n",
749
+ "best_params = study.best_trial.params\n",
750
+ "\n",
751
+ "final_model = model_init()\n",
752
+ "\n",
753
+ "final_train_dataset = TextDataset(\n",
754
+ " tokenizer=tokenizer,\n",
755
+ " file_path=file_path,\n",
756
+ " block_size=128\n",
757
+ ")\n",
758
+ "\n",
759
+ "final_training_args = TrainingArguments(\n",
760
+ " output_dir=\"./gpt2-finetuned-ai-ethics-final\",\n",
761
+ " overwrite_output_dir=True,\n",
762
+ " num_train_epochs=best_params[\"num_train_epochs\"],\n",
763
+ " per_device_train_batch_size=best_params[\"per_device_train_batch_batch_size\"],\n",
764
+ " gradient_accumulation_steps=best_params[\"gradient_accumulation_steps\"],\n",
765
+ " save_steps=500, \n",
766
+ " save_total_limit=2,\n",
767
+ " prediction_loss_only=True,\n",
768
+ " logging_dir='./logs_final',\n",
769
+ " logging_steps=100,\n",
770
+ " learning_rate=best_params[\"learning_rate\"],\n",
771
+ " weight_decay=best_params[\"weight_decay\"],\n",
772
+ " eval_strategy=\"no\", \n",
773
+ " save_strategy=\"epoch\", \n",
774
+ " load_best_model_at_end=False, \n",
775
+ " report_to=\"none\",\n",
776
+ ")\n",
777
+ "\n",
778
+ "final_trainer = Trainer(\n",
779
+ " model=final_model,\n",
780
+ " args=final_training_args,\n",
781
+ " data_collator=data_collator,\n",
782
+ " train_dataset=final_train_dataset,\n",
783
+ ")\n",
784
+ "\n",
785
+ "final_trainer.train()\n",
786
+ "final_trainer.save_model(\"./gpt2-finetuned-ai-ethics-final\")\n",
787
+ "print(\"Final model fine-tuning complete and saved.\")\n",
788
+ "\n",
789
+ "# 4. Basic Text Generation Example (using the final model)\n",
790
+ "print(\"\\n--- Generating text with the fine-tuned model ---\")\n",
791
+ "\n",
792
+ "loaded_tokenizer = GPT2Tokenizer.from_pretrained(\"./gpt2-finetuned-ai-ethics-final\")\n",
793
+ "loaded_model = GPT2LMHeadModel.from_pretrained(\"./gpt2-finetuned-ai-ethics-final\")\n",
794
+ "\n",
795
+ "if loaded_tokenizer.pad_token is None:\n",
796
+ " loaded_tokenizer.add_special_tokens({'pad_token': '[PAD]'})\n",
797
+ " loaded_model.resize_token_embeddings(len(loaded_tokenizer))\n",
798
+ "loaded_model.config.pad_token_id = loaded_tokenizer.pad_token_id\n",
799
+ "\n",
800
+ "prompt = \"The ethical implications of AI are\"\n",
801
+ "input_ids = loaded_tokenizer.encode(prompt, return_tensors='pt')\n",
802
+ "\n",
803
+ "output = loaded_model.generate(\n",
804
+ " input_ids,\n",
805
+ " max_length=100,\n",
806
+ " num_return_sequences=1,\n",
807
+ " no_repeat_ngram_size=2,\n",
808
+ " top_k=50,\n",
809
+ " top_p=0.95,\n",
810
+ " temperature=0.7,\n",
811
+ " pad_token_id=loaded_tokenizer.pad_token_id \n",
812
+ ")\n",
813
+ "\n",
814
+ "generated_text = loaded_tokenizer.decode(output[0], skip_special_tokens=True)\n",
815
+ "print(f\"Prompt: {prompt}\")\n",
816
+ "print(f\"Generated Text:\\n{generated_text}\")\n",
817
+ "\n",
818
+ "try:\n",
819
+ " import nltk\n",
820
+ " nltk.data.find('tokenizers/punkt')\n",
821
+ "except nltk.downloader.DownloadError:\n",
822
+ " nltk.download('punkt')\n",
823
+ "\n",
824
+ "reference = [[\"The responsible development and deployment of AI are paramount for societal well-being.\"]]\n",
825
+ "candidate = generated_text.split()\n",
826
+ "score = sentence_bleu(reference, candidate, smoothing_function=SmoothingFunction().method1)\n",
827
+ "print(f\"\\nBLEU Score (example): {score:.2f}\")\n",
828
+ "\n",
829
+ "os.remove(train_file_path)\n",
830
+ "os.remove(val_file_path)\n",
831
+ "print(\"\\nCleaned up temporary data split files.\")"
832
+ ]
833
+ }
834
+ ],
835
+ "metadata": {
836
+ "kernelspec": {
837
+ "display_name": "base",
838
+ "language": "python",
839
+ "name": "python3"
840
+ },
841
+ "language_info": {
842
+ "name": "python",
843
+ "version": "3.12.7"
844
+ }
845
+ },
846
+ "nbformat": 4,
847
+ "nbformat_minor": 5
848
+ }