Spaces:
Sleeping
Sleeping
Commit ·
2e4aa16
1
Parent(s): 76a2962
update generato
Browse files- .gitignore +1 -0
- best_model.pth +2 -2
- generator.ipynb +73 -119
- model/__pycache__/attn.cpython-312.pyc +0 -0
- model/__pycache__/decoder.cpython-312.pyc +0 -0
- model/__pycache__/encoder.cpython-312.pyc +0 -0
- model/__pycache__/generator.cpython-312.pyc +0 -0
- model/decoder.py +4 -1
- model/encoder.py +3 -3
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
best_model.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f960ed296d73469239b693c1d37c1cef55e92cf3d7a914c75b8c7b9efb9ff701
|
| 3 |
+
size 162076771
|
generator.ipynb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
-
"execution_count":
|
| 6 |
"id": "bae751d8",
|
| 7 |
"metadata": {},
|
| 8 |
"outputs": [],
|
|
@@ -26,7 +26,7 @@
|
|
| 26 |
},
|
| 27 |
{
|
| 28 |
"cell_type": "code",
|
| 29 |
-
"execution_count":
|
| 30 |
"id": "c0e30f61",
|
| 31 |
"metadata": {},
|
| 32 |
"outputs": [
|
|
@@ -42,10 +42,10 @@
|
|
| 42 |
{
|
| 43 |
"data": {
|
| 44 |
"text/plain": [
|
| 45 |
-
"<
|
| 46 |
]
|
| 47 |
},
|
| 48 |
-
"execution_count":
|
| 49 |
"metadata": {},
|
| 50 |
"output_type": "execute_result"
|
| 51 |
}
|
|
@@ -57,7 +57,7 @@
|
|
| 57 |
},
|
| 58 |
{
|
| 59 |
"cell_type": "code",
|
| 60 |
-
"execution_count":
|
| 61 |
"id": "db130c45",
|
| 62 |
"metadata": {},
|
| 63 |
"outputs": [
|
|
@@ -134,7 +134,7 @@
|
|
| 134 |
},
|
| 135 |
{
|
| 136 |
"cell_type": "code",
|
| 137 |
-
"execution_count":
|
| 138 |
"id": "d5747915",
|
| 139 |
"metadata": {},
|
| 140 |
"outputs": [
|
|
@@ -173,7 +173,7 @@
|
|
| 173 |
},
|
| 174 |
{
|
| 175 |
"cell_type": "code",
|
| 176 |
-
"execution_count":
|
| 177 |
"id": "5252d457",
|
| 178 |
"metadata": {},
|
| 179 |
"outputs": [
|
|
@@ -190,13 +190,7 @@
|
|
| 190 |
" input shape: torch.Size([32, 500])\n",
|
| 191 |
" output shape: torch.Size([32, 100])\n",
|
| 192 |
" input_len shape: torch.Size([32])\n",
|
| 193 |
-
" First sample input_len:
|
| 194 |
-
"\n",
|
| 195 |
-
"Sample batch structure:\n",
|
| 196 |
-
" input shape: torch.Size([32, 500])\n",
|
| 197 |
-
" output shape: torch.Size([32, 100])\n",
|
| 198 |
-
" input_len shape: torch.Size([32])\n",
|
| 199 |
-
" First sample input_len: 16\n"
|
| 200 |
]
|
| 201 |
}
|
| 202 |
],
|
|
@@ -229,7 +223,7 @@
|
|
| 229 |
},
|
| 230 |
{
|
| 231 |
"cell_type": "code",
|
| 232 |
-
"execution_count":
|
| 233 |
"id": "11631bed",
|
| 234 |
"metadata": {},
|
| 235 |
"outputs": [
|
|
@@ -243,6 +237,7 @@
|
|
| 243 |
" (embedding): Embedding(8002, 128, padding_idx=0)\n",
|
| 244 |
" (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)\n",
|
| 245 |
" (dropout): Dropout(p=0.3, inplace=False)\n",
|
|
|
|
| 246 |
" )\n",
|
| 247 |
" (decoder): Decoder(\n",
|
| 248 |
" (attention): BahdanauAttention(\n",
|
|
@@ -254,12 +249,13 @@
|
|
| 254 |
" (lstm): LSTM(640, 256, num_layers=2, batch_first=True, dropout=0.3)\n",
|
| 255 |
" (fc_out): Linear(in_features=896, out_features=8002, bias=True)\n",
|
| 256 |
" (dropout): Dropout(p=0.3, inplace=False)\n",
|
|
|
|
| 257 |
" )\n",
|
| 258 |
" (hidden_projection): Linear(in_features=512, out_features=256, bias=True)\n",
|
| 259 |
" (cell_projection): Linear(in_features=512, out_features=256, bias=True)\n",
|
| 260 |
")\n",
|
| 261 |
"\n",
|
| 262 |
-
"Total parameters: 13,
|
| 263 |
]
|
| 264 |
}
|
| 265 |
],
|
|
@@ -286,7 +282,7 @@
|
|
| 286 |
},
|
| 287 |
{
|
| 288 |
"cell_type": "code",
|
| 289 |
-
"execution_count":
|
| 290 |
"id": "2d3125a6",
|
| 291 |
"metadata": {},
|
| 292 |
"outputs": [],
|
|
@@ -304,12 +300,12 @@
|
|
| 304 |
},
|
| 305 |
{
|
| 306 |
"cell_type": "code",
|
| 307 |
-
"execution_count":
|
| 308 |
"id": "794c40e7",
|
| 309 |
"metadata": {},
|
| 310 |
"outputs": [],
|
| 311 |
"source": [
|
| 312 |
-
"def train(model, iterator, optimizer, criterion, clip=1.0, teacher_forcing_ratio=0.
|
| 313 |
" model.train()\n",
|
| 314 |
" epoch_loss = 0\n",
|
| 315 |
" \n",
|
|
@@ -361,7 +357,7 @@
|
|
| 361 |
},
|
| 362 |
{
|
| 363 |
"cell_type": "code",
|
| 364 |
-
"execution_count":
|
| 365 |
"id": "d4bb0e92",
|
| 366 |
"metadata": {},
|
| 367 |
"outputs": [
|
|
@@ -369,236 +365,192 @@
|
|
| 369 |
"name": "stderr",
|
| 370 |
"output_type": "stream",
|
| 371 |
"text": [
|
| 372 |
-
" "
|
| 373 |
-
]
|
| 374 |
-
},
|
| 375 |
-
{
|
| 376 |
-
"name": "stdout",
|
| 377 |
-
"output_type": "stream",
|
| 378 |
-
"text": [
|
| 379 |
-
"Epoch: 01/15 | Time: 7m 39s | TF Ratio: 0.50\n",
|
| 380 |
-
"\tTrain Loss: 4.5316 | Val Loss: 4.2697 | Best Val: 4.2697 ✓ SAVED\n"
|
| 381 |
-
]
|
| 382 |
-
},
|
| 383 |
-
{
|
| 384 |
-
"name": "stderr",
|
| 385 |
-
"output_type": "stream",
|
| 386 |
-
"text": [
|
| 387 |
-
" "
|
| 388 |
]
|
| 389 |
},
|
| 390 |
{
|
| 391 |
"name": "stdout",
|
| 392 |
"output_type": "stream",
|
| 393 |
"text": [
|
| 394 |
-
"Epoch:
|
| 395 |
-
"\tTrain Loss:
|
| 396 |
]
|
| 397 |
},
|
| 398 |
{
|
| 399 |
"name": "stderr",
|
| 400 |
"output_type": "stream",
|
| 401 |
"text": [
|
| 402 |
-
" "
|
| 403 |
]
|
| 404 |
},
|
| 405 |
{
|
| 406 |
"name": "stdout",
|
| 407 |
"output_type": "stream",
|
| 408 |
"text": [
|
| 409 |
-
"Epoch:
|
| 410 |
-
"\tTrain Loss: 3.
|
| 411 |
]
|
| 412 |
},
|
| 413 |
{
|
| 414 |
"name": "stderr",
|
| 415 |
"output_type": "stream",
|
| 416 |
"text": [
|
| 417 |
-
" "
|
| 418 |
]
|
| 419 |
},
|
| 420 |
{
|
| 421 |
"name": "stdout",
|
| 422 |
"output_type": "stream",
|
| 423 |
"text": [
|
| 424 |
-
"Epoch:
|
| 425 |
-
"\tTrain Loss:
|
| 426 |
]
|
| 427 |
},
|
| 428 |
{
|
| 429 |
"name": "stderr",
|
| 430 |
"output_type": "stream",
|
| 431 |
"text": [
|
| 432 |
-
" "
|
| 433 |
]
|
| 434 |
},
|
| 435 |
{
|
| 436 |
"name": "stdout",
|
| 437 |
"output_type": "stream",
|
| 438 |
"text": [
|
| 439 |
-
"Epoch:
|
| 440 |
-
"\tTrain Loss:
|
| 441 |
]
|
| 442 |
},
|
| 443 |
{
|
| 444 |
"name": "stderr",
|
| 445 |
"output_type": "stream",
|
| 446 |
"text": [
|
| 447 |
-
" "
|
| 448 |
]
|
| 449 |
},
|
| 450 |
{
|
| 451 |
"name": "stdout",
|
| 452 |
"output_type": "stream",
|
| 453 |
"text": [
|
| 454 |
-
"Epoch:
|
| 455 |
-
"\tTrain Loss: 2.
|
| 456 |
]
|
| 457 |
},
|
| 458 |
{
|
| 459 |
"name": "stderr",
|
| 460 |
"output_type": "stream",
|
| 461 |
"text": [
|
| 462 |
-
" "
|
| 463 |
]
|
| 464 |
},
|
| 465 |
{
|
| 466 |
"name": "stdout",
|
| 467 |
"output_type": "stream",
|
| 468 |
"text": [
|
| 469 |
-
"Epoch:
|
| 470 |
-
"\tTrain Loss: 2.
|
| 471 |
]
|
| 472 |
},
|
| 473 |
{
|
| 474 |
"name": "stderr",
|
| 475 |
"output_type": "stream",
|
| 476 |
"text": [
|
| 477 |
-
" "
|
| 478 |
]
|
| 479 |
},
|
| 480 |
{
|
| 481 |
"name": "stdout",
|
| 482 |
"output_type": "stream",
|
| 483 |
"text": [
|
| 484 |
-
"Epoch:
|
| 485 |
-
"\tTrain Loss:
|
| 486 |
]
|
| 487 |
},
|
| 488 |
{
|
| 489 |
"name": "stderr",
|
| 490 |
"output_type": "stream",
|
| 491 |
"text": [
|
| 492 |
-
" "
|
| 493 |
]
|
| 494 |
},
|
| 495 |
{
|
| 496 |
"name": "stdout",
|
| 497 |
"output_type": "stream",
|
| 498 |
"text": [
|
| 499 |
-
"Epoch:
|
| 500 |
-
"\tTrain Loss:
|
| 501 |
]
|
| 502 |
},
|
| 503 |
{
|
| 504 |
"name": "stderr",
|
| 505 |
"output_type": "stream",
|
| 506 |
"text": [
|
| 507 |
-
" "
|
| 508 |
]
|
| 509 |
},
|
| 510 |
{
|
| 511 |
"name": "stdout",
|
| 512 |
"output_type": "stream",
|
| 513 |
"text": [
|
| 514 |
-
"Epoch:
|
| 515 |
-
"\tTrain Loss:
|
| 516 |
]
|
| 517 |
},
|
| 518 |
{
|
| 519 |
"name": "stderr",
|
| 520 |
"output_type": "stream",
|
| 521 |
"text": [
|
| 522 |
-
" "
|
| 523 |
]
|
| 524 |
},
|
| 525 |
{
|
| 526 |
"name": "stdout",
|
| 527 |
"output_type": "stream",
|
| 528 |
"text": [
|
| 529 |
-
"Epoch:
|
| 530 |
-
"\tTrain Loss:
|
| 531 |
]
|
| 532 |
},
|
| 533 |
{
|
| 534 |
"name": "stderr",
|
| 535 |
"output_type": "stream",
|
| 536 |
"text": [
|
| 537 |
-
" "
|
| 538 |
]
|
| 539 |
},
|
| 540 |
{
|
| 541 |
"name": "stdout",
|
| 542 |
"output_type": "stream",
|
| 543 |
"text": [
|
| 544 |
-
"Epoch:
|
| 545 |
-
"\tTrain Loss:
|
| 546 |
]
|
| 547 |
},
|
| 548 |
{
|
| 549 |
"name": "stderr",
|
| 550 |
"output_type": "stream",
|
| 551 |
"text": [
|
| 552 |
-
"
|
| 553 |
]
|
| 554 |
},
|
| 555 |
{
|
| 556 |
-
"
|
| 557 |
-
"
|
| 558 |
-
"
|
| 559 |
-
|
| 560 |
-
"\
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
"
|
| 568 |
-
]
|
| 569 |
-
},
|
| 570 |
-
{
|
| 571 |
-
"name": "stdout",
|
| 572 |
-
"output_type": "stream",
|
| 573 |
-
"text": [
|
| 574 |
-
"Epoch: 14/15 | Time: 7m 38s | TF Ratio: 0.13\n",
|
| 575 |
-
"\tTrain Loss: 2.2045 | Val Loss: 2.9489 | Best Val: 2.9489 ✓ SAVED\n"
|
| 576 |
-
]
|
| 577 |
-
},
|
| 578 |
-
{
|
| 579 |
-
"name": "stderr",
|
| 580 |
-
"output_type": "stream",
|
| 581 |
-
"text": [
|
| 582 |
-
" "
|
| 583 |
-
]
|
| 584 |
-
},
|
| 585 |
-
{
|
| 586 |
-
"name": "stdout",
|
| 587 |
-
"output_type": "stream",
|
| 588 |
-
"text": [
|
| 589 |
-
"Epoch: 15/15 | Time: 7m 39s | TF Ratio: 0.11\n",
|
| 590 |
-
"\tTrain Loss: 2.1487 | Val Loss: 2.9050 | Best Val: 2.9050 ✓ SAVED\n",
|
| 591 |
-
"\n",
|
| 592 |
-
"======================================================================\n",
|
| 593 |
-
"✓ TRAINING COMPLETE!\n",
|
| 594 |
-
"Best validation loss: 2.9050\n",
|
| 595 |
-
"Model saved to 'best_model.pth'\n",
|
| 596 |
-
"======================================================================\n"
|
| 597 |
]
|
| 598 |
}
|
| 599 |
],
|
| 600 |
"source": [
|
| 601 |
-
"EPOCHS =
|
| 602 |
"CLIP = 1.0\n",
|
| 603 |
"best_valid_loss = float('inf')\n",
|
| 604 |
"training_history = {'train_loss': [], 'valid_loss': []}\n",
|
|
@@ -645,7 +597,7 @@
|
|
| 645 |
},
|
| 646 |
{
|
| 647 |
"cell_type": "code",
|
| 648 |
-
"execution_count":
|
| 649 |
"id": "6d9a8e25",
|
| 650 |
"metadata": {},
|
| 651 |
"outputs": [
|
|
@@ -653,12 +605,14 @@
|
|
| 653 |
"name": "stdout",
|
| 654 |
"output_type": "stream",
|
| 655 |
"text": [
|
| 656 |
-
"Loaded checkpoint from best_model.pth (epoch
|
| 657 |
-
"Sample input (truncated): [CLS:
|
| 658 |
-
"
|
|
|
|
|
|
|
| 659 |
"\n",
|
| 660 |
-
"Reference pragma: omp
|
| 661 |
-
"Model prediction: omp parallel for
|
| 662 |
]
|
| 663 |
}
|
| 664 |
],
|
|
@@ -712,8 +666,8 @@
|
|
| 712 |
" return tokenizer.decode(generated)\n",
|
| 713 |
"\n",
|
| 714 |
"# Quick sanity check on a validation example\n",
|
| 715 |
-
"sample_input = val_inputs[
|
| 716 |
-
"reference = val_outputs[
|
| 717 |
"prediction = greedy_generate(sample_input)\n",
|
| 718 |
"print(\"Sample input (truncated):\", sample_input[:140] + \"...\" if len(sample_input) > 140 else sample_input)\n",
|
| 719 |
"print(\"Reference pragma:\", reference)\n",
|
|
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
+
"execution_count": 6,
|
| 6 |
"id": "bae751d8",
|
| 7 |
"metadata": {},
|
| 8 |
"outputs": [],
|
|
|
|
| 26 |
},
|
| 27 |
{
|
| 28 |
"cell_type": "code",
|
| 29 |
+
"execution_count": 7,
|
| 30 |
"id": "c0e30f61",
|
| 31 |
"metadata": {},
|
| 32 |
"outputs": [
|
|
|
|
| 42 |
{
|
| 43 |
"data": {
|
| 44 |
"text/plain": [
|
| 45 |
+
"<tokenizer.Tokenizer at 0x7d2bbbafcb90>"
|
| 46 |
]
|
| 47 |
},
|
| 48 |
+
"execution_count": 7,
|
| 49 |
"metadata": {},
|
| 50 |
"output_type": "execute_result"
|
| 51 |
}
|
|
|
|
| 57 |
},
|
| 58 |
{
|
| 59 |
"cell_type": "code",
|
| 60 |
+
"execution_count": 8,
|
| 61 |
"id": "db130c45",
|
| 62 |
"metadata": {},
|
| 63 |
"outputs": [
|
|
|
|
| 134 |
},
|
| 135 |
{
|
| 136 |
"cell_type": "code",
|
| 137 |
+
"execution_count": 9,
|
| 138 |
"id": "d5747915",
|
| 139 |
"metadata": {},
|
| 140 |
"outputs": [
|
|
|
|
| 173 |
},
|
| 174 |
{
|
| 175 |
"cell_type": "code",
|
| 176 |
+
"execution_count": 10,
|
| 177 |
"id": "5252d457",
|
| 178 |
"metadata": {},
|
| 179 |
"outputs": [
|
|
|
|
| 190 |
" input shape: torch.Size([32, 500])\n",
|
| 191 |
" output shape: torch.Size([32, 100])\n",
|
| 192 |
" input_len shape: torch.Size([32])\n",
|
| 193 |
+
" First sample input_len: 12\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
]
|
| 195 |
}
|
| 196 |
],
|
|
|
|
| 223 |
},
|
| 224 |
{
|
| 225 |
"cell_type": "code",
|
| 226 |
+
"execution_count": 11,
|
| 227 |
"id": "11631bed",
|
| 228 |
"metadata": {},
|
| 229 |
"outputs": [
|
|
|
|
| 237 |
" (embedding): Embedding(8002, 128, padding_idx=0)\n",
|
| 238 |
" (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)\n",
|
| 239 |
" (dropout): Dropout(p=0.3, inplace=False)\n",
|
| 240 |
+
" (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n",
|
| 241 |
" )\n",
|
| 242 |
" (decoder): Decoder(\n",
|
| 243 |
" (attention): BahdanauAttention(\n",
|
|
|
|
| 249 |
" (lstm): LSTM(640, 256, num_layers=2, batch_first=True, dropout=0.3)\n",
|
| 250 |
" (fc_out): Linear(in_features=896, out_features=8002, bias=True)\n",
|
| 251 |
" (dropout): Dropout(p=0.3, inplace=False)\n",
|
| 252 |
+
" (layer_norm): LayerNorm((896,), eps=1e-05, elementwise_affine=True)\n",
|
| 253 |
" )\n",
|
| 254 |
" (hidden_projection): Linear(in_features=512, out_features=256, bias=True)\n",
|
| 255 |
" (cell_projection): Linear(in_features=512, out_features=256, bias=True)\n",
|
| 256 |
")\n",
|
| 257 |
"\n",
|
| 258 |
+
"Total parameters: 13,502,531\n"
|
| 259 |
]
|
| 260 |
}
|
| 261 |
],
|
|
|
|
| 282 |
},
|
| 283 |
{
|
| 284 |
"cell_type": "code",
|
| 285 |
+
"execution_count": 12,
|
| 286 |
"id": "2d3125a6",
|
| 287 |
"metadata": {},
|
| 288 |
"outputs": [],
|
|
|
|
| 300 |
},
|
| 301 |
{
|
| 302 |
"cell_type": "code",
|
| 303 |
+
"execution_count": 13,
|
| 304 |
"id": "794c40e7",
|
| 305 |
"metadata": {},
|
| 306 |
"outputs": [],
|
| 307 |
"source": [
|
| 308 |
+
"def train(model, iterator, optimizer, criterion, clip=1.0, teacher_forcing_ratio=0.8):\n",
|
| 309 |
" model.train()\n",
|
| 310 |
" epoch_loss = 0\n",
|
| 311 |
" \n",
|
|
|
|
| 357 |
},
|
| 358 |
{
|
| 359 |
"cell_type": "code",
|
| 360 |
+
"execution_count": 14,
|
| 361 |
"id": "d4bb0e92",
|
| 362 |
"metadata": {},
|
| 363 |
"outputs": [
|
|
|
|
| 365 |
"name": "stderr",
|
| 366 |
"output_type": "stream",
|
| 367 |
"text": [
|
| 368 |
+
" \r"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
]
|
| 370 |
},
|
| 371 |
{
|
| 372 |
"name": "stdout",
|
| 373 |
"output_type": "stream",
|
| 374 |
"text": [
|
| 375 |
+
"Epoch: 01/25 | Time: 8m 1s | TF Ratio: 0.50\n",
|
| 376 |
+
"\tTrain Loss: 4.1408 | Val Loss: 3.8033 | Best Val: 3.8033 ✓ SAVED\n"
|
| 377 |
]
|
| 378 |
},
|
| 379 |
{
|
| 380 |
"name": "stderr",
|
| 381 |
"output_type": "stream",
|
| 382 |
"text": [
|
| 383 |
+
" \r"
|
| 384 |
]
|
| 385 |
},
|
| 386 |
{
|
| 387 |
"name": "stdout",
|
| 388 |
"output_type": "stream",
|
| 389 |
"text": [
|
| 390 |
+
"Epoch: 02/25 | Time: 7m 41s | TF Ratio: 0.45\n",
|
| 391 |
+
"\tTrain Loss: 3.0543 | Val Loss: 3.5220 | Best Val: 3.5220 ✓ SAVED\n"
|
| 392 |
]
|
| 393 |
},
|
| 394 |
{
|
| 395 |
"name": "stderr",
|
| 396 |
"output_type": "stream",
|
| 397 |
"text": [
|
| 398 |
+
" \r"
|
| 399 |
]
|
| 400 |
},
|
| 401 |
{
|
| 402 |
"name": "stdout",
|
| 403 |
"output_type": "stream",
|
| 404 |
"text": [
|
| 405 |
+
"Epoch: 03/25 | Time: 7m 40s | TF Ratio: 0.41\n",
|
| 406 |
+
"\tTrain Loss: 2.6443 | Val Loss: 3.2353 | Best Val: 3.2353 ✓ SAVED\n"
|
| 407 |
]
|
| 408 |
},
|
| 409 |
{
|
| 410 |
"name": "stderr",
|
| 411 |
"output_type": "stream",
|
| 412 |
"text": [
|
| 413 |
+
" \r"
|
| 414 |
]
|
| 415 |
},
|
| 416 |
{
|
| 417 |
"name": "stdout",
|
| 418 |
"output_type": "stream",
|
| 419 |
"text": [
|
| 420 |
+
"Epoch: 04/25 | Time: 7m 44s | TF Ratio: 0.36\n",
|
| 421 |
+
"\tTrain Loss: 2.3818 | Val Loss: 3.1132 | Best Val: 3.1132 ✓ SAVED\n"
|
| 422 |
]
|
| 423 |
},
|
| 424 |
{
|
| 425 |
"name": "stderr",
|
| 426 |
"output_type": "stream",
|
| 427 |
"text": [
|
| 428 |
+
" \r"
|
| 429 |
]
|
| 430 |
},
|
| 431 |
{
|
| 432 |
"name": "stdout",
|
| 433 |
"output_type": "stream",
|
| 434 |
"text": [
|
| 435 |
+
"Epoch: 05/25 | Time: 7m 42s | TF Ratio: 0.33\n",
|
| 436 |
+
"\tTrain Loss: 2.2041 | Val Loss: 2.9274 | Best Val: 2.9274 ✓ SAVED\n"
|
| 437 |
]
|
| 438 |
},
|
| 439 |
{
|
| 440 |
"name": "stderr",
|
| 441 |
"output_type": "stream",
|
| 442 |
"text": [
|
| 443 |
+
" \r"
|
| 444 |
]
|
| 445 |
},
|
| 446 |
{
|
| 447 |
"name": "stdout",
|
| 448 |
"output_type": "stream",
|
| 449 |
"text": [
|
| 450 |
+
"Epoch: 06/25 | Time: 7m 36s | TF Ratio: 0.30\n",
|
| 451 |
+
"\tTrain Loss: 2.0576 | Val Loss: 2.8356 | Best Val: 2.8356 ✓ SAVED\n"
|
| 452 |
]
|
| 453 |
},
|
| 454 |
{
|
| 455 |
"name": "stderr",
|
| 456 |
"output_type": "stream",
|
| 457 |
"text": [
|
| 458 |
+
" \r"
|
| 459 |
]
|
| 460 |
},
|
| 461 |
{
|
| 462 |
"name": "stdout",
|
| 463 |
"output_type": "stream",
|
| 464 |
"text": [
|
| 465 |
+
"Epoch: 07/25 | Time: 7m 41s | TF Ratio: 0.27\n",
|
| 466 |
+
"\tTrain Loss: 1.9377 | Val Loss: 2.8092 | Best Val: 2.8092 ✓ SAVED\n"
|
| 467 |
]
|
| 468 |
},
|
| 469 |
{
|
| 470 |
"name": "stderr",
|
| 471 |
"output_type": "stream",
|
| 472 |
"text": [
|
| 473 |
+
" \r"
|
| 474 |
]
|
| 475 |
},
|
| 476 |
{
|
| 477 |
"name": "stdout",
|
| 478 |
"output_type": "stream",
|
| 479 |
"text": [
|
| 480 |
+
"Epoch: 08/25 | Time: 7m 39s | TF Ratio: 0.24\n",
|
| 481 |
+
"\tTrain Loss: 1.8034 | Val Loss: 2.8102 | Best Val: 2.8092 \n"
|
| 482 |
]
|
| 483 |
},
|
| 484 |
{
|
| 485 |
"name": "stderr",
|
| 486 |
"output_type": "stream",
|
| 487 |
"text": [
|
| 488 |
+
" \r"
|
| 489 |
]
|
| 490 |
},
|
| 491 |
{
|
| 492 |
"name": "stdout",
|
| 493 |
"output_type": "stream",
|
| 494 |
"text": [
|
| 495 |
+
"Epoch: 09/25 | Time: 7m 39s | TF Ratio: 0.22\n",
|
| 496 |
+
"\tTrain Loss: 1.7125 | Val Loss: 2.7772 | Best Val: 2.7772 ✓ SAVED\n"
|
| 497 |
]
|
| 498 |
},
|
| 499 |
{
|
| 500 |
"name": "stderr",
|
| 501 |
"output_type": "stream",
|
| 502 |
"text": [
|
| 503 |
+
" \r"
|
| 504 |
]
|
| 505 |
},
|
| 506 |
{
|
| 507 |
"name": "stdout",
|
| 508 |
"output_type": "stream",
|
| 509 |
"text": [
|
| 510 |
+
"Epoch: 10/25 | Time: 7m 38s | TF Ratio: 0.19\n",
|
| 511 |
+
"\tTrain Loss: 1.6454 | Val Loss: 2.8247 | Best Val: 2.7772 \n"
|
| 512 |
]
|
| 513 |
},
|
| 514 |
{
|
| 515 |
"name": "stderr",
|
| 516 |
"output_type": "stream",
|
| 517 |
"text": [
|
| 518 |
+
" \r"
|
| 519 |
]
|
| 520 |
},
|
| 521 |
{
|
| 522 |
"name": "stdout",
|
| 523 |
"output_type": "stream",
|
| 524 |
"text": [
|
| 525 |
+
"Epoch: 11/25 | Time: 7m 42s | TF Ratio: 0.17\n",
|
| 526 |
+
"\tTrain Loss: 1.5686 | Val Loss: 2.8969 | Best Val: 2.7772 \n"
|
| 527 |
]
|
| 528 |
},
|
| 529 |
{
|
| 530 |
"name": "stderr",
|
| 531 |
"output_type": "stream",
|
| 532 |
"text": [
|
| 533 |
+
" \r"
|
| 534 |
]
|
| 535 |
},
|
| 536 |
{
|
| 537 |
+
"ename": "KeyboardInterrupt",
|
| 538 |
+
"evalue": "",
|
| 539 |
+
"output_type": "error",
|
| 540 |
+
"traceback": [
|
| 541 |
+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 542 |
+
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
|
| 543 |
+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[14]\u001b[39m\u001b[32m, line 10\u001b[39m\n\u001b[32m 7\u001b[39m start_time = time.time()\n\u001b[32m 9\u001b[39m tf_ratio = \u001b[38;5;28mmax\u001b[39m(\u001b[32m0.1\u001b[39m, \u001b[32m0.5\u001b[39m * (\u001b[32m0.9\u001b[39m ** epoch))\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m train_loss = \u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_loader\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptimizer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcriterion\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mCLIP\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtf_ratio\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 11\u001b[39m valid_loss = evaluate(model, val_loader, criterion)\n\u001b[32m 12\u001b[39m scheduler.step(valid_loss)\n",
|
| 544 |
+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[13]\u001b[39m\u001b[32m, line 17\u001b[39m, in \u001b[36mtrain\u001b[39m\u001b[34m(model, iterator, optimizer, criterion, clip, teacher_forcing_ratio)\u001b[39m\n\u001b[32m 14\u001b[39m trg = trg[\u001b[32m1\u001b[39m:].reshape(-\u001b[32m1\u001b[39m)\n\u001b[32m 16\u001b[39m loss = criterion(output, trg)\n\u001b[32m---> \u001b[39m\u001b[32m17\u001b[39m \u001b[43mloss\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 19\u001b[39m torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n\u001b[32m 21\u001b[39m optimizer.step()\n",
|
| 545 |
+
"\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/projects/env/lib/python3.12/site-packages/torch/_tensor.py:630\u001b[39m, in \u001b[36mTensor.backward\u001b[39m\u001b[34m(self, gradient, retain_graph, create_graph, inputs)\u001b[39m\n\u001b[32m 620\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m 621\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[32m 622\u001b[39m Tensor.backward,\n\u001b[32m 623\u001b[39m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[32m (...)\u001b[39m\u001b[32m 628\u001b[39m inputs=inputs,\n\u001b[32m 629\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m630\u001b[39m \u001b[43mtorch\u001b[49m\u001b[43m.\u001b[49m\u001b[43mautograd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 631\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs\u001b[49m\n\u001b[32m 632\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 546 |
+
"\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/projects/env/lib/python3.12/site-packages/torch/autograd/__init__.py:364\u001b[39m, in \u001b[36mbackward\u001b[39m\u001b[34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[39m\n\u001b[32m 359\u001b[39m retain_graph = create_graph\n\u001b[32m 361\u001b[39m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[32m 362\u001b[39m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[32m 363\u001b[39m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m364\u001b[39m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 365\u001b[39m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 366\u001b[39m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 367\u001b[39m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 368\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 369\u001b[39m \u001b[43m \u001b[49m\u001b[43minputs_tuple\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 370\u001b[39m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 371\u001b[39m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 372\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 547 |
+
"\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/projects/env/lib/python3.12/site-packages/torch/autograd/graph.py:865\u001b[39m, in \u001b[36m_engine_run_backward\u001b[39m\u001b[34m(t_outputs, *args, **kwargs)\u001b[39m\n\u001b[32m 863\u001b[39m unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[32m 864\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m865\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mVariable\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_execution_engine\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[32m 866\u001b[39m \u001b[43m \u001b[49m\u001b[43mt_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 867\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[32m 868\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 869\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n",
|
| 548 |
+
"\u001b[31mKeyboardInterrupt\u001b[39m: "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 549 |
]
|
| 550 |
}
|
| 551 |
],
|
| 552 |
"source": [
|
| 553 |
+
"EPOCHS = 25\n",
|
| 554 |
"CLIP = 1.0\n",
|
| 555 |
"best_valid_loss = float('inf')\n",
|
| 556 |
"training_history = {'train_loss': [], 'valid_loss': []}\n",
|
|
|
|
| 597 |
},
|
| 598 |
{
|
| 599 |
"cell_type": "code",
|
| 600 |
+
"execution_count": 32,
|
| 601 |
"id": "6d9a8e25",
|
| 602 |
"metadata": {},
|
| 603 |
"outputs": [
|
|
|
|
| 605 |
"name": "stdout",
|
| 606 |
"output_type": "stream",
|
| 607 |
"text": [
|
| 608 |
+
"Loaded checkpoint from best_model.pth (epoch 8)\n",
|
| 609 |
+
"Sample input (truncated): [CLS:reduction] for (i = 0; i < 1000; ++i)\n",
|
| 610 |
+
"{\n",
|
| 611 |
+
" logic_and = logic_and && logics[i];\n",
|
| 612 |
+
"}\n",
|
| 613 |
"\n",
|
| 614 |
+
"Reference pragma: omp parallel for schedule(dynamic,1) private(i) reduction(&&:logic_and)\n",
|
| 615 |
+
"Model prediction: omp parallel for schedule(dynamic,1) private(i) reduction(&&:logic_and)\n"
|
| 616 |
]
|
| 617 |
}
|
| 618 |
],
|
|
|
|
| 666 |
" return tokenizer.decode(generated)\n",
|
| 667 |
"\n",
|
| 668 |
"# Quick sanity check on a validation example\n",
|
| 669 |
+
"sample_input = val_inputs[18]\n",
|
| 670 |
+
"reference = val_outputs[18]\n",
|
| 671 |
"prediction = greedy_generate(sample_input)\n",
|
| 672 |
"print(\"Sample input (truncated):\", sample_input[:140] + \"...\" if len(sample_input) > 140 else sample_input)\n",
|
| 673 |
"print(\"Reference pragma:\", reference)\n",
|
model/__pycache__/attn.cpython-312.pyc
DELETED
|
Binary file (2.35 kB)
|
|
|
model/__pycache__/decoder.cpython-312.pyc
DELETED
|
Binary file (3.12 kB)
|
|
|
model/__pycache__/encoder.cpython-312.pyc
DELETED
|
Binary file (2.48 kB)
|
|
|
model/__pycache__/generator.cpython-312.pyc
DELETED
|
Binary file (5.35 kB)
|
|
|
model/decoder.py
CHANGED
|
@@ -39,6 +39,7 @@ class Decoder(nn.Module):
|
|
| 39 |
)
|
| 40 |
|
| 41 |
self.dropout = nn.Dropout(dropout)
|
|
|
|
| 42 |
|
| 43 |
def forward(
|
| 44 |
self,
|
|
@@ -53,6 +54,7 @@ class Decoder(nn.Module):
|
|
| 53 |
context, attention_weights = self.attention(
|
| 54 |
top_hidden, encoder_outputs, mask
|
| 55 |
)
|
|
|
|
| 56 |
|
| 57 |
lstm_input = torch.cat((embedded, context.unsqueeze(1)), dim=2)
|
| 58 |
|
|
@@ -65,7 +67,8 @@ class Decoder(nn.Module):
|
|
| 65 |
embedded = embedded.squeeze(1)
|
| 66 |
|
| 67 |
output_context = torch.cat((output, context, embedded), dim=1)
|
|
|
|
| 68 |
|
| 69 |
prediction = self.fc_out(output_context)
|
| 70 |
|
| 71 |
-
return prediction, decoder_hidden, decoder_cell, attention_weights
|
|
|
|
| 39 |
)
|
| 40 |
|
| 41 |
self.dropout = nn.Dropout(dropout)
|
| 42 |
+
self.layer_norm = nn.LayerNorm(hidden_size + hidden_size * 2 + embed_size)
|
| 43 |
|
| 44 |
def forward(
|
| 45 |
self,
|
|
|
|
| 54 |
context, attention_weights = self.attention(
|
| 55 |
top_hidden, encoder_outputs, mask
|
| 56 |
)
|
| 57 |
+
context = self.dropout(context)
|
| 58 |
|
| 59 |
lstm_input = torch.cat((embedded, context.unsqueeze(1)), dim=2)
|
| 60 |
|
|
|
|
| 67 |
embedded = embedded.squeeze(1)
|
| 68 |
|
| 69 |
output_context = torch.cat((output, context, embedded), dim=1)
|
| 70 |
+
output_context = self.layer_norm(output_context)
|
| 71 |
|
| 72 |
prediction = self.fc_out(output_context)
|
| 73 |
|
| 74 |
+
return prediction, decoder_hidden, decoder_cell, attention_weights
|
model/encoder.py
CHANGED
|
@@ -32,8 +32,8 @@ class Encoder(nn.Module):
|
|
| 32 |
dropout=dropout if num_layers > 1 else 0,
|
| 33 |
bidirectional=True
|
| 34 |
)
|
| 35 |
-
|
| 36 |
self.dropout = nn.Dropout(dropout)
|
|
|
|
| 37 |
|
| 38 |
def forward(
|
| 39 |
self,
|
|
@@ -52,5 +52,5 @@ class Encoder(nn.Module):
|
|
| 52 |
packed_output,
|
| 53 |
batch_first=True
|
| 54 |
)
|
| 55 |
-
|
| 56 |
-
return outputs, hidden, cell
|
|
|
|
| 32 |
dropout=dropout if num_layers > 1 else 0,
|
| 33 |
bidirectional=True
|
| 34 |
)
|
|
|
|
| 35 |
self.dropout = nn.Dropout(dropout)
|
| 36 |
+
self.layer_norm = nn.LayerNorm(hidden_size * 2)
|
| 37 |
|
| 38 |
def forward(
|
| 39 |
self,
|
|
|
|
| 52 |
packed_output,
|
| 53 |
batch_first=True
|
| 54 |
)
|
| 55 |
+
outputs = self.layer_norm(outputs)
|
| 56 |
+
return outputs, hidden, cell
|