Commit ·
cab758f
1
Parent(s): 094c6df
Training in progress, step 400
Browse files
fine-tune-atco2-non-streaming.ipynb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "markdown",
|
| 5 |
-
"id": "
|
| 6 |
"metadata": {
|
| 7 |
"id": "75b58048-7d14-4fc6-8085-1fc08c81b4a6"
|
| 8 |
},
|
|
@@ -12,7 +12,7 @@
|
|
| 12 |
},
|
| 13 |
{
|
| 14 |
"cell_type": "markdown",
|
| 15 |
-
"id": "
|
| 16 |
"metadata": {
|
| 17 |
"id": "fbfa8ad5-4cdc-4512-9058-836cbbf65e1a"
|
| 18 |
},
|
|
@@ -25,7 +25,7 @@
|
|
| 25 |
},
|
| 26 |
{
|
| 27 |
"cell_type": "markdown",
|
| 28 |
-
"id": "
|
| 29 |
"metadata": {
|
| 30 |
"id": "afe0d503-ae4e-4aa7-9af4-dbcba52db41e"
|
| 31 |
},
|
|
@@ -35,7 +35,7 @@
|
|
| 35 |
},
|
| 36 |
{
|
| 37 |
"cell_type": "markdown",
|
| 38 |
-
"id": "
|
| 39 |
"metadata": {
|
| 40 |
"id": "9ae91ed4-9c3e-4ade-938e-f4c2dcfbfdc0"
|
| 41 |
},
|
|
@@ -67,7 +67,7 @@
|
|
| 67 |
},
|
| 68 |
{
|
| 69 |
"cell_type": "markdown",
|
| 70 |
-
"id": "
|
| 71 |
"metadata": {
|
| 72 |
"id": "e59b91d6-be24-4b5e-bb38-4977ea143a72"
|
| 73 |
},
|
|
@@ -86,7 +86,7 @@
|
|
| 86 |
},
|
| 87 |
{
|
| 88 |
"cell_type": "markdown",
|
| 89 |
-
"id": "
|
| 90 |
"metadata": {
|
| 91 |
"id": "21b6316e-8a55-4549-a154-66d3da2ab74a"
|
| 92 |
},
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
},
|
| 116 |
{
|
| 117 |
"cell_type": "markdown",
|
| 118 |
-
"id": "
|
| 119 |
"metadata": {
|
| 120 |
"id": "3a680dfc-cbba-4f6c-8a1f-e1a5ff3f123a"
|
| 121 |
},
|
|
@@ -127,7 +127,7 @@
|
|
| 127 |
},
|
| 128 |
{
|
| 129 |
"cell_type": "markdown",
|
| 130 |
-
"id": "
|
| 131 |
"metadata": {
|
| 132 |
"id": "b219c9dd-39b6-4a95-b2a1-3f547a1e7bc0"
|
| 133 |
},
|
|
@@ -137,7 +137,7 @@
|
|
| 137 |
},
|
| 138 |
{
|
| 139 |
"cell_type": "markdown",
|
| 140 |
-
"id": "
|
| 141 |
"metadata": {
|
| 142 |
"id": "674429c5-0ab4-4adf-975b-621bb69eca38"
|
| 143 |
},
|
|
@@ -155,7 +155,7 @@
|
|
| 155 |
{
|
| 156 |
"cell_type": "code",
|
| 157 |
"execution_count": 1,
|
| 158 |
-
"id": "
|
| 159 |
"metadata": {
|
| 160 |
"id": "a2787582-554f-44ce-9f38-4180a5ed6b44"
|
| 161 |
},
|
|
@@ -171,7 +171,7 @@
|
|
| 171 |
{
|
| 172 |
"cell_type": "code",
|
| 173 |
"execution_count": 2,
|
| 174 |
-
"id": "
|
| 175 |
"metadata": {},
|
| 176 |
"outputs": [
|
| 177 |
{
|
|
@@ -212,7 +212,7 @@
|
|
| 212 |
},
|
| 213 |
{
|
| 214 |
"cell_type": "markdown",
|
| 215 |
-
"id": "
|
| 216 |
"metadata": {
|
| 217 |
"id": "d5c7c3d6-7197-41e7-a088-49b753c1681f"
|
| 218 |
},
|
|
@@ -227,7 +227,7 @@
|
|
| 227 |
{
|
| 228 |
"cell_type": "code",
|
| 229 |
"execution_count": 3,
|
| 230 |
-
"id": "
|
| 231 |
"metadata": {
|
| 232 |
"id": "20ba635d-518c-47ac-97ee-3cad25f1e0ce"
|
| 233 |
},
|
|
@@ -257,7 +257,7 @@
|
|
| 257 |
},
|
| 258 |
{
|
| 259 |
"cell_type": "markdown",
|
| 260 |
-
"id": "
|
| 261 |
"metadata": {
|
| 262 |
"id": "2d63b2d2-f68a-4d74-b7f1-5127f6d16605"
|
| 263 |
},
|
|
@@ -267,7 +267,7 @@
|
|
| 267 |
},
|
| 268 |
{
|
| 269 |
"cell_type": "markdown",
|
| 270 |
-
"id": "
|
| 271 |
"metadata": {
|
| 272 |
"id": "601c3099-1026-439e-93e2-5635b3ba5a73"
|
| 273 |
},
|
|
@@ -287,7 +287,7 @@
|
|
| 287 |
},
|
| 288 |
{
|
| 289 |
"cell_type": "markdown",
|
| 290 |
-
"id": "
|
| 291 |
"metadata": {
|
| 292 |
"id": "560332eb-3558-41a1-b500-e83a9f695f84"
|
| 293 |
},
|
|
@@ -297,7 +297,7 @@
|
|
| 297 |
},
|
| 298 |
{
|
| 299 |
"cell_type": "markdown",
|
| 300 |
-
"id": "
|
| 301 |
"metadata": {
|
| 302 |
"id": "32ec8068-0bd7-412d-b662-0edb9d1e7365"
|
| 303 |
},
|
|
@@ -309,7 +309,7 @@
|
|
| 309 |
},
|
| 310 |
{
|
| 311 |
"cell_type": "markdown",
|
| 312 |
-
"id": "
|
| 313 |
"metadata": {
|
| 314 |
"id": "589d9ec1-d12b-4b64-93f7-04c63997da19"
|
| 315 |
},
|
|
@@ -324,7 +324,7 @@
|
|
| 324 |
},
|
| 325 |
{
|
| 326 |
"cell_type": "markdown",
|
| 327 |
-
"id": "
|
| 328 |
"metadata": {
|
| 329 |
"id": "b2ef54d5-b946-4c1d-9fdc-adc5d01b46aa"
|
| 330 |
},
|
|
@@ -335,7 +335,7 @@
|
|
| 335 |
{
|
| 336 |
"cell_type": "code",
|
| 337 |
"execution_count": 4,
|
| 338 |
-
"id": "
|
| 339 |
"metadata": {
|
| 340 |
"id": "bc77d7bb-f9e2-47f5-b663-30f7a4321ce5"
|
| 341 |
},
|
|
@@ -348,7 +348,7 @@
|
|
| 348 |
},
|
| 349 |
{
|
| 350 |
"cell_type": "markdown",
|
| 351 |
-
"id": "
|
| 352 |
"metadata": {
|
| 353 |
"id": "93748af7-b917-4ecf-a0c8-7d89077ff9cb"
|
| 354 |
},
|
|
@@ -358,7 +358,7 @@
|
|
| 358 |
},
|
| 359 |
{
|
| 360 |
"cell_type": "markdown",
|
| 361 |
-
"id": "
|
| 362 |
"metadata": {
|
| 363 |
"id": "2bc82609-a9fb-447a-a2af-99597c864029"
|
| 364 |
},
|
|
@@ -372,7 +372,7 @@
|
|
| 372 |
{
|
| 373 |
"cell_type": "code",
|
| 374 |
"execution_count": 5,
|
| 375 |
-
"id": "
|
| 376 |
"metadata": {
|
| 377 |
"id": "c7b07f9b-ae0e-4f89-98f0-0c50d432eab6",
|
| 378 |
"outputId": "5c004b44-86e7-4e00-88be-39e0af5eed69"
|
|
@@ -386,7 +386,7 @@
|
|
| 386 |
},
|
| 387 |
{
|
| 388 |
"cell_type": "markdown",
|
| 389 |
-
"id": "
|
| 390 |
"metadata": {
|
| 391 |
"id": "d2ef23f3-f4a8-483a-a2dc-080a7496cb1b"
|
| 392 |
},
|
|
@@ -396,7 +396,7 @@
|
|
| 396 |
},
|
| 397 |
{
|
| 398 |
"cell_type": "markdown",
|
| 399 |
-
"id": "
|
| 400 |
"metadata": {
|
| 401 |
"id": "5ff67654-5a29-4bb8-a69d-0228946c6f8d"
|
| 402 |
},
|
|
@@ -412,7 +412,7 @@
|
|
| 412 |
{
|
| 413 |
"cell_type": "code",
|
| 414 |
"execution_count": 6,
|
| 415 |
-
"id": "
|
| 416 |
"metadata": {
|
| 417 |
"id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6"
|
| 418 |
},
|
|
@@ -425,7 +425,7 @@
|
|
| 425 |
},
|
| 426 |
{
|
| 427 |
"cell_type": "markdown",
|
| 428 |
-
"id": "
|
| 429 |
"metadata": {
|
| 430 |
"id": "381acd09-0b0f-4d04-9eb3-f028ac0e5f2c"
|
| 431 |
},
|
|
@@ -435,7 +435,7 @@
|
|
| 435 |
},
|
| 436 |
{
|
| 437 |
"cell_type": "markdown",
|
| 438 |
-
"id": "
|
| 439 |
"metadata": {
|
| 440 |
"id": "9649bf01-2e8a-45e5-8fca-441c13637b8f"
|
| 441 |
},
|
|
@@ -447,7 +447,7 @@
|
|
| 447 |
{
|
| 448 |
"cell_type": "code",
|
| 449 |
"execution_count": 7,
|
| 450 |
-
"id": "
|
| 451 |
"metadata": {
|
| 452 |
"id": "6e6b0ec5-0c94-4e2c-ae24-c791be1b2255"
|
| 453 |
},
|
|
@@ -467,7 +467,7 @@
|
|
| 467 |
},
|
| 468 |
{
|
| 469 |
"cell_type": "markdown",
|
| 470 |
-
"id": "
|
| 471 |
"metadata": {
|
| 472 |
"id": "5a679f05-063d-41b3-9b58-4fc9c6ccf4fd"
|
| 473 |
},
|
|
@@ -486,7 +486,7 @@
|
|
| 486 |
{
|
| 487 |
"cell_type": "code",
|
| 488 |
"execution_count": 8,
|
| 489 |
-
"id": "
|
| 490 |
"metadata": {
|
| 491 |
"id": "f12e2e57-156f-417b-8cfb-69221cc198e8"
|
| 492 |
},
|
|
@@ -499,7 +499,7 @@
|
|
| 499 |
},
|
| 500 |
{
|
| 501 |
"cell_type": "markdown",
|
| 502 |
-
"id": "
|
| 503 |
"metadata": {
|
| 504 |
"id": "00382a3e-abec-4cdd-a54c-d1aaa3ea4707"
|
| 505 |
},
|
|
@@ -511,7 +511,7 @@
|
|
| 511 |
{
|
| 512 |
"cell_type": "code",
|
| 513 |
"execution_count": 9,
|
| 514 |
-
"id": "
|
| 515 |
"metadata": {
|
| 516 |
"id": "87122d71-289a-466a-afcf-fa354b18946b"
|
| 517 |
},
|
|
@@ -531,7 +531,7 @@
|
|
| 531 |
},
|
| 532 |
{
|
| 533 |
"cell_type": "markdown",
|
| 534 |
-
"id": "
|
| 535 |
"metadata": {},
|
| 536 |
"source": [
|
| 537 |
"We'll define our pre-processing strategy. We advise that you **do not** lower-case the transcriptions or remove punctuation unless mixing different datasets. This will enable you to fine-tune Whisper models that can predict punctuation and casing. Later, you will see how we can evaluate the predictions without punctuation or casing, so that the models benefit from the WER improvement obtained by normalising the transcriptions while still predicting fully formatted transcriptions."
|
|
@@ -540,7 +540,7 @@
|
|
| 540 |
{
|
| 541 |
"cell_type": "code",
|
| 542 |
"execution_count": 10,
|
| 543 |
-
"id": "
|
| 544 |
"metadata": {},
|
| 545 |
"outputs": [],
|
| 546 |
"source": [
|
|
@@ -554,7 +554,7 @@
|
|
| 554 |
},
|
| 555 |
{
|
| 556 |
"cell_type": "markdown",
|
| 557 |
-
"id": "
|
| 558 |
"metadata": {},
|
| 559 |
"source": [
|
| 560 |
"Now we can write a function to prepare our data ready for the model:\n",
|
|
@@ -567,7 +567,7 @@
|
|
| 567 |
{
|
| 568 |
"cell_type": "code",
|
| 569 |
"execution_count": 11,
|
| 570 |
-
"id": "
|
| 571 |
"metadata": {},
|
| 572 |
"outputs": [],
|
| 573 |
"source": [
|
|
@@ -594,7 +594,7 @@
|
|
| 594 |
},
|
| 595 |
{
|
| 596 |
"cell_type": "markdown",
|
| 597 |
-
"id": "
|
| 598 |
"metadata": {
|
| 599 |
"id": "70b319fb-2439-4ef6-a70d-a47bf41c4a13"
|
| 600 |
},
|
|
@@ -605,7 +605,7 @@
|
|
| 605 |
{
|
| 606 |
"cell_type": "code",
|
| 607 |
"execution_count": 12,
|
| 608 |
-
"id": "
|
| 609 |
"metadata": {
|
| 610 |
"id": "7b73ab39-ffaf-4b9e-86e5-782963c6134b"
|
| 611 |
},
|
|
@@ -841,7 +841,7 @@
|
|
| 841 |
},
|
| 842 |
{
|
| 843 |
"cell_type": "markdown",
|
| 844 |
-
"id": "
|
| 845 |
"metadata": {},
|
| 846 |
"source": [
|
| 847 |
"Finally, we filter any training data with audio samples longer than 30s. These samples would otherwise be truncated by the Whisper feature-extractor which could affect the stability of training. We define a function that returns `True` for samples that are less than 30s, and `False` for those that are longer:"
|
|
@@ -850,7 +850,7 @@
|
|
| 850 |
{
|
| 851 |
"cell_type": "code",
|
| 852 |
"execution_count": 13,
|
| 853 |
-
"id": "
|
| 854 |
"metadata": {},
|
| 855 |
"outputs": [],
|
| 856 |
"source": [
|
|
@@ -862,7 +862,7 @@
|
|
| 862 |
},
|
| 863 |
{
|
| 864 |
"cell_type": "markdown",
|
| 865 |
-
"id": "
|
| 866 |
"metadata": {},
|
| 867 |
"source": [
|
| 868 |
"We apply our filter function to all samples of our training dataset through 🤗 Datasets' `.filter` method:"
|
|
@@ -871,7 +871,7 @@
|
|
| 871 |
{
|
| 872 |
"cell_type": "code",
|
| 873 |
"execution_count": 14,
|
| 874 |
-
"id": "
|
| 875 |
"metadata": {},
|
| 876 |
"outputs": [
|
| 877 |
{
|
|
@@ -891,7 +891,7 @@
|
|
| 891 |
},
|
| 892 |
{
|
| 893 |
"cell_type": "markdown",
|
| 894 |
-
"id": "
|
| 895 |
"metadata": {
|
| 896 |
"id": "263a5a58-0239-4a25-b0df-c625fc9c5810"
|
| 897 |
},
|
|
@@ -901,7 +901,7 @@
|
|
| 901 |
},
|
| 902 |
{
|
| 903 |
"cell_type": "markdown",
|
| 904 |
-
"id": "
|
| 905 |
"metadata": {
|
| 906 |
"id": "a693e768-c5a6-453f-89a1-b601dcf7daf7"
|
| 907 |
},
|
|
@@ -924,7 +924,7 @@
|
|
| 924 |
},
|
| 925 |
{
|
| 926 |
"cell_type": "markdown",
|
| 927 |
-
"id": "
|
| 928 |
"metadata": {
|
| 929 |
"id": "8d230e6d-624c-400a-bbf5-fa660881df25"
|
| 930 |
},
|
|
@@ -934,7 +934,7 @@
|
|
| 934 |
},
|
| 935 |
{
|
| 936 |
"cell_type": "markdown",
|
| 937 |
-
"id": "
|
| 938 |
"metadata": {
|
| 939 |
"id": "04def221-0637-4a69-b242-d3f0c1d0ee78"
|
| 940 |
},
|
|
@@ -960,7 +960,7 @@
|
|
| 960 |
{
|
| 961 |
"cell_type": "code",
|
| 962 |
"execution_count": 15,
|
| 963 |
-
"id": "
|
| 964 |
"metadata": {
|
| 965 |
"id": "8326221e-ec13-4731-bb4e-51e5fc1486c5"
|
| 966 |
},
|
|
@@ -1001,7 +1001,7 @@
|
|
| 1001 |
},
|
| 1002 |
{
|
| 1003 |
"cell_type": "markdown",
|
| 1004 |
-
"id": "
|
| 1005 |
"metadata": {
|
| 1006 |
"id": "3cae7dbf-8a50-456e-a3a8-7fd005390f86"
|
| 1007 |
},
|
|
@@ -1012,7 +1012,7 @@
|
|
| 1012 |
{
|
| 1013 |
"cell_type": "code",
|
| 1014 |
"execution_count": 16,
|
| 1015 |
-
"id": "
|
| 1016 |
"metadata": {
|
| 1017 |
"id": "fc834702-c0d3-4a96-b101-7b87be32bf42"
|
| 1018 |
},
|
|
@@ -1023,7 +1023,7 @@
|
|
| 1023 |
},
|
| 1024 |
{
|
| 1025 |
"cell_type": "markdown",
|
| 1026 |
-
"id": "
|
| 1027 |
"metadata": {
|
| 1028 |
"id": "d62bb2ab-750a-45e7-82e9-61d6f4805698"
|
| 1029 |
},
|
|
@@ -1033,7 +1033,7 @@
|
|
| 1033 |
},
|
| 1034 |
{
|
| 1035 |
"cell_type": "markdown",
|
| 1036 |
-
"id": "
|
| 1037 |
"metadata": {
|
| 1038 |
"id": "66fee1a7-a44c-461e-b047-c3917221572e"
|
| 1039 |
},
|
|
@@ -1045,7 +1045,7 @@
|
|
| 1045 |
{
|
| 1046 |
"cell_type": "code",
|
| 1047 |
"execution_count": 17,
|
| 1048 |
-
"id": "
|
| 1049 |
"metadata": {
|
| 1050 |
"id": "b22b4011-f31f-4b57-b684-c52332f92890"
|
| 1051 |
},
|
|
@@ -1058,7 +1058,7 @@
|
|
| 1058 |
},
|
| 1059 |
{
|
| 1060 |
"cell_type": "markdown",
|
| 1061 |
-
"id": "
|
| 1062 |
"metadata": {
|
| 1063 |
"id": "4f32cab6-31f0-4cb9-af4c-40ba0f5fc508"
|
| 1064 |
},
|
|
@@ -1078,7 +1078,7 @@
|
|
| 1078 |
{
|
| 1079 |
"cell_type": "code",
|
| 1080 |
"execution_count": 18,
|
| 1081 |
-
"id": "
|
| 1082 |
"metadata": {
|
| 1083 |
"id": "23959a70-22d0-4ffe-9fa1-72b61e75bb52"
|
| 1084 |
},
|
|
@@ -1109,7 +1109,7 @@
|
|
| 1109 |
},
|
| 1110 |
{
|
| 1111 |
"cell_type": "markdown",
|
| 1112 |
-
"id": "
|
| 1113 |
"metadata": {
|
| 1114 |
"id": "daf2a825-6d9f-4a23-b145-c37c0039075b"
|
| 1115 |
},
|
|
@@ -1119,7 +1119,7 @@
|
|
| 1119 |
},
|
| 1120 |
{
|
| 1121 |
"cell_type": "markdown",
|
| 1122 |
-
"id": "
|
| 1123 |
"metadata": {
|
| 1124 |
"id": "437a97fa-4864-476b-8abc-f28b8166cfa5"
|
| 1125 |
},
|
|
@@ -1131,7 +1131,7 @@
|
|
| 1131 |
{
|
| 1132 |
"cell_type": "code",
|
| 1133 |
"execution_count": 19,
|
| 1134 |
-
"id": "
|
| 1135 |
"metadata": {
|
| 1136 |
"id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f"
|
| 1137 |
},
|
|
@@ -1144,7 +1144,7 @@
|
|
| 1144 |
},
|
| 1145 |
{
|
| 1146 |
"cell_type": "markdown",
|
| 1147 |
-
"id": "
|
| 1148 |
"metadata": {
|
| 1149 |
"id": "a15ead5f-2277-4a39-937b-585c2497b2df"
|
| 1150 |
},
|
|
@@ -1155,7 +1155,7 @@
|
|
| 1155 |
{
|
| 1156 |
"cell_type": "code",
|
| 1157 |
"execution_count": 20,
|
| 1158 |
-
"id": "
|
| 1159 |
"metadata": {
|
| 1160 |
"id": "62038ba3-88ed-4fce-84db-338f50dcd04f"
|
| 1161 |
},
|
|
@@ -1168,7 +1168,7 @@
|
|
| 1168 |
},
|
| 1169 |
{
|
| 1170 |
"cell_type": "markdown",
|
| 1171 |
-
"id": "
|
| 1172 |
"metadata": {
|
| 1173 |
"id": "2178dea4-80ca-47b6-b6ea-ba1915c90c06"
|
| 1174 |
},
|
|
@@ -1178,7 +1178,7 @@
|
|
| 1178 |
},
|
| 1179 |
{
|
| 1180 |
"cell_type": "markdown",
|
| 1181 |
-
"id": "
|
| 1182 |
"metadata": {
|
| 1183 |
"id": "c21af1e9-0188-4134-ac82-defc7bdcc436"
|
| 1184 |
},
|
|
@@ -1189,7 +1189,7 @@
|
|
| 1189 |
{
|
| 1190 |
"cell_type": "code",
|
| 1191 |
"execution_count": 21,
|
| 1192 |
-
"id": "
|
| 1193 |
"metadata": {
|
| 1194 |
"id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a"
|
| 1195 |
},
|
|
@@ -1223,7 +1223,7 @@
|
|
| 1223 |
},
|
| 1224 |
{
|
| 1225 |
"cell_type": "markdown",
|
| 1226 |
-
"id": "
|
| 1227 |
"metadata": {
|
| 1228 |
"id": "b3a944d8-3112-4552-82a0-be25988b3857"
|
| 1229 |
},
|
|
@@ -1234,7 +1234,7 @@
|
|
| 1234 |
},
|
| 1235 |
{
|
| 1236 |
"cell_type": "markdown",
|
| 1237 |
-
"id": "
|
| 1238 |
"metadata": {
|
| 1239 |
"id": "bac29114-d226-4f54-97cf-8718c9f94e1e"
|
| 1240 |
},
|
|
@@ -1246,7 +1246,7 @@
|
|
| 1246 |
{
|
| 1247 |
"cell_type": "code",
|
| 1248 |
"execution_count": 22,
|
| 1249 |
-
"id": "
|
| 1250 |
"metadata": {
|
| 1251 |
"id": "d546d7fe-0543-479a-b708-2ebabec19493"
|
| 1252 |
},
|
|
@@ -1277,7 +1277,7 @@
|
|
| 1277 |
},
|
| 1278 |
{
|
| 1279 |
"cell_type": "markdown",
|
| 1280 |
-
"id": "
|
| 1281 |
"metadata": {
|
| 1282 |
"id": "uOrRhDGtN5S4"
|
| 1283 |
},
|
|
@@ -1288,7 +1288,7 @@
|
|
| 1288 |
{
|
| 1289 |
"cell_type": "code",
|
| 1290 |
"execution_count": 23,
|
| 1291 |
-
"id": "
|
| 1292 |
"metadata": {
|
| 1293 |
"id": "-2zQwMfEOBJq"
|
| 1294 |
},
|
|
@@ -1310,7 +1310,7 @@
|
|
| 1310 |
},
|
| 1311 |
{
|
| 1312 |
"cell_type": "markdown",
|
| 1313 |
-
"id": "
|
| 1314 |
"metadata": {
|
| 1315 |
"id": "7f404cf9-4345-468c-8196-4bd101d9bd51"
|
| 1316 |
},
|
|
@@ -1320,7 +1320,7 @@
|
|
| 1320 |
},
|
| 1321 |
{
|
| 1322 |
"cell_type": "markdown",
|
| 1323 |
-
"id": "
|
| 1324 |
"metadata": {
|
| 1325 |
"id": "5e8b8d56-5a70-4f68-bd2e-f0752d0bd112"
|
| 1326 |
},
|
|
@@ -1337,7 +1337,7 @@
|
|
| 1337 |
{
|
| 1338 |
"cell_type": "code",
|
| 1339 |
"execution_count": null,
|
| 1340 |
-
"id": "
|
| 1341 |
"metadata": {
|
| 1342 |
"id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de"
|
| 1343 |
},
|
|
@@ -1365,8 +1365,8 @@
|
|
| 1365 |
"\n",
|
| 1366 |
" <div>\n",
|
| 1367 |
" \n",
|
| 1368 |
-
" <progress value='
|
| 1369 |
-
" [
|
| 1370 |
" </div>\n",
|
| 1371 |
" <table border=\"1\" class=\"dataframe\">\n",
|
| 1372 |
" <thead>\n",
|
|
@@ -1415,7 +1415,13 @@
|
|
| 1415 |
" <td>18.134172</td>\n",
|
| 1416 |
" </tr>\n",
|
| 1417 |
" </tbody>\n",
|
| 1418 |
-
"</table><p>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1419 |
],
|
| 1420 |
"text/plain": [
|
| 1421 |
"<IPython.core.display.HTML object>"
|
|
@@ -1466,7 +1472,11 @@
|
|
| 1466 |
"Configuration saved in ./checkpoint-300/config.json\n",
|
| 1467 |
"Model weights saved in ./checkpoint-300/pytorch_model.bin\n",
|
| 1468 |
"Feature extractor saved in ./checkpoint-300/preprocessor_config.json\n",
|
| 1469 |
-
"Feature extractor saved in ./preprocessor_config.json\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1470 |
]
|
| 1471 |
}
|
| 1472 |
],
|
|
@@ -1477,7 +1487,7 @@
|
|
| 1477 |
{
|
| 1478 |
"cell_type": "code",
|
| 1479 |
"execution_count": null,
|
| 1480 |
-
"id": "
|
| 1481 |
"metadata": {
|
| 1482 |
"id": "c704f91e-241b-48c9-b8e0-f0da396a9663"
|
| 1483 |
},
|
|
@@ -1495,7 +1505,7 @@
|
|
| 1495 |
},
|
| 1496 |
{
|
| 1497 |
"cell_type": "markdown",
|
| 1498 |
-
"id": "
|
| 1499 |
"metadata": {
|
| 1500 |
"id": "090d676a-f944-4297-a938-a40eda0b2b68"
|
| 1501 |
},
|
|
@@ -1506,7 +1516,7 @@
|
|
| 1506 |
{
|
| 1507 |
"cell_type": "code",
|
| 1508 |
"execution_count": null,
|
| 1509 |
-
"id": "
|
| 1510 |
"metadata": {
|
| 1511 |
"id": "d7030622-caf7-4039-939b-6195cdaa2585"
|
| 1512 |
},
|
|
@@ -1517,7 +1527,7 @@
|
|
| 1517 |
},
|
| 1518 |
{
|
| 1519 |
"cell_type": "markdown",
|
| 1520 |
-
"id": "
|
| 1521 |
"metadata": {
|
| 1522 |
"id": "ca743fbd-602c-48d4-ba8d-a2fe60af64ba"
|
| 1523 |
},
|
|
@@ -1527,7 +1537,7 @@
|
|
| 1527 |
},
|
| 1528 |
{
|
| 1529 |
"cell_type": "markdown",
|
| 1530 |
-
"id": "
|
| 1531 |
"metadata": {
|
| 1532 |
"id": "7f737783-2870-4e35-aa11-86a42d7d997a"
|
| 1533 |
},
|
|
|
|
| 2 |
"cells": [
|
| 3 |
{
|
| 4 |
"cell_type": "markdown",
|
| 5 |
+
"id": "153b977f",
|
| 6 |
"metadata": {
|
| 7 |
"id": "75b58048-7d14-4fc6-8085-1fc08c81b4a6"
|
| 8 |
},
|
|
|
|
| 12 |
},
|
| 13 |
{
|
| 14 |
"cell_type": "markdown",
|
| 15 |
+
"id": "37b4b092",
|
| 16 |
"metadata": {
|
| 17 |
"id": "fbfa8ad5-4cdc-4512-9058-836cbbf65e1a"
|
| 18 |
},
|
|
|
|
| 25 |
},
|
| 26 |
{
|
| 27 |
"cell_type": "markdown",
|
| 28 |
+
"id": "3dc16bdb",
|
| 29 |
"metadata": {
|
| 30 |
"id": "afe0d503-ae4e-4aa7-9af4-dbcba52db41e"
|
| 31 |
},
|
|
|
|
| 35 |
},
|
| 36 |
{
|
| 37 |
"cell_type": "markdown",
|
| 38 |
+
"id": "a0ed696e",
|
| 39 |
"metadata": {
|
| 40 |
"id": "9ae91ed4-9c3e-4ade-938e-f4c2dcfbfdc0"
|
| 41 |
},
|
|
|
|
| 67 |
},
|
| 68 |
{
|
| 69 |
"cell_type": "markdown",
|
| 70 |
+
"id": "f9040bc6",
|
| 71 |
"metadata": {
|
| 72 |
"id": "e59b91d6-be24-4b5e-bb38-4977ea143a72"
|
| 73 |
},
|
|
|
|
| 86 |
},
|
| 87 |
{
|
| 88 |
"cell_type": "markdown",
|
| 89 |
+
"id": "9770f95f",
|
| 90 |
"metadata": {
|
| 91 |
"id": "21b6316e-8a55-4549-a154-66d3da2ab74a"
|
| 92 |
},
|
|
|
|
| 115 |
},
|
| 116 |
{
|
| 117 |
"cell_type": "markdown",
|
| 118 |
+
"id": "47ccc69a",
|
| 119 |
"metadata": {
|
| 120 |
"id": "3a680dfc-cbba-4f6c-8a1f-e1a5ff3f123a"
|
| 121 |
},
|
|
|
|
| 127 |
},
|
| 128 |
{
|
| 129 |
"cell_type": "markdown",
|
| 130 |
+
"id": "20eb5cf7",
|
| 131 |
"metadata": {
|
| 132 |
"id": "b219c9dd-39b6-4a95-b2a1-3f547a1e7bc0"
|
| 133 |
},
|
|
|
|
| 137 |
},
|
| 138 |
{
|
| 139 |
"cell_type": "markdown",
|
| 140 |
+
"id": "b7ef7008",
|
| 141 |
"metadata": {
|
| 142 |
"id": "674429c5-0ab4-4adf-975b-621bb69eca38"
|
| 143 |
},
|
|
|
|
| 155 |
{
|
| 156 |
"cell_type": "code",
|
| 157 |
"execution_count": 1,
|
| 158 |
+
"id": "e4beab8c",
|
| 159 |
"metadata": {
|
| 160 |
"id": "a2787582-554f-44ce-9f38-4180a5ed6b44"
|
| 161 |
},
|
|
|
|
| 171 |
{
|
| 172 |
"cell_type": "code",
|
| 173 |
"execution_count": 2,
|
| 174 |
+
"id": "a89ec3cc",
|
| 175 |
"metadata": {},
|
| 176 |
"outputs": [
|
| 177 |
{
|
|
|
|
| 212 |
},
|
| 213 |
{
|
| 214 |
"cell_type": "markdown",
|
| 215 |
+
"id": "6db3c688",
|
| 216 |
"metadata": {
|
| 217 |
"id": "d5c7c3d6-7197-41e7-a088-49b753c1681f"
|
| 218 |
},
|
|
|
|
| 227 |
{
|
| 228 |
"cell_type": "code",
|
| 229 |
"execution_count": 3,
|
| 230 |
+
"id": "4b53aecf",
|
| 231 |
"metadata": {
|
| 232 |
"id": "20ba635d-518c-47ac-97ee-3cad25f1e0ce"
|
| 233 |
},
|
|
|
|
| 257 |
},
|
| 258 |
{
|
| 259 |
"cell_type": "markdown",
|
| 260 |
+
"id": "88a8bfef",
|
| 261 |
"metadata": {
|
| 262 |
"id": "2d63b2d2-f68a-4d74-b7f1-5127f6d16605"
|
| 263 |
},
|
|
|
|
| 267 |
},
|
| 268 |
{
|
| 269 |
"cell_type": "markdown",
|
| 270 |
+
"id": "30f77c0c",
|
| 271 |
"metadata": {
|
| 272 |
"id": "601c3099-1026-439e-93e2-5635b3ba5a73"
|
| 273 |
},
|
|
|
|
| 287 |
},
|
| 288 |
{
|
| 289 |
"cell_type": "markdown",
|
| 290 |
+
"id": "ca870a25",
|
| 291 |
"metadata": {
|
| 292 |
"id": "560332eb-3558-41a1-b500-e83a9f695f84"
|
| 293 |
},
|
|
|
|
| 297 |
},
|
| 298 |
{
|
| 299 |
"cell_type": "markdown",
|
| 300 |
+
"id": "a1757ecc",
|
| 301 |
"metadata": {
|
| 302 |
"id": "32ec8068-0bd7-412d-b662-0edb9d1e7365"
|
| 303 |
},
|
|
|
|
| 309 |
},
|
| 310 |
{
|
| 311 |
"cell_type": "markdown",
|
| 312 |
+
"id": "aad089dc",
|
| 313 |
"metadata": {
|
| 314 |
"id": "589d9ec1-d12b-4b64-93f7-04c63997da19"
|
| 315 |
},
|
|
|
|
| 324 |
},
|
| 325 |
{
|
| 326 |
"cell_type": "markdown",
|
| 327 |
+
"id": "b42ff657",
|
| 328 |
"metadata": {
|
| 329 |
"id": "b2ef54d5-b946-4c1d-9fdc-adc5d01b46aa"
|
| 330 |
},
|
|
|
|
| 335 |
{
|
| 336 |
"cell_type": "code",
|
| 337 |
"execution_count": 4,
|
| 338 |
+
"id": "47a93901",
|
| 339 |
"metadata": {
|
| 340 |
"id": "bc77d7bb-f9e2-47f5-b663-30f7a4321ce5"
|
| 341 |
},
|
|
|
|
| 348 |
},
|
| 349 |
{
|
| 350 |
"cell_type": "markdown",
|
| 351 |
+
"id": "22723907",
|
| 352 |
"metadata": {
|
| 353 |
"id": "93748af7-b917-4ecf-a0c8-7d89077ff9cb"
|
| 354 |
},
|
|
|
|
| 358 |
},
|
| 359 |
{
|
| 360 |
"cell_type": "markdown",
|
| 361 |
+
"id": "4c94068f",
|
| 362 |
"metadata": {
|
| 363 |
"id": "2bc82609-a9fb-447a-a2af-99597c864029"
|
| 364 |
},
|
|
|
|
| 372 |
{
|
| 373 |
"cell_type": "code",
|
| 374 |
"execution_count": 5,
|
| 375 |
+
"id": "c5ed14b7",
|
| 376 |
"metadata": {
|
| 377 |
"id": "c7b07f9b-ae0e-4f89-98f0-0c50d432eab6",
|
| 378 |
"outputId": "5c004b44-86e7-4e00-88be-39e0af5eed69"
|
|
|
|
| 386 |
},
|
| 387 |
{
|
| 388 |
"cell_type": "markdown",
|
| 389 |
+
"id": "fe3ec4d1",
|
| 390 |
"metadata": {
|
| 391 |
"id": "d2ef23f3-f4a8-483a-a2dc-080a7496cb1b"
|
| 392 |
},
|
|
|
|
| 396 |
},
|
| 397 |
{
|
| 398 |
"cell_type": "markdown",
|
| 399 |
+
"id": "06729f05",
|
| 400 |
"metadata": {
|
| 401 |
"id": "5ff67654-5a29-4bb8-a69d-0228946c6f8d"
|
| 402 |
},
|
|
|
|
| 412 |
{
|
| 413 |
"cell_type": "code",
|
| 414 |
"execution_count": 6,
|
| 415 |
+
"id": "e6d869dd",
|
| 416 |
"metadata": {
|
| 417 |
"id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6"
|
| 418 |
},
|
|
|
|
| 425 |
},
|
| 426 |
{
|
| 427 |
"cell_type": "markdown",
|
| 428 |
+
"id": "c8a0f5f5",
|
| 429 |
"metadata": {
|
| 430 |
"id": "381acd09-0b0f-4d04-9eb3-f028ac0e5f2c"
|
| 431 |
},
|
|
|
|
| 435 |
},
|
| 436 |
{
|
| 437 |
"cell_type": "markdown",
|
| 438 |
+
"id": "eb000f50",
|
| 439 |
"metadata": {
|
| 440 |
"id": "9649bf01-2e8a-45e5-8fca-441c13637b8f"
|
| 441 |
},
|
|
|
|
| 447 |
{
|
| 448 |
"cell_type": "code",
|
| 449 |
"execution_count": 7,
|
| 450 |
+
"id": "8de557eb",
|
| 451 |
"metadata": {
|
| 452 |
"id": "6e6b0ec5-0c94-4e2c-ae24-c791be1b2255"
|
| 453 |
},
|
|
|
|
| 467 |
},
|
| 468 |
{
|
| 469 |
"cell_type": "markdown",
|
| 470 |
+
"id": "add2d161",
|
| 471 |
"metadata": {
|
| 472 |
"id": "5a679f05-063d-41b3-9b58-4fc9c6ccf4fd"
|
| 473 |
},
|
|
|
|
| 486 |
{
|
| 487 |
"cell_type": "code",
|
| 488 |
"execution_count": 8,
|
| 489 |
+
"id": "67b39dba",
|
| 490 |
"metadata": {
|
| 491 |
"id": "f12e2e57-156f-417b-8cfb-69221cc198e8"
|
| 492 |
},
|
|
|
|
| 499 |
},
|
| 500 |
{
|
| 501 |
"cell_type": "markdown",
|
| 502 |
+
"id": "82db95dd",
|
| 503 |
"metadata": {
|
| 504 |
"id": "00382a3e-abec-4cdd-a54c-d1aaa3ea4707"
|
| 505 |
},
|
|
|
|
| 511 |
{
|
| 512 |
"cell_type": "code",
|
| 513 |
"execution_count": 9,
|
| 514 |
+
"id": "2f58e84d",
|
| 515 |
"metadata": {
|
| 516 |
"id": "87122d71-289a-466a-afcf-fa354b18946b"
|
| 517 |
},
|
|
|
|
| 531 |
},
|
| 532 |
{
|
| 533 |
"cell_type": "markdown",
|
| 534 |
+
"id": "a0a36ce6",
|
| 535 |
"metadata": {},
|
| 536 |
"source": [
|
| 537 |
"We'll define our pre-processing strategy. We advise that you **do not** lower-case the transcriptions or remove punctuation unless mixing different datasets. This will enable you to fine-tune Whisper models that can predict punctuation and casing. Later, you will see how we can evaluate the predictions without punctuation or casing, so that the models benefit from the WER improvement obtained by normalising the transcriptions while still predicting fully formatted transcriptions."
|
|
|
|
| 540 |
{
|
| 541 |
"cell_type": "code",
|
| 542 |
"execution_count": 10,
|
| 543 |
+
"id": "c580eca9",
|
| 544 |
"metadata": {},
|
| 545 |
"outputs": [],
|
| 546 |
"source": [
|
|
|
|
| 554 |
},
|
| 555 |
{
|
| 556 |
"cell_type": "markdown",
|
| 557 |
+
"id": "75220c6e",
|
| 558 |
"metadata": {},
|
| 559 |
"source": [
|
| 560 |
"Now we can write a function to prepare our data ready for the model:\n",
|
|
|
|
| 567 |
{
|
| 568 |
"cell_type": "code",
|
| 569 |
"execution_count": 11,
|
| 570 |
+
"id": "2b19b3b4",
|
| 571 |
"metadata": {},
|
| 572 |
"outputs": [],
|
| 573 |
"source": [
|
|
|
|
| 594 |
},
|
| 595 |
{
|
| 596 |
"cell_type": "markdown",
|
| 597 |
+
"id": "b780de3a",
|
| 598 |
"metadata": {
|
| 599 |
"id": "70b319fb-2439-4ef6-a70d-a47bf41c4a13"
|
| 600 |
},
|
|
|
|
| 605 |
{
|
| 606 |
"cell_type": "code",
|
| 607 |
"execution_count": 12,
|
| 608 |
+
"id": "33df4a27",
|
| 609 |
"metadata": {
|
| 610 |
"id": "7b73ab39-ffaf-4b9e-86e5-782963c6134b"
|
| 611 |
},
|
|
|
|
| 841 |
},
|
| 842 |
{
|
| 843 |
"cell_type": "markdown",
|
| 844 |
+
"id": "28280b1e",
|
| 845 |
"metadata": {},
|
| 846 |
"source": [
|
| 847 |
"Finally, we filter any training data with audio samples longer than 30s. These samples would otherwise be truncated by the Whisper feature-extractor which could affect the stability of training. We define a function that returns `True` for samples that are less than 30s, and `False` for those that are longer:"
|
|
|
|
| 850 |
{
|
| 851 |
"cell_type": "code",
|
| 852 |
"execution_count": 13,
|
| 853 |
+
"id": "74917d17",
|
| 854 |
"metadata": {},
|
| 855 |
"outputs": [],
|
| 856 |
"source": [
|
|
|
|
| 862 |
},
|
| 863 |
{
|
| 864 |
"cell_type": "markdown",
|
| 865 |
+
"id": "ad72b009",
|
| 866 |
"metadata": {},
|
| 867 |
"source": [
|
| 868 |
"We apply our filter function to all samples of our training dataset through 🤗 Datasets' `.filter` method:"
|
|
|
|
| 871 |
{
|
| 872 |
"cell_type": "code",
|
| 873 |
"execution_count": 14,
|
| 874 |
+
"id": "ff62b10c",
|
| 875 |
"metadata": {},
|
| 876 |
"outputs": [
|
| 877 |
{
|
|
|
|
| 891 |
},
|
| 892 |
{
|
| 893 |
"cell_type": "markdown",
|
| 894 |
+
"id": "63f649b4",
|
| 895 |
"metadata": {
|
| 896 |
"id": "263a5a58-0239-4a25-b0df-c625fc9c5810"
|
| 897 |
},
|
|
|
|
| 901 |
},
|
| 902 |
{
|
| 903 |
"cell_type": "markdown",
|
| 904 |
+
"id": "f6da5590",
|
| 905 |
"metadata": {
|
| 906 |
"id": "a693e768-c5a6-453f-89a1-b601dcf7daf7"
|
| 907 |
},
|
|
|
|
| 924 |
},
|
| 925 |
{
|
| 926 |
"cell_type": "markdown",
|
| 927 |
+
"id": "7e49d443",
|
| 928 |
"metadata": {
|
| 929 |
"id": "8d230e6d-624c-400a-bbf5-fa660881df25"
|
| 930 |
},
|
|
|
|
| 934 |
},
|
| 935 |
{
|
| 936 |
"cell_type": "markdown",
|
| 937 |
+
"id": "a7af1a62",
|
| 938 |
"metadata": {
|
| 939 |
"id": "04def221-0637-4a69-b242-d3f0c1d0ee78"
|
| 940 |
},
|
|
|
|
| 960 |
{
|
| 961 |
"cell_type": "code",
|
| 962 |
"execution_count": 15,
|
| 963 |
+
"id": "6742bdde",
|
| 964 |
"metadata": {
|
| 965 |
"id": "8326221e-ec13-4731-bb4e-51e5fc1486c5"
|
| 966 |
},
|
|
|
|
| 1001 |
},
|
| 1002 |
{
|
| 1003 |
"cell_type": "markdown",
|
| 1004 |
+
"id": "6a5bfad2",
|
| 1005 |
"metadata": {
|
| 1006 |
"id": "3cae7dbf-8a50-456e-a3a8-7fd005390f86"
|
| 1007 |
},
|
|
|
|
| 1012 |
{
|
| 1013 |
"cell_type": "code",
|
| 1014 |
"execution_count": 16,
|
| 1015 |
+
"id": "d349c47c",
|
| 1016 |
"metadata": {
|
| 1017 |
"id": "fc834702-c0d3-4a96-b101-7b87be32bf42"
|
| 1018 |
},
|
|
|
|
| 1023 |
},
|
| 1024 |
{
|
| 1025 |
"cell_type": "markdown",
|
| 1026 |
+
"id": "2fb9bd9f",
|
| 1027 |
"metadata": {
|
| 1028 |
"id": "d62bb2ab-750a-45e7-82e9-61d6f4805698"
|
| 1029 |
},
|
|
|
|
| 1033 |
},
|
| 1034 |
{
|
| 1035 |
"cell_type": "markdown",
|
| 1036 |
+
"id": "eb620cc8",
|
| 1037 |
"metadata": {
|
| 1038 |
"id": "66fee1a7-a44c-461e-b047-c3917221572e"
|
| 1039 |
},
|
|
|
|
| 1045 |
{
|
| 1046 |
"cell_type": "code",
|
| 1047 |
"execution_count": 17,
|
| 1048 |
+
"id": "a0cc7e31",
|
| 1049 |
"metadata": {
|
| 1050 |
"id": "b22b4011-f31f-4b57-b684-c52332f92890"
|
| 1051 |
},
|
|
|
|
| 1058 |
},
|
| 1059 |
{
|
| 1060 |
"cell_type": "markdown",
|
| 1061 |
+
"id": "2b750f0c",
|
| 1062 |
"metadata": {
|
| 1063 |
"id": "4f32cab6-31f0-4cb9-af4c-40ba0f5fc508"
|
| 1064 |
},
|
|
|
|
| 1078 |
{
|
| 1079 |
"cell_type": "code",
|
| 1080 |
"execution_count": 18,
|
| 1081 |
+
"id": "cf3da153",
|
| 1082 |
"metadata": {
|
| 1083 |
"id": "23959a70-22d0-4ffe-9fa1-72b61e75bb52"
|
| 1084 |
},
|
|
|
|
| 1109 |
},
|
| 1110 |
{
|
| 1111 |
"cell_type": "markdown",
|
| 1112 |
+
"id": "3c40ea30",
|
| 1113 |
"metadata": {
|
| 1114 |
"id": "daf2a825-6d9f-4a23-b145-c37c0039075b"
|
| 1115 |
},
|
|
|
|
| 1119 |
},
|
| 1120 |
{
|
| 1121 |
"cell_type": "markdown",
|
| 1122 |
+
"id": "0763fedb",
|
| 1123 |
"metadata": {
|
| 1124 |
"id": "437a97fa-4864-476b-8abc-f28b8166cfa5"
|
| 1125 |
},
|
|
|
|
| 1131 |
{
|
| 1132 |
"cell_type": "code",
|
| 1133 |
"execution_count": 19,
|
| 1134 |
+
"id": "8b6d0fcc",
|
| 1135 |
"metadata": {
|
| 1136 |
"id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f"
|
| 1137 |
},
|
|
|
|
| 1144 |
},
|
| 1145 |
{
|
| 1146 |
"cell_type": "markdown",
|
| 1147 |
+
"id": "516a3a07",
|
| 1148 |
"metadata": {
|
| 1149 |
"id": "a15ead5f-2277-4a39-937b-585c2497b2df"
|
| 1150 |
},
|
|
|
|
| 1155 |
{
|
| 1156 |
"cell_type": "code",
|
| 1157 |
"execution_count": 20,
|
| 1158 |
+
"id": "dad8cf04",
|
| 1159 |
"metadata": {
|
| 1160 |
"id": "62038ba3-88ed-4fce-84db-338f50dcd04f"
|
| 1161 |
},
|
|
|
|
| 1168 |
},
|
| 1169 |
{
|
| 1170 |
"cell_type": "markdown",
|
| 1171 |
+
"id": "4e4ad9cb",
|
| 1172 |
"metadata": {
|
| 1173 |
"id": "2178dea4-80ca-47b6-b6ea-ba1915c90c06"
|
| 1174 |
},
|
|
|
|
| 1178 |
},
|
| 1179 |
{
|
| 1180 |
"cell_type": "markdown",
|
| 1181 |
+
"id": "af61b4bd",
|
| 1182 |
"metadata": {
|
| 1183 |
"id": "c21af1e9-0188-4134-ac82-defc7bdcc436"
|
| 1184 |
},
|
|
|
|
| 1189 |
{
|
| 1190 |
"cell_type": "code",
|
| 1191 |
"execution_count": 21,
|
| 1192 |
+
"id": "a9e18890",
|
| 1193 |
"metadata": {
|
| 1194 |
"id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a"
|
| 1195 |
},
|
|
|
|
| 1223 |
},
|
| 1224 |
{
|
| 1225 |
"cell_type": "markdown",
|
| 1226 |
+
"id": "0a7aacb7",
|
| 1227 |
"metadata": {
|
| 1228 |
"id": "b3a944d8-3112-4552-82a0-be25988b3857"
|
| 1229 |
},
|
|
|
|
| 1234 |
},
|
| 1235 |
{
|
| 1236 |
"cell_type": "markdown",
|
| 1237 |
+
"id": "038105a5",
|
| 1238 |
"metadata": {
|
| 1239 |
"id": "bac29114-d226-4f54-97cf-8718c9f94e1e"
|
| 1240 |
},
|
|
|
|
| 1246 |
{
|
| 1247 |
"cell_type": "code",
|
| 1248 |
"execution_count": 22,
|
| 1249 |
+
"id": "6df38176",
|
| 1250 |
"metadata": {
|
| 1251 |
"id": "d546d7fe-0543-479a-b708-2ebabec19493"
|
| 1252 |
},
|
|
|
|
| 1277 |
},
|
| 1278 |
{
|
| 1279 |
"cell_type": "markdown",
|
| 1280 |
+
"id": "d6f06247",
|
| 1281 |
"metadata": {
|
| 1282 |
"id": "uOrRhDGtN5S4"
|
| 1283 |
},
|
|
|
|
| 1288 |
{
|
| 1289 |
"cell_type": "code",
|
| 1290 |
"execution_count": 23,
|
| 1291 |
+
"id": "3158c8e0",
|
| 1292 |
"metadata": {
|
| 1293 |
"id": "-2zQwMfEOBJq"
|
| 1294 |
},
|
|
|
|
| 1310 |
},
|
| 1311 |
{
|
| 1312 |
"cell_type": "markdown",
|
| 1313 |
+
"id": "20d51bc5",
|
| 1314 |
"metadata": {
|
| 1315 |
"id": "7f404cf9-4345-468c-8196-4bd101d9bd51"
|
| 1316 |
},
|
|
|
|
| 1320 |
},
|
| 1321 |
{
|
| 1322 |
"cell_type": "markdown",
|
| 1323 |
+
"id": "b1bac1cc",
|
| 1324 |
"metadata": {
|
| 1325 |
"id": "5e8b8d56-5a70-4f68-bd2e-f0752d0bd112"
|
| 1326 |
},
|
|
|
|
| 1337 |
{
|
| 1338 |
"cell_type": "code",
|
| 1339 |
"execution_count": null,
|
| 1340 |
+
"id": "449b7807",
|
| 1341 |
"metadata": {
|
| 1342 |
"id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de"
|
| 1343 |
},
|
|
|
|
| 1365 |
"\n",
|
| 1366 |
" <div>\n",
|
| 1367 |
" \n",
|
| 1368 |
+
" <progress value='351' max='500' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
| 1369 |
+
" [351/500 2:07:16 < 54:20, 0.05 it/s, Epoch 7.45/11]\n",
|
| 1370 |
" </div>\n",
|
| 1371 |
" <table border=\"1\" class=\"dataframe\">\n",
|
| 1372 |
" <thead>\n",
|
|
|
|
| 1415 |
" <td>18.134172</td>\n",
|
| 1416 |
" </tr>\n",
|
| 1417 |
" </tbody>\n",
|
| 1418 |
+
"</table><p>\n",
|
| 1419 |
+
" <div>\n",
|
| 1420 |
+
" \n",
|
| 1421 |
+
" <progress value='52' max='56' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
| 1422 |
+
" [52/56 01:34 < 00:07, 0.54 it/s]\n",
|
| 1423 |
+
" </div>\n",
|
| 1424 |
+
" "
|
| 1425 |
],
|
| 1426 |
"text/plain": [
|
| 1427 |
"<IPython.core.display.HTML object>"
|
|
|
|
| 1472 |
"Configuration saved in ./checkpoint-300/config.json\n",
|
| 1473 |
"Model weights saved in ./checkpoint-300/pytorch_model.bin\n",
|
| 1474 |
"Feature extractor saved in ./checkpoint-300/preprocessor_config.json\n",
|
| 1475 |
+
"Feature extractor saved in ./preprocessor_config.json\n",
|
| 1476 |
+
"The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`, you can safely ignore this message.\n",
|
| 1477 |
+
"***** Running Evaluation *****\n",
|
| 1478 |
+
" Num examples = 56\n",
|
| 1479 |
+
" Batch size = 1\n"
|
| 1480 |
]
|
| 1481 |
}
|
| 1482 |
],
|
|
|
|
| 1487 |
{
|
| 1488 |
"cell_type": "code",
|
| 1489 |
"execution_count": null,
|
| 1490 |
+
"id": "06dc5477",
|
| 1491 |
"metadata": {
|
| 1492 |
"id": "c704f91e-241b-48c9-b8e0-f0da396a9663"
|
| 1493 |
},
|
|
|
|
| 1505 |
},
|
| 1506 |
{
|
| 1507 |
"cell_type": "markdown",
|
| 1508 |
+
"id": "2385c318",
|
| 1509 |
"metadata": {
|
| 1510 |
"id": "090d676a-f944-4297-a938-a40eda0b2b68"
|
| 1511 |
},
|
|
|
|
| 1516 |
{
|
| 1517 |
"cell_type": "code",
|
| 1518 |
"execution_count": null,
|
| 1519 |
+
"id": "79cdaf1a",
|
| 1520 |
"metadata": {
|
| 1521 |
"id": "d7030622-caf7-4039-939b-6195cdaa2585"
|
| 1522 |
},
|
|
|
|
| 1527 |
},
|
| 1528 |
{
|
| 1529 |
"cell_type": "markdown",
|
| 1530 |
+
"id": "fc5763f7",
|
| 1531 |
"metadata": {
|
| 1532 |
"id": "ca743fbd-602c-48d4-ba8d-a2fe60af64ba"
|
| 1533 |
},
|
|
|
|
| 1537 |
},
|
| 1538 |
{
|
| 1539 |
"cell_type": "markdown",
|
| 1540 |
+
"id": "78fc1970",
|
| 1541 |
"metadata": {
|
| 1542 |
"id": "7f737783-2870-4e35-aa11-86a42d7d997a"
|
| 1543 |
},
|
pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 3055754841
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d578355b6f15b04766cc526e98fb03f0c91df1e63339b2781585fd3c767247fe
|
| 3 |
size 3055754841
|
runs/Dec10_19-43-08_28bc7e304ae9/events.out.tfevents.1670701396.28bc7e304ae9.29234.0
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4ea206346175b73b7dea8dc1b9f371b42fe52eedd5e5ef7a22c21736207a756
|
| 3 |
+
size 9312
|