Instructions to use FierceLLM/CoreGPT-small with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use FierceLLM/CoreGPT-small with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")
model = PeftModel.from_pretrained(base_model, "FierceLLM/CoreGPT-small")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 1860, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.016148566814695196, | |
| "grad_norm": 0.5020289421081543, | |
| "learning_rate": 8.999999999999999e-05, | |
| "loss": 2.439466857910156, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03229713362939039, | |
| "grad_norm": 0.6233699321746826, | |
| "learning_rate": 0.00019, | |
| "loss": 2.4973506927490234, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.04844570044408559, | |
| "grad_norm": 0.6460151076316833, | |
| "learning_rate": 0.00029, | |
| "loss": 2.579629898071289, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.06459426725878079, | |
| "grad_norm": 0.5584802031517029, | |
| "learning_rate": 0.00039000000000000005, | |
| "loss": 2.5269956588745117, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08074283407347597, | |
| "grad_norm": 0.5267783403396606, | |
| "learning_rate": 0.00049, | |
| "loss": 2.5723337173461913, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.09689140088817118, | |
| "grad_norm": 0.492374062538147, | |
| "learning_rate": 0.00059, | |
| "loss": 2.6084869384765623, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.11303996770286637, | |
| "grad_norm": 0.533662736415863, | |
| "learning_rate": 0.00069, | |
| "loss": 2.527654838562012, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.12918853451756157, | |
| "grad_norm": 0.568081796169281, | |
| "learning_rate": 0.00079, | |
| "loss": 2.584039497375488, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.14533710133225677, | |
| "grad_norm": 0.5420770049095154, | |
| "learning_rate": 0.0008900000000000001, | |
| "loss": 2.5998212814331056, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.16148566814695195, | |
| "grad_norm": 0.5972040295600891, | |
| "learning_rate": 0.00099, | |
| "loss": 2.61820011138916, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.17763423496164715, | |
| "grad_norm": 0.622533917427063, | |
| "learning_rate": 0.0009999354806331361, | |
| "loss": 2.5819944381713866, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.19378280177634236, | |
| "grad_norm": 0.6911935210227966, | |
| "learning_rate": 0.0009997124721002689, | |
| "loss": 2.5768102645874023, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.20993136859103753, | |
| "grad_norm": 0.7095156908035278, | |
| "learning_rate": 0.000999330248902402, | |
| "loss": 2.6130306243896486, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.22607993540573273, | |
| "grad_norm": 0.5791682600975037, | |
| "learning_rate": 0.0009987889328206437, | |
| "loss": 2.562555503845215, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.24222850222042794, | |
| "grad_norm": 0.5518380403518677, | |
| "learning_rate": 0.0009980886963250907, | |
| "loss": 2.518760108947754, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.25837706903512314, | |
| "grad_norm": 0.4897823631763458, | |
| "learning_rate": 0.000997229762519879, | |
| "loss": 2.6402866363525392, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2745256358498183, | |
| "grad_norm": 0.5050747394561768, | |
| "learning_rate": 0.0009962124050720978, | |
| "loss": 2.684323310852051, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.29067420266451355, | |
| "grad_norm": 0.468423068523407, | |
| "learning_rate": 0.0009950369481245985, | |
| "loss": 2.6152185440063476, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3068227694792087, | |
| "grad_norm": 0.5091232657432556, | |
| "learning_rate": 0.0009937037661927161, | |
| "loss": 2.531853675842285, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.3229713362939039, | |
| "grad_norm": 0.4922482967376709, | |
| "learning_rate": 0.0009922132840449458, | |
| "loss": 2.6094560623168945, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.33911990310859913, | |
| "grad_norm": 0.5051465034484863, | |
| "learning_rate": 0.0009905659765676053, | |
| "loss": 2.559980583190918, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.3552684699232943, | |
| "grad_norm": 0.4865105450153351, | |
| "learning_rate": 0.0009887623686135306, | |
| "loss": 2.508647346496582, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3714170367379895, | |
| "grad_norm": 0.5287356376647949, | |
| "learning_rate": 0.0009868030348348512, | |
| "loss": 2.6150222778320313, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3875656035526847, | |
| "grad_norm": 0.4643533229827881, | |
| "learning_rate": 0.0009846885994998983, | |
| "loss": 2.6150381088256838, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.4037141703673799, | |
| "grad_norm": 0.46244189143180847, | |
| "learning_rate": 0.0009824197362943063, | |
| "loss": 2.5374935150146483, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.41986273718207506, | |
| "grad_norm": 0.480276882648468, | |
| "learning_rate": 0.000979997168106366, | |
| "loss": 2.5654741287231446, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.4360113039967703, | |
| "grad_norm": 0.5552269816398621, | |
| "learning_rate": 0.0009774216667967062, | |
| "loss": 2.6036794662475584, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.45215987081146547, | |
| "grad_norm": 0.511289656162262, | |
| "learning_rate": 0.000974694052952366, | |
| "loss": 2.610031318664551, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.4683084376261607, | |
| "grad_norm": 0.5037236213684082, | |
| "learning_rate": 0.000971815195625348, | |
| "loss": 2.516169548034668, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.4844570044408559, | |
| "grad_norm": 0.5199276804924011, | |
| "learning_rate": 0.000968786012055726, | |
| "loss": 2.5069480895996095, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.500605571255551, | |
| "grad_norm": 0.5875343680381775, | |
| "learning_rate": 0.0009656074673794017, | |
| "loss": 2.639841651916504, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.5167541380702463, | |
| "grad_norm": 0.5489600896835327, | |
| "learning_rate": 0.0009622805743205998, | |
| "loss": 2.5628652572631836, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5329027048849415, | |
| "grad_norm": 0.4753468334674835, | |
| "learning_rate": 0.0009588063928692012, | |
| "loss": 2.5956233978271483, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.5490512716996366, | |
| "grad_norm": 0.5152420997619629, | |
| "learning_rate": 0.0009551860299430173, | |
| "loss": 2.597700500488281, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.5651998385143319, | |
| "grad_norm": 0.4520896375179291, | |
| "learning_rate": 0.0009514206390351116, | |
| "loss": 2.586415481567383, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.5813484053290271, | |
| "grad_norm": 0.5123590230941772, | |
| "learning_rate": 0.0009475114198462837, | |
| "loss": 2.555033302307129, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5974969721437222, | |
| "grad_norm": 0.6008352637290955, | |
| "learning_rate": 0.0009434596179028271, | |
| "loss": 2.6199901580810545, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.6136455389584174, | |
| "grad_norm": 0.4769132435321808, | |
| "learning_rate": 0.0009392665241596914, | |
| "loss": 2.5420787811279295, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.6297941057731127, | |
| "grad_norm": 0.4474424421787262, | |
| "learning_rate": 0.0009349334745891666, | |
| "loss": 2.549270820617676, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.6459426725878078, | |
| "grad_norm": 0.5046530365943909, | |
| "learning_rate": 0.0009304618497552253, | |
| "loss": 2.540346145629883, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.662091239402503, | |
| "grad_norm": 0.5442773699760437, | |
| "learning_rate": 0.0009258530743736586, | |
| "loss": 2.550925636291504, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.6782398062171983, | |
| "grad_norm": 0.45155641436576843, | |
| "learning_rate": 0.0009211086168581433, | |
| "loss": 2.5896928787231444, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.6943883730318934, | |
| "grad_norm": 0.48448678851127625, | |
| "learning_rate": 0.0009162299888523867, | |
| "loss": 2.568522834777832, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.7105369398465886, | |
| "grad_norm": 0.4634808897972107, | |
| "learning_rate": 0.0009112187447484979, | |
| "loss": 2.543706512451172, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.7266855066612838, | |
| "grad_norm": 0.520962655544281, | |
| "learning_rate": 0.0009060764811917397, | |
| "loss": 2.4791580200195313, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.742834073475979, | |
| "grad_norm": 0.495394766330719, | |
| "learning_rate": 0.0009008048365718167, | |
| "loss": 2.6086082458496094, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.7589826402906742, | |
| "grad_norm": 0.4325544834136963, | |
| "learning_rate": 0.0008954054905008639, | |
| "loss": 2.5405605316162108, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.7751312071053694, | |
| "grad_norm": 0.4992341697216034, | |
| "learning_rate": 0.0008898801632783013, | |
| "loss": 2.6021982192993165, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.7912797739200645, | |
| "grad_norm": 0.5032821893692017, | |
| "learning_rate": 0.0008842306153427246, | |
| "loss": 2.5671119689941406, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.8074283407347598, | |
| "grad_norm": 0.44175952672958374, | |
| "learning_rate": 0.000878458646711008, | |
| "loss": 2.5145410537719726, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.823576907549455, | |
| "grad_norm": 0.48530301451683044, | |
| "learning_rate": 0.0008725660964047959, | |
| "loss": 2.4978832244873046, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.8397254743641501, | |
| "grad_norm": 0.4604915976524353, | |
| "learning_rate": 0.0008665548418645672, | |
| "loss": 2.5596445083618162, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.8558740411788454, | |
| "grad_norm": 0.45781826972961426, | |
| "learning_rate": 0.0008604267983514594, | |
| "loss": 2.6085268020629884, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.8720226079935406, | |
| "grad_norm": 0.48819592595100403, | |
| "learning_rate": 0.000854183918337043, | |
| "loss": 2.5747554779052733, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.8881711748082358, | |
| "grad_norm": 0.4929693043231964, | |
| "learning_rate": 0.0008478281908812387, | |
| "loss": 2.543058395385742, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.9043197416229309, | |
| "grad_norm": 0.4913038909435272, | |
| "learning_rate": 0.0008413616409985779, | |
| "loss": 2.5399295806884767, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.9204683084376262, | |
| "grad_norm": 0.47384563088417053, | |
| "learning_rate": 0.0008347863290130087, | |
| "loss": 2.5927974700927736, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.9366168752523214, | |
| "grad_norm": 0.4775764048099518, | |
| "learning_rate": 0.0008281043499014498, | |
| "loss": 2.5593168258666994, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.9527654420670165, | |
| "grad_norm": 0.5058761239051819, | |
| "learning_rate": 0.0008213178326263049, | |
| "loss": 2.552435111999512, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.9689140088817118, | |
| "grad_norm": 0.4922596216201782, | |
| "learning_rate": 0.0008144289394571484, | |
| "loss": 2.5472679138183594, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.985062575696407, | |
| "grad_norm": 0.49022358655929565, | |
| "learning_rate": 0.0008074398652817998, | |
| "loss": 2.5109460830688475, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.6876914501190186, | |
| "learning_rate": 0.0008003528369070043, | |
| "loss": 2.465944290161133, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.0161485668146952, | |
| "grad_norm": 0.4119199812412262, | |
| "learning_rate": 0.0007931701123489439, | |
| "loss": 2.520769500732422, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.0322971336293905, | |
| "grad_norm": 0.4147922992706299, | |
| "learning_rate": 0.000785893980113806, | |
| "loss": 2.5548782348632812, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.0484457004440857, | |
| "grad_norm": 0.5313045382499695, | |
| "learning_rate": 0.0007785267584686366, | |
| "loss": 2.6331764221191407, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.0645942672587807, | |
| "grad_norm": 0.45284605026245117, | |
| "learning_rate": 0.00077107079470271, | |
| "loss": 2.519462013244629, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.080742834073476, | |
| "grad_norm": 0.5042719841003418, | |
| "learning_rate": 0.0007635284643796545, | |
| "loss": 2.4921422958374024, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.0968914008881712, | |
| "grad_norm": 0.4403098225593567, | |
| "learning_rate": 0.0007559021705805671, | |
| "loss": 2.454839897155762, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.1130399677028664, | |
| "grad_norm": 0.4963165819644928, | |
| "learning_rate": 0.0007481943431383622, | |
| "loss": 2.5821470260620116, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.1291885345175616, | |
| "grad_norm": 0.4509197175502777, | |
| "learning_rate": 0.000740407437863596, | |
| "loss": 2.4973094940185545, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.1453371013322569, | |
| "grad_norm": 0.49439796805381775, | |
| "learning_rate": 0.0007325439357620147, | |
| "loss": 2.6476114273071287, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.1614856681469519, | |
| "grad_norm": 0.5243302583694458, | |
| "learning_rate": 0.0007246063422440747, | |
| "loss": 2.4758913040161135, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.177634234961647, | |
| "grad_norm": 0.6043158769607544, | |
| "learning_rate": 0.0007165971863266878, | |
| "loss": 2.5820987701416014, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.1937828017763423, | |
| "grad_norm": 0.44207850098609924, | |
| "learning_rate": 0.0007085190198274438, | |
| "loss": 2.4599235534667967, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.2099313685910376, | |
| "grad_norm": 0.4254566431045532, | |
| "learning_rate": 0.0007003744165515704, | |
| "loss": 2.4942739486694334, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.2260799354057328, | |
| "grad_norm": 0.41949278116226196, | |
| "learning_rate": 0.0006921659714718863, | |
| "loss": 2.505445098876953, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.242228502220428, | |
| "grad_norm": 0.4123310148715973, | |
| "learning_rate": 0.0006838962999020094, | |
| "loss": 2.5693735122680663, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.2583770690351233, | |
| "grad_norm": 0.4186009168624878, | |
| "learning_rate": 0.0006755680366630865, | |
| "loss": 2.4493398666381836, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.2745256358498183, | |
| "grad_norm": 0.444654643535614, | |
| "learning_rate": 0.0006671838352443049, | |
| "loss": 2.5728691101074217, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.2906742026645135, | |
| "grad_norm": 0.40202823281288147, | |
| "learning_rate": 0.0006587463669574584, | |
| "loss": 2.526685333251953, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.3068227694792087, | |
| "grad_norm": 0.4873361885547638, | |
| "learning_rate": 0.0006502583200858335, | |
| "loss": 2.5635454177856447, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.322971336293904, | |
| "grad_norm": 0.5394927859306335, | |
| "learning_rate": 0.0006417223990276883, | |
| "loss": 2.6018707275390627, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.3391199031085992, | |
| "grad_norm": 0.5254472494125366, | |
| "learning_rate": 0.0006331413234345977, | |
| "loss": 2.5202842712402345, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.3552684699232942, | |
| "grad_norm": 0.4611901342868805, | |
| "learning_rate": 0.0006245178273449383, | |
| "loss": 2.527310371398926, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.3714170367379894, | |
| "grad_norm": 0.46280530095100403, | |
| "learning_rate": 0.0006158546583127886, | |
| "loss": 2.5010074615478515, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.3875656035526847, | |
| "grad_norm": 0.47044530510902405, | |
| "learning_rate": 0.0006071545765325253, | |
| "loss": 2.658343505859375, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.4037141703673799, | |
| "grad_norm": 0.5649057030677795, | |
| "learning_rate": 0.0005984203539593897, | |
| "loss": 2.514650344848633, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.4198627371820751, | |
| "grad_norm": 0.5100232362747192, | |
| "learning_rate": 0.0005896547734263077, | |
| "loss": 2.4994720458984374, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.4360113039967703, | |
| "grad_norm": 0.4990105926990509, | |
| "learning_rate": 0.0005808606277572453, | |
| "loss": 2.489163398742676, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.4521598708114656, | |
| "grad_norm": 0.473297655582428, | |
| "learning_rate": 0.0005720407188773791, | |
| "loss": 2.534769630432129, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.4683084376261606, | |
| "grad_norm": 0.5112643837928772, | |
| "learning_rate": 0.000563197856920368, | |
| "loss": 2.5247997283935546, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.4844570044408558, | |
| "grad_norm": 0.4357326924800873, | |
| "learning_rate": 0.0005543348593330093, | |
| "loss": 2.508163642883301, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.500605571255551, | |
| "grad_norm": 0.47014695405960083, | |
| "learning_rate": 0.0005454545499775651, | |
| "loss": 2.6127824783325195, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.5167541380702463, | |
| "grad_norm": 0.4659437835216522, | |
| "learning_rate": 0.0005365597582320436, | |
| "loss": 2.4793100357055664, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.5329027048849415, | |
| "grad_norm": 0.5408269166946411, | |
| "learning_rate": 0.0005276533180887248, | |
| "loss": 2.45506591796875, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.5490512716996365, | |
| "grad_norm": 0.5870039463043213, | |
| "learning_rate": 0.000518738067251214, | |
| "loss": 2.480586814880371, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.565199838514332, | |
| "grad_norm": 0.48131951689720154, | |
| "learning_rate": 0.0005098168462303141, | |
| "loss": 2.589830207824707, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.581348405329027, | |
| "grad_norm": 0.4618188440799713, | |
| "learning_rate": 0.0005008924974390041, | |
| "loss": 2.5488056182861327, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.5974969721437222, | |
| "grad_norm": 0.49082285165786743, | |
| "learning_rate": 0.0004919678642868092, | |
| "loss": 2.4781982421875, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.6136455389584174, | |
| "grad_norm": 0.47744420170783997, | |
| "learning_rate": 0.0004830457902738558, | |
| "loss": 2.517325210571289, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6297941057731127, | |
| "grad_norm": 0.507945716381073, | |
| "learning_rate": 0.0004741291180848961, | |
| "loss": 2.5076452255249024, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.645942672587808, | |
| "grad_norm": 0.42816001176834106, | |
| "learning_rate": 0.000465220688683594, | |
| "loss": 2.671817398071289, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.662091239402503, | |
| "grad_norm": 0.4258963167667389, | |
| "learning_rate": 0.00045632334040735764, | |
| "loss": 2.533784103393555, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.6782398062171984, | |
| "grad_norm": 0.494028240442276, | |
| "learning_rate": 0.00044743990806300917, | |
| "loss": 2.514291000366211, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.6943883730318934, | |
| "grad_norm": 0.4230322539806366, | |
| "learning_rate": 0.00043857322202358066, | |
| "loss": 2.5531696319580077, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.7105369398465886, | |
| "grad_norm": 0.5738111734390259, | |
| "learning_rate": 0.00042972610732652105, | |
| "loss": 2.5059112548828124, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.7266855066612838, | |
| "grad_norm": 0.5092839002609253, | |
| "learning_rate": 0.0004209013827736042, | |
| "loss": 2.5219795227050783, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.7428340734759789, | |
| "grad_norm": 0.4812857508659363, | |
| "learning_rate": 0.00041210186003282274, | |
| "loss": 2.5235408782958983, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.7589826402906743, | |
| "grad_norm": 0.47096627950668335, | |
| "learning_rate": 0.000403330342742556, | |
| "loss": 2.574551582336426, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.7751312071053693, | |
| "grad_norm": 0.4319113790988922, | |
| "learning_rate": 0.0003945896256182949, | |
| "loss": 2.610904502868652, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.7912797739200645, | |
| "grad_norm": 0.44047966599464417, | |
| "learning_rate": 0.0003858824935622115, | |
| "loss": 2.5323257446289062, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.8074283407347598, | |
| "grad_norm": 0.548047661781311, | |
| "learning_rate": 0.00037721172077585287, | |
| "loss": 2.5165468215942384, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.823576907549455, | |
| "grad_norm": 0.46526339650154114, | |
| "learning_rate": 0.00036858006987624723, | |
| "loss": 2.502303886413574, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.8397254743641502, | |
| "grad_norm": 0.4635223150253296, | |
| "learning_rate": 0.0003599902910156984, | |
| "loss": 2.5442089080810546, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.8558740411788452, | |
| "grad_norm": 0.5317935347557068, | |
| "learning_rate": 0.0003514451210055527, | |
| "loss": 2.600077247619629, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.8720226079935407, | |
| "grad_norm": 0.5463606119155884, | |
| "learning_rate": 0.00034294728244421756, | |
| "loss": 2.574476623535156, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.8881711748082357, | |
| "grad_norm": 0.4650241732597351, | |
| "learning_rate": 0.00033449948284970617, | |
| "loss": 2.500654411315918, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.904319741622931, | |
| "grad_norm": 0.40839987993240356, | |
| "learning_rate": 0.00032610441379698937, | |
| "loss": 2.4712839126586914, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.9204683084376262, | |
| "grad_norm": 0.49943020939826965, | |
| "learning_rate": 0.0003177647500604252, | |
| "loss": 2.5296091079711913, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.9366168752523214, | |
| "grad_norm": 0.4655015468597412, | |
| "learning_rate": 0.00030948314876154306, | |
| "loss": 2.5075130462646484, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.9527654420670166, | |
| "grad_norm": 0.45307499170303345, | |
| "learning_rate": 0.00030126224852245056, | |
| "loss": 2.464124298095703, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.9689140088817116, | |
| "grad_norm": 0.5530602931976318, | |
| "learning_rate": 0.0002931046686251365, | |
| "loss": 2.553061866760254, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.985062575696407, | |
| "grad_norm": 0.46495500206947327, | |
| "learning_rate": 0.0002850130081769334, | |
| "loss": 2.486197853088379, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.9545954465866089, | |
| "learning_rate": 0.00027698984528241036, | |
| "loss": 2.4371877670288087, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.016148566814695, | |
| "grad_norm": 0.48022302985191345, | |
| "learning_rate": 0.00026903773622195636, | |
| "loss": 2.512773895263672, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.0322971336293905, | |
| "grad_norm": 0.433242529630661, | |
| "learning_rate": 0.00026115921463731694, | |
| "loss": 2.5300994873046876, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.0484457004440855, | |
| "grad_norm": 0.47226834297180176, | |
| "learning_rate": 0.0002533567907243446, | |
| "loss": 2.421502113342285, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.064594267258781, | |
| "grad_norm": 0.5992064476013184, | |
| "learning_rate": 0.00024563295043321783, | |
| "loss": 2.5453359603881838, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.080742834073476, | |
| "grad_norm": 0.4261581599712372, | |
| "learning_rate": 0.0002379901546763879, | |
| "loss": 2.495037841796875, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.0968914008881714, | |
| "grad_norm": 0.4328082203865051, | |
| "learning_rate": 0.00023043083854449987, | |
| "loss": 2.464985466003418, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.1130399677028664, | |
| "grad_norm": 0.4510248303413391, | |
| "learning_rate": 0.00022295741053054296, | |
| "loss": 2.5308864593505858, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.1291885345175614, | |
| "grad_norm": 0.49293237924575806, | |
| "learning_rate": 0.00021557225176247353, | |
| "loss": 2.5278299331665037, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.145337101332257, | |
| "grad_norm": 0.4132377505302429, | |
| "learning_rate": 0.0002082777152445589, | |
| "loss": 2.398031234741211, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.161485668146952, | |
| "grad_norm": 0.5191354751586914, | |
| "learning_rate": 0.00020107612510768014, | |
| "loss": 2.5248828887939454, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.1776342349616473, | |
| "grad_norm": 0.5512005686759949, | |
| "learning_rate": 0.00019396977586883475, | |
| "loss": 2.4451154708862304, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.1937828017763423, | |
| "grad_norm": 0.42351678013801575, | |
| "learning_rate": 0.00018696093170007493, | |
| "loss": 2.573942756652832, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.2099313685910373, | |
| "grad_norm": 0.45575806498527527, | |
| "learning_rate": 0.00018005182570711366, | |
| "loss": 2.5537620544433595, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.226079935405733, | |
| "grad_norm": 0.46785682439804077, | |
| "learning_rate": 0.0001732446592178295, | |
| "loss": 2.491817092895508, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.242228502220428, | |
| "grad_norm": 0.45513054728507996, | |
| "learning_rate": 0.00016654160108089594, | |
| "loss": 2.5171764373779295, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.2583770690351233, | |
| "grad_norm": 0.43288710713386536, | |
| "learning_rate": 0.00015994478697475885, | |
| "loss": 2.5143251419067383, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.2745256358498183, | |
| "grad_norm": 0.4459301233291626, | |
| "learning_rate": 0.00015345631872718213, | |
| "loss": 2.5065849304199217, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.2906742026645137, | |
| "grad_norm": 0.501880407333374, | |
| "learning_rate": 0.00014707826364557985, | |
| "loss": 2.4833837509155274, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.3068227694792087, | |
| "grad_norm": 0.4559042453765869, | |
| "learning_rate": 0.00014081265385834557, | |
| "loss": 2.5120367050170898, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.3229713362939037, | |
| "grad_norm": 0.48048946261405945, | |
| "learning_rate": 0.000134661485667391, | |
| "loss": 2.4629817962646485, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.339119903108599, | |
| "grad_norm": 0.46413764357566833, | |
| "learning_rate": 0.0001286267189120986, | |
| "loss": 2.4572961807250975, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.355268469923294, | |
| "grad_norm": 0.4971129298210144, | |
| "learning_rate": 0.000122710276344893, | |
| "loss": 2.5448049545288085, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.3714170367379896, | |
| "grad_norm": 0.45018401741981506, | |
| "learning_rate": 0.00011691404301862746, | |
| "loss": 2.551463317871094, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.3875656035526847, | |
| "grad_norm": 0.49596303701400757, | |
| "learning_rate": 0.00011123986568598249, | |
| "loss": 2.5440711975097656, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.4037141703673797, | |
| "grad_norm": 0.47923141717910767, | |
| "learning_rate": 0.00010568955221106713, | |
| "loss": 2.45603084564209, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.419862737182075, | |
| "grad_norm": 0.4507387578487396, | |
| "learning_rate": 0.0001002648709934108, | |
| "loss": 2.458144187927246, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.43601130399677, | |
| "grad_norm": 0.46995073556900024, | |
| "learning_rate": 9.496755040452915e-05, | |
| "loss": 2.4886669158935546, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.4521598708114656, | |
| "grad_norm": 0.4773581326007843, | |
| "learning_rate": 8.979927823724321e-05, | |
| "loss": 2.548818016052246, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.4683084376261606, | |
| "grad_norm": 0.49100548028945923, | |
| "learning_rate": 8.476170116792736e-05, | |
| "loss": 2.5328250885009767, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.484457004440856, | |
| "grad_norm": 0.5164358615875244, | |
| "learning_rate": 7.985642423185718e-05, | |
| "loss": 2.4474281311035155, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.500605571255551, | |
| "grad_norm": 0.48450803756713867, | |
| "learning_rate": 7.508501031182585e-05, | |
| "loss": 2.470208168029785, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.5167541380702465, | |
| "grad_norm": 0.4779358208179474, | |
| "learning_rate": 7.044897964018949e-05, | |
| "loss": 2.443818283081055, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.5329027048849415, | |
| "grad_norm": 0.42919352650642395, | |
| "learning_rate": 6.594980931450223e-05, | |
| "loss": 2.4840776443481447, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.5490512716996365, | |
| "grad_norm": 0.4021783769130707, | |
| "learning_rate": 6.158893282689454e-05, | |
| "loss": 2.450935173034668, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.565199838514332, | |
| "grad_norm": 0.5504807829856873, | |
| "learning_rate": 5.7367739607344093e-05, | |
| "loss": 2.477644348144531, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.581348405329027, | |
| "grad_norm": 0.3885093331336975, | |
| "learning_rate": 5.328757458098665e-05, | |
| "loss": 2.449785041809082, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.5974969721437224, | |
| "grad_norm": 0.5313092470169067, | |
| "learning_rate": 4.934973773960572e-05, | |
| "loss": 2.564461898803711, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.6136455389584174, | |
| "grad_norm": 0.5081238150596619, | |
| "learning_rate": 4.5555483727438896e-05, | |
| "loss": 2.5144027709960937, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.6297941057731125, | |
| "grad_norm": 0.45351824164390564, | |
| "learning_rate": 4.190602144143207e-05, | |
| "loss": 2.5690656661987306, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.645942672587808, | |
| "grad_norm": 0.49749699234962463, | |
| "learning_rate": 3.840251364607045e-05, | |
| "loss": 2.5524566650390623, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.662091239402503, | |
| "grad_norm": 0.4736417233943939, | |
| "learning_rate": 3.50460766029066e-05, | |
| "loss": 2.437306022644043, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.6782398062171984, | |
| "grad_norm": 0.54044508934021, | |
| "learning_rate": 3.183777971490576e-05, | |
| "loss": 2.4329130172729494, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.6943883730318934, | |
| "grad_norm": 0.5381774306297302, | |
| "learning_rate": 2.8778645185720487e-05, | |
| "loss": 2.5384393692016602, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.7105369398465884, | |
| "grad_norm": 0.4624033570289612, | |
| "learning_rate": 2.5869647694003962e-05, | |
| "loss": 2.5221799850463866, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.726685506661284, | |
| "grad_norm": 0.49552807211875916, | |
| "learning_rate": 2.3111714082864887e-05, | |
| "loss": 2.519091987609863, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.742834073475979, | |
| "grad_norm": 0.4731680154800415, | |
| "learning_rate": 2.0505723064563886e-05, | |
| "loss": 2.4723621368408204, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7589826402906743, | |
| "grad_norm": 0.44819337129592896, | |
| "learning_rate": 1.8052504940544613e-05, | |
| "loss": 2.5209144592285155, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.7751312071053693, | |
| "grad_norm": 0.4628264307975769, | |
| "learning_rate": 1.575284133688909e-05, | |
| "loss": 2.502878189086914, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.7912797739200643, | |
| "grad_norm": 0.4762296676635742, | |
| "learning_rate": 1.3607464955282257e-05, | |
| "loss": 2.4997129440307617, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.8074283407347598, | |
| "grad_norm": 0.4384547173976898, | |
| "learning_rate": 1.1617059339563806e-05, | |
| "loss": 2.6152523040771483, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.8235769075494552, | |
| "grad_norm": 0.4384756088256836, | |
| "learning_rate": 9.782258657942467e-06, | |
| "loss": 2.4833805084228517, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.8397254743641502, | |
| "grad_norm": 0.4431445300579071, | |
| "learning_rate": 8.103647500942112e-06, | |
| "loss": 2.500337028503418, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.8558740411788452, | |
| "grad_norm": 0.4854304790496826, | |
| "learning_rate": 6.581760695143934e-06, | |
| "loss": 2.4700483322143554, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.8720226079935407, | |
| "grad_norm": 0.4667441248893738, | |
| "learning_rate": 5.217083132783907e-06, | |
| "loss": 2.4867990493774412, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.8881711748082357, | |
| "grad_norm": 0.49439942836761475, | |
| "learning_rate": 4.010049617260203e-06, | |
| "loss": 2.515974426269531, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.904319741622931, | |
| "grad_norm": 0.43787845969200134, | |
| "learning_rate": 2.961044724599016e-06, | |
| "loss": 2.5289798736572267, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.920468308437626, | |
| "grad_norm": 0.4790705144405365, | |
| "learning_rate": 2.0704026809241215e-06, | |
| "loss": 2.4601634979248046, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.936616875252321, | |
| "grad_norm": 0.4470031261444092, | |
| "learning_rate": 1.338407255968288e-06, | |
| "loss": 2.4816938400268556, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.9527654420670166, | |
| "grad_norm": 0.4893916845321655, | |
| "learning_rate": 7.652916726604287e-07, | |
| "loss": 2.529372978210449, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.9689140088817116, | |
| "grad_norm": 0.431090384721756, | |
| "learning_rate": 3.5123853281793237e-07, | |
| "loss": 2.551029586791992, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.985062575696407, | |
| "grad_norm": 0.4225512742996216, | |
| "learning_rate": 9.637975896759077e-08, | |
| "loss": 2.5007448196411133, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.7053178548812866, | |
| "learning_rate": 7.965523131092667e-10, | |
| "loss": 2.677412414550781, | |
| "step": 1860 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1860, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.131504576233472e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |