nvan13 committed on
Commit
ecadbd9
·
verified ·
1 Parent(s): b966d2d

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50) hide show
  1. .gitattributes +65 -0
  2. .gitignore +237 -0
  3. Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/adapter_config.json +26 -0
  4. Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/special_tokens_map.json +30 -0
  5. Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer.json +0 -0
  6. Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer.model +3 -0
  7. Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer_config.json +43 -0
  8. Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft2_/adapter_config.json +26 -0
  9. Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft2_/adapter_model.bin +3 -0
  10. Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/model_parameters_report.txt +409 -0
  11. Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/trainer_state.json +110 -0
  12. Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/adapter_config.json +26 -0
  13. Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/special_tokens_map.json +30 -0
  14. Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer.json +0 -0
  15. Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer.model +3 -0
  16. Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer_config.json +43 -0
  17. Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft2_/adapter_config.json +26 -0
  18. Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft2_/adapter_model.bin +3 -0
  19. Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/model_parameters_report.txt +409 -0
  20. Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/trainer_state.json +528 -0
  21. Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/adapter_config.json +26 -0
  22. Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/special_tokens_map.json +30 -0
  23. Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer.json +0 -0
  24. Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer.model +3 -0
  25. Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer_config.json +43 -0
  26. Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft2_/adapter_config.json +26 -0
  27. Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft2_/adapter_model.bin +3 -0
  28. Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/model_parameters_report.txt +409 -0
  29. Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/trainer_state.json +528 -0
  30. Llama13B/CMS/t60107d21h14m55,ep=2.0,mlr5.0e-04,b16,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/model_parameters_report.txt +409 -0
  31. Llama13B/CMS/t60107d21h20m27,ep=2.0,mlr5.0e-04,b16,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/model_parameters_report.txt +409 -0
  32. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft/adapter_config.json +26 -0
  33. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft/special_tokens_map.json +30 -0
  34. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft/tokenizer.json +0 -0
  35. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft/tokenizer.model +3 -0
  36. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft/tokenizer_config.json +43 -0
  37. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft2/adapter_config.json +26 -0
  38. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft2/adapter_model.bin +3 -0
  39. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/model_parameters_report.txt +409 -0
  40. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/ARC-Challenge.json +0 -0
  41. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/ARC-Easy.json +0 -0
  42. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/FINAL.json +11 -0
  43. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/boolq.json +0 -0
  44. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/hellaswag.json +0 -0
  45. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/openbookqa.json +0 -0
  46. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/piqa.json +0 -0
  47. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/social_i_qa.json +0 -0
  48. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/winogrande.json +0 -0
  49. Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/trainer_state.json +950 -0
  50. Llama13B/CMS/t60108d07h46m12,ep=2.0,mlr5.0e-04,b8,nb8,8,cL8,rR8,s1,initdef,dr0.0,size146627,5/ft/adapter_config.json +26 -0
.gitattributes CHANGED
@@ -33,3 +33,68 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Llama3_B8/CMS/_t60119d07h10m01,ep=2.0,mlr1.0e-03,b16,1,nb16,16,cL16,rR16,s1.0,inits43,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ Llama3_B8/CMS/_t60120d05h38m09,ep=2.0,mlr1.0e-03,b16,1,nb16,16,cL16,rR16,s1.0,inits44,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ Llama3_B8/CMS/t60111d08h06m53,ep=2.0,mlr2.0e-04,b8,nb4,4,cL4,rR4,s1.0,initdef,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ Llama3_B8/CMS/t60111d18h17m31,ep=2.0,mlr2.0e-04,b8,nb4,4,cL8,rR8,s2.0,initdef,dr0.0,size146627,2\*\*/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ Llama3_B8/CMS/t60116d00h39m41,ep=2.0,mlr4.0e-04,Lb16,1,nb32,32,cL32,rR32,s1.0,initlora32,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ Llama3_B8/CMS/t60117d14h18m29,ep=2.0,mlr4.0e-04,b16,1,nb32,32,cL32,rR32,s1.0,initLORA4,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ Llama3_B8/CMS/t60118d17h22m41,ep=2.0,mlr2.0e-04,b16,1,nb16,16,cL16,rR16,s1.0,initdef,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
+ Llama3_B8/CMS/t60119d00h34m38,ep=2.0,mlr2.0e-04,b16,1,nb4,4,cL4,rR4,s1.0,inits43,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
+ Llama3_B8/CMS/t60119d03h51m53,ep=2.0,mlr2.0e-04,b16,1,nb16,16,cL16,rR16,s1.0,inits43,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
+ Llama3_B8/CMS/t60119d10h52m09,ep=2.0,mlr1.0e-04,b16,1,nb32,32,cL32,rR32,s1.0,inits43,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
+ Llama3_B8/CMS/t60119d14h26m55,ep=2.0,mlr1.0e-04,b16,1,nb8,8,cL8,rR8,s1.0,inits43,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
+ Llama3_B8/CMS/t60119d19h46m19,ep=2.0,mlr1.0e-04,b16,1,nb4,4,cL4,rR4,s1.0,inits43,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
+ Llama3_B8/CMS/t60119d23h22m59,ep=2.0,mlr2.0e-04,b16,1,nb4,4,cL4,rR4,s1.0,inits44,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
+ Llama3_B8/CMS/t60120d02h28m13,ep=2.0,mlr2.0e-04,b16,1,nb16,16,cL16,rR16,s1.0,inits44,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
+ Llama3_B8/CMS/t60120d09h10m21,ep=2.0,mlr1.0e-04,b16,1,nb32,32,cL32,rR32,s1.0,inits44,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
51
+ Llama3_B8/CMS/t60120d12h41m43,ep=2.0,mlr1.0e-04,b16,1,nb8,8,cL8,rR8,s1.0,inits44,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
52
+ Llama3_B8/CMS/t60120d16h06m54,ep=2.0,mlr1.0e-04,b16,1,nb4,4,cL4,rR4,s1.0,inits44,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
53
+ Llama3_B8/CMS/t60120d23h33m42,ep=2.0,mlr2.0e-04,b16,1,nb4,4,cL4,rR4,s1.0,inits45,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
54
+ Llama3_B8/CMS/t60121d02h39m14,ep=2.0,mlr2.0e-04,b16,1,nb16,16,cL16,rR16,s1.0,inits45,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
55
+ Llama3_B8/CMS/t60121d05h44m56,ep=2.0,mlr1.0e-03,b16,1,nb16,16,cL16,rR16,s1.0,inits45,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
56
+ Llama3_B8/CMS/t60121d09h06m12,ep=2.0,mlr1.0e-04,b16,1,nb32,32,cL32,rR32,s1.0,inits45,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
57
+ Llama3_B8/CMS/t60121d12h32m06,ep=2.0,mlr1.0e-04,b16,1,nb8,8,cL8,rR8,s1.0,inits45,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
58
+ Llama3_B8/CMS/t60121d15h52m26,ep=2.0,mlr1.0e-04,b16,1,nb4,4,cL4,rR4,s1.0,inits45,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
59
+ Llama3_B8/CMS/t60121d20h17m54,ep=2.0,mlr2.0e-04,b16,1,nb4,4,cL4,rR4,s1.0,initsd42,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
60
+ Llama3_B8/CMS/t=31d04h17m34,ep=2.0,mlr1.0e-03,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
61
+ Llama3_B8/CMS/t=31d11h50m33,ep=2.0,mlr1.0e-03,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
62
+ Llama3_B8/CMS/t=60106d16h51m26,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
63
+ Llama3_B8/CMS/t=60106d21h14m30,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size146627,6/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
64
+ Llama3_B8/CMS/t=60107d01h46m41,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
65
+ Llama3_B8/CMS/t=60107d06h16m57,ep=2.0,mlr1.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
66
+ Llama3_B8/CMS/t=60107d10h45m00,ep=2.0,mlr2.0e-04,b8,nb16,16,cL16,rR16,s1,initdef,dr0.0,size146627,5/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
67
+ Llama3_LoRA/CMS/t60115d00h54m28,ep=2.0,mlr1.0e-04,b16,1,nb32,32,cL32,rR32,s1.0,initlora32,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
68
+ Llama3_LoRA/CMS/t60115d14h54m59,ep=2.0,mlr1.0e-04,b16,1,nb32,32,cL32,rR32,s1.0,initlora32,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
69
+ Llama3_LoRA/CMS/t60116d00h05m29,ep=2.0,mlr1.0e-04,b16,1,nb32,32,cL32,rR32,s1.0,initlora32,dr0.0,size4000,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
70
+ Llama3_LoRA/CMS/t60116d00h12m57,ep=2.0,mlr2.0e-04,b16,1,nb32,32,cL32,rR32,s1.0,initlora32,dr0.0,size4000,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
71
+ Llama3_LoRA/CMS/t60116d00h20m28,ep=2.0,mlr5.0e-04,b16,1,nb32,32,cL32,rR32,s1.0,initlora32,dr0.0,size4000,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
72
+ Llama3_LoRA/CMS/t60116d00h27m59,ep=2.0,mlr8.0e-04,b16,1,nb32,32,cL32,rR32,s1.0,initlora32,dr0.0,size4000,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
73
+ Llama3_LoRA/CMS/t60116d03h20m53,ep=2.0,mlr7.0e-04,b16,1,nb32,32,cL32,rR32,s1.0,initlora32,dr0.0,size146627,2/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
74
+ MGemma9B/MATH/t60112d11h56m00,mlr5.0e-04,b8,2,nb16,16,cL16,rR16,s1.0,initg9b,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
75
+ MGemma9B/MATH/t60112d12h55m46,mlr5.0e-04,b8,2,nb4,4,cL4,rR4,s1.0,initg9b,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
76
+ MGemma9B/MATH/t60112d14h14m09,mlr5.0e-04,b8,2,nb4,4,cL8,rR8,s2.0,initg9b,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
77
+ MGemma9B/MATH/t60112d15h27m12,mlr5.0e-04,b8,2,nb4,4,cL16,rR16,s4.0,initg9b,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
78
+ MGemma9B/MATH/t60112d17h16m48,mlr5.0e-04,b8,2,nb16,16,cL32,rR32,s2.0,initg9b,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
79
+ MGemma9B/MATH/t60113d11h10m24,mlr5.0e-04,b8,2,nb2,2,cL2,rR2,s1.0,initg9b,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
80
+ MGemma9B/MATH/t60113d11h52m59,mlr5.0e-04,b8,2,nb2,2,cL4,rR4,s2.0,initg9b,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
81
+ MGemma9B/MATH/t60113d13h02m54,mlr5.0e-04,b8,2,nb2,2,cL4,rR4,s2.0,initg9b,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
82
+ MGemma9B/MATH/t60123d00h51m06,mlr5.0e-04,b8,2,nb16,16,cL16,rR16,s1.0,initg9bs53,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
83
+ MGemma9B/MATH/t60123d01h34m22,mlr5.0e-04,b8,2,nb2,2,cL2,rR2,s1.0,initg9bs53,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
84
+ MGemma9B/MATH/t60123d02h16m48,mlr5.0e-04,b8,2,nb4,4,cL4,rR4,s1.0,initg9bs53,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
85
+ MGemma9B/MATH/t60123d02h59m37,mlr5.0e-04,b8,2,nb2,2,cL4,rR4,s2.0,initg9bs53,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
86
+ MGemma9B/MATH/t60123d03h42m41,mlr5.0e-04,b8,2,nb2,2,cL4,rR4,s1.4142,initg9bs53,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
87
+ MGemma9B/MATH/t60123d04h25m42,mlr5.0e-04,b8,2,nb2,2,cL4,rR4,s2.8284,initg9bs53,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
88
+ MGemma9B/MATH/t60123d05h08m42,mlr5.0e-04,b8,2,nb2,2,cL4,rR4,s4.0,initg9bs53,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
89
+ MGemma9B/MATH/t60123d16h15m33,mlr5.0e-04,b8,2,nb2,2,cL4,rR4,s1.0,initg9bs53,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
90
+ MGemma9B/MATH/t60123d16h58m22,mlr5.0e-04,b8,2,nb2,2,cL2,rR2,s0.7071,initg9bs53,dr0.0,ep2.0/ft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
91
+ dataset/hellaswag/test.json filter=lfs diff=lfs merge=lfs -text
92
+ dataset/hellaswag/train.json filter=lfs diff=lfs merge=lfs -text
93
+ dataset/social_i_qa/train.json filter=lfs diff=lfs merge=lfs -text
94
+ dataset/winogrande/train.json filter=lfs diff=lfs merge=lfs -text
95
+ ft_training_set/alpaca_data.json filter=lfs diff=lfs merge=lfs -text
96
+ ft_training_set/alpaca_data_cleaned.json filter=lfs diff=lfs merge=lfs -text
97
+ ft_training_set/commonsense_147k.json filter=lfs diff=lfs merge=lfs -text
98
+ ft_training_set/commonsense_170k.json filter=lfs diff=lfs merge=lfs -text
99
+ ft_training_set/math_14k.json filter=lfs diff=lfs merge=lfs -text
100
+ ft_training_set/math_50k.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb/*
2
+ runs/*
3
+
4
+
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[codz]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py.cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ # Pipfile.lock
100
+
101
+ # UV
102
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # uv.lock
106
+
107
+ # poetry
108
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
109
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
110
+ # commonly ignored for libraries.
111
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
112
+ # poetry.lock
113
+ # poetry.toml
114
+
115
+ # pdm
116
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
117
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
118
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
119
+ # pdm.lock
120
+ # pdm.toml
121
+ .pdm-python
122
+ .pdm-build/
123
+
124
+ # pixi
125
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
126
+ # pixi.lock
127
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
128
+ # in the .venv directory. It is recommended not to include this directory in version control.
129
+ .pixi
130
+
131
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
132
+ __pypackages__/
133
+
134
+ # Celery stuff
135
+ celerybeat-schedule
136
+ celerybeat.pid
137
+
138
+ # Redis
139
+ *.rdb
140
+ *.aof
141
+ *.pid
142
+
143
+ # RabbitMQ
144
+ mnesia/
145
+ rabbitmq/
146
+ rabbitmq-data/
147
+
148
+ # ActiveMQ
149
+ activemq-data/
150
+
151
+ # SageMath parsed files
152
+ *.sage.py
153
+
154
+ # Environments
155
+ .env
156
+ .envrc
157
+ .venv
158
+ env/
159
+ venv/
160
+ ENV/
161
+ env.bak/
162
+ venv.bak/
163
+
164
+ # Spyder project settings
165
+ .spyderproject
166
+ .spyproject
167
+
168
+ # Rope project settings
169
+ .ropeproject
170
+
171
+ # mkdocs documentation
172
+ /site
173
+
174
+ # mypy
175
+ .mypy_cache/
176
+ .dmypy.json
177
+ dmypy.json
178
+
179
+ # Pyre type checker
180
+ .pyre/
181
+
182
+ # pytype static type analyzer
183
+ .pytype/
184
+
185
+ # Cython debug symbols
186
+ cython_debug/
187
+
188
+ # PyCharm
189
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
190
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
191
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
192
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
193
+ # .idea/
194
+
195
+ # Abstra
196
+ # Abstra is an AI-powered process automation framework.
197
+ # Ignore directories containing user credentials, local state, and settings.
198
+ # Learn more at https://abstra.io/docs
199
+ .abstra/
200
+
201
+ # Visual Studio Code
202
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
203
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
204
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
205
+ # you could uncomment the following to ignore the entire vscode folder
206
+ # .vscode/
207
+
208
+ # Ruff stuff:
209
+ .ruff_cache/
210
+
211
+ # PyPI configuration file
212
+ .pypirc
213
+
214
+ # Marimo
215
+ marimo/_static/
216
+ marimo/_lsp/
217
+ __marimo__/
218
+
219
+ # Streamlit
220
+ .streamlit/secrets.toml
221
+
222
+
223
+ exps/
224
+ wandb/
225
+ # *.ipynb
226
+ # scripts/all.sh
227
+ # smpeft/sama/monarch_viz.png
228
+ # Llama2_exps/
229
+ # dataset/
230
+ # ft_training_set/
231
+ # Llama2_13B/
232
+ # Llama13B/
233
+ # Llama3_B00/
234
+ # Llama3_B8/
235
+ # Llama3_LoRA/
236
+ # MGemma9B/
237
+ # Mistral7B/
Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "dfurman/llama-13b",
3
+ "bias": "none",
4
+ "col_L": 32,
5
+ "drop_out": 0.0,
6
+ "inference_mode": false,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_unique_blocks_L": 32,
10
+ "num_unique_blocks_R": 32,
11
+ "peft_type": "SAMA",
12
+ "revision": null,
13
+ "row_R": 32,
14
+ "scaling": 1,
15
+ "share_factor_L": 1,
16
+ "share_factor_R": 1,
17
+ "target_modules": [
18
+ "k_proj",
19
+ "v_proj",
20
+ "q_proj",
21
+ "down_proj",
22
+ "up_proj"
23
+ ],
24
+ "target_modules_to_skip": null,
25
+ "task_type": "CAUSAL_LM"
26
+ }
Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": true,
36
+ "model_max_length": 512,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft2_/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "dfurman/llama-13b",
3
+ "bias": "none",
4
+ "col_L": 32,
5
+ "drop_out": 0.0,
6
+ "inference_mode": true,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_unique_blocks_L": 32,
10
+ "num_unique_blocks_R": 32,
11
+ "peft_type": "SAMA",
12
+ "revision": null,
13
+ "row_R": 32,
14
+ "scaling": 1,
15
+ "share_factor_L": 1,
16
+ "share_factor_R": 1,
17
+ "target_modules": [
18
+ "k_proj",
19
+ "v_proj",
20
+ "q_proj",
21
+ "down_proj",
22
+ "up_proj"
23
+ ],
24
+ "target_modules_to_skip": null,
25
+ "task_type": "CAUSAL_LM"
26
+ }
Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft2_/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de64b12482a708fd656095a77f9647d214f1419ac5343723fd6acd16d2a8a391
3
+ size 351427939
Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/model_parameters_report.txt ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ === GLOBAL STATISTICS ===
2
+ Total Parameters: 13,103,682,560
3
+ Trainable Parameters: 87,818,240
4
+ Trainable Percentage: 0.6702%
5
+
6
+ ================================================================================================================================
7
+ === DETAILED TRAINABLE MATRICES LIST ===
8
+ Layer Name | Shape | Count
9
+ ---------------------------------------------------------------------------------------------------------------------------------------
10
+ base_model.model.model.layers.0.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
11
+ base_model.model.model.layers.0.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
12
+ base_model.model.model.layers.0.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
13
+ base_model.model.model.layers.0.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
14
+ base_model.model.model.layers.0.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
15
+ base_model.model.model.layers.0.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
16
+ base_model.model.model.layers.0.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
17
+ base_model.model.model.layers.0.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
18
+ base_model.model.model.layers.0.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
19
+ base_model.model.model.layers.0.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
20
+ base_model.model.model.layers.1.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
21
+ base_model.model.model.layers.1.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
22
+ base_model.model.model.layers.1.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
23
+ base_model.model.model.layers.1.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
24
+ base_model.model.model.layers.1.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
25
+ base_model.model.model.layers.1.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
26
+ base_model.model.model.layers.1.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
27
+ base_model.model.model.layers.1.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
28
+ base_model.model.model.layers.1.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
29
+ base_model.model.model.layers.1.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
30
+ base_model.model.model.layers.2.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
31
+ base_model.model.model.layers.2.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
32
+ base_model.model.model.layers.2.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
33
+ base_model.model.model.layers.2.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
34
+ base_model.model.model.layers.2.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
35
+ base_model.model.model.layers.2.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
36
+ base_model.model.model.layers.2.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
37
+ base_model.model.model.layers.2.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
38
+ base_model.model.model.layers.2.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
39
+ base_model.model.model.layers.2.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
40
+ base_model.model.model.layers.3.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
41
+ base_model.model.model.layers.3.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
42
+ base_model.model.model.layers.3.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
43
+ base_model.model.model.layers.3.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
44
+ base_model.model.model.layers.3.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
45
+ base_model.model.model.layers.3.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
46
+ base_model.model.model.layers.3.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
47
+ base_model.model.model.layers.3.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
48
+ base_model.model.model.layers.3.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
49
+ base_model.model.model.layers.3.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
50
+ base_model.model.model.layers.4.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
51
+ base_model.model.model.layers.4.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
52
+ base_model.model.model.layers.4.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
53
+ base_model.model.model.layers.4.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
54
+ base_model.model.model.layers.4.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
55
+ base_model.model.model.layers.4.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
56
+ base_model.model.model.layers.4.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
57
+ base_model.model.model.layers.4.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
58
+ base_model.model.model.layers.4.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
59
+ base_model.model.model.layers.4.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
60
+ base_model.model.model.layers.5.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
61
+ base_model.model.model.layers.5.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
62
+ base_model.model.model.layers.5.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
63
+ base_model.model.model.layers.5.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
64
+ base_model.model.model.layers.5.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
65
+ base_model.model.model.layers.5.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
66
+ base_model.model.model.layers.5.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
67
+ base_model.model.model.layers.5.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
68
+ base_model.model.model.layers.5.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
69
+ base_model.model.model.layers.5.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
70
+ base_model.model.model.layers.6.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
71
+ base_model.model.model.layers.6.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
72
+ base_model.model.model.layers.6.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
73
+ base_model.model.model.layers.6.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
74
+ base_model.model.model.layers.6.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
75
+ base_model.model.model.layers.6.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
76
+ base_model.model.model.layers.6.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
77
+ base_model.model.model.layers.6.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
78
+ base_model.model.model.layers.6.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
79
+ base_model.model.model.layers.6.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
80
+ base_model.model.model.layers.7.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
81
+ base_model.model.model.layers.7.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
82
+ base_model.model.model.layers.7.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
83
+ base_model.model.model.layers.7.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
84
+ base_model.model.model.layers.7.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
85
+ base_model.model.model.layers.7.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
86
+ base_model.model.model.layers.7.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
87
+ base_model.model.model.layers.7.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
88
+ base_model.model.model.layers.7.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
89
+ base_model.model.model.layers.7.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
90
+ base_model.model.model.layers.8.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
91
+ base_model.model.model.layers.8.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
92
+ base_model.model.model.layers.8.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
93
+ base_model.model.model.layers.8.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
94
+ base_model.model.model.layers.8.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
95
+ base_model.model.model.layers.8.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
96
+ base_model.model.model.layers.8.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
97
+ base_model.model.model.layers.8.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
98
+ base_model.model.model.layers.8.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
99
+ base_model.model.model.layers.8.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
100
+ base_model.model.model.layers.9.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
101
+ base_model.model.model.layers.9.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
102
+ base_model.model.model.layers.9.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
103
+ base_model.model.model.layers.9.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
104
+ base_model.model.model.layers.9.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
105
+ base_model.model.model.layers.9.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
106
+ base_model.model.model.layers.9.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
107
+ base_model.model.model.layers.9.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
108
+ base_model.model.model.layers.9.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
109
+ base_model.model.model.layers.9.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
110
+ base_model.model.model.layers.10.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
111
+ base_model.model.model.layers.10.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
112
+ base_model.model.model.layers.10.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
113
+ base_model.model.model.layers.10.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
114
+ base_model.model.model.layers.10.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
115
+ base_model.model.model.layers.10.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
116
+ base_model.model.model.layers.10.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
117
+ base_model.model.model.layers.10.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
118
+ base_model.model.model.layers.10.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
119
+ base_model.model.model.layers.10.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
120
+ base_model.model.model.layers.11.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
121
+ base_model.model.model.layers.11.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
122
+ base_model.model.model.layers.11.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
123
+ base_model.model.model.layers.11.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
124
+ base_model.model.model.layers.11.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
125
+ base_model.model.model.layers.11.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
126
+ base_model.model.model.layers.11.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
127
+ base_model.model.model.layers.11.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
128
+ base_model.model.model.layers.11.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
129
+ base_model.model.model.layers.11.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
130
+ base_model.model.model.layers.12.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
131
+ base_model.model.model.layers.12.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
132
+ base_model.model.model.layers.12.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
133
+ base_model.model.model.layers.12.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
134
+ base_model.model.model.layers.12.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
135
+ base_model.model.model.layers.12.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
136
+ base_model.model.model.layers.12.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
137
+ base_model.model.model.layers.12.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
138
+ base_model.model.model.layers.12.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
139
+ base_model.model.model.layers.12.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
140
+ base_model.model.model.layers.13.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
141
+ base_model.model.model.layers.13.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
142
+ base_model.model.model.layers.13.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
143
+ base_model.model.model.layers.13.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
144
+ base_model.model.model.layers.13.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
145
+ base_model.model.model.layers.13.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
146
+ base_model.model.model.layers.13.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
147
+ base_model.model.model.layers.13.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
148
+ base_model.model.model.layers.13.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
149
+ base_model.model.model.layers.13.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
150
+ base_model.model.model.layers.14.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
151
+ base_model.model.model.layers.14.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
152
+ base_model.model.model.layers.14.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
153
+ base_model.model.model.layers.14.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
154
+ base_model.model.model.layers.14.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
155
+ base_model.model.model.layers.14.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
156
+ base_model.model.model.layers.14.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
157
+ base_model.model.model.layers.14.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
158
+ base_model.model.model.layers.14.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
159
+ base_model.model.model.layers.14.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
160
+ base_model.model.model.layers.15.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
161
+ base_model.model.model.layers.15.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
162
+ base_model.model.model.layers.15.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
163
+ base_model.model.model.layers.15.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
164
+ base_model.model.model.layers.15.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
165
+ base_model.model.model.layers.15.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
166
+ base_model.model.model.layers.15.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
167
+ base_model.model.model.layers.15.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
168
+ base_model.model.model.layers.15.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
169
+ base_model.model.model.layers.15.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
170
+ base_model.model.model.layers.16.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
171
+ base_model.model.model.layers.16.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
172
+ base_model.model.model.layers.16.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
173
+ base_model.model.model.layers.16.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
174
+ base_model.model.model.layers.16.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
175
+ base_model.model.model.layers.16.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
176
+ base_model.model.model.layers.16.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
177
+ base_model.model.model.layers.16.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
178
+ base_model.model.model.layers.16.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
179
+ base_model.model.model.layers.16.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
180
+ base_model.model.model.layers.17.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
181
+ base_model.model.model.layers.17.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
182
+ base_model.model.model.layers.17.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
183
+ base_model.model.model.layers.17.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
184
+ base_model.model.model.layers.17.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
185
+ base_model.model.model.layers.17.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
186
+ base_model.model.model.layers.17.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
187
+ base_model.model.model.layers.17.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
188
+ base_model.model.model.layers.17.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
189
+ base_model.model.model.layers.17.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
190
+ base_model.model.model.layers.18.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
191
+ base_model.model.model.layers.18.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
192
+ base_model.model.model.layers.18.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
193
+ base_model.model.model.layers.18.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
194
+ base_model.model.model.layers.18.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
195
+ base_model.model.model.layers.18.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
196
+ base_model.model.model.layers.18.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
197
+ base_model.model.model.layers.18.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
198
+ base_model.model.model.layers.18.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
199
+ base_model.model.model.layers.18.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
200
+ base_model.model.model.layers.19.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
201
+ base_model.model.model.layers.19.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
202
+ base_model.model.model.layers.19.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
203
+ base_model.model.model.layers.19.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
204
+ base_model.model.model.layers.19.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
205
+ base_model.model.model.layers.19.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
206
+ base_model.model.model.layers.19.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
207
+ base_model.model.model.layers.19.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
208
+ base_model.model.model.layers.19.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
209
+ base_model.model.model.layers.19.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
210
+ base_model.model.model.layers.20.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
211
+ base_model.model.model.layers.20.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
212
+ base_model.model.model.layers.20.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
213
+ base_model.model.model.layers.20.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
214
+ base_model.model.model.layers.20.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
215
+ base_model.model.model.layers.20.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
216
+ base_model.model.model.layers.20.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
217
+ base_model.model.model.layers.20.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
218
+ base_model.model.model.layers.20.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
219
+ base_model.model.model.layers.20.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
220
+ base_model.model.model.layers.21.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
221
+ base_model.model.model.layers.21.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
222
+ base_model.model.model.layers.21.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
223
+ base_model.model.model.layers.21.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
224
+ base_model.model.model.layers.21.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
225
+ base_model.model.model.layers.21.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
226
+ base_model.model.model.layers.21.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
227
+ base_model.model.model.layers.21.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
228
+ base_model.model.model.layers.21.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
229
+ base_model.model.model.layers.21.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
230
+ base_model.model.model.layers.22.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
231
+ base_model.model.model.layers.22.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
232
+ base_model.model.model.layers.22.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
233
+ base_model.model.model.layers.22.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
234
+ base_model.model.model.layers.22.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
235
+ base_model.model.model.layers.22.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
236
+ base_model.model.model.layers.22.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
237
+ base_model.model.model.layers.22.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
238
+ base_model.model.model.layers.22.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
239
+ base_model.model.model.layers.22.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
240
+ base_model.model.model.layers.23.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
241
+ base_model.model.model.layers.23.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
242
+ base_model.model.model.layers.23.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
243
+ base_model.model.model.layers.23.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
244
+ base_model.model.model.layers.23.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
245
+ base_model.model.model.layers.23.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
246
+ base_model.model.model.layers.23.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
247
+ base_model.model.model.layers.23.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
248
+ base_model.model.model.layers.23.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
249
+ base_model.model.model.layers.23.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
250
+ base_model.model.model.layers.24.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
251
+ base_model.model.model.layers.24.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
252
+ base_model.model.model.layers.24.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
253
+ base_model.model.model.layers.24.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
254
+ base_model.model.model.layers.24.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
255
+ base_model.model.model.layers.24.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
256
+ base_model.model.model.layers.24.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
257
+ base_model.model.model.layers.24.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
258
+ base_model.model.model.layers.24.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
259
+ base_model.model.model.layers.24.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
260
+ base_model.model.model.layers.25.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
261
+ base_model.model.model.layers.25.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
262
+ base_model.model.model.layers.25.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
263
+ base_model.model.model.layers.25.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
264
+ base_model.model.model.layers.25.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
265
+ base_model.model.model.layers.25.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
266
+ base_model.model.model.layers.25.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
267
+ base_model.model.model.layers.25.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
268
+ base_model.model.model.layers.25.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
269
+ base_model.model.model.layers.25.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
270
+ base_model.model.model.layers.26.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
271
+ base_model.model.model.layers.26.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
272
+ base_model.model.model.layers.26.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
273
+ base_model.model.model.layers.26.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
274
+ base_model.model.model.layers.26.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
275
+ base_model.model.model.layers.26.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
276
+ base_model.model.model.layers.26.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
277
+ base_model.model.model.layers.26.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
278
+ base_model.model.model.layers.26.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
279
+ base_model.model.model.layers.26.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
280
+ base_model.model.model.layers.27.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
281
+ base_model.model.model.layers.27.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
282
+ base_model.model.model.layers.27.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
283
+ base_model.model.model.layers.27.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
284
+ base_model.model.model.layers.27.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
285
+ base_model.model.model.layers.27.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
286
+ base_model.model.model.layers.27.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
287
+ base_model.model.model.layers.27.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
288
+ base_model.model.model.layers.27.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
289
+ base_model.model.model.layers.27.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
290
+ base_model.model.model.layers.28.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
291
+ base_model.model.model.layers.28.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
292
+ base_model.model.model.layers.28.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
293
+ base_model.model.model.layers.28.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
294
+ base_model.model.model.layers.28.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
295
+ base_model.model.model.layers.28.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
296
+ base_model.model.model.layers.28.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
297
+ base_model.model.model.layers.28.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
298
+ base_model.model.model.layers.28.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
299
+ base_model.model.model.layers.28.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
300
+ base_model.model.model.layers.29.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
301
+ base_model.model.model.layers.29.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
302
+ base_model.model.model.layers.29.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
303
+ base_model.model.model.layers.29.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
304
+ base_model.model.model.layers.29.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
305
+ base_model.model.model.layers.29.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
306
+ base_model.model.model.layers.29.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
307
+ base_model.model.model.layers.29.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
308
+ base_model.model.model.layers.29.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
309
+ base_model.model.model.layers.29.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
310
+ base_model.model.model.layers.30.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
311
+ base_model.model.model.layers.30.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
312
+ base_model.model.model.layers.30.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
313
+ base_model.model.model.layers.30.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
314
+ base_model.model.model.layers.30.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
315
+ base_model.model.model.layers.30.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
316
+ base_model.model.model.layers.30.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
317
+ base_model.model.model.layers.30.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
318
+ base_model.model.model.layers.30.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
319
+ base_model.model.model.layers.30.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
320
+ base_model.model.model.layers.31.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
321
+ base_model.model.model.layers.31.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
322
+ base_model.model.model.layers.31.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
323
+ base_model.model.model.layers.31.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
324
+ base_model.model.model.layers.31.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
325
+ base_model.model.model.layers.31.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
326
+ base_model.model.model.layers.31.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
327
+ base_model.model.model.layers.31.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
328
+ base_model.model.model.layers.31.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
329
+ base_model.model.model.layers.31.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
330
+ base_model.model.model.layers.32.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
331
+ base_model.model.model.layers.32.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
332
+ base_model.model.model.layers.32.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
333
+ base_model.model.model.layers.32.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
334
+ base_model.model.model.layers.32.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
335
+ base_model.model.model.layers.32.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
336
+ base_model.model.model.layers.32.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
337
+ base_model.model.model.layers.32.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
338
+ base_model.model.model.layers.32.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
339
+ base_model.model.model.layers.32.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
340
+ base_model.model.model.layers.33.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
341
+ base_model.model.model.layers.33.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
342
+ base_model.model.model.layers.33.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
343
+ base_model.model.model.layers.33.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
344
+ base_model.model.model.layers.33.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
345
+ base_model.model.model.layers.33.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
346
+ base_model.model.model.layers.33.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
347
+ base_model.model.model.layers.33.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
348
+ base_model.model.model.layers.33.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
349
+ base_model.model.model.layers.33.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
350
+ base_model.model.model.layers.34.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
351
+ base_model.model.model.layers.34.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
352
+ base_model.model.model.layers.34.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
353
+ base_model.model.model.layers.34.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
354
+ base_model.model.model.layers.34.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
355
+ base_model.model.model.layers.34.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
356
+ base_model.model.model.layers.34.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
357
+ base_model.model.model.layers.34.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
358
+ base_model.model.model.layers.34.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
359
+ base_model.model.model.layers.34.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
360
+ base_model.model.model.layers.35.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
361
+ base_model.model.model.layers.35.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
362
+ base_model.model.model.layers.35.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
363
+ base_model.model.model.layers.35.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
364
+ base_model.model.model.layers.35.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
365
+ base_model.model.model.layers.35.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
366
+ base_model.model.model.layers.35.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
367
+ base_model.model.model.layers.35.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
368
+ base_model.model.model.layers.35.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
369
+ base_model.model.model.layers.35.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
370
+ base_model.model.model.layers.36.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
371
+ base_model.model.model.layers.36.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
372
+ base_model.model.model.layers.36.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
373
+ base_model.model.model.layers.36.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
374
+ base_model.model.model.layers.36.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
375
+ base_model.model.model.layers.36.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
376
+ base_model.model.model.layers.36.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
377
+ base_model.model.model.layers.36.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
378
+ base_model.model.model.layers.36.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
379
+ base_model.model.model.layers.36.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
380
+ base_model.model.model.layers.37.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
381
+ base_model.model.model.layers.37.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
382
+ base_model.model.model.layers.37.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
383
+ base_model.model.model.layers.37.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
384
+ base_model.model.model.layers.37.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
385
+ base_model.model.model.layers.37.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
386
+ base_model.model.model.layers.37.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
387
+ base_model.model.model.layers.37.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
388
+ base_model.model.model.layers.37.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
389
+ base_model.model.model.layers.37.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
390
+ base_model.model.model.layers.38.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
391
+ base_model.model.model.layers.38.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
392
+ base_model.model.model.layers.38.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
393
+ base_model.model.model.layers.38.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
394
+ base_model.model.model.layers.38.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
395
+ base_model.model.model.layers.38.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
396
+ base_model.model.model.layers.38.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
397
+ base_model.model.model.layers.38.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
398
+ base_model.model.model.layers.38.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
399
+ base_model.model.model.layers.38.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
400
+ base_model.model.model.layers.39.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
401
+ base_model.model.model.layers.39.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
402
+ base_model.model.model.layers.39.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
403
+ base_model.model.model.layers.39.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
404
+ base_model.model.model.layers.39.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
405
+ base_model.model.model.layers.39.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
406
+ base_model.model.model.layers.39.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
407
+ base_model.model.model.layers.39.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
408
+ base_model.model.model.layers.39.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
409
+ base_model.model.model.layers.39.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/trainer_state.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1500,
3
+ "best_metric": 0.05616725981235504,
4
+ "best_model_checkpoint": "./Llama13B/CMS/t60107d15h52m28,ep=2.0,mlr2.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/checkpoint-1500",
5
+ "epoch": 2.0,
6
+ "eval_steps": 300,
7
+ "global_step": 1766,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.33994334277620397,
14
+ "grad_norm": 0.07389352470636368,
15
+ "learning_rate": 0.0001930248326634556,
16
+ "loss": 0.5654,
17
+ "step": 300
18
+ },
19
+ {
20
+ "epoch": 0.6798866855524079,
21
+ "grad_norm": 0.07417800277471542,
22
+ "learning_rate": 0.0001588090691373661,
23
+ "loss": 0.0754,
24
+ "step": 600
25
+ },
26
+ {
27
+ "epoch": 0.6798866855524079,
28
+ "eval_loss": 0.06705917418003082,
29
+ "eval_runtime": 43.1442,
30
+ "eval_samples_per_second": 23.178,
31
+ "eval_steps_per_second": 0.742,
32
+ "step": 600
33
+ },
34
+ {
35
+ "epoch": 1.0192634560906515,
36
+ "grad_norm": 0.0427674800157547,
37
+ "learning_rate": 0.00010622629205383885,
38
+ "loss": 0.062,
39
+ "step": 900
40
+ },
41
+ {
42
+ "epoch": 1.0192634560906515,
43
+ "eval_loss": 0.060582421720027924,
44
+ "eval_runtime": 42.6357,
45
+ "eval_samples_per_second": 23.455,
46
+ "eval_steps_per_second": 0.751,
47
+ "step": 900
48
+ },
49
+ {
50
+ "epoch": 1.3592067988668555,
51
+ "grad_norm": 0.05241871252655983,
52
+ "learning_rate": 5.1698944307268006e-05,
53
+ "loss": 0.0483,
54
+ "step": 1200
55
+ },
56
+ {
57
+ "epoch": 1.3592067988668555,
58
+ "eval_loss": 0.05789685994386673,
59
+ "eval_runtime": 42.8479,
60
+ "eval_samples_per_second": 23.338,
61
+ "eval_steps_per_second": 0.747,
62
+ "step": 1200
63
+ },
64
+ {
65
+ "epoch": 1.6991501416430594,
66
+ "grad_norm": 0.07206667214632034,
67
+ "learning_rate": 1.225678933385207e-05,
68
+ "loss": 0.046,
69
+ "step": 1500
70
+ },
71
+ {
72
+ "epoch": 1.6991501416430594,
73
+ "eval_loss": 0.05616725981235504,
74
+ "eval_runtime": 42.8072,
75
+ "eval_samples_per_second": 23.361,
76
+ "eval_steps_per_second": 0.748,
77
+ "step": 1500
78
+ },
79
+ {
80
+ "epoch": 2.0,
81
+ "step": 1766,
82
+ "total_flos": 1.1224942823709082e+18,
83
+ "train_loss": 0.14243490050630148,
84
+ "train_runtime": 3020.9682,
85
+ "train_samples_per_second": 9.347,
86
+ "train_steps_per_second": 0.585
87
+ }
88
+ ],
89
+ "logging_steps": 300,
90
+ "max_steps": 1766,
91
+ "num_input_tokens_seen": 0,
92
+ "num_train_epochs": 2,
93
+ "save_steps": 300,
94
+ "stateful_callbacks": {
95
+ "TrainerControl": {
96
+ "args": {
97
+ "should_epoch_stop": false,
98
+ "should_evaluate": false,
99
+ "should_log": false,
100
+ "should_save": true,
101
+ "should_training_stop": true
102
+ },
103
+ "attributes": {}
104
+ }
105
+ },
106
+ "total_flos": 1.1224942823709082e+18,
107
+ "train_batch_size": 8,
108
+ "trial_name": null,
109
+ "trial_params": null
110
+ }
Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "dfurman/llama-13b",
3
+ "bias": "none",
4
+ "col_L": 32,
5
+ "drop_out": 0.0,
6
+ "inference_mode": false,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_unique_blocks_L": 32,
10
+ "num_unique_blocks_R": 32,
11
+ "peft_type": "SAMA",
12
+ "revision": null,
13
+ "row_R": 32,
14
+ "scaling": 1,
15
+ "share_factor_L": 1,
16
+ "share_factor_R": 1,
17
+ "target_modules": [
18
+ "q_proj",
19
+ "k_proj",
20
+ "down_proj",
21
+ "v_proj",
22
+ "up_proj"
23
+ ],
24
+ "target_modules_to_skip": null,
25
+ "task_type": "CAUSAL_LM"
26
+ }
Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": true,
36
+ "model_max_length": 512,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft2_/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "dfurman/llama-13b",
3
+ "bias": "none",
4
+ "col_L": 32,
5
+ "drop_out": 0.0,
6
+ "inference_mode": true,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_unique_blocks_L": 32,
10
+ "num_unique_blocks_R": 32,
11
+ "peft_type": "SAMA",
12
+ "revision": null,
13
+ "row_R": 32,
14
+ "scaling": 1,
15
+ "share_factor_L": 1,
16
+ "share_factor_R": 1,
17
+ "target_modules": [
18
+ "q_proj",
19
+ "k_proj",
20
+ "down_proj",
21
+ "v_proj",
22
+ "up_proj"
23
+ ],
24
+ "target_modules_to_skip": null,
25
+ "task_type": "CAUSAL_LM"
26
+ }
Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft2_/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab7a6902cdb949231f188dd007f69b68b4bdcab0612a9c5025026c153f0b0cfb
3
+ size 351427939
Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/model_parameters_report.txt ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ === GLOBAL STATISTICS ===
2
+ Total Parameters: 13,103,682,560
3
+ Trainable Parameters: 87,818,240
4
+ Trainable Percentage: 0.6702%
5
+
6
+ ================================================================================================================================
7
+ === DETAILED TRAINABLE MATRICES LIST ===
8
+ Layer Name | Shape | Count
9
+ ---------------------------------------------------------------------------------------------------------------------------------------
10
+ base_model.model.model.layers.0.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
11
+ base_model.model.model.layers.0.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
12
+ base_model.model.model.layers.0.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
13
+ base_model.model.model.layers.0.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
14
+ base_model.model.model.layers.0.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
15
+ base_model.model.model.layers.0.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
16
+ base_model.model.model.layers.0.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
17
+ base_model.model.model.layers.0.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
18
+ base_model.model.model.layers.0.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
19
+ base_model.model.model.layers.0.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
20
+ base_model.model.model.layers.1.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
21
+ base_model.model.model.layers.1.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
22
+ base_model.model.model.layers.1.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
23
+ base_model.model.model.layers.1.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
24
+ base_model.model.model.layers.1.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
25
+ base_model.model.model.layers.1.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
26
+ base_model.model.model.layers.1.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
27
+ base_model.model.model.layers.1.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
28
+ base_model.model.model.layers.1.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
29
+ base_model.model.model.layers.1.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
30
+ base_model.model.model.layers.2.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
31
+ base_model.model.model.layers.2.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
32
+ base_model.model.model.layers.2.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
33
+ base_model.model.model.layers.2.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
34
+ base_model.model.model.layers.2.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
35
+ base_model.model.model.layers.2.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
36
+ base_model.model.model.layers.2.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
37
+ base_model.model.model.layers.2.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
38
+ base_model.model.model.layers.2.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
39
+ base_model.model.model.layers.2.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
40
+ base_model.model.model.layers.3.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
41
+ base_model.model.model.layers.3.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
42
+ base_model.model.model.layers.3.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
43
+ base_model.model.model.layers.3.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
44
+ base_model.model.model.layers.3.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
45
+ base_model.model.model.layers.3.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
46
+ base_model.model.model.layers.3.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
47
+ base_model.model.model.layers.3.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
48
+ base_model.model.model.layers.3.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
49
+ base_model.model.model.layers.3.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
50
+ base_model.model.model.layers.4.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
51
+ base_model.model.model.layers.4.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
52
+ base_model.model.model.layers.4.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
53
+ base_model.model.model.layers.4.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
54
+ base_model.model.model.layers.4.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
55
+ base_model.model.model.layers.4.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
56
+ base_model.model.model.layers.4.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
57
+ base_model.model.model.layers.4.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
58
+ base_model.model.model.layers.4.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
59
+ base_model.model.model.layers.4.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
60
+ base_model.model.model.layers.5.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
61
+ base_model.model.model.layers.5.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
62
+ base_model.model.model.layers.5.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
63
+ base_model.model.model.layers.5.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
64
+ base_model.model.model.layers.5.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
65
+ base_model.model.model.layers.5.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
66
+ base_model.model.model.layers.5.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
67
+ base_model.model.model.layers.5.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
68
+ base_model.model.model.layers.5.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
69
+ base_model.model.model.layers.5.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
70
+ base_model.model.model.layers.6.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
71
+ base_model.model.model.layers.6.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
72
+ base_model.model.model.layers.6.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
73
+ base_model.model.model.layers.6.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
74
+ base_model.model.model.layers.6.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
75
+ base_model.model.model.layers.6.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
76
+ base_model.model.model.layers.6.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
77
+ base_model.model.model.layers.6.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
78
+ base_model.model.model.layers.6.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
79
+ base_model.model.model.layers.6.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
80
+ base_model.model.model.layers.7.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
81
+ base_model.model.model.layers.7.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
82
+ base_model.model.model.layers.7.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
83
+ base_model.model.model.layers.7.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
84
+ base_model.model.model.layers.7.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
85
+ base_model.model.model.layers.7.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
86
+ base_model.model.model.layers.7.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
87
+ base_model.model.model.layers.7.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
88
+ base_model.model.model.layers.7.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
89
+ base_model.model.model.layers.7.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
90
+ base_model.model.model.layers.8.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
91
+ base_model.model.model.layers.8.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
92
+ base_model.model.model.layers.8.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
93
+ base_model.model.model.layers.8.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
94
+ base_model.model.model.layers.8.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
95
+ base_model.model.model.layers.8.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
96
+ base_model.model.model.layers.8.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
97
+ base_model.model.model.layers.8.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
98
+ base_model.model.model.layers.8.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
99
+ base_model.model.model.layers.8.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
100
+ base_model.model.model.layers.9.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
101
+ base_model.model.model.layers.9.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
102
+ base_model.model.model.layers.9.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
103
+ base_model.model.model.layers.9.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
104
+ base_model.model.model.layers.9.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
105
+ base_model.model.model.layers.9.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
106
+ base_model.model.model.layers.9.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
107
+ base_model.model.model.layers.9.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
108
+ base_model.model.model.layers.9.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
109
+ base_model.model.model.layers.9.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
110
+ base_model.model.model.layers.10.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
111
+ base_model.model.model.layers.10.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
112
+ base_model.model.model.layers.10.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
113
+ base_model.model.model.layers.10.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
114
+ base_model.model.model.layers.10.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
115
+ base_model.model.model.layers.10.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
116
+ base_model.model.model.layers.10.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
117
+ base_model.model.model.layers.10.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
118
+ base_model.model.model.layers.10.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
119
+ base_model.model.model.layers.10.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
120
+ base_model.model.model.layers.11.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
121
+ base_model.model.model.layers.11.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
122
+ base_model.model.model.layers.11.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
123
+ base_model.model.model.layers.11.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
124
+ base_model.model.model.layers.11.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
125
+ base_model.model.model.layers.11.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
126
+ base_model.model.model.layers.11.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
127
+ base_model.model.model.layers.11.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
128
+ base_model.model.model.layers.11.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
129
+ base_model.model.model.layers.11.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
130
+ base_model.model.model.layers.12.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
131
+ base_model.model.model.layers.12.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
132
+ base_model.model.model.layers.12.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
133
+ base_model.model.model.layers.12.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
134
+ base_model.model.model.layers.12.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
135
+ base_model.model.model.layers.12.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
136
+ base_model.model.model.layers.12.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
137
+ base_model.model.model.layers.12.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
138
+ base_model.model.model.layers.12.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
139
+ base_model.model.model.layers.12.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
140
+ base_model.model.model.layers.13.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
141
+ base_model.model.model.layers.13.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
142
+ base_model.model.model.layers.13.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
143
+ base_model.model.model.layers.13.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
144
+ base_model.model.model.layers.13.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
145
+ base_model.model.model.layers.13.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
146
+ base_model.model.model.layers.13.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
147
+ base_model.model.model.layers.13.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
148
+ base_model.model.model.layers.13.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
149
+ base_model.model.model.layers.13.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
150
+ base_model.model.model.layers.14.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
151
+ base_model.model.model.layers.14.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
152
+ base_model.model.model.layers.14.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
153
+ base_model.model.model.layers.14.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
154
+ base_model.model.model.layers.14.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
155
+ base_model.model.model.layers.14.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
156
+ base_model.model.model.layers.14.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
157
+ base_model.model.model.layers.14.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
158
+ base_model.model.model.layers.14.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
159
+ base_model.model.model.layers.14.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
160
+ base_model.model.model.layers.15.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
161
+ base_model.model.model.layers.15.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
162
+ base_model.model.model.layers.15.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
163
+ base_model.model.model.layers.15.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
164
+ base_model.model.model.layers.15.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
165
+ base_model.model.model.layers.15.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
166
+ base_model.model.model.layers.15.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
167
+ base_model.model.model.layers.15.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
168
+ base_model.model.model.layers.15.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
169
+ base_model.model.model.layers.15.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
170
+ base_model.model.model.layers.16.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
171
+ base_model.model.model.layers.16.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
172
+ base_model.model.model.layers.16.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
173
+ base_model.model.model.layers.16.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
174
+ base_model.model.model.layers.16.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
175
+ base_model.model.model.layers.16.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
176
+ base_model.model.model.layers.16.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
177
+ base_model.model.model.layers.16.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
178
+ base_model.model.model.layers.16.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
179
+ base_model.model.model.layers.16.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
180
+ base_model.model.model.layers.17.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
181
+ base_model.model.model.layers.17.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
182
+ base_model.model.model.layers.17.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
183
+ base_model.model.model.layers.17.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
184
+ base_model.model.model.layers.17.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
185
+ base_model.model.model.layers.17.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
186
+ base_model.model.model.layers.17.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
187
+ base_model.model.model.layers.17.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
188
+ base_model.model.model.layers.17.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
189
+ base_model.model.model.layers.17.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
190
+ base_model.model.model.layers.18.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
191
+ base_model.model.model.layers.18.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
192
+ base_model.model.model.layers.18.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
193
+ base_model.model.model.layers.18.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
194
+ base_model.model.model.layers.18.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
195
+ base_model.model.model.layers.18.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
196
+ base_model.model.model.layers.18.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
197
+ base_model.model.model.layers.18.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
198
+ base_model.model.model.layers.18.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
199
+ base_model.model.model.layers.18.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
200
+ base_model.model.model.layers.19.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
201
+ base_model.model.model.layers.19.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
202
+ base_model.model.model.layers.19.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
203
+ base_model.model.model.layers.19.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
204
+ base_model.model.model.layers.19.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
205
+ base_model.model.model.layers.19.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
206
+ base_model.model.model.layers.19.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
207
+ base_model.model.model.layers.19.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
208
+ base_model.model.model.layers.19.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
209
+ base_model.model.model.layers.19.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
210
+ base_model.model.model.layers.20.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
211
+ base_model.model.model.layers.20.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
212
+ base_model.model.model.layers.20.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
213
+ base_model.model.model.layers.20.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
214
+ base_model.model.model.layers.20.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
215
+ base_model.model.model.layers.20.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
216
+ base_model.model.model.layers.20.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
217
+ base_model.model.model.layers.20.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
218
+ base_model.model.model.layers.20.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
219
+ base_model.model.model.layers.20.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
220
+ base_model.model.model.layers.21.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
221
+ base_model.model.model.layers.21.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
222
+ base_model.model.model.layers.21.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
223
+ base_model.model.model.layers.21.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
224
+ base_model.model.model.layers.21.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
225
+ base_model.model.model.layers.21.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
226
+ base_model.model.model.layers.21.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
227
+ base_model.model.model.layers.21.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
228
+ base_model.model.model.layers.21.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
229
+ base_model.model.model.layers.21.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
230
+ base_model.model.model.layers.22.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
231
+ base_model.model.model.layers.22.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
232
+ base_model.model.model.layers.22.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
233
+ base_model.model.model.layers.22.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
234
+ base_model.model.model.layers.22.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
235
+ base_model.model.model.layers.22.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
236
+ base_model.model.model.layers.22.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
237
+ base_model.model.model.layers.22.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
238
+ base_model.model.model.layers.22.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
239
+ base_model.model.model.layers.22.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
240
+ base_model.model.model.layers.23.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
241
+ base_model.model.model.layers.23.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
242
+ base_model.model.model.layers.23.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
243
+ base_model.model.model.layers.23.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
244
+ base_model.model.model.layers.23.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
245
+ base_model.model.model.layers.23.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
246
+ base_model.model.model.layers.23.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
247
+ base_model.model.model.layers.23.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
248
+ base_model.model.model.layers.23.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
249
+ base_model.model.model.layers.23.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
250
+ base_model.model.model.layers.24.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
251
+ base_model.model.model.layers.24.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
252
+ base_model.model.model.layers.24.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
253
+ base_model.model.model.layers.24.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
254
+ base_model.model.model.layers.24.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
255
+ base_model.model.model.layers.24.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
256
+ base_model.model.model.layers.24.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
257
+ base_model.model.model.layers.24.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
258
+ base_model.model.model.layers.24.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
259
+ base_model.model.model.layers.24.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
260
+ base_model.model.model.layers.25.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
261
+ base_model.model.model.layers.25.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
262
+ base_model.model.model.layers.25.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
263
+ base_model.model.model.layers.25.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
264
+ base_model.model.model.layers.25.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
265
+ base_model.model.model.layers.25.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
266
+ base_model.model.model.layers.25.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
267
+ base_model.model.model.layers.25.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
268
+ base_model.model.model.layers.25.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
269
+ base_model.model.model.layers.25.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
270
+ base_model.model.model.layers.26.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
271
+ base_model.model.model.layers.26.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
272
+ base_model.model.model.layers.26.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
273
+ base_model.model.model.layers.26.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
274
+ base_model.model.model.layers.26.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
275
+ base_model.model.model.layers.26.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
276
+ base_model.model.model.layers.26.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
277
+ base_model.model.model.layers.26.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
278
+ base_model.model.model.layers.26.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
279
+ base_model.model.model.layers.26.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
280
+ base_model.model.model.layers.27.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
281
+ base_model.model.model.layers.27.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
282
+ base_model.model.model.layers.27.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
283
+ base_model.model.model.layers.27.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
284
+ base_model.model.model.layers.27.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
285
+ base_model.model.model.layers.27.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
286
+ base_model.model.model.layers.27.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
287
+ base_model.model.model.layers.27.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
288
+ base_model.model.model.layers.27.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
289
+ base_model.model.model.layers.27.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
290
+ base_model.model.model.layers.28.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
291
+ base_model.model.model.layers.28.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
292
+ base_model.model.model.layers.28.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
293
+ base_model.model.model.layers.28.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
294
+ base_model.model.model.layers.28.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
295
+ base_model.model.model.layers.28.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
296
+ base_model.model.model.layers.28.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
297
+ base_model.model.model.layers.28.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
298
+ base_model.model.model.layers.28.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
299
+ base_model.model.model.layers.28.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
300
+ base_model.model.model.layers.29.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
301
+ base_model.model.model.layers.29.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
302
+ base_model.model.model.layers.29.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
303
+ base_model.model.model.layers.29.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
304
+ base_model.model.model.layers.29.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
305
+ base_model.model.model.layers.29.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
306
+ base_model.model.model.layers.29.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
307
+ base_model.model.model.layers.29.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
308
+ base_model.model.model.layers.29.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
309
+ base_model.model.model.layers.29.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
310
+ base_model.model.model.layers.30.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
311
+ base_model.model.model.layers.30.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
312
+ base_model.model.model.layers.30.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
313
+ base_model.model.model.layers.30.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
314
+ base_model.model.model.layers.30.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
315
+ base_model.model.model.layers.30.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
316
+ base_model.model.model.layers.30.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
317
+ base_model.model.model.layers.30.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
318
+ base_model.model.model.layers.30.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
319
+ base_model.model.model.layers.30.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
320
+ base_model.model.model.layers.31.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
321
+ base_model.model.model.layers.31.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
322
+ base_model.model.model.layers.31.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
323
+ base_model.model.model.layers.31.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
324
+ base_model.model.model.layers.31.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
325
+ base_model.model.model.layers.31.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
326
+ base_model.model.model.layers.31.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
327
+ base_model.model.model.layers.31.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
328
+ base_model.model.model.layers.31.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
329
+ base_model.model.model.layers.31.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
330
+ base_model.model.model.layers.32.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
331
+ base_model.model.model.layers.32.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
332
+ base_model.model.model.layers.32.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
333
+ base_model.model.model.layers.32.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
334
+ base_model.model.model.layers.32.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
335
+ base_model.model.model.layers.32.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
336
+ base_model.model.model.layers.32.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
337
+ base_model.model.model.layers.32.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
338
+ base_model.model.model.layers.32.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
339
+ base_model.model.model.layers.32.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
340
+ base_model.model.model.layers.33.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
341
+ base_model.model.model.layers.33.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
342
+ base_model.model.model.layers.33.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
343
+ base_model.model.model.layers.33.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
344
+ base_model.model.model.layers.33.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
345
+ base_model.model.model.layers.33.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
346
+ base_model.model.model.layers.33.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
347
+ base_model.model.model.layers.33.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
348
+ base_model.model.model.layers.33.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
349
+ base_model.model.model.layers.33.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
350
+ base_model.model.model.layers.34.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
351
+ base_model.model.model.layers.34.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
352
+ base_model.model.model.layers.34.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
353
+ base_model.model.model.layers.34.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
354
+ base_model.model.model.layers.34.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
355
+ base_model.model.model.layers.34.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
356
+ base_model.model.model.layers.34.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
357
+ base_model.model.model.layers.34.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
358
+ base_model.model.model.layers.34.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
359
+ base_model.model.model.layers.34.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
360
+ base_model.model.model.layers.35.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
361
+ base_model.model.model.layers.35.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
362
+ base_model.model.model.layers.35.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
363
+ base_model.model.model.layers.35.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
364
+ base_model.model.model.layers.35.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
365
+ base_model.model.model.layers.35.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
366
+ base_model.model.model.layers.35.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
367
+ base_model.model.model.layers.35.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
368
+ base_model.model.model.layers.35.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
369
+ base_model.model.model.layers.35.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
370
+ base_model.model.model.layers.36.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
371
+ base_model.model.model.layers.36.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
372
+ base_model.model.model.layers.36.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
373
+ base_model.model.model.layers.36.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
374
+ base_model.model.model.layers.36.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
375
+ base_model.model.model.layers.36.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
376
+ base_model.model.model.layers.36.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
377
+ base_model.model.model.layers.36.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
378
+ base_model.model.model.layers.36.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
379
+ base_model.model.model.layers.36.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
380
+ base_model.model.model.layers.37.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
381
+ base_model.model.model.layers.37.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
382
+ base_model.model.model.layers.37.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
383
+ base_model.model.model.layers.37.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
384
+ base_model.model.model.layers.37.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
385
+ base_model.model.model.layers.37.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
386
+ base_model.model.model.layers.37.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
387
+ base_model.model.model.layers.37.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
388
+ base_model.model.model.layers.37.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
389
+ base_model.model.model.layers.37.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
390
+ base_model.model.model.layers.38.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
391
+ base_model.model.model.layers.38.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
392
+ base_model.model.model.layers.38.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
393
+ base_model.model.model.layers.38.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
394
+ base_model.model.model.layers.38.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
395
+ base_model.model.model.layers.38.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
396
+ base_model.model.model.layers.38.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
397
+ base_model.model.model.layers.38.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
398
+ base_model.model.model.layers.38.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
399
+ base_model.model.model.layers.38.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
400
+ base_model.model.model.layers.39.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
401
+ base_model.model.model.layers.39.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
402
+ base_model.model.model.layers.39.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
403
+ base_model.model.model.layers.39.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
404
+ base_model.model.model.layers.39.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
405
+ base_model.model.model.layers.39.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
406
+ base_model.model.model.layers.39.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
407
+ base_model.model.model.layers.39.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
408
+ base_model.model.model.layers.39.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
409
+ base_model.model.model.layers.39.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
Llama13B/CMS/t60107d17h56m56,ep=2.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/trainer_state.json ADDED
@@ -0,0 +1,528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 50,
7
+ "global_step": 1766,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.056657223796033995,
14
+ "grad_norm": 0.03540075570344925,
15
+ "learning_rate": 0.000245,
16
+ "loss": 2.0119,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.11331444759206799,
21
+ "grad_norm": 0.08595774322748184,
22
+ "learning_rate": 0.000495,
23
+ "loss": 0.1096,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.16997167138810199,
28
+ "grad_norm": 0.0386439748108387,
29
+ "learning_rate": 0.0004989309807668432,
30
+ "loss": 0.0948,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.22662889518413598,
35
+ "grad_norm": 0.08770483732223511,
36
+ "learning_rate": 0.000495645795847958,
37
+ "loss": 0.0867,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.28328611898017,
42
+ "grad_norm": 0.08420336246490479,
43
+ "learning_rate": 0.0004901732526860121,
44
+ "loss": 0.075,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.33994334277620397,
49
+ "grad_norm": 0.1704639345407486,
50
+ "learning_rate": 0.000482562081658639,
51
+ "loss": 0.0667,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.33994334277620397,
56
+ "eval_loss": 0.07056522369384766,
57
+ "eval_runtime": 43.0303,
58
+ "eval_samples_per_second": 23.239,
59
+ "eval_steps_per_second": 0.744,
60
+ "step": 300
61
+ },
62
+ {
63
+ "epoch": 0.39660056657223797,
64
+ "grad_norm": 0.09642501175403595,
65
+ "learning_rate": 0.00047288005660104163,
66
+ "loss": 0.0671,
67
+ "step": 350
68
+ },
69
+ {
70
+ "epoch": 0.39660056657223797,
71
+ "eval_loss": 0.0681549534201622,
72
+ "eval_runtime": 42.7781,
73
+ "eval_samples_per_second": 23.376,
74
+ "eval_steps_per_second": 0.748,
75
+ "step": 350
76
+ },
77
+ {
78
+ "epoch": 0.45325779036827196,
79
+ "grad_norm": 0.05195102468132973,
80
+ "learning_rate": 0.0004612133913124268,
81
+ "loss": 0.0604,
82
+ "step": 400
83
+ },
84
+ {
85
+ "epoch": 0.45325779036827196,
86
+ "eval_loss": 0.06310559809207916,
87
+ "eval_runtime": 42.7964,
88
+ "eval_samples_per_second": 23.366,
89
+ "eval_steps_per_second": 0.748,
90
+ "step": 400
91
+ },
92
+ {
93
+ "epoch": 0.509915014164306,
94
+ "grad_norm": 0.0997244343161583,
95
+ "learning_rate": 0.0004476659718633865,
96
+ "loss": 0.0596,
97
+ "step": 450
98
+ },
99
+ {
100
+ "epoch": 0.509915014164306,
101
+ "eval_loss": 0.06353795528411865,
102
+ "eval_runtime": 42.8058,
103
+ "eval_samples_per_second": 23.361,
104
+ "eval_steps_per_second": 0.748,
105
+ "step": 450
106
+ },
107
+ {
108
+ "epoch": 0.56657223796034,
109
+ "grad_norm": 0.03704178333282471,
110
+ "learning_rate": 0.0004323584315401604,
111
+ "loss": 0.059,
112
+ "step": 500
113
+ },
114
+ {
115
+ "epoch": 0.56657223796034,
116
+ "eval_loss": 0.06257344037294388,
117
+ "eval_runtime": 42.7483,
118
+ "eval_samples_per_second": 23.393,
119
+ "eval_steps_per_second": 0.749,
120
+ "step": 500
121
+ },
122
+ {
123
+ "epoch": 0.623229461756374,
124
+ "grad_norm": 0.10769298672676086,
125
+ "learning_rate": 0.000415427076662958,
126
+ "loss": 0.0575,
127
+ "step": 550
128
+ },
129
+ {
130
+ "epoch": 0.623229461756374,
131
+ "eval_loss": 0.05993322283029556,
132
+ "eval_runtime": 42.7557,
133
+ "eval_samples_per_second": 23.389,
134
+ "eval_steps_per_second": 0.748,
135
+ "step": 550
136
+ },
137
+ {
138
+ "epoch": 0.6798866855524079,
139
+ "grad_norm": 0.08261393010616302,
140
+ "learning_rate": 0.0003970226728434152,
141
+ "loss": 0.0611,
142
+ "step": 600
143
+ },
144
+ {
145
+ "epoch": 0.6798866855524079,
146
+ "eval_loss": 0.05639559403061867,
147
+ "eval_runtime": 42.7842,
148
+ "eval_samples_per_second": 23.373,
149
+ "eval_steps_per_second": 0.748,
150
+ "step": 600
151
+ },
152
+ {
153
+ "epoch": 0.7365439093484419,
154
+ "grad_norm": 0.03413009271025658,
155
+ "learning_rate": 0.00037730910248898127,
156
+ "loss": 0.0566,
157
+ "step": 650
158
+ },
159
+ {
160
+ "epoch": 0.7365439093484419,
161
+ "eval_loss": 0.055478040128946304,
162
+ "eval_runtime": 42.7805,
163
+ "eval_samples_per_second": 23.375,
164
+ "eval_steps_per_second": 0.748,
165
+ "step": 650
166
+ },
167
+ {
168
+ "epoch": 0.7932011331444759,
169
+ "grad_norm": 0.14471058547496796,
170
+ "learning_rate": 0.00035646190550851575,
171
+ "loss": 0.0549,
172
+ "step": 700
173
+ },
174
+ {
175
+ "epoch": 0.7932011331444759,
176
+ "eval_loss": 0.05561484023928642,
177
+ "eval_runtime": 42.7721,
178
+ "eval_samples_per_second": 23.38,
179
+ "eval_steps_per_second": 0.748,
180
+ "step": 700
181
+ },
182
+ {
183
+ "epoch": 0.8498583569405099,
184
+ "grad_norm": 0.08413548767566681,
185
+ "learning_rate": 0.0003346667162134164,
186
+ "loss": 0.0555,
187
+ "step": 750
188
+ },
189
+ {
190
+ "epoch": 0.8498583569405099,
191
+ "eval_loss": 0.053136855363845825,
192
+ "eval_runtime": 42.7831,
193
+ "eval_samples_per_second": 23.374,
194
+ "eval_steps_per_second": 0.748,
195
+ "step": 750
196
+ },
197
+ {
198
+ "epoch": 0.9065155807365439,
199
+ "grad_norm": 0.06495863944292068,
200
+ "learning_rate": 0.00031211761033292194,
201
+ "loss": 0.0516,
202
+ "step": 800
203
+ },
204
+ {
205
+ "epoch": 0.9065155807365439,
206
+ "eval_loss": 0.051497943699359894,
207
+ "eval_runtime": 42.8012,
208
+ "eval_samples_per_second": 23.364,
209
+ "eval_steps_per_second": 0.748,
210
+ "step": 800
211
+ },
212
+ {
213
+ "epoch": 0.9631728045325779,
214
+ "grad_norm": 0.06451834738254547,
215
+ "learning_rate": 0.0002890153768626315,
216
+ "loss": 0.0533,
217
+ "step": 850
218
+ },
219
+ {
220
+ "epoch": 0.9631728045325779,
221
+ "eval_loss": 0.0497167594730854,
222
+ "eval_runtime": 42.7769,
223
+ "eval_samples_per_second": 23.377,
224
+ "eval_steps_per_second": 0.748,
225
+ "step": 850
226
+ },
227
+ {
228
+ "epoch": 1.0192634560906515,
229
+ "grad_norm": 0.023159746080636978,
230
+ "learning_rate": 0.0002655657301345971,
231
+ "loss": 0.0446,
232
+ "step": 900
233
+ },
234
+ {
235
+ "epoch": 1.0192634560906515,
236
+ "eval_loss": 0.05106104165315628,
237
+ "eval_runtime": 42.7088,
238
+ "eval_samples_per_second": 23.414,
239
+ "eval_steps_per_second": 0.749,
240
+ "step": 900
241
+ },
242
+ {
243
+ "epoch": 1.0759206798866856,
244
+ "grad_norm": 0.0639742985367775,
245
+ "learning_rate": 0.00024197747802965556,
246
+ "loss": 0.0309,
247
+ "step": 950
248
+ },
249
+ {
250
+ "epoch": 1.0759206798866856,
251
+ "eval_loss": 0.04969682916998863,
252
+ "eval_runtime": 42.781,
253
+ "eval_samples_per_second": 23.375,
254
+ "eval_steps_per_second": 0.748,
255
+ "step": 950
256
+ },
257
+ {
258
+ "epoch": 1.1325779036827195,
259
+ "grad_norm": 0.048771053552627563,
260
+ "learning_rate": 0.00021846066264318665,
261
+ "loss": 0.0318,
262
+ "step": 1000
263
+ },
264
+ {
265
+ "epoch": 1.1325779036827195,
266
+ "eval_loss": 0.05130666494369507,
267
+ "eval_runtime": 42.7991,
268
+ "eval_samples_per_second": 23.365,
269
+ "eval_steps_per_second": 0.748,
270
+ "step": 1000
271
+ },
272
+ {
273
+ "epoch": 1.1892351274787536,
274
+ "grad_norm": 0.07568787038326263,
275
+ "learning_rate": 0.00019522468996078257,
276
+ "loss": 0.0333,
277
+ "step": 1050
278
+ },
279
+ {
280
+ "epoch": 1.1892351274787536,
281
+ "eval_loss": 0.05067039653658867,
282
+ "eval_runtime": 42.8009,
283
+ "eval_samples_per_second": 23.364,
284
+ "eval_steps_per_second": 0.748,
285
+ "step": 1050
286
+ },
287
+ {
288
+ "epoch": 1.2458923512747875,
289
+ "grad_norm": 0.08877400308847427,
290
+ "learning_rate": 0.00017247646519817056,
291
+ "loss": 0.0333,
292
+ "step": 1100
293
+ },
294
+ {
295
+ "epoch": 1.2458923512747875,
296
+ "eval_loss": 0.05075615644454956,
297
+ "eval_runtime": 42.7716,
298
+ "eval_samples_per_second": 23.38,
299
+ "eval_steps_per_second": 0.748,
300
+ "step": 1100
301
+ },
302
+ {
303
+ "epoch": 1.3025495750708216,
304
+ "grad_norm": 0.04937183856964111,
305
+ "learning_rate": 0.00015041855040929373,
306
+ "loss": 0.0326,
307
+ "step": 1150
308
+ },
309
+ {
310
+ "epoch": 1.3025495750708216,
311
+ "eval_loss": 0.05133920907974243,
312
+ "eval_runtime": 42.7949,
313
+ "eval_samples_per_second": 23.367,
314
+ "eval_steps_per_second": 0.748,
315
+ "step": 1150
316
+ },
317
+ {
318
+ "epoch": 1.3592067988668555,
319
+ "grad_norm": 0.09826551377773285,
320
+ "learning_rate": 0.00012924736076817,
321
+ "loss": 0.0283,
322
+ "step": 1200
323
+ },
324
+ {
325
+ "epoch": 1.3592067988668555,
326
+ "eval_loss": 0.05426332727074623,
327
+ "eval_runtime": 42.7975,
328
+ "eval_samples_per_second": 23.366,
329
+ "eval_steps_per_second": 0.748,
330
+ "step": 1200
331
+ },
332
+ {
333
+ "epoch": 1.4158640226628896,
334
+ "grad_norm": 0.08305229991674423,
335
+ "learning_rate": 0.00010915141558577307,
336
+ "loss": 0.0342,
337
+ "step": 1250
338
+ },
339
+ {
340
+ "epoch": 1.4158640226628896,
341
+ "eval_loss": 0.050987694412469864,
342
+ "eval_runtime": 42.8194,
343
+ "eval_samples_per_second": 23.354,
344
+ "eval_steps_per_second": 0.747,
345
+ "step": 1250
346
+ },
347
+ {
348
+ "epoch": 1.4725212464589235,
349
+ "grad_norm": 0.04723089560866356,
350
+ "learning_rate": 9.030965963579688e-05,
351
+ "loss": 0.0293,
352
+ "step": 1300
353
+ },
354
+ {
355
+ "epoch": 1.4725212464589235,
356
+ "eval_loss": 0.04957663640379906,
357
+ "eval_runtime": 42.8181,
358
+ "eval_samples_per_second": 23.355,
359
+ "eval_steps_per_second": 0.747,
360
+ "step": 1300
361
+ },
362
+ {
363
+ "epoch": 1.5291784702549576,
364
+ "grad_norm": 0.03975960984826088,
365
+ "learning_rate": 7.288986973708988e-05,
366
+ "loss": 0.0317,
367
+ "step": 1350
368
+ },
369
+ {
370
+ "epoch": 1.5291784702549576,
371
+ "eval_loss": 0.0498320609331131,
372
+ "eval_runtime": 42.7964,
373
+ "eval_samples_per_second": 23.366,
374
+ "eval_steps_per_second": 0.748,
375
+ "step": 1350
376
+ },
377
+ {
378
+ "epoch": 1.5858356940509915,
379
+ "grad_norm": 0.10585972666740417,
380
+ "learning_rate": 5.704716078138403e-05,
381
+ "loss": 0.0329,
382
+ "step": 1400
383
+ },
384
+ {
385
+ "epoch": 1.5858356940509915,
386
+ "eval_loss": 0.0496598519384861,
387
+ "eval_runtime": 42.8107,
388
+ "eval_samples_per_second": 23.359,
389
+ "eval_steps_per_second": 0.747,
390
+ "step": 1400
391
+ },
392
+ {
393
+ "epoch": 1.6424929178470253,
394
+ "grad_norm": 0.09209132194519043,
395
+ "learning_rate": 4.2922604509423035e-05,
396
+ "loss": 0.0245,
397
+ "step": 1450
398
+ },
399
+ {
400
+ "epoch": 1.6424929178470253,
401
+ "eval_loss": 0.050136879086494446,
402
+ "eval_runtime": 42.7941,
403
+ "eval_samples_per_second": 23.368,
404
+ "eval_steps_per_second": 0.748,
405
+ "step": 1450
406
+ },
407
+ {
408
+ "epoch": 1.6991501416430594,
409
+ "grad_norm": 0.05146324634552002,
410
+ "learning_rate": 3.064197333463017e-05,
411
+ "loss": 0.0299,
412
+ "step": 1500
413
+ },
414
+ {
415
+ "epoch": 1.6991501416430594,
416
+ "eval_loss": 0.05017969012260437,
417
+ "eval_runtime": 42.8143,
418
+ "eval_samples_per_second": 23.357,
419
+ "eval_steps_per_second": 0.747,
420
+ "step": 1500
421
+ },
422
+ {
423
+ "epoch": 1.7558073654390935,
424
+ "grad_norm": 0.06396601349115372,
425
+ "learning_rate": 2.0314620399962458e-05,
426
+ "loss": 0.0305,
427
+ "step": 1550
428
+ },
429
+ {
430
+ "epoch": 1.7558073654390935,
431
+ "eval_loss": 0.049846261739730835,
432
+ "eval_runtime": 42.801,
433
+ "eval_samples_per_second": 23.364,
434
+ "eval_steps_per_second": 0.748,
435
+ "step": 1550
436
+ },
437
+ {
438
+ "epoch": 1.8124645892351274,
439
+ "grad_norm": 0.07966850697994232,
440
+ "learning_rate": 1.2032505840510632e-05,
441
+ "loss": 0.0306,
442
+ "step": 1600
443
+ },
444
+ {
445
+ "epoch": 1.8124645892351274,
446
+ "eval_loss": 0.04927060008049011,
447
+ "eval_runtime": 42.7981,
448
+ "eval_samples_per_second": 23.366,
449
+ "eval_steps_per_second": 0.748,
450
+ "step": 1600
451
+ },
452
+ {
453
+ "epoch": 1.8691218130311613,
454
+ "grad_norm": 0.03877720236778259,
455
+ "learning_rate": 5.869377922509161e-06,
456
+ "loss": 0.0299,
457
+ "step": 1650
458
+ },
459
+ {
460
+ "epoch": 1.8691218130311613,
461
+ "eval_loss": 0.04900398850440979,
462
+ "eval_runtime": 42.7902,
463
+ "eval_samples_per_second": 23.37,
464
+ "eval_steps_per_second": 0.748,
465
+ "step": 1650
466
+ },
467
+ {
468
+ "epoch": 1.9257790368271954,
469
+ "grad_norm": 0.016926735639572144,
470
+ "learning_rate": 1.8801163503225082e-06,
471
+ "loss": 0.0321,
472
+ "step": 1700
473
+ },
474
+ {
475
+ "epoch": 1.9257790368271954,
476
+ "eval_loss": 0.04912006855010986,
477
+ "eval_runtime": 42.8004,
478
+ "eval_samples_per_second": 23.364,
479
+ "eval_steps_per_second": 0.748,
480
+ "step": 1700
481
+ },
482
+ {
483
+ "epoch": 1.9824362606232295,
484
+ "grad_norm": 0.09555891156196594,
485
+ "learning_rate": 1.0024358894342167e-07,
486
+ "loss": 0.0307,
487
+ "step": 1750
488
+ },
489
+ {
490
+ "epoch": 1.9824362606232295,
491
+ "eval_loss": 0.04891112446784973,
492
+ "eval_runtime": 42.7732,
493
+ "eval_samples_per_second": 23.379,
494
+ "eval_steps_per_second": 0.748,
495
+ "step": 1750
496
+ },
497
+ {
498
+ "epoch": 2.0,
499
+ "step": 1766,
500
+ "total_flos": 1.1224942823709082e+18,
501
+ "train_loss": 0.10363658229870003,
502
+ "train_runtime": 2902.2639,
503
+ "train_samples_per_second": 9.73,
504
+ "train_steps_per_second": 0.608
505
+ }
506
+ ],
507
+ "logging_steps": 50,
508
+ "max_steps": 1766,
509
+ "num_input_tokens_seen": 0,
510
+ "num_train_epochs": 2,
511
+ "save_steps": 50.0,
512
+ "stateful_callbacks": {
513
+ "TrainerControl": {
514
+ "args": {
515
+ "should_epoch_stop": false,
516
+ "should_evaluate": false,
517
+ "should_log": false,
518
+ "should_save": false,
519
+ "should_training_stop": false
520
+ },
521
+ "attributes": {}
522
+ }
523
+ },
524
+ "total_flos": 1.1224942823709082e+18,
525
+ "train_batch_size": 8,
526
+ "trial_name": null,
527
+ "trial_params": null
528
+ }
Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "dfurman/llama-13b",
3
+ "bias": "none",
4
+ "col_L": 32,
5
+ "drop_out": 0.0,
6
+ "inference_mode": false,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_unique_blocks_L": 32,
10
+ "num_unique_blocks_R": 32,
11
+ "peft_type": "SAMA",
12
+ "revision": null,
13
+ "row_R": 32,
14
+ "scaling": 1,
15
+ "share_factor_L": 1,
16
+ "share_factor_R": 1,
17
+ "target_modules": [
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj"
23
+ ],
24
+ "target_modules_to_skip": null,
25
+ "task_type": "CAUSAL_LM"
26
+ }
Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": true,
36
+ "model_max_length": 512,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft2_/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "dfurman/llama-13b",
3
+ "bias": "none",
4
+ "col_L": 32,
5
+ "drop_out": 0.0,
6
+ "inference_mode": true,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_unique_blocks_L": 32,
10
+ "num_unique_blocks_R": 32,
11
+ "peft_type": "SAMA",
12
+ "revision": null,
13
+ "row_R": 32,
14
+ "scaling": 1,
15
+ "share_factor_L": 1,
16
+ "share_factor_R": 1,
17
+ "target_modules": [
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj"
23
+ ],
24
+ "target_modules_to_skip": null,
25
+ "task_type": "CAUSAL_LM"
26
+ }
Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/ft2_/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63407e24c9c8aa2349eff68dd62b89733eea72c3b8e4de0a7bf654970c28ba41
3
+ size 351427939
Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/model_parameters_report.txt ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ === GLOBAL STATISTICS ===
2
+ Total Parameters: 13,103,682,560
3
+ Trainable Parameters: 87,818,240
4
+ Trainable Percentage: 0.6702%
5
+
6
+ ================================================================================================================================
7
+ === DETAILED TRAINABLE MATRICES LIST ===
8
+ Layer Name | Shape | Count
9
+ ---------------------------------------------------------------------------------------------------------------------------------------
10
+ base_model.model.model.layers.0.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
11
+ base_model.model.model.layers.0.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
12
+ base_model.model.model.layers.0.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
13
+ base_model.model.model.layers.0.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
14
+ base_model.model.model.layers.0.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
15
+ base_model.model.model.layers.0.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
16
+ base_model.model.model.layers.0.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
17
+ base_model.model.model.layers.0.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
18
+ base_model.model.model.layers.0.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
19
+ base_model.model.model.layers.0.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
20
+ base_model.model.model.layers.1.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
21
+ base_model.model.model.layers.1.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
22
+ base_model.model.model.layers.1.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
23
+ base_model.model.model.layers.1.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
24
+ base_model.model.model.layers.1.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
25
+ base_model.model.model.layers.1.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
26
+ base_model.model.model.layers.1.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
27
+ base_model.model.model.layers.1.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
28
+ base_model.model.model.layers.1.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
29
+ base_model.model.model.layers.1.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
30
+ base_model.model.model.layers.2.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
31
+ base_model.model.model.layers.2.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
32
+ base_model.model.model.layers.2.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
33
+ base_model.model.model.layers.2.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
34
+ base_model.model.model.layers.2.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
35
+ base_model.model.model.layers.2.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
36
+ base_model.model.model.layers.2.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
37
+ base_model.model.model.layers.2.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
38
+ base_model.model.model.layers.2.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
39
+ base_model.model.model.layers.2.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
40
+ base_model.model.model.layers.3.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
41
+ base_model.model.model.layers.3.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
42
+ base_model.model.model.layers.3.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
43
+ base_model.model.model.layers.3.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
44
+ base_model.model.model.layers.3.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
45
+ base_model.model.model.layers.3.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
46
+ base_model.model.model.layers.3.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
47
+ base_model.model.model.layers.3.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
48
+ base_model.model.model.layers.3.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
49
+ base_model.model.model.layers.3.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
50
+ base_model.model.model.layers.4.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
51
+ base_model.model.model.layers.4.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
52
+ base_model.model.model.layers.4.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
53
+ base_model.model.model.layers.4.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
54
+ base_model.model.model.layers.4.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
55
+ base_model.model.model.layers.4.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
56
+ base_model.model.model.layers.4.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
57
+ base_model.model.model.layers.4.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
58
+ base_model.model.model.layers.4.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
59
+ base_model.model.model.layers.4.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
60
+ base_model.model.model.layers.5.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
61
+ base_model.model.model.layers.5.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
62
+ base_model.model.model.layers.5.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
63
+ base_model.model.model.layers.5.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
64
+ base_model.model.model.layers.5.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
65
+ base_model.model.model.layers.5.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
66
+ base_model.model.model.layers.5.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
67
+ base_model.model.model.layers.5.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
68
+ base_model.model.model.layers.5.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
69
+ base_model.model.model.layers.5.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
70
+ base_model.model.model.layers.6.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
71
+ base_model.model.model.layers.6.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
72
+ base_model.model.model.layers.6.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
73
+ base_model.model.model.layers.6.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
74
+ base_model.model.model.layers.6.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
75
+ base_model.model.model.layers.6.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
76
+ base_model.model.model.layers.6.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
77
+ base_model.model.model.layers.6.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
78
+ base_model.model.model.layers.6.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
79
+ base_model.model.model.layers.6.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
80
+ base_model.model.model.layers.7.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
81
+ base_model.model.model.layers.7.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
82
+ base_model.model.model.layers.7.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
83
+ base_model.model.model.layers.7.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
84
+ base_model.model.model.layers.7.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
85
+ base_model.model.model.layers.7.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
86
+ base_model.model.model.layers.7.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
87
+ base_model.model.model.layers.7.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
88
+ base_model.model.model.layers.7.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
89
+ base_model.model.model.layers.7.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
90
+ base_model.model.model.layers.8.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
91
+ base_model.model.model.layers.8.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
92
+ base_model.model.model.layers.8.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
93
+ base_model.model.model.layers.8.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
94
+ base_model.model.model.layers.8.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
95
+ base_model.model.model.layers.8.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
96
+ base_model.model.model.layers.8.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
97
+ base_model.model.model.layers.8.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
98
+ base_model.model.model.layers.8.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
99
+ base_model.model.model.layers.8.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
100
+ base_model.model.model.layers.9.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
101
+ base_model.model.model.layers.9.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
102
+ base_model.model.model.layers.9.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
103
+ base_model.model.model.layers.9.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
104
+ base_model.model.model.layers.9.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
105
+ base_model.model.model.layers.9.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
106
+ base_model.model.model.layers.9.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
107
+ base_model.model.model.layers.9.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
108
+ base_model.model.model.layers.9.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
109
+ base_model.model.model.layers.9.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
110
+ base_model.model.model.layers.10.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
111
+ base_model.model.model.layers.10.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
112
+ base_model.model.model.layers.10.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
113
+ base_model.model.model.layers.10.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
114
+ base_model.model.model.layers.10.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
115
+ base_model.model.model.layers.10.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
116
+ base_model.model.model.layers.10.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
117
+ base_model.model.model.layers.10.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
118
+ base_model.model.model.layers.10.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
119
+ base_model.model.model.layers.10.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
120
+ base_model.model.model.layers.11.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
121
+ base_model.model.model.layers.11.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
122
+ base_model.model.model.layers.11.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
123
+ base_model.model.model.layers.11.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
124
+ base_model.model.model.layers.11.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
125
+ base_model.model.model.layers.11.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
126
+ base_model.model.model.layers.11.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
127
+ base_model.model.model.layers.11.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
128
+ base_model.model.model.layers.11.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
129
+ base_model.model.model.layers.11.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
130
+ base_model.model.model.layers.12.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
131
+ base_model.model.model.layers.12.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
132
+ base_model.model.model.layers.12.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
133
+ base_model.model.model.layers.12.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
134
+ base_model.model.model.layers.12.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
135
+ base_model.model.model.layers.12.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
136
+ base_model.model.model.layers.12.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
137
+ base_model.model.model.layers.12.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
138
+ base_model.model.model.layers.12.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
139
+ base_model.model.model.layers.12.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
140
+ base_model.model.model.layers.13.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
141
+ base_model.model.model.layers.13.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
142
+ base_model.model.model.layers.13.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
143
+ base_model.model.model.layers.13.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
144
+ base_model.model.model.layers.13.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
145
+ base_model.model.model.layers.13.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
146
+ base_model.model.model.layers.13.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
147
+ base_model.model.model.layers.13.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
148
+ base_model.model.model.layers.13.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
149
+ base_model.model.model.layers.13.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
150
+ base_model.model.model.layers.14.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
151
+ base_model.model.model.layers.14.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
152
+ base_model.model.model.layers.14.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
153
+ base_model.model.model.layers.14.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
154
+ base_model.model.model.layers.14.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
155
+ base_model.model.model.layers.14.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
156
+ base_model.model.model.layers.14.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
157
+ base_model.model.model.layers.14.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
158
+ base_model.model.model.layers.14.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
159
+ base_model.model.model.layers.14.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
160
+ base_model.model.model.layers.15.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
161
+ base_model.model.model.layers.15.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
162
+ base_model.model.model.layers.15.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
163
+ base_model.model.model.layers.15.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
164
+ base_model.model.model.layers.15.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
165
+ base_model.model.model.layers.15.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
166
+ base_model.model.model.layers.15.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
167
+ base_model.model.model.layers.15.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
168
+ base_model.model.model.layers.15.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
169
+ base_model.model.model.layers.15.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
170
+ base_model.model.model.layers.16.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
171
+ base_model.model.model.layers.16.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
172
+ base_model.model.model.layers.16.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
173
+ base_model.model.model.layers.16.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
174
+ base_model.model.model.layers.16.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
175
+ base_model.model.model.layers.16.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
176
+ base_model.model.model.layers.16.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
177
+ base_model.model.model.layers.16.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
178
+ base_model.model.model.layers.16.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
179
+ base_model.model.model.layers.16.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
180
+ base_model.model.model.layers.17.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
181
+ base_model.model.model.layers.17.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
182
+ base_model.model.model.layers.17.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
183
+ base_model.model.model.layers.17.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
184
+ base_model.model.model.layers.17.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
185
+ base_model.model.model.layers.17.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
186
+ base_model.model.model.layers.17.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
187
+ base_model.model.model.layers.17.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
188
+ base_model.model.model.layers.17.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
189
+ base_model.model.model.layers.17.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
190
+ base_model.model.model.layers.18.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
191
+ base_model.model.model.layers.18.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
192
+ base_model.model.model.layers.18.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
193
+ base_model.model.model.layers.18.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
194
+ base_model.model.model.layers.18.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
195
+ base_model.model.model.layers.18.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
196
+ base_model.model.model.layers.18.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
197
+ base_model.model.model.layers.18.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
198
+ base_model.model.model.layers.18.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
199
+ base_model.model.model.layers.18.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
200
+ base_model.model.model.layers.19.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
201
+ base_model.model.model.layers.19.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
202
+ base_model.model.model.layers.19.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
203
+ base_model.model.model.layers.19.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
204
+ base_model.model.model.layers.19.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
205
+ base_model.model.model.layers.19.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
206
+ base_model.model.model.layers.19.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
207
+ base_model.model.model.layers.19.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
208
+ base_model.model.model.layers.19.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
209
+ base_model.model.model.layers.19.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
210
+ base_model.model.model.layers.20.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
211
+ base_model.model.model.layers.20.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
212
+ base_model.model.model.layers.20.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
213
+ base_model.model.model.layers.20.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
214
+ base_model.model.model.layers.20.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
215
+ base_model.model.model.layers.20.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
216
+ base_model.model.model.layers.20.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
217
+ base_model.model.model.layers.20.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
218
+ base_model.model.model.layers.20.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
219
+ base_model.model.model.layers.20.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
220
+ base_model.model.model.layers.21.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
221
+ base_model.model.model.layers.21.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
222
+ base_model.model.model.layers.21.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
223
+ base_model.model.model.layers.21.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
224
+ base_model.model.model.layers.21.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
225
+ base_model.model.model.layers.21.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
226
+ base_model.model.model.layers.21.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
227
+ base_model.model.model.layers.21.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
228
+ base_model.model.model.layers.21.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
229
+ base_model.model.model.layers.21.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
230
+ base_model.model.model.layers.22.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
231
+ base_model.model.model.layers.22.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
232
+ base_model.model.model.layers.22.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
233
+ base_model.model.model.layers.22.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
234
+ base_model.model.model.layers.22.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
235
+ base_model.model.model.layers.22.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
236
+ base_model.model.model.layers.22.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
237
+ base_model.model.model.layers.22.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
238
+ base_model.model.model.layers.22.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
239
+ base_model.model.model.layers.22.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
240
+ base_model.model.model.layers.23.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
241
+ base_model.model.model.layers.23.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
242
+ base_model.model.model.layers.23.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
243
+ base_model.model.model.layers.23.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
244
+ base_model.model.model.layers.23.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
245
+ base_model.model.model.layers.23.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
246
+ base_model.model.model.layers.23.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
247
+ base_model.model.model.layers.23.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
248
+ base_model.model.model.layers.23.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
249
+ base_model.model.model.layers.23.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
250
+ base_model.model.model.layers.24.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
251
+ base_model.model.model.layers.24.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
252
+ base_model.model.model.layers.24.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
253
+ base_model.model.model.layers.24.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
254
+ base_model.model.model.layers.24.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
255
+ base_model.model.model.layers.24.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
256
+ base_model.model.model.layers.24.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
257
+ base_model.model.model.layers.24.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
258
+ base_model.model.model.layers.24.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
259
+ base_model.model.model.layers.24.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
260
+ base_model.model.model.layers.25.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
261
+ base_model.model.model.layers.25.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
262
+ base_model.model.model.layers.25.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
263
+ base_model.model.model.layers.25.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
264
+ base_model.model.model.layers.25.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
265
+ base_model.model.model.layers.25.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
266
+ base_model.model.model.layers.25.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
267
+ base_model.model.model.layers.25.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
268
+ base_model.model.model.layers.25.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
269
+ base_model.model.model.layers.25.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
270
+ base_model.model.model.layers.26.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
271
+ base_model.model.model.layers.26.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
272
+ base_model.model.model.layers.26.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
273
+ base_model.model.model.layers.26.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
274
+ base_model.model.model.layers.26.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
275
+ base_model.model.model.layers.26.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
276
+ base_model.model.model.layers.26.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
277
+ base_model.model.model.layers.26.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
278
+ base_model.model.model.layers.26.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
279
+ base_model.model.model.layers.26.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
280
+ base_model.model.model.layers.27.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
281
+ base_model.model.model.layers.27.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
282
+ base_model.model.model.layers.27.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
283
+ base_model.model.model.layers.27.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
284
+ base_model.model.model.layers.27.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
285
+ base_model.model.model.layers.27.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
286
+ base_model.model.model.layers.27.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
287
+ base_model.model.model.layers.27.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
288
+ base_model.model.model.layers.27.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
289
+ base_model.model.model.layers.27.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
290
+ base_model.model.model.layers.28.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
291
+ base_model.model.model.layers.28.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
292
+ base_model.model.model.layers.28.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
293
+ base_model.model.model.layers.28.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
294
+ base_model.model.model.layers.28.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
295
+ base_model.model.model.layers.28.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
296
+ base_model.model.model.layers.28.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
297
+ base_model.model.model.layers.28.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
298
+ base_model.model.model.layers.28.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
299
+ base_model.model.model.layers.28.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
300
+ base_model.model.model.layers.29.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
301
+ base_model.model.model.layers.29.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
302
+ base_model.model.model.layers.29.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
303
+ base_model.model.model.layers.29.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
304
+ base_model.model.model.layers.29.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
305
+ base_model.model.model.layers.29.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
306
+ base_model.model.model.layers.29.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
307
+ base_model.model.model.layers.29.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
308
+ base_model.model.model.layers.29.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
309
+ base_model.model.model.layers.29.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
310
+ base_model.model.model.layers.30.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
311
+ base_model.model.model.layers.30.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
312
+ base_model.model.model.layers.30.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
313
+ base_model.model.model.layers.30.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
314
+ base_model.model.model.layers.30.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
315
+ base_model.model.model.layers.30.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
316
+ base_model.model.model.layers.30.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
317
+ base_model.model.model.layers.30.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
318
+ base_model.model.model.layers.30.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
319
+ base_model.model.model.layers.30.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
320
+ base_model.model.model.layers.31.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
321
+ base_model.model.model.layers.31.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
322
+ base_model.model.model.layers.31.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
323
+ base_model.model.model.layers.31.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
324
+ base_model.model.model.layers.31.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
325
+ base_model.model.model.layers.31.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
326
+ base_model.model.model.layers.31.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
327
+ base_model.model.model.layers.31.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
328
+ base_model.model.model.layers.31.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
329
+ base_model.model.model.layers.31.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
330
+ base_model.model.model.layers.32.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
331
+ base_model.model.model.layers.32.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
332
+ base_model.model.model.layers.32.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
333
+ base_model.model.model.layers.32.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
334
+ base_model.model.model.layers.32.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
335
+ base_model.model.model.layers.32.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
336
+ base_model.model.model.layers.32.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
337
+ base_model.model.model.layers.32.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
338
+ base_model.model.model.layers.32.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
339
+ base_model.model.model.layers.32.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
340
+ base_model.model.model.layers.33.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
341
+ base_model.model.model.layers.33.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
342
+ base_model.model.model.layers.33.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
343
+ base_model.model.model.layers.33.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
344
+ base_model.model.model.layers.33.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
345
+ base_model.model.model.layers.33.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
346
+ base_model.model.model.layers.33.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
347
+ base_model.model.model.layers.33.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
348
+ base_model.model.model.layers.33.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
349
+ base_model.model.model.layers.33.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
350
+ base_model.model.model.layers.34.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
351
+ base_model.model.model.layers.34.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
352
+ base_model.model.model.layers.34.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
353
+ base_model.model.model.layers.34.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
354
+ base_model.model.model.layers.34.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
355
+ base_model.model.model.layers.34.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
356
+ base_model.model.model.layers.34.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
357
+ base_model.model.model.layers.34.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
358
+ base_model.model.model.layers.34.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
359
+ base_model.model.model.layers.34.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
360
+ base_model.model.model.layers.35.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
361
+ base_model.model.model.layers.35.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
362
+ base_model.model.model.layers.35.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
363
+ base_model.model.model.layers.35.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
364
+ base_model.model.model.layers.35.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
365
+ base_model.model.model.layers.35.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
366
+ base_model.model.model.layers.35.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
367
+ base_model.model.model.layers.35.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
368
+ base_model.model.model.layers.35.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
369
+ base_model.model.model.layers.35.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
370
+ base_model.model.model.layers.36.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
371
+ base_model.model.model.layers.36.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
372
+ base_model.model.model.layers.36.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
373
+ base_model.model.model.layers.36.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
374
+ base_model.model.model.layers.36.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
375
+ base_model.model.model.layers.36.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
376
+ base_model.model.model.layers.36.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
377
+ base_model.model.model.layers.36.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
378
+ base_model.model.model.layers.36.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
379
+ base_model.model.model.layers.36.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
380
+ base_model.model.model.layers.37.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
381
+ base_model.model.model.layers.37.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
382
+ base_model.model.model.layers.37.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
383
+ base_model.model.model.layers.37.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
384
+ base_model.model.model.layers.37.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
385
+ base_model.model.model.layers.37.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
386
+ base_model.model.model.layers.37.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
387
+ base_model.model.model.layers.37.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
388
+ base_model.model.model.layers.37.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
389
+ base_model.model.model.layers.37.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
390
+ base_model.model.model.layers.38.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
391
+ base_model.model.model.layers.38.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
392
+ base_model.model.model.layers.38.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
393
+ base_model.model.model.layers.38.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
394
+ base_model.model.model.layers.38.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
395
+ base_model.model.model.layers.38.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
396
+ base_model.model.model.layers.38.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
397
+ base_model.model.model.layers.38.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
398
+ base_model.model.model.layers.38.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
399
+ base_model.model.model.layers.38.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
400
+ base_model.model.model.layers.39.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
401
+ base_model.model.model.layers.39.self_attn.q_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
402
+ base_model.model.model.layers.39.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
403
+ base_model.model.model.layers.39.self_attn.k_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
404
+ base_model.model.model.layers.39.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
405
+ base_model.model.model.layers.39.self_attn.v_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
406
+ base_model.model.model.layers.39.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 32] | 442,368
407
+ base_model.model.model.layers.39.mlp.up_proj._sama_layer.default.sama_R.weights | [32, 32, 160] | 163,840
408
+ base_model.model.model.layers.39.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 32] | 163,840
409
+ base_model.model.model.layers.39.mlp.down_proj._sama_layer.default.sama_R.weights | [32, 32, 432] | 442,368
Llama13B/CMS/t60107d20h20m59,ep=1.0,mlr5.0e-04,b8,nb32,32,cL32,rR32,s1,initdef,dr0.0,size14119,5/trainer_state.json ADDED
@@ -0,0 +1,528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 50,
7
+ "global_step": 1765,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.028328611898016998,
14
+ "grad_norm": 0.05120686814188957,
15
+ "learning_rate": 0.000245,
16
+ "loss": 1.904,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.056657223796033995,
21
+ "grad_norm": 0.0301814004778862,
22
+ "learning_rate": 0.000495,
23
+ "loss": 0.1162,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.08498583569405099,
28
+ "grad_norm": 0.05142534524202347,
29
+ "learning_rate": 0.0004989309807668432,
30
+ "loss": 0.0918,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.11331444759206799,
35
+ "grad_norm": 0.04382554814219475,
36
+ "learning_rate": 0.000495645795847958,
37
+ "loss": 0.0934,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 0.141643059490085,
42
+ "grad_norm": 0.07652860879898071,
43
+ "learning_rate": 0.0004901732526860121,
44
+ "loss": 0.0842,
45
+ "step": 250
46
+ },
47
+ {
48
+ "epoch": 0.16997167138810199,
49
+ "grad_norm": 0.08357395231723785,
50
+ "learning_rate": 0.000482562081658639,
51
+ "loss": 0.0769,
52
+ "step": 300
53
+ },
54
+ {
55
+ "epoch": 0.16997167138810199,
56
+ "eval_loss": 0.08110018074512482,
57
+ "eval_runtime": 40.2393,
58
+ "eval_samples_per_second": 24.851,
59
+ "eval_steps_per_second": 0.795,
60
+ "step": 300
61
+ },
62
+ {
63
+ "epoch": 0.19830028328611898,
64
+ "grad_norm": 0.09154224395751953,
65
+ "learning_rate": 0.00047288005660104163,
66
+ "loss": 0.08,
67
+ "step": 350
68
+ },
69
+ {
70
+ "epoch": 0.19830028328611898,
71
+ "eval_loss": 0.07602168619632721,
72
+ "eval_runtime": 40.0059,
73
+ "eval_samples_per_second": 24.996,
74
+ "eval_steps_per_second": 0.8,
75
+ "step": 350
76
+ },
77
+ {
78
+ "epoch": 0.22662889518413598,
79
+ "grad_norm": 0.06302578002214432,
80
+ "learning_rate": 0.0004612133913124268,
81
+ "loss": 0.0739,
82
+ "step": 400
83
+ },
84
+ {
85
+ "epoch": 0.22662889518413598,
86
+ "eval_loss": 0.06869283318519592,
87
+ "eval_runtime": 40.0279,
88
+ "eval_samples_per_second": 24.983,
89
+ "eval_steps_per_second": 0.799,
90
+ "step": 400
91
+ },
92
+ {
93
+ "epoch": 0.254957507082153,
94
+ "grad_norm": 0.21662577986717224,
95
+ "learning_rate": 0.0004476659718633865,
96
+ "loss": 0.0707,
97
+ "step": 450
98
+ },
99
+ {
100
+ "epoch": 0.254957507082153,
101
+ "eval_loss": 0.06703159958124161,
102
+ "eval_runtime": 40.0265,
103
+ "eval_samples_per_second": 24.983,
104
+ "eval_steps_per_second": 0.799,
105
+ "step": 450
106
+ },
107
+ {
108
+ "epoch": 0.28328611898017,
109
+ "grad_norm": 0.0684773176908493,
110
+ "learning_rate": 0.0004323584315401604,
111
+ "loss": 0.0666,
112
+ "step": 500
113
+ },
114
+ {
115
+ "epoch": 0.28328611898017,
116
+ "eval_loss": 0.06985477358102798,
117
+ "eval_runtime": 39.9964,
118
+ "eval_samples_per_second": 25.002,
119
+ "eval_steps_per_second": 0.8,
120
+ "step": 500
121
+ },
122
+ {
123
+ "epoch": 0.311614730878187,
124
+ "grad_norm": 0.023050207644701004,
125
+ "learning_rate": 0.000415427076662958,
126
+ "loss": 0.065,
127
+ "step": 550
128
+ },
129
+ {
130
+ "epoch": 0.311614730878187,
131
+ "eval_loss": 0.06582478433847427,
132
+ "eval_runtime": 39.9999,
133
+ "eval_samples_per_second": 25.0,
134
+ "eval_steps_per_second": 0.8,
135
+ "step": 550
136
+ },
137
+ {
138
+ "epoch": 0.33994334277620397,
139
+ "grad_norm": 0.06662499159574509,
140
+ "learning_rate": 0.0003970226728434152,
141
+ "loss": 0.0627,
142
+ "step": 600
143
+ },
144
+ {
145
+ "epoch": 0.33994334277620397,
146
+ "eval_loss": 0.06325174123048782,
147
+ "eval_runtime": 39.9867,
148
+ "eval_samples_per_second": 25.008,
149
+ "eval_steps_per_second": 0.8,
150
+ "step": 600
151
+ },
152
+ {
153
+ "epoch": 0.36827195467422097,
154
+ "grad_norm": 0.06945322453975677,
155
+ "learning_rate": 0.00037730910248898127,
156
+ "loss": 0.0661,
157
+ "step": 650
158
+ },
159
+ {
160
+ "epoch": 0.36827195467422097,
161
+ "eval_loss": 0.060704972594976425,
162
+ "eval_runtime": 39.9924,
163
+ "eval_samples_per_second": 25.005,
164
+ "eval_steps_per_second": 0.8,
165
+ "step": 650
166
+ },
167
+ {
168
+ "epoch": 0.39660056657223797,
169
+ "grad_norm": 0.21134211122989655,
170
+ "learning_rate": 0.00035646190550851575,
171
+ "loss": 0.061,
172
+ "step": 700
173
+ },
174
+ {
175
+ "epoch": 0.39660056657223797,
176
+ "eval_loss": 0.05985141918063164,
177
+ "eval_runtime": 40.0163,
178
+ "eval_samples_per_second": 24.99,
179
+ "eval_steps_per_second": 0.8,
180
+ "step": 700
181
+ },
182
+ {
183
+ "epoch": 0.42492917847025496,
184
+ "grad_norm": 0.059777744114398956,
185
+ "learning_rate": 0.0003346667162134164,
186
+ "loss": 0.0556,
187
+ "step": 750
188
+ },
189
+ {
190
+ "epoch": 0.42492917847025496,
191
+ "eval_loss": 0.06151320040225983,
192
+ "eval_runtime": 40.0008,
193
+ "eval_samples_per_second": 25.0,
194
+ "eval_steps_per_second": 0.8,
195
+ "step": 750
196
+ },
197
+ {
198
+ "epoch": 0.45325779036827196,
199
+ "grad_norm": 0.0928986445069313,
200
+ "learning_rate": 0.00031211761033292194,
201
+ "loss": 0.0566,
202
+ "step": 800
203
+ },
204
+ {
205
+ "epoch": 0.45325779036827196,
206
+ "eval_loss": 0.05747922882437706,
207
+ "eval_runtime": 40.0033,
208
+ "eval_samples_per_second": 24.998,
209
+ "eval_steps_per_second": 0.8,
210
+ "step": 800
211
+ },
212
+ {
213
+ "epoch": 0.48158640226628896,
214
+ "grad_norm": 0.12957215309143066,
215
+ "learning_rate": 0.0002890153768626315,
216
+ "loss": 0.059,
217
+ "step": 850
218
+ },
219
+ {
220
+ "epoch": 0.48158640226628896,
221
+ "eval_loss": 0.057896047830581665,
222
+ "eval_runtime": 40.0121,
223
+ "eval_samples_per_second": 24.992,
224
+ "eval_steps_per_second": 0.8,
225
+ "step": 850
226
+ },
227
+ {
228
+ "epoch": 0.509915014164306,
229
+ "grad_norm": 0.1764892339706421,
230
+ "learning_rate": 0.0002655657301345971,
231
+ "loss": 0.0539,
232
+ "step": 900
233
+ },
234
+ {
235
+ "epoch": 0.509915014164306,
236
+ "eval_loss": 0.05976394563913345,
237
+ "eval_runtime": 40.0113,
238
+ "eval_samples_per_second": 24.993,
239
+ "eval_steps_per_second": 0.8,
240
+ "step": 900
241
+ },
242
+ {
243
+ "epoch": 0.5382436260623229,
244
+ "grad_norm": 0.05985495075583458,
245
+ "learning_rate": 0.00024197747802965556,
246
+ "loss": 0.0558,
247
+ "step": 950
248
+ },
249
+ {
250
+ "epoch": 0.5382436260623229,
251
+ "eval_loss": 0.053400635719299316,
252
+ "eval_runtime": 39.9933,
253
+ "eval_samples_per_second": 25.004,
254
+ "eval_steps_per_second": 0.8,
255
+ "step": 950
256
+ },
257
+ {
258
+ "epoch": 0.56657223796034,
259
+ "grad_norm": 0.06537602096796036,
260
+ "learning_rate": 0.00021846066264318665,
261
+ "loss": 0.0509,
262
+ "step": 1000
263
+ },
264
+ {
265
+ "epoch": 0.56657223796034,
266
+ "eval_loss": 0.05671335384249687,
267
+ "eval_runtime": 40.0061,
268
+ "eval_samples_per_second": 24.996,
269
+ "eval_steps_per_second": 0.8,
270
+ "step": 1000
271
+ },
272
+ {
273
+ "epoch": 0.5949008498583569,
274
+ "grad_norm": 0.06947670876979828,
275
+ "learning_rate": 0.00019522468996078257,
276
+ "loss": 0.053,
277
+ "step": 1050
278
+ },
279
+ {
280
+ "epoch": 0.5949008498583569,
281
+ "eval_loss": 0.05418495461344719,
282
+ "eval_runtime": 39.9757,
283
+ "eval_samples_per_second": 25.015,
284
+ "eval_steps_per_second": 0.8,
285
+ "step": 1050
286
+ },
287
+ {
288
+ "epoch": 0.623229461756374,
289
+ "grad_norm": 0.2086954563856125,
290
+ "learning_rate": 0.00017247646519817056,
291
+ "loss": 0.0502,
292
+ "step": 1100
293
+ },
294
+ {
295
+ "epoch": 0.623229461756374,
296
+ "eval_loss": 0.05340389162302017,
297
+ "eval_runtime": 39.9938,
298
+ "eval_samples_per_second": 25.004,
299
+ "eval_steps_per_second": 0.8,
300
+ "step": 1100
301
+ },
302
+ {
303
+ "epoch": 0.6515580736543909,
304
+ "grad_norm": 0.11571495234966278,
305
+ "learning_rate": 0.00015041855040929373,
306
+ "loss": 0.0523,
307
+ "step": 1150
308
+ },
309
+ {
310
+ "epoch": 0.6515580736543909,
311
+ "eval_loss": 0.05297320336103439,
312
+ "eval_runtime": 39.9981,
313
+ "eval_samples_per_second": 25.001,
314
+ "eval_steps_per_second": 0.8,
315
+ "step": 1150
316
+ },
317
+ {
318
+ "epoch": 0.6798866855524079,
319
+ "grad_norm": 0.07803928852081299,
320
+ "learning_rate": 0.00012924736076817,
321
+ "loss": 0.0555,
322
+ "step": 1200
323
+ },
324
+ {
325
+ "epoch": 0.6798866855524079,
326
+ "eval_loss": 0.0508769154548645,
327
+ "eval_runtime": 39.9981,
328
+ "eval_samples_per_second": 25.001,
329
+ "eval_steps_per_second": 0.8,
330
+ "step": 1200
331
+ },
332
+ {
333
+ "epoch": 0.7082152974504249,
334
+ "grad_norm": 0.030330033972859383,
335
+ "learning_rate": 0.00010915141558577307,
336
+ "loss": 0.0509,
337
+ "step": 1250
338
+ },
339
+ {
340
+ "epoch": 0.7082152974504249,
341
+ "eval_loss": 0.050962403416633606,
342
+ "eval_runtime": 39.998,
343
+ "eval_samples_per_second": 25.001,
344
+ "eval_steps_per_second": 0.8,
345
+ "step": 1250
346
+ },
347
+ {
348
+ "epoch": 0.7365439093484419,
349
+ "grad_norm": 0.052799008786678314,
350
+ "learning_rate": 9.030965963579688e-05,
351
+ "loss": 0.0583,
352
+ "step": 1300
353
+ },
354
+ {
355
+ "epoch": 0.7365439093484419,
356
+ "eval_loss": 0.050432171672582626,
357
+ "eval_runtime": 39.9904,
358
+ "eval_samples_per_second": 25.006,
359
+ "eval_steps_per_second": 0.8,
360
+ "step": 1300
361
+ },
362
+ {
363
+ "epoch": 0.7648725212464589,
364
+ "grad_norm": 0.046064119786024094,
365
+ "learning_rate": 7.288986973708988e-05,
366
+ "loss": 0.0588,
367
+ "step": 1350
368
+ },
369
+ {
370
+ "epoch": 0.7648725212464589,
371
+ "eval_loss": 0.0505264587700367,
372
+ "eval_runtime": 39.9861,
373
+ "eval_samples_per_second": 25.009,
374
+ "eval_steps_per_second": 0.8,
375
+ "step": 1350
376
+ },
377
+ {
378
+ "epoch": 0.7932011331444759,
379
+ "grad_norm": 0.1480986326932907,
380
+ "learning_rate": 5.704716078138403e-05,
381
+ "loss": 0.045,
382
+ "step": 1400
383
+ },
384
+ {
385
+ "epoch": 0.7932011331444759,
386
+ "eval_loss": 0.04980534687638283,
387
+ "eval_runtime": 39.9962,
388
+ "eval_samples_per_second": 25.002,
389
+ "eval_steps_per_second": 0.8,
390
+ "step": 1400
391
+ },
392
+ {
393
+ "epoch": 0.8215297450424929,
394
+ "grad_norm": 0.06369854509830475,
395
+ "learning_rate": 4.2922604509423035e-05,
396
+ "loss": 0.0473,
397
+ "step": 1450
398
+ },
399
+ {
400
+ "epoch": 0.8215297450424929,
401
+ "eval_loss": 0.04981457069516182,
402
+ "eval_runtime": 39.991,
403
+ "eval_samples_per_second": 25.006,
404
+ "eval_steps_per_second": 0.8,
405
+ "step": 1450
406
+ },
407
+ {
408
+ "epoch": 0.8498583569405099,
409
+ "grad_norm": 0.09225151687860489,
410
+ "learning_rate": 3.064197333463017e-05,
411
+ "loss": 0.0583,
412
+ "step": 1500
413
+ },
414
+ {
415
+ "epoch": 0.8498583569405099,
416
+ "eval_loss": 0.04968957602977753,
417
+ "eval_runtime": 39.993,
418
+ "eval_samples_per_second": 25.004,
419
+ "eval_steps_per_second": 0.8,
420
+ "step": 1500
421
+ },
422
+ {
423
+ "epoch": 0.8781869688385269,
424
+ "grad_norm": 0.0795036181807518,
425
+ "learning_rate": 2.0314620399962458e-05,
426
+ "loss": 0.0466,
427
+ "step": 1550
428
+ },
429
+ {
430
+ "epoch": 0.8781869688385269,
431
+ "eval_loss": 0.049295783042907715,
432
+ "eval_runtime": 40.002,
433
+ "eval_samples_per_second": 24.999,
434
+ "eval_steps_per_second": 0.8,
435
+ "step": 1550
436
+ },
437
+ {
438
+ "epoch": 0.9065155807365439,
439
+ "grad_norm": 0.1349484771490097,
440
+ "learning_rate": 1.2032505840510632e-05,
441
+ "loss": 0.0519,
442
+ "step": 1600
443
+ },
444
+ {
445
+ "epoch": 0.9065155807365439,
446
+ "eval_loss": 0.04944948852062225,
447
+ "eval_runtime": 39.9878,
448
+ "eval_samples_per_second": 25.008,
449
+ "eval_steps_per_second": 0.8,
450
+ "step": 1600
451
+ },
452
+ {
453
+ "epoch": 0.9348441926345609,
454
+ "grad_norm": 0.14967283606529236,
455
+ "learning_rate": 5.869377922509161e-06,
456
+ "loss": 0.0564,
457
+ "step": 1650
458
+ },
459
+ {
460
+ "epoch": 0.9348441926345609,
461
+ "eval_loss": 0.04940963163971901,
462
+ "eval_runtime": 40.0216,
463
+ "eval_samples_per_second": 24.987,
464
+ "eval_steps_per_second": 0.8,
465
+ "step": 1650
466
+ },
467
+ {
468
+ "epoch": 0.9631728045325779,
469
+ "grad_norm": 0.17137259244918823,
470
+ "learning_rate": 1.8801163503225082e-06,
471
+ "loss": 0.049,
472
+ "step": 1700
473
+ },
474
+ {
475
+ "epoch": 0.9631728045325779,
476
+ "eval_loss": 0.04926325008273125,
477
+ "eval_runtime": 40.0111,
478
+ "eval_samples_per_second": 24.993,
479
+ "eval_steps_per_second": 0.8,
480
+ "step": 1700
481
+ },
482
+ {
483
+ "epoch": 0.9915014164305949,
484
+ "grad_norm": 0.2544344663619995,
485
+ "learning_rate": 1.0024358894342167e-07,
486
+ "loss": 0.0496,
487
+ "step": 1750
488
+ },
489
+ {
490
+ "epoch": 0.9915014164305949,
491
+ "eval_loss": 0.04924055561423302,
492
+ "eval_runtime": 40.005,
493
+ "eval_samples_per_second": 24.997,
494
+ "eval_steps_per_second": 0.8,
495
+ "step": 1750
496
+ },
497
+ {
498
+ "epoch": 1.0,
499
+ "step": 1765,
500
+ "total_flos": 5.612471411854541e+17,
501
+ "train_loss": 0.11448637544264537,
502
+ "train_runtime": 2238.1403,
503
+ "train_samples_per_second": 6.308,
504
+ "train_steps_per_second": 0.789
505
+ }
506
+ ],
507
+ "logging_steps": 50,
508
+ "max_steps": 1765,
509
+ "num_input_tokens_seen": 0,
510
+ "num_train_epochs": 1,
511
+ "save_steps": 50.0,
512
+ "stateful_callbacks": {
513
+ "TrainerControl": {
514
+ "args": {
515
+ "should_epoch_stop": false,
516
+ "should_evaluate": false,
517
+ "should_log": false,
518
+ "should_save": false,
519
+ "should_training_stop": false
520
+ },
521
+ "attributes": {}
522
+ }
523
+ },
524
+ "total_flos": 5.612471411854541e+17,
525
+ "train_batch_size": 8,
526
+ "trial_name": null,
527
+ "trial_params": null
528
+ }
Llama13B/CMS/t60107d21h14m55,ep=2.0,mlr5.0e-04,b16,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/model_parameters_report.txt ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ === GLOBAL STATISTICS ===
2
+ Total Parameters: 13,081,728,000
3
+ Trainable Parameters: 65,863,680
4
+ Trainable Percentage: 0.5035%
5
+
6
+ ================================================================================================================================
7
+ === DETAILED TRAINABLE MATRICES LIST ===
8
+ Layer Name | Shape | Count
9
+ ---------------------------------------------------------------------------------------------------------------------------------------
10
+ base_model.model.model.layers.0.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
11
+ base_model.model.model.layers.0.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
12
+ base_model.model.model.layers.0.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
13
+ base_model.model.model.layers.0.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
14
+ base_model.model.model.layers.0.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
15
+ base_model.model.model.layers.0.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
16
+ base_model.model.model.layers.0.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
17
+ base_model.model.model.layers.0.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
18
+ base_model.model.model.layers.0.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
19
+ base_model.model.model.layers.0.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
20
+ base_model.model.model.layers.1.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
21
+ base_model.model.model.layers.1.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
22
+ base_model.model.model.layers.1.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
23
+ base_model.model.model.layers.1.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
24
+ base_model.model.model.layers.1.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
25
+ base_model.model.model.layers.1.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
26
+ base_model.model.model.layers.1.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
27
+ base_model.model.model.layers.1.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
28
+ base_model.model.model.layers.1.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
29
+ base_model.model.model.layers.1.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
30
+ base_model.model.model.layers.2.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
31
+ base_model.model.model.layers.2.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
32
+ base_model.model.model.layers.2.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
33
+ base_model.model.model.layers.2.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
34
+ base_model.model.model.layers.2.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
35
+ base_model.model.model.layers.2.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
36
+ base_model.model.model.layers.2.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
37
+ base_model.model.model.layers.2.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
38
+ base_model.model.model.layers.2.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
39
+ base_model.model.model.layers.2.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
40
+ base_model.model.model.layers.3.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
41
+ base_model.model.model.layers.3.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
42
+ base_model.model.model.layers.3.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
43
+ base_model.model.model.layers.3.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
44
+ base_model.model.model.layers.3.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
45
+ base_model.model.model.layers.3.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
46
+ base_model.model.model.layers.3.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
47
+ base_model.model.model.layers.3.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
48
+ base_model.model.model.layers.3.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
49
+ base_model.model.model.layers.3.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
50
+ base_model.model.model.layers.4.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
51
+ base_model.model.model.layers.4.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
52
+ base_model.model.model.layers.4.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
53
+ base_model.model.model.layers.4.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
54
+ base_model.model.model.layers.4.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
55
+ base_model.model.model.layers.4.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
56
+ base_model.model.model.layers.4.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
57
+ base_model.model.model.layers.4.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
58
+ base_model.model.model.layers.4.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
59
+ base_model.model.model.layers.4.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
60
+ base_model.model.model.layers.5.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
61
+ base_model.model.model.layers.5.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
62
+ base_model.model.model.layers.5.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
63
+ base_model.model.model.layers.5.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
64
+ base_model.model.model.layers.5.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
65
+ base_model.model.model.layers.5.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
66
+ base_model.model.model.layers.5.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
67
+ base_model.model.model.layers.5.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
68
+ base_model.model.model.layers.5.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
69
+ base_model.model.model.layers.5.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
70
+ base_model.model.model.layers.6.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
71
+ base_model.model.model.layers.6.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
72
+ base_model.model.model.layers.6.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
73
+ base_model.model.model.layers.6.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
74
+ base_model.model.model.layers.6.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
75
+ base_model.model.model.layers.6.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
76
+ base_model.model.model.layers.6.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
77
+ base_model.model.model.layers.6.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
78
+ base_model.model.model.layers.6.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
79
+ base_model.model.model.layers.6.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
80
+ base_model.model.model.layers.7.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
81
+ base_model.model.model.layers.7.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
82
+ base_model.model.model.layers.7.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
83
+ base_model.model.model.layers.7.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
84
+ base_model.model.model.layers.7.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
85
+ base_model.model.model.layers.7.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
86
+ base_model.model.model.layers.7.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
87
+ base_model.model.model.layers.7.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
88
+ base_model.model.model.layers.7.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
89
+ base_model.model.model.layers.7.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
90
+ base_model.model.model.layers.8.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
91
+ base_model.model.model.layers.8.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
92
+ base_model.model.model.layers.8.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
93
+ base_model.model.model.layers.8.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
94
+ base_model.model.model.layers.8.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
95
+ base_model.model.model.layers.8.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
96
+ base_model.model.model.layers.8.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
97
+ base_model.model.model.layers.8.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
98
+ base_model.model.model.layers.8.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
99
+ base_model.model.model.layers.8.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
100
+ base_model.model.model.layers.9.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
101
+ base_model.model.model.layers.9.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
102
+ base_model.model.model.layers.9.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
103
+ base_model.model.model.layers.9.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
104
+ base_model.model.model.layers.9.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
105
+ base_model.model.model.layers.9.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
106
+ base_model.model.model.layers.9.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
107
+ base_model.model.model.layers.9.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
108
+ base_model.model.model.layers.9.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
109
+ base_model.model.model.layers.9.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
110
+ base_model.model.model.layers.10.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
111
+ base_model.model.model.layers.10.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
112
+ base_model.model.model.layers.10.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
113
+ base_model.model.model.layers.10.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
114
+ base_model.model.model.layers.10.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
115
+ base_model.model.model.layers.10.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
116
+ base_model.model.model.layers.10.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
117
+ base_model.model.model.layers.10.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
118
+ base_model.model.model.layers.10.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
119
+ base_model.model.model.layers.10.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
120
+ base_model.model.model.layers.11.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
121
+ base_model.model.model.layers.11.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
122
+ base_model.model.model.layers.11.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
123
+ base_model.model.model.layers.11.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
124
+ base_model.model.model.layers.11.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
125
+ base_model.model.model.layers.11.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
126
+ base_model.model.model.layers.11.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
127
+ base_model.model.model.layers.11.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
128
+ base_model.model.model.layers.11.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
129
+ base_model.model.model.layers.11.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
130
+ base_model.model.model.layers.12.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
131
+ base_model.model.model.layers.12.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
132
+ base_model.model.model.layers.12.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
133
+ base_model.model.model.layers.12.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
134
+ base_model.model.model.layers.12.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
135
+ base_model.model.model.layers.12.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
136
+ base_model.model.model.layers.12.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
137
+ base_model.model.model.layers.12.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
138
+ base_model.model.model.layers.12.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
139
+ base_model.model.model.layers.12.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
140
+ base_model.model.model.layers.13.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
141
+ base_model.model.model.layers.13.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
142
+ base_model.model.model.layers.13.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
143
+ base_model.model.model.layers.13.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
144
+ base_model.model.model.layers.13.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
145
+ base_model.model.model.layers.13.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
146
+ base_model.model.model.layers.13.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
147
+ base_model.model.model.layers.13.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
148
+ base_model.model.model.layers.13.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
149
+ base_model.model.model.layers.13.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
150
+ base_model.model.model.layers.14.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
151
+ base_model.model.model.layers.14.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
152
+ base_model.model.model.layers.14.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
153
+ base_model.model.model.layers.14.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
154
+ base_model.model.model.layers.14.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
155
+ base_model.model.model.layers.14.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
156
+ base_model.model.model.layers.14.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
157
+ base_model.model.model.layers.14.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
158
+ base_model.model.model.layers.14.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
159
+ base_model.model.model.layers.14.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
160
+ base_model.model.model.layers.15.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
161
+ base_model.model.model.layers.15.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
162
+ base_model.model.model.layers.15.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
163
+ base_model.model.model.layers.15.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
164
+ base_model.model.model.layers.15.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
165
+ base_model.model.model.layers.15.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
166
+ base_model.model.model.layers.15.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
167
+ base_model.model.model.layers.15.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
168
+ base_model.model.model.layers.15.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
169
+ base_model.model.model.layers.15.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
170
+ base_model.model.model.layers.16.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
171
+ base_model.model.model.layers.16.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
172
+ base_model.model.model.layers.16.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
173
+ base_model.model.model.layers.16.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
174
+ base_model.model.model.layers.16.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
175
+ base_model.model.model.layers.16.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
176
+ base_model.model.model.layers.16.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
177
+ base_model.model.model.layers.16.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
178
+ base_model.model.model.layers.16.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
179
+ base_model.model.model.layers.16.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
180
+ base_model.model.model.layers.17.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
181
+ base_model.model.model.layers.17.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
182
+ base_model.model.model.layers.17.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
183
+ base_model.model.model.layers.17.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
184
+ base_model.model.model.layers.17.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
185
+ base_model.model.model.layers.17.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
186
+ base_model.model.model.layers.17.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
187
+ base_model.model.model.layers.17.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
188
+ base_model.model.model.layers.17.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
189
+ base_model.model.model.layers.17.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
190
+ base_model.model.model.layers.18.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
191
+ base_model.model.model.layers.18.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
192
+ base_model.model.model.layers.18.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
193
+ base_model.model.model.layers.18.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
194
+ base_model.model.model.layers.18.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
195
+ base_model.model.model.layers.18.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
196
+ base_model.model.model.layers.18.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
197
+ base_model.model.model.layers.18.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
198
+ base_model.model.model.layers.18.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
199
+ base_model.model.model.layers.18.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
200
+ base_model.model.model.layers.19.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
201
+ base_model.model.model.layers.19.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
202
+ base_model.model.model.layers.19.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
203
+ base_model.model.model.layers.19.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
204
+ base_model.model.model.layers.19.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
205
+ base_model.model.model.layers.19.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
206
+ base_model.model.model.layers.19.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
207
+ base_model.model.model.layers.19.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
208
+ base_model.model.model.layers.19.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
209
+ base_model.model.model.layers.19.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
210
+ base_model.model.model.layers.20.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
211
+ base_model.model.model.layers.20.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
212
+ base_model.model.model.layers.20.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
213
+ base_model.model.model.layers.20.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
214
+ base_model.model.model.layers.20.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
215
+ base_model.model.model.layers.20.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
216
+ base_model.model.model.layers.20.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
217
+ base_model.model.model.layers.20.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
218
+ base_model.model.model.layers.20.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
219
+ base_model.model.model.layers.20.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
220
+ base_model.model.model.layers.21.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
221
+ base_model.model.model.layers.21.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
222
+ base_model.model.model.layers.21.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
223
+ base_model.model.model.layers.21.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
224
+ base_model.model.model.layers.21.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
225
+ base_model.model.model.layers.21.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
226
+ base_model.model.model.layers.21.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
227
+ base_model.model.model.layers.21.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
228
+ base_model.model.model.layers.21.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
229
+ base_model.model.model.layers.21.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
230
+ base_model.model.model.layers.22.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
231
+ base_model.model.model.layers.22.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
232
+ base_model.model.model.layers.22.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
233
+ base_model.model.model.layers.22.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
234
+ base_model.model.model.layers.22.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
235
+ base_model.model.model.layers.22.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
236
+ base_model.model.model.layers.22.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
237
+ base_model.model.model.layers.22.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
238
+ base_model.model.model.layers.22.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
239
+ base_model.model.model.layers.22.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
240
+ base_model.model.model.layers.23.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
241
+ base_model.model.model.layers.23.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
242
+ base_model.model.model.layers.23.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
243
+ base_model.model.model.layers.23.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
244
+ base_model.model.model.layers.23.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
245
+ base_model.model.model.layers.23.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
246
+ base_model.model.model.layers.23.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
247
+ base_model.model.model.layers.23.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
248
+ base_model.model.model.layers.23.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
249
+ base_model.model.model.layers.23.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
250
+ base_model.model.model.layers.24.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
251
+ base_model.model.model.layers.24.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
252
+ base_model.model.model.layers.24.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
253
+ base_model.model.model.layers.24.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
254
+ base_model.model.model.layers.24.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
255
+ base_model.model.model.layers.24.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
256
+ base_model.model.model.layers.24.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
257
+ base_model.model.model.layers.24.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
258
+ base_model.model.model.layers.24.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
259
+ base_model.model.model.layers.24.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
260
+ base_model.model.model.layers.25.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
261
+ base_model.model.model.layers.25.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
262
+ base_model.model.model.layers.25.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
263
+ base_model.model.model.layers.25.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
264
+ base_model.model.model.layers.25.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
265
+ base_model.model.model.layers.25.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
266
+ base_model.model.model.layers.25.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
267
+ base_model.model.model.layers.25.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
268
+ base_model.model.model.layers.25.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
269
+ base_model.model.model.layers.25.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
270
+ base_model.model.model.layers.26.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
271
+ base_model.model.model.layers.26.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
272
+ base_model.model.model.layers.26.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
273
+ base_model.model.model.layers.26.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
274
+ base_model.model.model.layers.26.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
275
+ base_model.model.model.layers.26.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
276
+ base_model.model.model.layers.26.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
277
+ base_model.model.model.layers.26.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
278
+ base_model.model.model.layers.26.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
279
+ base_model.model.model.layers.26.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
280
+ base_model.model.model.layers.27.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
281
+ base_model.model.model.layers.27.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
282
+ base_model.model.model.layers.27.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
283
+ base_model.model.model.layers.27.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
284
+ base_model.model.model.layers.27.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
285
+ base_model.model.model.layers.27.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
286
+ base_model.model.model.layers.27.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
287
+ base_model.model.model.layers.27.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
288
+ base_model.model.model.layers.27.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
289
+ base_model.model.model.layers.27.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
290
+ base_model.model.model.layers.28.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
291
+ base_model.model.model.layers.28.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
292
+ base_model.model.model.layers.28.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
293
+ base_model.model.model.layers.28.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
294
+ base_model.model.model.layers.28.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
295
+ base_model.model.model.layers.28.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
296
+ base_model.model.model.layers.28.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
297
+ base_model.model.model.layers.28.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
298
+ base_model.model.model.layers.28.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
299
+ base_model.model.model.layers.28.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
300
+ base_model.model.model.layers.29.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
301
+ base_model.model.model.layers.29.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
302
+ base_model.model.model.layers.29.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
303
+ base_model.model.model.layers.29.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
304
+ base_model.model.model.layers.29.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
305
+ base_model.model.model.layers.29.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
306
+ base_model.model.model.layers.29.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
307
+ base_model.model.model.layers.29.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
308
+ base_model.model.model.layers.29.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
309
+ base_model.model.model.layers.29.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
310
+ base_model.model.model.layers.30.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
311
+ base_model.model.model.layers.30.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
312
+ base_model.model.model.layers.30.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
313
+ base_model.model.model.layers.30.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
314
+ base_model.model.model.layers.30.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
315
+ base_model.model.model.layers.30.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
316
+ base_model.model.model.layers.30.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
317
+ base_model.model.model.layers.30.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
318
+ base_model.model.model.layers.30.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
319
+ base_model.model.model.layers.30.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
320
+ base_model.model.model.layers.31.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
321
+ base_model.model.model.layers.31.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
322
+ base_model.model.model.layers.31.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
323
+ base_model.model.model.layers.31.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
324
+ base_model.model.model.layers.31.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
325
+ base_model.model.model.layers.31.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
326
+ base_model.model.model.layers.31.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
327
+ base_model.model.model.layers.31.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
328
+ base_model.model.model.layers.31.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
329
+ base_model.model.model.layers.31.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
330
+ base_model.model.model.layers.32.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
331
+ base_model.model.model.layers.32.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
332
+ base_model.model.model.layers.32.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
333
+ base_model.model.model.layers.32.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
334
+ base_model.model.model.layers.32.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
335
+ base_model.model.model.layers.32.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
336
+ base_model.model.model.layers.32.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
337
+ base_model.model.model.layers.32.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
338
+ base_model.model.model.layers.32.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
339
+ base_model.model.model.layers.32.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
340
+ base_model.model.model.layers.33.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
341
+ base_model.model.model.layers.33.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
342
+ base_model.model.model.layers.33.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
343
+ base_model.model.model.layers.33.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
344
+ base_model.model.model.layers.33.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
345
+ base_model.model.model.layers.33.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
346
+ base_model.model.model.layers.33.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
347
+ base_model.model.model.layers.33.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
348
+ base_model.model.model.layers.33.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
349
+ base_model.model.model.layers.33.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
350
+ base_model.model.model.layers.34.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
351
+ base_model.model.model.layers.34.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
352
+ base_model.model.model.layers.34.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
353
+ base_model.model.model.layers.34.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
354
+ base_model.model.model.layers.34.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
355
+ base_model.model.model.layers.34.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
356
+ base_model.model.model.layers.34.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
357
+ base_model.model.model.layers.34.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
358
+ base_model.model.model.layers.34.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
359
+ base_model.model.model.layers.34.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
360
+ base_model.model.model.layers.35.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
361
+ base_model.model.model.layers.35.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
362
+ base_model.model.model.layers.35.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
363
+ base_model.model.model.layers.35.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
364
+ base_model.model.model.layers.35.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
365
+ base_model.model.model.layers.35.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
366
+ base_model.model.model.layers.35.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
367
+ base_model.model.model.layers.35.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
368
+ base_model.model.model.layers.35.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
369
+ base_model.model.model.layers.35.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
370
+ base_model.model.model.layers.36.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
371
+ base_model.model.model.layers.36.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
372
+ base_model.model.model.layers.36.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
373
+ base_model.model.model.layers.36.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
374
+ base_model.model.model.layers.36.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
375
+ base_model.model.model.layers.36.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
376
+ base_model.model.model.layers.36.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
377
+ base_model.model.model.layers.36.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
378
+ base_model.model.model.layers.36.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
379
+ base_model.model.model.layers.36.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
380
+ base_model.model.model.layers.37.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
381
+ base_model.model.model.layers.37.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
382
+ base_model.model.model.layers.37.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
383
+ base_model.model.model.layers.37.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
384
+ base_model.model.model.layers.37.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
385
+ base_model.model.model.layers.37.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
386
+ base_model.model.model.layers.37.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
387
+ base_model.model.model.layers.37.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
388
+ base_model.model.model.layers.37.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
389
+ base_model.model.model.layers.37.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
390
+ base_model.model.model.layers.38.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
391
+ base_model.model.model.layers.38.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
392
+ base_model.model.model.layers.38.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
393
+ base_model.model.model.layers.38.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
394
+ base_model.model.model.layers.38.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
395
+ base_model.model.model.layers.38.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
396
+ base_model.model.model.layers.38.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
397
+ base_model.model.model.layers.38.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
398
+ base_model.model.model.layers.38.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
399
+ base_model.model.model.layers.38.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
400
+ base_model.model.model.layers.39.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
401
+ base_model.model.model.layers.39.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
402
+ base_model.model.model.layers.39.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
403
+ base_model.model.model.layers.39.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
404
+ base_model.model.model.layers.39.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
405
+ base_model.model.model.layers.39.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
406
+ base_model.model.model.layers.39.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
407
+ base_model.model.model.layers.39.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
408
+ base_model.model.model.layers.39.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
409
+ base_model.model.model.layers.39.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
Llama13B/CMS/t60107d21h20m27,ep=2.0,mlr5.0e-04,b16,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/model_parameters_report.txt ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ === GLOBAL STATISTICS ===
2
+ Total Parameters: 13,081,728,000
3
+ Trainable Parameters: 65,863,680
4
+ Trainable Percentage: 0.5035%
5
+
6
+ ================================================================================================================================
7
+ === DETAILED TRAINABLE MATRICES LIST ===
8
+ Layer Name | Shape | Count
9
+ ---------------------------------------------------------------------------------------------------------------------------------------
10
+ base_model.model.model.layers.0.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
11
+ base_model.model.model.layers.0.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
12
+ base_model.model.model.layers.0.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
13
+ base_model.model.model.layers.0.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
14
+ base_model.model.model.layers.0.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
15
+ base_model.model.model.layers.0.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
16
+ base_model.model.model.layers.0.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
17
+ base_model.model.model.layers.0.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
18
+ base_model.model.model.layers.0.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
19
+ base_model.model.model.layers.0.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
20
+ base_model.model.model.layers.1.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
21
+ base_model.model.model.layers.1.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
22
+ base_model.model.model.layers.1.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
23
+ base_model.model.model.layers.1.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
24
+ base_model.model.model.layers.1.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
25
+ base_model.model.model.layers.1.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
26
+ base_model.model.model.layers.1.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
27
+ base_model.model.model.layers.1.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
28
+ base_model.model.model.layers.1.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
29
+ base_model.model.model.layers.1.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
30
+ base_model.model.model.layers.2.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
31
+ base_model.model.model.layers.2.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
32
+ base_model.model.model.layers.2.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
33
+ base_model.model.model.layers.2.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
34
+ base_model.model.model.layers.2.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
35
+ base_model.model.model.layers.2.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
36
+ base_model.model.model.layers.2.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
37
+ base_model.model.model.layers.2.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
38
+ base_model.model.model.layers.2.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
39
+ base_model.model.model.layers.2.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
40
+ base_model.model.model.layers.3.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
41
+ base_model.model.model.layers.3.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
42
+ base_model.model.model.layers.3.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
43
+ base_model.model.model.layers.3.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
44
+ base_model.model.model.layers.3.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
45
+ base_model.model.model.layers.3.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
46
+ base_model.model.model.layers.3.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
47
+ base_model.model.model.layers.3.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
48
+ base_model.model.model.layers.3.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
49
+ base_model.model.model.layers.3.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
50
+ base_model.model.model.layers.4.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
51
+ base_model.model.model.layers.4.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
52
+ base_model.model.model.layers.4.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
53
+ base_model.model.model.layers.4.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
54
+ base_model.model.model.layers.4.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
55
+ base_model.model.model.layers.4.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
56
+ base_model.model.model.layers.4.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
57
+ base_model.model.model.layers.4.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
58
+ base_model.model.model.layers.4.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
59
+ base_model.model.model.layers.4.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
60
+ base_model.model.model.layers.5.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
61
+ base_model.model.model.layers.5.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
62
+ base_model.model.model.layers.5.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
63
+ base_model.model.model.layers.5.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
64
+ base_model.model.model.layers.5.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
65
+ base_model.model.model.layers.5.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
66
+ base_model.model.model.layers.5.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
67
+ base_model.model.model.layers.5.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
68
+ base_model.model.model.layers.5.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
69
+ base_model.model.model.layers.5.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
70
+ base_model.model.model.layers.6.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
71
+ base_model.model.model.layers.6.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
72
+ base_model.model.model.layers.6.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
73
+ base_model.model.model.layers.6.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
74
+ base_model.model.model.layers.6.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
75
+ base_model.model.model.layers.6.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
76
+ base_model.model.model.layers.6.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
77
+ base_model.model.model.layers.6.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
78
+ base_model.model.model.layers.6.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
79
+ base_model.model.model.layers.6.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
80
+ base_model.model.model.layers.7.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
81
+ base_model.model.model.layers.7.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
82
+ base_model.model.model.layers.7.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
83
+ base_model.model.model.layers.7.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
84
+ base_model.model.model.layers.7.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
85
+ base_model.model.model.layers.7.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
86
+ base_model.model.model.layers.7.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
87
+ base_model.model.model.layers.7.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
88
+ base_model.model.model.layers.7.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
89
+ base_model.model.model.layers.7.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
90
+ base_model.model.model.layers.8.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
91
+ base_model.model.model.layers.8.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
92
+ base_model.model.model.layers.8.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
93
+ base_model.model.model.layers.8.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
94
+ base_model.model.model.layers.8.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
95
+ base_model.model.model.layers.8.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
96
+ base_model.model.model.layers.8.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
97
+ base_model.model.model.layers.8.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
98
+ base_model.model.model.layers.8.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
99
+ base_model.model.model.layers.8.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
100
+ base_model.model.model.layers.9.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
101
+ base_model.model.model.layers.9.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
102
+ base_model.model.model.layers.9.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
103
+ base_model.model.model.layers.9.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
104
+ base_model.model.model.layers.9.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
105
+ base_model.model.model.layers.9.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
106
+ base_model.model.model.layers.9.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
107
+ base_model.model.model.layers.9.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
108
+ base_model.model.model.layers.9.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
109
+ base_model.model.model.layers.9.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
110
+ base_model.model.model.layers.10.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
111
+ base_model.model.model.layers.10.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
112
+ base_model.model.model.layers.10.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
113
+ base_model.model.model.layers.10.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
114
+ base_model.model.model.layers.10.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
115
+ base_model.model.model.layers.10.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
116
+ base_model.model.model.layers.10.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
117
+ base_model.model.model.layers.10.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
118
+ base_model.model.model.layers.10.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
119
+ base_model.model.model.layers.10.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
120
+ base_model.model.model.layers.11.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
121
+ base_model.model.model.layers.11.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
122
+ base_model.model.model.layers.11.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
123
+ base_model.model.model.layers.11.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
124
+ base_model.model.model.layers.11.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
125
+ base_model.model.model.layers.11.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
126
+ base_model.model.model.layers.11.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
127
+ base_model.model.model.layers.11.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
128
+ base_model.model.model.layers.11.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
129
+ base_model.model.model.layers.11.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
130
+ base_model.model.model.layers.12.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
131
+ base_model.model.model.layers.12.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
132
+ base_model.model.model.layers.12.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
133
+ base_model.model.model.layers.12.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
134
+ base_model.model.model.layers.12.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
135
+ base_model.model.model.layers.12.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
136
+ base_model.model.model.layers.12.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
137
+ base_model.model.model.layers.12.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
138
+ base_model.model.model.layers.12.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
139
+ base_model.model.model.layers.12.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
140
+ base_model.model.model.layers.13.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
141
+ base_model.model.model.layers.13.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
142
+ base_model.model.model.layers.13.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
143
+ base_model.model.model.layers.13.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
144
+ base_model.model.model.layers.13.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
145
+ base_model.model.model.layers.13.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
146
+ base_model.model.model.layers.13.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
147
+ base_model.model.model.layers.13.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
148
+ base_model.model.model.layers.13.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
149
+ base_model.model.model.layers.13.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
150
+ base_model.model.model.layers.14.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
151
+ base_model.model.model.layers.14.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
152
+ base_model.model.model.layers.14.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
153
+ base_model.model.model.layers.14.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
154
+ base_model.model.model.layers.14.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
155
+ base_model.model.model.layers.14.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
156
+ base_model.model.model.layers.14.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
157
+ base_model.model.model.layers.14.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
158
+ base_model.model.model.layers.14.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
159
+ base_model.model.model.layers.14.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
160
+ base_model.model.model.layers.15.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
161
+ base_model.model.model.layers.15.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
162
+ base_model.model.model.layers.15.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
163
+ base_model.model.model.layers.15.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
164
+ base_model.model.model.layers.15.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
165
+ base_model.model.model.layers.15.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
166
+ base_model.model.model.layers.15.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
167
+ base_model.model.model.layers.15.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
168
+ base_model.model.model.layers.15.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
169
+ base_model.model.model.layers.15.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
170
+ base_model.model.model.layers.16.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
171
+ base_model.model.model.layers.16.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
172
+ base_model.model.model.layers.16.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
173
+ base_model.model.model.layers.16.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
174
+ base_model.model.model.layers.16.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
175
+ base_model.model.model.layers.16.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
176
+ base_model.model.model.layers.16.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
177
+ base_model.model.model.layers.16.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
178
+ base_model.model.model.layers.16.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
179
+ base_model.model.model.layers.16.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
180
+ base_model.model.model.layers.17.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
181
+ base_model.model.model.layers.17.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
182
+ base_model.model.model.layers.17.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
183
+ base_model.model.model.layers.17.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
184
+ base_model.model.model.layers.17.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
185
+ base_model.model.model.layers.17.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
186
+ base_model.model.model.layers.17.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
187
+ base_model.model.model.layers.17.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
188
+ base_model.model.model.layers.17.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
189
+ base_model.model.model.layers.17.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
190
+ base_model.model.model.layers.18.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
191
+ base_model.model.model.layers.18.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
192
+ base_model.model.model.layers.18.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
193
+ base_model.model.model.layers.18.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
194
+ base_model.model.model.layers.18.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
195
+ base_model.model.model.layers.18.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
196
+ base_model.model.model.layers.18.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
197
+ base_model.model.model.layers.18.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
198
+ base_model.model.model.layers.18.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
199
+ base_model.model.model.layers.18.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
200
+ base_model.model.model.layers.19.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
201
+ base_model.model.model.layers.19.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
202
+ base_model.model.model.layers.19.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
203
+ base_model.model.model.layers.19.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
204
+ base_model.model.model.layers.19.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
205
+ base_model.model.model.layers.19.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
206
+ base_model.model.model.layers.19.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
207
+ base_model.model.model.layers.19.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
208
+ base_model.model.model.layers.19.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
209
+ base_model.model.model.layers.19.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
210
+ base_model.model.model.layers.20.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
211
+ base_model.model.model.layers.20.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
212
+ base_model.model.model.layers.20.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
213
+ base_model.model.model.layers.20.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
214
+ base_model.model.model.layers.20.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
215
+ base_model.model.model.layers.20.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
216
+ base_model.model.model.layers.20.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
217
+ base_model.model.model.layers.20.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
218
+ base_model.model.model.layers.20.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
219
+ base_model.model.model.layers.20.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
220
+ base_model.model.model.layers.21.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
221
+ base_model.model.model.layers.21.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
222
+ base_model.model.model.layers.21.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
223
+ base_model.model.model.layers.21.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
224
+ base_model.model.model.layers.21.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
225
+ base_model.model.model.layers.21.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
226
+ base_model.model.model.layers.21.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
227
+ base_model.model.model.layers.21.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
228
+ base_model.model.model.layers.21.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
229
+ base_model.model.model.layers.21.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
230
+ base_model.model.model.layers.22.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
231
+ base_model.model.model.layers.22.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
232
+ base_model.model.model.layers.22.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
233
+ base_model.model.model.layers.22.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
234
+ base_model.model.model.layers.22.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
235
+ base_model.model.model.layers.22.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
236
+ base_model.model.model.layers.22.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
237
+ base_model.model.model.layers.22.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
238
+ base_model.model.model.layers.22.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
239
+ base_model.model.model.layers.22.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
240
+ base_model.model.model.layers.23.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
241
+ base_model.model.model.layers.23.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
242
+ base_model.model.model.layers.23.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
243
+ base_model.model.model.layers.23.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
244
+ base_model.model.model.layers.23.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
245
+ base_model.model.model.layers.23.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
246
+ base_model.model.model.layers.23.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
247
+ base_model.model.model.layers.23.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
248
+ base_model.model.model.layers.23.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
249
+ base_model.model.model.layers.23.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
250
+ base_model.model.model.layers.24.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
251
+ base_model.model.model.layers.24.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
252
+ base_model.model.model.layers.24.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
253
+ base_model.model.model.layers.24.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
254
+ base_model.model.model.layers.24.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
255
+ base_model.model.model.layers.24.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
256
+ base_model.model.model.layers.24.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
257
+ base_model.model.model.layers.24.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
258
+ base_model.model.model.layers.24.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
259
+ base_model.model.model.layers.24.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
260
+ base_model.model.model.layers.25.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
261
+ base_model.model.model.layers.25.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
262
+ base_model.model.model.layers.25.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
263
+ base_model.model.model.layers.25.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
264
+ base_model.model.model.layers.25.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
265
+ base_model.model.model.layers.25.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
266
+ base_model.model.model.layers.25.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
267
+ base_model.model.model.layers.25.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
268
+ base_model.model.model.layers.25.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
269
+ base_model.model.model.layers.25.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
270
+ base_model.model.model.layers.26.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
271
+ base_model.model.model.layers.26.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
272
+ base_model.model.model.layers.26.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
273
+ base_model.model.model.layers.26.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
274
+ base_model.model.model.layers.26.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
275
+ base_model.model.model.layers.26.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
276
+ base_model.model.model.layers.26.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
277
+ base_model.model.model.layers.26.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
278
+ base_model.model.model.layers.26.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
279
+ base_model.model.model.layers.26.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
280
+ base_model.model.model.layers.27.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
281
+ base_model.model.model.layers.27.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
282
+ base_model.model.model.layers.27.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
283
+ base_model.model.model.layers.27.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
284
+ base_model.model.model.layers.27.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
285
+ base_model.model.model.layers.27.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
286
+ base_model.model.model.layers.27.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
287
+ base_model.model.model.layers.27.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
288
+ base_model.model.model.layers.27.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
289
+ base_model.model.model.layers.27.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
290
+ base_model.model.model.layers.28.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
291
+ base_model.model.model.layers.28.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
292
+ base_model.model.model.layers.28.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
293
+ base_model.model.model.layers.28.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
294
+ base_model.model.model.layers.28.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
295
+ base_model.model.model.layers.28.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
296
+ base_model.model.model.layers.28.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
297
+ base_model.model.model.layers.28.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
298
+ base_model.model.model.layers.28.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
299
+ base_model.model.model.layers.28.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
300
+ base_model.model.model.layers.29.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
301
+ base_model.model.model.layers.29.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
302
+ base_model.model.model.layers.29.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
303
+ base_model.model.model.layers.29.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
304
+ base_model.model.model.layers.29.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
305
+ base_model.model.model.layers.29.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
306
+ base_model.model.model.layers.29.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
307
+ base_model.model.model.layers.29.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
308
+ base_model.model.model.layers.29.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
309
+ base_model.model.model.layers.29.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
310
+ base_model.model.model.layers.30.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
311
+ base_model.model.model.layers.30.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
312
+ base_model.model.model.layers.30.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
313
+ base_model.model.model.layers.30.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
314
+ base_model.model.model.layers.30.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
315
+ base_model.model.model.layers.30.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
316
+ base_model.model.model.layers.30.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
317
+ base_model.model.model.layers.30.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
318
+ base_model.model.model.layers.30.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
319
+ base_model.model.model.layers.30.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
320
+ base_model.model.model.layers.31.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
321
+ base_model.model.model.layers.31.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
322
+ base_model.model.model.layers.31.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
323
+ base_model.model.model.layers.31.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
324
+ base_model.model.model.layers.31.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
325
+ base_model.model.model.layers.31.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
326
+ base_model.model.model.layers.31.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
327
+ base_model.model.model.layers.31.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
328
+ base_model.model.model.layers.31.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
329
+ base_model.model.model.layers.31.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
330
+ base_model.model.model.layers.32.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
331
+ base_model.model.model.layers.32.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
332
+ base_model.model.model.layers.32.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
333
+ base_model.model.model.layers.32.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
334
+ base_model.model.model.layers.32.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
335
+ base_model.model.model.layers.32.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
336
+ base_model.model.model.layers.32.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
337
+ base_model.model.model.layers.32.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
338
+ base_model.model.model.layers.32.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
339
+ base_model.model.model.layers.32.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
340
+ base_model.model.model.layers.33.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
341
+ base_model.model.model.layers.33.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
342
+ base_model.model.model.layers.33.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
343
+ base_model.model.model.layers.33.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
344
+ base_model.model.model.layers.33.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
345
+ base_model.model.model.layers.33.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
346
+ base_model.model.model.layers.33.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
347
+ base_model.model.model.layers.33.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
348
+ base_model.model.model.layers.33.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
349
+ base_model.model.model.layers.33.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
350
+ base_model.model.model.layers.34.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
351
+ base_model.model.model.layers.34.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
352
+ base_model.model.model.layers.34.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
353
+ base_model.model.model.layers.34.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
354
+ base_model.model.model.layers.34.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
355
+ base_model.model.model.layers.34.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
356
+ base_model.model.model.layers.34.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
357
+ base_model.model.model.layers.34.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
358
+ base_model.model.model.layers.34.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
359
+ base_model.model.model.layers.34.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
360
+ base_model.model.model.layers.35.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
361
+ base_model.model.model.layers.35.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
362
+ base_model.model.model.layers.35.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
363
+ base_model.model.model.layers.35.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
364
+ base_model.model.model.layers.35.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
365
+ base_model.model.model.layers.35.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
366
+ base_model.model.model.layers.35.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
367
+ base_model.model.model.layers.35.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
368
+ base_model.model.model.layers.35.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
369
+ base_model.model.model.layers.35.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
370
+ base_model.model.model.layers.36.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
371
+ base_model.model.model.layers.36.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
372
+ base_model.model.model.layers.36.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
373
+ base_model.model.model.layers.36.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
374
+ base_model.model.model.layers.36.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
375
+ base_model.model.model.layers.36.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
376
+ base_model.model.model.layers.36.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
377
+ base_model.model.model.layers.36.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
378
+ base_model.model.model.layers.36.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
379
+ base_model.model.model.layers.36.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
380
+ base_model.model.model.layers.37.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
381
+ base_model.model.model.layers.37.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
382
+ base_model.model.model.layers.37.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
383
+ base_model.model.model.layers.37.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
384
+ base_model.model.model.layers.37.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
385
+ base_model.model.model.layers.37.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
386
+ base_model.model.model.layers.37.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
387
+ base_model.model.model.layers.37.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
388
+ base_model.model.model.layers.37.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
389
+ base_model.model.model.layers.37.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
390
+ base_model.model.model.layers.38.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
391
+ base_model.model.model.layers.38.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
392
+ base_model.model.model.layers.38.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
393
+ base_model.model.model.layers.38.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
394
+ base_model.model.model.layers.38.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
395
+ base_model.model.model.layers.38.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
396
+ base_model.model.model.layers.38.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
397
+ base_model.model.model.layers.38.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
398
+ base_model.model.model.layers.38.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
399
+ base_model.model.model.layers.38.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
400
+ base_model.model.model.layers.39.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
401
+ base_model.model.model.layers.39.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
402
+ base_model.model.model.layers.39.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
403
+ base_model.model.model.layers.39.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
404
+ base_model.model.model.layers.39.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
405
+ base_model.model.model.layers.39.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
406
+ base_model.model.model.layers.39.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
407
+ base_model.model.model.layers.39.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
408
+ base_model.model.model.layers.39.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
409
+ base_model.model.model.layers.39.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "dfurman/llama-13b",
3
+ "bias": "none",
4
+ "col_L": 16,
5
+ "drop_out": 0.0,
6
+ "inference_mode": false,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_unique_blocks_L": 32,
10
+ "num_unique_blocks_R": 16,
11
+ "peft_type": "SAMA",
12
+ "revision": null,
13
+ "row_R": 32,
14
+ "scaling": 1,
15
+ "share_factor_L": 1,
16
+ "share_factor_R": 1,
17
+ "target_modules": [
18
+ "up_proj",
19
+ "down_proj",
20
+ "q_proj",
21
+ "k_proj",
22
+ "v_proj"
23
+ ],
24
+ "target_modules_to_skip": null,
25
+ "task_type": "CAUSAL_LM"
26
+ }
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": true,
36
+ "model_max_length": 512,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft2/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "dfurman/llama-13b",
3
+ "bias": "none",
4
+ "col_L": 16,
5
+ "drop_out": 0.0,
6
+ "inference_mode": true,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_unique_blocks_L": 32,
10
+ "num_unique_blocks_R": 16,
11
+ "peft_type": "SAMA",
12
+ "revision": null,
13
+ "row_R": 32,
14
+ "scaling": 1,
15
+ "share_factor_L": 1,
16
+ "share_factor_R": 1,
17
+ "target_modules": [
18
+ "up_proj",
19
+ "down_proj",
20
+ "q_proj",
21
+ "k_proj",
22
+ "v_proj"
23
+ ],
24
+ "target_modules_to_skip": null,
25
+ "task_type": "CAUSAL_LM"
26
+ }
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb650adfdb4d1e644d3b081d691f222a3afc5dae46db87118ade70027f6ba1a1
3
+ size 263610019
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/model_parameters_report.txt ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ === GLOBAL STATISTICS ===
2
+ Total Parameters: 13,081,728,000
3
+ Trainable Parameters: 65,863,680
4
+ Trainable Percentage: 0.5035%
5
+
6
+ ================================================================================================================================
7
+ === DETAILED TRAINABLE MATRICES LIST ===
8
+ Layer Name | Shape | Count
9
+ ---------------------------------------------------------------------------------------------------------------------------------------
10
+ base_model.model.model.layers.0.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
11
+ base_model.model.model.layers.0.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
12
+ base_model.model.model.layers.0.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
13
+ base_model.model.model.layers.0.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
14
+ base_model.model.model.layers.0.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
15
+ base_model.model.model.layers.0.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
16
+ base_model.model.model.layers.0.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
17
+ base_model.model.model.layers.0.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
18
+ base_model.model.model.layers.0.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
19
+ base_model.model.model.layers.0.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
20
+ base_model.model.model.layers.1.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
21
+ base_model.model.model.layers.1.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
22
+ base_model.model.model.layers.1.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
23
+ base_model.model.model.layers.1.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
24
+ base_model.model.model.layers.1.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
25
+ base_model.model.model.layers.1.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
26
+ base_model.model.model.layers.1.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
27
+ base_model.model.model.layers.1.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
28
+ base_model.model.model.layers.1.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
29
+ base_model.model.model.layers.1.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
30
+ base_model.model.model.layers.2.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
31
+ base_model.model.model.layers.2.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
32
+ base_model.model.model.layers.2.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
33
+ base_model.model.model.layers.2.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
34
+ base_model.model.model.layers.2.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
35
+ base_model.model.model.layers.2.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
36
+ base_model.model.model.layers.2.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
37
+ base_model.model.model.layers.2.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
38
+ base_model.model.model.layers.2.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
39
+ base_model.model.model.layers.2.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
40
+ base_model.model.model.layers.3.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
41
+ base_model.model.model.layers.3.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
42
+ base_model.model.model.layers.3.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
43
+ base_model.model.model.layers.3.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
44
+ base_model.model.model.layers.3.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
45
+ base_model.model.model.layers.3.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
46
+ base_model.model.model.layers.3.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
47
+ base_model.model.model.layers.3.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
48
+ base_model.model.model.layers.3.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
49
+ base_model.model.model.layers.3.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
50
+ base_model.model.model.layers.4.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
51
+ base_model.model.model.layers.4.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
52
+ base_model.model.model.layers.4.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
53
+ base_model.model.model.layers.4.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
54
+ base_model.model.model.layers.4.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
55
+ base_model.model.model.layers.4.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
56
+ base_model.model.model.layers.4.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
57
+ base_model.model.model.layers.4.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
58
+ base_model.model.model.layers.4.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
59
+ base_model.model.model.layers.4.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
60
+ base_model.model.model.layers.5.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
61
+ base_model.model.model.layers.5.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
62
+ base_model.model.model.layers.5.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
63
+ base_model.model.model.layers.5.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
64
+ base_model.model.model.layers.5.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
65
+ base_model.model.model.layers.5.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
66
+ base_model.model.model.layers.5.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
67
+ base_model.model.model.layers.5.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
68
+ base_model.model.model.layers.5.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
69
+ base_model.model.model.layers.5.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
70
+ base_model.model.model.layers.6.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
71
+ base_model.model.model.layers.6.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
72
+ base_model.model.model.layers.6.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
73
+ base_model.model.model.layers.6.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
74
+ base_model.model.model.layers.6.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
75
+ base_model.model.model.layers.6.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
76
+ base_model.model.model.layers.6.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
77
+ base_model.model.model.layers.6.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
78
+ base_model.model.model.layers.6.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
79
+ base_model.model.model.layers.6.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
80
+ base_model.model.model.layers.7.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
81
+ base_model.model.model.layers.7.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
82
+ base_model.model.model.layers.7.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
83
+ base_model.model.model.layers.7.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
84
+ base_model.model.model.layers.7.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
85
+ base_model.model.model.layers.7.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
86
+ base_model.model.model.layers.7.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
87
+ base_model.model.model.layers.7.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
88
+ base_model.model.model.layers.7.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
89
+ base_model.model.model.layers.7.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
90
+ base_model.model.model.layers.8.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
91
+ base_model.model.model.layers.8.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
92
+ base_model.model.model.layers.8.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
93
+ base_model.model.model.layers.8.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
94
+ base_model.model.model.layers.8.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
95
+ base_model.model.model.layers.8.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
96
+ base_model.model.model.layers.8.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
97
+ base_model.model.model.layers.8.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
98
+ base_model.model.model.layers.8.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
99
+ base_model.model.model.layers.8.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
100
+ base_model.model.model.layers.9.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
101
+ base_model.model.model.layers.9.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
102
+ base_model.model.model.layers.9.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
103
+ base_model.model.model.layers.9.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
104
+ base_model.model.model.layers.9.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
105
+ base_model.model.model.layers.9.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
106
+ base_model.model.model.layers.9.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
107
+ base_model.model.model.layers.9.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
108
+ base_model.model.model.layers.9.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
109
+ base_model.model.model.layers.9.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
110
+ base_model.model.model.layers.10.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
111
+ base_model.model.model.layers.10.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
112
+ base_model.model.model.layers.10.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
113
+ base_model.model.model.layers.10.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
114
+ base_model.model.model.layers.10.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
115
+ base_model.model.model.layers.10.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
116
+ base_model.model.model.layers.10.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
117
+ base_model.model.model.layers.10.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
118
+ base_model.model.model.layers.10.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
119
+ base_model.model.model.layers.10.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
120
+ base_model.model.model.layers.11.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
121
+ base_model.model.model.layers.11.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
122
+ base_model.model.model.layers.11.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
123
+ base_model.model.model.layers.11.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
124
+ base_model.model.model.layers.11.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
125
+ base_model.model.model.layers.11.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
126
+ base_model.model.model.layers.11.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
127
+ base_model.model.model.layers.11.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
128
+ base_model.model.model.layers.11.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
129
+ base_model.model.model.layers.11.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
130
+ base_model.model.model.layers.12.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
131
+ base_model.model.model.layers.12.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
132
+ base_model.model.model.layers.12.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
133
+ base_model.model.model.layers.12.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
134
+ base_model.model.model.layers.12.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
135
+ base_model.model.model.layers.12.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
136
+ base_model.model.model.layers.12.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
137
+ base_model.model.model.layers.12.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
138
+ base_model.model.model.layers.12.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
139
+ base_model.model.model.layers.12.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
140
+ base_model.model.model.layers.13.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
141
+ base_model.model.model.layers.13.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
142
+ base_model.model.model.layers.13.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
143
+ base_model.model.model.layers.13.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
144
+ base_model.model.model.layers.13.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
145
+ base_model.model.model.layers.13.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
146
+ base_model.model.model.layers.13.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
147
+ base_model.model.model.layers.13.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
148
+ base_model.model.model.layers.13.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
149
+ base_model.model.model.layers.13.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
150
+ base_model.model.model.layers.14.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
151
+ base_model.model.model.layers.14.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
152
+ base_model.model.model.layers.14.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
153
+ base_model.model.model.layers.14.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
154
+ base_model.model.model.layers.14.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
155
+ base_model.model.model.layers.14.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
156
+ base_model.model.model.layers.14.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
157
+ base_model.model.model.layers.14.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
158
+ base_model.model.model.layers.14.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
159
+ base_model.model.model.layers.14.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
160
+ base_model.model.model.layers.15.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
161
+ base_model.model.model.layers.15.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
162
+ base_model.model.model.layers.15.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
163
+ base_model.model.model.layers.15.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
164
+ base_model.model.model.layers.15.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
165
+ base_model.model.model.layers.15.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
166
+ base_model.model.model.layers.15.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
167
+ base_model.model.model.layers.15.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
168
+ base_model.model.model.layers.15.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
169
+ base_model.model.model.layers.15.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
170
+ base_model.model.model.layers.16.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
171
+ base_model.model.model.layers.16.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
172
+ base_model.model.model.layers.16.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
173
+ base_model.model.model.layers.16.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
174
+ base_model.model.model.layers.16.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
175
+ base_model.model.model.layers.16.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
176
+ base_model.model.model.layers.16.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
177
+ base_model.model.model.layers.16.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
178
+ base_model.model.model.layers.16.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
179
+ base_model.model.model.layers.16.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
180
+ base_model.model.model.layers.17.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
181
+ base_model.model.model.layers.17.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
182
+ base_model.model.model.layers.17.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
183
+ base_model.model.model.layers.17.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
184
+ base_model.model.model.layers.17.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
185
+ base_model.model.model.layers.17.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
186
+ base_model.model.model.layers.17.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
187
+ base_model.model.model.layers.17.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
188
+ base_model.model.model.layers.17.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
189
+ base_model.model.model.layers.17.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
190
+ base_model.model.model.layers.18.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
191
+ base_model.model.model.layers.18.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
192
+ base_model.model.model.layers.18.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
193
+ base_model.model.model.layers.18.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
194
+ base_model.model.model.layers.18.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
195
+ base_model.model.model.layers.18.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
196
+ base_model.model.model.layers.18.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
197
+ base_model.model.model.layers.18.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
198
+ base_model.model.model.layers.18.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
199
+ base_model.model.model.layers.18.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
200
+ base_model.model.model.layers.19.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
201
+ base_model.model.model.layers.19.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
202
+ base_model.model.model.layers.19.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
203
+ base_model.model.model.layers.19.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
204
+ base_model.model.model.layers.19.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
205
+ base_model.model.model.layers.19.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
206
+ base_model.model.model.layers.19.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
207
+ base_model.model.model.layers.19.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
208
+ base_model.model.model.layers.19.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
209
+ base_model.model.model.layers.19.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
210
+ base_model.model.model.layers.20.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
211
+ base_model.model.model.layers.20.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
212
+ base_model.model.model.layers.20.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
213
+ base_model.model.model.layers.20.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
214
+ base_model.model.model.layers.20.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
215
+ base_model.model.model.layers.20.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
216
+ base_model.model.model.layers.20.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
217
+ base_model.model.model.layers.20.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
218
+ base_model.model.model.layers.20.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
219
+ base_model.model.model.layers.20.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
220
+ base_model.model.model.layers.21.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
221
+ base_model.model.model.layers.21.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
222
+ base_model.model.model.layers.21.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
223
+ base_model.model.model.layers.21.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
224
+ base_model.model.model.layers.21.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
225
+ base_model.model.model.layers.21.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
226
+ base_model.model.model.layers.21.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
227
+ base_model.model.model.layers.21.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
228
+ base_model.model.model.layers.21.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
229
+ base_model.model.model.layers.21.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
230
+ base_model.model.model.layers.22.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
231
+ base_model.model.model.layers.22.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
232
+ base_model.model.model.layers.22.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
233
+ base_model.model.model.layers.22.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
234
+ base_model.model.model.layers.22.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
235
+ base_model.model.model.layers.22.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
236
+ base_model.model.model.layers.22.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
237
+ base_model.model.model.layers.22.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
238
+ base_model.model.model.layers.22.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
239
+ base_model.model.model.layers.22.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
240
+ base_model.model.model.layers.23.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
241
+ base_model.model.model.layers.23.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
242
+ base_model.model.model.layers.23.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
243
+ base_model.model.model.layers.23.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
244
+ base_model.model.model.layers.23.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
245
+ base_model.model.model.layers.23.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
246
+ base_model.model.model.layers.23.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
247
+ base_model.model.model.layers.23.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
248
+ base_model.model.model.layers.23.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
249
+ base_model.model.model.layers.23.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
250
+ base_model.model.model.layers.24.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
251
+ base_model.model.model.layers.24.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
252
+ base_model.model.model.layers.24.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
253
+ base_model.model.model.layers.24.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
254
+ base_model.model.model.layers.24.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
255
+ base_model.model.model.layers.24.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
256
+ base_model.model.model.layers.24.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
257
+ base_model.model.model.layers.24.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
258
+ base_model.model.model.layers.24.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
259
+ base_model.model.model.layers.24.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
260
+ base_model.model.model.layers.25.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
261
+ base_model.model.model.layers.25.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
262
+ base_model.model.model.layers.25.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
263
+ base_model.model.model.layers.25.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
264
+ base_model.model.model.layers.25.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
265
+ base_model.model.model.layers.25.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
266
+ base_model.model.model.layers.25.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
267
+ base_model.model.model.layers.25.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
268
+ base_model.model.model.layers.25.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
269
+ base_model.model.model.layers.25.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
270
+ base_model.model.model.layers.26.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
271
+ base_model.model.model.layers.26.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
272
+ base_model.model.model.layers.26.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
273
+ base_model.model.model.layers.26.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
274
+ base_model.model.model.layers.26.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
275
+ base_model.model.model.layers.26.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
276
+ base_model.model.model.layers.26.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
277
+ base_model.model.model.layers.26.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
278
+ base_model.model.model.layers.26.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
279
+ base_model.model.model.layers.26.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
280
+ base_model.model.model.layers.27.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
281
+ base_model.model.model.layers.27.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
282
+ base_model.model.model.layers.27.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
283
+ base_model.model.model.layers.27.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
284
+ base_model.model.model.layers.27.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
285
+ base_model.model.model.layers.27.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
286
+ base_model.model.model.layers.27.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
287
+ base_model.model.model.layers.27.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
288
+ base_model.model.model.layers.27.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
289
+ base_model.model.model.layers.27.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
290
+ base_model.model.model.layers.28.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
291
+ base_model.model.model.layers.28.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
292
+ base_model.model.model.layers.28.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
293
+ base_model.model.model.layers.28.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
294
+ base_model.model.model.layers.28.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
295
+ base_model.model.model.layers.28.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
296
+ base_model.model.model.layers.28.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
297
+ base_model.model.model.layers.28.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
298
+ base_model.model.model.layers.28.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
299
+ base_model.model.model.layers.28.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
300
+ base_model.model.model.layers.29.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
301
+ base_model.model.model.layers.29.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
302
+ base_model.model.model.layers.29.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
303
+ base_model.model.model.layers.29.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
304
+ base_model.model.model.layers.29.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
305
+ base_model.model.model.layers.29.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
306
+ base_model.model.model.layers.29.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
307
+ base_model.model.model.layers.29.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
308
+ base_model.model.model.layers.29.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
309
+ base_model.model.model.layers.29.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
310
+ base_model.model.model.layers.30.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
311
+ base_model.model.model.layers.30.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
312
+ base_model.model.model.layers.30.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
313
+ base_model.model.model.layers.30.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
314
+ base_model.model.model.layers.30.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
315
+ base_model.model.model.layers.30.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
316
+ base_model.model.model.layers.30.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
317
+ base_model.model.model.layers.30.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
318
+ base_model.model.model.layers.30.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
319
+ base_model.model.model.layers.30.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
320
+ base_model.model.model.layers.31.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
321
+ base_model.model.model.layers.31.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
322
+ base_model.model.model.layers.31.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
323
+ base_model.model.model.layers.31.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
324
+ base_model.model.model.layers.31.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
325
+ base_model.model.model.layers.31.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
326
+ base_model.model.model.layers.31.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
327
+ base_model.model.model.layers.31.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
328
+ base_model.model.model.layers.31.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
329
+ base_model.model.model.layers.31.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
330
+ base_model.model.model.layers.32.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
331
+ base_model.model.model.layers.32.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
332
+ base_model.model.model.layers.32.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
333
+ base_model.model.model.layers.32.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
334
+ base_model.model.model.layers.32.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
335
+ base_model.model.model.layers.32.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
336
+ base_model.model.model.layers.32.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
337
+ base_model.model.model.layers.32.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
338
+ base_model.model.model.layers.32.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
339
+ base_model.model.model.layers.32.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
340
+ base_model.model.model.layers.33.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
341
+ base_model.model.model.layers.33.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
342
+ base_model.model.model.layers.33.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
343
+ base_model.model.model.layers.33.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
344
+ base_model.model.model.layers.33.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
345
+ base_model.model.model.layers.33.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
346
+ base_model.model.model.layers.33.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
347
+ base_model.model.model.layers.33.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
348
+ base_model.model.model.layers.33.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
349
+ base_model.model.model.layers.33.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
350
+ base_model.model.model.layers.34.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
351
+ base_model.model.model.layers.34.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
352
+ base_model.model.model.layers.34.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
353
+ base_model.model.model.layers.34.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
354
+ base_model.model.model.layers.34.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
355
+ base_model.model.model.layers.34.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
356
+ base_model.model.model.layers.34.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
357
+ base_model.model.model.layers.34.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
358
+ base_model.model.model.layers.34.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
359
+ base_model.model.model.layers.34.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
360
+ base_model.model.model.layers.35.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
361
+ base_model.model.model.layers.35.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
362
+ base_model.model.model.layers.35.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
363
+ base_model.model.model.layers.35.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
364
+ base_model.model.model.layers.35.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
365
+ base_model.model.model.layers.35.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
366
+ base_model.model.model.layers.35.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
367
+ base_model.model.model.layers.35.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
368
+ base_model.model.model.layers.35.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
369
+ base_model.model.model.layers.35.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
370
+ base_model.model.model.layers.36.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
371
+ base_model.model.model.layers.36.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
372
+ base_model.model.model.layers.36.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
373
+ base_model.model.model.layers.36.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
374
+ base_model.model.model.layers.36.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
375
+ base_model.model.model.layers.36.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
376
+ base_model.model.model.layers.36.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
377
+ base_model.model.model.layers.36.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
378
+ base_model.model.model.layers.36.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
379
+ base_model.model.model.layers.36.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
380
+ base_model.model.model.layers.37.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
381
+ base_model.model.model.layers.37.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
382
+ base_model.model.model.layers.37.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
383
+ base_model.model.model.layers.37.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
384
+ base_model.model.model.layers.37.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
385
+ base_model.model.model.layers.37.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
386
+ base_model.model.model.layers.37.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
387
+ base_model.model.model.layers.37.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
388
+ base_model.model.model.layers.37.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
389
+ base_model.model.model.layers.37.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
390
+ base_model.model.model.layers.38.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
391
+ base_model.model.model.layers.38.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
392
+ base_model.model.model.layers.38.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
393
+ base_model.model.model.layers.38.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
394
+ base_model.model.model.layers.38.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
395
+ base_model.model.model.layers.38.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
396
+ base_model.model.model.layers.38.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
397
+ base_model.model.model.layers.38.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
398
+ base_model.model.model.layers.38.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
399
+ base_model.model.model.layers.38.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
400
+ base_model.model.model.layers.39.self_attn.q_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
401
+ base_model.model.model.layers.39.self_attn.q_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
402
+ base_model.model.model.layers.39.self_attn.k_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
403
+ base_model.model.model.layers.39.self_attn.k_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
404
+ base_model.model.model.layers.39.self_attn.v_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
405
+ base_model.model.model.layers.39.self_attn.v_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
406
+ base_model.model.model.layers.39.mlp.up_proj._sama_layer.default.sama_L.weights | [32, 432, 16] | 221,184
407
+ base_model.model.model.layers.39.mlp.up_proj._sama_layer.default.sama_R.weights | [16, 32, 320] | 163,840
408
+ base_model.model.model.layers.39.mlp.down_proj._sama_layer.default.sama_L.weights | [32, 160, 16] | 81,920
409
+ base_model.model.model.layers.39.mlp.down_proj._sama_layer.default.sama_R.weights | [16, 32, 864] | 442,368
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/ARC-Challenge.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/ARC-Easy.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/FINAL.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boolq": 73.91437308868501,
3
+ "piqa": 87.97606093579978,
4
+ "social_i_qa": 82.3439099283521,
5
+ "hellaswag": 95.4690300736905,
6
+ "winogrande": 87.05603788476716,
7
+ "ARC-Easy": 90.65656565656566,
8
+ "ARC-Challenge": 76.87713310580205,
9
+ "openbookqa": 89.2,
10
+ "average_score": 85.4366388342078
11
+ }
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/boolq.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/hellaswag.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/openbookqa.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/piqa.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/social_i_qa.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/results/winogrande.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/trainer_state.json ADDED
@@ -0,0 +1,950 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 9000,
3
+ "best_metric": 0.022743234410881996,
4
+ "best_model_checkpoint": "./Llama13B/CMS/t60108d00h34m20,ep=2.0,mlr5.0e-04,b8,nb32,16,cL16,rR32,s1,initdef,dr0.0,size146627,5/checkpoint-9000",
5
+ "epoch": 2.0,
6
+ "eval_steps": 300,
7
+ "global_step": 18330,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.03273501009329478,
14
+ "grad_norm": 0.0560440868139267,
15
+ "learning_rate": 0.0004998529736085261,
16
+ "loss": 0.3967,
17
+ "step": 300
18
+ },
19
+ {
20
+ "epoch": 0.06547002018658955,
21
+ "grad_norm": 0.044248245656490326,
22
+ "learning_rate": 0.0004990760146037058,
23
+ "loss": 0.0538,
24
+ "step": 600
25
+ },
26
+ {
27
+ "epoch": 0.06547002018658955,
28
+ "eval_loss": 0.04312271624803543,
29
+ "eval_runtime": 42.396,
30
+ "eval_samples_per_second": 23.587,
31
+ "eval_steps_per_second": 0.755,
32
+ "step": 600
33
+ },
34
+ {
35
+ "epoch": 0.09820503027988434,
36
+ "grad_norm": 0.09440872073173523,
37
+ "learning_rate": 0.0004976333236993797,
38
+ "loss": 0.0458,
39
+ "step": 900
40
+ },
41
+ {
42
+ "epoch": 0.09820503027988434,
43
+ "eval_loss": 0.03903611749410629,
44
+ "eval_runtime": 42.1342,
45
+ "eval_samples_per_second": 23.734,
46
+ "eval_steps_per_second": 0.759,
47
+ "step": 900
48
+ },
49
+ {
50
+ "epoch": 0.1309400403731791,
51
+ "grad_norm": 0.12280289828777313,
52
+ "learning_rate": 0.0004955287569286458,
53
+ "loss": 0.0401,
54
+ "step": 1200
55
+ },
56
+ {
57
+ "epoch": 0.1309400403731791,
58
+ "eval_loss": 0.03701520711183548,
59
+ "eval_runtime": 42.1268,
60
+ "eval_samples_per_second": 23.738,
61
+ "eval_steps_per_second": 0.76,
62
+ "step": 1200
63
+ },
64
+ {
65
+ "epoch": 0.1636750504664739,
66
+ "grad_norm": 0.05425046756863594,
67
+ "learning_rate": 0.0004927679393904767,
68
+ "loss": 0.0369,
69
+ "step": 1500
70
+ },
71
+ {
72
+ "epoch": 0.1636750504664739,
73
+ "eval_loss": 0.03505582734942436,
74
+ "eval_runtime": 42.1495,
75
+ "eval_samples_per_second": 23.725,
76
+ "eval_steps_per_second": 0.759,
77
+ "step": 1500
78
+ },
79
+ {
80
+ "epoch": 0.19641006055976867,
81
+ "grad_norm": 0.062309328466653824,
82
+ "learning_rate": 0.0004893582502149217,
83
+ "loss": 0.0361,
84
+ "step": 1800
85
+ },
86
+ {
87
+ "epoch": 0.19641006055976867,
88
+ "eval_loss": 0.0326445996761322,
89
+ "eval_runtime": 42.1829,
90
+ "eval_samples_per_second": 23.706,
91
+ "eval_steps_per_second": 0.759,
92
+ "step": 1800
93
+ },
94
+ {
95
+ "epoch": 0.22914507065306344,
96
+ "grad_norm": 0.04928302764892578,
97
+ "learning_rate": 0.0004853088028401219,
98
+ "loss": 0.0351,
99
+ "step": 2100
100
+ },
101
+ {
102
+ "epoch": 0.22914507065306344,
103
+ "eval_loss": 0.03327207267284393,
104
+ "eval_runtime": 42.1394,
105
+ "eval_samples_per_second": 23.731,
106
+ "eval_steps_per_second": 0.759,
107
+ "step": 2100
108
+ },
109
+ {
110
+ "epoch": 0.2618800807463582,
111
+ "grad_norm": 0.0912654921412468,
112
+ "learning_rate": 0.00048063042065385746,
113
+ "loss": 0.0347,
114
+ "step": 2400
115
+ },
116
+ {
117
+ "epoch": 0.2618800807463582,
118
+ "eval_loss": 0.033242832869291306,
119
+ "eval_runtime": 42.1324,
120
+ "eval_samples_per_second": 23.735,
121
+ "eval_steps_per_second": 0.76,
122
+ "step": 2400
123
+ },
124
+ {
125
+ "epoch": 0.294615090839653,
126
+ "grad_norm": 0.14929068088531494,
127
+ "learning_rate": 0.00047533560806473004,
128
+ "loss": 0.031,
129
+ "step": 2700
130
+ },
131
+ {
132
+ "epoch": 0.294615090839653,
133
+ "eval_loss": 0.03134576976299286,
134
+ "eval_runtime": 42.1366,
135
+ "eval_samples_per_second": 23.732,
136
+ "eval_steps_per_second": 0.759,
137
+ "step": 2700
138
+ },
139
+ {
140
+ "epoch": 0.3273501009329478,
141
+ "grad_norm": 0.0723583847284317,
142
+ "learning_rate": 0.0004694385170803028,
143
+ "loss": 0.0322,
144
+ "step": 3000
145
+ },
146
+ {
147
+ "epoch": 0.3273501009329478,
148
+ "eval_loss": 0.03234448283910751,
149
+ "eval_runtime": 42.1402,
150
+ "eval_samples_per_second": 23.73,
151
+ "eval_steps_per_second": 0.759,
152
+ "step": 3000
153
+ },
154
+ {
155
+ "epoch": 0.3600851110262426,
156
+ "grad_norm": 0.09747716784477234,
157
+ "learning_rate": 0.000462954909481528,
158
+ "loss": 0.0358,
159
+ "step": 3300
160
+ },
161
+ {
162
+ "epoch": 0.3600851110262426,
163
+ "eval_loss": 0.02926197461783886,
164
+ "eval_runtime": 42.1598,
165
+ "eval_samples_per_second": 23.719,
166
+ "eval_steps_per_second": 0.759,
167
+ "step": 3300
168
+ },
169
+ {
170
+ "epoch": 0.39282012111953735,
171
+ "grad_norm": 0.06375987827777863,
172
+ "learning_rate": 0.0004559021146945619,
173
+ "loss": 0.0315,
174
+ "step": 3600
175
+ },
176
+ {
177
+ "epoch": 0.39282012111953735,
178
+ "eval_loss": 0.030526984483003616,
179
+ "eval_runtime": 42.1637,
180
+ "eval_samples_per_second": 23.717,
181
+ "eval_steps_per_second": 0.759,
182
+ "step": 3600
183
+ },
184
+ {
185
+ "epoch": 0.4255551312128321,
186
+ "grad_norm": 0.037096019834280014,
187
+ "learning_rate": 0.0004482989834725684,
188
+ "loss": 0.0329,
189
+ "step": 3900
190
+ },
191
+ {
192
+ "epoch": 0.4255551312128321,
193
+ "eval_loss": 0.030683163553476334,
194
+ "eval_runtime": 42.1644,
195
+ "eval_samples_per_second": 23.717,
196
+ "eval_steps_per_second": 0.759,
197
+ "step": 3900
198
+ },
199
+ {
200
+ "epoch": 0.4582901413061269,
201
+ "grad_norm": 0.07343820482492447,
202
+ "learning_rate": 0.0004401658375113103,
203
+ "loss": 0.032,
204
+ "step": 4200
205
+ },
206
+ {
207
+ "epoch": 0.4582901413061269,
208
+ "eval_loss": 0.027738207951188087,
209
+ "eval_runtime": 42.1656,
210
+ "eval_samples_per_second": 23.716,
211
+ "eval_steps_per_second": 0.759,
212
+ "step": 4200
213
+ },
214
+ {
215
+ "epoch": 0.49102515139942166,
216
+ "grad_norm": 0.05190209299325943,
217
+ "learning_rate": 0.0004315244151331965,
218
+ "loss": 0.0282,
219
+ "step": 4500
220
+ },
221
+ {
222
+ "epoch": 0.49102515139942166,
223
+ "eval_loss": 0.02795437164604664,
224
+ "eval_runtime": 42.1643,
225
+ "eval_samples_per_second": 23.717,
226
+ "eval_steps_per_second": 0.759,
227
+ "step": 4500
228
+ },
229
+ {
230
+ "epoch": 0.5237601614927164,
231
+ "grad_norm": 0.061979856342077255,
232
+ "learning_rate": 0.00042239781318496015,
233
+ "loss": 0.031,
234
+ "step": 4800
235
+ },
236
+ {
237
+ "epoch": 0.5237601614927164,
238
+ "eval_loss": 0.028189806267619133,
239
+ "eval_runtime": 42.1643,
240
+ "eval_samples_per_second": 23.717,
241
+ "eval_steps_per_second": 0.759,
242
+ "step": 4800
243
+ },
244
+ {
245
+ "epoch": 0.5564951715860113,
246
+ "grad_norm": 0.03137525916099548,
247
+ "learning_rate": 0.0004128104253042653,
248
+ "loss": 0.0278,
249
+ "step": 5100
250
+ },
251
+ {
252
+ "epoch": 0.5564951715860113,
253
+ "eval_loss": 0.028971396386623383,
254
+ "eval_runtime": 42.1529,
255
+ "eval_samples_per_second": 23.723,
256
+ "eval_steps_per_second": 0.759,
257
+ "step": 5100
258
+ },
259
+ {
260
+ "epoch": 0.589230181679306,
261
+ "grad_norm": 0.05158986896276474,
262
+ "learning_rate": 0.00040278787672024213,
263
+ "loss": 0.0297,
264
+ "step": 5400
265
+ },
266
+ {
267
+ "epoch": 0.589230181679306,
268
+ "eval_loss": 0.027035100385546684,
269
+ "eval_runtime": 42.1571,
270
+ "eval_samples_per_second": 23.721,
271
+ "eval_steps_per_second": 0.759,
272
+ "step": 5400
273
+ },
274
+ {
275
+ "epoch": 0.6219651917726008,
276
+ "grad_norm": 0.05266730859875679,
277
+ "learning_rate": 0.0003923569557622182,
278
+ "loss": 0.0282,
279
+ "step": 5700
280
+ },
281
+ {
282
+ "epoch": 0.6219651917726008,
283
+ "eval_loss": 0.026981640607118607,
284
+ "eval_runtime": 42.1631,
285
+ "eval_samples_per_second": 23.717,
286
+ "eval_steps_per_second": 0.759,
287
+ "step": 5700
288
+ },
289
+ {
290
+ "epoch": 0.6547002018658956,
291
+ "grad_norm": 0.05495725944638252,
292
+ "learning_rate": 0.0003815455422597065,
293
+ "loss": 0.0301,
294
+ "step": 6000
295
+ },
296
+ {
297
+ "epoch": 0.6547002018658956,
298
+ "eval_loss": 0.029135966673493385,
299
+ "eval_runtime": 42.1443,
300
+ "eval_samples_per_second": 23.728,
301
+ "eval_steps_per_second": 0.759,
302
+ "step": 6000
303
+ },
304
+ {
305
+ "epoch": 0.6874352119591903,
306
+ "grad_norm": 0.09908170998096466,
307
+ "learning_rate": 0.00037038253302502694,
308
+ "loss": 0.0287,
309
+ "step": 6300
310
+ },
311
+ {
312
+ "epoch": 0.6874352119591903,
313
+ "eval_loss": 0.027155693620443344,
314
+ "eval_runtime": 42.1673,
315
+ "eval_samples_per_second": 23.715,
316
+ "eval_steps_per_second": 0.759,
317
+ "step": 6300
318
+ },
319
+ {
320
+ "epoch": 0.7201702220524852,
321
+ "grad_norm": 0.07481172680854797,
322
+ "learning_rate": 0.0003588977646177287,
323
+ "loss": 0.028,
324
+ "step": 6600
325
+ },
326
+ {
327
+ "epoch": 0.7201702220524852,
328
+ "eval_loss": 0.02658127434551716,
329
+ "eval_runtime": 42.157,
330
+ "eval_samples_per_second": 23.721,
331
+ "eval_steps_per_second": 0.759,
332
+ "step": 6600
333
+ },
334
+ {
335
+ "epoch": 0.7529052321457799,
336
+ "grad_norm": 0.08731873333454132,
337
+ "learning_rate": 0.000347121933597251,
338
+ "loss": 0.0278,
339
+ "step": 6900
340
+ },
341
+ {
342
+ "epoch": 0.7529052321457799,
343
+ "eval_loss": 0.026023300364613533,
344
+ "eval_runtime": 42.1259,
345
+ "eval_samples_per_second": 23.738,
346
+ "eval_steps_per_second": 0.76,
347
+ "step": 6900
348
+ },
349
+ {
350
+ "epoch": 0.7856402422390747,
351
+ "grad_norm": 0.009131520055234432,
352
+ "learning_rate": 0.0003350865144769708,
353
+ "loss": 0.0293,
354
+ "step": 7200
355
+ },
356
+ {
357
+ "epoch": 0.7856402422390747,
358
+ "eval_loss": 0.02550978772342205,
359
+ "eval_runtime": 42.1546,
360
+ "eval_samples_per_second": 23.722,
361
+ "eval_steps_per_second": 0.759,
362
+ "step": 7200
363
+ },
364
+ {
365
+ "epoch": 0.8183752523323695,
366
+ "grad_norm": 0.03787301480770111,
367
+ "learning_rate": 0.00032282367559893034,
368
+ "loss": 0.0279,
369
+ "step": 7500
370
+ },
371
+ {
372
+ "epoch": 0.8183752523323695,
373
+ "eval_loss": 0.023703226819634438,
374
+ "eval_runtime": 42.1761,
375
+ "eval_samples_per_second": 23.71,
376
+ "eval_steps_per_second": 0.759,
377
+ "step": 7500
378
+ },
379
+ {
380
+ "epoch": 0.8511102624256642,
381
+ "grad_norm": 0.05027143657207489,
382
+ "learning_rate": 0.0003103661931540952,
383
+ "loss": 0.0265,
384
+ "step": 7800
385
+ },
386
+ {
387
+ "epoch": 0.8511102624256642,
388
+ "eval_loss": 0.023319367319345474,
389
+ "eval_runtime": 42.1616,
390
+ "eval_samples_per_second": 23.718,
391
+ "eval_steps_per_second": 0.759,
392
+ "step": 7800
393
+ },
394
+ {
395
+ "epoch": 0.8838452725189591,
396
+ "grad_norm": 0.028078876435756683,
397
+ "learning_rate": 0.0002977473635779503,
398
+ "loss": 0.028,
399
+ "step": 8100
400
+ },
401
+ {
402
+ "epoch": 0.8838452725189591,
403
+ "eval_loss": 0.024727819487452507,
404
+ "eval_runtime": 42.1437,
405
+ "eval_samples_per_second": 23.728,
406
+ "eval_steps_per_second": 0.759,
407
+ "step": 8100
408
+ },
409
+ {
410
+ "epoch": 0.9165802826122538,
411
+ "grad_norm": 0.04552329331636429,
412
+ "learning_rate": 0.0002850009145555825,
413
+ "loss": 0.0271,
414
+ "step": 8400
415
+ },
416
+ {
417
+ "epoch": 0.9165802826122538,
418
+ "eval_loss": 0.023578815162181854,
419
+ "eval_runtime": 42.1276,
420
+ "eval_samples_per_second": 23.737,
421
+ "eval_steps_per_second": 0.76,
422
+ "step": 8400
423
+ },
424
+ {
425
+ "epoch": 0.9493152927055486,
426
+ "grad_norm": 0.005148568656295538,
427
+ "learning_rate": 0.00027216091487411724,
428
+ "loss": 0.0265,
429
+ "step": 8700
430
+ },
431
+ {
432
+ "epoch": 0.9493152927055486,
433
+ "eval_loss": 0.023434242233633995,
434
+ "eval_runtime": 42.1453,
435
+ "eval_samples_per_second": 23.727,
436
+ "eval_steps_per_second": 0.759,
437
+ "step": 8700
438
+ },
439
+ {
440
+ "epoch": 0.9820503027988433,
441
+ "grad_norm": 0.05831698328256607,
442
+ "learning_rate": 0.0002592616833634556,
443
+ "loss": 0.0242,
444
+ "step": 9000
445
+ },
446
+ {
447
+ "epoch": 0.9820503027988433,
448
+ "eval_loss": 0.022743234410881996,
449
+ "eval_runtime": 42.1372,
450
+ "eval_samples_per_second": 23.732,
451
+ "eval_steps_per_second": 0.759,
452
+ "step": 9000
453
+ },
454
+ {
455
+ "epoch": 1.0147307545419826,
456
+ "grad_norm": 0.015023964457213879,
457
+ "learning_rate": 0.00024633769716869496,
458
+ "loss": 0.0197,
459
+ "step": 9300
460
+ },
461
+ {
462
+ "epoch": 1.0147307545419826,
463
+ "eval_loss": 0.02679787389934063,
464
+ "eval_runtime": 42.1216,
465
+ "eval_samples_per_second": 23.741,
466
+ "eval_steps_per_second": 0.76,
467
+ "step": 9300
468
+ },
469
+ {
470
+ "epoch": 1.0474657646352774,
471
+ "grad_norm": 0.07885076105594635,
472
+ "learning_rate": 0.00023342349959940397,
473
+ "loss": 0.0115,
474
+ "step": 9600
475
+ },
476
+ {
477
+ "epoch": 1.0474657646352774,
478
+ "eval_loss": 0.024874603375792503,
479
+ "eval_runtime": 42.1497,
480
+ "eval_samples_per_second": 23.725,
481
+ "eval_steps_per_second": 0.759,
482
+ "step": 9600
483
+ },
484
+ {
485
+ "epoch": 1.0802007747285722,
486
+ "grad_norm": 0.12350352108478546,
487
+ "learning_rate": 0.00022055360780205344,
488
+ "loss": 0.0105,
489
+ "step": 9900
490
+ },
491
+ {
492
+ "epoch": 1.0802007747285722,
493
+ "eval_loss": 0.027364730834960938,
494
+ "eval_runtime": 42.1561,
495
+ "eval_samples_per_second": 23.721,
496
+ "eval_steps_per_second": 0.759,
497
+ "step": 9900
498
+ },
499
+ {
500
+ "epoch": 1.112935784821867,
501
+ "grad_norm": 0.18318770825862885,
502
+ "learning_rate": 0.0002077624205023748,
503
+ "loss": 0.0117,
504
+ "step": 10200
505
+ },
506
+ {
507
+ "epoch": 1.112935784821867,
508
+ "eval_loss": 0.0259011909365654,
509
+ "eval_runtime": 42.1588,
510
+ "eval_samples_per_second": 23.72,
511
+ "eval_steps_per_second": 0.759,
512
+ "step": 10200
513
+ },
514
+ {
515
+ "epoch": 1.1456707949151617,
516
+ "grad_norm": 0.10705136507749557,
517
+ "learning_rate": 0.00019508412606423562,
518
+ "loss": 0.0117,
519
+ "step": 10500
520
+ },
521
+ {
522
+ "epoch": 1.1456707949151617,
523
+ "eval_loss": 0.024886077269911766,
524
+ "eval_runtime": 42.1435,
525
+ "eval_samples_per_second": 23.728,
526
+ "eval_steps_per_second": 0.759,
527
+ "step": 10500
528
+ },
529
+ {
530
+ "epoch": 1.1784058050084565,
531
+ "grad_norm": 0.01593521237373352,
532
+ "learning_rate": 0.00018255261111077092,
533
+ "loss": 0.0119,
534
+ "step": 10800
535
+ },
536
+ {
537
+ "epoch": 1.1784058050084565,
538
+ "eval_loss": 0.02575630508363247,
539
+ "eval_runtime": 42.1633,
540
+ "eval_samples_per_second": 23.717,
541
+ "eval_steps_per_second": 0.759,
542
+ "step": 10800
543
+ },
544
+ {
545
+ "epoch": 1.2111408151017513,
546
+ "grad_norm": 0.06906082481145859,
547
+ "learning_rate": 0.0001702013699520084,
548
+ "loss": 0.0103,
549
+ "step": 11100
550
+ },
551
+ {
552
+ "epoch": 1.2111408151017513,
553
+ "eval_loss": 0.024535585194826126,
554
+ "eval_runtime": 42.1608,
555
+ "eval_samples_per_second": 23.719,
556
+ "eval_steps_per_second": 0.759,
557
+ "step": 11100
558
+ },
559
+ {
560
+ "epoch": 1.2438758251950461,
561
+ "grad_norm": 0.0019282581051811576,
562
+ "learning_rate": 0.00015806341506107184,
563
+ "loss": 0.0105,
564
+ "step": 11400
565
+ },
566
+ {
567
+ "epoch": 1.2438758251950461,
568
+ "eval_loss": 0.023921987041831017,
569
+ "eval_runtime": 42.1654,
570
+ "eval_samples_per_second": 23.716,
571
+ "eval_steps_per_second": 0.759,
572
+ "step": 11400
573
+ },
574
+ {
575
+ "epoch": 1.276610835288341,
576
+ "grad_norm": 0.02748137153685093,
577
+ "learning_rate": 0.00014617118883824025,
578
+ "loss": 0.0108,
579
+ "step": 11700
580
+ },
581
+ {
582
+ "epoch": 1.276610835288341,
583
+ "eval_loss": 0.026060184463858604,
584
+ "eval_runtime": 42.1556,
585
+ "eval_samples_per_second": 23.722,
586
+ "eval_steps_per_second": 0.759,
587
+ "step": 11700
588
+ },
589
+ {
590
+ "epoch": 1.3093458453816358,
591
+ "grad_norm": 0.08188096433877945,
592
+ "learning_rate": 0.00013455647689870125,
593
+ "loss": 0.0121,
594
+ "step": 12000
595
+ },
596
+ {
597
+ "epoch": 1.3093458453816358,
598
+ "eval_loss": 0.025060491636395454,
599
+ "eval_runtime": 42.1366,
600
+ "eval_samples_per_second": 23.732,
601
+ "eval_steps_per_second": 0.759,
602
+ "step": 12000
603
+ },
604
+ {
605
+ "epoch": 1.3420808554749304,
606
+ "grad_norm": 0.056115902960300446,
607
+ "learning_rate": 0.00012325032311576406,
608
+ "loss": 0.0106,
609
+ "step": 12300
610
+ },
611
+ {
612
+ "epoch": 1.3420808554749304,
613
+ "eval_loss": 0.025574443861842155,
614
+ "eval_runtime": 42.1621,
615
+ "eval_samples_per_second": 23.718,
616
+ "eval_steps_per_second": 0.759,
617
+ "step": 12300
618
+ },
619
+ {
620
+ "epoch": 1.3748158655682252,
621
+ "grad_norm": 0.012125268578529358,
622
+ "learning_rate": 0.00011228294664660443,
623
+ "loss": 0.0099,
624
+ "step": 12600
625
+ },
626
+ {
627
+ "epoch": 1.3748158655682252,
628
+ "eval_loss": 0.02508034184575081,
629
+ "eval_runtime": 42.1596,
630
+ "eval_samples_per_second": 23.719,
631
+ "eval_steps_per_second": 0.759,
632
+ "step": 12600
633
+ },
634
+ {
635
+ "epoch": 1.40755087566152,
636
+ "grad_norm": 0.0046004848554730415,
637
+ "learning_rate": 0.00010168366116231597,
638
+ "loss": 0.0098,
639
+ "step": 12900
640
+ },
641
+ {
642
+ "epoch": 1.40755087566152,
643
+ "eval_loss": 0.026252219453454018,
644
+ "eval_runtime": 42.1483,
645
+ "eval_samples_per_second": 23.726,
646
+ "eval_steps_per_second": 0.759,
647
+ "step": 12900
648
+ },
649
+ {
650
+ "epoch": 1.4402858857548146,
651
+ "grad_norm": 0.07443796843290329,
652
+ "learning_rate": 9.148079649815047e-05,
653
+ "loss": 0.009,
654
+ "step": 13200
655
+ },
656
+ {
657
+ "epoch": 1.4402858857548146,
658
+ "eval_loss": 0.026938071474432945,
659
+ "eval_runtime": 42.1292,
660
+ "eval_samples_per_second": 23.736,
661
+ "eval_steps_per_second": 0.76,
662
+ "step": 13200
663
+ },
664
+ {
665
+ "epoch": 1.4730208958481095,
666
+ "grad_norm": 0.041219256818294525,
667
+ "learning_rate": 8.170162293335995e-05,
668
+ "loss": 0.0095,
669
+ "step": 13500
670
+ },
671
+ {
672
+ "epoch": 1.4730208958481095,
673
+ "eval_loss": 0.025707505643367767,
674
+ "eval_runtime": 42.1816,
675
+ "eval_samples_per_second": 23.707,
676
+ "eval_steps_per_second": 0.759,
677
+ "step": 13500
678
+ },
679
+ {
680
+ "epoch": 1.5057559059414043,
681
+ "grad_norm": 0.11749689280986786,
682
+ "learning_rate": 7.237227830302764e-05,
683
+ "loss": 0.0105,
684
+ "step": 13800
685
+ },
686
+ {
687
+ "epoch": 1.5057559059414043,
688
+ "eval_loss": 0.025386406108736992,
689
+ "eval_runtime": 42.1771,
690
+ "eval_samples_per_second": 23.71,
691
+ "eval_steps_per_second": 0.759,
692
+ "step": 13800
693
+ },
694
+ {
695
+ "epoch": 1.5384909160346991,
696
+ "grad_norm": 0.06977250427007675,
697
+ "learning_rate": 6.35176981367018e-05,
698
+ "loss": 0.01,
699
+ "step": 14100
700
+ },
701
+ {
702
+ "epoch": 1.5384909160346991,
703
+ "eval_loss": 0.02462012693285942,
704
+ "eval_runtime": 42.1422,
705
+ "eval_samples_per_second": 23.729,
706
+ "eval_steps_per_second": 0.759,
707
+ "step": 14100
708
+ },
709
+ {
710
+ "epoch": 1.571225926127994,
711
+ "grad_norm": 0.022144939750432968,
712
+ "learning_rate": 5.516154901055917e-05,
713
+ "loss": 0.0104,
714
+ "step": 14400
715
+ },
716
+ {
717
+ "epoch": 1.571225926127994,
718
+ "eval_loss": 0.0242360457777977,
719
+ "eval_runtime": 42.1513,
720
+ "eval_samples_per_second": 23.724,
721
+ "eval_steps_per_second": 0.759,
722
+ "step": 14400
723
+ },
724
+ {
725
+ "epoch": 1.6039609362212888,
726
+ "grad_norm": 0.12135706096887589,
727
+ "learning_rate": 4.732616529123546e-05,
728
+ "loss": 0.0097,
729
+ "step": 14700
730
+ },
731
+ {
732
+ "epoch": 1.6039609362212888,
733
+ "eval_loss": 0.024823835119605064,
734
+ "eval_runtime": 42.1507,
735
+ "eval_samples_per_second": 23.724,
736
+ "eval_steps_per_second": 0.759,
737
+ "step": 14700
738
+ },
739
+ {
740
+ "epoch": 1.6366959463145836,
741
+ "grad_norm": 0.011291204020380974,
742
+ "learning_rate": 4.00324894403929e-05,
743
+ "loss": 0.0099,
744
+ "step": 15000
745
+ },
746
+ {
747
+ "epoch": 1.6366959463145836,
748
+ "eval_loss": 0.025008805096149445,
749
+ "eval_runtime": 42.1592,
750
+ "eval_samples_per_second": 23.72,
751
+ "eval_steps_per_second": 0.759,
752
+ "step": 15000
753
+ },
754
+ {
755
+ "epoch": 1.6694309564078782,
756
+ "grad_norm": 0.04522231966257095,
757
+ "learning_rate": 3.330001603958035e-05,
758
+ "loss": 0.0103,
759
+ "step": 15300
760
+ },
761
+ {
762
+ "epoch": 1.6694309564078782,
763
+ "eval_loss": 0.024476325139403343,
764
+ "eval_runtime": 42.162,
765
+ "eval_samples_per_second": 23.718,
766
+ "eval_steps_per_second": 0.759,
767
+ "step": 15300
768
+ },
769
+ {
770
+ "epoch": 1.702165966501173,
771
+ "grad_norm": 0.0071013192646205425,
772
+ "learning_rate": 2.7146739684996204e-05,
773
+ "loss": 0.0099,
774
+ "step": 15600
775
+ },
776
+ {
777
+ "epoch": 1.702165966501173,
778
+ "eval_loss": 0.02445380762219429,
779
+ "eval_runtime": 42.1418,
780
+ "eval_samples_per_second": 23.729,
781
+ "eval_steps_per_second": 0.759,
782
+ "step": 15600
783
+ },
784
+ {
785
+ "epoch": 1.7349009765944676,
786
+ "grad_norm": 0.03472641855478287,
787
+ "learning_rate": 2.1589106891421295e-05,
788
+ "loss": 0.0104,
789
+ "step": 15900
790
+ },
791
+ {
792
+ "epoch": 1.7349009765944676,
793
+ "eval_loss": 0.024665959179401398,
794
+ "eval_runtime": 42.1504,
795
+ "eval_samples_per_second": 23.725,
796
+ "eval_steps_per_second": 0.759,
797
+ "step": 15900
798
+ },
799
+ {
800
+ "epoch": 1.7676359866877625,
801
+ "grad_norm": 0.010884981602430344,
802
+ "learning_rate": 1.664197213387314e-05,
803
+ "loss": 0.0098,
804
+ "step": 16200
805
+ },
806
+ {
807
+ "epoch": 1.7676359866877625,
808
+ "eval_loss": 0.02443353645503521,
809
+ "eval_runtime": 42.1323,
810
+ "eval_samples_per_second": 23.735,
811
+ "eval_steps_per_second": 0.76,
812
+ "step": 16200
813
+ },
814
+ {
815
+ "epoch": 1.8003709967810573,
816
+ "grad_norm": 0.09009554237127304,
817
+ "learning_rate": 1.2318558144474301e-05,
818
+ "loss": 0.0099,
819
+ "step": 16500
820
+ },
821
+ {
822
+ "epoch": 1.8003709967810573,
823
+ "eval_loss": 0.024443812668323517,
824
+ "eval_runtime": 42.1454,
825
+ "eval_samples_per_second": 23.727,
826
+ "eval_steps_per_second": 0.759,
827
+ "step": 16500
828
+ },
829
+ {
830
+ "epoch": 1.833106006874352,
831
+ "grad_norm": 0.0012289569713175297,
832
+ "learning_rate": 8.630420570652776e-06,
833
+ "loss": 0.009,
834
+ "step": 16800
835
+ },
836
+ {
837
+ "epoch": 1.833106006874352,
838
+ "eval_loss": 0.02449021488428116,
839
+ "eval_runtime": 42.1851,
840
+ "eval_samples_per_second": 23.705,
841
+ "eval_steps_per_second": 0.759,
842
+ "step": 16800
843
+ },
844
+ {
845
+ "epoch": 1.865841016967647,
846
+ "grad_norm": 0.007200389634817839,
847
+ "learning_rate": 5.587417089136487e-06,
848
+ "loss": 0.0107,
849
+ "step": 17100
850
+ },
851
+ {
852
+ "epoch": 1.865841016967647,
853
+ "eval_loss": 0.024273129180073738,
854
+ "eval_runtime": 42.1539,
855
+ "eval_samples_per_second": 23.723,
856
+ "eval_steps_per_second": 0.759,
857
+ "step": 17100
858
+ },
859
+ {
860
+ "epoch": 1.8985760270609418,
861
+ "grad_norm": 0.045455873012542725,
862
+ "learning_rate": 3.197681058294655e-06,
863
+ "loss": 0.0096,
864
+ "step": 17400
865
+ },
866
+ {
867
+ "epoch": 1.8985760270609418,
868
+ "eval_loss": 0.024419987574219704,
869
+ "eval_runtime": 42.1842,
870
+ "eval_samples_per_second": 23.706,
871
+ "eval_steps_per_second": 0.759,
872
+ "step": 17400
873
+ },
874
+ {
875
+ "epoch": 1.9313110371542366,
876
+ "grad_norm": 0.12856726348400116,
877
+ "learning_rate": 1.4675997792476526e-06,
878
+ "loss": 0.0092,
879
+ "step": 17700
880
+ },
881
+ {
882
+ "epoch": 1.9313110371542366,
883
+ "eval_loss": 0.024318819865584373,
884
+ "eval_runtime": 42.1661,
885
+ "eval_samples_per_second": 23.716,
886
+ "eval_steps_per_second": 0.759,
887
+ "step": 17700
888
+ },
889
+ {
890
+ "epoch": 1.9640460472475312,
891
+ "grad_norm": 0.045294079929590225,
892
+ "learning_rate": 4.017974238492983e-07,
893
+ "loss": 0.0082,
894
+ "step": 18000
895
+ },
896
+ {
897
+ "epoch": 1.9640460472475312,
898
+ "eval_loss": 0.024305343627929688,
899
+ "eval_runtime": 42.1745,
900
+ "eval_samples_per_second": 23.711,
901
+ "eval_steps_per_second": 0.759,
902
+ "step": 18000
903
+ },
904
+ {
905
+ "epoch": 1.996781057340826,
906
+ "grad_norm": 0.025693634524941444,
907
+ "learning_rate": 3.122675172106959e-09,
908
+ "loss": 0.0091,
909
+ "step": 18300
910
+ },
911
+ {
912
+ "epoch": 1.996781057340826,
913
+ "eval_loss": 0.02438713051378727,
914
+ "eval_runtime": 42.1308,
915
+ "eval_samples_per_second": 23.736,
916
+ "eval_steps_per_second": 0.76,
917
+ "step": 18300
918
+ },
919
+ {
920
+ "epoch": 2.0,
921
+ "step": 18330,
922
+ "total_flos": 1.1637418990239744e+19,
923
+ "train_loss": 0.02701456347613535,
924
+ "train_runtime": 25881.389,
925
+ "train_samples_per_second": 11.331,
926
+ "train_steps_per_second": 0.708
927
+ }
928
+ ],
929
+ "logging_steps": 300,
930
+ "max_steps": 18330,
931
+ "num_input_tokens_seen": 0,
932
+ "num_train_epochs": 2,
933
+ "save_steps": 300,
934
+ "stateful_callbacks": {
935
+ "TrainerControl": {
936
+ "args": {
937
+ "should_epoch_stop": false,
938
+ "should_evaluate": false,
939
+ "should_log": false,
940
+ "should_save": true,
941
+ "should_training_stop": true
942
+ },
943
+ "attributes": {}
944
+ }
945
+ },
946
+ "total_flos": 1.1637418990239744e+19,
947
+ "train_batch_size": 8,
948
+ "trial_name": null,
949
+ "trial_params": null
950
+ }
Llama13B/CMS/t60108d07h46m12,ep=2.0,mlr5.0e-04,b8,nb8,8,cL8,rR8,s1,initdef,dr0.0,size146627,5/ft/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "dfurman/llama-13b",
3
+ "bias": "none",
4
+ "col_L": 8,
5
+ "drop_out": 0.0,
6
+ "inference_mode": false,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_unique_blocks_L": 8,
10
+ "num_unique_blocks_R": 8,
11
+ "peft_type": "SAMA",
12
+ "revision": null,
13
+ "row_R": 8,
14
+ "scaling": 1,
15
+ "share_factor_L": 1,
16
+ "share_factor_R": 1,
17
+ "target_modules": [
18
+ "down_proj",
19
+ "up_proj",
20
+ "k_proj",
21
+ "v_proj",
22
+ "q_proj"
23
+ ],
24
+ "target_modules_to_skip": null,
25
+ "task_type": "CAUSAL_LM"
26
+ }