diff --git a/.gitattributes b/.gitattributes
index c7d9f3332a950355d5a77d85000f05e6f45435ea..ea2f4f5b4497ef8c1c1132788c21f7ee281ebae2 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+models/arcface/1/model.plan filter=lfs diff=lfs merge=lfs -text
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..13566b81b018ad684f3a35fee301741b2734c8f4
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000000000000000000000000000000000000..e3d20b5ec360b50ac7890f70956c97ab72da1604
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/tritontest.iml b/.idea/tritontest.iml
new file mode 100644
index 0000000000000000000000000000000000000000..09e42a2d0c587ea76d49d0f7a719831283b63d36
--- /dev/null
+++ b/.idea/tritontest.iml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1cdab3d9f0807845570a07f83db18c6ed5639c81
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,105 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {
+ "keyToString": {
+ "RunOnceActivity.OpenProjectViewOnStart": "true",
+ "RunOnceActivity.ShowReadmeOnStart": "true",
+ "RunOnceActivity.cidr.known.project.marker": "true",
+ "WebServerToolWindowFactoryState": "false",
+ "cf.first.check.clang-format": "false",
+ "cidr.known.project.marker": "true",
+ "node.js.detected.package.eslint": "true",
+ "node.js.detected.package.tslint": "true",
+ "node.js.selected.package.eslint": "(autodetect)",
+ "node.js.selected.package.tslint": "(autodetect)",
+ "settings.editor.selected.configurable": "com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable",
+ "vue.rearranger.settings.migration": "true"
+ }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1684831057719
+
+
+ 1684831057719
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..33ca1af6410f229d8b5fdd0311604e14a25141d5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,21 @@
+
+---
+license: creativeml-openrail-m
+base_model: runwayml/stable-diffusion-v1-5
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- text-to-image
+- diffusers
+- lora
+inference: true
+---
+
+# LoRA text2image fine-tuning - robinqu/tritontest
+These are LoRA adaption weights for runwayml/stable-diffusion-v1-5. The weights were fine-tuned on the lambdalabs/pokemon-blip-captions dataset. You can find some example images in the following.
+
+
+
+
+
+
diff --git a/arcfaceresnet100-8.tar.gz b/arcfaceresnet100-8.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..3f962a9529c60067989192e9dd348e3a12607a17
--- /dev/null
+++ b/arcfaceresnet100-8.tar.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7098d004468725906a805e50d772eabc95e31f57eedeaaa72aeab8225598c2a
+size 237272167
diff --git a/checkpoint-1000/optimizer.bin b/checkpoint-1000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..91faf4b6ddcfcd3de7024bf6de56c397dd0e2b10
--- /dev/null
+++ b/checkpoint-1000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:374ad88b96b6d190cc87816f1c49c49d3704476bc80e654f7658039ffa86f6e8
+size 6591685
diff --git a/checkpoint-1000/pytorch_model.bin b/checkpoint-1000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bf65038f50bd8254dc2f85336a37c8ab868276b8
--- /dev/null
+++ b/checkpoint-1000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44a79f001591910f74218e3584698ccb3296b99c9ff4083bbe21fc21a67c451a
+size 3285965
diff --git a/checkpoint-1000/random_states_0.pkl b/checkpoint-1000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..f55d2a789f230a6fd80af46d1884d2f0947525cc
--- /dev/null
+++ b/checkpoint-1000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2162531264bd44960bc53990d6feb1bf84250d9146317efa43523be9f4ce0b5
+size 14727
diff --git a/checkpoint-1000/scaler.pt b/checkpoint-1000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..efdbd3c795f6b0d4144e68355e99c220ccdedd09
--- /dev/null
+++ b/checkpoint-1000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68cff80b680ddf6e7abbef98b5f336b97f9b5963e2209307f639383870e8cc71
+size 557
diff --git a/checkpoint-1000/scheduler.bin b/checkpoint-1000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7cb5cb09e488b452f9ba391beb31da9ac10ade97
--- /dev/null
+++ b/checkpoint-1000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56c8ab868b79ee875e28aa9f8eecb3bbd70932ebeb49f21c03fceaee8a598bde
+size 563
diff --git a/checkpoint-10000/optimizer.bin b/checkpoint-10000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9c5826c7c17acac5bfc6bcaf0502cdeefe4f6b6f
--- /dev/null
+++ b/checkpoint-10000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f5b788156ab7da9e87b34f25b03313ef0654748f3d66a1872568b88c7e60c45
+size 6591685
diff --git a/checkpoint-10000/pytorch_model.bin b/checkpoint-10000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b4269e12ac66c0e3c3dbdade5438a96b3b68c884
--- /dev/null
+++ b/checkpoint-10000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b88babd698dc006f996ae7a30fc00d40a3a7fc5830027890bc3c1392fcc91b1
+size 3285965
diff --git a/checkpoint-10000/random_states_0.pkl b/checkpoint-10000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..fc41ffcc2b7132611f3ede6aba237571dbd7b8f5
--- /dev/null
+++ b/checkpoint-10000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a11b4f67fcfa4d5e99a54d315df90a5c0cd89dcb636c939ab20833dfd8129fc
+size 14727
diff --git a/checkpoint-10000/scaler.pt b/checkpoint-10000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b345659e084dfa08f03a221d79b2e302a4748dfe
--- /dev/null
+++ b/checkpoint-10000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9be2ad76dcbc923b00e6a142f6db62aad4a46c47bb83864ccb68ddc899d0ce78
+size 557
diff --git a/checkpoint-10000/scheduler.bin b/checkpoint-10000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d6221315ce924ad43cfb3894c1b0a349a019b7cf
--- /dev/null
+++ b/checkpoint-10000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40f5b59952b93b5c9f86194e5c0bfd624c4872a988bdc6c4b295dfe65a42d331
+size 563
diff --git a/checkpoint-10500/optimizer.bin b/checkpoint-10500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6291331fe665bdcba5847052577487aae91b76ee
--- /dev/null
+++ b/checkpoint-10500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:972a5f8515e2cc01339034a4c394c286ac031427b10580664f3759a98075cc6e
+size 6591685
diff --git a/checkpoint-10500/pytorch_model.bin b/checkpoint-10500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c466dc158e361757f274be69fb8cb260296a84df
--- /dev/null
+++ b/checkpoint-10500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5cea7a63e16154456933067dc4ff918f99d8a0be43c33b34cc2548ec39cd324
+size 3285965
diff --git a/checkpoint-10500/random_states_0.pkl b/checkpoint-10500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..1092978b86324ed6b925a253041ba3565d4bd585
--- /dev/null
+++ b/checkpoint-10500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:661b9419c3feaaadec76e7d8e7dfe73dcd4e5fda240ec476368318301a0bb764
+size 14727
diff --git a/checkpoint-10500/scaler.pt b/checkpoint-10500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e7d37ab112d8b108640f4adea9efc18ff4f85228
--- /dev/null
+++ b/checkpoint-10500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:930aa4121a2a38f44b8af20e66d414b77fba3198866a0d743e0016592e1c420e
+size 557
diff --git a/checkpoint-10500/scheduler.bin b/checkpoint-10500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7a3a25f019c14b102d9a138e344f5f61a1b40b9b
--- /dev/null
+++ b/checkpoint-10500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e9c482d6be826b8a5743dfe9586fe6de931f2d320e57498ab69a56badec9385
+size 563
diff --git a/checkpoint-11000/optimizer.bin b/checkpoint-11000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a7d5b709e1e74fe9c0d7efcf01f20a040bab4053
--- /dev/null
+++ b/checkpoint-11000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44e68a9eb4eb21bb4bcc27d0d34f77d954f4705e8bb16a1fa721b5b126559714
+size 6591685
diff --git a/checkpoint-11000/pytorch_model.bin b/checkpoint-11000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ffa9a894a4c8ea31cb73fa9d78973c2326519412
--- /dev/null
+++ b/checkpoint-11000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9412feafa3588f4354a8fe66b7955e5ec3d7cc96a4bb0ef1f16c1c8d0a3aaeac
+size 3285965
diff --git a/checkpoint-11000/random_states_0.pkl b/checkpoint-11000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..5ffb4cdaf81b00ffaf4299b39f7b2ff3cf9d3d34
--- /dev/null
+++ b/checkpoint-11000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d33e82eb9366173eddbdfc330eca83a29abfe26295afabedd426024ed112094
+size 14727
diff --git a/checkpoint-11000/scaler.pt b/checkpoint-11000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e3211d730f33fb63ccce37b3f6b355dd8bdf64f3
--- /dev/null
+++ b/checkpoint-11000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8df4dd185a129710516223a132f1321dd530e937b37d9a1dca8d2915b9d5a04a
+size 557
diff --git a/checkpoint-11000/scheduler.bin b/checkpoint-11000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0abc4d8ef1a06332e5cbb93f691080dc97d62e34
--- /dev/null
+++ b/checkpoint-11000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:917788648aa95ebcaa93c782d2062b8ee21ca0f40dbcb7113159604a6d76e395
+size 563
diff --git a/checkpoint-11500/optimizer.bin b/checkpoint-11500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8dd4117155c27c0844e11d0fbeb3e5939c712a3e
--- /dev/null
+++ b/checkpoint-11500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ddd4f408868dbfa9d87f524ae61ec170467c0af043ff2f1b1464f33a84340a24
+size 6591685
diff --git a/checkpoint-11500/pytorch_model.bin b/checkpoint-11500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b3c43171b930d2a33087788a5105182ff9f1d54d
--- /dev/null
+++ b/checkpoint-11500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fe818f49372311e5b8eee1440fcd1d646b4de6ddb76381cbea5690c0c77d2c4
+size 3285965
diff --git a/checkpoint-11500/random_states_0.pkl b/checkpoint-11500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..e5a927f8fada7c5a565f09527d6fa5adb49c11fc
--- /dev/null
+++ b/checkpoint-11500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:612c34b28851eab0ddd69dae95b6854cadf1a3a84bf4f363cc4a1c7bbc26736a
+size 14727
diff --git a/checkpoint-11500/scaler.pt b/checkpoint-11500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0842e75026d521957a142549ee93aea39c4e6285
--- /dev/null
+++ b/checkpoint-11500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cab7988ce4c8b69e7970aada0017afc1c2120661c865bce9234915457ce96c2
+size 557
diff --git a/checkpoint-11500/scheduler.bin b/checkpoint-11500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3e5a4a8a0f6344958a42f68f07cb59a8a24df746
--- /dev/null
+++ b/checkpoint-11500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb04ff053139f6755b84641d768e4712c82e4079d78c1a32e74bdf3e3935bbe1
+size 563
diff --git a/checkpoint-12000/optimizer.bin b/checkpoint-12000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3f9e439fccf217798c9fa83d65d4901816a6e67d
--- /dev/null
+++ b/checkpoint-12000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36f5711b2f01e8775aa5aaa70376eb2b438832eb7db552987d6c3c4779ba2af2
+size 6591685
diff --git a/checkpoint-12000/pytorch_model.bin b/checkpoint-12000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..64340d5f4c6fd769c6687d8e06407e028294c5a3
--- /dev/null
+++ b/checkpoint-12000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9d48aa10639768073b66bfd7b7e13bef570dd1c0376e778c0c71e3428d463d0
+size 3285965
diff --git a/checkpoint-12000/random_states_0.pkl b/checkpoint-12000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..4d7e0130a55389205a0bc249888b9d262b4e898f
--- /dev/null
+++ b/checkpoint-12000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f8d370aff3279edbf54596c44f92ae762032e8df820f0ab54157e0c63285094
+size 14727
diff --git a/checkpoint-12000/scaler.pt b/checkpoint-12000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ae5a342bd933b8ae0fc4bbe4175a5005e1f40290
--- /dev/null
+++ b/checkpoint-12000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a61fa205a26ba2985bf4e0e515406104e0b0e5d6ebd86749ac9477a519632ca
+size 557
diff --git a/checkpoint-12000/scheduler.bin b/checkpoint-12000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..968670b789768af0b834dca27bb731af21d95ff1
--- /dev/null
+++ b/checkpoint-12000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9888c6d9b53ecdc688042b67345e4bdc6ccc884e3fcb090e3104603cb9f8ccb
+size 563
diff --git a/checkpoint-12500/optimizer.bin b/checkpoint-12500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dd0d37932152673a59ecd2e4c07b9425299d92c2
--- /dev/null
+++ b/checkpoint-12500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a06a0b0accdcf3c13f2eccfacff168082b71c87775e8d5c51a21250c0865f9dc
+size 6591685
diff --git a/checkpoint-12500/pytorch_model.bin b/checkpoint-12500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7022504d469d1fa994c6d3996910c3a5fbe4785b
--- /dev/null
+++ b/checkpoint-12500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e2344c3c87339d6622fb2ae357bf460a3d51e5c7c3aa9b324f43f97f53b03b6
+size 3285965
diff --git a/checkpoint-12500/random_states_0.pkl b/checkpoint-12500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..a87e1734fa3404f1fca31834b3a57710c13083bb
--- /dev/null
+++ b/checkpoint-12500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9663c8bfbb3bc321ef159be6a6884659e5f43b7d632bc91e53ae81830c1d7147
+size 14727
diff --git a/checkpoint-12500/scaler.pt b/checkpoint-12500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..75a8d18ba471fd2bbe0654923f5d6610b4d39611
--- /dev/null
+++ b/checkpoint-12500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:319423f2c4aecfec79c052c003d3d35084df2fe66179617b317d2161d653a69e
+size 557
diff --git a/checkpoint-12500/scheduler.bin b/checkpoint-12500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..45539fa56121bd2d01e26c4ac23a0078205ee2b0
--- /dev/null
+++ b/checkpoint-12500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e470eb5e246ff54625cbbd42ddbfa157a14a2934e447c719915c5f072eef55d0
+size 563
diff --git a/checkpoint-13000/optimizer.bin b/checkpoint-13000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8d81b31a93af2b1f18959601a0b84baa6f9f5f43
--- /dev/null
+++ b/checkpoint-13000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd169d88d49299deb8e0096e5cd756b08af6a6d0a53537453043ea7dd1908d78
+size 6591685
diff --git a/checkpoint-13000/pytorch_model.bin b/checkpoint-13000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7480a683d58d7052ba26e469630867f1bb2408c0
--- /dev/null
+++ b/checkpoint-13000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b00d36c50b6f7bcb6c6798c9d766fda3eb1d6f96e620d9d26e0074d58927068
+size 3285965
diff --git a/checkpoint-13000/random_states_0.pkl b/checkpoint-13000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..83a14178c32a0ddfc84c1b027f4a7245b256fd2b
--- /dev/null
+++ b/checkpoint-13000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6aa2f55a9dc3e9e411b19d93118035f1f0986e96c0b667794eee1675cc47c2ed
+size 14727
diff --git a/checkpoint-13000/scaler.pt b/checkpoint-13000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d9fd2abdb80e7d4b3792b5db5e418f9055af09b4
--- /dev/null
+++ b/checkpoint-13000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8624aa8828b4c5fcb5bb4f05a16d3dd64b35da242cd642c1a05cc5a8cfcd893
+size 557
diff --git a/checkpoint-13000/scheduler.bin b/checkpoint-13000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9c6d5c49ea9108c558447d9b23c744b2980fb70e
--- /dev/null
+++ b/checkpoint-13000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:556cbdac88355889a59cd7821317b13437a9a6237582edf8e0fc0745c3ee8b98
+size 563
diff --git a/checkpoint-13500/optimizer.bin b/checkpoint-13500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7d0280fc20fde9c9c4906642a89bdcfbbca69af2
--- /dev/null
+++ b/checkpoint-13500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a47d0756fbed8433998aa3901b10cf8cefedcc8488e6a7245d66410ab741edf7
+size 6591685
diff --git a/checkpoint-13500/pytorch_model.bin b/checkpoint-13500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..858ddc2b780bfd5f5ceebe52561c62e6f013d09d
--- /dev/null
+++ b/checkpoint-13500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09c51f1ba2ec3446a77cd39a65812c4d4282921b0150e97b55f621cd47409dd4
+size 3285965
diff --git a/checkpoint-13500/random_states_0.pkl b/checkpoint-13500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..61933a6d084b84f91b56a91ebde63b7994b399ae
--- /dev/null
+++ b/checkpoint-13500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be5974db20b4a92980d15255e8d33c2272cc18fbaef82c5c7affe51bf96ae1d2
+size 14727
diff --git a/checkpoint-13500/scaler.pt b/checkpoint-13500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a28ce0fe8912b497f6403c3f2d267c054988e84d
--- /dev/null
+++ b/checkpoint-13500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d8cb0e8886eabbd6664ff72d29e1aea63b8d20878221e816e6ca906ea79a2d6
+size 557
diff --git a/checkpoint-13500/scheduler.bin b/checkpoint-13500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a9e88e783fa0dca0642f26d4f60fa4a4aa4f972d
--- /dev/null
+++ b/checkpoint-13500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1353cd9c22a7eafc9eb6c8db17931428329e4f00f5e5085ff5bc8957a67ab7b
+size 563
diff --git a/checkpoint-14000/optimizer.bin b/checkpoint-14000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..35967823806e010d4e91ddcf0987a97bef2d631b
--- /dev/null
+++ b/checkpoint-14000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5dabc97f74812d44318f028323a8d8eee68a30a6db4635443910a525d6669f73
+size 6591685
diff --git a/checkpoint-14000/pytorch_model.bin b/checkpoint-14000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..98979c45232097e6ecfa975764fbfad776d97806
--- /dev/null
+++ b/checkpoint-14000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a9650a3ec2aab25d9ca5bfc4c57fb3123d3eaffe87d4da7cb55fbc8530fbcc6
+size 3285965
diff --git a/checkpoint-14000/random_states_0.pkl b/checkpoint-14000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..d0b5022c646ad234f4fe7c09d9d589372ef74e5c
--- /dev/null
+++ b/checkpoint-14000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a37d37b407c4edbde4c991625229b4163c1c14a4cd9bdefe169acd3bcf7f652
+size 14727
diff --git a/checkpoint-14000/scaler.pt b/checkpoint-14000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..77e24425b55ee0b55001f14cef4cd3da2bd013e5
--- /dev/null
+++ b/checkpoint-14000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16b2bfa6474626923eabc41533579ef844795020f8f96145f8ecd1d198ff615b
+size 557
diff --git a/checkpoint-14000/scheduler.bin b/checkpoint-14000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5df16ef869111f94ed392bacc3c48cde311b6697
--- /dev/null
+++ b/checkpoint-14000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df3ba8b6b843efd5c3d371d5f9facc42227f6fc75db355eb7ec57511a4295c4
+size 563
diff --git a/checkpoint-14500/optimizer.bin b/checkpoint-14500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..168649488a91e7e102fd539cfd99a2e577131953
--- /dev/null
+++ b/checkpoint-14500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:129fe6ba36d974911cf3de6fc197b17e6d6fce653c8a00b403cd071fcbc74057
+size 6591685
diff --git a/checkpoint-14500/pytorch_model.bin b/checkpoint-14500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..aac3a4498acb32d2099e61c6af4b4ab3e66f0d79
--- /dev/null
+++ b/checkpoint-14500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3f9bb9307fb9b3918a11088179751bd6546a1a16aecec4d6250618313a2cbc7
+size 3285965
diff --git a/checkpoint-14500/random_states_0.pkl b/checkpoint-14500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..03af152beca26c297c755527bfaba5609ffb7c12
--- /dev/null
+++ b/checkpoint-14500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6e5405ec887cdcc5737ec515f16fdc4663548266acfeeb8dd42e1314613086b
+size 14727
diff --git a/checkpoint-14500/scaler.pt b/checkpoint-14500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..170f260c29cbc9aa7bd63d8f369ce38903a6f23a
--- /dev/null
+++ b/checkpoint-14500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f2e917ec3b22c53ed9bdb7d9dda3704bdf5804a272f07ddb76d262b5275974e
+size 557
diff --git a/checkpoint-14500/scheduler.bin b/checkpoint-14500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ec943fdc3b432503566359f118c6b452884b1fdd
--- /dev/null
+++ b/checkpoint-14500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c475c26c61746a46cc886540d1806dd8ebf7f7affe40219659c5c49042fb53b0
+size 563
diff --git a/checkpoint-1500/optimizer.bin b/checkpoint-1500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5e5f87cf28b36efbf03482ad9c3c04886a9960c0
--- /dev/null
+++ b/checkpoint-1500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:315c5523cf477f4f207ae9cd34cb7ac86b8f52140c97976aac25e4c63013a3a3
+size 6591685
diff --git a/checkpoint-1500/pytorch_model.bin b/checkpoint-1500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..dc27027e35e364e7213df534cb4dd3784750e5d7
--- /dev/null
+++ b/checkpoint-1500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53738e5115024b00031085f8ff41290a8e9573391f2aff74141ee268cfe36934
+size 3285965
diff --git a/checkpoint-1500/random_states_0.pkl b/checkpoint-1500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..aca9fa4192fd93988e94e8c9489535975dfdf7a3
--- /dev/null
+++ b/checkpoint-1500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a73a1e44f2a8214faeeb7956028399621a2fd8ad8b82b7bf3d00320ef42215cb
+size 14727
diff --git a/checkpoint-1500/scaler.pt b/checkpoint-1500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..69c76b8f27f096d1d1a3d9d0e387af1a9cf5028d
--- /dev/null
+++ b/checkpoint-1500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:203a72d6c29f42a0e2964fdddc8d7a98df1eccee78fea9de0fa416613390f5c6
+size 557
diff --git a/checkpoint-1500/scheduler.bin b/checkpoint-1500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6ce9052db3bd932cc5e0fd1e33ba0aa41309384a
--- /dev/null
+++ b/checkpoint-1500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc22c61ac589ef1b5e5676c8ce7759a3f07db30a6f6e5a6c161f975c665ffb2e
+size 563
diff --git a/checkpoint-15000/optimizer.bin b/checkpoint-15000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..29a2a22331f1ff0186fbce1dc355ad5dd42a04fd
--- /dev/null
+++ b/checkpoint-15000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36920456a44c8401807b57719bd96e1faf946c5a83dd519959f052a3cf290f54
+size 6591685
diff --git a/checkpoint-15000/pytorch_model.bin b/checkpoint-15000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..386214b3c155291b92722664231c576a94c67cc9
--- /dev/null
+++ b/checkpoint-15000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc91592797ebf6158d2d03c8e06bdb8b8bba611a5fbd861418e00048b424db57
+size 3285965
diff --git a/checkpoint-15000/random_states_0.pkl b/checkpoint-15000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..0b67d18f942853418d1c682ba4984813d207f27d
--- /dev/null
+++ b/checkpoint-15000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ba343e899f7ec7cf8ddabbb0a35f66c9d3bccf82a21cb3c72540d8b1399edd3
+size 14727
diff --git a/checkpoint-15000/scaler.pt b/checkpoint-15000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..29312d48fe0c2715d29a9441129c714a6ea8e195
--- /dev/null
+++ b/checkpoint-15000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9fb33b1da2125feaa501f6acf7c67409ff703b0320e8e55bf616863891dd3346
+size 557
diff --git a/checkpoint-15000/scheduler.bin b/checkpoint-15000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1b8e5de325822eab27aa5a97eacbb6621abe1b92
--- /dev/null
+++ b/checkpoint-15000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8f82441aeeb3377e2aa2a9cea9dd30a90572c9fef336ade00148614d795ec89
+size 563
diff --git a/checkpoint-2000/optimizer.bin b/checkpoint-2000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4d021a3819d38a159eee1387b20266c892266bda
--- /dev/null
+++ b/checkpoint-2000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89d31dca0d4cda9fca0618bf832eafac2ccb74e0d3da356e8759735c0f92346c
+size 6591685
diff --git a/checkpoint-2000/pytorch_model.bin b/checkpoint-2000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2008251b9317ae0070d7890bcf76469b39192656
--- /dev/null
+++ b/checkpoint-2000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f381ce1a5db0f1f13e7426f99d0b5cea72362f6092b6bed6946c187336caeac7
+size 3285965
diff --git a/checkpoint-2000/random_states_0.pkl b/checkpoint-2000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..e5093e62414957a21fcb3f79c9e4b784ead6e406
--- /dev/null
+++ b/checkpoint-2000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffe1993dd3383292e3b18fe6db1fd9d0f4c480d5f1e6b11bd81d4838ce65ab18
+size 14727
diff --git a/checkpoint-2000/scaler.pt b/checkpoint-2000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..21ece35416ce79724d347155f11cfa297b97cabc
--- /dev/null
+++ b/checkpoint-2000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd2de9749828adacdf103bf6e9592702bb7067a2c1df27dd62ab38c1eb8c070f
+size 557
diff --git a/checkpoint-2000/scheduler.bin b/checkpoint-2000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1931273b6eb017aea3e72b81c6c55116e80e513c
--- /dev/null
+++ b/checkpoint-2000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be9b855823954d49c1ce296b3f857a1c9801a9f569f7c1eeb228df6878a97ad0
+size 563
diff --git a/checkpoint-2500/optimizer.bin b/checkpoint-2500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..db0b02d38e29e30f53516fb657a4b81d19f55e60
--- /dev/null
+++ b/checkpoint-2500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22621a94785c71e503bc7efc81f37cea87736573346c51800a2eb6567985c14f
+size 6591685
diff --git a/checkpoint-2500/pytorch_model.bin b/checkpoint-2500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..50eafee1cb820b6181b2f07ef7da848cdd37fe5c
--- /dev/null
+++ b/checkpoint-2500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b618e3940a2c7a9412ac72c6fce0cf1e0163729f98990b55b7806ae8d186121
+size 3285965
diff --git a/checkpoint-2500/random_states_0.pkl b/checkpoint-2500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..b4ec4380d8a851afde6c3bb38733eb8d2ff8152e
--- /dev/null
+++ b/checkpoint-2500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a045a1cda386abd1ed6449c43c51af09c3e5cb4de29731bfc8eea61200765577
+size 14727
diff --git a/checkpoint-2500/scaler.pt b/checkpoint-2500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9a3e6e139e26b18feddbe2f66ba98b73890dacfa
--- /dev/null
+++ b/checkpoint-2500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fbcebc8f5487b0c117b5dd47f2ea304af3eebf408d297118d9307e1223927e1
+size 557
diff --git a/checkpoint-2500/scheduler.bin b/checkpoint-2500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..26ee5a0e4933e0e297b5c111c9f26f32148a9a8c
--- /dev/null
+++ b/checkpoint-2500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25f683f29bf0059657e53c72b651e35e45e03188dcc461d12de5dfc012d3e2e8
+size 563
diff --git a/checkpoint-3000/optimizer.bin b/checkpoint-3000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1ac543dc96d82e67139270405a77c7ad5accc399
--- /dev/null
+++ b/checkpoint-3000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e0edff84344f4e21729e2d241ad9e0caf553013acb21518e907ae4f2458950f
+size 6591685
diff --git a/checkpoint-3000/pytorch_model.bin b/checkpoint-3000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0800a128e3236e792a663573daba52b59f27c504
--- /dev/null
+++ b/checkpoint-3000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b1b80c7a5bc7333764d6d6dac9cd19265e5344720c9095055c53a50f05bf621
+size 3285965
diff --git a/checkpoint-3000/random_states_0.pkl b/checkpoint-3000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..d65d7dbd6fc4da12fa2dcb868f0cdea996d3f8c9
--- /dev/null
+++ b/checkpoint-3000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f842d915b624719efd45edb422b5e2d65837901a1b9bc2ed8a8efff5f283e5e
+size 14727
diff --git a/checkpoint-3000/scaler.pt b/checkpoint-3000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2888c9ddc2c30e6b47ce0e6abcc8e2d1303aab00
--- /dev/null
+++ b/checkpoint-3000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb1f9398b77268202e8e1465734a63d123b1ef11c27f20f2473677e9883a6869
+size 557
diff --git a/checkpoint-3000/scheduler.bin b/checkpoint-3000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b6e98847b93e436117fe690f647a6277552aa287
--- /dev/null
+++ b/checkpoint-3000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a9ae9a509360c94426bc42eb6a1f3a7459e5148861a2fd1fc770ca85dba71dd
+size 563
diff --git a/checkpoint-3500/optimizer.bin b/checkpoint-3500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bf947bc9463ba7bd5e771413e73b4ed69dcf6ce0
--- /dev/null
+++ b/checkpoint-3500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d125bda2c0044cd703f281340b60139875141b0c2c8e6f6e931e35f6051971d
+size 6591685
diff --git a/checkpoint-3500/pytorch_model.bin b/checkpoint-3500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..19a24570f4309227c975fe73ed713846533d4788
--- /dev/null
+++ b/checkpoint-3500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2589708ec2edc146a19d74db9cfc5d7e3ba8e8704bf83bb1afce4b79081978f8
+size 3285965
diff --git a/checkpoint-3500/random_states_0.pkl b/checkpoint-3500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..a9f7d82c4a3618ba4ddd1145ef0b89281e6f2436
--- /dev/null
+++ b/checkpoint-3500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ac5bd516dee280aefdab8a859ffa49f951da039ff401f0fcdb3d8abdaff4846
+size 14727
diff --git a/checkpoint-3500/scaler.pt b/checkpoint-3500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..44e7bbfd017c097f419247a9c5de77cb216339ac
--- /dev/null
+++ b/checkpoint-3500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4cfe72973031c5a482f6e9ed0cb664a6aa7f63116e294e2b1f72d360d9b033c
+size 557
diff --git a/checkpoint-3500/scheduler.bin b/checkpoint-3500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..adc423dfe94237158bf06c4106a677db2a9ceacf
--- /dev/null
+++ b/checkpoint-3500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:148fdc4fe2bc5ddf5e4619a2d775aa17fa3488fcfa9817f64f34ee5884ac1279
+size 563
diff --git a/checkpoint-4000/optimizer.bin b/checkpoint-4000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f28b156d412f3699943592e6e8b2e7dbc732c21f
--- /dev/null
+++ b/checkpoint-4000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33d1edd183fc3759009f52fef8734391060aa9ef2c9cda2ae26619b07cfaa30d
+size 6591685
diff --git a/checkpoint-4000/pytorch_model.bin b/checkpoint-4000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0491b619413951c7cf3904d8aebee5b27492d3bf
--- /dev/null
+++ b/checkpoint-4000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfd6850e6bca5d7ae2e5584227603205a8778b33ebaa1670de0cb1671f80d728
+size 3285965
diff --git a/checkpoint-4000/random_states_0.pkl b/checkpoint-4000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..8226fc9095e35c80c4904921854a964e0361597a
--- /dev/null
+++ b/checkpoint-4000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdb3d39958f79f663b674150e2ca5fdceb54a6aca62a3cca32800f2ec15e1442
+size 14727
diff --git a/checkpoint-4000/scaler.pt b/checkpoint-4000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e6dfc1dc5fe962ebf4423f5d584aeb4e90dc01be
--- /dev/null
+++ b/checkpoint-4000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09a61ecdaa4a69ba865e0b9e19169660549c34c60a83537ab5ece413678520ab
+size 557
diff --git a/checkpoint-4000/scheduler.bin b/checkpoint-4000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9b8f03b627866c850729953617364804b5821955
--- /dev/null
+++ b/checkpoint-4000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:348cbc9e127881a00f9142d5968a72a9dd9def447beae7309054eba7cc19ea63
+size 563
diff --git a/checkpoint-4500/optimizer.bin b/checkpoint-4500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a490349f79ac9f208cb3090fd61e29d3ff1e2f71
--- /dev/null
+++ b/checkpoint-4500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ef65ace1172a5097614506ff37aa223057fd066b16dbb7a6ec5d7311c713671
+size 6591685
diff --git a/checkpoint-4500/pytorch_model.bin b/checkpoint-4500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..59912455611648b21f73a8f992bcfef8d8cd058c
--- /dev/null
+++ b/checkpoint-4500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55b85e57c1dc61493e70f7606d220784bad0f633f64ab50f838a159ecba3c082
+size 3285965
diff --git a/checkpoint-4500/random_states_0.pkl b/checkpoint-4500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..64aef0ff3d990f2331d1a8f069df9e20144cdced
--- /dev/null
+++ b/checkpoint-4500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1525bfced080949560c09ae6cf76d7dd0a4e89b32ae953b50edcc99f8ab57d84
+size 14727
diff --git a/checkpoint-4500/scaler.pt b/checkpoint-4500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8c98e356000f5210c0d0307e76e41b897f221bfa
--- /dev/null
+++ b/checkpoint-4500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:945e2cc9e717c41b0b1286d550bee32d1e1a6cb3d9a42935c0294b123372c89f
+size 557
diff --git a/checkpoint-4500/scheduler.bin b/checkpoint-4500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..980f9797c1707a06cf52fd1185e859aafc5051cc
--- /dev/null
+++ b/checkpoint-4500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9537e1517d4d89c5ddd53515f56a5086752fa22adb3b5399771d7be93c40dbc5
+size 563
diff --git a/checkpoint-500/optimizer.bin b/checkpoint-500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3b414a5a269d2236d404e7b8a274d2d80668c663
--- /dev/null
+++ b/checkpoint-500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92fd5b9a84dd74229fed137b440d731a8037fdffcd72b80b1d9118dd7bc1fc97
+size 6591685
diff --git a/checkpoint-500/pytorch_model.bin b/checkpoint-500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cc828fe53c97658a7145b82accd618da52485d3a
--- /dev/null
+++ b/checkpoint-500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3635d01f6b927c34c88a927227c3338159614a9df428bff74c0368b7a2a6bf3e
+size 3285965
diff --git a/checkpoint-500/random_states_0.pkl b/checkpoint-500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..b221584f962c5c183c027c65f792415eee1faa67
--- /dev/null
+++ b/checkpoint-500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5bdc563c03285285fb411a96e94cc2f9467e6f488979167e482892d35ca92772
+size 14727
diff --git a/checkpoint-500/scaler.pt b/checkpoint-500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b5f2e5ee77c2cef5eff816170558ce9209855685
--- /dev/null
+++ b/checkpoint-500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3f196a54202bb4ba1220e8c59f42f9cda0702d68ea83147d814c2fb2f36b8f2
+size 557
diff --git a/checkpoint-500/scheduler.bin b/checkpoint-500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e3d56ca17df4cc026897bb1b0d3b56a067b9a0ce
--- /dev/null
+++ b/checkpoint-500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a7538dcdd8450f8bc2ecbfe5610b71690c99351a40a17986b6442f95326746a
+size 563
diff --git a/checkpoint-5000/optimizer.bin b/checkpoint-5000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7777affdad39539d2e49e0a14dfaff19018c8bc4
--- /dev/null
+++ b/checkpoint-5000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99196e67ea3a934936f9d4901d1096644d6e4521d41d069826f1e03ace47187d
+size 6591685
diff --git a/checkpoint-5000/pytorch_model.bin b/checkpoint-5000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6e30dc644ca3d3622e96e6e7dbf47056295f8882
--- /dev/null
+++ b/checkpoint-5000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3962e8c5a0b7d5adb4219eb2374b66ddf742bf19e9348ac632210cbca04cbd25
+size 3285965
diff --git a/checkpoint-5000/random_states_0.pkl b/checkpoint-5000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..e6ca24e520ae7777b58ee8d08aa7958dc4b0ae7d
--- /dev/null
+++ b/checkpoint-5000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d729fd3f9d19a739a12092c7308d4e498e41d8727a7a386d38a98da55f70aeeb
+size 14727
diff --git a/checkpoint-5000/scaler.pt b/checkpoint-5000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9a0a5a2d6700d39df12b85e2a8ee02926333f46a
--- /dev/null
+++ b/checkpoint-5000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a41313b6733b65c1acca917a7c2ced06503c11b898e6cd5de1e6a9f13bfb54ef
+size 557
diff --git a/checkpoint-5000/scheduler.bin b/checkpoint-5000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1b6403dbf726b84120067b2c8f2b4cc208cc632a
--- /dev/null
+++ b/checkpoint-5000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0934ff0e523218f7c1f36ce46dd3ea055bb0b91847c16c53db69c7696ab1b7f3
+size 563
diff --git a/checkpoint-5500/optimizer.bin b/checkpoint-5500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..99bc55893b955e8993594fcf5b49f65eb8936437
--- /dev/null
+++ b/checkpoint-5500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49d709780ffbaaa572a1f927f44786cfd4ba2528a5238ba0be9386cf3cabd49e
+size 6591685
diff --git a/checkpoint-5500/pytorch_model.bin b/checkpoint-5500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5131460e3a7450773e192886e7da1bbfc2a09a61
--- /dev/null
+++ b/checkpoint-5500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:976196fbd3ee4f5e11cd707c8873184ef88f90a232247547c91a845ff036ff8e
+size 3285965
diff --git a/checkpoint-5500/random_states_0.pkl b/checkpoint-5500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..9901dc74f21c5b2ae2410a7b2019f1296319b2b4
--- /dev/null
+++ b/checkpoint-5500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4cef712d12ecd4288784733e7db339ba2f7081f96118d0dbe9c663a8ec9078f8
+size 14727
diff --git a/checkpoint-5500/scaler.pt b/checkpoint-5500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d7dc8ce86b334ba316696a99f82e1bc011bdcdcd
--- /dev/null
+++ b/checkpoint-5500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:367928c873a79d90abcb66db0f8b320eea7346dc9bc779b4e7963dfc82cf2ada
+size 557
diff --git a/checkpoint-5500/scheduler.bin b/checkpoint-5500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fd5be2b0a6572d93fc428a08e091bb21754634a2
--- /dev/null
+++ b/checkpoint-5500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1e5e23162faa3524039ea17fe0d79306f5214ba14449cd7958f25504925237f
+size 563
diff --git a/checkpoint-6000/optimizer.bin b/checkpoint-6000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4157967a27dd6eb2945d4de1c98a14f05c93abe1
--- /dev/null
+++ b/checkpoint-6000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9611139ac467a945346f09e4c4730b8efcc72a08b8b6606d5a3cb5d31e117de0
+size 6591685
diff --git a/checkpoint-6000/pytorch_model.bin b/checkpoint-6000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a2ca5d11d66e5e0d779d0616adfc928f61500b50
--- /dev/null
+++ b/checkpoint-6000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5b59c50faa300a4981ba2748fd5fac8cb1e2dfc63bdd5b74b66baa7bd8ca745
+size 3285965
diff --git a/checkpoint-6000/random_states_0.pkl b/checkpoint-6000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..b708896fbb9c9152a99f9595cb0406d391a55a0d
--- /dev/null
+++ b/checkpoint-6000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ccbed282d42902ea89f915bc118bc5c7fe8cf000d1c5162beb3cc55347692038
+size 14727
diff --git a/checkpoint-6000/scaler.pt b/checkpoint-6000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2fd17ac3026715d5a68d70922df4742631319059
--- /dev/null
+++ b/checkpoint-6000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a00ef97fc41c0d653df466126d855f16646dd10726d3ac99a23f55ca70ec19a
+size 557
diff --git a/checkpoint-6000/scheduler.bin b/checkpoint-6000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..64f2b3be1e5e549e8e4c1401c5da7ff64c53ee2b
--- /dev/null
+++ b/checkpoint-6000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3989e3dcf008f6c3577a1960871d9db766f7f398c9752ee9d657fd639bf2a0bf
+size 563
diff --git a/checkpoint-6500/optimizer.bin b/checkpoint-6500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ca1abca7e0d07aa04c8fb7911ef25ede19c82ff5
--- /dev/null
+++ b/checkpoint-6500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03630fcceb2419f65f66a156a66fe534db3641bad9b80fb3f6ea53ebe44834ed
+size 6591685
diff --git a/checkpoint-6500/pytorch_model.bin b/checkpoint-6500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d1b8e09c696449326b26609f9c41693c4601bbec
--- /dev/null
+++ b/checkpoint-6500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a120382044e0fda6a0eede2fb6cab4d4a592a655c1d57d97c9b3b101ca30fde8
+size 3285965
diff --git a/checkpoint-6500/random_states_0.pkl b/checkpoint-6500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..bfa8da2fd44045ac67c2ffe3c3d29a9960248d30
--- /dev/null
+++ b/checkpoint-6500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8def49e4d39a263bfb0d96e8eae0a4c4e03baebc074c333d01b9c5a5760dacaa
+size 14727
diff --git a/checkpoint-6500/scaler.pt b/checkpoint-6500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..40275aa28a49ee92274a571543131266eb2b70e2
--- /dev/null
+++ b/checkpoint-6500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:443cc2fd4ac32450c4c0d394c84b6f1899943a2f8743462b4026e2d2fe3e5de6
+size 557
diff --git a/checkpoint-6500/scheduler.bin b/checkpoint-6500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c1f8e1897b95534d4cd72c45d791981ae99cbe4b
--- /dev/null
+++ b/checkpoint-6500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cbe9f4109cafdb13a813e09e6b07c6886e665eb8c5ea6996a788972238e6c640
+size 563
diff --git a/checkpoint-7000/optimizer.bin b/checkpoint-7000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..26dbbbfb3ba282e3999652daff961d3ae845fa12
--- /dev/null
+++ b/checkpoint-7000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d695373971951373cc8db8230fcf2781942c42cd3f5319b2c370d2043cf60f5c
+size 6591685
diff --git a/checkpoint-7000/pytorch_model.bin b/checkpoint-7000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ed08f9e9f869ffc3dddfe40ae7263461e5978a5b
--- /dev/null
+++ b/checkpoint-7000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf9b3ba3a81a16e504c9e8edb88dc96d346385ee43821de4b6d079037edb4140
+size 3285965
diff --git a/checkpoint-7000/random_states_0.pkl b/checkpoint-7000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..28d76467a60fbb1b8fb22fdaaa4a269cea7c7a3a
--- /dev/null
+++ b/checkpoint-7000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2755e1595b55b7d52f7aec1ef6952cce3109be2463f43e5b052317f3acd98d30
+size 14727
diff --git a/checkpoint-7000/scaler.pt b/checkpoint-7000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2ceac8d008df631f48c13ff48e83da859977ff02
--- /dev/null
+++ b/checkpoint-7000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d22aed3a858ee389297b2101bb2ed7ee9e37744f1a1f75273dbdf045d65b041a
+size 557
diff --git a/checkpoint-7000/scheduler.bin b/checkpoint-7000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8fc8595c60028004807a629e942cb1855cd44e32
--- /dev/null
+++ b/checkpoint-7000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d279ededd51eead350fd74fb3a0ca46fef042e3f81438e76da6dd307c4e28563
+size 563
diff --git a/checkpoint-7500/optimizer.bin b/checkpoint-7500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..263579a3f1f11527578db392d65922c2c38365bd
--- /dev/null
+++ b/checkpoint-7500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1783672c1737ada5c8dce5ba267d53801c313a7c7c635b2e7ad37dc93434888
+size 6591685
diff --git a/checkpoint-7500/pytorch_model.bin b/checkpoint-7500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0e713865ad19460e9206c1de82649788cfbf045e
--- /dev/null
+++ b/checkpoint-7500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2371f8a826252a3b81ad3a3ef3f7fae1d1f9c52483c7ad2d2a0252a7bd31c0d3
+size 3285965
diff --git a/checkpoint-7500/random_states_0.pkl b/checkpoint-7500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..edfd566cf82e4479dbef1d444bb71ec94bea8ca6
--- /dev/null
+++ b/checkpoint-7500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a016d8e0606869bc7cd94e07b23d3a6b7a80e4dea7bad5d5c327d1eb6bd26bd
+size 14727
diff --git a/checkpoint-7500/scaler.pt b/checkpoint-7500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..60bdd119cc6377670954947196f4e3abc08dcce1
--- /dev/null
+++ b/checkpoint-7500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fab868730e2cdc5d83f898b6c5399b352e14373e48bdbdfa9179803e96fd314f
+size 557
diff --git a/checkpoint-7500/scheduler.bin b/checkpoint-7500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6e3a996f8e96beacd69f05ee11cc5b5b03832ade
--- /dev/null
+++ b/checkpoint-7500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:046ae477d5159c6733091ae49c696c5585d3f3c8502557ab5979cd4c7f973b4a
+size 563
diff --git a/checkpoint-8000/optimizer.bin b/checkpoint-8000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f27b8d1d6c5c6aad18fea4257ebbada873781a45
--- /dev/null
+++ b/checkpoint-8000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95fee9a8c188713e3d8cc448cc991478df93183c0359becd33a09f327bde10a2
+size 6591685
diff --git a/checkpoint-8000/pytorch_model.bin b/checkpoint-8000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..18e972898eb986aa72da80acf1ebbc540e70559a
--- /dev/null
+++ b/checkpoint-8000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa918eca97ebc573cfa7bf9036270a3f0e48743a54cd7011388865862ebef00d
+size 3285965
diff --git a/checkpoint-8000/random_states_0.pkl b/checkpoint-8000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..7fb2b2a42128e609736ff3e30b7dedd4dcecbdca
--- /dev/null
+++ b/checkpoint-8000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2c17786a789e1639cf70b0f0c27304a1098b3cff5d695a893368ff5198c8636
+size 14727
diff --git a/checkpoint-8000/scaler.pt b/checkpoint-8000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..fd25dc8bd6a1a15851636b4ebafe829cb0ed1b0d
--- /dev/null
+++ b/checkpoint-8000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b278cd6e09360f31a3d837f80dee4c2ce4d9c9d186a939ecf157e1a0deb793f3
+size 557
diff --git a/checkpoint-8000/scheduler.bin b/checkpoint-8000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4aba85b9ea3158dbf92e16da0d8ebcf11d669d3c
--- /dev/null
+++ b/checkpoint-8000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a02be91ecaf51cdf6a8aa67751a3c817d0b1386e75c1a233fc8091862b7a867d
+size 563
diff --git a/checkpoint-8500/optimizer.bin b/checkpoint-8500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d1172126aefc170910ec2cfaecfebdddc4facec3
--- /dev/null
+++ b/checkpoint-8500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d38078d91bd1d12ac466115c9e117f8f79e7449473cb9f771ec9d66a2010a9f
+size 6591685
diff --git a/checkpoint-8500/pytorch_model.bin b/checkpoint-8500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..379140ead0fcf7ac53060ee809a3561172278e9c
--- /dev/null
+++ b/checkpoint-8500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1bc18c9c0bf8751268ad987a95f108dbd8feebc4ea859ad30a203b6e75196afc
+size 3285965
diff --git a/checkpoint-8500/random_states_0.pkl b/checkpoint-8500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..57c8a405a6494dc26e41d5d35ba93d18f13ad3e7
--- /dev/null
+++ b/checkpoint-8500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e33e675102c3a82c6a6f691f177acf67cf252300f79342045da70bcd5f513f9a
+size 14727
diff --git a/checkpoint-8500/scaler.pt b/checkpoint-8500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3df7c4409be1ed9ef627cc178b2c13e5d9ed6b01
--- /dev/null
+++ b/checkpoint-8500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ed0eced7c80f9d236fedf48c2537ab9b4e6957d5129ef3aec426e866324bfd1
+size 557
diff --git a/checkpoint-8500/scheduler.bin b/checkpoint-8500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..68f2b8cf69cf91604665661e7e29fba18bce9be3
--- /dev/null
+++ b/checkpoint-8500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e202e93d2f5d536f4aef80b0a4ffca713b77972b279f447584cc203c7fec0150
+size 563
diff --git a/checkpoint-9000/optimizer.bin b/checkpoint-9000/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bb72ccf1e4482db62725d1816b733a643778d354
--- /dev/null
+++ b/checkpoint-9000/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1aa531fd97a277d5f7289ec8e0760b7496c9beb857e7b782a1aba387f2dffa85
+size 6591685
diff --git a/checkpoint-9000/pytorch_model.bin b/checkpoint-9000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e52048c95aa8760807294f08f2246bf0877bfb93
--- /dev/null
+++ b/checkpoint-9000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ae05b82a99e9e8e8d3399bbb099c3ce2b665e89470118878118b110c9301701
+size 3285965
diff --git a/checkpoint-9000/random_states_0.pkl b/checkpoint-9000/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..b75f120b051b03dc98e52bde51fb73f2bb442151
--- /dev/null
+++ b/checkpoint-9000/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2dc57dfead8506c1fc20c1fe577f472fba528867d0b380542bc4269c750fb89
+size 14727
diff --git a/checkpoint-9000/scaler.pt b/checkpoint-9000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7c7306b1b0e5b21e2fec7bb12b5a46cc7b66591f
--- /dev/null
+++ b/checkpoint-9000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4be7a7b56cfc3e79d33648a4c49d4f11c6593d8d653b129207b2e38f2684a284
+size 557
diff --git a/checkpoint-9000/scheduler.bin b/checkpoint-9000/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..add03dfac4fb3c8a7f73371f538c92700a97fa67
--- /dev/null
+++ b/checkpoint-9000/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd55874657c9eed6ac902f5dc82cb86b6c8ea3750f15656abe487477ff6cef44
+size 563
diff --git a/checkpoint-9500/optimizer.bin b/checkpoint-9500/optimizer.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3bd37cc2b1e8068e2f86a5ce50fb9e6021ad423e
--- /dev/null
+++ b/checkpoint-9500/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:881be516600bd44df6cc88e062993a7d196f23ef30b4afebd6da51311d89009c
+size 6591685
diff --git a/checkpoint-9500/pytorch_model.bin b/checkpoint-9500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6688f5c24093781b4e6b520e967953ed60143204
--- /dev/null
+++ b/checkpoint-9500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:094846519db8e1c8db00cc8be67c9126bc0e999bc9c2b10fd1ea16de3390d66c
+size 3285965
diff --git a/checkpoint-9500/random_states_0.pkl b/checkpoint-9500/random_states_0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..587f2f811a3f1090f0760e49c171e6173219a4e8
--- /dev/null
+++ b/checkpoint-9500/random_states_0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8dabe6d08fe06476c9244037b2931f062ca1d76a0beb8836a98159f73e53c920
+size 14727
diff --git a/checkpoint-9500/scaler.pt b/checkpoint-9500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c37ed8f6ad3a1ccbab9977f6091f818ec6c46c95
--- /dev/null
+++ b/checkpoint-9500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0ba14834a4ee194de1acdb54bf5273a1da6bd2ffcc1c8c635b51720d72fbde8
+size 557
diff --git a/checkpoint-9500/scheduler.bin b/checkpoint-9500/scheduler.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ea7e65c8baf2389098bcff731e355d44d4d341f0
--- /dev/null
+++ b/checkpoint-9500/scheduler.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b649e1a2367ab35a45f791d4563e40ed53fe4c1c45116d4cc2fbeea1f98518d2
+size 563
diff --git a/diffuers_requirements.txt b/diffuers_requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d5149854527aa74cd91868eaea3d859df0a518ea
--- /dev/null
+++ b/diffuers_requirements.txt
@@ -0,0 +1,8 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+datasets
+ftfy
+tensorboard
+Jinja2
+wandb
\ No newline at end of file
diff --git a/image_0.png b/image_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..5220d0f9dbfa747cdb356761a73beb1b1f1c96bb
Binary files /dev/null and b/image_0.png differ
diff --git a/image_1.png b/image_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..4b29ad83b038c1e5ee09e474c6db77557a498c29
Binary files /dev/null and b/image_1.png differ
diff --git a/image_2.png b/image_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..dcf77470f715043a413797a013899942c9de78e8
Binary files /dev/null and b/image_2.png differ
diff --git a/image_3.png b/image_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..218361d6c8aa9e9961ef3d1577ecb170f5f22c5b
Binary files /dev/null and b/image_3.png differ
diff --git a/logs/text2image-fine-tune/1685011096.9147234/events.out.tfevents.1685011096.iZt4n6er62uu4xnw6wibnhZ.3007136.1 b/logs/text2image-fine-tune/1685011096.9147234/events.out.tfevents.1685011096.iZt4n6er62uu4xnw6wibnhZ.3007136.1
new file mode 100644
index 0000000000000000000000000000000000000000..ce210d58feffc2c972d347358545a1c342d86d57
--- /dev/null
+++ b/logs/text2image-fine-tune/1685011096.9147234/events.out.tfevents.1685011096.iZt4n6er62uu4xnw6wibnhZ.3007136.1
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9866580e57db4565ab5c43bf66b16a6d28e8d925b6ffa8bea536d2c3742958e1
+size 2259
diff --git a/logs/text2image-fine-tune/1685011096.9157877/hparams.yml b/logs/text2image-fine-tune/1685011096.9157877/hparams.yml
new file mode 100644
index 0000000000000000000000000000000000000000..14eba9b972141fb71a381027e22accf1ee8c53fc
--- /dev/null
+++ b/logs/text2image-fine-tune/1685011096.9157877/hparams.yml
@@ -0,0 +1,47 @@
+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_epsilon: 1.0e-08
+adam_weight_decay: 0.01
+allow_tf32: false
+cache_dir: null
+caption_column: text
+center_crop: true
+checkpointing_steps: 500
+checkpoints_total_limit: null
+dataloader_num_workers: 8
+dataset_config_name: null
+dataset_name: lambdalabs/pokemon-blip-captions
+enable_xformers_memory_efficient_attention: false
+gradient_accumulation_steps: 4
+gradient_checkpointing: false
+hub_model_id: ''
+hub_token: null
+image_column: image
+learning_rate: 0.0001
+local_rank: -1
+logging_dir: logs
+lr_scheduler: cosine
+lr_warmup_steps: 0
+max_grad_norm: 1.0
+max_train_samples: null
+max_train_steps: 15000
+mixed_precision: null
+noise_offset: 0
+num_train_epochs: 72
+num_validation_images: 4
+output_dir: /home/long.qul/tritontest
+pretrained_model_name_or_path: runwayml/stable-diffusion-v1-5
+push_to_hub: true
+random_flip: true
+report_to: tensorboard
+resolution: 512
+resume_from_checkpoint: null
+revision: null
+scale_lr: false
+seed: 1337
+snr_gamma: null
+train_batch_size: 1
+train_data_dir: null
+use_8bit_adam: false
+validation_epochs: 1
+validation_prompt: A pokemon with blue eyes.
diff --git a/logs/text2image-fine-tune/1685011152.8369217/events.out.tfevents.1685011152.iZt4n6er62uu4xnw6wibnhZ.3013999.1 b/logs/text2image-fine-tune/1685011152.8369217/events.out.tfevents.1685011152.iZt4n6er62uu4xnw6wibnhZ.3013999.1
new file mode 100644
index 0000000000000000000000000000000000000000..6d0b8b56fbef0d6884a88717b23bc3fd61c6e79f
--- /dev/null
+++ b/logs/text2image-fine-tune/1685011152.8369217/events.out.tfevents.1685011152.iZt4n6er62uu4xnw6wibnhZ.3013999.1
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b57599fafc10773dc9cd4c24424acb8fe8c6cb8b80a7db01e101753967c8746e
+size 2259
diff --git a/logs/text2image-fine-tune/1685011152.8379972/hparams.yml b/logs/text2image-fine-tune/1685011152.8379972/hparams.yml
new file mode 100644
index 0000000000000000000000000000000000000000..14eba9b972141fb71a381027e22accf1ee8c53fc
--- /dev/null
+++ b/logs/text2image-fine-tune/1685011152.8379972/hparams.yml
@@ -0,0 +1,47 @@
+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_epsilon: 1.0e-08
+adam_weight_decay: 0.01
+allow_tf32: false
+cache_dir: null
+caption_column: text
+center_crop: true
+checkpointing_steps: 500
+checkpoints_total_limit: null
+dataloader_num_workers: 8
+dataset_config_name: null
+dataset_name: lambdalabs/pokemon-blip-captions
+enable_xformers_memory_efficient_attention: false
+gradient_accumulation_steps: 4
+gradient_checkpointing: false
+hub_model_id: ''
+hub_token: null
+image_column: image
+learning_rate: 0.0001
+local_rank: -1
+logging_dir: logs
+lr_scheduler: cosine
+lr_warmup_steps: 0
+max_grad_norm: 1.0
+max_train_samples: null
+max_train_steps: 15000
+mixed_precision: null
+noise_offset: 0
+num_train_epochs: 72
+num_validation_images: 4
+output_dir: /home/long.qul/tritontest
+pretrained_model_name_or_path: runwayml/stable-diffusion-v1-5
+push_to_hub: true
+random_flip: true
+report_to: tensorboard
+resolution: 512
+resume_from_checkpoint: null
+revision: null
+scale_lr: false
+seed: 1337
+snr_gamma: null
+train_batch_size: 1
+train_data_dir: null
+use_8bit_adam: false
+validation_epochs: 1
+validation_prompt: A pokemon with blue eyes.
diff --git a/logs/text2image-fine-tune/1685011523.6617844/events.out.tfevents.1685011523.iZt4n6er62uu4xnw6wibnhZ.3026645.1 b/logs/text2image-fine-tune/1685011523.6617844/events.out.tfevents.1685011523.iZt4n6er62uu4xnw6wibnhZ.3026645.1
new file mode 100644
index 0000000000000000000000000000000000000000..212c774cc7e4ff6509f5a7f99580ad7597385e13
--- /dev/null
+++ b/logs/text2image-fine-tune/1685011523.6617844/events.out.tfevents.1685011523.iZt4n6er62uu4xnw6wibnhZ.3026645.1
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:946f73b43c0af82192422f56cc99925e986d74aa2b5e065949ecfb088ff6069f
+size 2259
diff --git a/logs/text2image-fine-tune/1685011523.6627905/hparams.yml b/logs/text2image-fine-tune/1685011523.6627905/hparams.yml
new file mode 100644
index 0000000000000000000000000000000000000000..14eba9b972141fb71a381027e22accf1ee8c53fc
--- /dev/null
+++ b/logs/text2image-fine-tune/1685011523.6627905/hparams.yml
@@ -0,0 +1,47 @@
+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_epsilon: 1.0e-08
+adam_weight_decay: 0.01
+allow_tf32: false
+cache_dir: null
+caption_column: text
+center_crop: true
+checkpointing_steps: 500
+checkpoints_total_limit: null
+dataloader_num_workers: 8
+dataset_config_name: null
+dataset_name: lambdalabs/pokemon-blip-captions
+enable_xformers_memory_efficient_attention: false
+gradient_accumulation_steps: 4
+gradient_checkpointing: false
+hub_model_id: ''
+hub_token: null
+image_column: image
+learning_rate: 0.0001
+local_rank: -1
+logging_dir: logs
+lr_scheduler: cosine
+lr_warmup_steps: 0
+max_grad_norm: 1.0
+max_train_samples: null
+max_train_steps: 15000
+mixed_precision: null
+noise_offset: 0
+num_train_epochs: 72
+num_validation_images: 4
+output_dir: /home/long.qul/tritontest
+pretrained_model_name_or_path: runwayml/stable-diffusion-v1-5
+push_to_hub: true
+random_flip: true
+report_to: tensorboard
+resolution: 512
+resume_from_checkpoint: null
+revision: null
+scale_lr: false
+seed: 1337
+snr_gamma: null
+train_batch_size: 1
+train_data_dir: null
+use_8bit_adam: false
+validation_epochs: 1
+validation_prompt: A pokemon with blue eyes.
diff --git a/logs/text2image-fine-tune/1685012078.3065808/events.out.tfevents.1685012078.iZt4n6er62uu4xnw6wibnhZ.3042793.1 b/logs/text2image-fine-tune/1685012078.3065808/events.out.tfevents.1685012078.iZt4n6er62uu4xnw6wibnhZ.3042793.1
new file mode 100644
index 0000000000000000000000000000000000000000..4dd5153551f39d37ad22ddfcee1b1e2a1dc90309
--- /dev/null
+++ b/logs/text2image-fine-tune/1685012078.3065808/events.out.tfevents.1685012078.iZt4n6er62uu4xnw6wibnhZ.3042793.1
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfbcc7d557f871c3885602685760ddbc62b581d617f598b4d2a580aa3ce8ae82
+size 2259
diff --git a/logs/text2image-fine-tune/1685012078.3076277/hparams.yml b/logs/text2image-fine-tune/1685012078.3076277/hparams.yml
new file mode 100644
index 0000000000000000000000000000000000000000..14eba9b972141fb71a381027e22accf1ee8c53fc
--- /dev/null
+++ b/logs/text2image-fine-tune/1685012078.3076277/hparams.yml
@@ -0,0 +1,47 @@
+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_epsilon: 1.0e-08
+adam_weight_decay: 0.01
+allow_tf32: false
+cache_dir: null
+caption_column: text
+center_crop: true
+checkpointing_steps: 500
+checkpoints_total_limit: null
+dataloader_num_workers: 8
+dataset_config_name: null
+dataset_name: lambdalabs/pokemon-blip-captions
+enable_xformers_memory_efficient_attention: false
+gradient_accumulation_steps: 4
+gradient_checkpointing: false
+hub_model_id: ''
+hub_token: null
+image_column: image
+learning_rate: 0.0001
+local_rank: -1
+logging_dir: logs
+lr_scheduler: cosine
+lr_warmup_steps: 0
+max_grad_norm: 1.0
+max_train_samples: null
+max_train_steps: 15000
+mixed_precision: null
+noise_offset: 0
+num_train_epochs: 72
+num_validation_images: 4
+output_dir: /home/long.qul/tritontest
+pretrained_model_name_or_path: runwayml/stable-diffusion-v1-5
+push_to_hub: true
+random_flip: true
+report_to: tensorboard
+resolution: 512
+resume_from_checkpoint: null
+revision: null
+scale_lr: false
+seed: 1337
+snr_gamma: null
+train_batch_size: 1
+train_data_dir: null
+use_8bit_adam: false
+validation_epochs: 1
+validation_prompt: A pokemon with blue eyes.
diff --git a/logs/text2image-fine-tune/events.out.tfevents.1685011096.iZt4n6er62uu4xnw6wibnhZ.3007136.0 b/logs/text2image-fine-tune/events.out.tfevents.1685011096.iZt4n6er62uu4xnw6wibnhZ.3007136.0
new file mode 100644
index 0000000000000000000000000000000000000000..7668c16e7a0e700bb8c5628822562af7436eb0ce
--- /dev/null
+++ b/logs/text2image-fine-tune/events.out.tfevents.1685011096.iZt4n6er62uu4xnw6wibnhZ.3007136.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f168400191252e83e16d1a2d3598e2ea478a6c402e679474bb3bc94943aff86a
+size 88
diff --git a/logs/text2image-fine-tune/events.out.tfevents.1685011152.iZt4n6er62uu4xnw6wibnhZ.3013999.0 b/logs/text2image-fine-tune/events.out.tfevents.1685011152.iZt4n6er62uu4xnw6wibnhZ.3013999.0
new file mode 100644
index 0000000000000000000000000000000000000000..ab2d2a7b73df74e807699fec9fde26a04b2f9e7c
--- /dev/null
+++ b/logs/text2image-fine-tune/events.out.tfevents.1685011152.iZt4n6er62uu4xnw6wibnhZ.3013999.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:144c0749237695bb3b25ff4a775e7f707bc6c05adfc2b4b3b7eec72ba381e86c
+size 88
diff --git a/logs/text2image-fine-tune/events.out.tfevents.1685011523.iZt4n6er62uu4xnw6wibnhZ.3026645.0 b/logs/text2image-fine-tune/events.out.tfevents.1685011523.iZt4n6er62uu4xnw6wibnhZ.3026645.0
new file mode 100644
index 0000000000000000000000000000000000000000..9e9ce9705a47698b881f5891db9f53e874f110d8
--- /dev/null
+++ b/logs/text2image-fine-tune/events.out.tfevents.1685011523.iZt4n6er62uu4xnw6wibnhZ.3026645.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:07c803cca3414a548ed8f5dd9a432e43816c829616f6d3a3e4bce6a34eeb63b9
+size 88
diff --git a/logs/text2image-fine-tune/events.out.tfevents.1685012078.iZt4n6er62uu4xnw6wibnhZ.3042793.0 b/logs/text2image-fine-tune/events.out.tfevents.1685012078.iZt4n6er62uu4xnw6wibnhZ.3042793.0
new file mode 100644
index 0000000000000000000000000000000000000000..f9b6b11c0416f5105dae6b4be426183493dc991d
--- /dev/null
+++ b/logs/text2image-fine-tune/events.out.tfevents.1685012078.iZt4n6er62uu4xnw6wibnhZ.3042793.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13a926b29ef8ce895e845678f5a82e385b77efbb537185139d0c1ab73dc7b4a1
+size 98193447
diff --git a/lora_test_1.py b/lora_test_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..d071e108d6b73fc229bf171575486f0aa546c6f9
--- /dev/null
+++ b/lora_test_1.py
@@ -0,0 +1,908 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning script for Stable Diffusion for text2image with support for LoRA."""
+
+import argparse
+import logging
+import math
+import os
+import random
+from pathlib import Path
+
+import datasets
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import load_dataset
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPTextModel, CLIPTokenizer
+
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
+from diffusers.loaders import AttnProcsLayers
+from diffusers.models.attention_processor import LoRAAttnProcessor
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.17.0.dev0")
+
+logger = get_logger(__name__, log_level="INFO")
+
+
+def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None):
+ img_str = ""
+ for i, image in enumerate(images):
+ image.save(os.path.join(repo_folder, f"image_{i}.png"))
+ img_str += f"\n"
+
+ yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- text-to-image
+- diffusers
+- lora
+inference: true
+---
+ """
+ model_card = f"""
+# LoRA text2image fine-tuning - {repo_id}
+These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
+{img_str}
+"""
+ with open(os.path.join(repo_folder, "README.md"), "w") as f:
+ f.write(yaml + model_card)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="Simple example of a training script.")
+ parser.add_argument(
+ "--pretrained_model_name_or_path",
+ type=str,
+ default=None,
+ required=True,
+ help="Path to pretrained model or model identifier from huggingface.co/models.",
+ )
+ parser.add_argument(
+ "--revision",
+ type=str,
+ default=None,
+ required=False,
+ help="Revision of pretrained model identifier from huggingface.co/models.",
+ )
+ parser.add_argument(
+ "--dataset_name",
+ type=str,
+ default=None,
+ help=(
+ "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+ " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+ " or to a folder containing files that 🤗 Datasets can understand."
+ ),
+ )
+ parser.add_argument(
+ "--dataset_config_name",
+ type=str,
+ default=None,
+ help="The config of the Dataset, leave as None if there's only one config.",
+ )
+ parser.add_argument(
+ "--train_data_dir",
+ type=str,
+ default=None,
+ help=(
+ "A folder containing the training data. Folder contents must follow the structure described in"
+ " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+ " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+ ),
+ )
+ parser.add_argument(
+ "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+ )
+ parser.add_argument(
+ "--caption_column",
+ type=str,
+ default="text",
+ help="The column of the dataset containing a caption or a list of captions.",
+ )
+ parser.add_argument(
+ "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference."
+ )
+ parser.add_argument(
+ "--num_validation_images",
+ type=int,
+ default=4,
+ help="Number of images that should be generated during validation with `validation_prompt`.",
+ )
+ parser.add_argument(
+ "--validation_epochs",
+ type=int,
+ default=1,
+ help=(
+ "Run fine-tuning validation every X epochs. The validation process consists of running the prompt"
+ " `args.validation_prompt` multiple times: `args.num_validation_images`."
+ ),
+ )
+ parser.add_argument(
+ "--max_train_samples",
+ type=int,
+ default=None,
+ help=(
+ "For debugging purposes or quicker training, truncate the number of training examples to this "
+ "value if set."
+ ),
+ )
+ parser.add_argument(
+ "--output_dir",
+ type=str,
+ default="sd-model-finetuned-lora",
+ help="The output directory where the model predictions and checkpoints will be written.",
+ )
+ parser.add_argument(
+ "--cache_dir",
+ type=str,
+ default=None,
+ help="The directory where the downloaded models and datasets will be stored.",
+ )
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+ parser.add_argument(
+ "--resolution",
+ type=int,
+ default=512,
+ help=(
+ "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+ " resolution"
+ ),
+ )
+ parser.add_argument(
+ "--center_crop",
+ default=False,
+ action="store_true",
+ help=(
+ "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+ " cropped. The images will be resized to the resolution first before cropping."
+ ),
+ )
+ parser.add_argument(
+ "--random_flip",
+ action="store_true",
+ help="whether to randomly flip images horizontally",
+ )
+ parser.add_argument(
+ "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+ )
+ parser.add_argument("--num_train_epochs", type=int, default=100)
+ parser.add_argument(
+ "--max_train_steps",
+ type=int,
+ default=None,
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
+ )
+ parser.add_argument(
+ "--gradient_accumulation_steps",
+ type=int,
+ default=1,
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
+ )
+ parser.add_argument(
+ "--gradient_checkpointing",
+ action="store_true",
+ help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+ )
+ parser.add_argument(
+ "--learning_rate",
+ type=float,
+ default=1e-4,
+ help="Initial learning rate (after the potential warmup period) to use.",
+ )
+ parser.add_argument(
+ "--scale_lr",
+ action="store_true",
+ default=False,
+ help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+ )
+ parser.add_argument(
+ "--lr_scheduler",
+ type=str,
+ default="constant",
+ help=(
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+ ' "constant", "constant_with_warmup"]'
+ ),
+ )
+ parser.add_argument(
+ "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+ )
+ parser.add_argument(
+ "--snr_gamma",
+ type=float,
+ default=None,
+ help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
+ "More details here: https://arxiv.org/abs/2303.09556.",
+ )
+ parser.add_argument(
+ "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+ )
+ parser.add_argument(
+ "--allow_tf32",
+ action="store_true",
+ help=(
+ "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+ " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+ ),
+ )
+ parser.add_argument(
+ "--dataloader_num_workers",
+ type=int,
+ default=0,
+ help=(
+ "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
+ ),
+ )
+ parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+ parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+ parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+ parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+ parser.add_argument(
+ "--hub_model_id",
+ type=str,
+ default=None,
+ help="The name of the repository to keep in sync with the local `output_dir`.",
+ )
+ parser.add_argument(
+ "--logging_dir",
+ type=str,
+ default="logs",
+ help=(
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+ ),
+ )
+ parser.add_argument(
+ "--mixed_precision",
+ type=str,
+ default=None,
+ choices=["no", "fp16", "bf16"],
+ help=(
+ "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+ " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
+ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+ ),
+ )
+ parser.add_argument(
+ "--report_to",
+ type=str,
+ default="tensorboard",
+ help=(
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+ ),
+ )
+ parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+ parser.add_argument(
+ "--checkpointing_steps",
+ type=int,
+ default=500,
+ help=(
+ "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
+ " training using `--resume_from_checkpoint`."
+ ),
+ )
+ parser.add_argument(
+ "--checkpoints_total_limit",
+ type=int,
+ default=None,
+ help=(
+ "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+ " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+ " for more docs"
+ ),
+ )
+ parser.add_argument(
+ "--resume_from_checkpoint",
+ type=str,
+ default=None,
+ help=(
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+ ),
+ )
+ parser.add_argument(
+ "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+ )
+ parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
+
+ args = parser.parse_args()
+ env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+ if env_local_rank != -1 and env_local_rank != args.local_rank:
+ args.local_rank = env_local_rank
+
+ # Sanity checks
+ if args.dataset_name is None and args.train_data_dir is None:
+ raise ValueError("Need either a dataset name or a training folder.")
+
+ return args
+
+
+DATASET_NAME_MAPPING = {
+ "lambdalabs/pokemon-blip-captions": ("image", "text"),
+}
+
+
+def main():
+ args = parse_args()
+ logging_dir = os.path.join(args.output_dir, args.logging_dir)
+
+ accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit)
+
+ accelerator = Accelerator(
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
+ mixed_precision=args.mixed_precision,
+ log_with=args.report_to,
+ logging_dir=logging_dir,
+ project_config=accelerator_project_config,
+ )
+ if args.report_to == "wandb":
+ if not is_wandb_available():
+ raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+ import wandb
+
+ # Make one log on every process with the configuration for debugging.
+ logging.basicConfig(
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+ datefmt="%m/%d/%Y %H:%M:%S",
+ level=logging.INFO,
+ )
+ logger.info(accelerator.state, main_process_only=False)
+ if accelerator.is_local_main_process:
+ datasets.utils.logging.set_verbosity_warning()
+ transformers.utils.logging.set_verbosity_warning()
+ diffusers.utils.logging.set_verbosity_info()
+ else:
+ datasets.utils.logging.set_verbosity_error()
+ transformers.utils.logging.set_verbosity_error()
+ diffusers.utils.logging.set_verbosity_error()
+
+ # If passed along, set the training seed now.
+ if args.seed is not None:
+ set_seed(args.seed)
+
+ # Handle the repository creation
+ if accelerator.is_main_process:
+ if args.output_dir is not None:
+ os.makedirs(args.output_dir, exist_ok=True)
+
+ if args.push_to_hub:
+ repo_id = create_repo(
+ repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+ ).repo_id
+ # Load scheduler, tokenizer and models.
+ noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+ tokenizer = CLIPTokenizer.from_pretrained(
+ args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision
+ )
+ text_encoder = CLIPTextModel.from_pretrained(
+ args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+ )
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+ unet = UNet2DConditionModel.from_pretrained(
+ args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+ )
+ # freeze parameters of models to save more memory
+ unet.requires_grad_(False)
+ vae.requires_grad_(False)
+
+ text_encoder.requires_grad_(False)
+
+ # For mixed precision training we cast the text_encoder and vae weights to half-precision
+ # as these models are only used for inference, keeping weights in full precision is not required.
+ weight_dtype = torch.float32
+ if accelerator.mixed_precision == "fp16":
+ weight_dtype = torch.float16
+ elif accelerator.mixed_precision == "bf16":
+ weight_dtype = torch.bfloat16
+
+ # Move unet, vae and text_encoder to device and cast to weight_dtype
+ unet.to(accelerator.device, dtype=weight_dtype)
+ vae.to(accelerator.device, dtype=weight_dtype)
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+ # now we will add new LoRA weights to the attention layers
+ # It's important to realize here how many attention weights will be added and of which sizes
+ # The sizes of the attention layers consist only of two different variables:
+ # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`.
+ # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`.
+
+ # Let's first see how many attention processors we will have to set.
+ # For Stable Diffusion, it should be equal to:
+ # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12
+ # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2
+ # - up blocks (2x attention layers) * (3x transformer layers) * (3x down blocks) = 18
+ # => 32 layers
+
+ # Set correct lora layers
+ lora_attn_procs = {}
+ for name in unet.attn_processors.keys():
+ cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+ if name.startswith("mid_block"):
+ hidden_size = unet.config.block_out_channels[-1]
+ elif name.startswith("up_blocks"):
+ block_id = int(name[len("up_blocks.")])
+ hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+ elif name.startswith("down_blocks"):
+ block_id = int(name[len("down_blocks.")])
+ hidden_size = unet.config.block_out_channels[block_id]
+
+ lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+
+ unet.set_attn_processor(lora_attn_procs)
+
+ if args.enable_xformers_memory_efficient_attention:
+ if is_xformers_available():
+ import xformers
+
+ xformers_version = version.parse(xformers.__version__)
+ if xformers_version == version.parse("0.0.16"):
+ logger.warn(
+ "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+ )
+ unet.enable_xformers_memory_efficient_attention()
+ else:
+ raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+ def compute_snr(timesteps):
+ """
+ Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
+ """
+ alphas_cumprod = noise_scheduler.alphas_cumprod
+ sqrt_alphas_cumprod = alphas_cumprod**0.5
+ sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
+
+ # Expand the tensors.
+ # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
+ while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
+ alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
+
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
+ while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
+ sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
+
+ # Compute SNR.
+ snr = (alpha / sigma) ** 2
+ return snr
+
+ lora_layers = AttnProcsLayers(unet.attn_processors)
+
+ # Enable TF32 for faster training on Ampere GPUs,
+ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+ if args.allow_tf32:
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+ if args.scale_lr:
+ args.learning_rate = (
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+ )
+
+ # Initialize the optimizer
+ if args.use_8bit_adam:
+ try:
+ import bitsandbytes as bnb
+ except ImportError:
+ raise ImportError(
+ "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
+ )
+
+ optimizer_cls = bnb.optim.AdamW8bit
+ else:
+ optimizer_cls = torch.optim.AdamW
+
+ optimizer = optimizer_cls(
+ lora_layers.parameters(),
+ lr=args.learning_rate,
+ betas=(args.adam_beta1, args.adam_beta2),
+ weight_decay=args.adam_weight_decay,
+ eps=args.adam_epsilon,
+ )
+
+ # Get the datasets: you can either provide your own training and evaluation files (see below)
+ # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+ # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+ # download the dataset.
+ if args.dataset_name is not None:
+ # Downloading and loading a dataset from the hub.
+ dataset = load_dataset(
+ args.dataset_name,
+ args.dataset_config_name,
+ cache_dir=args.cache_dir,
+ )
+ else:
+ data_files = {}
+ if args.train_data_dir is not None:
+ data_files["train"] = os.path.join(args.train_data_dir, "**")
+ dataset = load_dataset(
+ "imagefolder",
+ data_files=data_files,
+ cache_dir=args.cache_dir,
+ )
+ # See more about loading custom images at
+ # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+ # Preprocessing the datasets.
+ # We need to tokenize inputs and targets.
+ column_names = dataset["train"].column_names
+
+ # 6. Get the column names for input/target.
+ dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None)
+ if args.image_column is None:
+ image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+ else:
+ image_column = args.image_column
+ if image_column not in column_names:
+ raise ValueError(
+ f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+ )
+ if args.caption_column is None:
+ caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+ else:
+ caption_column = args.caption_column
+ if caption_column not in column_names:
+ raise ValueError(
+ f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+ )
+
+ # Preprocessing the datasets.
+ # We need to tokenize input captions and transform the images.
+ def tokenize_captions(examples, is_train=True):
+ captions = []
+ for caption in examples[caption_column]:
+ if isinstance(caption, str):
+ captions.append(caption)
+ elif isinstance(caption, (list, np.ndarray)):
+ # take a random caption if there are multiple
+ captions.append(random.choice(caption) if is_train else caption[0])
+ else:
+ raise ValueError(
+ f"Caption column `{caption_column}` should contain either strings or lists of strings."
+ )
+ inputs = tokenizer(
+ captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+ )
+ return inputs.input_ids
+
+ # Preprocessing the datasets.
+ train_transforms = transforms.Compose(
+ [
+ transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+ transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+ transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+ transforms.ToTensor(),
+ transforms.Normalize([0.5], [0.5]),
+ ]
+ )
+
+    def preprocess_train(examples):
+        """Apply image transforms and caption tokenization to a training batch.
+
+        Adds ``pixel_values`` (transformed image tensors) and ``input_ids``
+        (tokenized captions) to the batch dict and returns it.
+        """
+        # Force 3-channel RGB so grayscale/RGBA inputs pass through the
+        # torchvision transform pipeline uniformly.
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["input_ids"] = tokenize_captions(examples)
+        return examples
+
+ with accelerator.main_process_first():
+ if args.max_train_samples is not None:
+ dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+ # Set the training transforms
+ train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        """Stack per-example tensors into batched ``pixel_values``/``input_ids``."""
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        # Ensure a contiguous float32 memory layout before feeding the model.
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+        return {"pixel_values": pixel_values, "input_ids": input_ids}
+
+ # DataLoaders creation:
+ train_dataloader = torch.utils.data.DataLoader(
+ train_dataset,
+ shuffle=True,
+ collate_fn=collate_fn,
+ batch_size=args.train_batch_size,
+ num_workers=args.dataloader_num_workers,
+ )
+
+ # Scheduler and math around the number of training steps.
+ overrode_max_train_steps = False
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+ if args.max_train_steps is None:
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+ overrode_max_train_steps = True
+
+ lr_scheduler = get_scheduler(
+ args.lr_scheduler,
+ optimizer=optimizer,
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+ )
+
+ # Prepare everything with our `accelerator`.
+ lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+ lora_layers, optimizer, train_dataloader, lr_scheduler
+ )
+
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+ if overrode_max_train_steps:
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+ # Afterwards we recalculate our number of training epochs
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+ # We need to initialize the trackers we use, and also store our configuration.
+ # The trackers initializes automatically on the main process.
+ if accelerator.is_main_process:
+ accelerator.init_trackers("text2image-fine-tune", config=vars(args))
+
+ # Train!
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+ logger.info("***** Running training *****")
+ logger.info(f" Num examples = {len(train_dataset)}")
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
+ global_step = 0
+ first_epoch = 0
+
+ # Potentially load in the weights and states from a previous save
+ if args.resume_from_checkpoint:
+ if args.resume_from_checkpoint != "latest":
+ path = os.path.basename(args.resume_from_checkpoint)
+ else:
+ # Get the most recent checkpoint
+ dirs = os.listdir(args.output_dir)
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+ path = dirs[-1] if len(dirs) > 0 else None
+
+ if path is None:
+ accelerator.print(
+ f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+ )
+ args.resume_from_checkpoint = None
+ else:
+ accelerator.print(f"Resuming from checkpoint {path}")
+ accelerator.load_state(os.path.join(args.output_dir, path))
+ global_step = int(path.split("-")[1])
+
+ resume_global_step = global_step * args.gradient_accumulation_steps
+ first_epoch = global_step // num_update_steps_per_epoch
+ resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+ # Only show the progress bar once on each machine.
+ progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+ progress_bar.set_description("Steps")
+
+ for epoch in range(first_epoch, args.num_train_epochs):
+ unet.train()
+ train_loss = 0.0
+ for step, batch in enumerate(train_dataloader):
+ # Skip steps until we reach the resumed step
+ if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+ if step % args.gradient_accumulation_steps == 0:
+ progress_bar.update(1)
+ continue
+
+ with accelerator.accumulate(unet):
+ # Convert images to latent space
+ latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+ latents = latents * vae.config.scaling_factor
+
+ # Sample noise that we'll add to the latents
+ noise = torch.randn_like(latents)
+ if args.noise_offset:
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
+ noise += args.noise_offset * torch.randn(
+ (latents.shape[0], latents.shape[1], 1, 1), device=latents.device
+ )
+
+ bsz = latents.shape[0]
+ # Sample a random timestep for each image
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+ timesteps = timesteps.long()
+
+ # Add noise to the latents according to the noise magnitude at each timestep
+ # (this is the forward diffusion process)
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+ # Get the text embedding for conditioning
+ encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+ # Get the target for loss depending on the prediction type
+ if noise_scheduler.config.prediction_type == "epsilon":
+ target = noise
+ elif noise_scheduler.config.prediction_type == "v_prediction":
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
+ else:
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+ # Predict the noise residual and compute loss
+ model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+ if args.snr_gamma is None:
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+ else:
+ # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+ # Since we predict the noise instead of x_0, the original formulation is slightly changed.
+ # This is discussed in Section 4.2 of the same paper.
+ snr = compute_snr(timesteps)
+ mse_loss_weights = (
+ torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
+ )
+ # We first calculate the original loss. Then we mean over the non-batch dimensions and
+ # rebalance the sample-wise losses with their respective loss weights.
+ # Finally, we take the mean of the rebalanced loss.
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+ loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
+ loss = loss.mean()
+
+ # Gather the losses across all processes for logging (if we use distributed training).
+ avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
+ train_loss += avg_loss.item() / args.gradient_accumulation_steps
+
+ # Backpropagate
+ accelerator.backward(loss)
+ if accelerator.sync_gradients:
+ params_to_clip = lora_layers.parameters()
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+ optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+
+ # Checks if the accelerator has performed an optimization step behind the scenes
+ if accelerator.sync_gradients:
+ progress_bar.update(1)
+ global_step += 1
+ accelerator.log({"train_loss": train_loss}, step=global_step)
+ train_loss = 0.0
+
+ if global_step % args.checkpointing_steps == 0:
+ if accelerator.is_main_process:
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+ accelerator.save_state(save_path)
+ logger.info(f"Saved state to {save_path}")
+
+ logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+ progress_bar.set_postfix(**logs)
+
+ if global_step >= args.max_train_steps:
+ break
+
+ if accelerator.is_main_process:
+ if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+ logger.info(
+ f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+ f" {args.validation_prompt}."
+ )
+ # create pipeline
+ pipeline = DiffusionPipeline.from_pretrained(
+ args.pretrained_model_name_or_path,
+ unet=accelerator.unwrap_model(unet),
+ revision=args.revision,
+ torch_dtype=weight_dtype,
+ )
+ pipeline = pipeline.to(accelerator.device)
+ pipeline.set_progress_bar_config(disable=True)
+
+ # run inference
+ generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+ images = []
+ for _ in range(args.num_validation_images):
+ images.append(
+ pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0]
+ )
+
+ for tracker in accelerator.trackers:
+ if tracker.name == "tensorboard":
+ np_images = np.stack([np.asarray(img) for img in images])
+ tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+ if tracker.name == "wandb":
+ tracker.log(
+ {
+ "validation": [
+ wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+ for i, image in enumerate(images)
+ ]
+ }
+ )
+
+ del pipeline
+ torch.cuda.empty_cache()
+
+ # Save the lora layers
+ accelerator.wait_for_everyone()
+ if accelerator.is_main_process:
+ unet = unet.to(torch.float32)
+ unet.save_attn_procs(args.output_dir)
+
+ if args.push_to_hub:
+ save_model_card(
+ repo_id,
+ images=images,
+ base_model=args.pretrained_model_name_or_path,
+ dataset_name=args.dataset_name,
+ repo_folder=args.output_dir,
+ )
+ upload_folder(
+ repo_id=repo_id,
+ folder_path=args.output_dir,
+ commit_message="End of training",
+ ignore_patterns=["step_*", "epoch_*"],
+ )
+
+ # Final inference
+ # Load previous pipeline
+ pipeline = DiffusionPipeline.from_pretrained(
+ args.pretrained_model_name_or_path, revision=args.revision, torch_dtype=weight_dtype
+ )
+ pipeline = pipeline.to(accelerator.device)
+
+ # load attention processors
+ pipeline.unet.load_attn_procs(args.output_dir)
+
+ # run inference
+ generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+ images = []
+ for _ in range(args.num_validation_images):
+ images.append(pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0])
+
+ if accelerator.is_main_process:
+ for tracker in accelerator.trackers:
+ if tracker.name == "tensorboard":
+ np_images = np.stack([np.asarray(img) for img in images])
+ tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
+ if tracker.name == "wandb":
+ tracker.log(
+ {
+ "test": [
+ wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+ for i, image in enumerate(images)
+ ]
+ }
+ )
+
+ accelerator.end_training()
+
+
+# Script entry point: run the LoRA fine-tuning routine defined above.
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/models/arcface/1/model.plan b/models/arcface/1/model.plan
new file mode 100644
index 0000000000000000000000000000000000000000..0f76c4e878cb0742b632534205127dadab77c9a6
--- /dev/null
+++ b/models/arcface/1/model.plan
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8872cc199e8bf592bf08ed0f326a2671c30e6a0bf1deae202eb1e354cbe25b6f
+size 396466556
diff --git a/models/arcface/1/resnet100.onnx b/models/arcface/1/resnet100.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..9c111d09a423e703970fe774fe95207fa427d62f
--- /dev/null
+++ b/models/arcface/1/resnet100.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3a6bc281e72f88862f5748b53be3d76b3b48f8f1ab1f4a537941bdc4e1b01da
+size 261036388
diff --git a/models/arcface/config.pbtxt b/models/arcface/config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0de29673d0362fde36cac260509579964b086116
--- /dev/null
+++ b/models/arcface/config.pbtxt
@@ -0,0 +1,19 @@
+# Triton model configuration for the ArcFace embedding model served as a
+# TensorRT engine (models/arcface/1/model.plan).
+# NOTE(review): max_batch_size is not set, so it defaults to 0 — in that mode
+# the dims below must describe the full tensor shape including the leading
+# batch dimension. Confirm the engine was built with a fixed batch dim of 1.
+name: "arcface"
+platform: "tensorrt_plan"
+
+input [
+  {
+    name: "data"
+    data_type: TYPE_FP32
+    # NCHW: 1 x 3 x 112 x 112 — presumably an aligned face crop; verify the
+    # client-side preprocessing matches.
+    dims: [ 1, 3, 112, 112 ]
+
+  }
+]
+
+output [
+  {
+    name: "fc1"
+    data_type: TYPE_FP32
+    # 512-dimensional face embedding vector.
+    dims: [ 1, 512 ]
+  }
+]
\ No newline at end of file
diff --git a/pytorch_lora_weights.bin b/pytorch_lora_weights.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b7cd99536ae30f33147e4bc7232e28f62bd4f4f8
--- /dev/null
+++ b/pytorch_lora_weights.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:484aad42d5b93acbb227cf8cd63022a32f89f34ac1c44ed4873dac0a0be6f6a4
+size 3287771
diff --git a/test1.py b/test1.py
new file mode 100644
index 0000000000000000000000000000000000000000..869c82094f238a02ac934a8b5ff8718a43bedc40
--- /dev/null
+++ b/test1.py
@@ -0,0 +1,37 @@
+# Minimal PyTriton demo: wraps a torch.nn.Linear(2, 3) in a batched inference
+# callback and serves it through Triton under the model name "Linear".
+import torch
+
+# Toy model in eval mode, placed on CUDA — a GPU is required at import time.
+model = torch.nn.Linear(2, 3).to("cuda").eval()
+
+
+import numpy as np
+from pytriton.decorators import batch
+
+@batch
+def infer_fn(**inputs: np.ndarray):
+    """Batched inference callback: numpy batch in, list of numpy batches out."""
+    # Exactly one input tensor is expected; unpack it by value.
+    (input1_batch,) = inputs.values()
+    input1_batch_tensor = torch.from_numpy(input1_batch).to("cuda")
+    output1_batch_tensor = model(input1_batch_tensor) # Calling the Python model inference
+    # Move the result back to host memory as numpy for Triton.
+    output1_batch = output1_batch_tensor.cpu().detach().numpy()
+    return [output1_batch]
+
+
+from pytriton.model_config import ModelConfig, Tensor
+from pytriton.triton import Triton
+
+# Connecting inference callback with Triton Inference Server
+with Triton() as triton:
+    # Load model into Triton Inference Server
+    triton.bind(
+        model_name="Linear",
+        infer_func=infer_fn,
+        inputs=[
+            Tensor(dtype=np.float32, shape=(-1,)),
+        ],
+        outputs=[
+            Tensor(dtype=np.float32, shape=(-1,)),
+        ],
+        config=ModelConfig(max_batch_size=128)
+    )
+    # Blocks here, serving HTTP/gRPC inference requests until interrupted.
+    triton.serve()
+
+
diff --git a/test2.py b/test2.py
new file mode 100644
index 0000000000000000000000000000000000000000..381598f8143ba05a0b14a60070899fc2edce4582
--- /dev/null
+++ b/test2.py
@@ -0,0 +1,16 @@
+# Skeleton PyTriton script.
+# NOTE(review): the bare `triton` expression below is a no-op, and no model is
+# ever bound before serve() — the server starts with nothing to serve. Confirm
+# whether a triton.bind(...) call (as in test1.py) is missing here.
+import torch
+
+
+import numpy as np
+
+
+from pytriton.model_config import ModelConfig, Tensor
+from pytriton.triton import Triton
+
+# Connecting inference callback with Triton Inference Server
+with Triton() as triton:
+    # Load model into Triton Inference Server
+    triton
+    triton.serve()
+
+
diff --git a/train_lora_1.sh b/train_lora_1.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cae2def35dceb7a1f6095f7b53f2e7abd9462514
--- /dev/null
+++ b/train_lora_1.sh
@@ -0,0 +1,21 @@
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
+export OUTPUT_DIR="/home/long.qul/tritontest"
+export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+
+accelerate launch --mixed_precision="fp16" lora_test_1.py \
+ --pretrained_model_name_or_path=$MODEL_NAME \
+ --dataset_name=$DATASET_NAME \
+ --dataloader_num_workers=8 \
+ --resolution=512 --center_crop --random_flip \
+ --train_batch_size=1 \
+ --gradient_accumulation_steps=4 \
+ --max_train_steps=15000 \
+ --learning_rate=1e-04 \
+ --max_grad_norm=1 \
+ --lr_scheduler="cosine" --lr_warmup_steps=0 \
+ --output_dir=${OUTPUT_DIR} \
+ --push_to_hub \
+ --hub_model_id=${HUB_MODEL_ID} \
+ --checkpointing_steps=500 \
+ --validation_prompt="A pokemon with blue eyes." \
+ --seed=1337
\ No newline at end of file
diff --git a/wandb/debug-cli.long.qul.log b/wandb/debug-cli.long.qul.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..0a351171cb6ba694a8691f15334a522bae8eeacf
--- /dev/null
+++ b/wandb/debug-internal.log
@@ -0,0 +1,181 @@
+2023-05-25 17:53:52,728 INFO StreamThr :2933662 [internal.py:wandb_internal():86] W&B internal server running at pid: 2933662, started at: 2023-05-25 17:53:52.727782
+2023-05-25 17:53:52,729 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: status
+2023-05-25 17:53:52,730 INFO WriterThread:2933662 [datastore.py:open_for_write():85] open: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/run-jw7zshqk.wandb
+2023-05-25 17:53:52,731 DEBUG SenderThread:2933662 [sender.py:send():375] send: header
+2023-05-25 17:53:52,731 DEBUG SenderThread:2933662 [sender.py:send():375] send: run
+2023-05-25 17:53:53,258 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: check_version
+2023-05-25 17:53:53,259 INFO SenderThread:2933662 [dir_watcher.py:__init__():219] watching files in: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files
+2023-05-25 17:53:53,259 INFO SenderThread:2933662 [sender.py:_start_run_threads():1124] run started: jw7zshqk with start time 1685008432.727397
+2023-05-25 17:53:53,259 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: summary_record
+2023-05-25 17:53:53,259 INFO SenderThread:2933662 [sender.py:_save_file():1378] saving file wandb-summary.json with policy end
+2023-05-25 17:53:53,259 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: check_version
+2023-05-25 17:53:53,282 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: run_start
+2023-05-25 17:53:53,284 DEBUG HandlerThread:2933662 [system_info.py:__init__():31] System info init
+2023-05-25 17:53:53,284 DEBUG HandlerThread:2933662 [system_info.py:__init__():46] System info init done
+2023-05-25 17:53:53,284 INFO HandlerThread:2933662 [system_monitor.py:start():181] Starting system monitor
+2023-05-25 17:53:53,284 INFO SystemMonitor:2933662 [system_monitor.py:_start():145] Starting system asset monitoring threads
+2023-05-25 17:53:53,284 INFO HandlerThread:2933662 [system_monitor.py:probe():201] Collecting system info
+2023-05-25 17:53:53,285 INFO SystemMonitor:2933662 [interfaces.py:start():190] Started cpu monitoring
+2023-05-25 17:53:53,285 INFO SystemMonitor:2933662 [interfaces.py:start():190] Started disk monitoring
+2023-05-25 17:53:53,285 INFO SystemMonitor:2933662 [interfaces.py:start():190] Started gpu monitoring
+2023-05-25 17:53:53,286 INFO SystemMonitor:2933662 [interfaces.py:start():190] Started memory monitoring
+2023-05-25 17:53:53,286 INFO SystemMonitor:2933662 [interfaces.py:start():190] Started network monitoring
+2023-05-25 17:53:53,292 DEBUG HandlerThread:2933662 [system_info.py:probe():195] Probing system
+2023-05-25 17:53:53,295 DEBUG HandlerThread:2933662 [git.py:repo():40] git repository is invalid
+2023-05-25 17:53:53,295 DEBUG HandlerThread:2933662 [system_info.py:probe():240] Probing system done
+2023-05-25 17:53:53,295 DEBUG HandlerThread:2933662 [system_monitor.py:probe():210] {'os': 'Linux-5.10.134-13.1.al8.x86_64-x86_64-with-glibc2.17', 'python': '3.8.16', 'heartbeatAt': '2023-05-25T09:53:53.292165', 'startedAt': '2023-05-25T09:53:52.722001', 'docker': None, 'cuda': None, 'args': ('--pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5', '--dataset_name=lambdalabs/pokemon-blip-captions', '--dataloader_num_workers=8', '--resolution=512', '--center_crop', '--random_flip', '--train_batch_size=1', '--gradient_accumulation_steps=4', '--max_train_steps=15000', '--learning_rate=1e-04', '--max_grad_norm=1', '--lr_scheduler=cosine', '--lr_warmup_steps=0', '--output_dir=/home/long.qul/tritontest', '--push_to_hub', '--hub_model_id=', '--report_to=wandb', '--checkpointing_steps=500', '--validation_prompt=A pokemon with blue eyes.', '--seed=1337'), 'state': 'running', 'program': 'lora_test_1.py', 'codePath': 'lora_test_1.py', 'host': 'iZt4n6er62uu4xnw6wibnhZ', 'username': 'long.qul', 'executable': '/home/long.qul/miniconda3/envs/triton/bin/python', 'cpu_count': 28, 'cpu_count_logical': 56, 'cpu_freq': {'current': 2899.998000000001, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 
0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}], 'disk': {'total': 1968.4237327575684, 'used': 256.4756889343262}, 'gpu': 'NVIDIA A10', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A10', 'memory_total': 
23836098560}], 'memory': {'total': 339.9116630554199}}
+2023-05-25 17:53:53,295 INFO HandlerThread:2933662 [system_monitor.py:probe():211] Finished collecting system info
+2023-05-25 17:53:53,295 INFO HandlerThread:2933662 [system_monitor.py:probe():214] Publishing system info
+2023-05-25 17:53:53,295 DEBUG HandlerThread:2933662 [system_info.py:_save_pip():51] Saving list of pip packages installed into the current environment
+2023-05-25 17:53:53,295 DEBUG HandlerThread:2933662 [system_info.py:_save_pip():67] Saving pip packages done
+2023-05-25 17:53:53,296 DEBUG HandlerThread:2933662 [system_info.py:_save_conda():74] Saving list of conda packages installed into the current environment
+2023-05-25 17:53:54,260 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/conda-environment.yaml
+2023-05-25 17:53:54,260 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/requirements.txt
+2023-05-25 17:53:54,260 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/wandb-summary.json
+2023-05-25 17:53:55,363 DEBUG HandlerThread:2933662 [system_info.py:_save_conda():86] Saving conda packages done
+2023-05-25 17:53:55,364 INFO HandlerThread:2933662 [system_monitor.py:probe():216] Finished publishing system info
+2023-05-25 17:53:55,368 DEBUG SenderThread:2933662 [sender.py:send():375] send: files
+2023-05-25 17:53:55,368 INFO SenderThread:2933662 [sender.py:_save_file():1378] saving file wandb-metadata.json with policy now
+2023-05-25 17:53:55,371 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: stop_status
+2023-05-25 17:53:55,371 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: stop_status
+2023-05-25 17:53:55,619 DEBUG SenderThread:2933662 [sender.py:send():375] send: telemetry
+2023-05-25 17:53:55,619 DEBUG SenderThread:2933662 [sender.py:send():375] send: config
+2023-05-25 17:53:55,646 DEBUG SenderThread:2933662 [sender.py:send():375] send: exit
+2023-05-25 17:53:55,646 INFO SenderThread:2933662 [sender.py:send_exit():598] handling exit code: 1
+2023-05-25 17:53:55,646 INFO SenderThread:2933662 [sender.py:send_exit():600] handling runtime: 2
+2023-05-25 17:53:55,646 INFO SenderThread:2933662 [sender.py:_save_file():1378] saving file wandb-summary.json with policy end
+2023-05-25 17:53:55,647 INFO SenderThread:2933662 [sender.py:send_exit():606] send defer
+2023-05-25 17:53:55,647 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,647 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 0
+2023-05-25 17:53:55,647 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,647 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 0
+2023-05-25 17:53:55,647 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 1
+2023-05-25 17:53:55,647 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,647 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 1
+2023-05-25 17:53:55,647 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,647 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 1
+2023-05-25 17:53:55,647 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 2
+2023-05-25 17:53:55,647 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,647 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 2
+2023-05-25 17:53:55,647 INFO HandlerThread:2933662 [system_monitor.py:finish():190] Stopping system monitor
+2023-05-25 17:53:55,648 INFO HandlerThread:2933662 [interfaces.py:finish():202] Joined cpu monitor
+2023-05-25 17:53:55,648 DEBUG SystemMonitor:2933662 [system_monitor.py:_start():159] Starting system metrics aggregation loop
+2023-05-25 17:53:55,648 INFO HandlerThread:2933662 [interfaces.py:finish():202] Joined disk monitor
+2023-05-25 17:53:55,648 DEBUG SystemMonitor:2933662 [system_monitor.py:_start():166] Finished system metrics aggregation loop
+2023-05-25 17:53:55,648 DEBUG SystemMonitor:2933662 [system_monitor.py:_start():170] Publishing last batch of metrics
+2023-05-25 17:53:55,679 INFO HandlerThread:2933662 [interfaces.py:finish():202] Joined gpu monitor
+2023-05-25 17:53:55,679 INFO HandlerThread:2933662 [interfaces.py:finish():202] Joined memory monitor
+2023-05-25 17:53:55,679 INFO HandlerThread:2933662 [interfaces.py:finish():202] Joined network monitor
+2023-05-25 17:53:55,679 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,679 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 2
+2023-05-25 17:53:55,679 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 3
+2023-05-25 17:53:55,679 DEBUG SenderThread:2933662 [sender.py:send():375] send: stats
+2023-05-25 17:53:55,679 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,680 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 3
+2023-05-25 17:53:55,680 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 3
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 4
+2023-05-25 17:53:55,680 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,680 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 4
+2023-05-25 17:53:55,680 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 4
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 5
+2023-05-25 17:53:55,680 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,680 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 5
+2023-05-25 17:53:55,680 DEBUG SenderThread:2933662 [sender.py:send():375] send: summary
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:_save_file():1378] saving file wandb-summary.json with policy end
+2023-05-25 17:53:55,680 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 5
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 6
+2023-05-25 17:53:55,681 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,681 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 6
+2023-05-25 17:53:55,681 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,681 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 6
+2023-05-25 17:53:55,683 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: status_report
+2023-05-25 17:53:55,984 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 7
+2023-05-25 17:53:55,984 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,984 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 7
+2023-05-25 17:53:55,984 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,984 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 7
+2023-05-25 17:53:55,988 INFO wandb-upload_0:2933662 [upload_job.py:push():137] Uploaded file /tmp/tmpfd18htn5wandb/qjhthzap-wandb-metadata.json
+2023-05-25 17:53:56,261 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/conda-environment.yaml
+2023-05-25 17:53:56,261 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/config.yaml
+2023-05-25 17:53:56,261 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/wandb-summary.json
+2023-05-25 17:53:56,261 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/wandb-metadata.json
+2023-05-25 17:53:56,261 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/output.log
+2023-05-25 17:53:56,646 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:53:57,626 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 8
+2023-05-25 17:53:57,626 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:53:57,626 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:57,627 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 8
+2023-05-25 17:53:57,627 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:57,627 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 8
+2023-05-25 17:53:57,627 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 9
+2023-05-25 17:53:57,627 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:57,627 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 9
+2023-05-25 17:53:57,627 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:57,627 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 9
+2023-05-25 17:53:57,627 INFO SenderThread:2933662 [dir_watcher.py:finish():365] shutting down directory watcher
+2023-05-25 17:53:57,647 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:53:58,261 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/output.log
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():395] scan: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/wandb-summary.json wandb-summary.json
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/requirements.txt requirements.txt
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/conda-environment.yaml conda-environment.yaml
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/output.log output.log
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/wandb-metadata.json wandb-metadata.json
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/config.yaml config.yaml
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 10
+2023-05-25 17:53:58,262 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:53:58,266 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:58,266 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 10
+2023-05-25 17:53:58,268 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:58,269 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 10
+2023-05-25 17:53:58,269 INFO SenderThread:2933662 [file_pusher.py:finish():167] shutting down file pusher
+2023-05-25 17:53:58,648 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:53:58,648 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:53:58,877 INFO wandb-upload_4:2933662 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/config.yaml
+2023-05-25 17:53:58,890 INFO wandb-upload_3:2933662 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/output.log
+2023-05-25 17:53:58,948 INFO wandb-upload_0:2933662 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/wandb-summary.json
+2023-05-25 17:53:58,970 INFO wandb-upload_2:2933662 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/conda-environment.yaml
+2023-05-25 17:53:59,034 INFO wandb-upload_1:2933662 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/requirements.txt
+2023-05-25 17:53:59,234 INFO Thread-11 :2933662 [sender.py:transition_state():626] send defer: 11
+2023-05-25 17:53:59,234 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:59,235 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 11
+2023-05-25 17:53:59,235 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:59,235 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 11
+2023-05-25 17:53:59,235 INFO SenderThread:2933662 [file_pusher.py:join():172] waiting for file pusher
+2023-05-25 17:53:59,235 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 12
+2023-05-25 17:53:59,235 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:59,235 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 12
+2023-05-25 17:53:59,235 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:59,235 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 12
+2023-05-25 17:53:59,488 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 13
+2023-05-25 17:53:59,488 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:59,488 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 13
+2023-05-25 17:53:59,488 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:59,488 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 13
+2023-05-25 17:53:59,489 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 14
+2023-05-25 17:53:59,489 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:59,489 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 14
+2023-05-25 17:53:59,489 DEBUG SenderThread:2933662 [sender.py:send():375] send: final
+2023-05-25 17:53:59,489 DEBUG SenderThread:2933662 [sender.py:send():375] send: footer
+2023-05-25 17:53:59,489 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:59,489 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 14
+2023-05-25 17:53:59,489 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:53:59,489 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:53:59,490 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:53:59,490 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:53:59,490 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: server_info
+2023-05-25 17:53:59,490 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: server_info
+2023-05-25 17:53:59,491 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: get_summary
+2023-05-25 17:53:59,492 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: sampled_history
+2023-05-25 17:53:59,969 INFO MainThread:2933662 [wandb_run.py:_footer_history_summary_info():3469] rendering history
+2023-05-25 17:53:59,969 INFO MainThread:2933662 [wandb_run.py:_footer_history_summary_info():3501] rendering summary
+2023-05-25 17:53:59,969 INFO MainThread:2933662 [wandb_run.py:_footer_sync_info():3428] logging synced files
+2023-05-25 17:53:59,969 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: shutdown
+2023-05-25 17:53:59,969 INFO HandlerThread:2933662 [handler.py:finish():842] shutting down handler
+2023-05-25 17:54:00,490 INFO WriterThread:2933662 [datastore.py:close():298] close: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/run-jw7zshqk.wandb
+2023-05-25 17:54:00,969 INFO SenderThread:2933662 [sender.py:finish():1550] shutting down sender
+2023-05-25 17:54:00,969 INFO SenderThread:2933662 [file_pusher.py:finish():167] shutting down file pusher
+2023-05-25 17:54:00,969 INFO SenderThread:2933662 [file_pusher.py:join():172] waiting for file pusher
diff --git a/wandb/debug.log b/wandb/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..d53cc5d615912e79756620ee1c42e4c76ab63a07
--- /dev/null
+++ b/wandb/debug.log
@@ -0,0 +1,28 @@
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Current SDK version is 0.15.3
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Configure stats pid to 2933449
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Loading settings from /home/long.qul/.config/wandb/settings
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Loading settings from /home/long.qul/tritontest/wandb/settings
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'lora_test_1.py', 'program': 'lora_test_1.py'}
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_init.py:_log_setup():507] Logging user logs to /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/logs/debug.log
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_init.py:_log_setup():508] Logging internal logs to /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/logs/debug-internal.log
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_init.py:init():547] calling init triggers
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_init.py:init():554] wandb.init called with sweep_config: {}
+config: {}
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_init.py:init():596] starting backend
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_init.py:init():600] setting up manager
+2023-05-25 17:53:52,725 INFO MainThread:2933449 [backend.py:_multiprocessing_setup():106] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2023-05-25 17:53:52,727 INFO MainThread:2933449 [wandb_init.py:init():606] backend started and connected
+2023-05-25 17:53:52,729 INFO MainThread:2933449 [wandb_init.py:init():700] updated telemetry
+2023-05-25 17:53:52,729 INFO MainThread:2933449 [wandb_init.py:init():737] communicating run to backend with 60.0 second timeout
+2023-05-25 17:53:53,257 INFO MainThread:2933449 [wandb_run.py:_on_init():2177] communicating current version
+2023-05-25 17:53:53,279 INFO MainThread:2933449 [wandb_run.py:_on_init():2186] got version response
+2023-05-25 17:53:53,279 INFO MainThread:2933449 [wandb_init.py:init():787] starting run threads in backend
+2023-05-25 17:53:55,371 INFO MainThread:2933449 [wandb_run.py:_console_start():2158] atexit reg
+2023-05-25 17:53:55,371 INFO MainThread:2933449 [wandb_run.py:_redirect():2013] redirect: SettingsConsole.WRAP_RAW
+2023-05-25 17:53:55,371 INFO MainThread:2933449 [wandb_run.py:_redirect():2078] Wrapping output streams.
+2023-05-25 17:53:55,371 INFO MainThread:2933449 [wandb_run.py:_redirect():2103] Redirects installed.
+2023-05-25 17:53:55,372 INFO MainThread:2933449 [wandb_init.py:init():829] run started, returning control to user process
+2023-05-25 17:53:55,374 INFO MainThread:2933449 [wandb_run.py:_config_callback():1286] config_cb None None {'pretrained_model_name_or_path': 'runwayml/stable-diffusion-v1-5', 'revision': None, 'dataset_name': 'lambdalabs/pokemon-blip-captions', 'dataset_config_name': None, 'train_data_dir': None, 'image_column': 'image', 'caption_column': 'text', 'validation_prompt': 'A pokemon with blue eyes.', 'num_validation_images': 4, 'validation_epochs': 1, 'max_train_samples': None, 'output_dir': '/home/long.qul/tritontest', 'cache_dir': None, 'seed': 1337, 'resolution': 512, 'center_crop': True, 'random_flip': True, 'train_batch_size': 1, 'num_train_epochs': 72, 'max_train_steps': 15000, 'gradient_accumulation_steps': 4, 'gradient_checkpointing': False, 'learning_rate': 0.0001, 'scale_lr': False, 'lr_scheduler': 'cosine', 'lr_warmup_steps': 0, 'snr_gamma': None, 'use_8bit_adam': False, 'allow_tf32': False, 'dataloader_num_workers': 8, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'push_to_hub': True, 'hub_token': None, 'hub_model_id': '', 'logging_dir': 'logs', 'mixed_precision': None, 'report_to': 'wandb', 'local_rank': -1, 'checkpointing_steps': 500, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'enable_xformers_memory_efficient_attention': False, 'noise_offset': 0}
+2023-05-25 17:54:01,060 WARNING MsgRouterThr:2933449 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20230525_174038-2hxthfpt/files/conda-environment.yaml b/wandb/run-20230525_174038-2hxthfpt/files/conda-environment.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..eaad08ef7cb35db28c95e3521ff1f3af6a1b767d
--- /dev/null
+++ b/wandb/run-20230525_174038-2hxthfpt/files/conda-environment.yaml
@@ -0,0 +1,147 @@
+name: triton
+channels:
+ - conda-forge
+ - defaults
+dependencies:
+ - _libgcc_mutex=0.1=main
+ - _openmp_mutex=5.1=1_gnu
+ - bzip2=1.0.8=h7f98852_4
+ - c-ares=1.19.0=h5eee18b_0
+ - ca-certificates=2023.5.7=hbcca054_0
+ - expat=2.2.10=h9c3ff4c_0
+ - keyutils=1.6.1=h166bdaf_0
+ - krb5=1.19.3=h3790be6_0
+ - ld_impl_linux-64=2.38=h1181459_1
+ - libcurl=7.87.0=h91b91d3_0
+ - libedit=3.1.20191231=he28a2e2_2
+ - libev=4.33=h516909a_1
+ - libffi=3.4.4=h6a678d5_0
+ - libgcc-ng=11.2.0=h1234567_1
+ - libgomp=11.2.0=h1234567_1
+ - libnghttp2=1.46.0=hce63b2e_0
+ - libssh2=1.10.0=ha56f1ee_2
+ - libstdcxx-ng=11.2.0=h1234567_1
+ - libuv=1.44.2=h5eee18b_0
+ - lz4-c=1.9.3=h9c3ff4c_1
+ - ncurses=6.4=h6a678d5_0
+ - openssl=1.1.1t=h7f8727e_0
+ - pip=23.0.1=py38h06a4308_0
+ - python=3.8.16=h7a1cb2a_3
+ - readline=8.2=h5eee18b_0
+ - rhash=1.4.1=h3c74f83_1
+ - setuptools=66.0.0=py38h06a4308_0
+ - sqlite=3.41.2=h5eee18b_0
+ - tk=8.6.12=h1ccaba5_0
+ - wheel=0.38.4=py38h06a4308_0
+ - xz=5.4.2=h5eee18b_0
+ - zlib=1.2.13=h5eee18b_0
+ - zstd=1.5.2=ha4553b6_0
+ - pip:
+ - absl-py==1.4.0
+ - accelerate==0.19.0
+ - aiohttp==3.8.4
+ - aiosignal==1.3.1
+ - appdirs==1.4.4
+ - async-timeout==4.0.2
+ - attrs==23.1.0
+ - brotli==1.0.9
+ - cachetools==5.3.0
+ - certifi==2023.5.7
+ - charset-normalizer==3.1.0
+ - click==8.1.3
+ - cmake==3.26.3
+ - datasets==2.12.0
+ - diffusers==0.17.0.dev0
+ - dill==0.3.6
+ - docker-pycreds==0.4.0
+ - filelock==3.12.0
+ - frozenlist==1.3.3
+ - fsspec==2023.5.0
+ - ftfy==6.1.1
+ - gevent==22.10.2
+ - geventhttpclient==2.0.2
+ - gitdb==4.0.10
+ - gitpython==3.1.31
+ - google-auth==2.18.1
+ - google-auth-oauthlib==1.0.0
+ - greenlet==2.0.2
+ - grpcio==1.55.0
+ - huggingface-hub==0.14.1
+ - idna==3.4
+ - importlib-metadata==6.6.0
+ - jinja2==3.1.2
+ - lit==16.0.5
+ - markdown==3.4.3
+ - markupsafe==2.1.2
+ - mpmath==1.3.0
+ - multidict==6.0.4
+ - multiprocess==0.70.14
+ - mypy-extensions==1.0.0
+ - networkx==3.1
+ - numpy==1.24.3
+ - nvidia-cublas-cu11==11.10.3.66
+ - nvidia-cublas-cu12==12.1.3.1
+ - nvidia-cuda-cupti-cu11==11.7.101
+ - nvidia-cuda-nvrtc-cu11==11.7.99
+ - nvidia-cuda-runtime-cu11==11.7.99
+ - nvidia-cuda-runtime-cu12==12.1.105
+ - nvidia-cudnn-cu11==8.5.0.96
+ - nvidia-cudnn-cu12==8.9.1.23
+ - nvidia-cufft-cu11==10.9.0.58
+ - nvidia-curand-cu11==10.2.10.91
+ - nvidia-cusolver-cu11==11.4.0.1
+ - nvidia-cusparse-cu11==11.7.4.91
+ - nvidia-nccl-cu11==2.14.3
+ - nvidia-nvtx-cu11==11.7.91
+ - nvidia-pytriton==0.1.5
+ - oauthlib==3.2.2
+ - packaging==23.1
+ - pandas==2.0.1
+ - pathtools==0.1.2
+ - pillow==9.5.0
+ - protobuf==3.20.3
+ - psutil==5.9.5
+ - pyarrow==12.0.0
+ - pyasn1==0.5.0
+ - pyasn1-modules==0.3.0
+ - python-dateutil==2.8.2
+ - python-rapidjson==1.10
+ - pytz==2023.3
+ - pyyaml==6.0
+ - pyzmq==23.2.1
+ - regex==2023.5.5
+ - requests==2.31.0
+ - requests-oauthlib==1.3.1
+ - responses==0.18.0
+ - rsa==4.9
+ - sentry-sdk==1.24.0
+ - setproctitle==1.3.2
+ - sh==1.14.3
+ - six==1.16.0
+ - smmap==5.0.0
+ - sympy==1.12
+ - tensorboard==2.13.0
+ - tensorboard-data-server==0.7.0
+ - tensorrt-bindings==8.6.1
+ - tensorrt-libs==8.6.1
+ - tokenizers==0.13.3
+ - torch==2.0.1
+ - torchvision==0.15.2
+ - tqdm==4.65.0
+ - transformers==4.29.2
+ - triton==2.0.0
+ - tritonclient==2.33.0
+ - typing-extensions==4.6.0
+ - typing-inspect==0.6.0
+ - tzdata==2023.3
+ - urllib3==1.26.16
+ - wandb==0.15.3
+ - wcwidth==0.2.6
+ - werkzeug==2.3.4
+ - wrapt==1.15.0
+ - xxhash==3.2.0
+ - yarl==1.9.2
+ - zipp==3.15.0
+ - zope-event==4.6
+ - zope-interface==6.0
+prefix: /home/long.qul/miniconda3/envs/triton
diff --git a/wandb/run-20230525_174038-2hxthfpt/files/config.yaml b/wandb/run-20230525_174038-2hxthfpt/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ddf64d834fcfa5b9acad068453e3739937133f90
--- /dev/null
+++ b/wandb/run-20230525_174038-2hxthfpt/files/config.yaml
@@ -0,0 +1,179 @@
+wandb_version: 1
+
+_wandb:
+ desc: null
+ value:
+ python_version: 3.8.16
+ cli_version: 0.15.3
+ framework: huggingface
+ huggingface_version: 4.29.2
+ is_jupyter_run: false
+ is_kaggle_kernel: false
+ start_time: 1685007638.403607
+ t:
+ 1:
+ - 1
+ - 11
+ - 41
+ - 49
+ - 51
+ - 55
+ - 71
+ - 83
+ 2:
+ - 1
+ - 11
+ - 41
+ - 49
+ - 51
+ - 55
+ - 71
+ - 83
+ 3:
+ - 23
+ 4: 3.8.16
+ 5: 0.15.3
+ 6: 4.29.2
+ 8:
+ - 5
+pretrained_model_name_or_path:
+ desc: null
+ value: runwayml/stable-diffusion-v1-5
+revision:
+ desc: null
+ value: null
+dataset_name:
+ desc: null
+ value: lambdalabs/pokemon-blip-captions
+dataset_config_name:
+ desc: null
+ value: null
+train_data_dir:
+ desc: null
+ value: null
+image_column:
+ desc: null
+ value: image
+caption_column:
+ desc: null
+ value: text
+validation_prompt:
+ desc: null
+ value: A pokemon with blue eyes.
+num_validation_images:
+ desc: null
+ value: 4
+validation_epochs:
+ desc: null
+ value: 1
+max_train_samples:
+ desc: null
+ value: null
+output_dir:
+ desc: null
+ value: /home/long.qul/tritontest
+cache_dir:
+ desc: null
+ value: null
+seed:
+ desc: null
+ value: 1337
+resolution:
+ desc: null
+ value: 512
+center_crop:
+ desc: null
+ value: true
+random_flip:
+ desc: null
+ value: true
+train_batch_size:
+ desc: null
+ value: 1
+num_train_epochs:
+ desc: null
+ value: 72
+max_train_steps:
+ desc: null
+ value: 15000
+gradient_accumulation_steps:
+ desc: null
+ value: 4
+gradient_checkpointing:
+ desc: null
+ value: false
+learning_rate:
+ desc: null
+ value: 0.0001
+scale_lr:
+ desc: null
+ value: false
+lr_scheduler:
+ desc: null
+ value: cosine
+lr_warmup_steps:
+ desc: null
+ value: 0
+snr_gamma:
+ desc: null
+ value: null
+use_8bit_adam:
+ desc: null
+ value: false
+allow_tf32:
+ desc: null
+ value: false
+dataloader_num_workers:
+ desc: null
+ value: 8
+adam_beta1:
+ desc: null
+ value: 0.9
+adam_beta2:
+ desc: null
+ value: 0.999
+adam_weight_decay:
+ desc: null
+ value: 0.01
+adam_epsilon:
+ desc: null
+ value: 1.0e-08
+max_grad_norm:
+ desc: null
+ value: 1.0
+push_to_hub:
+ desc: null
+ value: true
+hub_token:
+ desc: null
+ value: null
+hub_model_id:
+ desc: null
+ value: ''
+logging_dir:
+ desc: null
+ value: logs
+mixed_precision:
+ desc: null
+ value: null
+report_to:
+ desc: null
+ value: wandb
+local_rank:
+ desc: null
+ value: -1
+checkpointing_steps:
+ desc: null
+ value: 500
+checkpoints_total_limit:
+ desc: null
+ value: null
+resume_from_checkpoint:
+ desc: null
+ value: null
+enable_xformers_memory_efficient_attention:
+ desc: null
+ value: false
+noise_offset:
+ desc: null
+ value: 0
diff --git a/wandb/run-20230525_174038-2hxthfpt/files/output.log b/wandb/run-20230525_174038-2hxthfpt/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..331c8f9e4d6b1bc33a024a45dcc5928271b4139a
--- /dev/null
+++ b/wandb/run-20230525_174038-2hxthfpt/files/output.log
@@ -0,0 +1,27 @@
+05/25/2023 17:40:41 - INFO - __main__ - ***** Running training *****
+05/25/2023 17:40:41 - INFO - __main__ - Num examples = 833
+05/25/2023 17:40:41 - INFO - __main__ - Num Epochs = 72
+05/25/2023 17:40:41 - INFO - __main__ - Instantaneous batch size per device = 1
+05/25/2023 17:40:41 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 4
+05/25/2023 17:40:41 - INFO - __main__ - Gradient Accumulation steps = 4
+05/25/2023 17:40:41 - INFO - __main__ - Total optimization steps = 15000
+Steps: 0%| | 0/15000 [00:00, ?it/s]Traceback (most recent call last):
+ File "lora_test_1.py", line 908, in
+ main()
+ File "lora_test_1.py", line 728, in main
+ latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/diffusers/utils/accelerate_utils.py", line 46, in wrapper
+ return method(self, *args, **kwargs)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/diffusers/models/autoencoder_kl.py", line 164, in encode
+ h = self.encoder(x)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/diffusers/models/vae.py", line 109, in forward
+ sample = self.conv_in(sample)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 463, in forward
+ return self._conv_forward(input, self.weight, self.bias)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 459, in _conv_forward
+ return F.conv2d(input, weight, bias, self.stride,
+RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED
\ No newline at end of file
diff --git a/wandb/run-20230525_174038-2hxthfpt/files/requirements.txt b/wandb/run-20230525_174038-2hxthfpt/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..881b72a23b4f740f4282786da35a53f77f1d92cd
--- /dev/null
+++ b/wandb/run-20230525_174038-2hxthfpt/files/requirements.txt
@@ -0,0 +1,110 @@
+absl-py==1.4.0
+accelerate==0.19.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+appdirs==1.4.4
+async-timeout==4.0.2
+attrs==23.1.0
+brotli==1.0.9
+cachetools==5.3.0
+certifi==2023.5.7
+charset-normalizer==3.1.0
+click==8.1.3
+cmake==3.26.3
+datasets==2.12.0
+diffusers==0.17.0.dev0
+dill==0.3.6
+docker-pycreds==0.4.0
+filelock==3.12.0
+frozenlist==1.3.3
+fsspec==2023.5.0
+ftfy==6.1.1
+gevent==22.10.2
+geventhttpclient==2.0.2
+gitdb==4.0.10
+gitpython==3.1.31
+google-auth-oauthlib==1.0.0
+google-auth==2.18.1
+greenlet==2.0.2
+grpcio==1.55.0
+huggingface-hub==0.14.1
+idna==3.4
+importlib-metadata==6.6.0
+jinja2==3.1.2
+lit==16.0.5
+markdown==3.4.3
+markupsafe==2.1.2
+mpmath==1.3.0
+multidict==6.0.4
+multiprocess==0.70.14
+mypy-extensions==1.0.0
+networkx==3.1
+numpy==1.24.3
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu11==8.5.0.96
+nvidia-cudnn-cu12==8.9.1.23
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.2.10.91
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cusparse-cu11==11.7.4.91
+nvidia-nccl-cu11==2.14.3
+nvidia-nvtx-cu11==11.7.91
+nvidia-pytriton==0.1.5
+oauthlib==3.2.2
+packaging==23.1
+pandas==2.0.1
+pathtools==0.1.2
+pillow==9.5.0
+pip==23.0.1
+protobuf==3.20.3
+psutil==5.9.5
+pyarrow==12.0.0
+pyasn1-modules==0.3.0
+pyasn1==0.5.0
+python-dateutil==2.8.2
+python-rapidjson==1.10
+pytz==2023.3
+pyyaml==6.0
+pyzmq==23.2.1
+regex==2023.5.5
+requests-oauthlib==1.3.1
+requests==2.31.0
+responses==0.18.0
+rsa==4.9
+sentry-sdk==1.24.0
+setproctitle==1.3.2
+setuptools==66.0.0
+sh==1.14.3
+six==1.16.0
+smmap==5.0.0
+sympy==1.12
+tensorboard-data-server==0.7.0
+tensorboard==2.13.0
+tensorrt-bindings==8.6.1
+tensorrt-libs==8.6.1
+tokenizers==0.13.3
+torch==2.0.1
+torchvision==0.15.2
+tqdm==4.65.0
+transformers==4.29.2
+triton==2.0.0
+tritonclient==2.33.0
+typing-extensions==4.6.0
+typing-inspect==0.6.0
+tzdata==2023.3
+urllib3==1.26.16
+wandb==0.15.3
+wcwidth==0.2.6
+werkzeug==2.3.4
+wheel==0.38.4
+wrapt==1.15.0
+xxhash==3.2.0
+yarl==1.9.2
+zipp==3.15.0
+zope.event==4.6
+zope.interface==6.0
\ No newline at end of file
diff --git a/wandb/run-20230525_174038-2hxthfpt/files/wandb-metadata.json b/wandb/run-20230525_174038-2hxthfpt/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..605eeba639c5e3ec029d601d54d50bfc2d32bf0a
--- /dev/null
+++ b/wandb/run-20230525_174038-2hxthfpt/files/wandb-metadata.json
@@ -0,0 +1,340 @@
+{
+ "os": "Linux-5.10.134-13.1.al8.x86_64-x86_64-with-glibc2.17",
+ "python": "3.8.16",
+ "heartbeatAt": "2023-05-25T09:40:39.072595",
+ "startedAt": "2023-05-25T09:40:38.398394",
+ "docker": null,
+ "cuda": null,
+ "args": [
+ "--pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5",
+ "--dataset_name=lambdalabs/pokemon-blip-captions",
+ "--dataloader_num_workers=8",
+ "--resolution=512",
+ "--center_crop",
+ "--random_flip",
+ "--train_batch_size=1",
+ "--gradient_accumulation_steps=4",
+ "--max_train_steps=15000",
+ "--learning_rate=1e-04",
+ "--max_grad_norm=1",
+ "--lr_scheduler=cosine",
+ "--lr_warmup_steps=0",
+ "--output_dir=/home/long.qul/tritontest",
+ "--push_to_hub",
+ "--hub_model_id=",
+ "--report_to=wandb",
+ "--checkpointing_steps=500",
+ "--validation_prompt=A pokemon with blue eyes.",
+ "--seed=1337"
+ ],
+ "state": "running",
+ "program": "lora_test_1.py",
+ "codePath": "lora_test_1.py",
+ "host": "iZt4n6er62uu4xnw6wibnhZ",
+ "username": "long.qul",
+ "executable": "/home/long.qul/miniconda3/envs/triton/bin/python",
+ "cpu_count": 28,
+ "cpu_count_logical": 56,
+ "cpu_freq": {
+ "current": 2899.998000000001,
+ "min": 0.0,
+ "max": 0.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ }
+ ],
+ "disk": {
+ "total": 1968.4237327575684,
+ "used": 256.4753112792969
+ },
+ "gpu": "NVIDIA A10",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA A10",
+ "memory_total": 23836098560
+ }
+ ],
+ "memory": {
+ "total": 339.9116630554199
+ }
+}
diff --git a/wandb/run-20230525_174038-2hxthfpt/files/wandb-summary.json b/wandb/run-20230525_174038-2hxthfpt/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..6a2353df9a39aec28b5e444685dc5d7223bc37fd
--- /dev/null
+++ b/wandb/run-20230525_174038-2hxthfpt/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb": {"runtime": 2}}
\ No newline at end of file
diff --git a/wandb/run-20230525_174038-2hxthfpt/logs/debug-internal.log b/wandb/run-20230525_174038-2hxthfpt/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..bd262bdc67f62743addd023c03e887d4097d6c3d
--- /dev/null
+++ b/wandb/run-20230525_174038-2hxthfpt/logs/debug-internal.log
@@ -0,0 +1,185 @@
+2023-05-25 17:40:38,404 INFO StreamThr :2918590 [internal.py:wandb_internal():86] W&B internal server running at pid: 2918590, started at: 2023-05-25 17:40:38.403997
+2023-05-25 17:40:38,405 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: status
+2023-05-25 17:40:38,407 INFO WriterThread:2918590 [datastore.py:open_for_write():85] open: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/run-2hxthfpt.wandb
+2023-05-25 17:40:38,407 DEBUG SenderThread:2918590 [sender.py:send():375] send: header
+2023-05-25 17:40:38,407 DEBUG SenderThread:2918590 [sender.py:send():375] send: run
+2023-05-25 17:40:39,037 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: check_version
+2023-05-25 17:40:39,037 INFO SenderThread:2918590 [dir_watcher.py:__init__():219] watching files in: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files
+2023-05-25 17:40:39,037 INFO SenderThread:2918590 [sender.py:_start_run_threads():1124] run started: 2hxthfpt with start time 1685007638.403607
+2023-05-25 17:40:39,037 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: summary_record
+2023-05-25 17:40:39,037 INFO SenderThread:2918590 [sender.py:_save_file():1378] saving file wandb-summary.json with policy end
+2023-05-25 17:40:39,037 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: check_version
+2023-05-25 17:40:39,063 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: run_start
+2023-05-25 17:40:39,065 DEBUG HandlerThread:2918590 [system_info.py:__init__():31] System info init
+2023-05-25 17:40:39,065 DEBUG HandlerThread:2918590 [system_info.py:__init__():46] System info init done
+2023-05-25 17:40:39,065 INFO HandlerThread:2918590 [system_monitor.py:start():181] Starting system monitor
+2023-05-25 17:40:39,065 INFO SystemMonitor:2918590 [system_monitor.py:_start():145] Starting system asset monitoring threads
+2023-05-25 17:40:39,065 INFO HandlerThread:2918590 [system_monitor.py:probe():201] Collecting system info
+2023-05-25 17:40:39,066 INFO SystemMonitor:2918590 [interfaces.py:start():190] Started cpu monitoring
+2023-05-25 17:40:39,066 INFO SystemMonitor:2918590 [interfaces.py:start():190] Started disk monitoring
+2023-05-25 17:40:39,067 INFO SystemMonitor:2918590 [interfaces.py:start():190] Started gpu monitoring
+2023-05-25 17:40:39,067 INFO SystemMonitor:2918590 [interfaces.py:start():190] Started memory monitoring
+2023-05-25 17:40:39,067 INFO SystemMonitor:2918590 [interfaces.py:start():190] Started network monitoring
+2023-05-25 17:40:39,072 DEBUG HandlerThread:2918590 [system_info.py:probe():195] Probing system
+2023-05-25 17:40:39,075 DEBUG HandlerThread:2918590 [git.py:repo():40] git repository is invalid
+2023-05-25 17:40:39,075 DEBUG HandlerThread:2918590 [system_info.py:probe():240] Probing system done
+2023-05-25 17:40:39,075 DEBUG HandlerThread:2918590 [system_monitor.py:probe():210] {'os': 'Linux-5.10.134-13.1.al8.x86_64-x86_64-with-glibc2.17', 'python': '3.8.16', 'heartbeatAt': '2023-05-25T09:40:39.072595', 'startedAt': '2023-05-25T09:40:38.398394', 'docker': None, 'cuda': None, 'args': ('--pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5', '--dataset_name=lambdalabs/pokemon-blip-captions', '--dataloader_num_workers=8', '--resolution=512', '--center_crop', '--random_flip', '--train_batch_size=1', '--gradient_accumulation_steps=4', '--max_train_steps=15000', '--learning_rate=1e-04', '--max_grad_norm=1', '--lr_scheduler=cosine', '--lr_warmup_steps=0', '--output_dir=/home/long.qul/tritontest', '--push_to_hub', '--hub_model_id=', '--report_to=wandb', '--checkpointing_steps=500', '--validation_prompt=A pokemon with blue eyes.', '--seed=1337'), 'state': 'running', 'program': 'lora_test_1.py', 'codePath': 'lora_test_1.py', 'host': 'iZt4n6er62uu4xnw6wibnhZ', 'username': 'long.qul', 'executable': '/home/long.qul/miniconda3/envs/triton/bin/python', 'cpu_count': 28, 'cpu_count_logical': 56, 'cpu_freq': {'current': 2899.998000000001, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 
0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}], 'disk': {'total': 1968.4237327575684, 'used': 256.4753112792969}, 'gpu': 'NVIDIA A10', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A10', 'memory_total': 
23836098560}], 'memory': {'total': 339.9116630554199}}
+2023-05-25 17:40:39,075 INFO HandlerThread:2918590 [system_monitor.py:probe():211] Finished collecting system info
+2023-05-25 17:40:39,075 INFO HandlerThread:2918590 [system_monitor.py:probe():214] Publishing system info
+2023-05-25 17:40:39,075 DEBUG HandlerThread:2918590 [system_info.py:_save_pip():51] Saving list of pip packages installed into the current environment
+2023-05-25 17:40:39,075 DEBUG HandlerThread:2918590 [system_info.py:_save_pip():67] Saving pip packages done
+2023-05-25 17:40:39,075 DEBUG HandlerThread:2918590 [system_info.py:_save_conda():74] Saving list of conda packages installed into the current environment
+2023-05-25 17:40:40,038 INFO Thread-12 :2918590 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/requirements.txt
+2023-05-25 17:40:40,038 INFO Thread-12 :2918590 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/conda-environment.yaml
+2023-05-25 17:40:40,038 INFO Thread-12 :2918590 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/wandb-summary.json
+2023-05-25 17:40:41,149 DEBUG HandlerThread:2918590 [system_info.py:_save_conda():86] Saving conda packages done
+2023-05-25 17:40:41,149 INFO HandlerThread:2918590 [system_monitor.py:probe():216] Finished publishing system info
+2023-05-25 17:40:41,153 DEBUG SenderThread:2918590 [sender.py:send():375] send: files
+2023-05-25 17:40:41,154 INFO SenderThread:2918590 [sender.py:_save_file():1378] saving file wandb-metadata.json with policy now
+2023-05-25 17:40:41,157 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: stop_status
+2023-05-25 17:40:41,157 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: stop_status
+2023-05-25 17:40:41,399 DEBUG SenderThread:2918590 [sender.py:send():375] send: telemetry
+2023-05-25 17:40:41,399 DEBUG SenderThread:2918590 [sender.py:send():375] send: config
+2023-05-25 17:40:41,419 DEBUG SenderThread:2918590 [sender.py:send():375] send: exit
+2023-05-25 17:40:41,420 INFO SenderThread:2918590 [sender.py:send_exit():598] handling exit code: 1
+2023-05-25 17:40:41,420 INFO SenderThread:2918590 [sender.py:send_exit():600] handling runtime: 2
+2023-05-25 17:40:41,420 INFO SenderThread:2918590 [sender.py:_save_file():1378] saving file wandb-summary.json with policy end
+2023-05-25 17:40:41,420 INFO SenderThread:2918590 [sender.py:send_exit():606] send defer
+2023-05-25 17:40:41,420 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:41,420 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 0
+2023-05-25 17:40:41,420 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:41,420 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 0
+2023-05-25 17:40:41,420 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 1
+2023-05-25 17:40:41,420 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:41,420 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 1
+2023-05-25 17:40:41,420 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:41,420 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 1
+2023-05-25 17:40:41,420 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 2
+2023-05-25 17:40:41,421 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:41,421 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 2
+2023-05-25 17:40:41,421 INFO HandlerThread:2918590 [system_monitor.py:finish():190] Stopping system monitor
+2023-05-25 17:40:41,421 DEBUG SystemMonitor:2918590 [system_monitor.py:_start():159] Starting system metrics aggregation loop
+2023-05-25 17:40:41,421 INFO HandlerThread:2918590 [interfaces.py:finish():202] Joined cpu monitor
+2023-05-25 17:40:41,421 DEBUG SystemMonitor:2918590 [system_monitor.py:_start():166] Finished system metrics aggregation loop
+2023-05-25 17:40:41,421 INFO HandlerThread:2918590 [interfaces.py:finish():202] Joined disk monitor
+2023-05-25 17:40:41,421 DEBUG SystemMonitor:2918590 [system_monitor.py:_start():170] Publishing last batch of metrics
+2023-05-25 17:40:41,451 INFO HandlerThread:2918590 [interfaces.py:finish():202] Joined gpu monitor
+2023-05-25 17:40:41,452 INFO HandlerThread:2918590 [interfaces.py:finish():202] Joined memory monitor
+2023-05-25 17:40:41,452 INFO HandlerThread:2918590 [interfaces.py:finish():202] Joined network monitor
+2023-05-25 17:40:41,452 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:41,452 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 2
+2023-05-25 17:40:41,452 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 3
+2023-05-25 17:40:41,452 DEBUG SenderThread:2918590 [sender.py:send():375] send: stats
+2023-05-25 17:40:41,452 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:41,452 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 3
+2023-05-25 17:40:41,452 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:41,453 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 3
+2023-05-25 17:40:41,453 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 4
+2023-05-25 17:40:41,453 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:41,453 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 4
+2023-05-25 17:40:41,453 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:41,453 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 4
+2023-05-25 17:40:41,453 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 5
+2023-05-25 17:40:41,453 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:41,453 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 5
+2023-05-25 17:40:41,453 DEBUG SenderThread:2918590 [sender.py:send():375] send: summary
+2023-05-25 17:40:41,453 INFO SenderThread:2918590 [sender.py:_save_file():1378] saving file wandb-summary.json with policy end
+2023-05-25 17:40:41,453 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:41,453 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 5
+2023-05-25 17:40:41,453 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 6
+2023-05-25 17:40:41,453 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:41,453 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 6
+2023-05-25 17:40:41,453 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:41,454 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 6
+2023-05-25 17:40:41,456 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: status_report
+2023-05-25 17:40:41,756 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 7
+2023-05-25 17:40:41,756 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:41,756 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 7
+2023-05-25 17:40:41,756 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:41,757 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 7
+2023-05-25 17:40:42,037 INFO Thread-12 :2918590 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/wandb-summary.json
+2023-05-25 17:40:42,038 INFO Thread-12 :2918590 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/conda-environment.yaml
+2023-05-25 17:40:42,038 INFO Thread-12 :2918590 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/config.yaml
+2023-05-25 17:40:42,038 INFO Thread-12 :2918590 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/output.log
+2023-05-25 17:40:42,038 INFO Thread-12 :2918590 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/wandb-metadata.json
+2023-05-25 17:40:42,063 INFO wandb-upload_0:2918590 [upload_job.py:push():137] Uploaded file /tmp/tmpzpltiz4xwandb/6hasxp1e-wandb-metadata.json
+2023-05-25 17:40:42,420 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:40:43,406 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 8
+2023-05-25 17:40:43,406 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:40:43,406 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:43,406 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 8
+2023-05-25 17:40:43,406 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:43,407 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 8
+2023-05-25 17:40:43,407 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 9
+2023-05-25 17:40:43,407 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:43,407 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 9
+2023-05-25 17:40:43,407 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:43,407 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 9
+2023-05-25 17:40:43,407 INFO SenderThread:2918590 [dir_watcher.py:finish():365] shutting down directory watcher
+2023-05-25 17:40:43,420 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:40:44,038 INFO Thread-12 :2918590 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/output.log
+2023-05-25 17:40:44,038 INFO SenderThread:2918590 [dir_watcher.py:finish():395] scan: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files
+2023-05-25 17:40:44,038 INFO SenderThread:2918590 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/wandb-summary.json wandb-summary.json
+2023-05-25 17:40:44,039 INFO SenderThread:2918590 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/requirements.txt requirements.txt
+2023-05-25 17:40:44,039 INFO SenderThread:2918590 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/conda-environment.yaml conda-environment.yaml
+2023-05-25 17:40:44,039 INFO SenderThread:2918590 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/output.log output.log
+2023-05-25 17:40:44,039 INFO SenderThread:2918590 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/wandb-metadata.json wandb-metadata.json
+2023-05-25 17:40:44,039 INFO SenderThread:2918590 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/config.yaml config.yaml
+2023-05-25 17:40:44,042 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 10
+2023-05-25 17:40:44,042 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:40:44,042 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:44,042 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 10
+2023-05-25 17:40:44,044 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:44,045 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 10
+2023-05-25 17:40:44,045 INFO SenderThread:2918590 [file_pusher.py:finish():167] shutting down file pusher
+2023-05-25 17:40:44,420 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:40:44,421 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:40:44,808 INFO wandb-upload_1:2918590 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/requirements.txt
+2023-05-25 17:40:45,110 INFO wandb-upload_0:2918590 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/wandb-summary.json
+2023-05-25 17:40:45,421 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:40:45,421 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:40:45,695 INFO wandb-upload_2:2918590 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/conda-environment.yaml
+2023-05-25 17:40:45,725 INFO wandb-upload_4:2918590 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/config.yaml
+2023-05-25 17:40:46,105 INFO wandb-upload_3:2918590 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/files/output.log
+2023-05-25 17:40:46,306 INFO Thread-11 :2918590 [sender.py:transition_state():626] send defer: 11
+2023-05-25 17:40:46,306 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:46,306 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 11
+2023-05-25 17:40:46,306 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:46,306 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 11
+2023-05-25 17:40:46,306 INFO SenderThread:2918590 [file_pusher.py:join():172] waiting for file pusher
+2023-05-25 17:40:46,306 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 12
+2023-05-25 17:40:46,306 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:46,306 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 12
+2023-05-25 17:40:46,306 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:46,307 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 12
+2023-05-25 17:40:46,421 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:40:46,831 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 13
+2023-05-25 17:40:46,831 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:40:46,831 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:46,831 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 13
+2023-05-25 17:40:46,832 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:46,832 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 13
+2023-05-25 17:40:46,832 INFO SenderThread:2918590 [sender.py:transition_state():626] send defer: 14
+2023-05-25 17:40:46,832 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:40:46,832 INFO HandlerThread:2918590 [handler.py:handle_request_defer():170] handle defer: 14
+2023-05-25 17:40:46,832 DEBUG SenderThread:2918590 [sender.py:send():375] send: final
+2023-05-25 17:40:46,832 DEBUG SenderThread:2918590 [sender.py:send():375] send: footer
+2023-05-25 17:40:46,832 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:40:46,832 INFO SenderThread:2918590 [sender.py:send_request_defer():622] handle sender defer: 14
+2023-05-25 17:40:46,832 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:40:46,832 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:40:46,833 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:40:46,833 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:40:46,833 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: server_info
+2023-05-25 17:40:46,833 DEBUG SenderThread:2918590 [sender.py:send_request():402] send_request: server_info
+2023-05-25 17:40:46,834 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: get_summary
+2023-05-25 17:40:46,835 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: sampled_history
+2023-05-25 17:40:47,297 INFO MainThread:2918590 [wandb_run.py:_footer_history_summary_info():3469] rendering history
+2023-05-25 17:40:47,298 INFO MainThread:2918590 [wandb_run.py:_footer_history_summary_info():3501] rendering summary
+2023-05-25 17:40:47,298 INFO MainThread:2918590 [wandb_run.py:_footer_sync_info():3428] logging synced files
+2023-05-25 17:40:47,298 DEBUG HandlerThread:2918590 [handler.py:handle_request():144] handle_request: shutdown
+2023-05-25 17:40:47,298 INFO HandlerThread:2918590 [handler.py:finish():842] shutting down handler
+2023-05-25 17:40:47,833 INFO WriterThread:2918590 [datastore.py:close():298] close: /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/run-2hxthfpt.wandb
+2023-05-25 17:40:48,297 INFO SenderThread:2918590 [sender.py:finish():1550] shutting down sender
+2023-05-25 17:40:48,298 INFO SenderThread:2918590 [file_pusher.py:finish():167] shutting down file pusher
+2023-05-25 17:40:48,298 INFO SenderThread:2918590 [file_pusher.py:join():172] waiting for file pusher
diff --git a/wandb/run-20230525_174038-2hxthfpt/logs/debug.log b/wandb/run-20230525_174038-2hxthfpt/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..46a8efa1b58995bcb512fcb46ae14a54827b7e23
--- /dev/null
+++ b/wandb/run-20230525_174038-2hxthfpt/logs/debug.log
@@ -0,0 +1,29 @@
+2023-05-25 17:40:38,399 INFO MainThread:2915039 [wandb_setup.py:_flush():76] Current SDK version is 0.15.3
+2023-05-25 17:40:38,399 INFO MainThread:2915039 [wandb_setup.py:_flush():76] Configure stats pid to 2915039
+2023-05-25 17:40:38,399 INFO MainThread:2915039 [wandb_setup.py:_flush():76] Loading settings from /home/long.qul/.config/wandb/settings
+2023-05-25 17:40:38,399 INFO MainThread:2915039 [wandb_setup.py:_flush():76] Loading settings from /home/long.qul/tritontest/wandb/settings
+2023-05-25 17:40:38,399 INFO MainThread:2915039 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
+2023-05-25 17:40:38,399 INFO MainThread:2915039 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2023-05-25 17:40:38,399 INFO MainThread:2915039 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'lora_test_1.py', 'program': 'lora_test_1.py'}
+2023-05-25 17:40:38,399 INFO MainThread:2915039 [wandb_setup.py:_flush():76] Applying login settings: {'api_key': '***REDACTED***'}
+2023-05-25 17:40:38,399 INFO MainThread:2915039 [wandb_init.py:_log_setup():507] Logging user logs to /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/logs/debug.log
+2023-05-25 17:40:38,399 INFO MainThread:2915039 [wandb_init.py:_log_setup():508] Logging internal logs to /home/long.qul/tritontest/wandb/run-20230525_174038-2hxthfpt/logs/debug-internal.log
+2023-05-25 17:40:38,400 INFO MainThread:2915039 [wandb_init.py:init():547] calling init triggers
+2023-05-25 17:40:38,400 INFO MainThread:2915039 [wandb_init.py:init():554] wandb.init called with sweep_config: {}
+config: {}
+2023-05-25 17:40:38,400 INFO MainThread:2915039 [wandb_init.py:init():596] starting backend
+2023-05-25 17:40:38,400 INFO MainThread:2915039 [wandb_init.py:init():600] setting up manager
+2023-05-25 17:40:38,402 INFO MainThread:2915039 [backend.py:_multiprocessing_setup():106] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2023-05-25 17:40:38,403 INFO MainThread:2915039 [wandb_init.py:init():606] backend started and connected
+2023-05-25 17:40:38,405 INFO MainThread:2915039 [wandb_init.py:init():700] updated telemetry
+2023-05-25 17:40:38,406 INFO MainThread:2915039 [wandb_init.py:init():737] communicating run to backend with 60.0 second timeout
+2023-05-25 17:40:39,036 INFO MainThread:2915039 [wandb_run.py:_on_init():2177] communicating current version
+2023-05-25 17:40:39,060 INFO MainThread:2915039 [wandb_run.py:_on_init():2186] got version response
+2023-05-25 17:40:39,060 INFO MainThread:2915039 [wandb_init.py:init():787] starting run threads in backend
+2023-05-25 17:40:41,157 INFO MainThread:2915039 [wandb_run.py:_console_start():2158] atexit reg
+2023-05-25 17:40:41,157 INFO MainThread:2915039 [wandb_run.py:_redirect():2013] redirect: SettingsConsole.WRAP_RAW
+2023-05-25 17:40:41,158 INFO MainThread:2915039 [wandb_run.py:_redirect():2078] Wrapping output streams.
+2023-05-25 17:40:41,158 INFO MainThread:2915039 [wandb_run.py:_redirect():2103] Redirects installed.
+2023-05-25 17:40:41,158 INFO MainThread:2915039 [wandb_init.py:init():829] run started, returning control to user process
+2023-05-25 17:40:41,161 INFO MainThread:2915039 [wandb_run.py:_config_callback():1286] config_cb None None {'pretrained_model_name_or_path': 'runwayml/stable-diffusion-v1-5', 'revision': None, 'dataset_name': 'lambdalabs/pokemon-blip-captions', 'dataset_config_name': None, 'train_data_dir': None, 'image_column': 'image', 'caption_column': 'text', 'validation_prompt': 'A pokemon with blue eyes.', 'num_validation_images': 4, 'validation_epochs': 1, 'max_train_samples': None, 'output_dir': '/home/long.qul/tritontest', 'cache_dir': None, 'seed': 1337, 'resolution': 512, 'center_crop': True, 'random_flip': True, 'train_batch_size': 1, 'num_train_epochs': 72, 'max_train_steps': 15000, 'gradient_accumulation_steps': 4, 'gradient_checkpointing': False, 'learning_rate': 0.0001, 'scale_lr': False, 'lr_scheduler': 'cosine', 'lr_warmup_steps': 0, 'snr_gamma': None, 'use_8bit_adam': False, 'allow_tf32': False, 'dataloader_num_workers': 8, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'push_to_hub': True, 'hub_token': None, 'hub_model_id': '', 'logging_dir': 'logs', 'mixed_precision': None, 'report_to': 'wandb', 'local_rank': -1, 'checkpointing_steps': 500, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'enable_xformers_memory_efficient_attention': False, 'noise_offset': 0}
+2023-05-25 17:40:48,391 WARNING MsgRouterThr:2915039 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20230525_174038-2hxthfpt/run-2hxthfpt.wandb b/wandb/run-20230525_174038-2hxthfpt/run-2hxthfpt.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..bffac1d2e55704d43b72646671008ca9cef0f1a9
Binary files /dev/null and b/wandb/run-20230525_174038-2hxthfpt/run-2hxthfpt.wandb differ
diff --git a/wandb/run-20230525_174128-owp5uwvq/files/conda-environment.yaml b/wandb/run-20230525_174128-owp5uwvq/files/conda-environment.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..eaad08ef7cb35db28c95e3521ff1f3af6a1b767d
--- /dev/null
+++ b/wandb/run-20230525_174128-owp5uwvq/files/conda-environment.yaml
@@ -0,0 +1,147 @@
+name: triton
+channels:
+ - conda-forge
+ - defaults
+dependencies:
+ - _libgcc_mutex=0.1=main
+ - _openmp_mutex=5.1=1_gnu
+ - bzip2=1.0.8=h7f98852_4
+ - c-ares=1.19.0=h5eee18b_0
+ - ca-certificates=2023.5.7=hbcca054_0
+ - expat=2.2.10=h9c3ff4c_0
+ - keyutils=1.6.1=h166bdaf_0
+ - krb5=1.19.3=h3790be6_0
+ - ld_impl_linux-64=2.38=h1181459_1
+ - libcurl=7.87.0=h91b91d3_0
+ - libedit=3.1.20191231=he28a2e2_2
+ - libev=4.33=h516909a_1
+ - libffi=3.4.4=h6a678d5_0
+ - libgcc-ng=11.2.0=h1234567_1
+ - libgomp=11.2.0=h1234567_1
+ - libnghttp2=1.46.0=hce63b2e_0
+ - libssh2=1.10.0=ha56f1ee_2
+ - libstdcxx-ng=11.2.0=h1234567_1
+ - libuv=1.44.2=h5eee18b_0
+ - lz4-c=1.9.3=h9c3ff4c_1
+ - ncurses=6.4=h6a678d5_0
+ - openssl=1.1.1t=h7f8727e_0
+ - pip=23.0.1=py38h06a4308_0
+ - python=3.8.16=h7a1cb2a_3
+ - readline=8.2=h5eee18b_0
+ - rhash=1.4.1=h3c74f83_1
+ - setuptools=66.0.0=py38h06a4308_0
+ - sqlite=3.41.2=h5eee18b_0
+ - tk=8.6.12=h1ccaba5_0
+ - wheel=0.38.4=py38h06a4308_0
+ - xz=5.4.2=h5eee18b_0
+ - zlib=1.2.13=h5eee18b_0
+ - zstd=1.5.2=ha4553b6_0
+ - pip:
+ - absl-py==1.4.0
+ - accelerate==0.19.0
+ - aiohttp==3.8.4
+ - aiosignal==1.3.1
+ - appdirs==1.4.4
+ - async-timeout==4.0.2
+ - attrs==23.1.0
+ - brotli==1.0.9
+ - cachetools==5.3.0
+ - certifi==2023.5.7
+ - charset-normalizer==3.1.0
+ - click==8.1.3
+ - cmake==3.26.3
+ - datasets==2.12.0
+ - diffusers==0.17.0.dev0
+ - dill==0.3.6
+ - docker-pycreds==0.4.0
+ - filelock==3.12.0
+ - frozenlist==1.3.3
+ - fsspec==2023.5.0
+ - ftfy==6.1.1
+ - gevent==22.10.2
+ - geventhttpclient==2.0.2
+ - gitdb==4.0.10
+ - gitpython==3.1.31
+ - google-auth==2.18.1
+ - google-auth-oauthlib==1.0.0
+ - greenlet==2.0.2
+ - grpcio==1.55.0
+ - huggingface-hub==0.14.1
+ - idna==3.4
+ - importlib-metadata==6.6.0
+ - jinja2==3.1.2
+ - lit==16.0.5
+ - markdown==3.4.3
+ - markupsafe==2.1.2
+ - mpmath==1.3.0
+ - multidict==6.0.4
+ - multiprocess==0.70.14
+ - mypy-extensions==1.0.0
+ - networkx==3.1
+ - numpy==1.24.3
+ - nvidia-cublas-cu11==11.10.3.66
+ - nvidia-cublas-cu12==12.1.3.1
+ - nvidia-cuda-cupti-cu11==11.7.101
+ - nvidia-cuda-nvrtc-cu11==11.7.99
+ - nvidia-cuda-runtime-cu11==11.7.99
+ - nvidia-cuda-runtime-cu12==12.1.105
+ - nvidia-cudnn-cu11==8.5.0.96
+ - nvidia-cudnn-cu12==8.9.1.23
+ - nvidia-cufft-cu11==10.9.0.58
+ - nvidia-curand-cu11==10.2.10.91
+ - nvidia-cusolver-cu11==11.4.0.1
+ - nvidia-cusparse-cu11==11.7.4.91
+ - nvidia-nccl-cu11==2.14.3
+ - nvidia-nvtx-cu11==11.7.91
+ - nvidia-pytriton==0.1.5
+ - oauthlib==3.2.2
+ - packaging==23.1
+ - pandas==2.0.1
+ - pathtools==0.1.2
+ - pillow==9.5.0
+ - protobuf==3.20.3
+ - psutil==5.9.5
+ - pyarrow==12.0.0
+ - pyasn1==0.5.0
+ - pyasn1-modules==0.3.0
+ - python-dateutil==2.8.2
+ - python-rapidjson==1.10
+ - pytz==2023.3
+ - pyyaml==6.0
+ - pyzmq==23.2.1
+ - regex==2023.5.5
+ - requests==2.31.0
+ - requests-oauthlib==1.3.1
+ - responses==0.18.0
+ - rsa==4.9
+ - sentry-sdk==1.24.0
+ - setproctitle==1.3.2
+ - sh==1.14.3
+ - six==1.16.0
+ - smmap==5.0.0
+ - sympy==1.12
+ - tensorboard==2.13.0
+ - tensorboard-data-server==0.7.0
+ - tensorrt-bindings==8.6.1
+ - tensorrt-libs==8.6.1
+ - tokenizers==0.13.3
+ - torch==2.0.1
+ - torchvision==0.15.2
+ - tqdm==4.65.0
+ - transformers==4.29.2
+ - triton==2.0.0
+ - tritonclient==2.33.0
+ - typing-extensions==4.6.0
+ - typing-inspect==0.6.0
+ - tzdata==2023.3
+ - urllib3==1.26.16
+ - wandb==0.15.3
+ - wcwidth==0.2.6
+ - werkzeug==2.3.4
+ - wrapt==1.15.0
+ - xxhash==3.2.0
+ - yarl==1.9.2
+ - zipp==3.15.0
+ - zope-event==4.6
+ - zope-interface==6.0
+prefix: /home/long.qul/miniconda3/envs/triton
diff --git a/wandb/run-20230525_174128-owp5uwvq/files/config.yaml b/wandb/run-20230525_174128-owp5uwvq/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1cf60131767ed6fcc8095d69e18fde5fbe6c3cef
--- /dev/null
+++ b/wandb/run-20230525_174128-owp5uwvq/files/config.yaml
@@ -0,0 +1,179 @@
+wandb_version: 1
+
+_wandb:
+ desc: null
+ value:
+ python_version: 3.8.16
+ cli_version: 0.15.3
+ framework: huggingface
+ huggingface_version: 4.29.2
+ is_jupyter_run: false
+ is_kaggle_kernel: false
+ start_time: 1685007688.1411
+ t:
+ 1:
+ - 1
+ - 11
+ - 41
+ - 49
+ - 51
+ - 55
+ - 71
+ - 83
+ 2:
+ - 1
+ - 11
+ - 41
+ - 49
+ - 51
+ - 55
+ - 71
+ - 83
+ 3:
+ - 23
+ 4: 3.8.16
+ 5: 0.15.3
+ 6: 4.29.2
+ 8:
+ - 5
+pretrained_model_name_or_path:
+ desc: null
+ value: runwayml/stable-diffusion-v1-5
+revision:
+ desc: null
+ value: null
+dataset_name:
+ desc: null
+ value: lambdalabs/pokemon-blip-captions
+dataset_config_name:
+ desc: null
+ value: null
+train_data_dir:
+ desc: null
+ value: null
+image_column:
+ desc: null
+ value: image
+caption_column:
+ desc: null
+ value: text
+validation_prompt:
+ desc: null
+ value: A pokemon with blue eyes.
+num_validation_images:
+ desc: null
+ value: 4
+validation_epochs:
+ desc: null
+ value: 1
+max_train_samples:
+ desc: null
+ value: null
+output_dir:
+ desc: null
+ value: /home/long.qul/tritontest
+cache_dir:
+ desc: null
+ value: null
+seed:
+ desc: null
+ value: 1337
+resolution:
+ desc: null
+ value: 512
+center_crop:
+ desc: null
+ value: true
+random_flip:
+ desc: null
+ value: true
+train_batch_size:
+ desc: null
+ value: 1
+num_train_epochs:
+ desc: null
+ value: 72
+max_train_steps:
+ desc: null
+ value: 15000
+gradient_accumulation_steps:
+ desc: null
+ value: 4
+gradient_checkpointing:
+ desc: null
+ value: false
+learning_rate:
+ desc: null
+ value: 0.0001
+scale_lr:
+ desc: null
+ value: false
+lr_scheduler:
+ desc: null
+ value: cosine
+lr_warmup_steps:
+ desc: null
+ value: 0
+snr_gamma:
+ desc: null
+ value: null
+use_8bit_adam:
+ desc: null
+ value: false
+allow_tf32:
+ desc: null
+ value: false
+dataloader_num_workers:
+ desc: null
+ value: 8
+adam_beta1:
+ desc: null
+ value: 0.9
+adam_beta2:
+ desc: null
+ value: 0.999
+adam_weight_decay:
+ desc: null
+ value: 0.01
+adam_epsilon:
+ desc: null
+ value: 1.0e-08
+max_grad_norm:
+ desc: null
+ value: 1.0
+push_to_hub:
+ desc: null
+ value: true
+hub_token:
+ desc: null
+ value: null
+hub_model_id:
+ desc: null
+ value: ''
+logging_dir:
+ desc: null
+ value: logs
+mixed_precision:
+ desc: null
+ value: null
+report_to:
+ desc: null
+ value: wandb
+local_rank:
+ desc: null
+ value: -1
+checkpointing_steps:
+ desc: null
+ value: 500
+checkpoints_total_limit:
+ desc: null
+ value: null
+resume_from_checkpoint:
+ desc: null
+ value: null
+enable_xformers_memory_efficient_attention:
+ desc: null
+ value: false
+noise_offset:
+ desc: null
+ value: 0
diff --git a/wandb/run-20230525_174128-owp5uwvq/files/output.log b/wandb/run-20230525_174128-owp5uwvq/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..aee7bcec1732fd7dabaf336bde54caa3b9795e5d
--- /dev/null
+++ b/wandb/run-20230525_174128-owp5uwvq/files/output.log
@@ -0,0 +1,27 @@
+05/25/2023 17:41:30 - INFO - __main__ - ***** Running training *****
+05/25/2023 17:41:30 - INFO - __main__ - Num examples = 833
+05/25/2023 17:41:30 - INFO - __main__ - Num Epochs = 72
+05/25/2023 17:41:30 - INFO - __main__ - Instantaneous batch size per device = 1
+05/25/2023 17:41:30 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 4
+05/25/2023 17:41:30 - INFO - __main__ - Gradient Accumulation steps = 4
+05/25/2023 17:41:30 - INFO - __main__ - Total optimization steps = 15000
+Steps: 0%| | 0/15000 [00:00, ?it/s]Traceback (most recent call last):
+ File "lora_test_1.py", line 908, in
+ main()
+ File "lora_test_1.py", line 728, in main
+ latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/diffusers/utils/accelerate_utils.py", line 46, in wrapper
+ return method(self, *args, **kwargs)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/diffusers/models/autoencoder_kl.py", line 164, in encode
+ h = self.encoder(x)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/diffusers/models/vae.py", line 109, in forward
+ sample = self.conv_in(sample)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 463, in forward
+ return self._conv_forward(input, self.weight, self.bias)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 459, in _conv_forward
+ return F.conv2d(input, weight, bias, self.stride,
+RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED
\ No newline at end of file
diff --git a/wandb/run-20230525_174128-owp5uwvq/files/requirements.txt b/wandb/run-20230525_174128-owp5uwvq/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..881b72a23b4f740f4282786da35a53f77f1d92cd
--- /dev/null
+++ b/wandb/run-20230525_174128-owp5uwvq/files/requirements.txt
@@ -0,0 +1,110 @@
+absl-py==1.4.0
+accelerate==0.19.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+appdirs==1.4.4
+async-timeout==4.0.2
+attrs==23.1.0
+brotli==1.0.9
+cachetools==5.3.0
+certifi==2023.5.7
+charset-normalizer==3.1.0
+click==8.1.3
+cmake==3.26.3
+datasets==2.12.0
+diffusers==0.17.0.dev0
+dill==0.3.6
+docker-pycreds==0.4.0
+filelock==3.12.0
+frozenlist==1.3.3
+fsspec==2023.5.0
+ftfy==6.1.1
+gevent==22.10.2
+geventhttpclient==2.0.2
+gitdb==4.0.10
+gitpython==3.1.31
+google-auth-oauthlib==1.0.0
+google-auth==2.18.1
+greenlet==2.0.2
+grpcio==1.55.0
+huggingface-hub==0.14.1
+idna==3.4
+importlib-metadata==6.6.0
+jinja2==3.1.2
+lit==16.0.5
+markdown==3.4.3
+markupsafe==2.1.2
+mpmath==1.3.0
+multidict==6.0.4
+multiprocess==0.70.14
+mypy-extensions==1.0.0
+networkx==3.1
+numpy==1.24.3
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu11==8.5.0.96
+nvidia-cudnn-cu12==8.9.1.23
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.2.10.91
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cusparse-cu11==11.7.4.91
+nvidia-nccl-cu11==2.14.3
+nvidia-nvtx-cu11==11.7.91
+nvidia-pytriton==0.1.5
+oauthlib==3.2.2
+packaging==23.1
+pandas==2.0.1
+pathtools==0.1.2
+pillow==9.5.0
+pip==23.0.1
+protobuf==3.20.3
+psutil==5.9.5
+pyarrow==12.0.0
+pyasn1-modules==0.3.0
+pyasn1==0.5.0
+python-dateutil==2.8.2
+python-rapidjson==1.10
+pytz==2023.3
+pyyaml==6.0
+pyzmq==23.2.1
+regex==2023.5.5
+requests-oauthlib==1.3.1
+requests==2.31.0
+responses==0.18.0
+rsa==4.9
+sentry-sdk==1.24.0
+setproctitle==1.3.2
+setuptools==66.0.0
+sh==1.14.3
+six==1.16.0
+smmap==5.0.0
+sympy==1.12
+tensorboard-data-server==0.7.0
+tensorboard==2.13.0
+tensorrt-bindings==8.6.1
+tensorrt-libs==8.6.1
+tokenizers==0.13.3
+torch==2.0.1
+torchvision==0.15.2
+tqdm==4.65.0
+transformers==4.29.2
+triton==2.0.0
+tritonclient==2.33.0
+typing-extensions==4.6.0
+typing-inspect==0.6.0
+tzdata==2023.3
+urllib3==1.26.16
+wandb==0.15.3
+wcwidth==0.2.6
+werkzeug==2.3.4
+wheel==0.38.4
+wrapt==1.15.0
+xxhash==3.2.0
+yarl==1.9.2
+zipp==3.15.0
+zope.event==4.6
+zope.interface==6.0
\ No newline at end of file
diff --git a/wandb/run-20230525_174128-owp5uwvq/files/wandb-metadata.json b/wandb/run-20230525_174128-owp5uwvq/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..625875695ac27339d832518a244158289713c38d
--- /dev/null
+++ b/wandb/run-20230525_174128-owp5uwvq/files/wandb-metadata.json
@@ -0,0 +1,340 @@
+{
+ "os": "Linux-5.10.134-13.1.al8.x86_64-x86_64-with-glibc2.17",
+ "python": "3.8.16",
+ "heartbeatAt": "2023-05-25T09:41:28.724050",
+ "startedAt": "2023-05-25T09:41:28.135634",
+ "docker": null,
+ "cuda": null,
+ "args": [
+ "--pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5",
+ "--dataset_name=lambdalabs/pokemon-blip-captions",
+ "--dataloader_num_workers=8",
+ "--resolution=512",
+ "--center_crop",
+ "--random_flip",
+ "--train_batch_size=1",
+ "--gradient_accumulation_steps=4",
+ "--max_train_steps=15000",
+ "--learning_rate=1e-04",
+ "--max_grad_norm=1",
+ "--lr_scheduler=cosine",
+ "--lr_warmup_steps=0",
+ "--output_dir=/home/long.qul/tritontest",
+ "--push_to_hub",
+ "--hub_model_id=",
+ "--report_to=wandb",
+ "--checkpointing_steps=500",
+ "--validation_prompt=A pokemon with blue eyes.",
+ "--seed=1337"
+ ],
+ "state": "running",
+ "program": "lora_test_1.py",
+ "codePath": "lora_test_1.py",
+ "host": "iZt4n6er62uu4xnw6wibnhZ",
+ "username": "long.qul",
+ "executable": "/home/long.qul/miniconda3/envs/triton/bin/python",
+ "cpu_count": 28,
+ "cpu_count_logical": 56,
+ "cpu_freq": {
+ "current": 2899.998000000001,
+ "min": 0.0,
+ "max": 0.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ }
+ ],
+ "disk": {
+ "total": 1968.4237327575684,
+ "used": 256.4754180908203
+ },
+ "gpu": "NVIDIA A10",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA A10",
+ "memory_total": 23836098560
+ }
+ ],
+ "memory": {
+ "total": 339.9116630554199
+ }
+}
diff --git a/wandb/run-20230525_174128-owp5uwvq/files/wandb-summary.json b/wandb/run-20230525_174128-owp5uwvq/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..6a2353df9a39aec28b5e444685dc5d7223bc37fd
--- /dev/null
+++ b/wandb/run-20230525_174128-owp5uwvq/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb": {"runtime": 2}}
\ No newline at end of file
diff --git a/wandb/run-20230525_174128-owp5uwvq/logs/debug-internal.log b/wandb/run-20230525_174128-owp5uwvq/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..d10c1bd538024d20e9a77a52ece3555db2737073
--- /dev/null
+++ b/wandb/run-20230525_174128-owp5uwvq/logs/debug-internal.log
@@ -0,0 +1,175 @@
+2023-05-25 17:41:28,141 INFO StreamThr :2921999 [internal.py:wandb_internal():86] W&B internal server running at pid: 2921999, started at: 2023-05-25 17:41:28.141410
+2023-05-25 17:41:28,142 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: status
+2023-05-25 17:41:28,144 INFO WriterThread:2921999 [datastore.py:open_for_write():85] open: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/run-owp5uwvq.wandb
+2023-05-25 17:41:28,144 DEBUG SenderThread:2921999 [sender.py:send():375] send: header
+2023-05-25 17:41:28,144 DEBUG SenderThread:2921999 [sender.py:send():375] send: run
+2023-05-25 17:41:28,687 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: check_version
+2023-05-25 17:41:28,688 INFO SenderThread:2921999 [dir_watcher.py:__init__():219] watching files in: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files
+2023-05-25 17:41:28,688 INFO SenderThread:2921999 [sender.py:_start_run_threads():1124] run started: owp5uwvq with start time 1685007688.1411
+2023-05-25 17:41:28,688 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: summary_record
+2023-05-25 17:41:28,688 INFO SenderThread:2921999 [sender.py:_save_file():1378] saving file wandb-summary.json with policy end
+2023-05-25 17:41:28,688 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: check_version
+2023-05-25 17:41:28,710 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: run_start
+2023-05-25 17:41:28,712 DEBUG HandlerThread:2921999 [system_info.py:__init__():31] System info init
+2023-05-25 17:41:28,712 DEBUG HandlerThread:2921999 [system_info.py:__init__():46] System info init done
+2023-05-25 17:41:28,712 INFO HandlerThread:2921999 [system_monitor.py:start():181] Starting system monitor
+2023-05-25 17:41:28,712 INFO SystemMonitor:2921999 [system_monitor.py:_start():145] Starting system asset monitoring threads
+2023-05-25 17:41:28,712 INFO HandlerThread:2921999 [system_monitor.py:probe():201] Collecting system info
+2023-05-25 17:41:28,712 INFO SystemMonitor:2921999 [interfaces.py:start():190] Started cpu monitoring
+2023-05-25 17:41:28,713 INFO SystemMonitor:2921999 [interfaces.py:start():190] Started disk monitoring
+2023-05-25 17:41:28,713 INFO SystemMonitor:2921999 [interfaces.py:start():190] Started gpu monitoring
+2023-05-25 17:41:28,714 INFO SystemMonitor:2921999 [interfaces.py:start():190] Started memory monitoring
+2023-05-25 17:41:28,715 INFO SystemMonitor:2921999 [interfaces.py:start():190] Started network monitoring
+2023-05-25 17:41:28,724 DEBUG HandlerThread:2921999 [system_info.py:probe():195] Probing system
+2023-05-25 17:41:28,726 DEBUG HandlerThread:2921999 [git.py:repo():40] git repository is invalid
+2023-05-25 17:41:28,726 DEBUG HandlerThread:2921999 [system_info.py:probe():240] Probing system done
+2023-05-25 17:41:28,727 DEBUG HandlerThread:2921999 [system_monitor.py:probe():210] {'os': 'Linux-5.10.134-13.1.al8.x86_64-x86_64-with-glibc2.17', 'python': '3.8.16', 'heartbeatAt': '2023-05-25T09:41:28.724050', 'startedAt': '2023-05-25T09:41:28.135634', 'docker': None, 'cuda': None, 'args': ('--pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5', '--dataset_name=lambdalabs/pokemon-blip-captions', '--dataloader_num_workers=8', '--resolution=512', '--center_crop', '--random_flip', '--train_batch_size=1', '--gradient_accumulation_steps=4', '--max_train_steps=15000', '--learning_rate=1e-04', '--max_grad_norm=1', '--lr_scheduler=cosine', '--lr_warmup_steps=0', '--output_dir=/home/long.qul/tritontest', '--push_to_hub', '--hub_model_id=', '--report_to=wandb', '--checkpointing_steps=500', '--validation_prompt=A pokemon with blue eyes.', '--seed=1337'), 'state': 'running', 'program': 'lora_test_1.py', 'codePath': 'lora_test_1.py', 'host': 'iZt4n6er62uu4xnw6wibnhZ', 'username': 'long.qul', 'executable': '/home/long.qul/miniconda3/envs/triton/bin/python', 'cpu_count': 28, 'cpu_count_logical': 56, 'cpu_freq': {'current': 2899.998000000001, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 
0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}], 'disk': {'total': 1968.4237327575684, 'used': 256.4754180908203}, 'gpu': 'NVIDIA A10', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A10', 'memory_total': 
23836098560}], 'memory': {'total': 339.9116630554199}}
+2023-05-25 17:41:28,727 INFO HandlerThread:2921999 [system_monitor.py:probe():211] Finished collecting system info
+2023-05-25 17:41:28,727 INFO HandlerThread:2921999 [system_monitor.py:probe():214] Publishing system info
+2023-05-25 17:41:28,727 DEBUG HandlerThread:2921999 [system_info.py:_save_pip():51] Saving list of pip packages installed into the current environment
+2023-05-25 17:41:28,727 DEBUG HandlerThread:2921999 [system_info.py:_save_pip():67] Saving pip packages done
+2023-05-25 17:41:28,727 DEBUG HandlerThread:2921999 [system_info.py:_save_conda():74] Saving list of conda packages installed into the current environment
+2023-05-25 17:41:29,689 INFO Thread-12 :2921999 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/wandb-summary.json
+2023-05-25 17:41:29,689 INFO Thread-12 :2921999 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/requirements.txt
+2023-05-25 17:41:29,689 INFO Thread-12 :2921999 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/conda-environment.yaml
+2023-05-25 17:41:30,813 DEBUG HandlerThread:2921999 [system_info.py:_save_conda():86] Saving conda packages done
+2023-05-25 17:41:30,813 INFO HandlerThread:2921999 [system_monitor.py:probe():216] Finished publishing system info
+2023-05-25 17:41:30,817 DEBUG SenderThread:2921999 [sender.py:send():375] send: files
+2023-05-25 17:41:30,818 INFO SenderThread:2921999 [sender.py:_save_file():1378] saving file wandb-metadata.json with policy now
+2023-05-25 17:41:30,821 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: stop_status
+2023-05-25 17:41:30,821 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: stop_status
+2023-05-25 17:41:31,097 DEBUG SenderThread:2921999 [sender.py:send():375] send: telemetry
+2023-05-25 17:41:31,097 DEBUG SenderThread:2921999 [sender.py:send():375] send: config
+2023-05-25 17:41:31,098 DEBUG SenderThread:2921999 [sender.py:send():375] send: exit
+2023-05-25 17:41:31,098 INFO SenderThread:2921999 [sender.py:send_exit():598] handling exit code: 1
+2023-05-25 17:41:31,098 INFO SenderThread:2921999 [sender.py:send_exit():600] handling runtime: 2
+2023-05-25 17:41:31,099 INFO SenderThread:2921999 [sender.py:_save_file():1378] saving file wandb-summary.json with policy end
+2023-05-25 17:41:31,099 INFO SenderThread:2921999 [sender.py:send_exit():606] send defer
+2023-05-25 17:41:31,099 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:31,099 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 0
+2023-05-25 17:41:31,099 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:31,099 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 0
+2023-05-25 17:41:31,099 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 1
+2023-05-25 17:41:31,099 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:31,099 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 1
+2023-05-25 17:41:31,099 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:31,099 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 1
+2023-05-25 17:41:31,099 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 2
+2023-05-25 17:41:31,099 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:31,099 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 2
+2023-05-25 17:41:31,099 INFO HandlerThread:2921999 [system_monitor.py:finish():190] Stopping system monitor
+2023-05-25 17:41:31,100 DEBUG SystemMonitor:2921999 [system_monitor.py:_start():159] Starting system metrics aggregation loop
+2023-05-25 17:41:31,100 INFO HandlerThread:2921999 [interfaces.py:finish():202] Joined cpu monitor
+2023-05-25 17:41:31,100 DEBUG SystemMonitor:2921999 [system_monitor.py:_start():166] Finished system metrics aggregation loop
+2023-05-25 17:41:31,100 INFO HandlerThread:2921999 [interfaces.py:finish():202] Joined disk monitor
+2023-05-25 17:41:31,100 DEBUG SystemMonitor:2921999 [system_monitor.py:_start():170] Publishing last batch of metrics
+2023-05-25 17:41:31,131 INFO HandlerThread:2921999 [interfaces.py:finish():202] Joined gpu monitor
+2023-05-25 17:41:31,131 INFO HandlerThread:2921999 [interfaces.py:finish():202] Joined memory monitor
+2023-05-25 17:41:31,131 INFO HandlerThread:2921999 [interfaces.py:finish():202] Joined network monitor
+2023-05-25 17:41:31,131 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:31,131 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 2
+2023-05-25 17:41:31,131 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 3
+2023-05-25 17:41:31,131 DEBUG SenderThread:2921999 [sender.py:send():375] send: stats
+2023-05-25 17:41:31,131 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:31,132 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 3
+2023-05-25 17:41:31,132 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:31,132 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 3
+2023-05-25 17:41:31,132 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 4
+2023-05-25 17:41:31,132 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:31,132 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 4
+2023-05-25 17:41:31,132 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:31,132 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 4
+2023-05-25 17:41:31,132 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 5
+2023-05-25 17:41:31,132 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:31,132 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 5
+2023-05-25 17:41:31,132 DEBUG SenderThread:2921999 [sender.py:send():375] send: summary
+2023-05-25 17:41:31,132 INFO SenderThread:2921999 [sender.py:_save_file():1378] saving file wandb-summary.json with policy end
+2023-05-25 17:41:31,133 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:31,133 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 5
+2023-05-25 17:41:31,133 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 6
+2023-05-25 17:41:31,133 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:31,133 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 6
+2023-05-25 17:41:31,133 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:31,133 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 6
+2023-05-25 17:41:31,135 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: status_report
+2023-05-25 17:41:31,690 INFO Thread-12 :2921999 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/wandb-summary.json
+2023-05-25 17:41:31,690 INFO Thread-12 :2921999 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/conda-environment.yaml
+2023-05-25 17:41:31,690 INFO Thread-12 :2921999 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/wandb-metadata.json
+2023-05-25 17:41:31,690 INFO Thread-12 :2921999 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/output.log
+2023-05-25 17:41:31,876 INFO wandb-upload_0:2921999 [upload_job.py:push():137] Uploaded file /tmp/tmpdil908p6wandb/9a4zmdsm-wandb-metadata.json
+2023-05-25 17:41:31,982 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 7
+2023-05-25 17:41:31,982 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:31,983 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 7
+2023-05-25 17:41:31,983 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:31,983 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 7
+2023-05-25 17:41:32,086 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:41:32,690 INFO Thread-12 :2921999 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/config.yaml
+2023-05-25 17:41:33,104 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 8
+2023-05-25 17:41:33,104 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:41:33,104 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:33,104 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 8
+2023-05-25 17:41:33,104 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:33,105 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 8
+2023-05-25 17:41:33,105 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 9
+2023-05-25 17:41:33,105 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:33,105 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 9
+2023-05-25 17:41:33,105 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:33,105 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 9
+2023-05-25 17:41:33,105 INFO SenderThread:2921999 [dir_watcher.py:finish():365] shutting down directory watcher
+2023-05-25 17:41:33,690 INFO Thread-12 :2921999 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/output.log
+2023-05-25 17:41:33,691 INFO SenderThread:2921999 [dir_watcher.py:finish():395] scan: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files
+2023-05-25 17:41:33,691 INFO SenderThread:2921999 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/wandb-summary.json wandb-summary.json
+2023-05-25 17:41:33,691 INFO SenderThread:2921999 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/requirements.txt requirements.txt
+2023-05-25 17:41:33,691 INFO SenderThread:2921999 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/conda-environment.yaml conda-environment.yaml
+2023-05-25 17:41:33,694 INFO SenderThread:2921999 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/output.log output.log
+2023-05-25 17:41:33,694 INFO SenderThread:2921999 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/wandb-metadata.json wandb-metadata.json
+2023-05-25 17:41:33,694 INFO SenderThread:2921999 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/config.yaml config.yaml
+2023-05-25 17:41:33,694 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 10
+2023-05-25 17:41:33,695 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:33,696 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 10
+2023-05-25 17:41:33,697 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:33,698 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 10
+2023-05-25 17:41:33,698 INFO SenderThread:2921999 [file_pusher.py:finish():167] shutting down file pusher
+2023-05-25 17:41:34,667 INFO wandb-upload_0:2921999 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/requirements.txt
+2023-05-25 17:41:34,678 INFO wandb-upload_2:2921999 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/conda-environment.yaml
+2023-05-25 17:41:34,779 INFO wandb-upload_1:2921999 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/wandb-summary.json
+2023-05-25 17:41:35,396 INFO wandb-upload_3:2921999 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/output.log
+2023-05-25 17:41:35,604 INFO wandb-upload_4:2921999 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/files/config.yaml
+2023-05-25 17:41:35,804 INFO Thread-11 :2921999 [sender.py:transition_state():626] send defer: 11
+2023-05-25 17:41:35,804 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:35,805 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 11
+2023-05-25 17:41:35,805 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:35,805 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 11
+2023-05-25 17:41:35,805 INFO SenderThread:2921999 [file_pusher.py:join():172] waiting for file pusher
+2023-05-25 17:41:35,805 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 12
+2023-05-25 17:41:35,805 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:35,805 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 12
+2023-05-25 17:41:35,805 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:35,805 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 12
+2023-05-25 17:41:36,057 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 13
+2023-05-25 17:41:36,057 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:36,057 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 13
+2023-05-25 17:41:36,058 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:36,058 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 13
+2023-05-25 17:41:36,058 INFO SenderThread:2921999 [sender.py:transition_state():626] send defer: 14
+2023-05-25 17:41:36,058 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:41:36,058 INFO HandlerThread:2921999 [handler.py:handle_request_defer():170] handle defer: 14
+2023-05-25 17:41:36,058 DEBUG SenderThread:2921999 [sender.py:send():375] send: final
+2023-05-25 17:41:36,058 DEBUG SenderThread:2921999 [sender.py:send():375] send: footer
+2023-05-25 17:41:36,058 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:41:36,058 INFO SenderThread:2921999 [sender.py:send_request_defer():622] handle sender defer: 14
+2023-05-25 17:41:36,059 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:41:36,059 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:41:36,059 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: server_info
+2023-05-25 17:41:36,059 DEBUG SenderThread:2921999 [sender.py:send_request():402] send_request: server_info
+2023-05-25 17:41:36,060 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: get_summary
+2023-05-25 17:41:36,061 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: sampled_history
+2023-05-25 17:41:36,526 INFO MainThread:2921999 [wandb_run.py:_footer_history_summary_info():3469] rendering history
+2023-05-25 17:41:36,526 INFO MainThread:2921999 [wandb_run.py:_footer_history_summary_info():3501] rendering summary
+2023-05-25 17:41:36,526 INFO MainThread:2921999 [wandb_run.py:_footer_sync_info():3428] logging synced files
+2023-05-25 17:41:36,526 DEBUG HandlerThread:2921999 [handler.py:handle_request():144] handle_request: shutdown
+2023-05-25 17:41:36,526 INFO HandlerThread:2921999 [handler.py:finish():842] shutting down handler
+2023-05-25 17:41:37,059 INFO WriterThread:2921999 [datastore.py:close():298] close: /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/run-owp5uwvq.wandb
+2023-05-25 17:41:37,526 INFO SenderThread:2921999 [sender.py:finish():1550] shutting down sender
+2023-05-25 17:41:37,526 INFO SenderThread:2921999 [file_pusher.py:finish():167] shutting down file pusher
+2023-05-25 17:41:37,526 INFO SenderThread:2921999 [file_pusher.py:join():172] waiting for file pusher
diff --git a/wandb/run-20230525_174128-owp5uwvq/logs/debug.log b/wandb/run-20230525_174128-owp5uwvq/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..0f798edcb093ade5dd335115ea730e0485bb8079
--- /dev/null
+++ b/wandb/run-20230525_174128-owp5uwvq/logs/debug.log
@@ -0,0 +1,28 @@
+2023-05-25 17:41:28,136 INFO MainThread:2921781 [wandb_setup.py:_flush():76] Current SDK version is 0.15.3
+2023-05-25 17:41:28,137 INFO MainThread:2921781 [wandb_setup.py:_flush():76] Configure stats pid to 2921781
+2023-05-25 17:41:28,137 INFO MainThread:2921781 [wandb_setup.py:_flush():76] Loading settings from /home/long.qul/.config/wandb/settings
+2023-05-25 17:41:28,137 INFO MainThread:2921781 [wandb_setup.py:_flush():76] Loading settings from /home/long.qul/tritontest/wandb/settings
+2023-05-25 17:41:28,137 INFO MainThread:2921781 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
+2023-05-25 17:41:28,137 INFO MainThread:2921781 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2023-05-25 17:41:28,137 INFO MainThread:2921781 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'lora_test_1.py', 'program': 'lora_test_1.py'}
+2023-05-25 17:41:28,137 INFO MainThread:2921781 [wandb_init.py:_log_setup():507] Logging user logs to /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/logs/debug.log
+2023-05-25 17:41:28,137 INFO MainThread:2921781 [wandb_init.py:_log_setup():508] Logging internal logs to /home/long.qul/tritontest/wandb/run-20230525_174128-owp5uwvq/logs/debug-internal.log
+2023-05-25 17:41:28,137 INFO MainThread:2921781 [wandb_init.py:init():547] calling init triggers
+2023-05-25 17:41:28,137 INFO MainThread:2921781 [wandb_init.py:init():554] wandb.init called with sweep_config: {}
+config: {}
+2023-05-25 17:41:28,137 INFO MainThread:2921781 [wandb_init.py:init():596] starting backend
+2023-05-25 17:41:28,137 INFO MainThread:2921781 [wandb_init.py:init():600] setting up manager
+2023-05-25 17:41:28,139 INFO MainThread:2921781 [backend.py:_multiprocessing_setup():106] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2023-05-25 17:41:28,140 INFO MainThread:2921781 [wandb_init.py:init():606] backend started and connected
+2023-05-25 17:41:28,143 INFO MainThread:2921781 [wandb_init.py:init():700] updated telemetry
+2023-05-25 17:41:28,143 INFO MainThread:2921781 [wandb_init.py:init():737] communicating run to backend with 60.0 second timeout
+2023-05-25 17:41:28,687 INFO MainThread:2921781 [wandb_run.py:_on_init():2177] communicating current version
+2023-05-25 17:41:28,707 INFO MainThread:2921781 [wandb_run.py:_on_init():2186] got version response
+2023-05-25 17:41:28,707 INFO MainThread:2921781 [wandb_init.py:init():787] starting run threads in backend
+2023-05-25 17:41:30,821 INFO MainThread:2921781 [wandb_run.py:_console_start():2158] atexit reg
+2023-05-25 17:41:30,821 INFO MainThread:2921781 [wandb_run.py:_redirect():2013] redirect: SettingsConsole.WRAP_RAW
+2023-05-25 17:41:30,821 INFO MainThread:2921781 [wandb_run.py:_redirect():2078] Wrapping output streams.
+2023-05-25 17:41:30,821 INFO MainThread:2921781 [wandb_run.py:_redirect():2103] Redirects installed.
+2023-05-25 17:41:30,822 INFO MainThread:2921781 [wandb_init.py:init():829] run started, returning control to user process
+2023-05-25 17:41:30,824 INFO MainThread:2921781 [wandb_run.py:_config_callback():1286] config_cb None None {'pretrained_model_name_or_path': 'runwayml/stable-diffusion-v1-5', 'revision': None, 'dataset_name': 'lambdalabs/pokemon-blip-captions', 'dataset_config_name': None, 'train_data_dir': None, 'image_column': 'image', 'caption_column': 'text', 'validation_prompt': 'A pokemon with blue eyes.', 'num_validation_images': 4, 'validation_epochs': 1, 'max_train_samples': None, 'output_dir': '/home/long.qul/tritontest', 'cache_dir': None, 'seed': 1337, 'resolution': 512, 'center_crop': True, 'random_flip': True, 'train_batch_size': 1, 'num_train_epochs': 72, 'max_train_steps': 15000, 'gradient_accumulation_steps': 4, 'gradient_checkpointing': False, 'learning_rate': 0.0001, 'scale_lr': False, 'lr_scheduler': 'cosine', 'lr_warmup_steps': 0, 'snr_gamma': None, 'use_8bit_adam': False, 'allow_tf32': False, 'dataloader_num_workers': 8, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'push_to_hub': True, 'hub_token': None, 'hub_model_id': '', 'logging_dir': 'logs', 'mixed_precision': None, 'report_to': 'wandb', 'local_rank': -1, 'checkpointing_steps': 500, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'enable_xformers_memory_efficient_attention': False, 'noise_offset': 0}
+2023-05-25 17:41:37,617 WARNING MsgRouterThr:2921781 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20230525_174128-owp5uwvq/run-owp5uwvq.wandb b/wandb/run-20230525_174128-owp5uwvq/run-owp5uwvq.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..4a5ca751156cf620ea15d8ca6b5783a52efd58b9
Binary files /dev/null and b/wandb/run-20230525_174128-owp5uwvq/run-owp5uwvq.wandb differ
diff --git a/wandb/run-20230525_175352-jw7zshqk/files/conda-environment.yaml b/wandb/run-20230525_175352-jw7zshqk/files/conda-environment.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..eaad08ef7cb35db28c95e3521ff1f3af6a1b767d
--- /dev/null
+++ b/wandb/run-20230525_175352-jw7zshqk/files/conda-environment.yaml
@@ -0,0 +1,147 @@
+name: triton
+channels:
+ - conda-forge
+ - defaults
+dependencies:
+ - _libgcc_mutex=0.1=main
+ - _openmp_mutex=5.1=1_gnu
+ - bzip2=1.0.8=h7f98852_4
+ - c-ares=1.19.0=h5eee18b_0
+ - ca-certificates=2023.5.7=hbcca054_0
+ - expat=2.2.10=h9c3ff4c_0
+ - keyutils=1.6.1=h166bdaf_0
+ - krb5=1.19.3=h3790be6_0
+ - ld_impl_linux-64=2.38=h1181459_1
+ - libcurl=7.87.0=h91b91d3_0
+ - libedit=3.1.20191231=he28a2e2_2
+ - libev=4.33=h516909a_1
+ - libffi=3.4.4=h6a678d5_0
+ - libgcc-ng=11.2.0=h1234567_1
+ - libgomp=11.2.0=h1234567_1
+ - libnghttp2=1.46.0=hce63b2e_0
+ - libssh2=1.10.0=ha56f1ee_2
+ - libstdcxx-ng=11.2.0=h1234567_1
+ - libuv=1.44.2=h5eee18b_0
+ - lz4-c=1.9.3=h9c3ff4c_1
+ - ncurses=6.4=h6a678d5_0
+ - openssl=1.1.1t=h7f8727e_0
+ - pip=23.0.1=py38h06a4308_0
+ - python=3.8.16=h7a1cb2a_3
+ - readline=8.2=h5eee18b_0
+ - rhash=1.4.1=h3c74f83_1
+ - setuptools=66.0.0=py38h06a4308_0
+ - sqlite=3.41.2=h5eee18b_0
+ - tk=8.6.12=h1ccaba5_0
+ - wheel=0.38.4=py38h06a4308_0
+ - xz=5.4.2=h5eee18b_0
+ - zlib=1.2.13=h5eee18b_0
+ - zstd=1.5.2=ha4553b6_0
+ - pip:
+ - absl-py==1.4.0
+ - accelerate==0.19.0
+ - aiohttp==3.8.4
+ - aiosignal==1.3.1
+ - appdirs==1.4.4
+ - async-timeout==4.0.2
+ - attrs==23.1.0
+ - brotli==1.0.9
+ - cachetools==5.3.0
+ - certifi==2023.5.7
+ - charset-normalizer==3.1.0
+ - click==8.1.3
+ - cmake==3.26.3
+ - datasets==2.12.0
+ - diffusers==0.17.0.dev0
+ - dill==0.3.6
+ - docker-pycreds==0.4.0
+ - filelock==3.12.0
+ - frozenlist==1.3.3
+ - fsspec==2023.5.0
+ - ftfy==6.1.1
+ - gevent==22.10.2
+ - geventhttpclient==2.0.2
+ - gitdb==4.0.10
+ - gitpython==3.1.31
+ - google-auth==2.18.1
+ - google-auth-oauthlib==1.0.0
+ - greenlet==2.0.2
+ - grpcio==1.55.0
+ - huggingface-hub==0.14.1
+ - idna==3.4
+ - importlib-metadata==6.6.0
+ - jinja2==3.1.2
+ - lit==16.0.5
+ - markdown==3.4.3
+ - markupsafe==2.1.2
+ - mpmath==1.3.0
+ - multidict==6.0.4
+ - multiprocess==0.70.14
+ - mypy-extensions==1.0.0
+ - networkx==3.1
+ - numpy==1.24.3
+ - nvidia-cublas-cu11==11.10.3.66
+ - nvidia-cublas-cu12==12.1.3.1
+ - nvidia-cuda-cupti-cu11==11.7.101
+ - nvidia-cuda-nvrtc-cu11==11.7.99
+ - nvidia-cuda-runtime-cu11==11.7.99
+ - nvidia-cuda-runtime-cu12==12.1.105
+ - nvidia-cudnn-cu11==8.5.0.96
+ - nvidia-cudnn-cu12==8.9.1.23
+ - nvidia-cufft-cu11==10.9.0.58
+ - nvidia-curand-cu11==10.2.10.91
+ - nvidia-cusolver-cu11==11.4.0.1
+ - nvidia-cusparse-cu11==11.7.4.91
+ - nvidia-nccl-cu11==2.14.3
+ - nvidia-nvtx-cu11==11.7.91
+ - nvidia-pytriton==0.1.5
+ - oauthlib==3.2.2
+ - packaging==23.1
+ - pandas==2.0.1
+ - pathtools==0.1.2
+ - pillow==9.5.0
+ - protobuf==3.20.3
+ - psutil==5.9.5
+ - pyarrow==12.0.0
+ - pyasn1==0.5.0
+ - pyasn1-modules==0.3.0
+ - python-dateutil==2.8.2
+ - python-rapidjson==1.10
+ - pytz==2023.3
+ - pyyaml==6.0
+ - pyzmq==23.2.1
+ - regex==2023.5.5
+ - requests==2.31.0
+ - requests-oauthlib==1.3.1
+ - responses==0.18.0
+ - rsa==4.9
+ - sentry-sdk==1.24.0
+ - setproctitle==1.3.2
+ - sh==1.14.3
+ - six==1.16.0
+ - smmap==5.0.0
+ - sympy==1.12
+ - tensorboard==2.13.0
+ - tensorboard-data-server==0.7.0
+ - tensorrt-bindings==8.6.1
+ - tensorrt-libs==8.6.1
+ - tokenizers==0.13.3
+ - torch==2.0.1
+ - torchvision==0.15.2
+ - tqdm==4.65.0
+ - transformers==4.29.2
+ - triton==2.0.0
+ - tritonclient==2.33.0
+ - typing-extensions==4.6.0
+ - typing-inspect==0.6.0
+ - tzdata==2023.3
+ - urllib3==1.26.16
+ - wandb==0.15.3
+ - wcwidth==0.2.6
+ - werkzeug==2.3.4
+ - wrapt==1.15.0
+ - xxhash==3.2.0
+ - yarl==1.9.2
+ - zipp==3.15.0
+ - zope-event==4.6
+ - zope-interface==6.0
+prefix: /home/long.qul/miniconda3/envs/triton
diff --git a/wandb/run-20230525_175352-jw7zshqk/files/config.yaml b/wandb/run-20230525_175352-jw7zshqk/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..21a4caef20172ec1b13101ccef8ade3b6fad8c4f
--- /dev/null
+++ b/wandb/run-20230525_175352-jw7zshqk/files/config.yaml
@@ -0,0 +1,179 @@
+wandb_version: 1
+
+_wandb:
+ desc: null
+ value:
+ python_version: 3.8.16
+ cli_version: 0.15.3
+ framework: huggingface
+ huggingface_version: 4.29.2
+ is_jupyter_run: false
+ is_kaggle_kernel: false
+ start_time: 1685008432.727397
+ t:
+ 1:
+ - 1
+ - 11
+ - 41
+ - 49
+ - 51
+ - 55
+ - 71
+ - 83
+ 2:
+ - 1
+ - 11
+ - 41
+ - 49
+ - 51
+ - 55
+ - 71
+ - 83
+ 3:
+ - 23
+ 4: 3.8.16
+ 5: 0.15.3
+ 6: 4.29.2
+ 8:
+ - 5
+pretrained_model_name_or_path:
+ desc: null
+ value: runwayml/stable-diffusion-v1-5
+revision:
+ desc: null
+ value: null
+dataset_name:
+ desc: null
+ value: lambdalabs/pokemon-blip-captions
+dataset_config_name:
+ desc: null
+ value: null
+train_data_dir:
+ desc: null
+ value: null
+image_column:
+ desc: null
+ value: image
+caption_column:
+ desc: null
+ value: text
+validation_prompt:
+ desc: null
+ value: A pokemon with blue eyes.
+num_validation_images:
+ desc: null
+ value: 4
+validation_epochs:
+ desc: null
+ value: 1
+max_train_samples:
+ desc: null
+ value: null
+output_dir:
+ desc: null
+ value: /home/long.qul/tritontest
+cache_dir:
+ desc: null
+ value: null
+seed:
+ desc: null
+ value: 1337
+resolution:
+ desc: null
+ value: 512
+center_crop:
+ desc: null
+ value: true
+random_flip:
+ desc: null
+ value: true
+train_batch_size:
+ desc: null
+ value: 1
+num_train_epochs:
+ desc: null
+ value: 72
+max_train_steps:
+ desc: null
+ value: 15000
+gradient_accumulation_steps:
+ desc: null
+ value: 4
+gradient_checkpointing:
+ desc: null
+ value: false
+learning_rate:
+ desc: null
+ value: 0.0001
+scale_lr:
+ desc: null
+ value: false
+lr_scheduler:
+ desc: null
+ value: cosine
+lr_warmup_steps:
+ desc: null
+ value: 0
+snr_gamma:
+ desc: null
+ value: null
+use_8bit_adam:
+ desc: null
+ value: false
+allow_tf32:
+ desc: null
+ value: false
+dataloader_num_workers:
+ desc: null
+ value: 8
+adam_beta1:
+ desc: null
+ value: 0.9
+adam_beta2:
+ desc: null
+ value: 0.999
+adam_weight_decay:
+ desc: null
+ value: 0.01
+adam_epsilon:
+ desc: null
+ value: 1.0e-08
+max_grad_norm:
+ desc: null
+ value: 1.0
+push_to_hub:
+ desc: null
+ value: true
+hub_token:
+ desc: null
+ value: null
+hub_model_id:
+ desc: null
+ value: ''
+logging_dir:
+ desc: null
+ value: logs
+mixed_precision:
+ desc: null
+ value: null
+report_to:
+ desc: null
+ value: wandb
+local_rank:
+ desc: null
+ value: -1
+checkpointing_steps:
+ desc: null
+ value: 500
+checkpoints_total_limit:
+ desc: null
+ value: null
+resume_from_checkpoint:
+ desc: null
+ value: null
+enable_xformers_memory_efficient_attention:
+ desc: null
+ value: false
+noise_offset:
+ desc: null
+ value: 0
diff --git a/wandb/run-20230525_175352-jw7zshqk/files/output.log b/wandb/run-20230525_175352-jw7zshqk/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..bfda75f9a76b99b1ba24519bdabdfd93822ccdd4
--- /dev/null
+++ b/wandb/run-20230525_175352-jw7zshqk/files/output.log
@@ -0,0 +1,27 @@
+05/25/2023 17:53:55 - INFO - __main__ - ***** Running training *****
+05/25/2023 17:53:55 - INFO - __main__ - Num examples = 833
+05/25/2023 17:53:55 - INFO - __main__ - Num Epochs = 72
+05/25/2023 17:53:55 - INFO - __main__ - Instantaneous batch size per device = 1
+05/25/2023 17:53:55 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 4
+05/25/2023 17:53:55 - INFO - __main__ - Gradient Accumulation steps = 4
+05/25/2023 17:53:55 - INFO - __main__ - Total optimization steps = 15000
+Steps: 0%| | 0/15000 [00:00, ?it/s]Traceback (most recent call last):
+ File "lora_test_1.py", line 908, in
+ main()
+ File "lora_test_1.py", line 728, in main
+ latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/diffusers/utils/accelerate_utils.py", line 46, in wrapper
+ return method(self, *args, **kwargs)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/diffusers/models/autoencoder_kl.py", line 164, in encode
+ h = self.encoder(x)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/diffusers/models/vae.py", line 109, in forward
+ sample = self.conv_in(sample)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
+ return forward_call(*args, **kwargs)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 463, in forward
+ return self._conv_forward(input, self.weight, self.bias)
+ File "/home/long.qul/miniconda3/envs/triton/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 459, in _conv_forward
+ return F.conv2d(input, weight, bias, self.stride,
+RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED
\ No newline at end of file
diff --git a/wandb/run-20230525_175352-jw7zshqk/files/requirements.txt b/wandb/run-20230525_175352-jw7zshqk/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..881b72a23b4f740f4282786da35a53f77f1d92cd
--- /dev/null
+++ b/wandb/run-20230525_175352-jw7zshqk/files/requirements.txt
@@ -0,0 +1,110 @@
+absl-py==1.4.0
+accelerate==0.19.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+appdirs==1.4.4
+async-timeout==4.0.2
+attrs==23.1.0
+brotli==1.0.9
+cachetools==5.3.0
+certifi==2023.5.7
+charset-normalizer==3.1.0
+click==8.1.3
+cmake==3.26.3
+datasets==2.12.0
+diffusers==0.17.0.dev0
+dill==0.3.6
+docker-pycreds==0.4.0
+filelock==3.12.0
+frozenlist==1.3.3
+fsspec==2023.5.0
+ftfy==6.1.1
+gevent==22.10.2
+geventhttpclient==2.0.2
+gitdb==4.0.10
+gitpython==3.1.31
+google-auth-oauthlib==1.0.0
+google-auth==2.18.1
+greenlet==2.0.2
+grpcio==1.55.0
+huggingface-hub==0.14.1
+idna==3.4
+importlib-metadata==6.6.0
+jinja2==3.1.2
+lit==16.0.5
+markdown==3.4.3
+markupsafe==2.1.2
+mpmath==1.3.0
+multidict==6.0.4
+multiprocess==0.70.14
+mypy-extensions==1.0.0
+networkx==3.1
+numpy==1.24.3
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu11==8.5.0.96
+nvidia-cudnn-cu12==8.9.1.23
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.2.10.91
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cusparse-cu11==11.7.4.91
+nvidia-nccl-cu11==2.14.3
+nvidia-nvtx-cu11==11.7.91
+nvidia-pytriton==0.1.5
+oauthlib==3.2.2
+packaging==23.1
+pandas==2.0.1
+pathtools==0.1.2
+pillow==9.5.0
+pip==23.0.1
+protobuf==3.20.3
+psutil==5.9.5
+pyarrow==12.0.0
+pyasn1-modules==0.3.0
+pyasn1==0.5.0
+python-dateutil==2.8.2
+python-rapidjson==1.10
+pytz==2023.3
+pyyaml==6.0
+pyzmq==23.2.1
+regex==2023.5.5
+requests-oauthlib==1.3.1
+requests==2.31.0
+responses==0.18.0
+rsa==4.9
+sentry-sdk==1.24.0
+setproctitle==1.3.2
+setuptools==66.0.0
+sh==1.14.3
+six==1.16.0
+smmap==5.0.0
+sympy==1.12
+tensorboard-data-server==0.7.0
+tensorboard==2.13.0
+tensorrt-bindings==8.6.1
+tensorrt-libs==8.6.1
+tokenizers==0.13.3
+torch==2.0.1
+torchvision==0.15.2
+tqdm==4.65.0
+transformers==4.29.2
+triton==2.0.0
+tritonclient==2.33.0
+typing-extensions==4.6.0
+typing-inspect==0.6.0
+tzdata==2023.3
+urllib3==1.26.16
+wandb==0.15.3
+wcwidth==0.2.6
+werkzeug==2.3.4
+wheel==0.38.4
+wrapt==1.15.0
+xxhash==3.2.0
+yarl==1.9.2
+zipp==3.15.0
+zope.event==4.6
+zope.interface==6.0
\ No newline at end of file
diff --git a/wandb/run-20230525_175352-jw7zshqk/files/wandb-metadata.json b/wandb/run-20230525_175352-jw7zshqk/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a421281889c5bc96e1742484b6b99d87e3de4be6
--- /dev/null
+++ b/wandb/run-20230525_175352-jw7zshqk/files/wandb-metadata.json
@@ -0,0 +1,340 @@
+{
+ "os": "Linux-5.10.134-13.1.al8.x86_64-x86_64-with-glibc2.17",
+ "python": "3.8.16",
+ "heartbeatAt": "2023-05-25T09:53:53.292165",
+ "startedAt": "2023-05-25T09:53:52.722001",
+ "docker": null,
+ "cuda": null,
+ "args": [
+ "--pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5",
+ "--dataset_name=lambdalabs/pokemon-blip-captions",
+ "--dataloader_num_workers=8",
+ "--resolution=512",
+ "--center_crop",
+ "--random_flip",
+ "--train_batch_size=1",
+ "--gradient_accumulation_steps=4",
+ "--max_train_steps=15000",
+ "--learning_rate=1e-04",
+ "--max_grad_norm=1",
+ "--lr_scheduler=cosine",
+ "--lr_warmup_steps=0",
+ "--output_dir=/home/long.qul/tritontest",
+ "--push_to_hub",
+ "--hub_model_id=",
+ "--report_to=wandb",
+ "--checkpointing_steps=500",
+ "--validation_prompt=A pokemon with blue eyes.",
+ "--seed=1337"
+ ],
+ "state": "running",
+ "program": "lora_test_1.py",
+ "codePath": "lora_test_1.py",
+ "host": "iZt4n6er62uu4xnw6wibnhZ",
+ "username": "long.qul",
+ "executable": "/home/long.qul/miniconda3/envs/triton/bin/python",
+ "cpu_count": 28,
+ "cpu_count_logical": 56,
+ "cpu_freq": {
+ "current": 2899.998000000001,
+ "min": 0.0,
+ "max": 0.0
+ },
+ "cpu_freq_per_core": [
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ },
+ {
+ "current": 2899.998,
+ "min": 0.0,
+ "max": 0.0
+ }
+ ],
+ "disk": {
+ "total": 1968.4237327575684,
+ "used": 256.4756889343262
+ },
+ "gpu": "NVIDIA A10",
+ "gpu_count": 1,
+ "gpu_devices": [
+ {
+ "name": "NVIDIA A10",
+ "memory_total": 23836098560
+ }
+ ],
+ "memory": {
+ "total": 339.9116630554199
+ }
+}
diff --git a/wandb/run-20230525_175352-jw7zshqk/files/wandb-summary.json b/wandb/run-20230525_175352-jw7zshqk/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..6a2353df9a39aec28b5e444685dc5d7223bc37fd
--- /dev/null
+++ b/wandb/run-20230525_175352-jw7zshqk/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb": {"runtime": 2}}
\ No newline at end of file
diff --git a/wandb/run-20230525_175352-jw7zshqk/logs/debug-internal.log b/wandb/run-20230525_175352-jw7zshqk/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..0a351171cb6ba694a8691f15334a522bae8eeacf
--- /dev/null
+++ b/wandb/run-20230525_175352-jw7zshqk/logs/debug-internal.log
@@ -0,0 +1,181 @@
+2023-05-25 17:53:52,728 INFO StreamThr :2933662 [internal.py:wandb_internal():86] W&B internal server running at pid: 2933662, started at: 2023-05-25 17:53:52.727782
+2023-05-25 17:53:52,729 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: status
+2023-05-25 17:53:52,730 INFO WriterThread:2933662 [datastore.py:open_for_write():85] open: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/run-jw7zshqk.wandb
+2023-05-25 17:53:52,731 DEBUG SenderThread:2933662 [sender.py:send():375] send: header
+2023-05-25 17:53:52,731 DEBUG SenderThread:2933662 [sender.py:send():375] send: run
+2023-05-25 17:53:53,258 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: check_version
+2023-05-25 17:53:53,259 INFO SenderThread:2933662 [dir_watcher.py:__init__():219] watching files in: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files
+2023-05-25 17:53:53,259 INFO SenderThread:2933662 [sender.py:_start_run_threads():1124] run started: jw7zshqk with start time 1685008432.727397
+2023-05-25 17:53:53,259 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: summary_record
+2023-05-25 17:53:53,259 INFO SenderThread:2933662 [sender.py:_save_file():1378] saving file wandb-summary.json with policy end
+2023-05-25 17:53:53,259 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: check_version
+2023-05-25 17:53:53,282 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: run_start
+2023-05-25 17:53:53,284 DEBUG HandlerThread:2933662 [system_info.py:__init__():31] System info init
+2023-05-25 17:53:53,284 DEBUG HandlerThread:2933662 [system_info.py:__init__():46] System info init done
+2023-05-25 17:53:53,284 INFO HandlerThread:2933662 [system_monitor.py:start():181] Starting system monitor
+2023-05-25 17:53:53,284 INFO SystemMonitor:2933662 [system_monitor.py:_start():145] Starting system asset monitoring threads
+2023-05-25 17:53:53,284 INFO HandlerThread:2933662 [system_monitor.py:probe():201] Collecting system info
+2023-05-25 17:53:53,285 INFO SystemMonitor:2933662 [interfaces.py:start():190] Started cpu monitoring
+2023-05-25 17:53:53,285 INFO SystemMonitor:2933662 [interfaces.py:start():190] Started disk monitoring
+2023-05-25 17:53:53,285 INFO SystemMonitor:2933662 [interfaces.py:start():190] Started gpu monitoring
+2023-05-25 17:53:53,286 INFO SystemMonitor:2933662 [interfaces.py:start():190] Started memory monitoring
+2023-05-25 17:53:53,286 INFO SystemMonitor:2933662 [interfaces.py:start():190] Started network monitoring
+2023-05-25 17:53:53,292 DEBUG HandlerThread:2933662 [system_info.py:probe():195] Probing system
+2023-05-25 17:53:53,295 DEBUG HandlerThread:2933662 [git.py:repo():40] git repository is invalid
+2023-05-25 17:53:53,295 DEBUG HandlerThread:2933662 [system_info.py:probe():240] Probing system done
+2023-05-25 17:53:53,295 DEBUG HandlerThread:2933662 [system_monitor.py:probe():210] {'os': 'Linux-5.10.134-13.1.al8.x86_64-x86_64-with-glibc2.17', 'python': '3.8.16', 'heartbeatAt': '2023-05-25T09:53:53.292165', 'startedAt': '2023-05-25T09:53:52.722001', 'docker': None, 'cuda': None, 'args': ('--pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5', '--dataset_name=lambdalabs/pokemon-blip-captions', '--dataloader_num_workers=8', '--resolution=512', '--center_crop', '--random_flip', '--train_batch_size=1', '--gradient_accumulation_steps=4', '--max_train_steps=15000', '--learning_rate=1e-04', '--max_grad_norm=1', '--lr_scheduler=cosine', '--lr_warmup_steps=0', '--output_dir=/home/long.qul/tritontest', '--push_to_hub', '--hub_model_id=', '--report_to=wandb', '--checkpointing_steps=500', '--validation_prompt=A pokemon with blue eyes.', '--seed=1337'), 'state': 'running', 'program': 'lora_test_1.py', 'codePath': 'lora_test_1.py', 'host': 'iZt4n6er62uu4xnw6wibnhZ', 'username': 'long.qul', 'executable': '/home/long.qul/miniconda3/envs/triton/bin/python', 'cpu_count': 28, 'cpu_count_logical': 56, 'cpu_freq': {'current': 2899.998000000001, 'min': 0.0, 'max': 0.0}, 'cpu_freq_per_core': [{'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 
0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}, {'current': 2899.998, 'min': 0.0, 'max': 0.0}], 'disk': {'total': 1968.4237327575684, 'used': 256.4756889343262}, 'gpu': 'NVIDIA A10', 'gpu_count': 1, 'gpu_devices': [{'name': 'NVIDIA A10', 'memory_total': 
23836098560}], 'memory': {'total': 339.9116630554199}}
+2023-05-25 17:53:53,295 INFO HandlerThread:2933662 [system_monitor.py:probe():211] Finished collecting system info
+2023-05-25 17:53:53,295 INFO HandlerThread:2933662 [system_monitor.py:probe():214] Publishing system info
+2023-05-25 17:53:53,295 DEBUG HandlerThread:2933662 [system_info.py:_save_pip():51] Saving list of pip packages installed into the current environment
+2023-05-25 17:53:53,295 DEBUG HandlerThread:2933662 [system_info.py:_save_pip():67] Saving pip packages done
+2023-05-25 17:53:53,296 DEBUG HandlerThread:2933662 [system_info.py:_save_conda():74] Saving list of conda packages installed into the current environment
+2023-05-25 17:53:54,260 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/conda-environment.yaml
+2023-05-25 17:53:54,260 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/requirements.txt
+2023-05-25 17:53:54,260 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/wandb-summary.json
+2023-05-25 17:53:55,363 DEBUG HandlerThread:2933662 [system_info.py:_save_conda():86] Saving conda packages done
+2023-05-25 17:53:55,364 INFO HandlerThread:2933662 [system_monitor.py:probe():216] Finished publishing system info
+2023-05-25 17:53:55,368 DEBUG SenderThread:2933662 [sender.py:send():375] send: files
+2023-05-25 17:53:55,368 INFO SenderThread:2933662 [sender.py:_save_file():1378] saving file wandb-metadata.json with policy now
+2023-05-25 17:53:55,371 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: stop_status
+2023-05-25 17:53:55,371 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: stop_status
+2023-05-25 17:53:55,619 DEBUG SenderThread:2933662 [sender.py:send():375] send: telemetry
+2023-05-25 17:53:55,619 DEBUG SenderThread:2933662 [sender.py:send():375] send: config
+2023-05-25 17:53:55,646 DEBUG SenderThread:2933662 [sender.py:send():375] send: exit
+2023-05-25 17:53:55,646 INFO SenderThread:2933662 [sender.py:send_exit():598] handling exit code: 1
+2023-05-25 17:53:55,646 INFO SenderThread:2933662 [sender.py:send_exit():600] handling runtime: 2
+2023-05-25 17:53:55,646 INFO SenderThread:2933662 [sender.py:_save_file():1378] saving file wandb-summary.json with policy end
+2023-05-25 17:53:55,647 INFO SenderThread:2933662 [sender.py:send_exit():606] send defer
+2023-05-25 17:53:55,647 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,647 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 0
+2023-05-25 17:53:55,647 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,647 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 0
+2023-05-25 17:53:55,647 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 1
+2023-05-25 17:53:55,647 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,647 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 1
+2023-05-25 17:53:55,647 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,647 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 1
+2023-05-25 17:53:55,647 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 2
+2023-05-25 17:53:55,647 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,647 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 2
+2023-05-25 17:53:55,647 INFO HandlerThread:2933662 [system_monitor.py:finish():190] Stopping system monitor
+2023-05-25 17:53:55,648 INFO HandlerThread:2933662 [interfaces.py:finish():202] Joined cpu monitor
+2023-05-25 17:53:55,648 DEBUG SystemMonitor:2933662 [system_monitor.py:_start():159] Starting system metrics aggregation loop
+2023-05-25 17:53:55,648 INFO HandlerThread:2933662 [interfaces.py:finish():202] Joined disk monitor
+2023-05-25 17:53:55,648 DEBUG SystemMonitor:2933662 [system_monitor.py:_start():166] Finished system metrics aggregation loop
+2023-05-25 17:53:55,648 DEBUG SystemMonitor:2933662 [system_monitor.py:_start():170] Publishing last batch of metrics
+2023-05-25 17:53:55,679 INFO HandlerThread:2933662 [interfaces.py:finish():202] Joined gpu monitor
+2023-05-25 17:53:55,679 INFO HandlerThread:2933662 [interfaces.py:finish():202] Joined memory monitor
+2023-05-25 17:53:55,679 INFO HandlerThread:2933662 [interfaces.py:finish():202] Joined network monitor
+2023-05-25 17:53:55,679 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,679 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 2
+2023-05-25 17:53:55,679 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 3
+2023-05-25 17:53:55,679 DEBUG SenderThread:2933662 [sender.py:send():375] send: stats
+2023-05-25 17:53:55,679 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,680 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 3
+2023-05-25 17:53:55,680 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 3
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 4
+2023-05-25 17:53:55,680 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,680 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 4
+2023-05-25 17:53:55,680 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 4
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 5
+2023-05-25 17:53:55,680 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,680 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 5
+2023-05-25 17:53:55,680 DEBUG SenderThread:2933662 [sender.py:send():375] send: summary
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:_save_file():1378] saving file wandb-summary.json with policy end
+2023-05-25 17:53:55,680 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 5
+2023-05-25 17:53:55,680 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 6
+2023-05-25 17:53:55,681 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,681 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 6
+2023-05-25 17:53:55,681 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,681 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 6
+2023-05-25 17:53:55,683 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: status_report
+2023-05-25 17:53:55,984 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 7
+2023-05-25 17:53:55,984 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:55,984 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 7
+2023-05-25 17:53:55,984 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:55,984 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 7
+2023-05-25 17:53:55,988 INFO wandb-upload_0:2933662 [upload_job.py:push():137] Uploaded file /tmp/tmpfd18htn5wandb/qjhthzap-wandb-metadata.json
+2023-05-25 17:53:56,261 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/conda-environment.yaml
+2023-05-25 17:53:56,261 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/config.yaml
+2023-05-25 17:53:56,261 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/wandb-summary.json
+2023-05-25 17:53:56,261 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/wandb-metadata.json
+2023-05-25 17:53:56,261 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_created():278] file/dir created: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/output.log
+2023-05-25 17:53:56,646 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:53:57,626 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 8
+2023-05-25 17:53:57,626 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:53:57,626 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:57,627 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 8
+2023-05-25 17:53:57,627 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:57,627 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 8
+2023-05-25 17:53:57,627 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 9
+2023-05-25 17:53:57,627 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:57,627 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 9
+2023-05-25 17:53:57,627 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:57,627 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 9
+2023-05-25 17:53:57,627 INFO SenderThread:2933662 [dir_watcher.py:finish():365] shutting down directory watcher
+2023-05-25 17:53:57,647 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:53:58,261 INFO Thread-12 :2933662 [dir_watcher.py:_on_file_modified():295] file/dir modified: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/output.log
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():395] scan: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/wandb-summary.json wandb-summary.json
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/requirements.txt requirements.txt
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/conda-environment.yaml conda-environment.yaml
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/output.log output.log
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/wandb-metadata.json wandb-metadata.json
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [dir_watcher.py:finish():409] scan save: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/config.yaml config.yaml
+2023-05-25 17:53:58,262 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 10
+2023-05-25 17:53:58,262 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:53:58,266 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:58,266 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 10
+2023-05-25 17:53:58,268 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:58,269 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 10
+2023-05-25 17:53:58,269 INFO SenderThread:2933662 [file_pusher.py:finish():167] shutting down file pusher
+2023-05-25 17:53:58,648 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:53:58,648 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:53:58,877 INFO wandb-upload_4:2933662 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/config.yaml
+2023-05-25 17:53:58,890 INFO wandb-upload_3:2933662 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/output.log
+2023-05-25 17:53:58,948 INFO wandb-upload_0:2933662 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/wandb-summary.json
+2023-05-25 17:53:58,970 INFO wandb-upload_2:2933662 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/conda-environment.yaml
+2023-05-25 17:53:59,034 INFO wandb-upload_1:2933662 [upload_job.py:push():137] Uploaded file /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/files/requirements.txt
+2023-05-25 17:53:59,234 INFO Thread-11 :2933662 [sender.py:transition_state():626] send defer: 11
+2023-05-25 17:53:59,234 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:59,235 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 11
+2023-05-25 17:53:59,235 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:59,235 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 11
+2023-05-25 17:53:59,235 INFO SenderThread:2933662 [file_pusher.py:join():172] waiting for file pusher
+2023-05-25 17:53:59,235 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 12
+2023-05-25 17:53:59,235 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:59,235 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 12
+2023-05-25 17:53:59,235 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:59,235 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 12
+2023-05-25 17:53:59,488 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 13
+2023-05-25 17:53:59,488 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:59,488 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 13
+2023-05-25 17:53:59,488 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:59,488 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 13
+2023-05-25 17:53:59,489 INFO SenderThread:2933662 [sender.py:transition_state():626] send defer: 14
+2023-05-25 17:53:59,489 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: defer
+2023-05-25 17:53:59,489 INFO HandlerThread:2933662 [handler.py:handle_request_defer():170] handle defer: 14
+2023-05-25 17:53:59,489 DEBUG SenderThread:2933662 [sender.py:send():375] send: final
+2023-05-25 17:53:59,489 DEBUG SenderThread:2933662 [sender.py:send():375] send: footer
+2023-05-25 17:53:59,489 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: defer
+2023-05-25 17:53:59,489 INFO SenderThread:2933662 [sender.py:send_request_defer():622] handle sender defer: 14
+2023-05-25 17:53:59,489 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:53:59,489 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:53:59,490 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: poll_exit
+2023-05-25 17:53:59,490 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: poll_exit
+2023-05-25 17:53:59,490 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: server_info
+2023-05-25 17:53:59,490 DEBUG SenderThread:2933662 [sender.py:send_request():402] send_request: server_info
+2023-05-25 17:53:59,491 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: get_summary
+2023-05-25 17:53:59,492 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: sampled_history
+2023-05-25 17:53:59,969 INFO MainThread:2933662 [wandb_run.py:_footer_history_summary_info():3469] rendering history
+2023-05-25 17:53:59,969 INFO MainThread:2933662 [wandb_run.py:_footer_history_summary_info():3501] rendering summary
+2023-05-25 17:53:59,969 INFO MainThread:2933662 [wandb_run.py:_footer_sync_info():3428] logging synced files
+2023-05-25 17:53:59,969 DEBUG HandlerThread:2933662 [handler.py:handle_request():144] handle_request: shutdown
+2023-05-25 17:53:59,969 INFO HandlerThread:2933662 [handler.py:finish():842] shutting down handler
+2023-05-25 17:54:00,490 INFO WriterThread:2933662 [datastore.py:close():298] close: /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/run-jw7zshqk.wandb
+2023-05-25 17:54:00,969 INFO SenderThread:2933662 [sender.py:finish():1550] shutting down sender
+2023-05-25 17:54:00,969 INFO SenderThread:2933662 [file_pusher.py:finish():167] shutting down file pusher
+2023-05-25 17:54:00,969 INFO SenderThread:2933662 [file_pusher.py:join():172] waiting for file pusher
diff --git a/wandb/run-20230525_175352-jw7zshqk/logs/debug.log b/wandb/run-20230525_175352-jw7zshqk/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..d53cc5d615912e79756620ee1c42e4c76ab63a07
--- /dev/null
+++ b/wandb/run-20230525_175352-jw7zshqk/logs/debug.log
@@ -0,0 +1,28 @@
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Current SDK version is 0.15.3
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Configure stats pid to 2933449
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Loading settings from /home/long.qul/.config/wandb/settings
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Loading settings from /home/long.qul/tritontest/wandb/settings
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'lora_test_1.py', 'program': 'lora_test_1.py'}
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_init.py:_log_setup():507] Logging user logs to /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/logs/debug.log
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_init.py:_log_setup():508] Logging internal logs to /home/long.qul/tritontest/wandb/run-20230525_175352-jw7zshqk/logs/debug-internal.log
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_init.py:init():547] calling init triggers
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_init.py:init():554] wandb.init called with sweep_config: {}
+config: {}
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_init.py:init():596] starting backend
+2023-05-25 17:53:52,723 INFO MainThread:2933449 [wandb_init.py:init():600] setting up manager
+2023-05-25 17:53:52,725 INFO MainThread:2933449 [backend.py:_multiprocessing_setup():106] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2023-05-25 17:53:52,727 INFO MainThread:2933449 [wandb_init.py:init():606] backend started and connected
+2023-05-25 17:53:52,729 INFO MainThread:2933449 [wandb_init.py:init():700] updated telemetry
+2023-05-25 17:53:52,729 INFO MainThread:2933449 [wandb_init.py:init():737] communicating run to backend with 60.0 second timeout
+2023-05-25 17:53:53,257 INFO MainThread:2933449 [wandb_run.py:_on_init():2177] communicating current version
+2023-05-25 17:53:53,279 INFO MainThread:2933449 [wandb_run.py:_on_init():2186] got version response
+2023-05-25 17:53:53,279 INFO MainThread:2933449 [wandb_init.py:init():787] starting run threads in backend
+2023-05-25 17:53:55,371 INFO MainThread:2933449 [wandb_run.py:_console_start():2158] atexit reg
+2023-05-25 17:53:55,371 INFO MainThread:2933449 [wandb_run.py:_redirect():2013] redirect: SettingsConsole.WRAP_RAW
+2023-05-25 17:53:55,371 INFO MainThread:2933449 [wandb_run.py:_redirect():2078] Wrapping output streams.
+2023-05-25 17:53:55,371 INFO MainThread:2933449 [wandb_run.py:_redirect():2103] Redirects installed.
+2023-05-25 17:53:55,372 INFO MainThread:2933449 [wandb_init.py:init():829] run started, returning control to user process
+2023-05-25 17:53:55,374 INFO MainThread:2933449 [wandb_run.py:_config_callback():1286] config_cb None None {'pretrained_model_name_or_path': 'runwayml/stable-diffusion-v1-5', 'revision': None, 'dataset_name': 'lambdalabs/pokemon-blip-captions', 'dataset_config_name': None, 'train_data_dir': None, 'image_column': 'image', 'caption_column': 'text', 'validation_prompt': 'A pokemon with blue eyes.', 'num_validation_images': 4, 'validation_epochs': 1, 'max_train_samples': None, 'output_dir': '/home/long.qul/tritontest', 'cache_dir': None, 'seed': 1337, 'resolution': 512, 'center_crop': True, 'random_flip': True, 'train_batch_size': 1, 'num_train_epochs': 72, 'max_train_steps': 15000, 'gradient_accumulation_steps': 4, 'gradient_checkpointing': False, 'learning_rate': 0.0001, 'scale_lr': False, 'lr_scheduler': 'cosine', 'lr_warmup_steps': 0, 'snr_gamma': None, 'use_8bit_adam': False, 'allow_tf32': False, 'dataloader_num_workers': 8, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'push_to_hub': True, 'hub_token': None, 'hub_model_id': '', 'logging_dir': 'logs', 'mixed_precision': None, 'report_to': 'wandb', 'local_rank': -1, 'checkpointing_steps': 500, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'enable_xformers_memory_efficient_attention': False, 'noise_offset': 0}
+2023-05-25 17:54:01,060 WARNING MsgRouterThr:2933449 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20230525_175352-jw7zshqk/run-jw7zshqk.wandb b/wandb/run-20230525_175352-jw7zshqk/run-jw7zshqk.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..5684ee9c5ff47cda17cb45adace15757b02d5207
Binary files /dev/null and b/wandb/run-20230525_175352-jw7zshqk/run-jw7zshqk.wandb differ